{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 23583, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012721027859051011, "ewc_loss": 0.0, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 0.0, "grad_norm": 4.835254192352295, "learning_rate": 0.0, "loss": 0.7982, "mean_token_accuracy": 0.7762961387634277, "num_tokens": 38493.0, "step": 1 }, { "epoch": 0.00025442055718102023, "ewc_loss": 0.0, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 0.0, "grad_norm": 4.588627338409424, "learning_rate": 4.2390843577787196e-10, "loss": 0.8329, "mean_token_accuracy": 0.765798807144165, "num_tokens": 80419.0, "step": 2 }, { "epoch": 0.0003816308357715303, "ewc_loss": 1.0344866747892447e-14, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0344867151556586e-17, "grad_norm": 4.7253828048706055, "learning_rate": 8.478168715557439e-10, "loss": 0.7225, "mean_token_accuracy": 0.7960407137870789, "num_tokens": 118717.0, "step": 3 }, { "epoch": 0.0005088411143620405, "ewc_loss": 4.131985552324746e-13, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.131985535384087e-16, "grad_norm": 5.367035865783691, "learning_rate": 1.271725307333616e-09, "loss": 0.8139, "mean_token_accuracy": 0.7712426781654358, "num_tokens": 150155.0, "step": 4 }, { "epoch": 0.0006360513929525506, "ewc_loss": 3.1407409225442384e-12, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.140740993695006e-15, "grad_norm": 4.298873424530029, "learning_rate": 1.6956337431114878e-09, "loss": 0.7919, "mean_token_accuracy": 0.7746833562850952, "num_tokens": 193616.0, "step": 5 }, { "epoch": 0.0007632616715430606, "ewc_loss": 1.7045517775038377e-11, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.70455182358243e-14, "grad_norm": 5.209399700164795, "learning_rate": 2.1195421788893596e-09, "loss": 0.7895, "mean_token_accuracy": 0.77817302942276, "num_tokens": 227640.0, "step": 6 }, { "epoch": 0.0008904719501335708, "ewc_loss": 3.277421042890971e-11, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.277421194679275e-14, "grad_norm": 4.8380632400512695, "learning_rate": 2.543450614667232e-09, "loss": 0.8162, "mean_token_accuracy": 0.7750344276428223, "num_tokens": 265114.0, "step": 7 }, { "epoch": 0.001017682228724081, "ewc_loss": 1.4781827151200133e-10, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.478182675004533e-13, "grad_norm": 4.945592403411865, "learning_rate": 2.967359050445104e-09, "loss": 0.7583, "mean_token_accuracy": 0.7881873846054077, "num_tokens": 299865.0, "step": 8 }, { "epoch": 0.001144892507314591, "ewc_loss": 2.3054187725524145e-10, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.305418768215606e-13, "grad_norm": 4.5715765953063965, "learning_rate": 3.3912674862229757e-09, "loss": 0.814, "mean_token_accuracy": 0.7738676071166992, "num_tokens": 342063.0, "step": 9 }, { "epoch": 0.0012721027859051012, "ewc_loss": 4.290613697666146e-10, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.2906138169283847e-13, "grad_norm": 5.4446282386779785, "learning_rate": 3.815175922000847e-09, "loss": 0.8638, "mean_token_accuracy": 0.7647137641906738, "num_tokens": 374864.0, "step": 10 }, { "epoch": 0.0013993130644956112, "ewc_loss": 1.292879470149444e-09, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2928794319855275e-12, "grad_norm": 4.307544231414795, "learning_rate": 4.239084357778719e-09, "loss": 0.7765, "mean_token_accuracy": 0.7753095030784607, "num_tokens": 416605.0, "step": 11 }, { "epoch": 0.0015265233430861213, "ewc_loss": 1.7164589749540937e-09, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7164590061791163e-12, "grad_norm": 5.429047584533691, "learning_rate": 4.662992793556591e-09, "loss": 0.8341, "mean_token_accuracy": 0.7689259648323059, "num_tokens": 448798.0, "step": 12 }, { "epoch": 0.0016537336216766315, "ewc_loss": 2.172475976891519e-09, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.172475957809561e-12, "grad_norm": 5.431949615478516, "learning_rate": 5.086901229334464e-09, "loss": 0.8735, "mean_token_accuracy": 0.7542296051979065, "num_tokens": 480084.0, "step": 13 }, { "epoch": 0.0017809439002671415, "ewc_loss": 6.605827440608891e-09, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.6058274302005504e-12, "grad_norm": 4.395055294036865, "learning_rate": 5.510809665112336e-09, "loss": 0.806, "mean_token_accuracy": 0.7796204090118408, "num_tokens": 524543.0, "step": 14 }, { "epoch": 0.0019081541788576518, "ewc_loss": 1.0024291619004089e-08, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0024291292876075e-11, "grad_norm": 4.773983478546143, "learning_rate": 5.934718100890208e-09, "loss": 0.7518, "mean_token_accuracy": 0.7888144254684448, "num_tokens": 563314.0, "step": 15 }, { "epoch": 0.002035364457448162, "ewc_loss": 1.1956072576424503e-08, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1956072416829944e-11, "grad_norm": 5.081827163696289, "learning_rate": 6.3586265366680796e-09, "loss": 0.8422, "mean_token_accuracy": 0.762329638004303, "num_tokens": 598421.0, "step": 16 }, { "epoch": 0.002162574736038672, "ewc_loss": 1.4552910876375336e-08, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4552910959642062e-11, "grad_norm": 5.038153171539307, "learning_rate": 6.782534972445951e-09, "loss": 0.8321, "mean_token_accuracy": 0.7628723978996277, "num_tokens": 634690.0, "step": 17 }, { "epoch": 0.002289785014629182, "ewc_loss": 1.6562013982479584e-08, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6562013913090645e-11, "grad_norm": 5.129044532775879, "learning_rate": 7.206443408223823e-09, "loss": 0.8692, "mean_token_accuracy": 0.7594057321548462, "num_tokens": 674653.0, "step": 18 }, { "epoch": 0.0024169952932196924, "ewc_loss": 4.3577617958590054e-08, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.357761720918951e-11, "grad_norm": 5.055450916290283, "learning_rate": 7.630351844001695e-09, "loss": 0.8006, "mean_token_accuracy": 0.7766693830490112, "num_tokens": 708238.0, "step": 19 }, { "epoch": 0.0025442055718102024, "ewc_loss": 6.60221743942202e-08, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.602217383910869e-11, "grad_norm": 4.618276119232178, "learning_rate": 8.054260279779567e-09, "loss": 0.7826, "mean_token_accuracy": 0.778657853603363, "num_tokens": 749312.0, "step": 20 }, { "epoch": 0.0026714158504007124, "ewc_loss": 7.873870089269985e-08, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.873870305763475e-11, "grad_norm": 5.4140825271606445, "learning_rate": 8.478168715557438e-09, "loss": 0.8522, "mean_token_accuracy": 0.766889214515686, "num_tokens": 783532.0, "step": 21 }, { "epoch": 0.0027986261289912225, "ewc_loss": 8.663698736199876e-08, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.66369892493779e-11, "grad_norm": 5.112366199493408, "learning_rate": 8.902077151335311e-09, "loss": 0.8305, "mean_token_accuracy": 0.7729626893997192, "num_tokens": 817429.0, "step": 22 }, { "epoch": 0.0029258364075817325, "ewc_loss": 9.871566675201393e-08, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.87156675846812e-11, "grad_norm": 4.8394341468811035, "learning_rate": 9.325985587113182e-09, "loss": 0.7637, "mean_token_accuracy": 0.7884804010391235, "num_tokens": 853964.0, "step": 23 }, { "epoch": 0.0030530466861722425, "ewc_loss": 1.0781543124949167e-07, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0781543019477979e-10, "grad_norm": 5.454253673553467, "learning_rate": 9.749894022891054e-09, "loss": 0.8277, "mean_token_accuracy": 0.767450213432312, "num_tokens": 885070.0, "step": 24 }, { "epoch": 0.003180256964762753, "ewc_loss": 1.1794261212116908e-07, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1794261400854822e-10, "grad_norm": 4.823245525360107, "learning_rate": 1.0173802458668929e-08, "loss": 0.8797, "mean_token_accuracy": 0.7512175440788269, "num_tokens": 926893.0, "step": 25 }, { "epoch": 0.003307467243353263, "ewc_loss": 2.814117863181309e-07, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8141178276541723e-10, "grad_norm": 4.887279033660889, "learning_rate": 1.05977108944468e-08, "loss": 0.8161, "mean_token_accuracy": 0.7676985263824463, "num_tokens": 964773.0, "step": 26 }, { "epoch": 0.003434677521943773, "ewc_loss": 4.2243689790666394e-07, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.224368910232812e-10, "grad_norm": 4.744808673858643, "learning_rate": 1.1021619330224672e-08, "loss": 0.7692, "mean_token_accuracy": 0.7854693531990051, "num_tokens": 1002725.0, "step": 27 }, { "epoch": 0.003561887800534283, "ewc_loss": 4.97113717301545e-07, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.971137390619162e-10, "grad_norm": 4.995190620422363, "learning_rate": 1.1445527766002543e-08, "loss": 0.8732, "mean_token_accuracy": 0.7570828199386597, "num_tokens": 1040296.0, "step": 28 }, { "epoch": 0.003689098079124793, "ewc_loss": 5.460819920699578e-07, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.460820129421506e-10, "grad_norm": 4.429684162139893, "learning_rate": 1.1869436201780416e-08, "loss": 0.7658, "mean_token_accuracy": 0.7853672504425049, "num_tokens": 1081711.0, "step": 29 }, { "epoch": 0.0038163083577153036, "ewc_loss": 5.782919743069215e-07, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.782919698660294e-10, "grad_norm": 4.807643890380859, "learning_rate": 1.2293344637558287e-08, "loss": 0.8121, "mean_token_accuracy": 0.7750920653343201, "num_tokens": 1120556.0, "step": 30 }, { "epoch": 0.003943518636305814, "ewc_loss": 6.130763949840912e-07, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.130763674505602e-10, "grad_norm": 4.7922210693359375, "learning_rate": 1.2717253073336159e-08, "loss": 0.7641, "mean_token_accuracy": 0.7895288467407227, "num_tokens": 1157723.0, "step": 31 }, { "epoch": 0.004070728914896324, "ewc_loss": 6.64881667944428e-07, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.648816497367704e-10, "grad_norm": 4.688297271728516, "learning_rate": 1.314116150911403e-08, "loss": 0.8102, "mean_token_accuracy": 0.7737645506858826, "num_tokens": 1197879.0, "step": 32 }, { "epoch": 0.004197939193486834, "ewc_loss": 7.073331858009624e-07, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.073331920182113e-10, "grad_norm": 4.77283239364624, "learning_rate": 1.3565069944891903e-08, "loss": 0.8108, "mean_token_accuracy": 0.7755350470542908, "num_tokens": 1237342.0, "step": 33 }, { "epoch": 0.004325149472077344, "ewc_loss": 7.572943445666169e-07, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.57294338349368e-10, "grad_norm": 4.496525287628174, "learning_rate": 1.3988978380669775e-08, "loss": 0.7821, "mean_token_accuracy": 0.7827553749084473, "num_tokens": 1280197.0, "step": 34 }, { "epoch": 0.004452359750667854, "ewc_loss": 9.766446282810648e-07, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.76644654038239e-10, "grad_norm": 4.651466369628906, "learning_rate": 1.4412886816447646e-08, "loss": 0.7642, "mean_token_accuracy": 0.7845703363418579, "num_tokens": 1318625.0, "step": 35 }, { "epoch": 0.004579570029258364, "ewc_loss": 1.926285676745465e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.926285575493125e-09, "grad_norm": 4.81342887878418, "learning_rate": 1.4836795252225519e-08, "loss": 0.8218, "mean_token_accuracy": 0.7724317312240601, "num_tokens": 1356868.0, "step": 36 }, { "epoch": 0.004706780307848874, "ewc_loss": 2.6309344320907257e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6309343592600953e-09, "grad_norm": 4.848520755767822, "learning_rate": 1.526070368800339e-08, "loss": 0.7935, "mean_token_accuracy": 0.7808405160903931, "num_tokens": 1394696.0, "step": 37 }, { "epoch": 0.004833990586439385, "ewc_loss": 3.0031167170818662e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.003116644251236e-09, "grad_norm": 4.389008522033691, "learning_rate": 1.5684612123781262e-08, "loss": 0.7823, "mean_token_accuracy": 0.7822158336639404, "num_tokens": 1438738.0, "step": 38 }, { "epoch": 0.004961200865029895, "ewc_loss": 3.2363643640564987e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2363642876731546e-09, "grad_norm": 4.780418395996094, "learning_rate": 1.6108520559559135e-08, "loss": 0.7482, "mean_token_accuracy": 0.7922781109809875, "num_tokens": 1475089.0, "step": 39 }, { "epoch": 0.005088411143620405, "ewc_loss": 3.4441068237356376e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4441067775503598e-09, "grad_norm": 4.753481388092041, "learning_rate": 1.6532428995337004e-08, "loss": 0.8084, "mean_token_accuracy": 0.7742729187011719, "num_tokens": 1514566.0, "step": 40 }, { "epoch": 0.005215621422210915, "ewc_loss": 3.6422893572307657e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.6422893590071226e-09, "grad_norm": 4.94625997543335, "learning_rate": 1.6956337431114877e-08, "loss": 0.8233, "mean_token_accuracy": 0.773354172706604, "num_tokens": 1552560.0, "step": 41 }, { "epoch": 0.005342831700801425, "ewc_loss": 3.803717845585197e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.803717785189065e-09, "grad_norm": 5.3966875076293945, "learning_rate": 1.738024586689275e-08, "loss": 0.8005, "mean_token_accuracy": 0.7765425443649292, "num_tokens": 1584759.0, "step": 42 }, { "epoch": 0.005470041979391935, "ewc_loss": 3.965336418332299e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.9653365035974275e-09, "grad_norm": 4.86146879196167, "learning_rate": 1.7804154302670622e-08, "loss": 0.8018, "mean_token_accuracy": 0.7752366065979004, "num_tokens": 1621825.0, "step": 43 }, { "epoch": 0.005597252257982445, "ewc_loss": 4.188284947304055e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.188284830064504e-09, "grad_norm": 4.793634414672852, "learning_rate": 1.8228062738448494e-08, "loss": 0.8277, "mean_token_accuracy": 0.7681418657302856, "num_tokens": 1662946.0, "step": 44 }, { "epoch": 0.005724462536572955, "ewc_loss": 4.3759341679106e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.375934281597438e-09, "grad_norm": 4.668981075286865, "learning_rate": 1.8651971174226364e-08, "loss": 0.8154, "mean_token_accuracy": 0.7739959955215454, "num_tokens": 1699433.0, "step": 45 }, { "epoch": 0.005851672815163465, "ewc_loss": 4.5432143451762386e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.543214249252969e-09, "grad_norm": 4.357739448547363, "learning_rate": 1.9075879610004236e-08, "loss": 0.7463, "mean_token_accuracy": 0.7889032959938049, "num_tokens": 1742812.0, "step": 46 }, { "epoch": 0.005978883093753975, "ewc_loss": 4.720551714854082e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.720551505243975e-09, "grad_norm": 4.872357368469238, "learning_rate": 1.949978804578211e-08, "loss": 0.8016, "mean_token_accuracy": 0.7788931131362915, "num_tokens": 1778725.0, "step": 47 }, { "epoch": 0.006106093372344485, "ewc_loss": 5.063615390099585e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.063615304834457e-09, "grad_norm": 4.830690383911133, "learning_rate": 1.9923696481559985e-08, "loss": 0.8479, "mean_token_accuracy": 0.7619249820709229, "num_tokens": 1816592.0, "step": 48 }, { "epoch": 0.006233303650934996, "ewc_loss": 6.244334599614376e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.244334382898842e-09, "grad_norm": 4.430736064910889, "learning_rate": 2.0347604917337857e-08, "loss": 0.7398, "mean_token_accuracy": 0.7902494072914124, "num_tokens": 1859907.0, "step": 49 }, { "epoch": 0.006360513929525506, "ewc_loss": 9.811779818846844e-06, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.811779833057699e-09, "grad_norm": 4.693438529968262, "learning_rate": 2.0771513353115727e-08, "loss": 0.7287, "mean_token_accuracy": 0.8001155257225037, "num_tokens": 1896627.0, "step": 50 }, { "epoch": 0.006487724208116016, "ewc_loss": 1.326559140579775e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3265591292110912e-08, "grad_norm": 4.9847211837768555, "learning_rate": 2.11954217888936e-08, "loss": 0.8446, "mean_token_accuracy": 0.7665042281150818, "num_tokens": 1934041.0, "step": 51 }, { "epoch": 0.006614934486706526, "ewc_loss": 1.5803178030182607e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.580317743332671e-08, "grad_norm": 4.58689022064209, "learning_rate": 2.1619330224671472e-08, "loss": 0.7737, "mean_token_accuracy": 0.7844233512878418, "num_tokens": 1976482.0, "step": 52 }, { "epoch": 0.006742144765297036, "ewc_loss": 1.759140104695689e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7591400336414154e-08, "grad_norm": 5.323040008544922, "learning_rate": 2.2043238660449344e-08, "loss": 0.8029, "mean_token_accuracy": 0.7721055150032043, "num_tokens": 2009224.0, "step": 53 }, { "epoch": 0.006869355043887546, "ewc_loss": 1.9025796063942835e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.902579604973198e-08, "grad_norm": 4.903144836425781, "learning_rate": 2.2467147096227214e-08, "loss": 0.8472, "mean_token_accuracy": 0.7623937129974365, "num_tokens": 2049235.0, "step": 54 }, { "epoch": 0.006996565322478056, "ewc_loss": 2.0089008103241213e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.008900779060241e-08, "grad_norm": 4.627391815185547, "learning_rate": 2.2891055532005086e-08, "loss": 0.8604, "mean_token_accuracy": 0.7609710693359375, "num_tokens": 2090260.0, "step": 55 }, { "epoch": 0.007123775601068566, "ewc_loss": 2.08691562875174e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0869157069114408e-08, "grad_norm": 5.045936107635498, "learning_rate": 2.331496396778296e-08, "loss": 0.8658, "mean_token_accuracy": 0.7604072093963623, "num_tokens": 2126686.0, "step": 56 }, { "epoch": 0.007250985879659076, "ewc_loss": 2.1453479348565452e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1453478993294084e-08, "grad_norm": 4.454467296600342, "learning_rate": 2.373887240356083e-08, "loss": 0.7676, "mean_token_accuracy": 0.7844216823577881, "num_tokens": 2171355.0, "step": 57 }, { "epoch": 0.007378196158249586, "ewc_loss": 2.182192838517949e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1821929152565644e-08, "grad_norm": 4.634786605834961, "learning_rate": 2.4162780839338704e-08, "loss": 0.7408, "mean_token_accuracy": 0.7930451035499573, "num_tokens": 2211660.0, "step": 58 }, { "epoch": 0.007505406436840096, "ewc_loss": 2.209709055023268e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2097090379702422e-08, "grad_norm": 5.581204414367676, "learning_rate": 2.4586689275116573e-08, "loss": 0.8712, "mean_token_accuracy": 0.7591845989227295, "num_tokens": 2244411.0, "step": 59 }, { "epoch": 0.007632616715430607, "ewc_loss": 2.2289430489763618e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2289430745559002e-08, "grad_norm": 4.381511211395264, "learning_rate": 2.5010597710894446e-08, "loss": 0.7975, "mean_token_accuracy": 0.7708513140678406, "num_tokens": 2285253.0, "step": 60 }, { "epoch": 0.007759826994021117, "ewc_loss": 2.254178616567515e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.254178532723472e-08, "grad_norm": 4.542095184326172, "learning_rate": 2.5434506146672318e-08, "loss": 0.7622, "mean_token_accuracy": 0.784878671169281, "num_tokens": 2328577.0, "step": 61 }, { "epoch": 0.007887037272611627, "ewc_loss": 2.298742583661806e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.298742529660558e-08, "grad_norm": 4.743452548980713, "learning_rate": 2.585841458245019e-08, "loss": 0.8421, "mean_token_accuracy": 0.7695640325546265, "num_tokens": 2366524.0, "step": 62 }, { "epoch": 0.008014247551202136, "ewc_loss": 2.3457072529708967e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3457072728660933e-08, "grad_norm": 4.255460262298584, "learning_rate": 2.628232301822806e-08, "loss": 0.7555, "mean_token_accuracy": 0.7896115779876709, "num_tokens": 2408628.0, "step": 63 }, { "epoch": 0.008141457829792647, "ewc_loss": 2.3854889150243253e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3854889619201458e-08, "grad_norm": 4.338486194610596, "learning_rate": 2.6706231454005933e-08, "loss": 0.7949, "mean_token_accuracy": 0.7758411765098572, "num_tokens": 2451800.0, "step": 64 }, { "epoch": 0.008268668108383158, "ewc_loss": 2.4361770556424744e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.436177126696748e-08, "grad_norm": 5.723403453826904, "learning_rate": 2.7130139889783805e-08, "loss": 0.8662, "mean_token_accuracy": 0.7549936175346375, "num_tokens": 2481448.0, "step": 65 }, { "epoch": 0.008395878386973667, "ewc_loss": 2.5412415197934024e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5412415283199152e-08, "grad_norm": 4.321010112762451, "learning_rate": 2.7554048325561678e-08, "loss": 0.7929, "mean_token_accuracy": 0.7753411531448364, "num_tokens": 2526339.0, "step": 66 }, { "epoch": 0.008523088665564178, "ewc_loss": 2.7038178814109415e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7038179695182407e-08, "grad_norm": 4.500014305114746, "learning_rate": 2.797795676133955e-08, "loss": 0.8178, "mean_token_accuracy": 0.770281195640564, "num_tokens": 2570691.0, "step": 67 }, { "epoch": 0.008650298944154687, "ewc_loss": 3.0492803489323705e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0492802949311226e-08, "grad_norm": 4.647278308868408, "learning_rate": 2.840186519711742e-08, "loss": 0.8047, "mean_token_accuracy": 0.7753961086273193, "num_tokens": 2609207.0, "step": 68 }, { "epoch": 0.008777509222745198, "ewc_loss": 4.110304871574044e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.110304985260882e-08, "grad_norm": 4.993348598480225, "learning_rate": 2.8825773632895292e-08, "loss": 0.7929, "mean_token_accuracy": 0.7773557305335999, "num_tokens": 2645494.0, "step": 69 }, { "epoch": 0.008904719501335707, "ewc_loss": 5.9482696087798104e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.94826978783658e-08, "grad_norm": 4.558168411254883, "learning_rate": 2.9249682068673165e-08, "loss": 0.7499, "mean_token_accuracy": 0.7915124893188477, "num_tokens": 2686258.0, "step": 70 }, { "epoch": 0.009031929779926218, "ewc_loss": 7.451020064763725e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.451020422877264e-08, "grad_norm": 5.156729698181152, "learning_rate": 2.9673590504451037e-08, "loss": 0.8047, "mean_token_accuracy": 0.7776601314544678, "num_tokens": 2720544.0, "step": 71 }, { "epoch": 0.009159140058516728, "ewc_loss": 8.468919259030372e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.468919077131432e-08, "grad_norm": 4.944009780883789, "learning_rate": 3.0097498940228907e-08, "loss": 0.7677, "mean_token_accuracy": 0.7859935760498047, "num_tokens": 2758068.0, "step": 72 }, { "epoch": 0.009286350337107238, "ewc_loss": 9.135550499195233e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.135550271821558e-08, "grad_norm": 5.160391330718994, "learning_rate": 3.052140737600678e-08, "loss": 0.8519, "mean_token_accuracy": 0.7629214525222778, "num_tokens": 2793342.0, "step": 73 }, { "epoch": 0.009413560615697748, "ewc_loss": 9.601809142623097e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.601809125570071e-08, "grad_norm": 5.337886333465576, "learning_rate": 3.094531581178465e-08, "loss": 0.8888, "mean_token_accuracy": 0.7553894519805908, "num_tokens": 2828003.0, "step": 74 }, { "epoch": 0.009540770894288259, "ewc_loss": 9.957809379557148e-05, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.957809510297011e-08, "grad_norm": 4.403321266174316, "learning_rate": 3.1369224247562524e-08, "loss": 0.764, "mean_token_accuracy": 0.7839239835739136, "num_tokens": 2874755.0, "step": 75 }, { "epoch": 0.00966798117287877, "ewc_loss": 0.00010120908700628206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0120908910948856e-07, "grad_norm": 4.751422882080078, "learning_rate": 3.17931326833404e-08, "loss": 0.7861, "mean_token_accuracy": 0.7764467000961304, "num_tokens": 2909703.0, "step": 76 }, { "epoch": 0.009795191451469279, "ewc_loss": 0.00010223128629149869, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.022312829945804e-07, "grad_norm": 4.282577037811279, "learning_rate": 3.221704111911827e-08, "loss": 0.7164, "mean_token_accuracy": 0.7986818552017212, "num_tokens": 2954020.0, "step": 77 }, { "epoch": 0.00992240173005979, "ewc_loss": 0.00010254813969368115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0254814242216526e-07, "grad_norm": 5.3300909996032715, "learning_rate": 3.264094955489614e-08, "loss": 0.8116, "mean_token_accuracy": 0.775070071220398, "num_tokens": 2984894.0, "step": 78 }, { "epoch": 0.010049612008650299, "ewc_loss": 0.0001035406967275776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0354069956974854e-07, "grad_norm": 5.055810451507568, "learning_rate": 3.306485799067401e-08, "loss": 0.7182, "mean_token_accuracy": 0.7953206896781921, "num_tokens": 3017773.0, "step": 79 }, { "epoch": 0.01017682228724081, "ewc_loss": 0.00010414893768029287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.041489383624139e-07, "grad_norm": 4.43277645111084, "learning_rate": 3.348876642645188e-08, "loss": 0.7178, "mean_token_accuracy": 0.798122763633728, "num_tokens": 3059739.0, "step": 80 }, { "epoch": 0.010304032565831319, "ewc_loss": 0.00010387129441369325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0387129378841564e-07, "grad_norm": 5.005242824554443, "learning_rate": 3.391267486222975e-08, "loss": 0.8099, "mean_token_accuracy": 0.7719380259513855, "num_tokens": 3094910.0, "step": 81 }, { "epoch": 0.01043124284442183, "ewc_loss": 0.00010360042506363243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0360042779211653e-07, "grad_norm": 4.712270736694336, "learning_rate": 3.4336583298007626e-08, "loss": 0.8138, "mean_token_accuracy": 0.7744051814079285, "num_tokens": 3133884.0, "step": 82 }, { "epoch": 0.010558453123012339, "ewc_loss": 0.00010279689740855247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0279689632852751e-07, "grad_norm": 5.277714252471924, "learning_rate": 3.47604917337855e-08, "loss": 0.8461, "mean_token_accuracy": 0.7606594562530518, "num_tokens": 3166755.0, "step": 83 }, { "epoch": 0.01068566340160285, "ewc_loss": 0.0001016959868138656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0169598851916817e-07, "grad_norm": 4.857763767242432, "learning_rate": 3.518440016956337e-08, "loss": 0.8152, "mean_token_accuracy": 0.7708461284637451, "num_tokens": 3204825.0, "step": 84 }, { "epoch": 0.010812873680193359, "ewc_loss": 0.00010059539636131376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0059539334861256e-07, "grad_norm": 4.863193035125732, "learning_rate": 3.5608308605341244e-08, "loss": 0.8507, "mean_token_accuracy": 0.7629363536834717, "num_tokens": 3244036.0, "step": 85 }, { "epoch": 0.01094008395878387, "ewc_loss": 0.00010020592162618414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0020592355886038e-07, "grad_norm": 4.625710964202881, "learning_rate": 3.6032217041119116e-08, "loss": 0.8382, "mean_token_accuracy": 0.7668886184692383, "num_tokens": 3285969.0, "step": 86 }, { "epoch": 0.01106729423737438, "ewc_loss": 0.00010038215987151489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0038215947361095e-07, "grad_norm": 4.596709251403809, "learning_rate": 3.645612547689699e-08, "loss": 0.8091, "mean_token_accuracy": 0.774452269077301, "num_tokens": 3327648.0, "step": 87 }, { "epoch": 0.01119450451596489, "ewc_loss": 0.00010095123434439301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0095123315068122e-07, "grad_norm": 4.731423854827881, "learning_rate": 3.6880033912674855e-08, "loss": 0.8121, "mean_token_accuracy": 0.7730964422225952, "num_tokens": 3367399.0, "step": 88 }, { "epoch": 0.0113217147945554, "ewc_loss": 0.00010179771925322711, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0179771692264694e-07, "grad_norm": 4.936839580535889, "learning_rate": 3.730394234845273e-08, "loss": 0.7953, "mean_token_accuracy": 0.779653787612915, "num_tokens": 3405402.0, "step": 89 }, { "epoch": 0.01144892507314591, "ewc_loss": 0.00010300115536665544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0300115604877647e-07, "grad_norm": 5.035910129547119, "learning_rate": 3.77278507842306e-08, "loss": 0.8787, "mean_token_accuracy": 0.754289984703064, "num_tokens": 3441791.0, "step": 90 }, { "epoch": 0.01157613535173642, "ewc_loss": 0.00010462375939823687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0462375854558559e-07, "grad_norm": 5.002112865447998, "learning_rate": 3.815175922000847e-08, "loss": 0.8385, "mean_token_accuracy": 0.7688091993331909, "num_tokens": 3480151.0, "step": 91 }, { "epoch": 0.01170334563032693, "ewc_loss": 0.0001077046908903867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0770469316412346e-07, "grad_norm": 4.87518310546875, "learning_rate": 3.8575667655786345e-08, "loss": 0.8019, "mean_token_accuracy": 0.7753129005432129, "num_tokens": 3516867.0, "step": 92 }, { "epoch": 0.01183055590891744, "ewc_loss": 0.00011294858995825052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1294859092458864e-07, "grad_norm": 4.476454257965088, "learning_rate": 3.899957609156422e-08, "loss": 0.7781, "mean_token_accuracy": 0.7827637791633606, "num_tokens": 3557466.0, "step": 93 }, { "epoch": 0.01195776618750795, "ewc_loss": 0.0001222819701069966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2228197476815694e-07, "grad_norm": 4.714138984680176, "learning_rate": 3.94234845273421e-08, "loss": 0.8073, "mean_token_accuracy": 0.7728319764137268, "num_tokens": 3596009.0, "step": 94 }, { "epoch": 0.012084976466098461, "ewc_loss": 0.0001404721988365054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4047219565327396e-07, "grad_norm": 4.937018871307373, "learning_rate": 3.984739296311997e-08, "loss": 0.7995, "mean_token_accuracy": 0.7747558951377869, "num_tokens": 3631582.0, "step": 95 }, { "epoch": 0.01221218674468897, "ewc_loss": 0.0001786014181561768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7860142520476074e-07, "grad_norm": 4.948243618011475, "learning_rate": 4.027130139889784e-08, "loss": 0.8265, "mean_token_accuracy": 0.7678459882736206, "num_tokens": 3672855.0, "step": 96 }, { "epoch": 0.012339397023279481, "ewc_loss": 0.00025019588065333664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.50195881790205e-07, "grad_norm": 4.813232898712158, "learning_rate": 4.0695209834675715e-08, "loss": 0.818, "mean_token_accuracy": 0.7730340957641602, "num_tokens": 3714100.0, "step": 97 }, { "epoch": 0.012466607301869992, "ewc_loss": 0.00033681225613690913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3681226341286674e-07, "grad_norm": 4.972222805023193, "learning_rate": 4.111911827045358e-08, "loss": 0.7694, "mean_token_accuracy": 0.784964919090271, "num_tokens": 3752834.0, "step": 98 }, { "epoch": 0.012593817580460501, "ewc_loss": 0.00040091638220474124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.0091637742989406e-07, "grad_norm": 5.044023036956787, "learning_rate": 4.154302670623145e-08, "loss": 0.7802, "mean_token_accuracy": 0.7811787128448486, "num_tokens": 3791513.0, "step": 99 }, { "epoch": 0.012721027859051012, "ewc_loss": 0.00042969081550836563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.296908286960388e-07, "grad_norm": 5.525644302368164, "learning_rate": 4.1966935142009326e-08, "loss": 0.8029, "mean_token_accuracy": 0.7744281888008118, "num_tokens": 3825085.0, "step": 100 }, { "epoch": 0.012848238137641521, "ewc_loss": 0.00043929790263064206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.3929790649599454e-07, "grad_norm": 4.911616802215576, "learning_rate": 4.23908435777872e-08, "loss": 0.8028, "mean_token_accuracy": 0.7766979932785034, "num_tokens": 3866492.0, "step": 101 }, { "epoch": 0.012975448416232032, "ewc_loss": 0.00043724747956730425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.372474791125569e-07, "grad_norm": 5.237082481384277, "learning_rate": 4.281475201356507e-08, "loss": 0.83, "mean_token_accuracy": 0.7627468109130859, "num_tokens": 3904053.0, "step": 102 }, { "epoch": 0.013102658694822541, "ewc_loss": 0.00043524958891794086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.3524957504814665e-07, "grad_norm": 5.25083589553833, "learning_rate": 4.3238660449342943e-08, "loss": 0.7385, "mean_token_accuracy": 0.7949064373970032, "num_tokens": 3935118.0, "step": 103 }, { "epoch": 0.013229868973413052, "ewc_loss": 0.000436797650763765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.367976487174019e-07, "grad_norm": 4.784030914306641, "learning_rate": 4.3662568885120816e-08, "loss": 0.7884, "mean_token_accuracy": 0.7762641906738281, "num_tokens": 3977899.0, "step": 104 }, { "epoch": 0.013357079252003561, "ewc_loss": 0.00043760283733718097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.3760283574556524e-07, "grad_norm": 4.84134578704834, "learning_rate": 4.408647732089869e-08, "loss": 0.8088, "mean_token_accuracy": 0.7735235095024109, "num_tokens": 4019416.0, "step": 105 }, { "epoch": 0.013484289530594072, "ewc_loss": 0.0004371947725303471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.3719478526327293e-07, "grad_norm": 4.723477363586426, "learning_rate": 4.451038575667656e-08, "loss": 0.7974, "mean_token_accuracy": 0.7728472948074341, "num_tokens": 4063887.0, "step": 106 }, { "epoch": 0.013611499809184581, "ewc_loss": 0.00043521690531633794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.352169185040111e-07, "grad_norm": 4.887113571166992, "learning_rate": 4.493429419245443e-08, "loss": 0.8287, "mean_token_accuracy": 0.7672586441040039, "num_tokens": 4101309.0, "step": 107 }, { "epoch": 0.013738710087775092, "ewc_loss": 0.00043173585436306894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.3173585595468467e-07, "grad_norm": 4.993725776672363, "learning_rate": 4.53582026282323e-08, "loss": 0.8129, "mean_token_accuracy": 0.7702515125274658, "num_tokens": 4136669.0, "step": 108 }, { "epoch": 0.013865920366365603, "ewc_loss": 0.00042809676961041987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.2809676870092517e-07, "grad_norm": 4.555239200592041, "learning_rate": 4.578211106401017e-08, "loss": 0.7438, "mean_token_accuracy": 0.7902252674102783, "num_tokens": 4178583.0, "step": 109 }, { "epoch": 0.013993130644956112, "ewc_loss": 0.00042110358481295407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.2110357867386483e-07, "grad_norm": 4.555953502655029, "learning_rate": 4.6206019499788045e-08, "loss": 0.7744, "mean_token_accuracy": 0.7772376537322998, "num_tokens": 4219552.0, "step": 110 }, { "epoch": 0.014120340923546623, "ewc_loss": 0.00041343350312672555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.1343349721501e-07, "grad_norm": 4.6143975257873535, "learning_rate": 4.662992793556592e-08, "loss": 0.767, "mean_token_accuracy": 0.7851443290710449, "num_tokens": 4264487.0, "step": 111 }, { "epoch": 0.014247551202137132, "ewc_loss": 0.00040487691876478493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.048769142173114e-07, "grad_norm": 4.696949481964111, "learning_rate": 4.705383637134379e-08, "loss": 0.7333, "mean_token_accuracy": 0.7900246977806091, "num_tokens": 4303015.0, "step": 112 }, { "epoch": 0.014374761480727643, "ewc_loss": 0.00039676870801486075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.967687121075869e-07, "grad_norm": 4.633031845092773, "learning_rate": 4.747774480712166e-08, "loss": 0.7881, "mean_token_accuracy": 0.7773329019546509, "num_tokens": 4345446.0, "step": 113 }, { "epoch": 0.014501971759318152, "ewc_loss": 0.00038886204129084945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.888620483394334e-07, "grad_norm": 4.741722106933594, "learning_rate": 4.7901653242899535e-08, "loss": 0.7405, "mean_token_accuracy": 0.7912123203277588, "num_tokens": 4383854.0, "step": 114 }, { "epoch": 0.014629182037908663, "ewc_loss": 0.0003820311976596713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.820312031166395e-07, "grad_norm": 4.807215213775635, "learning_rate": 4.832556167867741e-08, "loss": 0.8092, "mean_token_accuracy": 0.7731891870498657, "num_tokens": 4420750.0, "step": 115 }, { "epoch": 0.014756392316499172, "ewc_loss": 0.00037515946314670146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.751594590539753e-07, "grad_norm": 4.9274797439575195, "learning_rate": 4.8749470114455274e-08, "loss": 0.8065, "mean_token_accuracy": 0.7736368179321289, "num_tokens": 4458761.0, "step": 116 }, { "epoch": 0.014883602595089683, "ewc_loss": 0.000369238288840279, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.6923827906321094e-07, "grad_norm": 4.718540191650391, "learning_rate": 4.9173378550233146e-08, "loss": 0.7665, "mean_token_accuracy": 0.7803465723991394, "num_tokens": 4496547.0, "step": 117 }, { "epoch": 0.015010812873680193, "ewc_loss": 0.0003620004281401634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.620004349613737e-07, "grad_norm": 4.761748790740967, "learning_rate": 4.959728698601102e-08, "loss": 0.7647, "mean_token_accuracy": 0.7890099287033081, "num_tokens": 4533357.0, "step": 118 }, { "epoch": 0.015138023152270703, "ewc_loss": 0.0003552461275830865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.552461294020759e-07, "grad_norm": 4.584695339202881, "learning_rate": 5.002119542178889e-08, "loss": 0.7392, "mean_token_accuracy": 0.7909218072891235, "num_tokens": 4573570.0, "step": 119 }, { "epoch": 0.015265233430861214, "ewc_loss": 0.00034908659290522337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4908660495602817e-07, "grad_norm": 4.7856950759887695, "learning_rate": 5.0445103857566764e-08, "loss": 0.7941, "mean_token_accuracy": 0.7748690843582153, "num_tokens": 4613195.0, "step": 120 }, { "epoch": 0.015392443709451724, "ewc_loss": 0.00034442247124388814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.444224603299517e-07, "grad_norm": 4.460550308227539, "learning_rate": 5.0869012293344637e-08, "loss": 0.7147, "mean_token_accuracy": 0.7980554699897766, "num_tokens": 4657230.0, "step": 121 }, { "epoch": 0.015519653988042234, "ewc_loss": 0.00033875706139951944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.38757075724061e-07, "grad_norm": 4.590123653411865, "learning_rate": 5.129292072912251e-08, "loss": 0.7885, "mean_token_accuracy": 0.7803485989570618, "num_tokens": 4701436.0, "step": 122 }, { "epoch": 0.015646864266632744, "ewc_loss": 0.0003342967829667032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.342967715980194e-07, "grad_norm": 4.695259094238281, "learning_rate": 5.171682916490038e-08, "loss": 0.7761, "mean_token_accuracy": 0.780422031879425, "num_tokens": 4741806.0, "step": 123 }, { "epoch": 0.015774074545223254, "ewc_loss": 0.00033132024691440165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.313202512345015e-07, "grad_norm": 4.877005577087402, "learning_rate": 5.2140737600678254e-08, "loss": 0.8131, "mean_token_accuracy": 0.765304684638977, "num_tokens": 4780671.0, "step": 124 }, { "epoch": 0.015901284823813765, "ewc_loss": 0.00033015350345522165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3015351164067397e-07, "grad_norm": 4.877334117889404, "learning_rate": 5.256464603645612e-08, "loss": 0.7618, "mean_token_accuracy": 0.7856589555740356, "num_tokens": 4816879.0, "step": 125 }, { "epoch": 0.016028495102404273, "ewc_loss": 0.0003297674411442131, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2976743113977136e-07, "grad_norm": 4.746260643005371, "learning_rate": 5.298855447223399e-08, "loss": 0.8145, "mean_token_accuracy": 0.7676055431365967, "num_tokens": 4858704.0, "step": 126 }, { "epoch": 0.016155705380994784, "ewc_loss": 0.0003303244011476636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.30324411379479e-07, "grad_norm": 4.702452659606934, "learning_rate": 5.3412462908011865e-08, "loss": 0.7718, "mean_token_accuracy": 0.7820144891738892, "num_tokens": 4896780.0, "step": 127 }, { "epoch": 0.016282915659585295, "ewc_loss": 0.00033294648164883256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3294648460469034e-07, "grad_norm": 4.937474727630615, "learning_rate": 5.383637134378974e-08, "loss": 0.8013, "mean_token_accuracy": 0.7718050479888916, "num_tokens": 4932108.0, "step": 128 }, { "epoch": 0.016410125938175806, "ewc_loss": 0.0003405250608921051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.405250481591793e-07, "grad_norm": 4.6907877922058105, "learning_rate": 5.426027977956761e-08, "loss": 0.7777, "mean_token_accuracy": 0.7826104164123535, "num_tokens": 4970384.0, "step": 129 }, { "epoch": 0.016537336216766316, "ewc_loss": 0.00035067263524979353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.50672621607373e-07, "grad_norm": 4.9973063468933105, "learning_rate": 5.468418821534548e-08, "loss": 0.7864, "mean_token_accuracy": 0.7770544290542603, "num_tokens": 5005996.0, "step": 130 }, { "epoch": 0.016664546495356824, "ewc_loss": 0.0003686737036332488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.686737102270854e-07, "grad_norm": 4.816444396972656, "learning_rate": 5.5108096651123356e-08, "loss": 0.7664, "mean_token_accuracy": 0.781592071056366, "num_tokens": 5044208.0, "step": 131 }, { "epoch": 0.016791756773947335, "ewc_loss": 0.00039383149123750627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.9383149896821124e-07, "grad_norm": 4.725933074951172, "learning_rate": 5.553200508690123e-08, "loss": 0.7812, "mean_token_accuracy": 0.7757578492164612, "num_tokens": 5085173.0, "step": 132 }, { "epoch": 0.016918967052537846, "ewc_loss": 0.0004297833947930485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.2978339820365363e-07, "grad_norm": 5.138036251068115, "learning_rate": 5.59559135226791e-08, "loss": 0.7982, "mean_token_accuracy": 0.7728251814842224, "num_tokens": 5118440.0, "step": 133 }, { "epoch": 0.017046177331128357, "ewc_loss": 0.0004817235458176583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.817235321752378e-07, "grad_norm": 5.241933345794678, "learning_rate": 5.637982195845697e-08, "loss": 0.8023, "mean_token_accuracy": 0.7738269567489624, "num_tokens": 5155202.0, "step": 134 }, { "epoch": 0.017173387609718864, "ewc_loss": 0.0005541444988921285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.541444920709182e-07, "grad_norm": 5.144245147705078, "learning_rate": 5.680373039423484e-08, "loss": 0.7882, "mean_token_accuracy": 0.7795883417129517, "num_tokens": 5193240.0, "step": 135 }, { "epoch": 0.017300597888309375, "ewc_loss": 0.000644915213342756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.449151896958938e-07, "grad_norm": 5.306818008422852, "learning_rate": 5.722763883001271e-08, "loss": 0.7767, "mean_token_accuracy": 0.780173659324646, "num_tokens": 5230975.0, "step": 136 }, { "epoch": 0.017427808166899886, "ewc_loss": 0.0007593707414343953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.593707209707645e-07, "grad_norm": 5.805558204650879, "learning_rate": 5.7651547265790585e-08, "loss": 0.8392, "mean_token_accuracy": 0.7636455297470093, "num_tokens": 5264786.0, "step": 137 }, { "epoch": 0.017555018445490397, "ewc_loss": 0.0008861648966558278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.861649121172377e-07, "grad_norm": 5.531774520874023, "learning_rate": 5.807545570156846e-08, "loss": 0.8013, "mean_token_accuracy": 0.773167610168457, "num_tokens": 5301554.0, "step": 138 }, { "epoch": 0.017682228724080904, "ewc_loss": 0.0009988286765292287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.98828681986197e-07, "grad_norm": 5.452812671661377, "learning_rate": 5.849936413734633e-08, "loss": 0.7407, "mean_token_accuracy": 0.7898585200309753, "num_tokens": 5337356.0, "step": 139 }, { "epoch": 0.017809439002671415, "ewc_loss": 0.0010833111591637135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0833111900865333e-06, "grad_norm": 5.386810779571533, "learning_rate": 5.89232725731242e-08, "loss": 0.8117, "mean_token_accuracy": 0.7730395197868347, "num_tokens": 5375767.0, "step": 140 }, { "epoch": 0.017936649281261926, "ewc_loss": 0.0011464699637144804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.146469912782777e-06, "grad_norm": 5.274552345275879, "learning_rate": 5.9347181008902075e-08, "loss": 0.7618, "mean_token_accuracy": 0.782381534576416, "num_tokens": 5413683.0, "step": 141 }, { "epoch": 0.018063859559852437, "ewc_loss": 0.0011965625453740358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1965624935328378e-06, "grad_norm": 4.991997718811035, "learning_rate": 5.977108944467995e-08, "loss": 0.7426, "mean_token_accuracy": 0.787469744682312, "num_tokens": 5457750.0, "step": 142 }, { "epoch": 0.018191069838442948, "ewc_loss": 0.0012351868208497763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2351868008408928e-06, "grad_norm": 5.2390055656433105, "learning_rate": 6.019499788045781e-08, "loss": 0.7377, "mean_token_accuracy": 0.7889706492424011, "num_tokens": 5495743.0, "step": 143 }, { "epoch": 0.018318280117033455, "ewc_loss": 0.0012765146093443036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2765145811499679e-06, "grad_norm": 5.278755187988281, "learning_rate": 6.061890631623569e-08, "loss": 0.7507, "mean_token_accuracy": 0.7813336253166199, "num_tokens": 5532198.0, "step": 144 }, { "epoch": 0.018445490395623966, "ewc_loss": 0.0013132819440215826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.313281927650678e-06, "grad_norm": 5.476741790771484, "learning_rate": 6.104281475201356e-08, "loss": 0.7638, "mean_token_accuracy": 0.7819845676422119, "num_tokens": 5568977.0, "step": 145 }, { "epoch": 0.018572700674214477, "ewc_loss": 0.0013446948723867536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3446948514683754e-06, "grad_norm": 5.3730645179748535, "learning_rate": 6.146672318779143e-08, "loss": 0.7757, "mean_token_accuracy": 0.7741209268569946, "num_tokens": 5606229.0, "step": 146 }, { "epoch": 0.018699910952804988, "ewc_loss": 0.0013630758039653301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.363075853078044e-06, "grad_norm": 5.1705780029296875, "learning_rate": 6.18906316235693e-08, "loss": 0.7763, "mean_token_accuracy": 0.7756699323654175, "num_tokens": 5649828.0, "step": 147 }, { "epoch": 0.018827121231395495, "ewc_loss": 0.0013664510333910584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3664509879163234e-06, "grad_norm": 5.299121856689453, "learning_rate": 6.231454005934718e-08, "loss": 0.7226, "mean_token_accuracy": 0.7959492206573486, "num_tokens": 5687433.0, "step": 148 }, { "epoch": 0.018954331509986006, "ewc_loss": 0.001362218288704753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3622183132611099e-06, "grad_norm": 5.399551868438721, "learning_rate": 6.273844849512505e-08, "loss": 0.754, "mean_token_accuracy": 0.7827410697937012, "num_tokens": 5723247.0, "step": 149 }, { "epoch": 0.019081541788576517, "ewc_loss": 0.0013545789988711476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3545790125135682e-06, "grad_norm": 5.265580654144287, "learning_rate": 6.316235693090292e-08, "loss": 0.768, "mean_token_accuracy": 0.7785357236862183, "num_tokens": 5760305.0, "step": 150 }, { "epoch": 0.019208752067167028, "ewc_loss": 0.0013380312593653798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3380312111621606e-06, "grad_norm": 5.080399513244629, "learning_rate": 6.35862653666808e-08, "loss": 0.768, "mean_token_accuracy": 0.7768315672874451, "num_tokens": 5800586.0, "step": 151 }, { "epoch": 0.01933596234575754, "ewc_loss": 0.0013177716173231602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.31777164824598e-06, "grad_norm": 5.089745044708252, "learning_rate": 6.401017380245867e-08, "loss": 0.7594, "mean_token_accuracy": 0.7785707712173462, "num_tokens": 5840351.0, "step": 152 }, { "epoch": 0.019463172624348046, "ewc_loss": 0.0012947452487424016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2947452887601685e-06, "grad_norm": 5.180991172790527, "learning_rate": 6.443408223823654e-08, "loss": 0.8, "mean_token_accuracy": 0.7699950337409973, "num_tokens": 5880162.0, "step": 153 }, { "epoch": 0.019590382902938557, "ewc_loss": 0.0012739135418087244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.273913539989735e-06, "grad_norm": 5.147226333618164, "learning_rate": 6.485799067401441e-08, "loss": 0.8557, "mean_token_accuracy": 0.7549962401390076, "num_tokens": 5922213.0, "step": 154 }, { "epoch": 0.019717593181529068, "ewc_loss": 0.0012528840452432632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2528840898085036e-06, "grad_norm": 5.079524517059326, "learning_rate": 6.528189910979228e-08, "loss": 0.7373, "mean_token_accuracy": 0.7846897840499878, "num_tokens": 5957461.0, "step": 155 }, { "epoch": 0.01984480346011958, "ewc_loss": 0.001229965011589229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2299650506975013e-06, "grad_norm": 4.984165668487549, "learning_rate": 6.570580754557016e-08, "loss": 0.7221, "mean_token_accuracy": 0.7852187156677246, "num_tokens": 5994675.0, "step": 156 }, { "epoch": 0.019972013738710086, "ewc_loss": 0.001204538973979652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2045389894410619e-06, "grad_norm": 4.914888381958008, "learning_rate": 6.612971598134802e-08, "loss": 0.685, "mean_token_accuracy": 0.8014256954193115, "num_tokens": 6032514.0, "step": 157 }, { "epoch": 0.020099224017300597, "ewc_loss": 0.0011796621838584542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1796621492976556e-06, "grad_norm": 4.983272075653076, "learning_rate": 6.655362441712589e-08, "loss": 0.7577, "mean_token_accuracy": 0.7795827984809875, "num_tokens": 6069160.0, "step": 158 }, { "epoch": 0.020226434295891108, "ewc_loss": 0.0011585148749873042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1585149195525446e-06, "grad_norm": 5.052260875701904, "learning_rate": 6.697753285290376e-08, "loss": 0.8117, "mean_token_accuracy": 0.7642656564712524, "num_tokens": 6106652.0, "step": 159 }, { "epoch": 0.02035364457448162, "ewc_loss": 0.001139528932981193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1395289902793593e-06, "grad_norm": 5.0427141189575195, "learning_rate": 6.740144128868163e-08, "loss": 0.7462, "mean_token_accuracy": 0.7850375771522522, "num_tokens": 6143070.0, "step": 160 }, { "epoch": 0.020480854853072127, "ewc_loss": 0.001120507251471281, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1205072496522916e-06, "grad_norm": 4.736694812774658, "learning_rate": 6.78253497244595e-08, "loss": 0.7382, "mean_token_accuracy": 0.7864063382148743, "num_tokens": 6186495.0, "step": 161 }, { "epoch": 0.020608065131662637, "ewc_loss": 0.001092419377528131, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0924193247774383e-06, "grad_norm": 4.97039270401001, "learning_rate": 6.824925816023738e-08, "loss": 0.7391, "mean_token_accuracy": 0.7854900360107422, "num_tokens": 6222454.0, "step": 162 }, { "epoch": 0.02073527541025315, "ewc_loss": 0.0010721901198849082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0721901162469294e-06, "grad_norm": 5.223143100738525, "learning_rate": 6.867316659601525e-08, "loss": 0.7518, "mean_token_accuracy": 0.7845188975334167, "num_tokens": 6253260.0, "step": 163 }, { "epoch": 0.02086248568884366, "ewc_loss": 0.0010632736375555396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.063273657564423e-06, "grad_norm": 4.766889572143555, "learning_rate": 6.909707503179312e-08, "loss": 0.7323, "mean_token_accuracy": 0.7890094518661499, "num_tokens": 6294773.0, "step": 164 }, { "epoch": 0.02098969596743417, "ewc_loss": 0.0010409012902528048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.040901338456024e-06, "grad_norm": 4.825233459472656, "learning_rate": 6.9520983467571e-08, "loss": 0.7761, "mean_token_accuracy": 0.7749238014221191, "num_tokens": 6335334.0, "step": 165 }, { "epoch": 0.021116906246024678, "ewc_loss": 0.001020619529299438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.02061949291965e-06, "grad_norm": 4.87228536605835, "learning_rate": 6.994489190334887e-08, "loss": 0.7114, "mean_token_accuracy": 0.7913763523101807, "num_tokens": 6372255.0, "step": 166 }, { "epoch": 0.02124411652461519, "ewc_loss": 0.0010020392946898937, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.002039311970293e-06, "grad_norm": 4.685209274291992, "learning_rate": 7.036880033912674e-08, "loss": 0.6873, "mean_token_accuracy": 0.7999774217605591, "num_tokens": 6411971.0, "step": 167 }, { "epoch": 0.0213713268032057, "ewc_loss": 0.0009850780479609966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.85078031590092e-07, "grad_norm": 5.031465530395508, "learning_rate": 7.079270877490461e-08, "loss": 0.7586, "mean_token_accuracy": 0.7795425653457642, "num_tokens": 6450366.0, "step": 168 }, { "epoch": 0.02149853708179621, "ewc_loss": 0.0009786406299099326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.786406280909432e-07, "grad_norm": 5.092385292053223, "learning_rate": 7.121661721068249e-08, "loss": 0.7216, "mean_token_accuracy": 0.7889194488525391, "num_tokens": 6482199.0, "step": 169 }, { "epoch": 0.021625747360386718, "ewc_loss": 0.0009739997331053019, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.739997040014714e-07, "grad_norm": 4.76797342300415, "learning_rate": 7.164052564646036e-08, "loss": 0.7025, "mean_token_accuracy": 0.7976128458976746, "num_tokens": 6519697.0, "step": 170 }, { "epoch": 0.02175295763897723, "ewc_loss": 0.0009580928599461913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.580928690411383e-07, "grad_norm": 4.9002203941345215, "learning_rate": 7.206443408223823e-08, "loss": 0.7629, "mean_token_accuracy": 0.7775094509124756, "num_tokens": 6554687.0, "step": 171 }, { "epoch": 0.02188016791756774, "ewc_loss": 0.0009492177050560713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.492176786807249e-07, "grad_norm": 4.900696754455566, "learning_rate": 7.24883425180161e-08, "loss": 0.7259, "mean_token_accuracy": 0.7815107703208923, "num_tokens": 6593066.0, "step": 172 }, { "epoch": 0.02200737819615825, "ewc_loss": 0.0009434286039322615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.434286312171025e-07, "grad_norm": 4.600334167480469, "learning_rate": 7.291225095379398e-08, "loss": 0.6835, "mean_token_accuracy": 0.8017786741256714, "num_tokens": 6635484.0, "step": 173 }, { "epoch": 0.02213458847474876, "ewc_loss": 0.0009297092910856009, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.297093015447899e-07, "grad_norm": 4.668344020843506, "learning_rate": 7.333615938957185e-08, "loss": 0.7763, "mean_token_accuracy": 0.7753795981407166, "num_tokens": 6677090.0, "step": 174 }, { "epoch": 0.02226179875333927, "ewc_loss": 0.0009210269781760871, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.210269809045712e-07, "grad_norm": 5.104239463806152, "learning_rate": 7.376006782534971e-08, "loss": 0.6916, "mean_token_accuracy": 0.7961164116859436, "num_tokens": 6710250.0, "step": 175 }, { "epoch": 0.02238900903192978, "ewc_loss": 0.0009287816355936229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.287816169489815e-07, "grad_norm": 4.6324968338012695, "learning_rate": 7.418397626112758e-08, "loss": 0.7181, "mean_token_accuracy": 0.7872416973114014, "num_tokens": 6752998.0, "step": 176 }, { "epoch": 0.02251621931052029, "ewc_loss": 0.0009222854860126972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.222854941981495e-07, "grad_norm": 4.7841010093688965, "learning_rate": 7.460788469690545e-08, "loss": 0.7192, "mean_token_accuracy": 0.7932695150375366, "num_tokens": 6789568.0, "step": 177 }, { "epoch": 0.0226434295891108, "ewc_loss": 0.0009207341936416924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.207341804540192e-07, "grad_norm": 5.184961318969727, "learning_rate": 7.503179313268333e-08, "loss": 0.7602, "mean_token_accuracy": 0.7769347429275513, "num_tokens": 6822810.0, "step": 178 }, { "epoch": 0.02277063986770131, "ewc_loss": 0.0009277401259168983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.277401318286138e-07, "grad_norm": 4.690395355224609, "learning_rate": 7.54557015684612e-08, "loss": 0.699, "mean_token_accuracy": 0.7927713394165039, "num_tokens": 6861598.0, "step": 179 }, { "epoch": 0.02289785014629182, "ewc_loss": 0.0009245026740245521, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.245026717508154e-07, "grad_norm": 4.6237640380859375, "learning_rate": 7.587961000423907e-08, "loss": 0.7215, "mean_token_accuracy": 0.7902977466583252, "num_tokens": 6900220.0, "step": 180 }, { "epoch": 0.02302506042488233, "ewc_loss": 0.0009219826315529644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.219826324624592e-07, "grad_norm": 4.753368377685547, "learning_rate": 7.630351844001694e-08, "loss": 0.83, "mean_token_accuracy": 0.7593704462051392, "num_tokens": 6946983.0, "step": 181 }, { "epoch": 0.02315227070347284, "ewc_loss": 0.0009276257478632033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.276257628698659e-07, "grad_norm": 4.848891735076904, "learning_rate": 7.672742687579482e-08, "loss": 0.7352, "mean_token_accuracy": 0.787043571472168, "num_tokens": 6986019.0, "step": 182 }, { "epoch": 0.02327948098206335, "ewc_loss": 0.0009375300141982734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.375299896419165e-07, "grad_norm": 4.917862892150879, "learning_rate": 7.715133531157269e-08, "loss": 0.7095, "mean_token_accuracy": 0.786895751953125, "num_tokens": 7020722.0, "step": 183 }, { "epoch": 0.02340669126065386, "ewc_loss": 0.0009499385487288237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.499385669187177e-07, "grad_norm": 4.789134502410889, "learning_rate": 7.757524374735056e-08, "loss": 0.7804, "mean_token_accuracy": 0.7740026712417603, "num_tokens": 7062691.0, "step": 184 }, { "epoch": 0.02353390153924437, "ewc_loss": 0.0009572387207299471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.572387398293358e-07, "grad_norm": 4.724222660064697, "learning_rate": 7.799915218312844e-08, "loss": 0.7282, "mean_token_accuracy": 0.7879832983016968, "num_tokens": 7101090.0, "step": 185 }, { "epoch": 0.02366111181783488, "ewc_loss": 0.0009692835155874491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.692835192254279e-07, "grad_norm": 4.720833778381348, "learning_rate": 7.842306061890631e-08, "loss": 0.7017, "mean_token_accuracy": 0.7896859645843506, "num_tokens": 7139891.0, "step": 186 }, { "epoch": 0.023788322096425393, "ewc_loss": 0.00098423904273659, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.842390227277065e-07, "grad_norm": 4.758607387542725, "learning_rate": 7.88469690546842e-08, "loss": 0.7504, "mean_token_accuracy": 0.7808230519294739, "num_tokens": 7179501.0, "step": 187 }, { "epoch": 0.0239155323750159, "ewc_loss": 0.001002415083348751, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.002415046968963e-06, "grad_norm": 4.857119560241699, "learning_rate": 7.927087749046207e-08, "loss": 0.7195, "mean_token_accuracy": 0.7883572578430176, "num_tokens": 7213809.0, "step": 188 }, { "epoch": 0.02404274265360641, "ewc_loss": 0.0010243779979646206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0243779797747266e-06, "grad_norm": 4.802131652832031, "learning_rate": 7.969478592623994e-08, "loss": 0.7947, "mean_token_accuracy": 0.7659889459609985, "num_tokens": 7254493.0, "step": 189 }, { "epoch": 0.024169952932196922, "ewc_loss": 0.0010455710580572486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0455710253154393e-06, "grad_norm": 4.95021390914917, "learning_rate": 8.011869436201781e-08, "loss": 0.6526, "mean_token_accuracy": 0.8078532218933105, "num_tokens": 7292643.0, "step": 190 }, { "epoch": 0.024297163210787433, "ewc_loss": 0.0010730171343311667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0730171879913541e-06, "grad_norm": 5.185595989227295, "learning_rate": 8.054260279779568e-08, "loss": 0.7764, "mean_token_accuracy": 0.7749682068824768, "num_tokens": 7329810.0, "step": 191 }, { "epoch": 0.02442437348937794, "ewc_loss": 0.0011066258884966373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1066258593928069e-06, "grad_norm": 4.92654275894165, "learning_rate": 8.096651123357356e-08, "loss": 0.7624, "mean_token_accuracy": 0.7768832445144653, "num_tokens": 7367630.0, "step": 192 }, { "epoch": 0.02455158376796845, "ewc_loss": 0.001133536803536117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1335367844367283e-06, "grad_norm": 4.762055397033691, "learning_rate": 8.139041966935143e-08, "loss": 0.7433, "mean_token_accuracy": 0.7813098430633545, "num_tokens": 7411580.0, "step": 193 }, { "epoch": 0.024678794046558962, "ewc_loss": 0.0011558695696294308, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1558695405256003e-06, "grad_norm": 4.9828925132751465, "learning_rate": 8.181432810512929e-08, "loss": 0.6987, "mean_token_accuracy": 0.7942925691604614, "num_tokens": 7451631.0, "step": 194 }, { "epoch": 0.024806004325149473, "ewc_loss": 0.00119126052595675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1912604804820148e-06, "grad_norm": 4.87737512588501, "learning_rate": 8.223823654090716e-08, "loss": 0.7071, "mean_token_accuracy": 0.7908155918121338, "num_tokens": 7493645.0, "step": 195 }, { "epoch": 0.024933214603739984, "ewc_loss": 0.001224829233251512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2248292478034273e-06, "grad_norm": 4.925421237945557, "learning_rate": 8.266214497668503e-08, "loss": 0.6678, "mean_token_accuracy": 0.804735541343689, "num_tokens": 7538042.0, "step": 196 }, { "epoch": 0.02506042488233049, "ewc_loss": 0.001257838448509574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.257838448509574e-06, "grad_norm": 4.951901912689209, "learning_rate": 8.30860534124629e-08, "loss": 0.7314, "mean_token_accuracy": 0.7847526669502258, "num_tokens": 7575374.0, "step": 197 }, { "epoch": 0.025187635160921002, "ewc_loss": 0.0012936178827658296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2936178563904832e-06, "grad_norm": 5.267557621002197, "learning_rate": 8.350996184824078e-08, "loss": 0.7244, "mean_token_accuracy": 0.7855781316757202, "num_tokens": 7606880.0, "step": 198 }, { "epoch": 0.025314845439511513, "ewc_loss": 0.0013393197441473603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3393197377808974e-06, "grad_norm": 5.000096321105957, "learning_rate": 8.393387028401865e-08, "loss": 0.7017, "mean_token_accuracy": 0.794185996055603, "num_tokens": 7644840.0, "step": 199 }, { "epoch": 0.025442055718102024, "ewc_loss": 0.00137690594419837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3769059705737163e-06, "grad_norm": 5.1265034675598145, "learning_rate": 8.435777871979652e-08, "loss": 0.742, "mean_token_accuracy": 0.7807843685150146, "num_tokens": 7683856.0, "step": 200 }, { "epoch": 0.02556926599669253, "ewc_loss": 0.0014136863173916936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4136862773739267e-06, "grad_norm": 5.789750099182129, "learning_rate": 8.47816871555744e-08, "loss": 0.7014, "mean_token_accuracy": 0.7932408452033997, "num_tokens": 7715306.0, "step": 201 }, { "epoch": 0.025696476275283042, "ewc_loss": 0.0014717914164066315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4717913927597692e-06, "grad_norm": 5.702394485473633, "learning_rate": 8.520559559135227e-08, "loss": 0.6612, "mean_token_accuracy": 0.8041902184486389, "num_tokens": 7752442.0, "step": 202 }, { "epoch": 0.025823686553873553, "ewc_loss": 0.0015215547755360603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5215547364277882e-06, "grad_norm": 5.373038291931152, "learning_rate": 8.562950402713014e-08, "loss": 0.7449, "mean_token_accuracy": 0.7834124565124512, "num_tokens": 7799891.0, "step": 203 }, { "epoch": 0.025950896832464064, "ewc_loss": 0.0015492468373849988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5492468037336948e-06, "grad_norm": 5.20060920715332, "learning_rate": 8.605341246290801e-08, "loss": 0.7347, "mean_token_accuracy": 0.7816221714019775, "num_tokens": 7830696.0, "step": 204 }, { "epoch": 0.026078107111054575, "ewc_loss": 0.0015704297693446279, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5704297311458504e-06, "grad_norm": 5.048939228057861, "learning_rate": 8.647732089868589e-08, "loss": 0.7043, "mean_token_accuracy": 0.7942108511924744, "num_tokens": 7870968.0, "step": 205 }, { "epoch": 0.026205317389645082, "ewc_loss": 0.001591554726473987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.591554678270768e-06, "grad_norm": 4.985835075378418, "learning_rate": 8.690122933446376e-08, "loss": 0.7276, "mean_token_accuracy": 0.7871673107147217, "num_tokens": 7914418.0, "step": 206 }, { "epoch": 0.026332527668235593, "ewc_loss": 0.0016143821412697434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6143820857905666e-06, "grad_norm": 4.922812461853027, "learning_rate": 8.732513777024163e-08, "loss": 0.6655, "mean_token_accuracy": 0.8077979683876038, "num_tokens": 7954544.0, "step": 207 }, { "epoch": 0.026459737946826104, "ewc_loss": 0.0016401236644014716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6401236280216835e-06, "grad_norm": 5.254729270935059, "learning_rate": 8.77490462060195e-08, "loss": 0.7145, "mean_token_accuracy": 0.7930368185043335, "num_tokens": 7997373.0, "step": 208 }, { "epoch": 0.026586948225416615, "ewc_loss": 0.001675379229709506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6753792806412093e-06, "grad_norm": 5.71150541305542, "learning_rate": 8.817295464179738e-08, "loss": 0.6789, "mean_token_accuracy": 0.7953677773475647, "num_tokens": 8033314.0, "step": 209 }, { "epoch": 0.026714158504007122, "ewc_loss": 0.0017265091883018613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.726509140098642e-06, "grad_norm": 5.6034321784973145, "learning_rate": 8.859686307757525e-08, "loss": 0.6968, "mean_token_accuracy": 0.7946735620498657, "num_tokens": 8069362.0, "step": 210 }, { "epoch": 0.026841368782597633, "ewc_loss": 0.001765398308634758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7653983377385885e-06, "grad_norm": 5.294490337371826, "learning_rate": 8.902077151335312e-08, "loss": 0.7297, "mean_token_accuracy": 0.7830262184143066, "num_tokens": 8109126.0, "step": 211 }, { "epoch": 0.026968579061188144, "ewc_loss": 0.001781304134055972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.781304149517382e-06, "grad_norm": 5.826267242431641, "learning_rate": 8.944467994913098e-08, "loss": 0.7844, "mean_token_accuracy": 0.7741256952285767, "num_tokens": 8143083.0, "step": 212 }, { "epoch": 0.027095789339778655, "ewc_loss": 0.0018115303246304393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8115302964361035e-06, "grad_norm": 5.320303916931152, "learning_rate": 8.986858838490885e-08, "loss": 0.6333, "mean_token_accuracy": 0.8067989945411682, "num_tokens": 8176636.0, "step": 213 }, { "epoch": 0.027222999618369163, "ewc_loss": 0.0018212530994787812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8212531358585693e-06, "grad_norm": 5.197332859039307, "learning_rate": 9.029249682068673e-08, "loss": 0.6921, "mean_token_accuracy": 0.7932111024856567, "num_tokens": 8220262.0, "step": 214 }, { "epoch": 0.027350209896959674, "ewc_loss": 0.0018233658047392964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.82336577836395e-06, "grad_norm": 6.0413689613342285, "learning_rate": 9.07164052564646e-08, "loss": 0.7114, "mean_token_accuracy": 0.7908226251602173, "num_tokens": 8256303.0, "step": 215 }, { "epoch": 0.027477420175550184, "ewc_loss": 0.0018644266529008746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8644266219780548e-06, "grad_norm": 5.2200026512146, "learning_rate": 9.114031369224247e-08, "loss": 0.676, "mean_token_accuracy": 0.8010443449020386, "num_tokens": 8293888.0, "step": 216 }, { "epoch": 0.027604630454140695, "ewc_loss": 0.0018640122143551707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8640122334545595e-06, "grad_norm": 5.584051609039307, "learning_rate": 9.156422212802034e-08, "loss": 0.6377, "mean_token_accuracy": 0.809569239616394, "num_tokens": 8335368.0, "step": 217 }, { "epoch": 0.027731840732731206, "ewc_loss": 0.0018770508468151093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8770508631860139e-06, "grad_norm": 6.937500953674316, "learning_rate": 9.198813056379822e-08, "loss": 0.6932, "mean_token_accuracy": 0.7967815399169922, "num_tokens": 8370338.0, "step": 218 }, { "epoch": 0.027859051011321714, "ewc_loss": 0.0019376015989109874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.937601609824924e-06, "grad_norm": 5.530782699584961, "learning_rate": 9.241203899957609e-08, "loss": 0.6773, "mean_token_accuracy": 0.7981741428375244, "num_tokens": 8409904.0, "step": 219 }, { "epoch": 0.027986261289912225, "ewc_loss": 0.0019323349697515368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9323349533806322e-06, "grad_norm": 7.417784690856934, "learning_rate": 9.283594743535396e-08, "loss": 0.8055, "mean_token_accuracy": 0.7647086381912231, "num_tokens": 8446579.0, "step": 220 }, { "epoch": 0.028113471568502735, "ewc_loss": 0.0019810826051980257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9810825051536085e-06, "grad_norm": 5.556659698486328, "learning_rate": 9.325985587113183e-08, "loss": 0.6514, "mean_token_accuracy": 0.8075335025787354, "num_tokens": 8483663.0, "step": 221 }, { "epoch": 0.028240681847093246, "ewc_loss": 0.0019456037553027272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.945603798958473e-06, "grad_norm": 5.871337413787842, "learning_rate": 9.368376430690971e-08, "loss": 0.6834, "mean_token_accuracy": 0.7966337203979492, "num_tokens": 8516346.0, "step": 222 }, { "epoch": 0.028367892125683754, "ewc_loss": 0.001923921867273748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.923921900015557e-06, "grad_norm": 6.339285850524902, "learning_rate": 9.410767274268758e-08, "loss": 0.6788, "mean_token_accuracy": 0.8004685640335083, "num_tokens": 8556914.0, "step": 223 }, { "epoch": 0.028495102404274265, "ewc_loss": 0.0019313804805278778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9313804386911215e-06, "grad_norm": 5.999726295471191, "learning_rate": 9.453158117846545e-08, "loss": 0.6667, "mean_token_accuracy": 0.7992265224456787, "num_tokens": 8596548.0, "step": 224 }, { "epoch": 0.028622312682864776, "ewc_loss": 0.0019260058179497719, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9260057797509944e-06, "grad_norm": 5.261327266693115, "learning_rate": 9.495548961424333e-08, "loss": 0.697, "mean_token_accuracy": 0.7964688539505005, "num_tokens": 8634366.0, "step": 225 }, { "epoch": 0.028749522961455286, "ewc_loss": 0.0018855882808566093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8855882899515564e-06, "grad_norm": 5.682164669036865, "learning_rate": 9.53793980500212e-08, "loss": 0.637, "mean_token_accuracy": 0.8089631795883179, "num_tokens": 8666480.0, "step": 226 }, { "epoch": 0.028876733240045797, "ewc_loss": 0.0018837335519492626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.883733602880966e-06, "grad_norm": 6.156785011291504, "learning_rate": 9.580330648579907e-08, "loss": 0.7869, "mean_token_accuracy": 0.772575318813324, "num_tokens": 8705880.0, "step": 227 }, { "epoch": 0.029003943518636305, "ewc_loss": 0.0019070255802944303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9070255348196952e-06, "grad_norm": 5.828412055969238, "learning_rate": 9.622721492157694e-08, "loss": 0.6489, "mean_token_accuracy": 0.8071107864379883, "num_tokens": 8743772.0, "step": 228 }, { "epoch": 0.029131153797226816, "ewc_loss": 0.0019094756571576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9094757135462714e-06, "grad_norm": 6.384690761566162, "learning_rate": 9.665112335735482e-08, "loss": 0.6741, "mean_token_accuracy": 0.7947925329208374, "num_tokens": 8775919.0, "step": 229 }, { "epoch": 0.029258364075817327, "ewc_loss": 0.0019227939192205667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9227938992116833e-06, "grad_norm": 5.566083908081055, "learning_rate": 9.707503179313267e-08, "loss": 0.6212, "mean_token_accuracy": 0.8125802278518677, "num_tokens": 8809336.0, "step": 230 }, { "epoch": 0.029385574354407838, "ewc_loss": 0.0018897817935794592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8897817426477559e-06, "grad_norm": 5.550972938537598, "learning_rate": 9.749894022891055e-08, "loss": 0.6737, "mean_token_accuracy": 0.798698902130127, "num_tokens": 8845952.0, "step": 231 }, { "epoch": 0.029512784632998345, "ewc_loss": 0.0018689667340368032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8689667058424675e-06, "grad_norm": 5.421104907989502, "learning_rate": 9.792284866468842e-08, "loss": 0.6939, "mean_token_accuracy": 0.7900537252426147, "num_tokens": 8889801.0, "step": 232 }, { "epoch": 0.029639994911588856, "ewc_loss": 0.0018521760357543826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8521760694056866e-06, "grad_norm": 5.7197957038879395, "learning_rate": 9.834675710046629e-08, "loss": 0.6533, "mean_token_accuracy": 0.8007138967514038, "num_tokens": 8925429.0, "step": 233 }, { "epoch": 0.029767205190179367, "ewc_loss": 0.0018594992579892278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8594992070575245e-06, "grad_norm": 5.642838001251221, "learning_rate": 9.877066553624416e-08, "loss": 0.6853, "mean_token_accuracy": 0.7942389249801636, "num_tokens": 8963360.0, "step": 234 }, { "epoch": 0.029894415468769878, "ewc_loss": 0.001858580857515335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8585808447824093e-06, "grad_norm": 5.562649250030518, "learning_rate": 9.919457397202204e-08, "loss": 0.6971, "mean_token_accuracy": 0.7951210141181946, "num_tokens": 8998314.0, "step": 235 }, { "epoch": 0.030021625747360385, "ewc_loss": 0.001849974156357348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8499741827326943e-06, "grad_norm": 5.905096530914307, "learning_rate": 9.961848240779991e-08, "loss": 0.7675, "mean_token_accuracy": 0.7743921279907227, "num_tokens": 9034804.0, "step": 236 }, { "epoch": 0.030148836025950896, "ewc_loss": 0.001861086580902338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8610866163726314e-06, "grad_norm": 5.53373384475708, "learning_rate": 1.0004239084357778e-07, "loss": 0.6898, "mean_token_accuracy": 0.7951380014419556, "num_tokens": 9070840.0, "step": 237 }, { "epoch": 0.030276046304541407, "ewc_loss": 0.0018476779805496335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8476779359843931e-06, "grad_norm": 5.55380392074585, "learning_rate": 1.0046629927935566e-07, "loss": 0.6705, "mean_token_accuracy": 0.8008818030357361, "num_tokens": 9109370.0, "step": 238 }, { "epoch": 0.030403256583131918, "ewc_loss": 0.001836104434914887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8361043885306572e-06, "grad_norm": 5.7140398025512695, "learning_rate": 1.0089020771513353e-07, "loss": 0.6136, "mean_token_accuracy": 0.816096305847168, "num_tokens": 9148191.0, "step": 239 }, { "epoch": 0.03053046686172243, "ewc_loss": 0.0018372678896412253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8372678596279002e-06, "grad_norm": 6.479672908782959, "learning_rate": 1.013141161509114e-07, "loss": 0.6809, "mean_token_accuracy": 0.7978737354278564, "num_tokens": 9182923.0, "step": 240 }, { "epoch": 0.030657677140312936, "ewc_loss": 0.0018716377671808004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8716377780947369e-06, "grad_norm": 6.042015075683594, "learning_rate": 1.0173802458668927e-07, "loss": 0.6803, "mean_token_accuracy": 0.7946972250938416, "num_tokens": 9215882.0, "step": 241 }, { "epoch": 0.030784887418903447, "ewc_loss": 0.0018675198080018163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8675198134587845e-06, "grad_norm": 6.1328630447387695, "learning_rate": 1.0216193302246715e-07, "loss": 0.7123, "mean_token_accuracy": 0.7829322814941406, "num_tokens": 9249718.0, "step": 242 }, { "epoch": 0.030912097697493958, "ewc_loss": 0.0018544589402154088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8544589011071366e-06, "grad_norm": 5.1489667892456055, "learning_rate": 1.0258584145824502e-07, "loss": 0.6331, "mean_token_accuracy": 0.8104750514030457, "num_tokens": 9291146.0, "step": 243 }, { "epoch": 0.03103930797608447, "ewc_loss": 0.0017893137410283089, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.789313728295383e-06, "grad_norm": 6.583431243896484, "learning_rate": 1.0300974989402289e-07, "loss": 0.6266, "mean_token_accuracy": 0.8103264570236206, "num_tokens": 9326403.0, "step": 244 }, { "epoch": 0.031166518254674976, "ewc_loss": 0.0018247809493914247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8247809521199088e-06, "grad_norm": 5.889529705047607, "learning_rate": 1.0343365832980076e-07, "loss": 0.6789, "mean_token_accuracy": 0.7972758412361145, "num_tokens": 9368491.0, "step": 245 }, { "epoch": 0.03129372853326549, "ewc_loss": 0.0018228780245408416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8228780618301244e-06, "grad_norm": 5.994251251220703, "learning_rate": 1.0385756676557864e-07, "loss": 0.6752, "mean_token_accuracy": 0.7993248105049133, "num_tokens": 9409041.0, "step": 246 }, { "epoch": 0.031420938811855995, "ewc_loss": 0.001816876232624054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.816876192606287e-06, "grad_norm": 5.783542156219482, "learning_rate": 1.0428147520135651e-07, "loss": 0.6472, "mean_token_accuracy": 0.8069716691970825, "num_tokens": 9448591.0, "step": 247 }, { "epoch": 0.03154814909044651, "ewc_loss": 0.0017946930602192879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7946930483958567e-06, "grad_norm": 5.6547441482543945, "learning_rate": 1.0470538363713437e-07, "loss": 0.6779, "mean_token_accuracy": 0.7954425811767578, "num_tokens": 9486604.0, "step": 248 }, { "epoch": 0.031675359369037016, "ewc_loss": 0.0017700097523629665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7700097032502526e-06, "grad_norm": 5.78983211517334, "learning_rate": 1.0512929207291224e-07, "loss": 0.6838, "mean_token_accuracy": 0.7908849716186523, "num_tokens": 9525285.0, "step": 249 }, { "epoch": 0.03180256964762753, "ewc_loss": 0.001767987385392189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7679873280940228e-06, "grad_norm": 6.893365859985352, "learning_rate": 1.0555320050869011e-07, "loss": 0.6635, "mean_token_accuracy": 0.7966063022613525, "num_tokens": 9557898.0, "step": 250 }, { "epoch": 0.03192977992621804, "ewc_loss": 0.0018192576244473457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8192575907960418e-06, "grad_norm": 6.465714931488037, "learning_rate": 1.0597710894446799e-07, "loss": 0.7239, "mean_token_accuracy": 0.7805198431015015, "num_tokens": 9590438.0, "step": 251 }, { "epoch": 0.032056990204808546, "ewc_loss": 0.001824367674998939, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8243677004647907e-06, "grad_norm": 5.847236633300781, "learning_rate": 1.0640101738024586e-07, "loss": 0.633, "mean_token_accuracy": 0.8093520998954773, "num_tokens": 9631402.0, "step": 252 }, { "epoch": 0.03218420048339906, "ewc_loss": 0.0017750818515196443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7750818415152025e-06, "grad_norm": 7.099810600280762, "learning_rate": 1.0682492581602373e-07, "loss": 0.6148, "mean_token_accuracy": 0.8156517148017883, "num_tokens": 9663702.0, "step": 253 }, { "epoch": 0.03231141076198957, "ewc_loss": 0.0018016045214608312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8016045260083047e-06, "grad_norm": 6.114535808563232, "learning_rate": 1.072488342518016e-07, "loss": 0.6029, "mean_token_accuracy": 0.8148601055145264, "num_tokens": 9698969.0, "step": 254 }, { "epoch": 0.03243862104058008, "ewc_loss": 0.001770168892107904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.770168864823063e-06, "grad_norm": 6.160754203796387, "learning_rate": 1.0767274268757948e-07, "loss": 0.6723, "mean_token_accuracy": 0.7963333129882812, "num_tokens": 9738075.0, "step": 255 }, { "epoch": 0.03256583131917059, "ewc_loss": 0.0017448329599574208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7448329572289367e-06, "grad_norm": 7.33935022354126, "learning_rate": 1.0809665112335735e-07, "loss": 0.6773, "mean_token_accuracy": 0.7988369464874268, "num_tokens": 9782675.0, "step": 256 }, { "epoch": 0.0326930415977611, "ewc_loss": 0.0017858273349702358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7858272940429742e-06, "grad_norm": 5.87387228012085, "learning_rate": 1.0852055955913522e-07, "loss": 0.5768, "mean_token_accuracy": 0.8244099020957947, "num_tokens": 9820585.0, "step": 257 }, { "epoch": 0.03282025187635161, "ewc_loss": 0.0017367159016430378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7367159443892888e-06, "grad_norm": 6.405545234680176, "learning_rate": 1.089444679949131e-07, "loss": 0.7021, "mean_token_accuracy": 0.7908193469047546, "num_tokens": 9860693.0, "step": 258 }, { "epoch": 0.03294746215494212, "ewc_loss": 0.001729306299239397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7293062910539447e-06, "grad_norm": 7.108027458190918, "learning_rate": 1.0936837643069097e-07, "loss": 0.635, "mean_token_accuracy": 0.8089601397514343, "num_tokens": 9902629.0, "step": 259 }, { "epoch": 0.03307467243353263, "ewc_loss": 0.0017537809908390045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7537810208523297e-06, "grad_norm": 6.041744709014893, "learning_rate": 1.0979228486646884e-07, "loss": 0.6069, "mean_token_accuracy": 0.8134684562683105, "num_tokens": 9937304.0, "step": 260 }, { "epoch": 0.03320188271212314, "ewc_loss": 0.0017148329643532634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7148329334304435e-06, "grad_norm": 6.669983386993408, "learning_rate": 1.1021619330224671e-07, "loss": 0.7407, "mean_token_accuracy": 0.7795989513397217, "num_tokens": 9976911.0, "step": 261 }, { "epoch": 0.03332909299071365, "ewc_loss": 0.0017155470559373498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.715547000458173e-06, "grad_norm": 6.180924892425537, "learning_rate": 1.1064010173802458e-07, "loss": 0.6744, "mean_token_accuracy": 0.7984122633934021, "num_tokens": 10015740.0, "step": 262 }, { "epoch": 0.03345630326930416, "ewc_loss": 0.0016901812050491571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.690181193225726e-06, "grad_norm": 7.363229274749756, "learning_rate": 1.1106401017380246e-07, "loss": 0.6124, "mean_token_accuracy": 0.8159462213516235, "num_tokens": 10053054.0, "step": 263 }, { "epoch": 0.03358351354789467, "ewc_loss": 0.0017249472439289093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7249471966351848e-06, "grad_norm": 7.223573207855225, "learning_rate": 1.1148791860958033e-07, "loss": 0.7091, "mean_token_accuracy": 0.7840509414672852, "num_tokens": 10082088.0, "step": 264 }, { "epoch": 0.03371072382648518, "ewc_loss": 0.0017388226697221398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.738822675179108e-06, "grad_norm": 6.880312919616699, "learning_rate": 1.119118270453582e-07, "loss": 0.6815, "mean_token_accuracy": 0.7942314743995667, "num_tokens": 10121122.0, "step": 265 }, { "epoch": 0.03383793410507569, "ewc_loss": 0.001708169118501246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7081690657505533e-06, "grad_norm": 6.697245121002197, "learning_rate": 1.1233573548113607e-07, "loss": 0.6757, "mean_token_accuracy": 0.7980775833129883, "num_tokens": 10161017.0, "step": 266 }, { "epoch": 0.0339651443836662, "ewc_loss": 0.001678171451203525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6781714293756522e-06, "grad_norm": 5.853512287139893, "learning_rate": 1.1275964391691393e-07, "loss": 0.6409, "mean_token_accuracy": 0.808228611946106, "num_tokens": 10204913.0, "step": 267 }, { "epoch": 0.03409235466225671, "ewc_loss": 0.001628685393370688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6286853679048363e-06, "grad_norm": 5.7038187980651855, "learning_rate": 1.131835523526918e-07, "loss": 0.6892, "mean_token_accuracy": 0.7910395860671997, "num_tokens": 10245154.0, "step": 268 }, { "epoch": 0.03421956494084722, "ewc_loss": 0.0016143653774634004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6143653738254216e-06, "grad_norm": 6.064852714538574, "learning_rate": 1.1360746078846968e-07, "loss": 0.6964, "mean_token_accuracy": 0.7901771068572998, "num_tokens": 10283006.0, "step": 269 }, { "epoch": 0.03434677521943773, "ewc_loss": 0.0016341895097866654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6341895161531284e-06, "grad_norm": 7.70340633392334, "learning_rate": 1.1403136922424755e-07, "loss": 0.6567, "mean_token_accuracy": 0.8000376224517822, "num_tokens": 10316689.0, "step": 270 }, { "epoch": 0.03447398549802824, "ewc_loss": 0.0017115314258262515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7115314676630078e-06, "grad_norm": 6.540721416473389, "learning_rate": 1.1445527766002542e-07, "loss": 0.6507, "mean_token_accuracy": 0.8015018105506897, "num_tokens": 10358145.0, "step": 271 }, { "epoch": 0.03460119577661875, "ewc_loss": 0.0016796011477708817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6796011550468393e-06, "grad_norm": 6.980354309082031, "learning_rate": 1.148791860958033e-07, "loss": 0.7518, "mean_token_accuracy": 0.7755789160728455, "num_tokens": 10393424.0, "step": 272 }, { "epoch": 0.034728406055209264, "ewc_loss": 0.001664737588725984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6647376241962775e-06, "grad_norm": 5.565552234649658, "learning_rate": 1.1530309453158117e-07, "loss": 0.6655, "mean_token_accuracy": 0.7983748316764832, "num_tokens": 10433560.0, "step": 273 }, { "epoch": 0.03485561633379977, "ewc_loss": 0.0015993209090083838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.599320853529207e-06, "grad_norm": 5.799602508544922, "learning_rate": 1.1572700296735904e-07, "loss": 0.6626, "mean_token_accuracy": 0.7987792491912842, "num_tokens": 10472024.0, "step": 274 }, { "epoch": 0.03498282661239028, "ewc_loss": 0.0016014693537726998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.60146930738847e-06, "grad_norm": 6.123725891113281, "learning_rate": 1.1615091140313691e-07, "loss": 0.7016, "mean_token_accuracy": 0.7889229655265808, "num_tokens": 10510874.0, "step": 275 }, { "epoch": 0.03511003689098079, "ewc_loss": 0.0016307063633576035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6307063788190135e-06, "grad_norm": 6.36743688583374, "learning_rate": 1.1657481983891479e-07, "loss": 0.6389, "mean_token_accuracy": 0.8072806596755981, "num_tokens": 10552411.0, "step": 276 }, { "epoch": 0.0352372471695713, "ewc_loss": 0.0016565347323194146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6565347777941497e-06, "grad_norm": 7.310931205749512, "learning_rate": 1.1699872827469266e-07, "loss": 0.6554, "mean_token_accuracy": 0.8036515712738037, "num_tokens": 10591537.0, "step": 277 }, { "epoch": 0.03536445744816181, "ewc_loss": 0.0017028659349307418, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.702865915831353e-06, "grad_norm": 6.457952499389648, "learning_rate": 1.1742263671047053e-07, "loss": 0.6637, "mean_token_accuracy": 0.8003478050231934, "num_tokens": 10632228.0, "step": 278 }, { "epoch": 0.03549166772675232, "ewc_loss": 0.0016694413498044014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6694413034201716e-06, "grad_norm": 7.835733890533447, "learning_rate": 1.178465451462484e-07, "loss": 0.7191, "mean_token_accuracy": 0.7823771238327026, "num_tokens": 10672829.0, "step": 279 }, { "epoch": 0.03561887800534283, "ewc_loss": 0.0016980321379378438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6980321788651054e-06, "grad_norm": 7.33796501159668, "learning_rate": 1.1827045358202628e-07, "loss": 0.6932, "mean_token_accuracy": 0.7969327569007874, "num_tokens": 10706146.0, "step": 280 }, { "epoch": 0.035746088283933344, "ewc_loss": 0.0016961012734100223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6961013216132415e-06, "grad_norm": 6.021906852722168, "learning_rate": 1.1869436201780415e-07, "loss": 0.6097, "mean_token_accuracy": 0.8169496059417725, "num_tokens": 10744112.0, "step": 281 }, { "epoch": 0.03587329856252385, "ewc_loss": 0.0016286218306049705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.62862181696255e-06, "grad_norm": 6.429383754730225, "learning_rate": 1.1911827045358202e-07, "loss": 0.6406, "mean_token_accuracy": 0.8116634488105774, "num_tokens": 10783726.0, "step": 282 }, { "epoch": 0.03600050884111436, "ewc_loss": 0.001633550738915801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6335507098119706e-06, "grad_norm": 6.463576793670654, "learning_rate": 1.195421788893599e-07, "loss": 0.7047, "mean_token_accuracy": 0.7912468910217285, "num_tokens": 10824476.0, "step": 283 }, { "epoch": 0.036127719119704874, "ewc_loss": 0.0016548437997698784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6548437997698784e-06, "grad_norm": 6.9571614265441895, "learning_rate": 1.1996608732513778e-07, "loss": 0.6649, "mean_token_accuracy": 0.7998129725456238, "num_tokens": 10859997.0, "step": 284 }, { "epoch": 0.03625492939829538, "ewc_loss": 0.0016820095479488373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.682009497017134e-06, "grad_norm": 6.254878044128418, "learning_rate": 1.2038999576091563e-07, "loss": 0.6353, "mean_token_accuracy": 0.8093360662460327, "num_tokens": 10898800.0, "step": 285 }, { "epoch": 0.036382139676885895, "ewc_loss": 0.0016527895350009203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6527894786122488e-06, "grad_norm": 8.588366508483887, "learning_rate": 1.208139041966935e-07, "loss": 0.6009, "mean_token_accuracy": 0.8195208311080933, "num_tokens": 10937462.0, "step": 286 }, { "epoch": 0.0365093499554764, "ewc_loss": 0.00173643056768924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7364305904266075e-06, "grad_norm": 6.593824863433838, "learning_rate": 1.2123781263247137e-07, "loss": 0.6629, "mean_token_accuracy": 0.7974703311920166, "num_tokens": 10970739.0, "step": 287 }, { "epoch": 0.03663656023406691, "ewc_loss": 0.0016913319705054164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6913319313971442e-06, "grad_norm": 6.717362403869629, "learning_rate": 1.2166172106824924e-07, "loss": 0.7144, "mean_token_accuracy": 0.7919477224349976, "num_tokens": 11003000.0, "step": 288 }, { "epoch": 0.036763770512657425, "ewc_loss": 0.001673965365625918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6739653574404656e-06, "grad_norm": 7.065039157867432, "learning_rate": 1.2208562950402712e-07, "loss": 0.6194, "mean_token_accuracy": 0.8090562224388123, "num_tokens": 11039665.0, "step": 289 }, { "epoch": 0.03689098079124793, "ewc_loss": 0.0016869843238964677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6869843193489942e-06, "grad_norm": 6.777809143066406, "learning_rate": 1.22509537939805e-07, "loss": 0.6469, "mean_token_accuracy": 0.7994014620780945, "num_tokens": 11078368.0, "step": 290 }, { "epoch": 0.03701819106983844, "ewc_loss": 0.0016860708128660917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.686070845607901e-06, "grad_norm": 5.739058494567871, "learning_rate": 1.2293344637558286e-07, "loss": 0.6721, "mean_token_accuracy": 0.7982948422431946, "num_tokens": 11122654.0, "step": 291 }, { "epoch": 0.037145401348428954, "ewc_loss": 0.001646543387323618, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6465434100609855e-06, "grad_norm": 6.990419864654541, "learning_rate": 1.2335735481136073e-07, "loss": 0.6275, "mean_token_accuracy": 0.8050577640533447, "num_tokens": 11162622.0, "step": 292 }, { "epoch": 0.03727261162701946, "ewc_loss": 0.0017086868174374104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7086867956095375e-06, "grad_norm": 7.483335494995117, "learning_rate": 1.237812632471386e-07, "loss": 0.6391, "mean_token_accuracy": 0.8065530061721802, "num_tokens": 11202564.0, "step": 293 }, { "epoch": 0.037399821905609976, "ewc_loss": 0.001755991019308567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7559909792908002e-06, "grad_norm": 6.641315937042236, "learning_rate": 1.2420517168291648e-07, "loss": 0.6573, "mean_token_accuracy": 0.7981082797050476, "num_tokens": 11243422.0, "step": 294 }, { "epoch": 0.03752703218420048, "ewc_loss": 0.0017261193133890629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7261193079320947e-06, "grad_norm": 7.7735676765441895, "learning_rate": 1.2462908011869435e-07, "loss": 0.639, "mean_token_accuracy": 0.803296685218811, "num_tokens": 11280867.0, "step": 295 }, { "epoch": 0.03765424246279099, "ewc_loss": 0.0017539883265271783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7539882719574962e-06, "grad_norm": 6.758813381195068, "learning_rate": 1.2505298855447223e-07, "loss": 0.587, "mean_token_accuracy": 0.8200638890266418, "num_tokens": 11318454.0, "step": 296 }, { "epoch": 0.037781452741381505, "ewc_loss": 0.0017209131037816405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.720913132885471e-06, "grad_norm": 6.3175225257873535, "learning_rate": 1.254768969902501e-07, "loss": 0.7286, "mean_token_accuracy": 0.782640278339386, "num_tokens": 11354721.0, "step": 297 }, { "epoch": 0.03790866301997201, "ewc_loss": 0.0016947939293459058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6947939229794429e-06, "grad_norm": 8.375428199768066, "learning_rate": 1.2590080542602797e-07, "loss": 0.6871, "mean_token_accuracy": 0.7938622832298279, "num_tokens": 11385938.0, "step": 298 }, { "epoch": 0.03803587329856253, "ewc_loss": 0.0017807177500799298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.780717752808414e-06, "grad_norm": 6.036844253540039, "learning_rate": 1.2632471386180584e-07, "loss": 0.6656, "mean_token_accuracy": 0.7989691495895386, "num_tokens": 11424911.0, "step": 299 }, { "epoch": 0.038163083577153034, "ewc_loss": 0.0017090762266889215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.709076173028734e-06, "grad_norm": 6.537691593170166, "learning_rate": 1.2674862229758372e-07, "loss": 0.6336, "mean_token_accuracy": 0.8091764450073242, "num_tokens": 11460580.0, "step": 300 }, { "epoch": 0.03829029385574354, "ewc_loss": 0.0017183732707053423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7183732552439324e-06, "grad_norm": 6.636387825012207, "learning_rate": 1.271725307333616e-07, "loss": 0.6236, "mean_token_accuracy": 0.8123064041137695, "num_tokens": 11501031.0, "step": 301 }, { "epoch": 0.038417504134334056, "ewc_loss": 0.0017427565762773156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7427565808247891e-06, "grad_norm": 6.274569034576416, "learning_rate": 1.2759643916913946e-07, "loss": 0.6333, "mean_token_accuracy": 0.8083422183990479, "num_tokens": 11544053.0, "step": 302 }, { "epoch": 0.03854471441292456, "ewc_loss": 0.0017313320422545075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7313320768153062e-06, "grad_norm": 7.0614519119262695, "learning_rate": 1.2802034760491733e-07, "loss": 0.65, "mean_token_accuracy": 0.8016883134841919, "num_tokens": 11576944.0, "step": 303 }, { "epoch": 0.03867192469151508, "ewc_loss": 0.0017730921972543001, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7730922081682365e-06, "grad_norm": 5.75217866897583, "learning_rate": 1.284442560406952e-07, "loss": 0.6998, "mean_token_accuracy": 0.7882005572319031, "num_tokens": 11619735.0, "step": 304 }, { "epoch": 0.038799134970105585, "ewc_loss": 0.001722458633594215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7224585917574586e-06, "grad_norm": 6.745477199554443, "learning_rate": 1.2886816447647308e-07, "loss": 0.7378, "mean_token_accuracy": 0.7753361463546753, "num_tokens": 11655025.0, "step": 305 }, { "epoch": 0.03892634524869609, "ewc_loss": 0.0017795724561437964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7795724716052064e-06, "grad_norm": 8.514620780944824, "learning_rate": 1.2929207291225095e-07, "loss": 0.6343, "mean_token_accuracy": 0.8082094192504883, "num_tokens": 11691970.0, "step": 306 }, { "epoch": 0.03905355552728661, "ewc_loss": 0.0018845431040972471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8845430531655438e-06, "grad_norm": 7.491915702819824, "learning_rate": 1.2971598134802882e-07, "loss": 0.6505, "mean_token_accuracy": 0.8017749786376953, "num_tokens": 11729561.0, "step": 307 }, { "epoch": 0.039180765805877114, "ewc_loss": 0.0018693094607442617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8693094716581982e-06, "grad_norm": 9.519035339355469, "learning_rate": 1.301398897838067e-07, "loss": 0.6699, "mean_token_accuracy": 0.7953784465789795, "num_tokens": 11765707.0, "step": 308 }, { "epoch": 0.03930797608446762, "ewc_loss": 0.0019148915307596326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9148915271216538e-06, "grad_norm": 7.424365520477295, "learning_rate": 1.3056379821958457e-07, "loss": 0.6789, "mean_token_accuracy": 0.8003882169723511, "num_tokens": 11806223.0, "step": 309 }, { "epoch": 0.039435186363058136, "ewc_loss": 0.0018469784408807755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.846978420871892e-06, "grad_norm": 6.628312110900879, "learning_rate": 1.3098770665536244e-07, "loss": 0.6332, "mean_token_accuracy": 0.8094221949577332, "num_tokens": 11845477.0, "step": 310 }, { "epoch": 0.039562396641648644, "ewc_loss": 0.0017938370583578944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7938371001946507e-06, "grad_norm": 7.012590408325195, "learning_rate": 1.3141161509114031e-07, "loss": 0.6181, "mean_token_accuracy": 0.8077732920646667, "num_tokens": 11877973.0, "step": 311 }, { "epoch": 0.03968960692023916, "ewc_loss": 0.0018060244619846344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8060244428852457e-06, "grad_norm": 6.724865436553955, "learning_rate": 1.3183552352691819e-07, "loss": 0.6258, "mean_token_accuracy": 0.8099550604820251, "num_tokens": 11912093.0, "step": 312 }, { "epoch": 0.039816817198829665, "ewc_loss": 0.0018137955339625478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8137955066777067e-06, "grad_norm": 6.307589054107666, "learning_rate": 1.3225943196269603e-07, "loss": 0.6505, "mean_token_accuracy": 0.8063380718231201, "num_tokens": 11952541.0, "step": 313 }, { "epoch": 0.03994402747742017, "ewc_loss": 0.0017999507253989577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7999507235799683e-06, "grad_norm": 6.344663143157959, "learning_rate": 1.3268334039847393e-07, "loss": 0.6278, "mean_token_accuracy": 0.808083713054657, "num_tokens": 11990414.0, "step": 314 }, { "epoch": 0.04007123775601069, "ewc_loss": 0.0018110570963472128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8110571318175062e-06, "grad_norm": 8.901090621948242, "learning_rate": 1.3310724883425178e-07, "loss": 0.6969, "mean_token_accuracy": 0.7863668203353882, "num_tokens": 12025925.0, "step": 315 }, { "epoch": 0.040198448034601195, "ewc_loss": 0.001939071575179696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9390715806366643e-06, "grad_norm": 6.67766809463501, "learning_rate": 1.3353115727002968e-07, "loss": 0.6621, "mean_token_accuracy": 0.7987579107284546, "num_tokens": 12068879.0, "step": 316 }, { "epoch": 0.04032565831319171, "ewc_loss": 0.001872166059911251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8721660808296292e-06, "grad_norm": 6.317647933959961, "learning_rate": 1.3395506570580752e-07, "loss": 0.6284, "mean_token_accuracy": 0.8079732656478882, "num_tokens": 12111931.0, "step": 317 }, { "epoch": 0.040452868591782216, "ewc_loss": 0.0018346768338233232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8346768229093868e-06, "grad_norm": 7.588438510894775, "learning_rate": 1.3437897414158542e-07, "loss": 0.6983, "mean_token_accuracy": 0.7863376140594482, "num_tokens": 12140431.0, "step": 318 }, { "epoch": 0.040580078870372724, "ewc_loss": 0.0019000008469447494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9000008251168765e-06, "grad_norm": 7.210387229919434, "learning_rate": 1.3480288257736327e-07, "loss": 0.6258, "mean_token_accuracy": 0.8082602024078369, "num_tokens": 12173932.0, "step": 319 }, { "epoch": 0.04070728914896324, "ewc_loss": 0.0019091583089902997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9091582998953527e-06, "grad_norm": 6.169902801513672, "learning_rate": 1.3522679101314117e-07, "loss": 0.6047, "mean_token_accuracy": 0.8124117851257324, "num_tokens": 12210040.0, "step": 320 }, { "epoch": 0.040834499427553746, "ewc_loss": 0.001854053814895451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8540538349043345e-06, "grad_norm": 7.294887065887451, "learning_rate": 1.35650699448919e-07, "loss": 0.637, "mean_token_accuracy": 0.8070723414421082, "num_tokens": 12248556.0, "step": 321 }, { "epoch": 0.04096170970614425, "ewc_loss": 0.0019210290629416704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.921029024742893e-06, "grad_norm": 6.9186296463012695, "learning_rate": 1.360746078846969e-07, "loss": 0.6259, "mean_token_accuracy": 0.8089361190795898, "num_tokens": 12285664.0, "step": 322 }, { "epoch": 0.04108891998473477, "ewc_loss": 0.0019219378009438515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9219378373236395e-06, "grad_norm": 9.130308151245117, "learning_rate": 1.3649851632047476e-07, "loss": 0.6796, "mean_token_accuracy": 0.7941633462905884, "num_tokens": 12326964.0, "step": 323 }, { "epoch": 0.041216130263325275, "ewc_loss": 0.00201770500279963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0177049009362236e-06, "grad_norm": 6.604972839355469, "learning_rate": 1.3692242475625266e-07, "loss": 0.5874, "mean_token_accuracy": 0.8224017024040222, "num_tokens": 12366541.0, "step": 324 }, { "epoch": 0.04134334054191579, "ewc_loss": 0.0019285958260297775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9285957932879683e-06, "grad_norm": 7.622902870178223, "learning_rate": 1.373463331920305e-07, "loss": 0.6495, "mean_token_accuracy": 0.8078440427780151, "num_tokens": 12405664.0, "step": 325 }, { "epoch": 0.0414705508205063, "ewc_loss": 0.0019459080649539828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.945908024936216e-06, "grad_norm": 7.7424421310424805, "learning_rate": 1.377702416278084e-07, "loss": 0.7141, "mean_token_accuracy": 0.7843530178070068, "num_tokens": 12445039.0, "step": 326 }, { "epoch": 0.041597761099096804, "ewc_loss": 0.0019720729906111956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.972073050637846e-06, "grad_norm": 6.069230556488037, "learning_rate": 1.3819415006358625e-07, "loss": 0.5837, "mean_token_accuracy": 0.8217605352401733, "num_tokens": 12485481.0, "step": 327 }, { "epoch": 0.04172497137768732, "ewc_loss": 0.0018918284913524985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8918284467872581e-06, "grad_norm": 7.640346527099609, "learning_rate": 1.3861805849936415e-07, "loss": 0.6091, "mean_token_accuracy": 0.81722491979599, "num_tokens": 12530272.0, "step": 328 }, { "epoch": 0.041852181656277826, "ewc_loss": 0.0019711321219801903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9711321783688618e-06, "grad_norm": 7.04503059387207, "learning_rate": 1.39041966935142e-07, "loss": 0.6556, "mean_token_accuracy": 0.8006364703178406, "num_tokens": 12565854.0, "step": 329 }, { "epoch": 0.04197939193486834, "ewc_loss": 0.0019718403927981853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9718404473678675e-06, "grad_norm": 6.055924892425537, "learning_rate": 1.394658753709199e-07, "loss": 0.6406, "mean_token_accuracy": 0.8068730235099792, "num_tokens": 12607313.0, "step": 330 }, { "epoch": 0.04210660221345885, "ewc_loss": 0.0019171432359144092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.917143208629568e-06, "grad_norm": 6.662609577178955, "learning_rate": 1.3988978380669774e-07, "loss": 0.6781, "mean_token_accuracy": 0.7946599721908569, "num_tokens": 12642057.0, "step": 331 }, { "epoch": 0.042233812492049355, "ewc_loss": 0.0019623839762061834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9623839762061834e-06, "grad_norm": 6.599899768829346, "learning_rate": 1.403136922424756e-07, "loss": 0.6695, "mean_token_accuracy": 0.7968779802322388, "num_tokens": 12684118.0, "step": 332 }, { "epoch": 0.04236102277063987, "ewc_loss": 0.0019778998102992773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.977899728444754e-06, "grad_norm": 7.1766791343688965, "learning_rate": 1.4073760067825348e-07, "loss": 0.6345, "mean_token_accuracy": 0.8061606287956238, "num_tokens": 12721945.0, "step": 333 }, { "epoch": 0.04248823304923038, "ewc_loss": 0.0020145534072071314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0145535017945804e-06, "grad_norm": 5.975497245788574, "learning_rate": 1.4116150911403136e-07, "loss": 0.6482, "mean_token_accuracy": 0.802544116973877, "num_tokens": 12764175.0, "step": 334 }, { "epoch": 0.04261544332782089, "ewc_loss": 0.0019586514681577682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9586514099501073e-06, "grad_norm": 6.332671642303467, "learning_rate": 1.4158541754980923e-07, "loss": 0.599, "mean_token_accuracy": 0.8187001943588257, "num_tokens": 12801600.0, "step": 335 }, { "epoch": 0.0427426536064114, "ewc_loss": 0.001985313603654504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9853137018799316e-06, "grad_norm": 6.112852096557617, "learning_rate": 1.420093259855871e-07, "loss": 0.6303, "mean_token_accuracy": 0.8137680888175964, "num_tokens": 12843584.0, "step": 336 }, { "epoch": 0.042869863885001906, "ewc_loss": 0.0019879720639437437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.987972154893214e-06, "grad_norm": 6.8991923332214355, "learning_rate": 1.4243323442136497e-07, "loss": 0.645, "mean_token_accuracy": 0.803943932056427, "num_tokens": 12878105.0, "step": 337 }, { "epoch": 0.04299707416359242, "ewc_loss": 0.002049380214884877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0493803276622202e-06, "grad_norm": 6.622918605804443, "learning_rate": 1.4285714285714285e-07, "loss": 0.653, "mean_token_accuracy": 0.8020816445350647, "num_tokens": 12911712.0, "step": 338 }, { "epoch": 0.04312428444218293, "ewc_loss": 0.0020511983893811703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0511984075710643e-06, "grad_norm": 6.513576030731201, "learning_rate": 1.4328105129292072e-07, "loss": 0.6327, "mean_token_accuracy": 0.8090010285377502, "num_tokens": 12951303.0, "step": 339 }, { "epoch": 0.043251494720773435, "ewc_loss": 0.002046524314209819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0465242869249778e-06, "grad_norm": 6.68846321105957, "learning_rate": 1.437049597286986e-07, "loss": 0.6201, "mean_token_accuracy": 0.8116731643676758, "num_tokens": 12990544.0, "step": 340 }, { "epoch": 0.04337870499936395, "ewc_loss": 0.002062996616587043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.062996600216138e-06, "grad_norm": 8.54035472869873, "learning_rate": 1.4412886816447646e-07, "loss": 0.68, "mean_token_accuracy": 0.7934337258338928, "num_tokens": 13023766.0, "step": 341 }, { "epoch": 0.04350591527795446, "ewc_loss": 0.002169295446947217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1692953851015773e-06, "grad_norm": 6.9131083488464355, "learning_rate": 1.4455277660025434e-07, "loss": 0.5938, "mean_token_accuracy": 0.8189109563827515, "num_tokens": 13056337.0, "step": 342 }, { "epoch": 0.04363312555654497, "ewc_loss": 0.002108075888827443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.108075932483189e-06, "grad_norm": 7.829215049743652, "learning_rate": 1.449766850360322e-07, "loss": 0.6512, "mean_token_accuracy": 0.8025189638137817, "num_tokens": 13097345.0, "step": 343 }, { "epoch": 0.04376033583513548, "ewc_loss": 0.002130653243511915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.13065322896e-06, "grad_norm": 8.20564079284668, "learning_rate": 1.4540059347181008e-07, "loss": 0.6372, "mean_token_accuracy": 0.8061186075210571, "num_tokens": 13127332.0, "step": 344 }, { "epoch": 0.043887546113725986, "ewc_loss": 0.0021636837627738714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1636838027916383e-06, "grad_norm": 6.542110919952393, "learning_rate": 1.4582450190758795e-07, "loss": 0.666, "mean_token_accuracy": 0.7966707944869995, "num_tokens": 13170760.0, "step": 345 }, { "epoch": 0.0440147563923165, "ewc_loss": 0.0020736868027597666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.073686800940777e-06, "grad_norm": 7.507187843322754, "learning_rate": 1.4624841034336583e-07, "loss": 0.6351, "mean_token_accuracy": 0.8087153434753418, "num_tokens": 13207061.0, "step": 346 }, { "epoch": 0.04414196667090701, "ewc_loss": 0.002119968878105283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1199689399509225e-06, "grad_norm": 6.7656707763671875, "learning_rate": 1.466723187791437e-07, "loss": 0.6029, "mean_token_accuracy": 0.816129744052887, "num_tokens": 13247903.0, "step": 347 }, { "epoch": 0.04426917694949752, "ewc_loss": 0.0021005221642553806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1005221242376138e-06, "grad_norm": 6.4038310050964355, "learning_rate": 1.4709622721492157e-07, "loss": 0.6132, "mean_token_accuracy": 0.8087236881256104, "num_tokens": 13287193.0, "step": 348 }, { "epoch": 0.04439638722808803, "ewc_loss": 0.00208260677754879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.082606670228415e-06, "grad_norm": 6.929994583129883, "learning_rate": 1.4752013565069942e-07, "loss": 0.6177, "mean_token_accuracy": 0.8112128973007202, "num_tokens": 13323112.0, "step": 349 }, { "epoch": 0.04452359750667854, "ewc_loss": 0.0021331810858100653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.133181169483578e-06, "grad_norm": 7.8121442794799805, "learning_rate": 1.4794404408647732e-07, "loss": 0.6136, "mean_token_accuracy": 0.8141175508499146, "num_tokens": 13359653.0, "step": 350 }, { "epoch": 0.04465080778526905, "ewc_loss": 0.0021955007687211037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1955006559437606e-06, "grad_norm": 6.729165554046631, "learning_rate": 1.4836795252225516e-07, "loss": 0.5845, "mean_token_accuracy": 0.8188825845718384, "num_tokens": 13400571.0, "step": 351 }, { "epoch": 0.04477801806385956, "ewc_loss": 0.002132739871740341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1327398371795425e-06, "grad_norm": 7.526705741882324, "learning_rate": 1.4879186095803306e-07, "loss": 0.5627, "mean_token_accuracy": 0.8224931359291077, "num_tokens": 13437656.0, "step": 352 }, { "epoch": 0.04490522834245007, "ewc_loss": 0.002166321501135826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1663215648004552e-06, "grad_norm": 6.628541946411133, "learning_rate": 1.492157693938109e-07, "loss": 0.6214, "mean_token_accuracy": 0.8100993037223816, "num_tokens": 13474688.0, "step": 353 }, { "epoch": 0.04503243862104058, "ewc_loss": 0.0021338583901524544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1338582882890478e-06, "grad_norm": 6.015209674835205, "learning_rate": 1.496396778295888e-07, "loss": 0.5584, "mean_token_accuracy": 0.8274439573287964, "num_tokens": 13518256.0, "step": 354 }, { "epoch": 0.04515964889963109, "ewc_loss": 0.0020989554468542337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.09895551961381e-06, "grad_norm": 6.59260368347168, "learning_rate": 1.5006358626536665e-07, "loss": 0.6455, "mean_token_accuracy": 0.8046676516532898, "num_tokens": 13554315.0, "step": 355 }, { "epoch": 0.0452868591782216, "ewc_loss": 0.002154685091227293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1546850348386215e-06, "grad_norm": 6.913581848144531, "learning_rate": 1.5048749470114455e-07, "loss": 0.6677, "mean_token_accuracy": 0.7976675629615784, "num_tokens": 13594388.0, "step": 356 }, { "epoch": 0.04541406945681211, "ewc_loss": 0.0022011955734342337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2011956843925873e-06, "grad_norm": 6.800996780395508, "learning_rate": 1.509114031369224e-07, "loss": 0.5928, "mean_token_accuracy": 0.816232442855835, "num_tokens": 13633704.0, "step": 357 }, { "epoch": 0.04554127973540262, "ewc_loss": 0.0022021608892828226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.202160885644844e-06, "grad_norm": 6.829141616821289, "learning_rate": 1.513353115727003e-07, "loss": 0.6434, "mean_token_accuracy": 0.7995734214782715, "num_tokens": 13671183.0, "step": 358 }, { "epoch": 0.04566849001399313, "ewc_loss": 0.0021985035855323076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1985035800753394e-06, "grad_norm": 7.020492076873779, "learning_rate": 1.5175922000847814e-07, "loss": 0.598, "mean_token_accuracy": 0.8168132305145264, "num_tokens": 13709640.0, "step": 359 }, { "epoch": 0.04579570029258364, "ewc_loss": 0.002219161484390497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.219161387984059e-06, "grad_norm": 7.077926158905029, "learning_rate": 1.5218312844425604e-07, "loss": 0.62, "mean_token_accuracy": 0.8036713600158691, "num_tokens": 13751114.0, "step": 360 }, { "epoch": 0.045922910571174154, "ewc_loss": 0.0022310565691441298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.231056669188547e-06, "grad_norm": 6.5631794929504395, "learning_rate": 1.526070368800339e-07, "loss": 0.5506, "mean_token_accuracy": 0.8247643113136292, "num_tokens": 13786448.0, "step": 361 }, { "epoch": 0.04605012084976466, "ewc_loss": 0.002195644425228238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1956443561066408e-06, "grad_norm": 6.6812286376953125, "learning_rate": 1.530309453158118e-07, "loss": 0.574, "mean_token_accuracy": 0.8267040848731995, "num_tokens": 13827663.0, "step": 362 }, { "epoch": 0.04617733112835517, "ewc_loss": 0.002207459881901741, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2074598291510483e-06, "grad_norm": 7.143743991851807, "learning_rate": 1.5345485375158963e-07, "loss": 0.6527, "mean_token_accuracy": 0.8011759519577026, "num_tokens": 13866078.0, "step": 363 }, { "epoch": 0.04630454140694568, "ewc_loss": 0.0022517202887684107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.251720388812828e-06, "grad_norm": 6.650269508361816, "learning_rate": 1.5387876218736753e-07, "loss": 0.5301, "mean_token_accuracy": 0.8349038362503052, "num_tokens": 13909767.0, "step": 364 }, { "epoch": 0.04643175168553619, "ewc_loss": 0.0022140967193990946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.214096639363561e-06, "grad_norm": 7.023141384124756, "learning_rate": 1.5430267062314538e-07, "loss": 0.6389, "mean_token_accuracy": 0.8073136806488037, "num_tokens": 13948937.0, "step": 365 }, { "epoch": 0.0465589619641267, "ewc_loss": 0.002237614942714572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2376150354830315e-06, "grad_norm": 7.145341396331787, "learning_rate": 1.5472657905892328e-07, "loss": 0.649, "mean_token_accuracy": 0.8007035851478577, "num_tokens": 13984039.0, "step": 366 }, { "epoch": 0.04668617224271721, "ewc_loss": 0.002257978543639183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2579786218557274e-06, "grad_norm": 7.068796157836914, "learning_rate": 1.5515048749470113e-07, "loss": 0.6322, "mean_token_accuracy": 0.8052188158035278, "num_tokens": 14018162.0, "step": 367 }, { "epoch": 0.04681338252130772, "ewc_loss": 0.002252490958198905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.252490958198905e-06, "grad_norm": 6.897879123687744, "learning_rate": 1.55574395930479e-07, "loss": 0.6318, "mean_token_accuracy": 0.806551456451416, "num_tokens": 14056493.0, "step": 368 }, { "epoch": 0.046940592799898234, "ewc_loss": 0.002241854090243578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.241854190287995e-06, "grad_norm": 6.8616132736206055, "learning_rate": 1.5599830436625687e-07, "loss": 0.6255, "mean_token_accuracy": 0.8091020584106445, "num_tokens": 14097530.0, "step": 369 }, { "epoch": 0.04706780307848874, "ewc_loss": 0.0022470185067504644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2470185285783373e-06, "grad_norm": 6.716677188873291, "learning_rate": 1.5642221280203474e-07, "loss": 0.6438, "mean_token_accuracy": 0.8077830672264099, "num_tokens": 14136240.0, "step": 370 }, { "epoch": 0.04719501335707925, "ewc_loss": 0.0022458492312580347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2458491457655327e-06, "grad_norm": 6.685718059539795, "learning_rate": 1.5684612123781262e-07, "loss": 0.5541, "mean_token_accuracy": 0.828030526638031, "num_tokens": 14171010.0, "step": 371 }, { "epoch": 0.04732222363566976, "ewc_loss": 0.002257451880723238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2574517970497254e-06, "grad_norm": 6.765822410583496, "learning_rate": 1.572700296735905e-07, "loss": 0.6249, "mean_token_accuracy": 0.8146151900291443, "num_tokens": 14206985.0, "step": 372 }, { "epoch": 0.04744943391426027, "ewc_loss": 0.0022719362750649452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2719361822964856e-06, "grad_norm": 7.1514058113098145, "learning_rate": 1.576939381093684e-07, "loss": 0.6241, "mean_token_accuracy": 0.8057581186294556, "num_tokens": 14236728.0, "step": 373 }, { "epoch": 0.047576644192850785, "ewc_loss": 0.0023134243674576283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.313424374733586e-06, "grad_norm": 6.61872673034668, "learning_rate": 1.5811784654514623e-07, "loss": 0.5507, "mean_token_accuracy": 0.8307564854621887, "num_tokens": 14270758.0, "step": 374 }, { "epoch": 0.04770385447144129, "ewc_loss": 0.002284290501847863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2842905309516937e-06, "grad_norm": 6.720442295074463, "learning_rate": 1.5854175498092413e-07, "loss": 0.6683, "mean_token_accuracy": 0.7930923700332642, "num_tokens": 14315002.0, "step": 375 }, { "epoch": 0.0478310647500318, "ewc_loss": 0.002284399466589093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.284399442942231e-06, "grad_norm": 7.328858375549316, "learning_rate": 1.5896566341670198e-07, "loss": 0.6389, "mean_token_accuracy": 0.8013705015182495, "num_tokens": 14353385.0, "step": 376 }, { "epoch": 0.047958275028622314, "ewc_loss": 0.0023393610026687384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.339361117265071e-06, "grad_norm": 7.090360164642334, "learning_rate": 1.5938957185247988e-07, "loss": 0.6714, "mean_token_accuracy": 0.7912445664405823, "num_tokens": 14391702.0, "step": 377 }, { "epoch": 0.04808548530721282, "ewc_loss": 0.0023344128858298063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3344127839663997e-06, "grad_norm": 7.558786869049072, "learning_rate": 1.5981348028825772e-07, "loss": 0.6394, "mean_token_accuracy": 0.8042712211608887, "num_tokens": 14427876.0, "step": 378 }, { "epoch": 0.048212695585803336, "ewc_loss": 0.002360972575843334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.360972530368599e-06, "grad_norm": 7.771438121795654, "learning_rate": 1.6023738872403562e-07, "loss": 0.7266, "mean_token_accuracy": 0.777188777923584, "num_tokens": 14460349.0, "step": 379 }, { "epoch": 0.048339905864393844, "ewc_loss": 0.0023682645987719297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.368264631513739e-06, "grad_norm": 6.159975528717041, "learning_rate": 1.6066129715981347e-07, "loss": 0.5755, "mean_token_accuracy": 0.8248581886291504, "num_tokens": 14497763.0, "step": 380 }, { "epoch": 0.04846711614298435, "ewc_loss": 0.0022619476076215506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2619476567342645e-06, "grad_norm": 6.302398204803467, "learning_rate": 1.6108520559559137e-07, "loss": 0.5689, "mean_token_accuracy": 0.8221918940544128, "num_tokens": 14537172.0, "step": 381 }, { "epoch": 0.048594326421574865, "ewc_loss": 0.0022997334599494934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2997335236141225e-06, "grad_norm": 7.339661121368408, "learning_rate": 1.6150911403136921e-07, "loss": 0.6566, "mean_token_accuracy": 0.803590714931488, "num_tokens": 14565515.0, "step": 382 }, { "epoch": 0.04872153670016537, "ewc_loss": 0.002407807158306241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4078071874100715e-06, "grad_norm": 6.312252044677734, "learning_rate": 1.619330224671471e-07, "loss": 0.6471, "mean_token_accuracy": 0.8038773536682129, "num_tokens": 14608104.0, "step": 383 }, { "epoch": 0.04884874697875588, "ewc_loss": 0.0023312431294471025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.331243194930721e-06, "grad_norm": 7.480045318603516, "learning_rate": 1.6235693090292496e-07, "loss": 0.6545, "mean_token_accuracy": 0.8044100999832153, "num_tokens": 14645328.0, "step": 384 }, { "epoch": 0.048975957257346395, "ewc_loss": 0.0024141224566847086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4141224912455073e-06, "grad_norm": 7.158812046051025, "learning_rate": 1.6278083933870286e-07, "loss": 0.6895, "mean_token_accuracy": 0.7864998579025269, "num_tokens": 14678792.0, "step": 385 }, { "epoch": 0.0491031675359369, "ewc_loss": 0.0024042297154664993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4042296900006477e-06, "grad_norm": 6.553189754486084, "learning_rate": 1.632047477744807e-07, "loss": 0.6122, "mean_token_accuracy": 0.8128995299339294, "num_tokens": 14715095.0, "step": 386 }, { "epoch": 0.049230377814527417, "ewc_loss": 0.002356599783524871, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.356599679842475e-06, "grad_norm": 6.548966407775879, "learning_rate": 1.6362865621025858e-07, "loss": 0.6311, "mean_token_accuracy": 0.8085098266601562, "num_tokens": 14753641.0, "step": 387 }, { "epoch": 0.049357588093117924, "ewc_loss": 0.0023788248654454947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3788247744960245e-06, "grad_norm": 6.994729042053223, "learning_rate": 1.6405256464603645e-07, "loss": 0.6028, "mean_token_accuracy": 0.8138253688812256, "num_tokens": 14788816.0, "step": 388 }, { "epoch": 0.04948479837170843, "ewc_loss": 0.002428189618512988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.428189645797829e-06, "grad_norm": 7.424833297729492, "learning_rate": 1.6447647308181432e-07, "loss": 0.647, "mean_token_accuracy": 0.8009893894195557, "num_tokens": 14821607.0, "step": 389 }, { "epoch": 0.049612008650298946, "ewc_loss": 0.002465070690959692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4650707928230986e-06, "grad_norm": 6.509518146514893, "learning_rate": 1.649003815175922e-07, "loss": 0.6641, "mean_token_accuracy": 0.799027144908905, "num_tokens": 14861298.0, "step": 390 }, { "epoch": 0.04973921892888945, "ewc_loss": 0.0023911045864224434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.391104544585687e-06, "grad_norm": 6.822322845458984, "learning_rate": 1.6532428995337007e-07, "loss": 0.5811, "mean_token_accuracy": 0.8191924095153809, "num_tokens": 14899951.0, "step": 391 }, { "epoch": 0.04986642920747997, "ewc_loss": 0.0024148235097527504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4148234842868987e-06, "grad_norm": 6.311705589294434, "learning_rate": 1.6574819838914794e-07, "loss": 0.5607, "mean_token_accuracy": 0.827603816986084, "num_tokens": 14937169.0, "step": 392 }, { "epoch": 0.049993639486070475, "ewc_loss": 0.002389676868915558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.389676865277579e-06, "grad_norm": 6.831667900085449, "learning_rate": 1.661721068249258e-07, "loss": 0.6147, "mean_token_accuracy": 0.8113434314727783, "num_tokens": 14975549.0, "step": 393 }, { "epoch": 0.05012084976466098, "ewc_loss": 0.002459877636283636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.459877578075975e-06, "grad_norm": 6.79219388961792, "learning_rate": 1.6659601526070368e-07, "loss": 0.6415, "mean_token_accuracy": 0.8049459457397461, "num_tokens": 15016630.0, "step": 394 }, { "epoch": 0.0502480600432515, "ewc_loss": 0.0024684963282197714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4684964046173263e-06, "grad_norm": 6.697468280792236, "learning_rate": 1.6701992369648156e-07, "loss": 0.623, "mean_token_accuracy": 0.8073617815971375, "num_tokens": 15057822.0, "step": 395 }, { "epoch": 0.050375270321842004, "ewc_loss": 0.002449300605803728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.449300609441707e-06, "grad_norm": 8.050454139709473, "learning_rate": 1.6744383213225943e-07, "loss": 0.5966, "mean_token_accuracy": 0.8112605810165405, "num_tokens": 15094385.0, "step": 396 }, { "epoch": 0.05050248060043251, "ewc_loss": 0.0025540590286254883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.554058937676018e-06, "grad_norm": 6.9437971115112305, "learning_rate": 1.678677405680373e-07, "loss": 0.556, "mean_token_accuracy": 0.8239841461181641, "num_tokens": 15130969.0, "step": 397 }, { "epoch": 0.050629690879023026, "ewc_loss": 0.0024767322465777397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.476732333889231e-06, "grad_norm": 7.141929626464844, "learning_rate": 1.6829164900381518e-07, "loss": 0.5767, "mean_token_accuracy": 0.8181336522102356, "num_tokens": 15166586.0, "step": 398 }, { "epoch": 0.05075690115761353, "ewc_loss": 0.0024706083349883556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.470608251314843e-06, "grad_norm": 6.729183197021484, "learning_rate": 1.6871555743959305e-07, "loss": 0.5668, "mean_token_accuracy": 0.8211676478385925, "num_tokens": 15209603.0, "step": 399 }, { "epoch": 0.05088411143620405, "ewc_loss": 0.00244282023049891, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4428202323178994e-06, "grad_norm": 6.4904046058654785, "learning_rate": 1.6913946587537092e-07, "loss": 0.5839, "mean_token_accuracy": 0.8175663948059082, "num_tokens": 15251542.0, "step": 400 }, { "epoch": 0.051011321714794555, "ewc_loss": 0.0024323316756635904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4323317120433785e-06, "grad_norm": 6.596295356750488, "learning_rate": 1.695633743111488e-07, "loss": 0.5987, "mean_token_accuracy": 0.8172827959060669, "num_tokens": 15288940.0, "step": 401 }, { "epoch": 0.05113853199338506, "ewc_loss": 0.0024626648519188166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4626649519632338e-06, "grad_norm": 6.714887619018555, "learning_rate": 1.6998728274692667e-07, "loss": 0.5659, "mean_token_accuracy": 0.8277921676635742, "num_tokens": 15321965.0, "step": 402 }, { "epoch": 0.05126574227197558, "ewc_loss": 0.0024849423207342625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4849423425621353e-06, "grad_norm": 6.825057029724121, "learning_rate": 1.7041119118270454e-07, "loss": 0.5734, "mean_token_accuracy": 0.8237584233283997, "num_tokens": 15362895.0, "step": 403 }, { "epoch": 0.051392952550566084, "ewc_loss": 0.002487988444045186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4879884676920483e-06, "grad_norm": 7.512076377868652, "learning_rate": 1.7083509961848238e-07, "loss": 0.6337, "mean_token_accuracy": 0.8034713268280029, "num_tokens": 15396675.0, "step": 404 }, { "epoch": 0.0515201628291566, "ewc_loss": 0.002541037043556571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5410370199097088e-06, "grad_norm": 6.698177337646484, "learning_rate": 1.7125900805426028e-07, "loss": 0.6537, "mean_token_accuracy": 0.8004517555236816, "num_tokens": 15443070.0, "step": 405 }, { "epoch": 0.051647373107747106, "ewc_loss": 0.0024737645871937275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.473764652677346e-06, "grad_norm": 6.797305107116699, "learning_rate": 1.7168291649003813e-07, "loss": 0.5814, "mean_token_accuracy": 0.8186070322990417, "num_tokens": 15476747.0, "step": 406 }, { "epoch": 0.051774583386337614, "ewc_loss": 0.0024842657148838043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4842656785040163e-06, "grad_norm": 7.0231032371521, "learning_rate": 1.7210682492581603e-07, "loss": 0.6063, "mean_token_accuracy": 0.8136448860168457, "num_tokens": 15514866.0, "step": 407 }, { "epoch": 0.05190179366492813, "ewc_loss": 0.002511126222088933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.511126240278827e-06, "grad_norm": 6.6753830909729, "learning_rate": 1.7253073336159387e-07, "loss": 0.583, "mean_token_accuracy": 0.8178266286849976, "num_tokens": 15554245.0, "step": 408 }, { "epoch": 0.052029003943518635, "ewc_loss": 0.00248787016607821, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.487870233380818e-06, "grad_norm": 6.766290664672852, "learning_rate": 1.7295464179737177e-07, "loss": 0.6114, "mean_token_accuracy": 0.813146710395813, "num_tokens": 15593958.0, "step": 409 }, { "epoch": 0.05215621422210915, "ewc_loss": 0.002506035612896085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5060355710593285e-06, "grad_norm": 6.519018173217773, "learning_rate": 1.7337855023314962e-07, "loss": 0.5567, "mean_token_accuracy": 0.8261755108833313, "num_tokens": 15632567.0, "step": 410 }, { "epoch": 0.05228342450069966, "ewc_loss": 0.002491754712536931, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.49175468525209e-06, "grad_norm": 7.214050769805908, "learning_rate": 1.7380245866892752e-07, "loss": 0.5671, "mean_token_accuracy": 0.8224152326583862, "num_tokens": 15668255.0, "step": 411 }, { "epoch": 0.052410634779290165, "ewc_loss": 0.002559581073001027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.559581162131508e-06, "grad_norm": 7.380876541137695, "learning_rate": 1.7422636710470536e-07, "loss": 0.6202, "mean_token_accuracy": 0.807479202747345, "num_tokens": 15705435.0, "step": 412 }, { "epoch": 0.05253784505788068, "ewc_loss": 0.0025671052280813456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5671051844256e-06, "grad_norm": 6.995377540588379, "learning_rate": 1.7465027554048326e-07, "loss": 0.6402, "mean_token_accuracy": 0.8025685548782349, "num_tokens": 15744657.0, "step": 413 }, { "epoch": 0.05266505533647119, "ewc_loss": 0.002522429684177041, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5224296678061364e-06, "grad_norm": 6.685955047607422, "learning_rate": 1.750741839762611e-07, "loss": 0.5823, "mean_token_accuracy": 0.8218557238578796, "num_tokens": 15777374.0, "step": 414 }, { "epoch": 0.052792265615061694, "ewc_loss": 0.0025093210861086845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.509321120669483e-06, "grad_norm": 6.793571472167969, "learning_rate": 1.75498092412039e-07, "loss": 0.5553, "mean_token_accuracy": 0.8282347917556763, "num_tokens": 15814047.0, "step": 415 }, { "epoch": 0.05291947589365221, "ewc_loss": 0.0025356272235512733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.535627118049888e-06, "grad_norm": 6.6360063552856445, "learning_rate": 1.7592200084781686e-07, "loss": 0.6375, "mean_token_accuracy": 0.8031055927276611, "num_tokens": 15849110.0, "step": 416 }, { "epoch": 0.053046686172242716, "ewc_loss": 0.002531267236918211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.531267227823264e-06, "grad_norm": 6.817892551422119, "learning_rate": 1.7634590928359475e-07, "loss": 0.6069, "mean_token_accuracy": 0.8154351711273193, "num_tokens": 15888910.0, "step": 417 }, { "epoch": 0.05317389645083323, "ewc_loss": 0.0025547684635967016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5547685709170764e-06, "grad_norm": 7.031945705413818, "learning_rate": 1.767698177193726e-07, "loss": 0.5938, "mean_token_accuracy": 0.8163715600967407, "num_tokens": 15925604.0, "step": 418 }, { "epoch": 0.05330110672942374, "ewc_loss": 0.002579152351245284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.579152351245284e-06, "grad_norm": 6.6187896728515625, "learning_rate": 1.771937261551505e-07, "loss": 0.5671, "mean_token_accuracy": 0.8298128247261047, "num_tokens": 15961401.0, "step": 419 }, { "epoch": 0.053428317008014245, "ewc_loss": 0.0025439916644245386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5439917408220936e-06, "grad_norm": 6.668218612670898, "learning_rate": 1.7761763459092835e-07, "loss": 0.5973, "mean_token_accuracy": 0.8157391548156738, "num_tokens": 16001029.0, "step": 420 }, { "epoch": 0.05355552728660476, "ewc_loss": 0.0025597636122256517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5597635158192134e-06, "grad_norm": 6.529715061187744, "learning_rate": 1.7804154302670624e-07, "loss": 0.5789, "mean_token_accuracy": 0.821397066116333, "num_tokens": 16036014.0, "step": 421 }, { "epoch": 0.05368273756519527, "ewc_loss": 0.002564039546996355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5640395051595988e-06, "grad_norm": 7.384525775909424, "learning_rate": 1.784654514624841e-07, "loss": 0.5613, "mean_token_accuracy": 0.8233770728111267, "num_tokens": 16075373.0, "step": 422 }, { "epoch": 0.05380994784378578, "ewc_loss": 0.002646095585078001, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.646095481395605e-06, "grad_norm": 6.896103382110596, "learning_rate": 1.7888935989826196e-07, "loss": 0.6028, "mean_token_accuracy": 0.8100244998931885, "num_tokens": 16111352.0, "step": 423 }, { "epoch": 0.05393715812237629, "ewc_loss": 0.0025887719821184874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5887720767059363e-06, "grad_norm": 6.564666271209717, "learning_rate": 1.7931326833403984e-07, "loss": 0.6196, "mean_token_accuracy": 0.8080500364303589, "num_tokens": 16148826.0, "step": 424 }, { "epoch": 0.054064368400966796, "ewc_loss": 0.002559772925451398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5597728381399065e-06, "grad_norm": 6.452038288116455, "learning_rate": 1.797371767698177e-07, "loss": 0.5258, "mean_token_accuracy": 0.8377149105072021, "num_tokens": 16193491.0, "step": 425 }, { "epoch": 0.05419157867955731, "ewc_loss": 0.002569893142208457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5698932404338848e-06, "grad_norm": 7.304237365722656, "learning_rate": 1.8016108520559558e-07, "loss": 0.5766, "mean_token_accuracy": 0.8221750855445862, "num_tokens": 16235595.0, "step": 426 }, { "epoch": 0.05431878895814782, "ewc_loss": 0.0026630156207829714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6630157208273886e-06, "grad_norm": 7.158932685852051, "learning_rate": 1.8058499364137345e-07, "loss": 0.631, "mean_token_accuracy": 0.810440719127655, "num_tokens": 16274105.0, "step": 427 }, { "epoch": 0.054445999236738325, "ewc_loss": 0.0026370545383542776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6370546493126312e-06, "grad_norm": 6.815001010894775, "learning_rate": 1.8100890207715133e-07, "loss": 0.674, "mean_token_accuracy": 0.790015459060669, "num_tokens": 16315886.0, "step": 428 }, { "epoch": 0.05457320951532884, "ewc_loss": 0.0025949657429009676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.59496573562501e-06, "grad_norm": 6.736267566680908, "learning_rate": 1.814328105129292e-07, "loss": 0.6149, "mean_token_accuracy": 0.8090880513191223, "num_tokens": 16353021.0, "step": 429 }, { "epoch": 0.05470041979391935, "ewc_loss": 0.00260252901352942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6025290935649537e-06, "grad_norm": 7.3956074714660645, "learning_rate": 1.8185671894870707e-07, "loss": 0.59, "mean_token_accuracy": 0.8140595555305481, "num_tokens": 16380887.0, "step": 430 }, { "epoch": 0.05482763007250986, "ewc_loss": 0.0026883629616349936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.688362883418449e-06, "grad_norm": 6.713736057281494, "learning_rate": 1.8228062738448494e-07, "loss": 0.6463, "mean_token_accuracy": 0.8017617464065552, "num_tokens": 16421066.0, "step": 431 }, { "epoch": 0.05495484035110037, "ewc_loss": 0.0026247447822242975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.62474486589781e-06, "grad_norm": 7.545395374298096, "learning_rate": 1.8270453582026282e-07, "loss": 0.6152, "mean_token_accuracy": 0.8088387250900269, "num_tokens": 16457197.0, "step": 432 }, { "epoch": 0.055082050629690876, "ewc_loss": 0.002692432841286063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.692432872208883e-06, "grad_norm": 6.454502582550049, "learning_rate": 1.831284442560407e-07, "loss": 0.5689, "mean_token_accuracy": 0.8221576809883118, "num_tokens": 16497385.0, "step": 433 }, { "epoch": 0.05520926090828139, "ewc_loss": 0.002602153457701206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6021534722531214e-06, "grad_norm": 6.52385950088501, "learning_rate": 1.8355235269181856e-07, "loss": 0.584, "mean_token_accuracy": 0.8206011056900024, "num_tokens": 16540099.0, "step": 434 }, { "epoch": 0.0553364711868719, "ewc_loss": 0.0026279476005584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6279476514901035e-06, "grad_norm": 6.89023494720459, "learning_rate": 1.8397626112759643e-07, "loss": 0.5821, "mean_token_accuracy": 0.8192732930183411, "num_tokens": 16572760.0, "step": 435 }, { "epoch": 0.05546368146546241, "ewc_loss": 0.0026892770547419786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6892771529674064e-06, "grad_norm": 7.148346900939941, "learning_rate": 1.844001695633743e-07, "loss": 0.541, "mean_token_accuracy": 0.8306475877761841, "num_tokens": 16608549.0, "step": 436 }, { "epoch": 0.05559089174405292, "ewc_loss": 0.002703828504309058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7038286134484224e-06, "grad_norm": 7.322117328643799, "learning_rate": 1.8482407799915218e-07, "loss": 0.5797, "mean_token_accuracy": 0.8208547830581665, "num_tokens": 16643800.0, "step": 437 }, { "epoch": 0.05571810202264343, "ewc_loss": 0.002710967790335417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.710967692109989e-06, "grad_norm": 7.91243314743042, "learning_rate": 1.8524798643493005e-07, "loss": 0.6522, "mean_token_accuracy": 0.7966675758361816, "num_tokens": 16677948.0, "step": 438 }, { "epoch": 0.05584531230123394, "ewc_loss": 0.002770066261291504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.770066203083843e-06, "grad_norm": 6.813676834106445, "learning_rate": 1.8567189487070792e-07, "loss": 0.6242, "mean_token_accuracy": 0.8094488978385925, "num_tokens": 16716905.0, "step": 439 }, { "epoch": 0.05597252257982445, "ewc_loss": 0.002662880113348365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6628802061168244e-06, "grad_norm": 6.537591934204102, "learning_rate": 1.8609580330648577e-07, "loss": 0.5357, "mean_token_accuracy": 0.8325480222702026, "num_tokens": 16756158.0, "step": 440 }, { "epoch": 0.05609973285841496, "ewc_loss": 0.0026469745207577944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6469745080248686e-06, "grad_norm": 7.160908222198486, "learning_rate": 1.8651971174226367e-07, "loss": 0.6274, "mean_token_accuracy": 0.8093438744544983, "num_tokens": 16794261.0, "step": 441 }, { "epoch": 0.05622694313700547, "ewc_loss": 0.00273348786868155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.733487917794264e-06, "grad_norm": 6.520073890686035, "learning_rate": 1.8694362017804152e-07, "loss": 0.5839, "mean_token_accuracy": 0.8177218437194824, "num_tokens": 16834953.0, "step": 442 }, { "epoch": 0.05635415341559598, "ewc_loss": 0.002669048495590687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6690483991842484e-06, "grad_norm": 7.075374126434326, "learning_rate": 1.8736752861381941e-07, "loss": 0.5424, "mean_token_accuracy": 0.8324819803237915, "num_tokens": 16873954.0, "step": 443 }, { "epoch": 0.05648136369418649, "ewc_loss": 0.002729530679062009, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.72953070634685e-06, "grad_norm": 7.044656276702881, "learning_rate": 1.8779143704959726e-07, "loss": 0.566, "mean_token_accuracy": 0.8210129141807556, "num_tokens": 16906018.0, "step": 444 }, { "epoch": 0.056608573972777, "ewc_loss": 0.002728249877691269, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.728249910433078e-06, "grad_norm": 6.9435296058654785, "learning_rate": 1.8821534548537516e-07, "loss": 0.6202, "mean_token_accuracy": 0.8090937733650208, "num_tokens": 16938945.0, "step": 445 }, { "epoch": 0.05673578425136751, "ewc_loss": 0.0027212421409785748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7212420263822423e-06, "grad_norm": 7.219270706176758, "learning_rate": 1.88639253921153e-07, "loss": 0.5773, "mean_token_accuracy": 0.8230530023574829, "num_tokens": 16982214.0, "step": 446 }, { "epoch": 0.05686299452995802, "ewc_loss": 0.002750520361587405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.750520252448041e-06, "grad_norm": 6.829458236694336, "learning_rate": 1.890631623569309e-07, "loss": 0.5856, "mean_token_accuracy": 0.8181992769241333, "num_tokens": 17024367.0, "step": 447 }, { "epoch": 0.05699020480854853, "ewc_loss": 0.0027208479586988688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.720847987802699e-06, "grad_norm": 7.734235763549805, "learning_rate": 1.8948707079270875e-07, "loss": 0.6094, "mean_token_accuracy": 0.8072896003723145, "num_tokens": 17057451.0, "step": 448 }, { "epoch": 0.057117415087139044, "ewc_loss": 0.0028089138213545084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.808913905028021e-06, "grad_norm": 8.227560997009277, "learning_rate": 1.8991097922848665e-07, "loss": 0.5603, "mean_token_accuracy": 0.8247660994529724, "num_tokens": 17094633.0, "step": 449 }, { "epoch": 0.05724462536572955, "ewc_loss": 0.0028421070892363787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.842107051037601e-06, "grad_norm": 7.226709842681885, "learning_rate": 1.903348876642645e-07, "loss": 0.6146, "mean_token_accuracy": 0.8077759742736816, "num_tokens": 17135892.0, "step": 450 }, { "epoch": 0.05737183564432006, "ewc_loss": 0.0027257921174168587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.725792001001537e-06, "grad_norm": 7.07546854019165, "learning_rate": 1.907587961000424e-07, "loss": 0.5563, "mean_token_accuracy": 0.8267805576324463, "num_tokens": 17173881.0, "step": 451 }, { "epoch": 0.05749904592291057, "ewc_loss": 0.0027218847535550594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7218848117627203e-06, "grad_norm": 6.7511749267578125, "learning_rate": 1.9118270453582024e-07, "loss": 0.6063, "mean_token_accuracy": 0.8128122091293335, "num_tokens": 17215668.0, "step": 452 }, { "epoch": 0.05762625620150108, "ewc_loss": 0.0027152232360094786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7152232178195845e-06, "grad_norm": 7.330747127532959, "learning_rate": 1.9160661297159814e-07, "loss": 0.6308, "mean_token_accuracy": 0.8075101375579834, "num_tokens": 17254339.0, "step": 453 }, { "epoch": 0.057753466480091595, "ewc_loss": 0.0027938124258071184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.793812427626108e-06, "grad_norm": 6.685074806213379, "learning_rate": 1.9203052140737599e-07, "loss": 0.6142, "mean_token_accuracy": 0.8082898259162903, "num_tokens": 17288858.0, "step": 454 }, { "epoch": 0.0578806767586821, "ewc_loss": 0.002724449150264263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.724449132074369e-06, "grad_norm": 6.667526721954346, "learning_rate": 1.9245442984315389e-07, "loss": 0.591, "mean_token_accuracy": 0.8185092210769653, "num_tokens": 17330601.0, "step": 455 }, { "epoch": 0.05800788703727261, "ewc_loss": 0.0027475422248244286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.747542339420761e-06, "grad_norm": 6.907434463500977, "learning_rate": 1.9287833827893173e-07, "loss": 0.6101, "mean_token_accuracy": 0.8098900318145752, "num_tokens": 17367797.0, "step": 456 }, { "epoch": 0.058135097315863124, "ewc_loss": 0.0027991419192403555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7991418392048217e-06, "grad_norm": 7.120530128479004, "learning_rate": 1.9330224671470963e-07, "loss": 0.5968, "mean_token_accuracy": 0.8147121667861938, "num_tokens": 17403206.0, "step": 457 }, { "epoch": 0.05826230759445363, "ewc_loss": 0.002814894774928689, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8148947421868797e-06, "grad_norm": 7.030184745788574, "learning_rate": 1.9372615515048748e-07, "loss": 0.6426, "mean_token_accuracy": 0.8044142723083496, "num_tokens": 17437207.0, "step": 458 }, { "epoch": 0.05838951787304414, "ewc_loss": 0.0028060597833245993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.806059683280182e-06, "grad_norm": 6.8029561042785645, "learning_rate": 1.9415006358626535e-07, "loss": 0.5786, "mean_token_accuracy": 0.8229972124099731, "num_tokens": 17473335.0, "step": 459 }, { "epoch": 0.05851672815163465, "ewc_loss": 0.002792689483612776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7926894290430937e-06, "grad_norm": 7.69108772277832, "learning_rate": 1.9457397202204322e-07, "loss": 0.5685, "mean_token_accuracy": 0.8202700614929199, "num_tokens": 17505104.0, "step": 460 }, { "epoch": 0.05864393843022516, "ewc_loss": 0.0028942597564309835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8942597509740153e-06, "grad_norm": 7.009799003601074, "learning_rate": 1.949978804578211e-07, "loss": 0.5527, "mean_token_accuracy": 0.8302071690559387, "num_tokens": 17539052.0, "step": 461 }, { "epoch": 0.058771148708815675, "ewc_loss": 0.0028178603388369083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.817860377035686e-06, "grad_norm": 6.5801849365234375, "learning_rate": 1.9542178889359897e-07, "loss": 0.6258, "mean_token_accuracy": 0.8056204915046692, "num_tokens": 17578942.0, "step": 462 }, { "epoch": 0.05889835898740618, "ewc_loss": 0.0027881620917469263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7881621917913435e-06, "grad_norm": 7.086729049682617, "learning_rate": 1.9584569732937684e-07, "loss": 0.6047, "mean_token_accuracy": 0.8158680200576782, "num_tokens": 17611931.0, "step": 463 }, { "epoch": 0.05902556926599669, "ewc_loss": 0.0028794719837605953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.879472049244214e-06, "grad_norm": 7.088396072387695, "learning_rate": 1.962696057651547e-07, "loss": 0.5096, "mean_token_accuracy": 0.8413988947868347, "num_tokens": 17648764.0, "step": 464 }, { "epoch": 0.059152779544587204, "ewc_loss": 0.0028720537666231394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.872053755709203e-06, "grad_norm": 8.133665084838867, "learning_rate": 1.9669351420093258e-07, "loss": 0.6207, "mean_token_accuracy": 0.8074618577957153, "num_tokens": 17683099.0, "step": 465 }, { "epoch": 0.05927998982317771, "ewc_loss": 0.0029557659290730953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9557659217971377e-06, "grad_norm": 7.126007556915283, "learning_rate": 1.9711742263671046e-07, "loss": 0.6048, "mean_token_accuracy": 0.8166797757148743, "num_tokens": 17727120.0, "step": 466 }, { "epoch": 0.059407200101768226, "ewc_loss": 0.002841512905433774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.841512923623668e-06, "grad_norm": 7.04038667678833, "learning_rate": 1.9754133107248833e-07, "loss": 0.56, "mean_token_accuracy": 0.8189737796783447, "num_tokens": 17761676.0, "step": 467 }, { "epoch": 0.059534410380358734, "ewc_loss": 0.00283940345980227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8394033506629057e-06, "grad_norm": 6.995879173278809, "learning_rate": 1.979652395082662e-07, "loss": 0.6276, "mean_token_accuracy": 0.8079206943511963, "num_tokens": 17802311.0, "step": 468 }, { "epoch": 0.05966162065894924, "ewc_loss": 0.002857951447367668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8579513582371874e-06, "grad_norm": 6.6657633781433105, "learning_rate": 1.9838914794404408e-07, "loss": 0.5179, "mean_token_accuracy": 0.8352547883987427, "num_tokens": 17839048.0, "step": 469 }, { "epoch": 0.059788830937539755, "ewc_loss": 0.0028362588491290808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.836258772731526e-06, "grad_norm": 6.919410705566406, "learning_rate": 1.9881305637982195e-07, "loss": 0.5905, "mean_token_accuracy": 0.8167771100997925, "num_tokens": 17881510.0, "step": 470 }, { "epoch": 0.05991604121613026, "ewc_loss": 0.0028718402609229088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8718402518279618e-06, "grad_norm": 8.024829864501953, "learning_rate": 1.9923696481559982e-07, "loss": 0.6168, "mean_token_accuracy": 0.8108803033828735, "num_tokens": 17915080.0, "step": 471 }, { "epoch": 0.06004325149472077, "ewc_loss": 0.0029827572870999575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9827572234353283e-06, "grad_norm": 7.11704683303833, "learning_rate": 1.996608732513777e-07, "loss": 0.6486, "mean_token_accuracy": 0.8013911247253418, "num_tokens": 17957972.0, "step": 472 }, { "epoch": 0.060170461773311285, "ewc_loss": 0.0028563772793859243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.856377250282094e-06, "grad_norm": 6.861221790313721, "learning_rate": 2.0008478168715557e-07, "loss": 0.5186, "mean_token_accuracy": 0.834934413433075, "num_tokens": 17998008.0, "step": 473 }, { "epoch": 0.06029767205190179, "ewc_loss": 0.0028331365901976824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8331364774203394e-06, "grad_norm": 6.84547233581543, "learning_rate": 2.0050869012293344e-07, "loss": 0.527, "mean_token_accuracy": 0.8318033814430237, "num_tokens": 18032427.0, "step": 474 }, { "epoch": 0.060424882330492306, "ewc_loss": 0.002869976218789816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.869976242436678e-06, "grad_norm": 7.047168254852295, "learning_rate": 2.009325985587113e-07, "loss": 0.5572, "mean_token_accuracy": 0.8251253366470337, "num_tokens": 18069848.0, "step": 475 }, { "epoch": 0.060552092609082814, "ewc_loss": 0.0028951135464012623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8951135391253047e-06, "grad_norm": 6.8174614906311035, "learning_rate": 2.0135650699448918e-07, "loss": 0.6504, "mean_token_accuracy": 0.7995619773864746, "num_tokens": 18109240.0, "step": 476 }, { "epoch": 0.06067930288767332, "ewc_loss": 0.0028696779627352953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8696779281744966e-06, "grad_norm": 6.828601360321045, "learning_rate": 2.0178041543026706e-07, "loss": 0.6207, "mean_token_accuracy": 0.8104013204574585, "num_tokens": 18146849.0, "step": 477 }, { "epoch": 0.060806513166263836, "ewc_loss": 0.0028754614759236574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8754614049830707e-06, "grad_norm": 7.3947224617004395, "learning_rate": 2.022043238660449e-07, "loss": 0.6515, "mean_token_accuracy": 0.7980683445930481, "num_tokens": 18186872.0, "step": 478 }, { "epoch": 0.06093372344485434, "ewc_loss": 0.002937259152531624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9372590688581113e-06, "grad_norm": 7.067476272583008, "learning_rate": 2.026282323018228e-07, "loss": 0.627, "mean_token_accuracy": 0.8083223104476929, "num_tokens": 18222299.0, "step": 479 }, { "epoch": 0.06106093372344486, "ewc_loss": 0.00288976589217782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8897659376525553e-06, "grad_norm": 6.657526969909668, "learning_rate": 2.0305214073760065e-07, "loss": 0.6304, "mean_token_accuracy": 0.8090165257453918, "num_tokens": 18261645.0, "step": 480 }, { "epoch": 0.061188144002035365, "ewc_loss": 0.0028576506301760674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.857650542864576e-06, "grad_norm": 6.77970552444458, "learning_rate": 2.0347604917337855e-07, "loss": 0.601, "mean_token_accuracy": 0.8148271441459656, "num_tokens": 18292970.0, "step": 481 }, { "epoch": 0.06131535428062587, "ewc_loss": 0.002903164364397526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9031643862253986e-06, "grad_norm": 7.2989983558654785, "learning_rate": 2.038999576091564e-07, "loss": 0.548, "mean_token_accuracy": 0.8307986259460449, "num_tokens": 18331133.0, "step": 482 }, { "epoch": 0.06144256455921639, "ewc_loss": 0.002963821403682232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9638213163707405e-06, "grad_norm": 7.069794178009033, "learning_rate": 2.043238660449343e-07, "loss": 0.6346, "mean_token_accuracy": 0.8043317198753357, "num_tokens": 18368905.0, "step": 483 }, { "epoch": 0.061569774837806894, "ewc_loss": 0.0029210278298705816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9210277716629207e-06, "grad_norm": 7.14760160446167, "learning_rate": 2.0474777448071214e-07, "loss": 0.5591, "mean_token_accuracy": 0.8277987241744995, "num_tokens": 18406362.0, "step": 484 }, { "epoch": 0.0616969851163974, "ewc_loss": 0.0029311368707567453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.931136805273127e-06, "grad_norm": 7.200026988983154, "learning_rate": 2.0517168291649004e-07, "loss": 0.5966, "mean_token_accuracy": 0.8166173696517944, "num_tokens": 18439246.0, "step": 485 }, { "epoch": 0.061824195394987916, "ewc_loss": 0.002942687598988414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9426876153593184e-06, "grad_norm": 7.119572162628174, "learning_rate": 2.0559559135226788e-07, "loss": 0.5998, "mean_token_accuracy": 0.8123502731323242, "num_tokens": 18473408.0, "step": 486 }, { "epoch": 0.06195140567357842, "ewc_loss": 0.002935225609689951, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9352256660786225e-06, "grad_norm": 6.908423900604248, "learning_rate": 2.0601949978804578e-07, "loss": 0.5538, "mean_token_accuracy": 0.8263892531394958, "num_tokens": 18513246.0, "step": 487 }, { "epoch": 0.06207861595216894, "ewc_loss": 0.002933134324848652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9331342830118956e-06, "grad_norm": 6.880773544311523, "learning_rate": 2.0644340822382363e-07, "loss": 0.526, "mean_token_accuracy": 0.8340566754341125, "num_tokens": 18549125.0, "step": 488 }, { "epoch": 0.062205826230759445, "ewc_loss": 0.0029504201374948025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.950420139313792e-06, "grad_norm": 7.248259544372559, "learning_rate": 2.0686731665960153e-07, "loss": 0.5761, "mean_token_accuracy": 0.8257603645324707, "num_tokens": 18585306.0, "step": 489 }, { "epoch": 0.06233303650934995, "ewc_loss": 0.0030023916624486446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0023916224308778e-06, "grad_norm": 6.91868257522583, "learning_rate": 2.0729122509537937e-07, "loss": 0.5358, "mean_token_accuracy": 0.8340350985527039, "num_tokens": 18624378.0, "step": 490 }, { "epoch": 0.06246024678794047, "ewc_loss": 0.0029683373868465424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.968337412312394e-06, "grad_norm": 7.050333499908447, "learning_rate": 2.0771513353115727e-07, "loss": 0.5796, "mean_token_accuracy": 0.8194262981414795, "num_tokens": 18660814.0, "step": 491 }, { "epoch": 0.06258745706653097, "ewc_loss": 0.002997560426592827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.99756038657506e-06, "grad_norm": 6.995673179626465, "learning_rate": 2.0813904196693512e-07, "loss": 0.5473, "mean_token_accuracy": 0.8311386108398438, "num_tokens": 18702129.0, "step": 492 }, { "epoch": 0.06271466734512149, "ewc_loss": 0.0029981019906699657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9981019906699657e-06, "grad_norm": 8.052435874938965, "learning_rate": 2.0856295040271302e-07, "loss": 0.6201, "mean_token_accuracy": 0.8061023950576782, "num_tokens": 18737496.0, "step": 493 }, { "epoch": 0.06284187762371199, "ewc_loss": 0.0031193820759654045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1193819722830085e-06, "grad_norm": 6.662468433380127, "learning_rate": 2.0898685883849086e-07, "loss": 0.5757, "mean_token_accuracy": 0.82356858253479, "num_tokens": 18779332.0, "step": 494 }, { "epoch": 0.0629690879023025, "ewc_loss": 0.0029486443381756544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9486443509085802e-06, "grad_norm": 6.726720333099365, "learning_rate": 2.0941076727426874e-07, "loss": 0.5616, "mean_token_accuracy": 0.8248851299285889, "num_tokens": 18817976.0, "step": 495 }, { "epoch": 0.06309629818089302, "ewc_loss": 0.0030030577909201384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0030578272999264e-06, "grad_norm": 6.679955005645752, "learning_rate": 2.098346757100466e-07, "loss": 0.5524, "mean_token_accuracy": 0.8266445994377136, "num_tokens": 18862860.0, "step": 496 }, { "epoch": 0.06322350845948353, "ewc_loss": 0.0030327618587762117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0327619242598303e-06, "grad_norm": 7.188624858856201, "learning_rate": 2.1025858414582448e-07, "loss": 0.5941, "mean_token_accuracy": 0.8142133355140686, "num_tokens": 18899025.0, "step": 497 }, { "epoch": 0.06335071873807403, "ewc_loss": 0.003091796999797225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0917969979782356e-06, "grad_norm": 7.127205848693848, "learning_rate": 2.1068249258160238e-07, "loss": 0.5604, "mean_token_accuracy": 0.8192036747932434, "num_tokens": 18933118.0, "step": 498 }, { "epoch": 0.06347792901666455, "ewc_loss": 0.003059651702642441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0596515898650978e-06, "grad_norm": 7.017346382141113, "learning_rate": 2.1110640101738023e-07, "loss": 0.5615, "mean_token_accuracy": 0.8264576196670532, "num_tokens": 18969165.0, "step": 499 }, { "epoch": 0.06360513929525506, "ewc_loss": 0.0030552686657756567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.055268734897254e-06, "grad_norm": 6.92117166519165, "learning_rate": 2.1153030945315813e-07, "loss": 0.5701, "mean_token_accuracy": 0.8219751715660095, "num_tokens": 19003882.0, "step": 500 }, { "epoch": 0.06373234957384556, "ewc_loss": 0.0030559413135051727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.055941306229215e-06, "grad_norm": 7.214004993438721, "learning_rate": 2.1195421788893597e-07, "loss": 0.5702, "mean_token_accuracy": 0.8238542675971985, "num_tokens": 19037540.0, "step": 501 }, { "epoch": 0.06385955985243608, "ewc_loss": 0.00310176657512784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.101766651525395e-06, "grad_norm": 7.081700801849365, "learning_rate": 2.1237812632471387e-07, "loss": 0.5804, "mean_token_accuracy": 0.8153712749481201, "num_tokens": 19077270.0, "step": 502 }, { "epoch": 0.06398677013102659, "ewc_loss": 0.003085361560806632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.085361640842166e-06, "grad_norm": 6.870479106903076, "learning_rate": 2.1280203476049172e-07, "loss": 0.6682, "mean_token_accuracy": 0.7891519665718079, "num_tokens": 19117579.0, "step": 503 }, { "epoch": 0.06411398040961709, "ewc_loss": 0.003064578864723444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0645787774119526e-06, "grad_norm": 6.797508716583252, "learning_rate": 2.1322594319626962e-07, "loss": 0.608, "mean_token_accuracy": 0.810834527015686, "num_tokens": 19156599.0, "step": 504 }, { "epoch": 0.0642411906882076, "ewc_loss": 0.0030863224528729916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.086322521994589e-06, "grad_norm": 7.521345138549805, "learning_rate": 2.1364985163204746e-07, "loss": 0.5996, "mean_token_accuracy": 0.8140000104904175, "num_tokens": 19187367.0, "step": 505 }, { "epoch": 0.06436840096679812, "ewc_loss": 0.0031683549750596285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1683550787420245e-06, "grad_norm": 7.444014549255371, "learning_rate": 2.1407376006782536e-07, "loss": 0.5338, "mean_token_accuracy": 0.8327023983001709, "num_tokens": 19223916.0, "step": 506 }, { "epoch": 0.06449561124538863, "ewc_loss": 0.003133600577712059, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1336005577031756e-06, "grad_norm": 7.087090492248535, "learning_rate": 2.144976685036032e-07, "loss": 0.5492, "mean_token_accuracy": 0.828668475151062, "num_tokens": 19260336.0, "step": 507 }, { "epoch": 0.06462282152397913, "ewc_loss": 0.00309326802380383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0932681056583533e-06, "grad_norm": 7.091794490814209, "learning_rate": 2.149215769393811e-07, "loss": 0.6126, "mean_token_accuracy": 0.8124549984931946, "num_tokens": 19299895.0, "step": 508 }, { "epoch": 0.06475003180256965, "ewc_loss": 0.0031237248331308365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1237248094839742e-06, "grad_norm": 7.107532978057861, "learning_rate": 2.1534548537515895e-07, "loss": 0.5526, "mean_token_accuracy": 0.8261700868606567, "num_tokens": 19336614.0, "step": 509 }, { "epoch": 0.06487724208116016, "ewc_loss": 0.003130801022052765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1308011330111185e-06, "grad_norm": 7.148705005645752, "learning_rate": 2.1576939381093685e-07, "loss": 0.6036, "mean_token_accuracy": 0.8143637180328369, "num_tokens": 19378683.0, "step": 510 }, { "epoch": 0.06500445235975066, "ewc_loss": 0.0031397566199302673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.139756699965801e-06, "grad_norm": 6.93116569519043, "learning_rate": 2.161933022467147e-07, "loss": 0.5873, "mean_token_accuracy": 0.8143268823623657, "num_tokens": 19422598.0, "step": 511 }, { "epoch": 0.06513166263834118, "ewc_loss": 0.0031268883030861616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.126888259430416e-06, "grad_norm": 6.805487632751465, "learning_rate": 2.166172106824926e-07, "loss": 0.5776, "mean_token_accuracy": 0.823233962059021, "num_tokens": 19466972.0, "step": 512 }, { "epoch": 0.06525887291693169, "ewc_loss": 0.003129685763269663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1296858651330695e-06, "grad_norm": 7.106764316558838, "learning_rate": 2.1704111911827044e-07, "loss": 0.5688, "mean_token_accuracy": 0.8191152811050415, "num_tokens": 19503809.0, "step": 513 }, { "epoch": 0.0653860831955222, "ewc_loss": 0.0031851758249104023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.185175728503964e-06, "grad_norm": 7.052379131317139, "learning_rate": 2.1746502755404831e-07, "loss": 0.6237, "mean_token_accuracy": 0.8127418756484985, "num_tokens": 19545713.0, "step": 514 }, { "epoch": 0.06551329347411271, "ewc_loss": 0.0031642611138522625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.164261215715669e-06, "grad_norm": 6.826256275177002, "learning_rate": 2.178889359898262e-07, "loss": 0.6109, "mean_token_accuracy": 0.8136683702468872, "num_tokens": 19582272.0, "step": 515 }, { "epoch": 0.06564050375270322, "ewc_loss": 0.003144017653539777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1440176826436073e-06, "grad_norm": 6.802126407623291, "learning_rate": 2.1831284442560406e-07, "loss": 0.6261, "mean_token_accuracy": 0.8084827661514282, "num_tokens": 19625574.0, "step": 516 }, { "epoch": 0.06576771403129372, "ewc_loss": 0.0031646292190998793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1646293336962117e-06, "grad_norm": 6.986204624176025, "learning_rate": 2.1873675286138193e-07, "loss": 0.6129, "mean_token_accuracy": 0.8079556226730347, "num_tokens": 19667791.0, "step": 517 }, { "epoch": 0.06589492430988424, "ewc_loss": 0.0031962781213223934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1962781577021815e-06, "grad_norm": 7.169719219207764, "learning_rate": 2.191606612971598e-07, "loss": 0.6428, "mean_token_accuracy": 0.8029787540435791, "num_tokens": 19708289.0, "step": 518 }, { "epoch": 0.06602213458847475, "ewc_loss": 0.0032101378310471773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2101377200888237e-06, "grad_norm": 6.815049648284912, "learning_rate": 2.1958456973293768e-07, "loss": 0.6207, "mean_token_accuracy": 0.8040130734443665, "num_tokens": 19747444.0, "step": 519 }, { "epoch": 0.06614934486706527, "ewc_loss": 0.0031685594003647566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.168559487676248e-06, "grad_norm": 6.996628284454346, "learning_rate": 2.2000847816871555e-07, "loss": 0.5093, "mean_token_accuracy": 0.8366851806640625, "num_tokens": 19787061.0, "step": 520 }, { "epoch": 0.06627655514565577, "ewc_loss": 0.003214311320334673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2143113912752597e-06, "grad_norm": 7.0105814933776855, "learning_rate": 2.2043238660449342e-07, "loss": 0.5509, "mean_token_accuracy": 0.8290227055549622, "num_tokens": 19826893.0, "step": 521 }, { "epoch": 0.06640376542424628, "ewc_loss": 0.0032131869811564684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.213187028450193e-06, "grad_norm": 7.012134552001953, "learning_rate": 2.208562950402713e-07, "loss": 0.5441, "mean_token_accuracy": 0.8265193700790405, "num_tokens": 19864285.0, "step": 522 }, { "epoch": 0.0665309757028368, "ewc_loss": 0.003211907809600234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2119078241521493e-06, "grad_norm": 6.968630313873291, "learning_rate": 2.2128020347604917e-07, "loss": 0.5795, "mean_token_accuracy": 0.8193727731704712, "num_tokens": 19903806.0, "step": 523 }, { "epoch": 0.0666581859814273, "ewc_loss": 0.0032134016510099173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2134016691998113e-06, "grad_norm": 7.137932300567627, "learning_rate": 2.2170411191182704e-07, "loss": 0.557, "mean_token_accuracy": 0.8240228295326233, "num_tokens": 19950382.0, "step": 524 }, { "epoch": 0.06678539626001781, "ewc_loss": 0.0032252694945782423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.225269438189571e-06, "grad_norm": 7.053427696228027, "learning_rate": 2.221280203476049e-07, "loss": 0.5807, "mean_token_accuracy": 0.81590336561203, "num_tokens": 19984765.0, "step": 525 }, { "epoch": 0.06691260653860832, "ewc_loss": 0.0032182810828089714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.218281108274823e-06, "grad_norm": 6.9592180252075195, "learning_rate": 2.2255192878338279e-07, "loss": 0.621, "mean_token_accuracy": 0.8105862736701965, "num_tokens": 20021081.0, "step": 526 }, { "epoch": 0.06703981681719882, "ewc_loss": 0.003227662295103073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2276623187499354e-06, "grad_norm": 7.246740818023682, "learning_rate": 2.2297583721916066e-07, "loss": 0.5945, "mean_token_accuracy": 0.8153942823410034, "num_tokens": 20055860.0, "step": 527 }, { "epoch": 0.06716702709578934, "ewc_loss": 0.0032637466210871935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2637465210427763e-06, "grad_norm": 7.239075183868408, "learning_rate": 2.2339974565493853e-07, "loss": 0.5487, "mean_token_accuracy": 0.8301361203193665, "num_tokens": 20095023.0, "step": 528 }, { "epoch": 0.06729423737437985, "ewc_loss": 0.0032593782525509596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.259378217990161e-06, "grad_norm": 7.1542887687683105, "learning_rate": 2.238236540907164e-07, "loss": 0.6029, "mean_token_accuracy": 0.8134745955467224, "num_tokens": 20134620.0, "step": 529 }, { "epoch": 0.06742144765297035, "ewc_loss": 0.0032395711168646812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2395710150012746e-06, "grad_norm": 7.4204864501953125, "learning_rate": 2.2424756252649428e-07, "loss": 0.5748, "mean_token_accuracy": 0.8186647891998291, "num_tokens": 20169690.0, "step": 530 }, { "epoch": 0.06754865793156087, "ewc_loss": 0.003283102996647358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283103069406934e-06, "grad_norm": 7.372212886810303, "learning_rate": 2.2467147096227215e-07, "loss": 0.5674, "mean_token_accuracy": 0.8221678733825684, "num_tokens": 20203609.0, "step": 531 }, { "epoch": 0.06767586821015138, "ewc_loss": 0.003281627083197236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.281627186879632e-06, "grad_norm": 7.0042290687561035, "learning_rate": 2.2509537939805002e-07, "loss": 0.5739, "mean_token_accuracy": 0.8189626932144165, "num_tokens": 20234616.0, "step": 532 }, { "epoch": 0.0678030784887419, "ewc_loss": 0.00324755790643394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.247557970098569e-06, "grad_norm": 6.776683330535889, "learning_rate": 2.2551928783382787e-07, "loss": 0.5626, "mean_token_accuracy": 0.8222702145576477, "num_tokens": 20271634.0, "step": 533 }, { "epoch": 0.0679302887673324, "ewc_loss": 0.0032569081522524357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.256908257753821e-06, "grad_norm": 7.407297611236572, "learning_rate": 2.2594319626960577e-07, "loss": 0.5909, "mean_token_accuracy": 0.8195164799690247, "num_tokens": 20309683.0, "step": 534 }, { "epoch": 0.06805749904592291, "ewc_loss": 0.0033648789394646883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3648789212747943e-06, "grad_norm": 7.052948951721191, "learning_rate": 2.263671047053836e-07, "loss": 0.5053, "mean_token_accuracy": 0.8362441658973694, "num_tokens": 20346013.0, "step": 535 }, { "epoch": 0.06818470932451343, "ewc_loss": 0.0032901272643357515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290127324362402e-06, "grad_norm": 7.165536403656006, "learning_rate": 2.267910131411615e-07, "loss": 0.5631, "mean_token_accuracy": 0.8259164094924927, "num_tokens": 20387613.0, "step": 536 }, { "epoch": 0.06831191960310393, "ewc_loss": 0.0033238488249480724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323848886793712e-06, "grad_norm": 7.224480152130127, "learning_rate": 2.2721492157693936e-07, "loss": 0.5618, "mean_token_accuracy": 0.8241614103317261, "num_tokens": 20417579.0, "step": 537 }, { "epoch": 0.06843912988169444, "ewc_loss": 0.0033438587561249733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3438586797274183e-06, "grad_norm": 6.907526016235352, "learning_rate": 2.2763883001271726e-07, "loss": 0.5345, "mean_token_accuracy": 0.8309933543205261, "num_tokens": 20455100.0, "step": 538 }, { "epoch": 0.06856634016028496, "ewc_loss": 0.0032999739050865173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2999739687511465e-06, "grad_norm": 7.300130367279053, "learning_rate": 2.280627384484951e-07, "loss": 0.5965, "mean_token_accuracy": 0.8114744424819946, "num_tokens": 20496458.0, "step": 539 }, { "epoch": 0.06869355043887546, "ewc_loss": 0.0033762785606086254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376278527866816e-06, "grad_norm": 7.1487321853637695, "learning_rate": 2.28486646884273e-07, "loss": 0.5702, "mean_token_accuracy": 0.8212630152702332, "num_tokens": 20533758.0, "step": 540 }, { "epoch": 0.06882076071746597, "ewc_loss": 0.003335345769301057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3353458093188237e-06, "grad_norm": 7.029478073120117, "learning_rate": 2.2891055532005085e-07, "loss": 0.6154, "mean_token_accuracy": 0.812088131904602, "num_tokens": 20567259.0, "step": 541 }, { "epoch": 0.06894797099605648, "ewc_loss": 0.0033383264672011137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.338326450830209e-06, "grad_norm": 7.1149582862854, "learning_rate": 2.2933446375582875e-07, "loss": 0.5962, "mean_token_accuracy": 0.817589521408081, "num_tokens": 20599333.0, "step": 542 }, { "epoch": 0.06907518127464699, "ewc_loss": 0.003366424236446619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3664241527731065e-06, "grad_norm": 7.131585121154785, "learning_rate": 2.297583721916066e-07, "loss": 0.533, "mean_token_accuracy": 0.8340435028076172, "num_tokens": 20640195.0, "step": 543 }, { "epoch": 0.0692023915532375, "ewc_loss": 0.0033738468773663044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3738469937816262e-06, "grad_norm": 7.342902660369873, "learning_rate": 2.301822806273845e-07, "loss": 0.5539, "mean_token_accuracy": 0.8264334797859192, "num_tokens": 20682617.0, "step": 544 }, { "epoch": 0.06932960183182801, "ewc_loss": 0.0033953366801142693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3953367619687924e-06, "grad_norm": 7.558504104614258, "learning_rate": 2.3060618906316234e-07, "loss": 0.6066, "mean_token_accuracy": 0.814323902130127, "num_tokens": 20718032.0, "step": 545 }, { "epoch": 0.06945681211041853, "ewc_loss": 0.003411803627386689, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.411803618291742e-06, "grad_norm": 7.008846759796143, "learning_rate": 2.3103009749894024e-07, "loss": 0.5695, "mean_token_accuracy": 0.822439432144165, "num_tokens": 20755190.0, "step": 546 }, { "epoch": 0.06958402238900903, "ewc_loss": 0.003334121545776725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3341216294502374e-06, "grad_norm": 6.698106288909912, "learning_rate": 2.3145400593471808e-07, "loss": 0.5954, "mean_token_accuracy": 0.8139240741729736, "num_tokens": 20799088.0, "step": 547 }, { "epoch": 0.06971123266759954, "ewc_loss": 0.003341532777994871, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3415328744013095e-06, "grad_norm": 7.248479843139648, "learning_rate": 2.3187791437049598e-07, "loss": 0.6268, "mean_token_accuracy": 0.8037992715835571, "num_tokens": 20831819.0, "step": 548 }, { "epoch": 0.06983844294619006, "ewc_loss": 0.003439716063439846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4397160106891533e-06, "grad_norm": 7.339686870574951, "learning_rate": 2.3230182280627383e-07, "loss": 0.5569, "mean_token_accuracy": 0.8273900747299194, "num_tokens": 20861291.0, "step": 549 }, { "epoch": 0.06996565322478056, "ewc_loss": 0.0034230374731123447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423037469474366e-06, "grad_norm": 6.9116716384887695, "learning_rate": 2.327257312420517e-07, "loss": 0.5324, "mean_token_accuracy": 0.8349759578704834, "num_tokens": 20897446.0, "step": 550 }, { "epoch": 0.07009286350337107, "ewc_loss": 0.003367768367752433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367768385942327e-06, "grad_norm": 7.469824314117432, "learning_rate": 2.3314963967782957e-07, "loss": 0.5559, "mean_token_accuracy": 0.8227893114089966, "num_tokens": 20934423.0, "step": 551 }, { "epoch": 0.07022007378196159, "ewc_loss": 0.0034794164821505547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479416591289919e-06, "grad_norm": 7.080539226531982, "learning_rate": 2.3357354811360745e-07, "loss": 0.573, "mean_token_accuracy": 0.8240954875946045, "num_tokens": 20978394.0, "step": 552 }, { "epoch": 0.07034728406055209, "ewc_loss": 0.0034056075382977724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4056074582622387e-06, "grad_norm": 7.416191101074219, "learning_rate": 2.3399745654938532e-07, "loss": 0.6001, "mean_token_accuracy": 0.8141235709190369, "num_tokens": 21010669.0, "step": 553 }, { "epoch": 0.0704744943391426, "ewc_loss": 0.003456398146227002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456398189882748e-06, "grad_norm": 7.008810997009277, "learning_rate": 2.344213649851632e-07, "loss": 0.5686, "mean_token_accuracy": 0.8235671520233154, "num_tokens": 21047696.0, "step": 554 }, { "epoch": 0.07060170461773312, "ewc_loss": 0.0034093058202415705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4093059184669983e-06, "grad_norm": 6.89812707901001, "learning_rate": 2.3484527342094106e-07, "loss": 0.6496, "mean_token_accuracy": 0.8019787073135376, "num_tokens": 21091437.0, "step": 555 }, { "epoch": 0.07072891489632362, "ewc_loss": 0.003421715460717678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.421715518925339e-06, "grad_norm": 6.963183879852295, "learning_rate": 2.3526918185671894e-07, "loss": 0.6102, "mean_token_accuracy": 0.810714602470398, "num_tokens": 21128880.0, "step": 556 }, { "epoch": 0.07085612517491413, "ewc_loss": 0.0034555871970951557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.455587147982442e-06, "grad_norm": 7.072309494018555, "learning_rate": 2.356930902924968e-07, "loss": 0.5629, "mean_token_accuracy": 0.825519859790802, "num_tokens": 21168523.0, "step": 557 }, { "epoch": 0.07098333545350465, "ewc_loss": 0.0034679218661040068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4679219425015617e-06, "grad_norm": 6.9532365798950195, "learning_rate": 2.3611699872827468e-07, "loss": 0.6491, "mean_token_accuracy": 0.7984637022018433, "num_tokens": 21211376.0, "step": 558 }, { "epoch": 0.07111054573209516, "ewc_loss": 0.0034513757564127445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4513757327658823e-06, "grad_norm": 7.049041271209717, "learning_rate": 2.3654090716405255e-07, "loss": 0.581, "mean_token_accuracy": 0.8191401958465576, "num_tokens": 21249162.0, "step": 559 }, { "epoch": 0.07123775601068566, "ewc_loss": 0.003479717066511512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479717179288855e-06, "grad_norm": 7.085023880004883, "learning_rate": 2.3696481559983043e-07, "loss": 0.6084, "mean_token_accuracy": 0.8096276521682739, "num_tokens": 21282351.0, "step": 560 }, { "epoch": 0.07136496628927617, "ewc_loss": 0.0034890954848378897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4890954339061864e-06, "grad_norm": 7.065969467163086, "learning_rate": 2.373887240356083e-07, "loss": 0.5984, "mean_token_accuracy": 0.8141652345657349, "num_tokens": 21319508.0, "step": 561 }, { "epoch": 0.07149217656786669, "ewc_loss": 0.0034853285178542137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4853285342251183e-06, "grad_norm": 7.109659194946289, "learning_rate": 2.3781263247138617e-07, "loss": 0.5629, "mean_token_accuracy": 0.8254855871200562, "num_tokens": 21357111.0, "step": 562 }, { "epoch": 0.07161938684645719, "ewc_loss": 0.00349447806365788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.494478050924954e-06, "grad_norm": 7.180149555206299, "learning_rate": 2.3823654090716404e-07, "loss": 0.5648, "mean_token_accuracy": 0.820852518081665, "num_tokens": 21393222.0, "step": 563 }, { "epoch": 0.0717465971250477, "ewc_loss": 0.0035128211602568626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5128211948176613e-06, "grad_norm": 6.9097514152526855, "learning_rate": 2.386604493429419e-07, "loss": 0.5195, "mean_token_accuracy": 0.8346417546272278, "num_tokens": 21429595.0, "step": 564 }, { "epoch": 0.07187380740363822, "ewc_loss": 0.0034840498119592667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4840497846744256e-06, "grad_norm": 7.126351356506348, "learning_rate": 2.390843577787198e-07, "loss": 0.6157, "mean_token_accuracy": 0.8051284551620483, "num_tokens": 21467341.0, "step": 565 }, { "epoch": 0.07200101768222872, "ewc_loss": 0.003538060002028942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5380599001655355e-06, "grad_norm": 7.021290302276611, "learning_rate": 2.3950826621449766e-07, "loss": 0.4925, "mean_token_accuracy": 0.846234917640686, "num_tokens": 21510927.0, "step": 566 }, { "epoch": 0.07212822796081923, "ewc_loss": 0.0035251558292657137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5251557619631058e-06, "grad_norm": 7.155942916870117, "learning_rate": 2.3993217465027556e-07, "loss": 0.5683, "mean_token_accuracy": 0.8243403434753418, "num_tokens": 21551604.0, "step": 567 }, { "epoch": 0.07225543823940975, "ewc_loss": 0.0035581488627940416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.558148819138296e-06, "grad_norm": 7.23037576675415, "learning_rate": 2.403560830860534e-07, "loss": 0.6195, "mean_token_accuracy": 0.8122060298919678, "num_tokens": 21591871.0, "step": 568 }, { "epoch": 0.07238264851800025, "ewc_loss": 0.0035651500802487135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.565150109352544e-06, "grad_norm": 7.259463310241699, "learning_rate": 2.4077999152183125e-07, "loss": 0.6165, "mean_token_accuracy": 0.8081046342849731, "num_tokens": 21629810.0, "step": 569 }, { "epoch": 0.07250985879659076, "ewc_loss": 0.003566080005839467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5660800676851068e-06, "grad_norm": 7.137604713439941, "learning_rate": 2.4120389995760915e-07, "loss": 0.5149, "mean_token_accuracy": 0.8415237665176392, "num_tokens": 21668706.0, "step": 570 }, { "epoch": 0.07263706907518128, "ewc_loss": 0.0035494740586727858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5494740586727858e-06, "grad_norm": 7.2036871910095215, "learning_rate": 2.41627808393387e-07, "loss": 0.5599, "mean_token_accuracy": 0.8229019641876221, "num_tokens": 21712725.0, "step": 571 }, { "epoch": 0.07276427935377179, "ewc_loss": 0.003579916898161173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5799168927042047e-06, "grad_norm": 7.1291117668151855, "learning_rate": 2.420517168291649e-07, "loss": 0.5692, "mean_token_accuracy": 0.8246779441833496, "num_tokens": 21746086.0, "step": 572 }, { "epoch": 0.07289148963236229, "ewc_loss": 0.003573383204638958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5733833101403434e-06, "grad_norm": 7.206053256988525, "learning_rate": 2.4247562526494274e-07, "loss": 0.5295, "mean_token_accuracy": 0.8317787647247314, "num_tokens": 21779573.0, "step": 573 }, { "epoch": 0.0730186999109528, "ewc_loss": 0.0035901672672480345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5901673527405364e-06, "grad_norm": 7.1158952713012695, "learning_rate": 2.4289953370072064e-07, "loss": 0.6014, "mean_token_accuracy": 0.8118337988853455, "num_tokens": 21818766.0, "step": 574 }, { "epoch": 0.07314591018954332, "ewc_loss": 0.003580220742151141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.580220663934597e-06, "grad_norm": 7.158740520477295, "learning_rate": 2.433234421364985e-07, "loss": 0.5574, "mean_token_accuracy": 0.8300471305847168, "num_tokens": 21855539.0, "step": 575 }, { "epoch": 0.07327312046813382, "ewc_loss": 0.0035907337442040443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5907337405660655e-06, "grad_norm": 7.1324591636657715, "learning_rate": 2.437473505722764e-07, "loss": 0.5219, "mean_token_accuracy": 0.8341590166091919, "num_tokens": 21892567.0, "step": 576 }, { "epoch": 0.07340033074672433, "ewc_loss": 0.0035881767980754375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.588176696212031e-06, "grad_norm": 7.088303089141846, "learning_rate": 2.4417125900805423e-07, "loss": 0.5413, "mean_token_accuracy": 0.8234398365020752, "num_tokens": 21928607.0, "step": 577 }, { "epoch": 0.07352754102531485, "ewc_loss": 0.003587933024391532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.587932951631956e-06, "grad_norm": 7.134523391723633, "learning_rate": 2.4459516744383213e-07, "loss": 0.5887, "mean_token_accuracy": 0.8172109723091125, "num_tokens": 21963781.0, "step": 578 }, { "epoch": 0.07365475130390535, "ewc_loss": 0.0036005040165036917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.6005039873998612e-06, "grad_norm": 7.165399074554443, "learning_rate": 2.4501907587961e-07, "loss": 0.5488, "mean_token_accuracy": 0.828801155090332, "num_tokens": 21999140.0, "step": 579 }, { "epoch": 0.07378196158249586, "ewc_loss": 0.0036187272053211927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.61872730536561e-06, "grad_norm": 7.016936302185059, "learning_rate": 2.454429843153879e-07, "loss": 0.6485, "mean_token_accuracy": 0.7989641427993774, "num_tokens": 22038703.0, "step": 580 }, { "epoch": 0.07390917186108638, "ewc_loss": 0.003600207157433033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.600207264753408e-06, "grad_norm": 7.254775524139404, "learning_rate": 2.458668927511657e-07, "loss": 0.5625, "mean_token_accuracy": 0.816965639591217, "num_tokens": 22071052.0, "step": 581 }, { "epoch": 0.07403638213967688, "ewc_loss": 0.0036449560429900885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.644955995696364e-06, "grad_norm": 7.064737319946289, "learning_rate": 2.462908011869436e-07, "loss": 0.5272, "mean_token_accuracy": 0.834592878818512, "num_tokens": 22109629.0, "step": 582 }, { "epoch": 0.0741635924182674, "ewc_loss": 0.0036041843704879284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.6041842577105854e-06, "grad_norm": 6.9632439613342285, "learning_rate": 2.4671470962272147e-07, "loss": 0.5398, "mean_token_accuracy": 0.831484317779541, "num_tokens": 22151544.0, "step": 583 }, { "epoch": 0.07429080269685791, "ewc_loss": 0.003616264322772622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.616264393713209e-06, "grad_norm": 7.460886478424072, "learning_rate": 2.4713861805849937e-07, "loss": 0.518, "mean_token_accuracy": 0.833998441696167, "num_tokens": 22189241.0, "step": 584 }, { "epoch": 0.07441801297544842, "ewc_loss": 0.0036986754275858402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.6986755276302574e-06, "grad_norm": 7.13917350769043, "learning_rate": 2.475625264942772e-07, "loss": 0.5333, "mean_token_accuracy": 0.831840455532074, "num_tokens": 22229138.0, "step": 585 }, { "epoch": 0.07454522325403892, "ewc_loss": 0.003609859850257635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.6098599593969993e-06, "grad_norm": 6.966987609863281, "learning_rate": 2.479864349300551e-07, "loss": 0.5716, "mean_token_accuracy": 0.8227202296257019, "num_tokens": 22264930.0, "step": 586 }, { "epoch": 0.07467243353262944, "ewc_loss": 0.003619410330429673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.6194103358866414e-06, "grad_norm": 7.138376235961914, "learning_rate": 2.4841034336583296e-07, "loss": 0.5129, "mean_token_accuracy": 0.8330458402633667, "num_tokens": 22297162.0, "step": 587 }, { "epoch": 0.07479964381121995, "ewc_loss": 0.003664348041638732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.664348014353891e-06, "grad_norm": 7.125414848327637, "learning_rate": 2.488342518016108e-07, "loss": 0.5976, "mean_token_accuracy": 0.8109279870986938, "num_tokens": 22334779.0, "step": 588 }, { "epoch": 0.07492685408981045, "ewc_loss": 0.003645161632448435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.6451615414989647e-06, "grad_norm": 7.161956310272217, "learning_rate": 2.492581602373887e-07, "loss": 0.5543, "mean_token_accuracy": 0.8284602165222168, "num_tokens": 22372820.0, "step": 589 }, { "epoch": 0.07505406436840097, "ewc_loss": 0.003656483255326748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.6564831589203095e-06, "grad_norm": 7.198824405670166, "learning_rate": 2.4968206867316655e-07, "loss": 0.5748, "mean_token_accuracy": 0.8206281661987305, "num_tokens": 22417841.0, "step": 590 }, { "epoch": 0.07518127464699148, "ewc_loss": 0.0036736666224896908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.6736666970682563e-06, "grad_norm": 7.16663122177124, "learning_rate": 2.5010597710894445e-07, "loss": 0.4889, "mean_token_accuracy": 0.845779538154602, "num_tokens": 22449133.0, "step": 591 }, { "epoch": 0.07530848492558198, "ewc_loss": 0.0036705774255096912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.670577370940009e-06, "grad_norm": 7.188871383666992, "learning_rate": 2.505298855447223e-07, "loss": 0.5541, "mean_token_accuracy": 0.8265836238861084, "num_tokens": 22491145.0, "step": 592 }, { "epoch": 0.0754356952041725, "ewc_loss": 0.0036768082063645124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.6768083191418555e-06, "grad_norm": 7.065460205078125, "learning_rate": 2.509537939805002e-07, "loss": 0.5668, "mean_token_accuracy": 0.8256126046180725, "num_tokens": 22531738.0, "step": 593 }, { "epoch": 0.07556290548276301, "ewc_loss": 0.003657144494354725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.657144588942174e-06, "grad_norm": 7.375016212463379, "learning_rate": 2.513777024162781e-07, "loss": 0.5458, "mean_token_accuracy": 0.8301382064819336, "num_tokens": 22563005.0, "step": 594 }, { "epoch": 0.07569011576135352, "ewc_loss": 0.0037198010832071304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.7198010431893636e-06, "grad_norm": 7.166384696960449, "learning_rate": 2.5180161085205594e-07, "loss": 0.5268, "mean_token_accuracy": 0.835742175579071, "num_tokens": 22601333.0, "step": 595 }, { "epoch": 0.07581732603994402, "ewc_loss": 0.0036568923387676477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.6568924315361073e-06, "grad_norm": 7.1724724769592285, "learning_rate": 2.522255192878338e-07, "loss": 0.5818, "mean_token_accuracy": 0.8192142248153687, "num_tokens": 22639528.0, "step": 596 }, { "epoch": 0.07594453631853454, "ewc_loss": 0.003682429902255535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.682429905893514e-06, "grad_norm": 7.166138172149658, "learning_rate": 2.526494277236117e-07, "loss": 0.5643, "mean_token_accuracy": 0.8227243423461914, "num_tokens": 22675290.0, "step": 597 }, { "epoch": 0.07607174659712505, "ewc_loss": 0.003691200166940689, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.6912001633027103e-06, "grad_norm": 7.22000789642334, "learning_rate": 2.530733361593896e-07, "loss": 0.5388, "mean_token_accuracy": 0.8288723230361938, "num_tokens": 22712823.0, "step": 598 }, { "epoch": 0.07619895687571555, "ewc_loss": 0.0037037963047623634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.7037962101749144e-06, "grad_norm": 7.228002548217773, "learning_rate": 2.5349724459516743e-07, "loss": 0.5599, "mean_token_accuracy": 0.825318455696106, "num_tokens": 22750916.0, "step": 599 }, { "epoch": 0.07632616715430607, "ewc_loss": 0.0036992374807596207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.699237595355953e-06, "grad_norm": 7.2220611572265625, "learning_rate": 2.539211530309453e-07, "loss": 0.5118, "mean_token_accuracy": 0.8364802002906799, "num_tokens": 22782537.0, "step": 600 }, { "epoch": 0.07645337743289658, "ewc_loss": 0.003708639880642295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.7086399515828816e-06, "grad_norm": 7.325777053833008, "learning_rate": 2.543450614667232e-07, "loss": 0.6041, "mean_token_accuracy": 0.8133940100669861, "num_tokens": 22814805.0, "step": 601 }, { "epoch": 0.07658058771148708, "ewc_loss": 0.0037314752116799355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.731475317181321e-06, "grad_norm": 7.253108024597168, "learning_rate": 2.547689699025011e-07, "loss": 0.6021, "mean_token_accuracy": 0.8116050958633423, "num_tokens": 22852654.0, "step": 602 }, { "epoch": 0.0767077979900776, "ewc_loss": 0.003725949442014098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.725949454747024e-06, "grad_norm": 7.277219772338867, "learning_rate": 2.551928783382789e-07, "loss": 0.5572, "mean_token_accuracy": 0.8234388828277588, "num_tokens": 22894205.0, "step": 603 }, { "epoch": 0.07683500826866811, "ewc_loss": 0.0037324901204556227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.732490085894824e-06, "grad_norm": 7.171298027038574, "learning_rate": 2.5561678677405677e-07, "loss": 0.5233, "mean_token_accuracy": 0.8326396346092224, "num_tokens": 22924095.0, "step": 604 }, { "epoch": 0.07696221854725861, "ewc_loss": 0.0037315753288567066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.731575361598516e-06, "grad_norm": 7.130458831787109, "learning_rate": 2.5604069520983467e-07, "loss": 0.6051, "mean_token_accuracy": 0.8089252710342407, "num_tokens": 22960857.0, "step": 605 }, { "epoch": 0.07708942882584913, "ewc_loss": 0.003735180711373687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.7351808259700192e-06, "grad_norm": 7.133752822875977, "learning_rate": 2.564646036456125e-07, "loss": 0.5571, "mean_token_accuracy": 0.823346734046936, "num_tokens": 23004670.0, "step": 606 }, { "epoch": 0.07721663910443964, "ewc_loss": 0.0037452313117682934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.7452314245456364e-06, "grad_norm": 7.403529167175293, "learning_rate": 2.568885120813904e-07, "loss": 0.5573, "mean_token_accuracy": 0.8221938014030457, "num_tokens": 23041999.0, "step": 607 }, { "epoch": 0.07734384938303016, "ewc_loss": 0.00379082839936018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.7908284866716713e-06, "grad_norm": 7.166350364685059, "learning_rate": 2.5731242051716826e-07, "loss": 0.5459, "mean_token_accuracy": 0.8300257921218872, "num_tokens": 23080704.0, "step": 608 }, { "epoch": 0.07747105966162066, "ewc_loss": 0.0037475719582289457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.747572009160649e-06, "grad_norm": 7.211627960205078, "learning_rate": 2.5773632895294616e-07, "loss": 0.5809, "mean_token_accuracy": 0.8202927112579346, "num_tokens": 23118662.0, "step": 609 }, { "epoch": 0.07759826994021117, "ewc_loss": 0.003786971792578697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.786971774388803e-06, "grad_norm": 7.162473201751709, "learning_rate": 2.58160237388724e-07, "loss": 0.504, "mean_token_accuracy": 0.8368527889251709, "num_tokens": 23150367.0, "step": 610 }, { "epoch": 0.07772548021880168, "ewc_loss": 0.003782436018809676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.7824361243110616e-06, "grad_norm": 7.3729248046875, "learning_rate": 2.585841458245019e-07, "loss": 0.5621, "mean_token_accuracy": 0.8231391310691833, "num_tokens": 23185836.0, "step": 611 }, { "epoch": 0.07785269049739219, "ewc_loss": 0.0038085365667939186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.8085365758888656e-06, "grad_norm": 7.204222679138184, "learning_rate": 2.5900805426027975e-07, "loss": 0.5027, "mean_token_accuracy": 0.8402069807052612, "num_tokens": 23219346.0, "step": 612 }, { "epoch": 0.0779799007759827, "ewc_loss": 0.0037811181973665953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.7811182664881926e-06, "grad_norm": 7.208951950073242, "learning_rate": 2.5943196269605765e-07, "loss": 0.5748, "mean_token_accuracy": 0.8253000974655151, "num_tokens": 23257569.0, "step": 613 }, { "epoch": 0.07810711105457321, "ewc_loss": 0.0038088939618319273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.8088940073066624e-06, "grad_norm": 7.0842814445495605, "learning_rate": 2.598558711318355e-07, "loss": 0.5643, "mean_token_accuracy": 0.8254748582839966, "num_tokens": 23302944.0, "step": 614 }, { "epoch": 0.07823432133316371, "ewc_loss": 0.003801335347816348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.801335424213903e-06, "grad_norm": 7.187703609466553, "learning_rate": 2.602797795676134e-07, "loss": 0.5698, "mean_token_accuracy": 0.8223065137863159, "num_tokens": 23341660.0, "step": 615 }, { "epoch": 0.07836153161175423, "ewc_loss": 0.0038346038199961185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.834603830910055e-06, "grad_norm": 7.37612247467041, "learning_rate": 2.6070368800339124e-07, "loss": 0.5688, "mean_token_accuracy": 0.816596508026123, "num_tokens": 23375956.0, "step": 616 }, { "epoch": 0.07848874189034474, "ewc_loss": 0.0038547327276319265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.854732767649693e-06, "grad_norm": 7.282115936279297, "learning_rate": 2.6112759643916914e-07, "loss": 0.521, "mean_token_accuracy": 0.8336124420166016, "num_tokens": 23407568.0, "step": 617 }, { "epoch": 0.07861595216893524, "ewc_loss": 0.0038253210950642824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.8253210732364096e-06, "grad_norm": 7.260770320892334, "learning_rate": 2.61551504874947e-07, "loss": 0.554, "mean_token_accuracy": 0.8269816637039185, "num_tokens": 23440261.0, "step": 618 }, { "epoch": 0.07874316244752576, "ewc_loss": 0.003842129372060299, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.842129444819875e-06, "grad_norm": 7.255640029907227, "learning_rate": 2.619754133107249e-07, "loss": 0.535, "mean_token_accuracy": 0.8322317600250244, "num_tokens": 23481706.0, "step": 619 }, { "epoch": 0.07887037272611627, "ewc_loss": 0.003847119864076376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.847119842248503e-06, "grad_norm": 7.374706268310547, "learning_rate": 2.623993217465028e-07, "loss": 0.5958, "mean_token_accuracy": 0.8109703063964844, "num_tokens": 23515410.0, "step": 620 }, { "epoch": 0.07899758300470679, "ewc_loss": 0.0038653051014989614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.865305188810453e-06, "grad_norm": 7.251829624176025, "learning_rate": 2.6282323018228063e-07, "loss": 0.5807, "mean_token_accuracy": 0.8218872547149658, "num_tokens": 23554627.0, "step": 621 }, { "epoch": 0.07912479328329729, "ewc_loss": 0.0038521105889230967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.852110694424482e-06, "grad_norm": 7.168360710144043, "learning_rate": 2.632471386180585e-07, "loss": 0.6028, "mean_token_accuracy": 0.8114225268363953, "num_tokens": 23593400.0, "step": 622 }, { "epoch": 0.0792520035618878, "ewc_loss": 0.0038544535636901855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.854453552776249e-06, "grad_norm": 7.60891580581665, "learning_rate": 2.6367104705383637e-07, "loss": 0.6007, "mean_token_accuracy": 0.8105802536010742, "num_tokens": 23628242.0, "step": 623 }, { "epoch": 0.07937921384047832, "ewc_loss": 0.003938361536711454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.938361714972416e-06, "grad_norm": 7.417240619659424, "learning_rate": 2.6409495548961427e-07, "loss": 0.5871, "mean_token_accuracy": 0.8207350373268127, "num_tokens": 23671041.0, "step": 624 }, { "epoch": 0.07950642411906882, "ewc_loss": 0.0038711405359208584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.871140506817028e-06, "grad_norm": 7.3652544021606445, "learning_rate": 2.6451886392539206e-07, "loss": 0.5647, "mean_token_accuracy": 0.825488269329071, "num_tokens": 23701904.0, "step": 625 }, { "epoch": 0.07963363439765933, "ewc_loss": 0.003899917472153902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.899917373928474e-06, "grad_norm": 7.336487770080566, "learning_rate": 2.6494277236116996e-07, "loss": 0.5277, "mean_token_accuracy": 0.8322540521621704, "num_tokens": 23742374.0, "step": 626 }, { "epoch": 0.07976084467624985, "ewc_loss": 0.003909635357558727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.909635324816918e-06, "grad_norm": 7.400649547576904, "learning_rate": 2.6536668079694786e-07, "loss": 0.6341, "mean_token_accuracy": 0.8036903142929077, "num_tokens": 23780216.0, "step": 627 }, { "epoch": 0.07988805495484035, "ewc_loss": 0.003920786548405886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.920786639355356e-06, "grad_norm": 7.387842655181885, "learning_rate": 2.6579058923272576e-07, "loss": 0.487, "mean_token_accuracy": 0.847695529460907, "num_tokens": 23813641.0, "step": 628 }, { "epoch": 0.08001526523343086, "ewc_loss": 0.003919305745512247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.919305527233519e-06, "grad_norm": 7.333657264709473, "learning_rate": 2.6621449766850356e-07, "loss": 0.5731, "mean_token_accuracy": 0.8223084807395935, "num_tokens": 23850189.0, "step": 629 }, { "epoch": 0.08014247551202137, "ewc_loss": 0.003912495914846659, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.912496140401345e-06, "grad_norm": 7.289188861846924, "learning_rate": 2.6663840610428145e-07, "loss": 0.5268, "mean_token_accuracy": 0.833908200263977, "num_tokens": 23884410.0, "step": 630 }, { "epoch": 0.08026968579061187, "ewc_loss": 0.003923109266906977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.923109488823684e-06, "grad_norm": 7.290462493896484, "learning_rate": 2.6706231454005935e-07, "loss": 0.5371, "mean_token_accuracy": 0.8310571908950806, "num_tokens": 23930139.0, "step": 631 }, { "epoch": 0.08039689606920239, "ewc_loss": 0.0039205108769237995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.920511062460719e-06, "grad_norm": 7.118284702301025, "learning_rate": 2.6748622297583725e-07, "loss": 0.5346, "mean_token_accuracy": 0.8325585126876831, "num_tokens": 23973124.0, "step": 632 }, { "epoch": 0.0805241063477929, "ewc_loss": 0.003911340609192848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.911340627382742e-06, "grad_norm": 7.268191814422607, "learning_rate": 2.6791013141161505e-07, "loss": 0.5434, "mean_token_accuracy": 0.8306018114089966, "num_tokens": 24008676.0, "step": 633 }, { "epoch": 0.08065131662638342, "ewc_loss": 0.0039448910392820835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.944890977436444e-06, "grad_norm": 7.341336250305176, "learning_rate": 2.6833403984739294e-07, "loss": 0.5413, "mean_token_accuracy": 0.8290817737579346, "num_tokens": 24044880.0, "step": 634 }, { "epoch": 0.08077852690497392, "ewc_loss": 0.003956486936658621, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.956487034884049e-06, "grad_norm": 7.220371246337891, "learning_rate": 2.6875794828317084e-07, "loss": 0.5321, "mean_token_accuracy": 0.836337685585022, "num_tokens": 24090140.0, "step": 635 }, { "epoch": 0.08090573718356443, "ewc_loss": 0.003935391083359718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.9353913052764256e-06, "grad_norm": 7.429671764373779, "learning_rate": 2.6918185671894874e-07, "loss": 0.6175, "mean_token_accuracy": 0.8114154934883118, "num_tokens": 24123311.0, "step": 636 }, { "epoch": 0.08103294746215495, "ewc_loss": 0.003990825265645981, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.990825462096836e-06, "grad_norm": 7.349060535430908, "learning_rate": 2.6960576515472654e-07, "loss": 0.6511, "mean_token_accuracy": 0.797637939453125, "num_tokens": 24164019.0, "step": 637 }, { "epoch": 0.08116015774074545, "ewc_loss": 0.003962479531764984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.9624796954740304e-06, "grad_norm": 7.2981390953063965, "learning_rate": 2.7002967359050443e-07, "loss": 0.5182, "mean_token_accuracy": 0.8369762301445007, "num_tokens": 24205986.0, "step": 638 }, { "epoch": 0.08128736801933596, "ewc_loss": 0.003958963789045811, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.958963588956976e-06, "grad_norm": 7.272194862365723, "learning_rate": 2.7045358202628233e-07, "loss": 0.5756, "mean_token_accuracy": 0.8171404004096985, "num_tokens": 24251956.0, "step": 639 }, { "epoch": 0.08141457829792648, "ewc_loss": 0.003976491745561361, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.976491825596895e-06, "grad_norm": 7.244899272918701, "learning_rate": 2.7087749046206023e-07, "loss": 0.5208, "mean_token_accuracy": 0.8334391117095947, "num_tokens": 24284990.0, "step": 640 }, { "epoch": 0.08154178857651698, "ewc_loss": 0.003978890832513571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.978890617872821e-06, "grad_norm": 7.37060022354126, "learning_rate": 2.71301398897838e-07, "loss": 0.5764, "mean_token_accuracy": 0.8189292550086975, "num_tokens": 24323305.0, "step": 641 }, { "epoch": 0.08166899885510749, "ewc_loss": 0.0039999657310545444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.999965883849654e-06, "grad_norm": 7.255589962005615, "learning_rate": 2.717253073336159e-07, "loss": 0.4877, "mean_token_accuracy": 0.8446201086044312, "num_tokens": 24365847.0, "step": 642 }, { "epoch": 0.081796209133698, "ewc_loss": 0.003982591908425093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.982591806561686e-06, "grad_norm": 7.370284080505371, "learning_rate": 2.721492157693938e-07, "loss": 0.495, "mean_token_accuracy": 0.8421735167503357, "num_tokens": 24407269.0, "step": 643 }, { "epoch": 0.0819234194122885, "ewc_loss": 0.004005255177617073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.005255050287815e-06, "grad_norm": 7.413078308105469, "learning_rate": 2.7257312420517167e-07, "loss": 0.5525, "mean_token_accuracy": 0.8254009485244751, "num_tokens": 24444990.0, "step": 644 }, { "epoch": 0.08205062969087902, "ewc_loss": 0.004010929260402918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.010929387732176e-06, "grad_norm": 7.269196033477783, "learning_rate": 2.729970326409495e-07, "loss": 0.5829, "mean_token_accuracy": 0.8199290037155151, "num_tokens": 24487293.0, "step": 645 }, { "epoch": 0.08217783996946953, "ewc_loss": 0.00398537190631032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.985371677117655e-06, "grad_norm": 7.422852516174316, "learning_rate": 2.734209410767274e-07, "loss": 0.5449, "mean_token_accuracy": 0.8290958404541016, "num_tokens": 24521403.0, "step": 646 }, { "epoch": 0.08230505024806005, "ewc_loss": 0.004024431109428406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.024431291327346e-06, "grad_norm": 7.3436102867126465, "learning_rate": 2.738448495125053e-07, "loss": 0.5528, "mean_token_accuracy": 0.8289819359779358, "num_tokens": 24562740.0, "step": 647 }, { "epoch": 0.08243226052665055, "ewc_loss": 0.004009434022009373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.009434178442461e-06, "grad_norm": 7.289801120758057, "learning_rate": 2.7426875794828316e-07, "loss": 0.5606, "mean_token_accuracy": 0.8192341327667236, "num_tokens": 24597507.0, "step": 648 }, { "epoch": 0.08255947080524106, "ewc_loss": 0.0040190857835114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.019085736217676e-06, "grad_norm": 7.3530473709106445, "learning_rate": 2.74692666384061e-07, "loss": 0.5222, "mean_token_accuracy": 0.8333437442779541, "num_tokens": 24635629.0, "step": 649 }, { "epoch": 0.08268668108383158, "ewc_loss": 0.0040328046306967735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.032804554299219e-06, "grad_norm": 7.329703330993652, "learning_rate": 2.751165748198389e-07, "loss": 0.5479, "mean_token_accuracy": 0.8258308172225952, "num_tokens": 24676087.0, "step": 650 }, { "epoch": 0.08281389136242208, "ewc_loss": 0.004032061900943518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.032061951875221e-06, "grad_norm": 7.427306175231934, "learning_rate": 2.755404832556168e-07, "loss": 0.5668, "mean_token_accuracy": 0.8223755955696106, "num_tokens": 24713652.0, "step": 651 }, { "epoch": 0.0829411016410126, "ewc_loss": 0.004052407573908567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.052407348353881e-06, "grad_norm": 7.286339282989502, "learning_rate": 2.7596439169139465e-07, "loss": 0.5491, "mean_token_accuracy": 0.8282400369644165, "num_tokens": 24750130.0, "step": 652 }, { "epoch": 0.08306831191960311, "ewc_loss": 0.004032732453197241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.032732249470428e-06, "grad_norm": 7.471740245819092, "learning_rate": 2.763883001271725e-07, "loss": 0.5989, "mean_token_accuracy": 0.8119896054267883, "num_tokens": 24787337.0, "step": 653 }, { "epoch": 0.08319552219819361, "ewc_loss": 0.00408130744472146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.081307451997418e-06, "grad_norm": 7.464869976043701, "learning_rate": 2.768122085629504e-07, "loss": 0.5905, "mean_token_accuracy": 0.8086401224136353, "num_tokens": 24822736.0, "step": 654 }, { "epoch": 0.08332273247678412, "ewc_loss": 0.0040697804652154446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.069780516147148e-06, "grad_norm": 7.37197732925415, "learning_rate": 2.772361169987283e-07, "loss": 0.5311, "mean_token_accuracy": 0.8359166383743286, "num_tokens": 24858743.0, "step": 655 }, { "epoch": 0.08344994275537464, "ewc_loss": 0.004060928709805012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.060928858962143e-06, "grad_norm": 7.412881851196289, "learning_rate": 2.7766002543450614e-07, "loss": 0.5399, "mean_token_accuracy": 0.8273303508758545, "num_tokens": 24901286.0, "step": 656 }, { "epoch": 0.08357715303396514, "ewc_loss": 0.004075048957020044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.0750487642071676e-06, "grad_norm": 7.3783369064331055, "learning_rate": 2.78083933870284e-07, "loss": 0.6108, "mean_token_accuracy": 0.8111220002174377, "num_tokens": 24943457.0, "step": 657 }, { "epoch": 0.08370436331255565, "ewc_loss": 0.004068599082529545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.068599082529545e-06, "grad_norm": 7.4293646812438965, "learning_rate": 2.785078423060619e-07, "loss": 0.5496, "mean_token_accuracy": 0.8249130249023438, "num_tokens": 24979247.0, "step": 658 }, { "epoch": 0.08383157359114617, "ewc_loss": 0.0040918136946856976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.091813480044948e-06, "grad_norm": 7.411806106567383, "learning_rate": 2.789317507418398e-07, "loss": 0.5612, "mean_token_accuracy": 0.8224286437034607, "num_tokens": 25017456.0, "step": 659 }, { "epoch": 0.08395878386973668, "ewc_loss": 0.0040916684083640575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.091668415640015e-06, "grad_norm": 7.408743381500244, "learning_rate": 2.7935565917761763e-07, "loss": 0.5182, "mean_token_accuracy": 0.8332744836807251, "num_tokens": 25054682.0, "step": 660 }, { "epoch": 0.08408599414832718, "ewc_loss": 0.004103401210159063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.103401352040237e-06, "grad_norm": 7.66235876083374, "learning_rate": 2.797795676133955e-07, "loss": 0.5925, "mean_token_accuracy": 0.8131225109100342, "num_tokens": 25092501.0, "step": 661 }, { "epoch": 0.0842132044269177, "ewc_loss": 0.004139491356909275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.139491466048639e-06, "grad_norm": 7.420248985290527, "learning_rate": 2.802034760491734e-07, "loss": 0.459, "mean_token_accuracy": 0.8535194396972656, "num_tokens": 25129253.0, "step": 662 }, { "epoch": 0.08434041470550821, "ewc_loss": 0.004085797816514969, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.085797627340071e-06, "grad_norm": 7.216893672943115, "learning_rate": 2.806273844849512e-07, "loss": 0.4986, "mean_token_accuracy": 0.8422826528549194, "num_tokens": 25169119.0, "step": 663 }, { "epoch": 0.08446762498409871, "ewc_loss": 0.004091898910701275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.091898972546915e-06, "grad_norm": 7.538860321044922, "learning_rate": 2.810512929207291e-07, "loss": 0.5702, "mean_token_accuracy": 0.8149495124816895, "num_tokens": 25207654.0, "step": 664 }, { "epoch": 0.08459483526268922, "ewc_loss": 0.004162042401731014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.162042387179099e-06, "grad_norm": 7.363222599029541, "learning_rate": 2.8147520135650697e-07, "loss": 0.5146, "mean_token_accuracy": 0.833468496799469, "num_tokens": 25243258.0, "step": 665 }, { "epoch": 0.08472204554127974, "ewc_loss": 0.004102837294340134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.102837465325138e-06, "grad_norm": 7.353979587554932, "learning_rate": 2.8189910979228487e-07, "loss": 0.5546, "mean_token_accuracy": 0.8274434208869934, "num_tokens": 25281301.0, "step": 666 }, { "epoch": 0.08484925581987024, "ewc_loss": 0.004134529270231724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.134529262955766e-06, "grad_norm": 7.43433952331543, "learning_rate": 2.823230182280627e-07, "loss": 0.5048, "mean_token_accuracy": 0.8398503065109253, "num_tokens": 25313500.0, "step": 667 }, { "epoch": 0.08497646609846075, "ewc_loss": 0.004161221906542778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.1612220229581e-06, "grad_norm": 7.332987308502197, "learning_rate": 2.827469266638406e-07, "loss": 0.5429, "mean_token_accuracy": 0.828248143196106, "num_tokens": 25355121.0, "step": 668 }, { "epoch": 0.08510367637705127, "ewc_loss": 0.004138185176998377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.1381849769095425e-06, "grad_norm": 7.29673957824707, "learning_rate": 2.8317083509961846e-07, "loss": 0.5585, "mean_token_accuracy": 0.8259774446487427, "num_tokens": 25398868.0, "step": 669 }, { "epoch": 0.08523088665564178, "ewc_loss": 0.004157855175435543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.1578550735721365e-06, "grad_norm": 7.382999897003174, "learning_rate": 2.8359474353539636e-07, "loss": 0.5371, "mean_token_accuracy": 0.8292016386985779, "num_tokens": 25438670.0, "step": 670 }, { "epoch": 0.08535809693423228, "ewc_loss": 0.00417928583920002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.179285951977363e-06, "grad_norm": 7.437513828277588, "learning_rate": 2.840186519711742e-07, "loss": 0.5843, "mean_token_accuracy": 0.8208475112915039, "num_tokens": 25479243.0, "step": 671 }, { "epoch": 0.0854853072128228, "ewc_loss": 0.004190268460661173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.190268555248622e-06, "grad_norm": 7.493671894073486, "learning_rate": 2.844425604069521e-07, "loss": 0.568, "mean_token_accuracy": 0.8197603225708008, "num_tokens": 25514615.0, "step": 672 }, { "epoch": 0.08561251749141331, "ewc_loss": 0.00420304574072361, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.203045591566479e-06, "grad_norm": 7.557240962982178, "learning_rate": 2.8486646884272995e-07, "loss": 0.5455, "mean_token_accuracy": 0.8275424242019653, "num_tokens": 25552753.0, "step": 673 }, { "epoch": 0.08573972777000381, "ewc_loss": 0.004202958662062883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.2029587348224595e-06, "grad_norm": 7.401675701141357, "learning_rate": 2.8529037727850785e-07, "loss": 0.4819, "mean_token_accuracy": 0.8445488214492798, "num_tokens": 25587415.0, "step": 674 }, { "epoch": 0.08586693804859433, "ewc_loss": 0.004187896382063627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.187896593066398e-06, "grad_norm": 7.363788604736328, "learning_rate": 2.857142857142857e-07, "loss": 0.513, "mean_token_accuracy": 0.8392720222473145, "num_tokens": 25625962.0, "step": 675 }, { "epoch": 0.08599414832718484, "ewc_loss": 0.004197585862129927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.197585894871736e-06, "grad_norm": 7.551448345184326, "learning_rate": 2.861381941500636e-07, "loss": 0.561, "mean_token_accuracy": 0.8230307102203369, "num_tokens": 25669680.0, "step": 676 }, { "epoch": 0.08612135860577534, "ewc_loss": 0.004231669474393129, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.231669663568027e-06, "grad_norm": 7.5292067527771, "learning_rate": 2.8656210258584144e-07, "loss": 0.5966, "mean_token_accuracy": 0.8095893859863281, "num_tokens": 25709221.0, "step": 677 }, { "epoch": 0.08624856888436586, "ewc_loss": 0.004218473099172115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.218472895445302e-06, "grad_norm": 7.681221961975098, "learning_rate": 2.869860110216193e-07, "loss": 0.5967, "mean_token_accuracy": 0.8103411793708801, "num_tokens": 25741880.0, "step": 678 }, { "epoch": 0.08637577916295637, "ewc_loss": 0.0042495643720030785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.2495644265727606e-06, "grad_norm": 7.409604072570801, "learning_rate": 2.874099194573972e-07, "loss": 0.529, "mean_token_accuracy": 0.8322601318359375, "num_tokens": 25786282.0, "step": 679 }, { "epoch": 0.08650298944154687, "ewc_loss": 0.004206100478768349, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.206100584269734e-06, "grad_norm": 7.453587055206299, "learning_rate": 2.878338278931751e-07, "loss": 0.5759, "mean_token_accuracy": 0.819945216178894, "num_tokens": 25820229.0, "step": 680 }, { "epoch": 0.08663019972013739, "ewc_loss": 0.004241808783262968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.241808710503392e-06, "grad_norm": 7.437098026275635, "learning_rate": 2.8825773632895293e-07, "loss": 0.5104, "mean_token_accuracy": 0.841906726360321, "num_tokens": 25856465.0, "step": 681 }, { "epoch": 0.0867574099987279, "ewc_loss": 0.004233475774526596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.233475920045748e-06, "grad_norm": 7.426371097564697, "learning_rate": 2.886816447647308e-07, "loss": 0.5096, "mean_token_accuracy": 0.8383582830429077, "num_tokens": 25897407.0, "step": 682 }, { "epoch": 0.08688462027731841, "ewc_loss": 0.0042556035332381725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.255603471392533e-06, "grad_norm": 7.492711067199707, "learning_rate": 2.891055532005087e-07, "loss": 0.5324, "mean_token_accuracy": 0.8321801424026489, "num_tokens": 25938825.0, "step": 683 }, { "epoch": 0.08701183055590891, "ewc_loss": 0.004262469708919525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.2624697016435675e-06, "grad_norm": 7.479849338531494, "learning_rate": 2.8952946163628657e-07, "loss": 0.5639, "mean_token_accuracy": 0.8195412755012512, "num_tokens": 25974944.0, "step": 684 }, { "epoch": 0.08713904083449943, "ewc_loss": 0.004263164009898901, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.263164100848371e-06, "grad_norm": 7.52025032043457, "learning_rate": 2.899533700720644e-07, "loss": 0.5219, "mean_token_accuracy": 0.8323460221290588, "num_tokens": 26012864.0, "step": 685 }, { "epoch": 0.08726625111308994, "ewc_loss": 0.00427903700619936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.279037057131063e-06, "grad_norm": 7.418582439422607, "learning_rate": 2.9037727850784227e-07, "loss": 0.5682, "mean_token_accuracy": 0.8216203451156616, "num_tokens": 26049779.0, "step": 686 }, { "epoch": 0.08739346139168044, "ewc_loss": 0.0042680674232542515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.26806764153298e-06, "grad_norm": 7.4645466804504395, "learning_rate": 2.9080118694362016e-07, "loss": 0.5383, "mean_token_accuracy": 0.8305039405822754, "num_tokens": 26089095.0, "step": 687 }, { "epoch": 0.08752067167027096, "ewc_loss": 0.004285774659365416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.285774593881797e-06, "grad_norm": 7.374009609222412, "learning_rate": 2.9122509537939806e-07, "loss": 0.5379, "mean_token_accuracy": 0.8308423757553101, "num_tokens": 26132553.0, "step": 688 }, { "epoch": 0.08764788194886147, "ewc_loss": 0.004274864215403795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.274864295439329e-06, "grad_norm": 7.45374059677124, "learning_rate": 2.916490038151759e-07, "loss": 0.5882, "mean_token_accuracy": 0.8126552700996399, "num_tokens": 26171301.0, "step": 689 }, { "epoch": 0.08777509222745197, "ewc_loss": 0.004293531179428101, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.2935312194458675e-06, "grad_norm": 7.462558269500732, "learning_rate": 2.9207291225095376e-07, "loss": 0.5755, "mean_token_accuracy": 0.8171422481536865, "num_tokens": 26217871.0, "step": 690 }, { "epoch": 0.08790230250604249, "ewc_loss": 0.004293206613510847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.2932065298373345e-06, "grad_norm": 7.3933868408203125, "learning_rate": 2.9249682068673166e-07, "loss": 0.5392, "mean_token_accuracy": 0.8274285793304443, "num_tokens": 26255927.0, "step": 691 }, { "epoch": 0.088029512784633, "ewc_loss": 0.004291292279958725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.291292498237453e-06, "grad_norm": 7.521371841430664, "learning_rate": 2.9292072912250955e-07, "loss": 0.5225, "mean_token_accuracy": 0.8364301919937134, "num_tokens": 26296041.0, "step": 692 }, { "epoch": 0.0881567230632235, "ewc_loss": 0.004315818659961224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.315818841860164e-06, "grad_norm": 7.5212883949279785, "learning_rate": 2.933446375582874e-07, "loss": 0.6046, "mean_token_accuracy": 0.8114738464355469, "num_tokens": 26333785.0, "step": 693 }, { "epoch": 0.08828393334181402, "ewc_loss": 0.00430343160405755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.303431524022017e-06, "grad_norm": 7.507277965545654, "learning_rate": 2.9376854599406525e-07, "loss": 0.5109, "mean_token_accuracy": 0.8357145190238953, "num_tokens": 26368837.0, "step": 694 }, { "epoch": 0.08841114362040453, "ewc_loss": 0.004306223709136248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.3062236727564596e-06, "grad_norm": 7.573521137237549, "learning_rate": 2.9419245442984315e-07, "loss": 0.5861, "mean_token_accuracy": 0.8176381587982178, "num_tokens": 26402151.0, "step": 695 }, { "epoch": 0.08853835389899505, "ewc_loss": 0.00432550348341465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.325503596191993e-06, "grad_norm": 7.506876468658447, "learning_rate": 2.9461636286562104e-07, "loss": 0.5381, "mean_token_accuracy": 0.8284311890602112, "num_tokens": 26435550.0, "step": 696 }, { "epoch": 0.08866556417758555, "ewc_loss": 0.004315683618187904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.315683781896951e-06, "grad_norm": 7.459182262420654, "learning_rate": 2.9504027130139884e-07, "loss": 0.5279, "mean_token_accuracy": 0.8333017826080322, "num_tokens": 26479299.0, "step": 697 }, { "epoch": 0.08879277445617606, "ewc_loss": 0.004314012825489044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.314013040129794e-06, "grad_norm": 7.47184419631958, "learning_rate": 2.9546417973717674e-07, "loss": 0.5648, "mean_token_accuracy": 0.8215467929840088, "num_tokens": 26519380.0, "step": 698 }, { "epoch": 0.08891998473476657, "ewc_loss": 0.004330435302108526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.330435331212357e-06, "grad_norm": 7.491747856140137, "learning_rate": 2.9588808817295464e-07, "loss": 0.491, "mean_token_accuracy": 0.8419569134712219, "num_tokens": 26557306.0, "step": 699 }, { "epoch": 0.08904719501335707, "ewc_loss": 0.004339769948273897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.339769930084003e-06, "grad_norm": 7.69190788269043, "learning_rate": 2.9631199660873253e-07, "loss": 0.5613, "mean_token_accuracy": 0.8234524726867676, "num_tokens": 26593549.0, "step": 700 }, { "epoch": 0.08917440529194759, "ewc_loss": 0.004378539510071278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.378539415483829e-06, "grad_norm": 7.523373126983643, "learning_rate": 2.9673590504451033e-07, "loss": 0.4856, "mean_token_accuracy": 0.8466492295265198, "num_tokens": 26629926.0, "step": 701 }, { "epoch": 0.0893016155705381, "ewc_loss": 0.004330753348767757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.330753199610626e-06, "grad_norm": 7.491402626037598, "learning_rate": 2.9715981348028823e-07, "loss": 0.5181, "mean_token_accuracy": 0.8366631865501404, "num_tokens": 26668652.0, "step": 702 }, { "epoch": 0.0894288258491286, "ewc_loss": 0.004356426186859608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.356425961304922e-06, "grad_norm": 7.636791229248047, "learning_rate": 2.975837219160661e-07, "loss": 0.5802, "mean_token_accuracy": 0.8168724775314331, "num_tokens": 26708084.0, "step": 703 }, { "epoch": 0.08955603612771912, "ewc_loss": 0.004375203978270292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.375203843665076e-06, "grad_norm": 7.574222087860107, "learning_rate": 2.98007630351844e-07, "loss": 0.5463, "mean_token_accuracy": 0.8281139135360718, "num_tokens": 26750134.0, "step": 704 }, { "epoch": 0.08968324640630963, "ewc_loss": 0.004358373582363129, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.358373644208768e-06, "grad_norm": 7.6764116287231445, "learning_rate": 2.984315387876218e-07, "loss": 0.5545, "mean_token_accuracy": 0.8238237500190735, "num_tokens": 26785410.0, "step": 705 }, { "epoch": 0.08981045668490013, "ewc_loss": 0.004385719541460276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.385719421406975e-06, "grad_norm": 7.596452713012695, "learning_rate": 2.988554472233997e-07, "loss": 0.5584, "mean_token_accuracy": 0.8265796303749084, "num_tokens": 26825226.0, "step": 706 }, { "epoch": 0.08993766696349065, "ewc_loss": 0.004373770672827959, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.373770480015082e-06, "grad_norm": 7.521562576293945, "learning_rate": 2.992793556591776e-07, "loss": 0.5516, "mean_token_accuracy": 0.8253829479217529, "num_tokens": 26865401.0, "step": 707 }, { "epoch": 0.09006487724208116, "ewc_loss": 0.004368359688669443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.368359896034235e-06, "grad_norm": 7.498547077178955, "learning_rate": 2.997032640949555e-07, "loss": 0.5616, "mean_token_accuracy": 0.824439525604248, "num_tokens": 26905365.0, "step": 708 }, { "epoch": 0.09019208752067168, "ewc_loss": 0.004385152831673622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.3851528062077705e-06, "grad_norm": 7.540804386138916, "learning_rate": 3.001271725307333e-07, "loss": 0.5026, "mean_token_accuracy": 0.8400956988334656, "num_tokens": 26941348.0, "step": 709 }, { "epoch": 0.09031929779926218, "ewc_loss": 0.004400364123284817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.400364105094923e-06, "grad_norm": 7.622807025909424, "learning_rate": 3.005510809665112e-07, "loss": 0.5377, "mean_token_accuracy": 0.83046954870224, "num_tokens": 26975803.0, "step": 710 }, { "epoch": 0.09044650807785269, "ewc_loss": 0.004417294170707464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.417294348968426e-06, "grad_norm": 7.630709648132324, "learning_rate": 3.009749894022891e-07, "loss": 0.4835, "mean_token_accuracy": 0.8456529378890991, "num_tokens": 27014931.0, "step": 711 }, { "epoch": 0.0905737183564432, "ewc_loss": 0.004403029568493366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.403029379318468e-06, "grad_norm": 7.527478218078613, "learning_rate": 3.01398897838067e-07, "loss": 0.5407, "mean_token_accuracy": 0.8308919668197632, "num_tokens": 27055505.0, "step": 712 }, { "epoch": 0.0907009286350337, "ewc_loss": 0.004406488500535488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.406488642416662e-06, "grad_norm": 7.616127967834473, "learning_rate": 3.018228062738448e-07, "loss": 0.5678, "mean_token_accuracy": 0.824155330657959, "num_tokens": 27092121.0, "step": 713 }, { "epoch": 0.09082813891362422, "ewc_loss": 0.004427408333867788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.427408384799492e-06, "grad_norm": 7.564854145050049, "learning_rate": 3.022467147096227e-07, "loss": 0.5051, "mean_token_accuracy": 0.8397316336631775, "num_tokens": 27129827.0, "step": 714 }, { "epoch": 0.09095534919221474, "ewc_loss": 0.00442130072042346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.4213006731297355e-06, "grad_norm": 7.541112899780273, "learning_rate": 3.026706231454006e-07, "loss": 0.522, "mean_token_accuracy": 0.8336969614028931, "num_tokens": 27174977.0, "step": 715 }, { "epoch": 0.09108255947080524, "ewc_loss": 0.0044417125172913074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.441712462721625e-06, "grad_norm": 7.708664894104004, "learning_rate": 3.0309453158117844e-07, "loss": 0.5279, "mean_token_accuracy": 0.8317605257034302, "num_tokens": 27211472.0, "step": 716 }, { "epoch": 0.09120976974939575, "ewc_loss": 0.004461449570953846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.4614494072448e-06, "grad_norm": 7.6318511962890625, "learning_rate": 3.035184400169563e-07, "loss": 0.6105, "mean_token_accuracy": 0.8116955757141113, "num_tokens": 27249160.0, "step": 717 }, { "epoch": 0.09133698002798626, "ewc_loss": 0.004443641752004623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.443641955731437e-06, "grad_norm": 7.745664596557617, "learning_rate": 3.039423484527342e-07, "loss": 0.5192, "mean_token_accuracy": 0.8336893320083618, "num_tokens": 27287946.0, "step": 718 }, { "epoch": 0.09146419030657676, "ewc_loss": 0.0044694035314023495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.469403393159155e-06, "grad_norm": 7.601171970367432, "learning_rate": 3.043662568885121e-07, "loss": 0.5529, "mean_token_accuracy": 0.8202266693115234, "num_tokens": 27323388.0, "step": 719 }, { "epoch": 0.09159140058516728, "ewc_loss": 0.004448047373443842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.448047548066825e-06, "grad_norm": 7.688189506530762, "learning_rate": 3.0479016532428993e-07, "loss": 0.6008, "mean_token_accuracy": 0.8110003471374512, "num_tokens": 27357524.0, "step": 720 }, { "epoch": 0.0917186108637578, "ewc_loss": 0.004482526332139969, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.482526492211036e-06, "grad_norm": 7.654862403869629, "learning_rate": 3.052140737600678e-07, "loss": 0.5072, "mean_token_accuracy": 0.8377057313919067, "num_tokens": 27392037.0, "step": 721 }, { "epoch": 0.09184582114234831, "ewc_loss": 0.004472394939512014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.472394721233286e-06, "grad_norm": 7.520678997039795, "learning_rate": 3.056379821958457e-07, "loss": 0.5193, "mean_token_accuracy": 0.8355110883712769, "num_tokens": 27429149.0, "step": 722 }, { "epoch": 0.09197303142093881, "ewc_loss": 0.004474443383514881, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.4744433580490295e-06, "grad_norm": 7.633563995361328, "learning_rate": 3.060618906316236e-07, "loss": 0.5246, "mean_token_accuracy": 0.8356041312217712, "num_tokens": 27465023.0, "step": 723 }, { "epoch": 0.09210024169952932, "ewc_loss": 0.004514321684837341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.514321517490316e-06, "grad_norm": 7.6368327140808105, "learning_rate": 3.064857990674014e-07, "loss": 0.5156, "mean_token_accuracy": 0.8379541039466858, "num_tokens": 27501268.0, "step": 724 }, { "epoch": 0.09222745197811984, "ewc_loss": 0.004503741394728422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.503741365624592e-06, "grad_norm": 7.66057825088501, "learning_rate": 3.0690970750317927e-07, "loss": 0.5682, "mean_token_accuracy": 0.8211438655853271, "num_tokens": 27539937.0, "step": 725 }, { "epoch": 0.09235466225671034, "ewc_loss": 0.004517627414315939, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.517627530731261e-06, "grad_norm": 7.692842960357666, "learning_rate": 3.0733361593895717e-07, "loss": 0.5299, "mean_token_accuracy": 0.8318036198616028, "num_tokens": 27574576.0, "step": 726 }, { "epoch": 0.09248187253530085, "ewc_loss": 0.004512759856879711, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.512759915087372e-06, "grad_norm": 7.645793914794922, "learning_rate": 3.0775752437473507e-07, "loss": 0.5385, "mean_token_accuracy": 0.8255300521850586, "num_tokens": 27616036.0, "step": 727 }, { "epoch": 0.09260908281389137, "ewc_loss": 0.004520030226558447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.520030415733345e-06, "grad_norm": 7.600572109222412, "learning_rate": 3.081814328105129e-07, "loss": 0.5622, "mean_token_accuracy": 0.822781503200531, "num_tokens": 27662040.0, "step": 728 }, { "epoch": 0.09273629309248187, "ewc_loss": 0.004525168798863888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.525168606051011e-06, "grad_norm": 7.6234893798828125, "learning_rate": 3.0860534124629076e-07, "loss": 0.5911, "mean_token_accuracy": 0.8135236501693726, "num_tokens": 27705136.0, "step": 729 }, { "epoch": 0.09286350337107238, "ewc_loss": 0.004538430366665125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.538430403044913e-06, "grad_norm": 7.686649799346924, "learning_rate": 3.0902924968206866e-07, "loss": 0.5411, "mean_token_accuracy": 0.8287477493286133, "num_tokens": 27740564.0, "step": 730 }, { "epoch": 0.0929907136496629, "ewc_loss": 0.0045502400025725365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.5502401917474344e-06, "grad_norm": 7.840452671051025, "learning_rate": 3.0945315811784656e-07, "loss": 0.5133, "mean_token_accuracy": 0.8348547220230103, "num_tokens": 27774788.0, "step": 731 }, { "epoch": 0.0931179239282534, "ewc_loss": 0.0045777615159749985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.577761501423083e-06, "grad_norm": 7.6244707107543945, "learning_rate": 3.098770665536244e-07, "loss": 0.6589, "mean_token_accuracy": 0.7908710837364197, "num_tokens": 27818300.0, "step": 732 }, { "epoch": 0.09324513420684391, "ewc_loss": 0.00452976580709219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.529765647021122e-06, "grad_norm": 7.666248321533203, "learning_rate": 3.1030097498940225e-07, "loss": 0.5012, "mean_token_accuracy": 0.8397356867790222, "num_tokens": 27857739.0, "step": 733 }, { "epoch": 0.09337234448543442, "ewc_loss": 0.004575124476104975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.575124421535293e-06, "grad_norm": 7.744977951049805, "learning_rate": 3.1072488342518015e-07, "loss": 0.5409, "mean_token_accuracy": 0.8235355019569397, "num_tokens": 27893897.0, "step": 734 }, { "epoch": 0.09349955476402494, "ewc_loss": 0.004585456568747759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.585456736094784e-06, "grad_norm": 7.603727340698242, "learning_rate": 3.11148791860958e-07, "loss": 0.5271, "mean_token_accuracy": 0.8305493593215942, "num_tokens": 27930511.0, "step": 735 }, { "epoch": 0.09362676504261544, "ewc_loss": 0.004549712408334017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.549712230073055e-06, "grad_norm": 7.768916606903076, "learning_rate": 3.115727002967359e-07, "loss": 0.5452, "mean_token_accuracy": 0.826348066329956, "num_tokens": 27961128.0, "step": 736 }, { "epoch": 0.09375397532120595, "ewc_loss": 0.00460849842056632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.60849832961685e-06, "grad_norm": 7.646435260772705, "learning_rate": 3.1199660873251374e-07, "loss": 0.5455, "mean_token_accuracy": 0.8289345502853394, "num_tokens": 28000748.0, "step": 737 }, { "epoch": 0.09388118559979647, "ewc_loss": 0.004574924241751432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.574924332700903e-06, "grad_norm": 7.60281229019165, "learning_rate": 3.1242051716829164e-07, "loss": 0.5788, "mean_token_accuracy": 0.8149600028991699, "num_tokens": 28046455.0, "step": 738 }, { "epoch": 0.09400839587838697, "ewc_loss": 0.004587839357554913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.587839157466078e-06, "grad_norm": 7.679741859436035, "learning_rate": 3.128444256040695e-07, "loss": 0.5549, "mean_token_accuracy": 0.8257454633712769, "num_tokens": 28087026.0, "step": 739 }, { "epoch": 0.09413560615697748, "ewc_loss": 0.00460880296304822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.608803010341944e-06, "grad_norm": 7.603694915771484, "learning_rate": 3.132683340398474e-07, "loss": 0.4778, "mean_token_accuracy": 0.84546959400177, "num_tokens": 28126382.0, "step": 740 }, { "epoch": 0.094262816435568, "ewc_loss": 0.004611302632838488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.611302756529767e-06, "grad_norm": 7.675550937652588, "learning_rate": 3.1369224247562523e-07, "loss": 0.5047, "mean_token_accuracy": 0.8402376770973206, "num_tokens": 28161198.0, "step": 741 }, { "epoch": 0.0943900267141585, "ewc_loss": 0.004621954634785652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.621954758476932e-06, "grad_norm": 7.657515048980713, "learning_rate": 3.1411615091140313e-07, "loss": 0.4735, "mean_token_accuracy": 0.8494409322738647, "num_tokens": 28204612.0, "step": 742 }, { "epoch": 0.09451723699274901, "ewc_loss": 0.004622349515557289, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.622349479177501e-06, "grad_norm": 7.609181880950928, "learning_rate": 3.14540059347181e-07, "loss": 0.5277, "mean_token_accuracy": 0.8321778178215027, "num_tokens": 28241050.0, "step": 743 }, { "epoch": 0.09464444727133953, "ewc_loss": 0.004626590292900801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.626590452971868e-06, "grad_norm": 7.715063095092773, "learning_rate": 3.149639677829589e-07, "loss": 0.4878, "mean_token_accuracy": 0.8420574069023132, "num_tokens": 28278032.0, "step": 744 }, { "epoch": 0.09477165754993004, "ewc_loss": 0.004644747357815504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.644747150450712e-06, "grad_norm": 7.751466274261475, "learning_rate": 3.153878762187368e-07, "loss": 0.5748, "mean_token_accuracy": 0.822908878326416, "num_tokens": 28315599.0, "step": 745 }, { "epoch": 0.09489886782852054, "ewc_loss": 0.004647655412554741, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.647655259759631e-06, "grad_norm": 7.645275592803955, "learning_rate": 3.158117846545146e-07, "loss": 0.5984, "mean_token_accuracy": 0.8144811391830444, "num_tokens": 28358504.0, "step": 746 }, { "epoch": 0.09502607810711106, "ewc_loss": 0.0046335505321621895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.6335503611771856e-06, "grad_norm": 7.778705596923828, "learning_rate": 3.1623569309029247e-07, "loss": 0.5147, "mean_token_accuracy": 0.836819589138031, "num_tokens": 28389626.0, "step": 747 }, { "epoch": 0.09515328838570157, "ewc_loss": 0.004667087458074093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.667087523557711e-06, "grad_norm": 7.678708553314209, "learning_rate": 3.1665960152607037e-07, "loss": 0.4975, "mean_token_accuracy": 0.8402286767959595, "num_tokens": 28430361.0, "step": 748 }, { "epoch": 0.09528049866429207, "ewc_loss": 0.004627358168363571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.6273580665001646e-06, "grad_norm": 7.66365909576416, "learning_rate": 3.1708350996184826e-07, "loss": 0.5345, "mean_token_accuracy": 0.830319881439209, "num_tokens": 28469792.0, "step": 749 }, { "epoch": 0.09540770894288259, "ewc_loss": 0.004656228236854076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.656228156818543e-06, "grad_norm": 7.759313106536865, "learning_rate": 3.175074183976261e-07, "loss": 0.5138, "mean_token_accuracy": 0.8390259742736816, "num_tokens": 28507569.0, "step": 750 }, { "epoch": 0.0955349192214731, "ewc_loss": 0.0046670399606227875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.667039775085868e-06, "grad_norm": 7.777710914611816, "learning_rate": 3.1793132683340396e-07, "loss": 0.5393, "mean_token_accuracy": 0.8292829394340515, "num_tokens": 28542336.0, "step": 751 }, { "epoch": 0.0956621295000636, "ewc_loss": 0.004662156570702791, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.6621566980320495e-06, "grad_norm": 7.741722106933594, "learning_rate": 3.1835523526918186e-07, "loss": 0.489, "mean_token_accuracy": 0.8431620597839355, "num_tokens": 28578587.0, "step": 752 }, { "epoch": 0.09578933977865411, "ewc_loss": 0.004664214327931404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.664214429794811e-06, "grad_norm": 7.75126838684082, "learning_rate": 3.1877914370495975e-07, "loss": 0.5696, "mean_token_accuracy": 0.8212167024612427, "num_tokens": 28617505.0, "step": 753 }, { "epoch": 0.09591655005724463, "ewc_loss": 0.004669555462896824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.669555437430972e-06, "grad_norm": 7.701303482055664, "learning_rate": 3.1920305214073755e-07, "loss": 0.5754, "mean_token_accuracy": 0.8203718662261963, "num_tokens": 28658526.0, "step": 754 }, { "epoch": 0.09604376033583513, "ewc_loss": 0.0046686953864991665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.668695510190446e-06, "grad_norm": 7.744815826416016, "learning_rate": 3.1962696057651545e-07, "loss": 0.5108, "mean_token_accuracy": 0.8377974629402161, "num_tokens": 28698825.0, "step": 755 }, { "epoch": 0.09617097061442564, "ewc_loss": 0.0046952697448432446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.695269581134198e-06, "grad_norm": 7.78569221496582, "learning_rate": 3.2005086901229335e-07, "loss": 0.5534, "mean_token_accuracy": 0.8221767544746399, "num_tokens": 28733238.0, "step": 756 }, { "epoch": 0.09629818089301616, "ewc_loss": 0.0047023543156683445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.7023540901136585e-06, "grad_norm": 7.810697078704834, "learning_rate": 3.2047477744807125e-07, "loss": 0.555, "mean_token_accuracy": 0.8271586298942566, "num_tokens": 28771552.0, "step": 757 }, { "epoch": 0.09642539117160667, "ewc_loss": 0.004702143371105194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.702143542090198e-06, "grad_norm": 7.732899188995361, "learning_rate": 3.2089868588384904e-07, "loss": 0.5865, "mean_token_accuracy": 0.8134673237800598, "num_tokens": 28810827.0, "step": 758 }, { "epoch": 0.09655260145019717, "ewc_loss": 0.004691078327596188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.691078174801078e-06, "grad_norm": 7.764404296875, "learning_rate": 3.2132259431962694e-07, "loss": 0.4977, "mean_token_accuracy": 0.8419533967971802, "num_tokens": 28849017.0, "step": 759 }, { "epoch": 0.09667981172878769, "ewc_loss": 0.004717853385955095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.717853244073922e-06, "grad_norm": 7.774485111236572, "learning_rate": 3.2174650275540484e-07, "loss": 0.5483, "mean_token_accuracy": 0.8317366242408752, "num_tokens": 28887659.0, "step": 760 }, { "epoch": 0.0968070220073782, "ewc_loss": 0.004718630574643612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.718630407296587e-06, "grad_norm": 7.751585960388184, "learning_rate": 3.2217041119118274e-07, "loss": 0.515, "mean_token_accuracy": 0.8351520299911499, "num_tokens": 28928983.0, "step": 761 }, { "epoch": 0.0969342322859687, "ewc_loss": 0.004716198891401291, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.716198873211397e-06, "grad_norm": 7.787847518920898, "learning_rate": 3.2259431962696053e-07, "loss": 0.5484, "mean_token_accuracy": 0.8280816078186035, "num_tokens": 28962195.0, "step": 762 }, { "epoch": 0.09706144256455922, "ewc_loss": 0.004738430958241224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.738431016448885e-06, "grad_norm": 7.807636737823486, "learning_rate": 3.2301822806273843e-07, "loss": 0.5306, "mean_token_accuracy": 0.8295845985412598, "num_tokens": 28997351.0, "step": 763 }, { "epoch": 0.09718865284314973, "ewc_loss": 0.004739720840007067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.739720679935999e-06, "grad_norm": 7.816654682159424, "learning_rate": 3.2344213649851633e-07, "loss": 0.5576, "mean_token_accuracy": 0.8249828815460205, "num_tokens": 29040988.0, "step": 764 }, { "epoch": 0.09731586312174023, "ewc_loss": 0.004744821693748236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.744821580970893e-06, "grad_norm": 7.73134183883667, "learning_rate": 3.238660449342942e-07, "loss": 0.5195, "mean_token_accuracy": 0.8327502608299255, "num_tokens": 29081873.0, "step": 765 }, { "epoch": 0.09744307340033075, "ewc_loss": 0.00474065775051713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.740657914226176e-06, "grad_norm": 7.81379508972168, "learning_rate": 3.24289953370072e-07, "loss": 0.4899, "mean_token_accuracy": 0.8408322930335999, "num_tokens": 29115360.0, "step": 766 }, { "epoch": 0.09757028367892126, "ewc_loss": 0.004753451328724623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.753451321448665e-06, "grad_norm": 7.688974380493164, "learning_rate": 3.247138618058499e-07, "loss": 0.5356, "mean_token_accuracy": 0.8250294327735901, "num_tokens": 29160139.0, "step": 767 }, { "epoch": 0.09769749395751176, "ewc_loss": 0.004748085513710976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.748085302708205e-06, "grad_norm": 7.809523105621338, "learning_rate": 3.251377702416278e-07, "loss": 0.5652, "mean_token_accuracy": 0.8227285146713257, "num_tokens": 29202872.0, "step": 768 }, { "epoch": 0.09782470423610228, "ewc_loss": 0.004777645226567984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.777645244757878e-06, "grad_norm": 7.742593288421631, "learning_rate": 3.255616786774057e-07, "loss": 0.5125, "mean_token_accuracy": 0.8372198343276978, "num_tokens": 29246965.0, "step": 769 }, { "epoch": 0.09795191451469279, "ewc_loss": 0.00476022670045495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.7602266022295225e-06, "grad_norm": 7.729037284851074, "learning_rate": 3.259855871131835e-07, "loss": 0.4516, "mean_token_accuracy": 0.8539488315582275, "num_tokens": 29289531.0, "step": 770 }, { "epoch": 0.0980791247932833, "ewc_loss": 0.004765327554196119, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.765327503264416e-06, "grad_norm": 7.7636237144470215, "learning_rate": 3.264094955489614e-07, "loss": 0.5084, "mean_token_accuracy": 0.837566614151001, "num_tokens": 29330484.0, "step": 771 }, { "epoch": 0.0982063350718738, "ewc_loss": 0.004782980773597956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.7829807954258285e-06, "grad_norm": 7.760991096496582, "learning_rate": 3.268334039847393e-07, "loss": 0.5356, "mean_token_accuracy": 0.8279521465301514, "num_tokens": 29369595.0, "step": 772 }, { "epoch": 0.09833354535046432, "ewc_loss": 0.00478496914729476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.784968950843904e-06, "grad_norm": 7.823261260986328, "learning_rate": 3.2725731242051715e-07, "loss": 0.5241, "mean_token_accuracy": 0.8358335494995117, "num_tokens": 29411856.0, "step": 773 }, { "epoch": 0.09846075562905483, "ewc_loss": 0.00478752376511693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.787523721461184e-06, "grad_norm": 7.748598098754883, "learning_rate": 3.27681220856295e-07, "loss": 0.5068, "mean_token_accuracy": 0.8398109078407288, "num_tokens": 29454754.0, "step": 774 }, { "epoch": 0.09858796590764533, "ewc_loss": 0.004772385116666555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.772385182150174e-06, "grad_norm": 7.810673236846924, "learning_rate": 3.281051292920729e-07, "loss": 0.5017, "mean_token_accuracy": 0.8364428281784058, "num_tokens": 29489131.0, "step": 775 }, { "epoch": 0.09871517618623585, "ewc_loss": 0.0047963522374629974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.796352186531294e-06, "grad_norm": 7.811455726623535, "learning_rate": 3.285290377278508e-07, "loss": 0.5569, "mean_token_accuracy": 0.820746123790741, "num_tokens": 29528292.0, "step": 776 }, { "epoch": 0.09884238646482636, "ewc_loss": 0.00478522852063179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.78522861158126e-06, "grad_norm": 7.774411678314209, "learning_rate": 3.2895294616362864e-07, "loss": 0.4891, "mean_token_accuracy": 0.8437037467956543, "num_tokens": 29564878.0, "step": 777 }, { "epoch": 0.09896959674341686, "ewc_loss": 0.004795921966433525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.7959219955373555e-06, "grad_norm": 7.788683891296387, "learning_rate": 3.293768545994065e-07, "loss": 0.5095, "mean_token_accuracy": 0.8344966173171997, "num_tokens": 29602826.0, "step": 778 }, { "epoch": 0.09909680702200738, "ewc_loss": 0.004801655653864145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.801655450137332e-06, "grad_norm": 7.831216335296631, "learning_rate": 3.298007630351844e-07, "loss": 0.5779, "mean_token_accuracy": 0.8195402026176453, "num_tokens": 29640323.0, "step": 779 }, { "epoch": 0.09922401730059789, "ewc_loss": 0.004818785469979048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.818785328097874e-06, "grad_norm": 7.762539386749268, "learning_rate": 3.302246714709623e-07, "loss": 0.5072, "mean_token_accuracy": 0.8374193906784058, "num_tokens": 29683302.0, "step": 780 }, { "epoch": 0.09935122757918839, "ewc_loss": 0.004812995437532663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.812995484826388e-06, "grad_norm": 7.842167854309082, "learning_rate": 3.3064857990674013e-07, "loss": 0.5327, "mean_token_accuracy": 0.8335422277450562, "num_tokens": 29721085.0, "step": 781 }, { "epoch": 0.0994784378577789, "ewc_loss": 0.004829711746424437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.829711542697623e-06, "grad_norm": 7.866201877593994, "learning_rate": 3.31072488342518e-07, "loss": 0.4842, "mean_token_accuracy": 0.8437325954437256, "num_tokens": 29753810.0, "step": 782 }, { "epoch": 0.09960564813636942, "ewc_loss": 0.004835347179323435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.835347226617159e-06, "grad_norm": 7.767823219299316, "learning_rate": 3.314963967782959e-07, "loss": 0.5123, "mean_token_accuracy": 0.8363851308822632, "num_tokens": 29796488.0, "step": 783 }, { "epoch": 0.09973285841495994, "ewc_loss": 0.004834931343793869, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.834931132791098e-06, "grad_norm": 7.787467002868652, "learning_rate": 3.319203052140738e-07, "loss": 0.522, "mean_token_accuracy": 0.8337712287902832, "num_tokens": 29838181.0, "step": 784 }, { "epoch": 0.09986006869355044, "ewc_loss": 0.0048563722521066666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.856372470385395e-06, "grad_norm": 8.123170852661133, "learning_rate": 3.323442136498516e-07, "loss": 0.504, "mean_token_accuracy": 0.8391216993331909, "num_tokens": 29868893.0, "step": 785 }, { "epoch": 0.09998727897214095, "ewc_loss": 0.004904425702989101, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.904425622953568e-06, "grad_norm": 7.902522563934326, "learning_rate": 3.3276812208562947e-07, "loss": 0.5198, "mean_token_accuracy": 0.8327905535697937, "num_tokens": 29903571.0, "step": 786 }, { "epoch": 0.10011448925073146, "ewc_loss": 0.004832681268453598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.832681042898912e-06, "grad_norm": 7.867058753967285, "learning_rate": 3.3319203052140737e-07, "loss": 0.5772, "mean_token_accuracy": 0.8242541551589966, "num_tokens": 29940438.0, "step": 787 }, { "epoch": 0.10024169952932196, "ewc_loss": 0.004862749017775059, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.862748937739525e-06, "grad_norm": 7.851367473602295, "learning_rate": 3.336159389571852e-07, "loss": 0.4998, "mean_token_accuracy": 0.8443275690078735, "num_tokens": 29976538.0, "step": 788 }, { "epoch": 0.10036890980791248, "ewc_loss": 0.004874863661825657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.874863861914491e-06, "grad_norm": 7.766293525695801, "learning_rate": 3.340398473929631e-07, "loss": 0.5696, "mean_token_accuracy": 0.82017982006073, "num_tokens": 30019814.0, "step": 789 }, { "epoch": 0.100496120086503, "ewc_loss": 0.004873950965702534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.873951183981262e-06, "grad_norm": 7.92988395690918, "learning_rate": 3.3446375582874096e-07, "loss": 0.5197, "mean_token_accuracy": 0.8355565071105957, "num_tokens": 30057578.0, "step": 790 }, { "epoch": 0.1006233303650935, "ewc_loss": 0.0049029928632080555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.902992714050924e-06, "grad_norm": 7.842522621154785, "learning_rate": 3.3488766426451886e-07, "loss": 0.5919, "mean_token_accuracy": 0.8212980628013611, "num_tokens": 30094869.0, "step": 791 }, { "epoch": 0.10075054064368401, "ewc_loss": 0.004884681664407253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.88468185722013e-06, "grad_norm": 7.819901943206787, "learning_rate": 3.353115727002967e-07, "loss": 0.5531, "mean_token_accuracy": 0.8255347013473511, "num_tokens": 30136446.0, "step": 792 }, { "epoch": 0.10087775092227452, "ewc_loss": 0.004903675056993961, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.903675289824605e-06, "grad_norm": 7.842646598815918, "learning_rate": 3.357354811360746e-07, "loss": 0.5465, "mean_token_accuracy": 0.8236743211746216, "num_tokens": 30179226.0, "step": 793 }, { "epoch": 0.10100496120086502, "ewc_loss": 0.004914937075227499, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.914937107969308e-06, "grad_norm": 7.9251580238342285, "learning_rate": 3.3615938957185245e-07, "loss": 0.538, "mean_token_accuracy": 0.8289146423339844, "num_tokens": 30213890.0, "step": 794 }, { "epoch": 0.10113217147945554, "ewc_loss": 0.004931389819830656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.9313898671243805e-06, "grad_norm": 7.960380554199219, "learning_rate": 3.3658329800763035e-07, "loss": 0.5877, "mean_token_accuracy": 0.8119536638259888, "num_tokens": 30245691.0, "step": 795 }, { "epoch": 0.10125938175804605, "ewc_loss": 0.004937394522130489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.937394351145485e-06, "grad_norm": 7.873943328857422, "learning_rate": 3.370072064434082e-07, "loss": 0.5432, "mean_token_accuracy": 0.8203997611999512, "num_tokens": 30281738.0, "step": 796 }, { "epoch": 0.10138659203663657, "ewc_loss": 0.00492875138297677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.928751422994537e-06, "grad_norm": 7.832493782043457, "learning_rate": 3.374311148791861e-07, "loss": 0.5071, "mean_token_accuracy": 0.8363264799118042, "num_tokens": 30320707.0, "step": 797 }, { "epoch": 0.10151380231522707, "ewc_loss": 0.004954578820616007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.954578798788134e-06, "grad_norm": 8.025376319885254, "learning_rate": 3.3785502331496394e-07, "loss": 0.5165, "mean_token_accuracy": 0.8336578607559204, "num_tokens": 30351379.0, "step": 798 }, { "epoch": 0.10164101259381758, "ewc_loss": 0.004994045943021774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.994045866624219e-06, "grad_norm": 7.967950344085693, "learning_rate": 3.3827893175074184e-07, "loss": 0.5396, "mean_token_accuracy": 0.8267099857330322, "num_tokens": 30382395.0, "step": 799 }, { "epoch": 0.1017682228724081, "ewc_loss": 0.004968552850186825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.968552730133524e-06, "grad_norm": 7.938443183898926, "learning_rate": 3.387028401865197e-07, "loss": 0.5458, "mean_token_accuracy": 0.8304938077926636, "num_tokens": 30419758.0, "step": 800 }, { "epoch": 0.1018954331509986, "ewc_loss": 0.0049776011146605015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.977601292921463e-06, "grad_norm": 7.8909478187561035, "learning_rate": 3.391267486222976e-07, "loss": 0.5294, "mean_token_accuracy": 0.8283811807632446, "num_tokens": 30458115.0, "step": 801 }, { "epoch": 0.10202264342958911, "ewc_loss": 0.004984383471310139, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.984383394912584e-06, "grad_norm": 8.05939769744873, "learning_rate": 3.3955065705807543e-07, "loss": 0.5641, "mean_token_accuracy": 0.8189548254013062, "num_tokens": 30489679.0, "step": 802 }, { "epoch": 0.10214985370817962, "ewc_loss": 0.005026651080697775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.026651251682779e-06, "grad_norm": 7.921543598175049, "learning_rate": 3.3997456549385333e-07, "loss": 0.5, "mean_token_accuracy": 0.8412334322929382, "num_tokens": 30524874.0, "step": 803 }, { "epoch": 0.10227706398677013, "ewc_loss": 0.004990330897271633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 4.990331035514828e-06, "grad_norm": 7.892416954040527, "learning_rate": 3.403984739296312e-07, "loss": 0.5744, "mean_token_accuracy": 0.8188869953155518, "num_tokens": 30558354.0, "step": 804 }, { "epoch": 0.10240427426536064, "ewc_loss": 0.005037922877818346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.037923074269202e-06, "grad_norm": 7.900135517120361, "learning_rate": 3.408223823654091e-07, "loss": 0.5049, "mean_token_accuracy": 0.8378796577453613, "num_tokens": 30594827.0, "step": 805 }, { "epoch": 0.10253148454395115, "ewc_loss": 0.005040446296334267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.040446467319271e-06, "grad_norm": 7.962499618530273, "learning_rate": 3.412462908011869e-07, "loss": 0.5452, "mean_token_accuracy": 0.8334643244743347, "num_tokens": 30635259.0, "step": 806 }, { "epoch": 0.10265869482254165, "ewc_loss": 0.005048528779298067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.048528691986576e-06, "grad_norm": 7.864771842956543, "learning_rate": 3.4167019923696477e-07, "loss": 0.5385, "mean_token_accuracy": 0.8338198661804199, "num_tokens": 30681850.0, "step": 807 }, { "epoch": 0.10278590510113217, "ewc_loss": 0.005035542882978916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.035542926634662e-06, "grad_norm": 7.878154754638672, "learning_rate": 3.4209410767274267e-07, "loss": 0.5054, "mean_token_accuracy": 0.8432101011276245, "num_tokens": 30721302.0, "step": 808 }, { "epoch": 0.10291311537972268, "ewc_loss": 0.0050549590960145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.054959274275461e-06, "grad_norm": 7.992548942565918, "learning_rate": 3.4251801610852057e-07, "loss": 0.5803, "mean_token_accuracy": 0.8142122030258179, "num_tokens": 30761975.0, "step": 809 }, { "epoch": 0.1030403256583132, "ewc_loss": 0.0050795115530490875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.079511538497172e-06, "grad_norm": 8.020805358886719, "learning_rate": 3.429419245442984e-07, "loss": 0.5336, "mean_token_accuracy": 0.8292084336280823, "num_tokens": 30797754.0, "step": 810 }, { "epoch": 0.1031675359369037, "ewc_loss": 0.0050787595100700855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.078759386378806e-06, "grad_norm": 7.972183704376221, "learning_rate": 3.4336583298007626e-07, "loss": 0.5074, "mean_token_accuracy": 0.8392037749290466, "num_tokens": 30834183.0, "step": 811 }, { "epoch": 0.10329474621549421, "ewc_loss": 0.00506653031334281, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.066530320618767e-06, "grad_norm": 7.889223575592041, "learning_rate": 3.4378974141585416e-07, "loss": 0.5331, "mean_token_accuracy": 0.8314632177352905, "num_tokens": 30874160.0, "step": 812 }, { "epoch": 0.10342195649408473, "ewc_loss": 0.005052803084254265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.0528033170849085e-06, "grad_norm": 7.902170658111572, "learning_rate": 3.4421364985163206e-07, "loss": 0.4711, "mean_token_accuracy": 0.8480124473571777, "num_tokens": 30916827.0, "step": 813 }, { "epoch": 0.10354916677267523, "ewc_loss": 0.005083617754280567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.083617907075677e-06, "grad_norm": 7.958543300628662, "learning_rate": 3.446375582874099e-07, "loss": 0.5437, "mean_token_accuracy": 0.8291366696357727, "num_tokens": 30960063.0, "step": 814 }, { "epoch": 0.10367637705126574, "ewc_loss": 0.005085783079266548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.0857829592132475e-06, "grad_norm": 8.033742904663086, "learning_rate": 3.4506146672318775e-07, "loss": 0.4899, "mean_token_accuracy": 0.8410133123397827, "num_tokens": 30995172.0, "step": 815 }, { "epoch": 0.10380358732985626, "ewc_loss": 0.005100857466459274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.100857379147783e-06, "grad_norm": 8.035393714904785, "learning_rate": 3.4548537515896565e-07, "loss": 0.5515, "mean_token_accuracy": 0.8255429267883301, "num_tokens": 31035089.0, "step": 816 }, { "epoch": 0.10393079760844676, "ewc_loss": 0.005088798236101866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.088798388896976e-06, "grad_norm": 7.935186862945557, "learning_rate": 3.4590928359474355e-07, "loss": 0.5319, "mean_token_accuracy": 0.830965518951416, "num_tokens": 31074836.0, "step": 817 }, { "epoch": 0.10405800788703727, "ewc_loss": 0.0050774081610143185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.077408331999322e-06, "grad_norm": 7.957451820373535, "learning_rate": 3.463331920305214e-07, "loss": 0.5519, "mean_token_accuracy": 0.8243112564086914, "num_tokens": 31114700.0, "step": 818 }, { "epoch": 0.10418521816562779, "ewc_loss": 0.005085586104542017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.085586053610314e-06, "grad_norm": 8.03735065460205, "learning_rate": 3.4675710046629924e-07, "loss": 0.5002, "mean_token_accuracy": 0.8392267823219299, "num_tokens": 31148258.0, "step": 819 }, { "epoch": 0.1043124284442183, "ewc_loss": 0.005116845481097698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.116845386510249e-06, "grad_norm": 7.978488922119141, "learning_rate": 3.4718100890207714e-07, "loss": 0.5396, "mean_token_accuracy": 0.8295202851295471, "num_tokens": 31185720.0, "step": 820 }, { "epoch": 0.1044396387228088, "ewc_loss": 0.00509954709559679, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.099547252029879e-06, "grad_norm": 8.048524856567383, "learning_rate": 3.4760491733785504e-07, "loss": 0.5509, "mean_token_accuracy": 0.8222159743309021, "num_tokens": 31220353.0, "step": 821 }, { "epoch": 0.10456684900139931, "ewc_loss": 0.005127517972141504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.127517852088204e-06, "grad_norm": 8.05381965637207, "learning_rate": 3.480288257736329e-07, "loss": 0.5277, "mean_token_accuracy": 0.8294636607170105, "num_tokens": 31256477.0, "step": 822 }, { "epoch": 0.10469405927998983, "ewc_loss": 0.005130308214575052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.130308181833243e-06, "grad_norm": 7.992459297180176, "learning_rate": 3.4845273420941073e-07, "loss": 0.5261, "mean_token_accuracy": 0.833828330039978, "num_tokens": 31294365.0, "step": 823 }, { "epoch": 0.10482126955858033, "ewc_loss": 0.005124800372868776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.124800281919306e-06, "grad_norm": 8.083945274353027, "learning_rate": 3.4887664264518863e-07, "loss": 0.5211, "mean_token_accuracy": 0.8307719230651855, "num_tokens": 31329357.0, "step": 824 }, { "epoch": 0.10494847983717084, "ewc_loss": 0.005155148450285196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.155148301128065e-06, "grad_norm": 7.984908103942871, "learning_rate": 3.4930055108096653e-07, "loss": 0.526, "mean_token_accuracy": 0.8334172964096069, "num_tokens": 31372180.0, "step": 825 }, { "epoch": 0.10507569011576136, "ewc_loss": 0.005125978495925665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.125978532305453e-06, "grad_norm": 7.920028209686279, "learning_rate": 3.497244595167443e-07, "loss": 0.51, "mean_token_accuracy": 0.8372269868850708, "num_tokens": 31414866.0, "step": 826 }, { "epoch": 0.10520290039435186, "ewc_loss": 0.005144193302839994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.14419343744521e-06, "grad_norm": 8.132389068603516, "learning_rate": 3.501483679525222e-07, "loss": 0.5792, "mean_token_accuracy": 0.8173249959945679, "num_tokens": 31450583.0, "step": 827 }, { "epoch": 0.10533011067294237, "ewc_loss": 0.005191509611904621, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.191509444557596e-06, "grad_norm": 8.010838508605957, "learning_rate": 3.505722763883001e-07, "loss": 0.51, "mean_token_accuracy": 0.8352971076965332, "num_tokens": 31490371.0, "step": 828 }, { "epoch": 0.10545732095153289, "ewc_loss": 0.005145758856087923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.1457586778269615e-06, "grad_norm": 8.049134254455566, "learning_rate": 3.50996184824078e-07, "loss": 0.5526, "mean_token_accuracy": 0.8237804174423218, "num_tokens": 31525951.0, "step": 829 }, { "epoch": 0.10558453123012339, "ewc_loss": 0.00517954071983695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.179540949029615e-06, "grad_norm": 7.990433692932129, "learning_rate": 3.514200932598558e-07, "loss": 0.5919, "mean_token_accuracy": 0.8142980933189392, "num_tokens": 31570109.0, "step": 830 }, { "epoch": 0.1057117415087139, "ewc_loss": 0.005167265422642231, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.167265499039786e-06, "grad_norm": 8.047292709350586, "learning_rate": 3.518440016956337e-07, "loss": 0.5337, "mean_token_accuracy": 0.8301711082458496, "num_tokens": 31610569.0, "step": 831 }, { "epoch": 0.10583895178730442, "ewc_loss": 0.005186942871659994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.186942871659994e-06, "grad_norm": 8.014242172241211, "learning_rate": 3.522679101314116e-07, "loss": 0.5451, "mean_token_accuracy": 0.8302505016326904, "num_tokens": 31653585.0, "step": 832 }, { "epoch": 0.10596616206589493, "ewc_loss": 0.00517823314294219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.178233095648466e-06, "grad_norm": 8.01352310180664, "learning_rate": 3.526918185671895e-07, "loss": 0.5551, "mean_token_accuracy": 0.822853684425354, "num_tokens": 31696353.0, "step": 833 }, { "epoch": 0.10609337234448543, "ewc_loss": 0.005192350130528212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.192350272409385e-06, "grad_norm": 8.02003002166748, "learning_rate": 3.531157270029673e-07, "loss": 0.4861, "mean_token_accuracy": 0.8448411226272583, "num_tokens": 31730620.0, "step": 834 }, { "epoch": 0.10622058262307595, "ewc_loss": 0.00520247220993042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.202472038945416e-06, "grad_norm": 8.078874588012695, "learning_rate": 3.535396354387452e-07, "loss": 0.5018, "mean_token_accuracy": 0.837413489818573, "num_tokens": 31770538.0, "step": 835 }, { "epoch": 0.10634779290166646, "ewc_loss": 0.005219114478677511, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.219114427745808e-06, "grad_norm": 8.018421173095703, "learning_rate": 3.539635438745231e-07, "loss": 0.5443, "mean_token_accuracy": 0.8259886503219604, "num_tokens": 31808687.0, "step": 836 }, { "epoch": 0.10647500318025696, "ewc_loss": 0.005209577735513449, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.209577921050368e-06, "grad_norm": 8.002852439880371, "learning_rate": 3.54387452310301e-07, "loss": 0.5229, "mean_token_accuracy": 0.8336869478225708, "num_tokens": 31852310.0, "step": 837 }, { "epoch": 0.10660221345884748, "ewc_loss": 0.005227069370448589, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.227069323154865e-06, "grad_norm": 8.046046257019043, "learning_rate": 3.548113607460788e-07, "loss": 0.4922, "mean_token_accuracy": 0.8454318046569824, "num_tokens": 31887897.0, "step": 838 }, { "epoch": 0.10672942373743799, "ewc_loss": 0.005229796748608351, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.229796897765482e-06, "grad_norm": 8.069239616394043, "learning_rate": 3.552352691818567e-07, "loss": 0.5179, "mean_token_accuracy": 0.836644172668457, "num_tokens": 31926159.0, "step": 839 }, { "epoch": 0.10685663401602849, "ewc_loss": 0.005236487369984388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.2364871407917235e-06, "grad_norm": 8.014453887939453, "learning_rate": 3.556591776176346e-07, "loss": 0.4911, "mean_token_accuracy": 0.8434535264968872, "num_tokens": 31968432.0, "step": 840 }, { "epoch": 0.106983844294619, "ewc_loss": 0.005226930603384972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.2269306252128445e-06, "grad_norm": 8.168706893920898, "learning_rate": 3.560830860534125e-07, "loss": 0.5262, "mean_token_accuracy": 0.8341963291168213, "num_tokens": 32000949.0, "step": 841 }, { "epoch": 0.10711105457320952, "ewc_loss": 0.005266355816274881, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.266355856292648e-06, "grad_norm": 8.13648509979248, "learning_rate": 3.565069944891903e-07, "loss": 0.569, "mean_token_accuracy": 0.8194332122802734, "num_tokens": 32033862.0, "step": 842 }, { "epoch": 0.10723826485180002, "ewc_loss": 0.005237956065684557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.237955974735087e-06, "grad_norm": 8.088388442993164, "learning_rate": 3.569309029249682e-07, "loss": 0.5305, "mean_token_accuracy": 0.8326902389526367, "num_tokens": 32068623.0, "step": 843 }, { "epoch": 0.10736547513039053, "ewc_loss": 0.005263125523924828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.263125331111951e-06, "grad_norm": 8.153961181640625, "learning_rate": 3.573548113607461e-07, "loss": 0.5165, "mean_token_accuracy": 0.8363989591598511, "num_tokens": 32106658.0, "step": 844 }, { "epoch": 0.10749268540898105, "ewc_loss": 0.005296402145177126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.296401923260419e-06, "grad_norm": 8.111684799194336, "learning_rate": 3.577787197965239e-07, "loss": 0.5597, "mean_token_accuracy": 0.8214861750602722, "num_tokens": 32142031.0, "step": 845 }, { "epoch": 0.10761989568757156, "ewc_loss": 0.005275556351989508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.275556304695783e-06, "grad_norm": 8.031229972839355, "learning_rate": 3.5820262823230177e-07, "loss": 0.4902, "mean_token_accuracy": 0.8471697568893433, "num_tokens": 32182686.0, "step": 846 }, { "epoch": 0.10774710596616206, "ewc_loss": 0.0052817752584815025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.2817754294665065e-06, "grad_norm": 8.172398567199707, "learning_rate": 3.5862653666807967e-07, "loss": 0.5868, "mean_token_accuracy": 0.8145391941070557, "num_tokens": 32218027.0, "step": 847 }, { "epoch": 0.10787431624475258, "ewc_loss": 0.005340212024748325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.34021182829747e-06, "grad_norm": 8.125171661376953, "learning_rate": 3.5905044510385757e-07, "loss": 0.5411, "mean_token_accuracy": 0.8312860727310181, "num_tokens": 32257854.0, "step": 848 }, { "epoch": 0.10800152652334309, "ewc_loss": 0.005300602875649929, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.300602879287908e-06, "grad_norm": 8.059484481811523, "learning_rate": 3.594743535396354e-07, "loss": 0.524, "mean_token_accuracy": 0.833707332611084, "num_tokens": 32295019.0, "step": 849 }, { "epoch": 0.10812873680193359, "ewc_loss": 0.005319733638316393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.3197336455923505e-06, "grad_norm": 8.124988555908203, "learning_rate": 3.5989826197541326e-07, "loss": 0.574, "mean_token_accuracy": 0.8185732960700989, "num_tokens": 32335652.0, "step": 850 }, { "epoch": 0.1082559470805241, "ewc_loss": 0.005340492352843285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.340492407412967e-06, "grad_norm": 8.08979320526123, "learning_rate": 3.6032217041119116e-07, "loss": 0.5211, "mean_token_accuracy": 0.8311577439308167, "num_tokens": 32377249.0, "step": 851 }, { "epoch": 0.10838315735911462, "ewc_loss": 0.005333075299859047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.3330754781200085e-06, "grad_norm": 8.050907135009766, "learning_rate": 3.6074607884696906e-07, "loss": 0.4921, "mean_token_accuracy": 0.8448087573051453, "num_tokens": 32422467.0, "step": 852 }, { "epoch": 0.10851036763770512, "ewc_loss": 0.005329424981027842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.329424766387092e-06, "grad_norm": 8.063366889953613, "learning_rate": 3.611699872827469e-07, "loss": 0.484, "mean_token_accuracy": 0.8439508676528931, "num_tokens": 32462120.0, "step": 853 }, { "epoch": 0.10863757791629564, "ewc_loss": 0.005342143587768078, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.3421435950440355e-06, "grad_norm": 8.193467140197754, "learning_rate": 3.6159389571852475e-07, "loss": 0.6038, "mean_token_accuracy": 0.8112668991088867, "num_tokens": 32498484.0, "step": 854 }, { "epoch": 0.10876478819488615, "ewc_loss": 0.005374355241656303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.374355168896727e-06, "grad_norm": 8.097125053405762, "learning_rate": 3.6201780415430265e-07, "loss": 0.5384, "mean_token_accuracy": 0.829828143119812, "num_tokens": 32541536.0, "step": 855 }, { "epoch": 0.10889199847347665, "ewc_loss": 0.005340449512004852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.340449661161983e-06, "grad_norm": 8.126521110534668, "learning_rate": 3.6244171259008055e-07, "loss": 0.5628, "mean_token_accuracy": 0.8255841135978699, "num_tokens": 32579750.0, "step": 856 }, { "epoch": 0.10901920875206716, "ewc_loss": 0.005366561934351921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.366562163544586e-06, "grad_norm": 8.121728897094727, "learning_rate": 3.628656210258584e-07, "loss": 0.5698, "mean_token_accuracy": 0.8191002011299133, "num_tokens": 32617518.0, "step": 857 }, { "epoch": 0.10914641903065768, "ewc_loss": 0.005368463229387999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.368463007471291e-06, "grad_norm": 8.104976654052734, "learning_rate": 3.6328952946163624e-07, "loss": 0.5113, "mean_token_accuracy": 0.8364934921264648, "num_tokens": 32653931.0, "step": 858 }, { "epoch": 0.1092736293092482, "ewc_loss": 0.005366330035030842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.366330242395634e-06, "grad_norm": 8.106586456298828, "learning_rate": 3.6371343789741414e-07, "loss": 0.4993, "mean_token_accuracy": 0.8400260210037231, "num_tokens": 32693797.0, "step": 859 }, { "epoch": 0.1094008395878387, "ewc_loss": 0.005375319626182318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.3753196880279575e-06, "grad_norm": 8.10604476928711, "learning_rate": 3.6413734633319204e-07, "loss": 0.5479, "mean_token_accuracy": 0.8276596069335938, "num_tokens": 32732432.0, "step": 860 }, { "epoch": 0.10952804986642921, "ewc_loss": 0.005387688521295786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.387688361224718e-06, "grad_norm": 8.199151992797852, "learning_rate": 3.645612547689699e-07, "loss": 0.5009, "mean_token_accuracy": 0.8356739282608032, "num_tokens": 32769225.0, "step": 861 }, { "epoch": 0.10965526014501972, "ewc_loss": 0.0053972238674759865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.397223958425457e-06, "grad_norm": 8.100147247314453, "learning_rate": 3.6498516320474773e-07, "loss": 0.494, "mean_token_accuracy": 0.8433645963668823, "num_tokens": 32812027.0, "step": 862 }, { "epoch": 0.10978247042361022, "ewc_loss": 0.005368699319660664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.368699476093752e-06, "grad_norm": 8.093453407287598, "learning_rate": 3.6540907164052563e-07, "loss": 0.4739, "mean_token_accuracy": 0.8496701121330261, "num_tokens": 32849334.0, "step": 863 }, { "epoch": 0.10990968070220074, "ewc_loss": 0.005410437937825918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.4104380069475155e-06, "grad_norm": 8.218626022338867, "learning_rate": 3.658329800763035e-07, "loss": 0.5673, "mean_token_accuracy": 0.8186012506484985, "num_tokens": 32888422.0, "step": 864 }, { "epoch": 0.11003689098079125, "ewc_loss": 0.0054132831282913685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.413282906374661e-06, "grad_norm": 8.181258201599121, "learning_rate": 3.662568885120814e-07, "loss": 0.5266, "mean_token_accuracy": 0.8314107656478882, "num_tokens": 32922672.0, "step": 865 }, { "epoch": 0.11016410125938175, "ewc_loss": 0.005394176580011845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.3941766964271665e-06, "grad_norm": 8.2028226852417, "learning_rate": 3.666807969478592e-07, "loss": 0.556, "mean_token_accuracy": 0.8206788301467896, "num_tokens": 32961694.0, "step": 866 }, { "epoch": 0.11029131153797227, "ewc_loss": 0.005413053557276726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.4130537137098145e-06, "grad_norm": 8.153275489807129, "learning_rate": 3.671047053836371e-07, "loss": 0.5452, "mean_token_accuracy": 0.8242408037185669, "num_tokens": 32998595.0, "step": 867 }, { "epoch": 0.11041852181656278, "ewc_loss": 0.005421964917331934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.421964942797786e-06, "grad_norm": 8.194133758544922, "learning_rate": 3.6752861381941497e-07, "loss": 0.5046, "mean_token_accuracy": 0.8371569514274597, "num_tokens": 33035884.0, "step": 868 }, { "epoch": 0.11054573209515328, "ewc_loss": 0.005424266681075096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.424266873887973e-06, "grad_norm": 8.207086563110352, "learning_rate": 3.6795252225519287e-07, "loss": 0.5671, "mean_token_accuracy": 0.8233587741851807, "num_tokens": 33073335.0, "step": 869 }, { "epoch": 0.1106729423737438, "ewc_loss": 0.005434442777186632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.434442755358759e-06, "grad_norm": 8.120891571044922, "learning_rate": 3.6837643069097077e-07, "loss": 0.4659, "mean_token_accuracy": 0.8492449522018433, "num_tokens": 33111443.0, "step": 870 }, { "epoch": 0.11080015265233431, "ewc_loss": 0.005411633290350437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.411633082985645e-06, "grad_norm": 8.155908584594727, "learning_rate": 3.688003391267486e-07, "loss": 0.5258, "mean_token_accuracy": 0.8362253904342651, "num_tokens": 33156362.0, "step": 871 }, { "epoch": 0.11092736293092482, "ewc_loss": 0.005450396798551083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.450396656669909e-06, "grad_norm": 8.121647834777832, "learning_rate": 3.6922424756252646e-07, "loss": 0.4947, "mean_token_accuracy": 0.8410072922706604, "num_tokens": 33204968.0, "step": 872 }, { "epoch": 0.11105457320951533, "ewc_loss": 0.005439270753413439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.439270807983121e-06, "grad_norm": 8.26202392578125, "learning_rate": 3.6964815599830436e-07, "loss": 0.5432, "mean_token_accuracy": 0.824428915977478, "num_tokens": 33239265.0, "step": 873 }, { "epoch": 0.11118178348810584, "ewc_loss": 0.005484491121023893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.484491339302622e-06, "grad_norm": 8.234797477722168, "learning_rate": 3.7007206443408226e-07, "loss": 0.4749, "mean_token_accuracy": 0.8488744497299194, "num_tokens": 33275897.0, "step": 874 }, { "epoch": 0.11130899376669635, "ewc_loss": 0.005447219125926495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.447219336929265e-06, "grad_norm": 8.193602561950684, "learning_rate": 3.704959728698601e-07, "loss": 0.5996, "mean_token_accuracy": 0.8120070695877075, "num_tokens": 33313966.0, "step": 875 }, { "epoch": 0.11143620404528685, "ewc_loss": 0.005458296276628971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.458296072902158e-06, "grad_norm": 8.205317497253418, "learning_rate": 3.7091988130563795e-07, "loss": 0.5438, "mean_token_accuracy": 0.8278020620346069, "num_tokens": 33352909.0, "step": 876 }, { "epoch": 0.11156341432387737, "ewc_loss": 0.0054740081541240215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.474008048622636e-06, "grad_norm": 8.16148853302002, "learning_rate": 3.7134378974141585e-07, "loss": 0.5258, "mean_token_accuracy": 0.8339401483535767, "num_tokens": 33400120.0, "step": 877 }, { "epoch": 0.11169062460246788, "ewc_loss": 0.005465826019644737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.465826234285487e-06, "grad_norm": 8.253213882446289, "learning_rate": 3.7176769817719375e-07, "loss": 0.5547, "mean_token_accuracy": 0.8247142434120178, "num_tokens": 33436703.0, "step": 878 }, { "epoch": 0.11181783488105838, "ewc_loss": 0.005504735745489597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.504735781869385e-06, "grad_norm": 8.249553680419922, "learning_rate": 3.7219160661297154e-07, "loss": 0.4631, "mean_token_accuracy": 0.8528059720993042, "num_tokens": 33473515.0, "step": 879 }, { "epoch": 0.1119450451596489, "ewc_loss": 0.005483102519065142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.4831025408930145e-06, "grad_norm": 8.237787246704102, "learning_rate": 3.7261551504874944e-07, "loss": 0.5257, "mean_token_accuracy": 0.8350234627723694, "num_tokens": 33507659.0, "step": 880 }, { "epoch": 0.11207225543823941, "ewc_loss": 0.005490117706358433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.490117473527789e-06, "grad_norm": 8.198275566101074, "learning_rate": 3.7303942348452734e-07, "loss": 0.4713, "mean_token_accuracy": 0.8472206592559814, "num_tokens": 33542430.0, "step": 881 }, { "epoch": 0.11219946571682991, "ewc_loss": 0.005504085682332516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.504085493157618e-06, "grad_norm": 8.17248249053955, "learning_rate": 3.7346333192030524e-07, "loss": 0.4774, "mean_token_accuracy": 0.8508889675140381, "num_tokens": 33582579.0, "step": 882 }, { "epoch": 0.11232667599542043, "ewc_loss": 0.0055028595961630344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.502859494299628e-06, "grad_norm": 8.186339378356934, "learning_rate": 3.7388724035608303e-07, "loss": 0.5015, "mean_token_accuracy": 0.8431243300437927, "num_tokens": 33625928.0, "step": 883 }, { "epoch": 0.11245388627401094, "ewc_loss": 0.005512323696166277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.512323696166277e-06, "grad_norm": 8.21218490600586, "learning_rate": 3.7431114879186093e-07, "loss": 0.5438, "mean_token_accuracy": 0.8283449411392212, "num_tokens": 33661325.0, "step": 884 }, { "epoch": 0.11258109655260146, "ewc_loss": 0.005514407530426979, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.514407348528039e-06, "grad_norm": 8.234174728393555, "learning_rate": 3.7473505722763883e-07, "loss": 0.5654, "mean_token_accuracy": 0.8205329775810242, "num_tokens": 33696472.0, "step": 885 }, { "epoch": 0.11270830683119196, "ewc_loss": 0.0055256434716284275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.525643700821092e-06, "grad_norm": 8.228389739990234, "learning_rate": 3.7515896566341673e-07, "loss": 0.5207, "mean_token_accuracy": 0.8314499855041504, "num_tokens": 33732637.0, "step": 886 }, { "epoch": 0.11283551710978247, "ewc_loss": 0.0055281370878219604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.528137080546003e-06, "grad_norm": 8.181666374206543, "learning_rate": 3.755828740991945e-07, "loss": 0.5443, "mean_token_accuracy": 0.8273501396179199, "num_tokens": 33775106.0, "step": 887 }, { "epoch": 0.11296272738837299, "ewc_loss": 0.005525763612240553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.525763754121726e-06, "grad_norm": 8.216194152832031, "learning_rate": 3.760067825349724e-07, "loss": 0.5064, "mean_token_accuracy": 0.840149462223053, "num_tokens": 33811297.0, "step": 888 }, { "epoch": 0.11308993766696349, "ewc_loss": 0.0055463435128331184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.546343345486093e-06, "grad_norm": 8.177497863769531, "learning_rate": 3.764306909707503e-07, "loss": 0.4682, "mean_token_accuracy": 0.8481780290603638, "num_tokens": 33854038.0, "step": 889 }, { "epoch": 0.113217147945554, "ewc_loss": 0.005536133423447609, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.53613335796399e-06, "grad_norm": 8.269293785095215, "learning_rate": 3.768545994065282e-07, "loss": 0.5245, "mean_token_accuracy": 0.8326917886734009, "num_tokens": 33884929.0, "step": 890 }, { "epoch": 0.11334435822414451, "ewc_loss": 0.0055757369846105576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.575736850005342e-06, "grad_norm": 8.260052680969238, "learning_rate": 3.77278507842306e-07, "loss": 0.5331, "mean_token_accuracy": 0.8323830366134644, "num_tokens": 33926065.0, "step": 891 }, { "epoch": 0.11347156850273502, "ewc_loss": 0.005560582038015127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.560581939789699e-06, "grad_norm": 8.178367614746094, "learning_rate": 3.777024162780839e-07, "loss": 0.4825, "mean_token_accuracy": 0.8422343730926514, "num_tokens": 33965557.0, "step": 892 }, { "epoch": 0.11359877878132553, "ewc_loss": 0.005554266273975372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.554266408580588e-06, "grad_norm": 8.266192436218262, "learning_rate": 3.781263247138618e-07, "loss": 0.4921, "mean_token_accuracy": 0.8420578241348267, "num_tokens": 34004132.0, "step": 893 }, { "epoch": 0.11372598905991604, "ewc_loss": 0.005578024312853813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.578024229180301e-06, "grad_norm": 8.280313491821289, "learning_rate": 3.785502331496397e-07, "loss": 0.5202, "mean_token_accuracy": 0.8264427185058594, "num_tokens": 34039781.0, "step": 894 }, { "epoch": 0.11385319933850654, "ewc_loss": 0.005577263422310352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.5772634368622676e-06, "grad_norm": 8.244020462036133, "learning_rate": 3.789741415854175e-07, "loss": 0.5474, "mean_token_accuracy": 0.8256592750549316, "num_tokens": 34080487.0, "step": 895 }, { "epoch": 0.11398040961709706, "ewc_loss": 0.005579221528023481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.579221578955185e-06, "grad_norm": 8.28764533996582, "learning_rate": 3.793980500211954e-07, "loss": 0.5229, "mean_token_accuracy": 0.833740770816803, "num_tokens": 34119913.0, "step": 896 }, { "epoch": 0.11410761989568757, "ewc_loss": 0.0055984314531087875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.5984314712986816e-06, "grad_norm": 8.271764755249023, "learning_rate": 3.798219584569733e-07, "loss": 0.4576, "mean_token_accuracy": 0.8525469303131104, "num_tokens": 34155811.0, "step": 897 }, { "epoch": 0.11423483017427809, "ewc_loss": 0.005586050916463137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.586050974670798e-06, "grad_norm": 8.281390190124512, "learning_rate": 3.8024586689275115e-07, "loss": 0.5792, "mean_token_accuracy": 0.8208107948303223, "num_tokens": 34195022.0, "step": 898 }, { "epoch": 0.11436204045286859, "ewc_loss": 0.005621893331408501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.621893251372967e-06, "grad_norm": 8.321144104003906, "learning_rate": 3.80669775328529e-07, "loss": 0.5118, "mean_token_accuracy": 0.8332642316818237, "num_tokens": 34229182.0, "step": 899 }, { "epoch": 0.1144892507314591, "ewc_loss": 0.005621248856186867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.621248874376761e-06, "grad_norm": 8.227524757385254, "learning_rate": 3.810936837643069e-07, "loss": 0.4716, "mean_token_accuracy": 0.8468340039253235, "num_tokens": 34266931.0, "step": 900 }, { "epoch": 0.11461646101004962, "ewc_loss": 0.005616511218249798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.616511316475226e-06, "grad_norm": 8.345601081848145, "learning_rate": 3.815175922000848e-07, "loss": 0.465, "mean_token_accuracy": 0.8475489616394043, "num_tokens": 34301705.0, "step": 901 }, { "epoch": 0.11474367128864012, "ewc_loss": 0.005661678966134787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.661679097102024e-06, "grad_norm": 8.297577857971191, "learning_rate": 3.8194150063586264e-07, "loss": 0.5485, "mean_token_accuracy": 0.8299655914306641, "num_tokens": 34340975.0, "step": 902 }, { "epoch": 0.11487088156723063, "ewc_loss": 0.005633147899061441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.633147793560056e-06, "grad_norm": 8.275078773498535, "learning_rate": 3.823654090716405e-07, "loss": 0.5152, "mean_token_accuracy": 0.8332104086875916, "num_tokens": 34380024.0, "step": 903 }, { "epoch": 0.11499809184582115, "ewc_loss": 0.005650771781802177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.650771981891012e-06, "grad_norm": 8.304278373718262, "learning_rate": 3.827893175074184e-07, "loss": 0.5363, "mean_token_accuracy": 0.8363749384880066, "num_tokens": 34419654.0, "step": 904 }, { "epoch": 0.11512530212441165, "ewc_loss": 0.0056572360917925835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.657236215483863e-06, "grad_norm": 8.311320304870605, "learning_rate": 3.832132259431963e-07, "loss": 0.4675, "mean_token_accuracy": 0.8502041101455688, "num_tokens": 34453034.0, "step": 905 }, { "epoch": 0.11525251240300216, "ewc_loss": 0.005659148562699556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.6591484280943405e-06, "grad_norm": 8.310887336730957, "learning_rate": 3.8363713437897413e-07, "loss": 0.5739, "mean_token_accuracy": 0.8177034854888916, "num_tokens": 34493694.0, "step": 906 }, { "epoch": 0.11537972268159268, "ewc_loss": 0.0056684291921556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.668429366778582e-06, "grad_norm": 8.289931297302246, "learning_rate": 3.8406104281475197e-07, "loss": 0.566, "mean_token_accuracy": 0.8226586580276489, "num_tokens": 34531636.0, "step": 907 }, { "epoch": 0.11550693296018319, "ewc_loss": 0.005663242191076279, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.6632420637470204e-06, "grad_norm": 8.296758651733398, "learning_rate": 3.8448495125052987e-07, "loss": 0.5501, "mean_token_accuracy": 0.822804868221283, "num_tokens": 34570851.0, "step": 908 }, { "epoch": 0.11563414323877369, "ewc_loss": 0.00567933265119791, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.679332844010787e-06, "grad_norm": 8.31814193725586, "learning_rate": 3.8490885968630777e-07, "loss": 0.502, "mean_token_accuracy": 0.8410999774932861, "num_tokens": 34613314.0, "step": 909 }, { "epoch": 0.1157613535173642, "ewc_loss": 0.005684397183358669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.68439736525761e-06, "grad_norm": 8.25268840789795, "learning_rate": 3.853327681220856e-07, "loss": 0.5167, "mean_token_accuracy": 0.8336528539657593, "num_tokens": 34656701.0, "step": 910 }, { "epoch": 0.11588856379595472, "ewc_loss": 0.005682060960680246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.682060873368755e-06, "grad_norm": 8.40130615234375, "learning_rate": 3.8575667655786346e-07, "loss": 0.4926, "mean_token_accuracy": 0.8422941565513611, "num_tokens": 34688595.0, "step": 911 }, { "epoch": 0.11601577407454522, "ewc_loss": 0.005721176974475384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.721176876249956e-06, "grad_norm": 8.30599594116211, "learning_rate": 3.8618058499364136e-07, "loss": 0.5108, "mean_token_accuracy": 0.8374982476234436, "num_tokens": 34726609.0, "step": 912 }, { "epoch": 0.11614298435313573, "ewc_loss": 0.005687309429049492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.6873095672926866e-06, "grad_norm": 8.314939498901367, "learning_rate": 3.8660449342941926e-07, "loss": 0.5679, "mean_token_accuracy": 0.8229053020477295, "num_tokens": 34769213.0, "step": 913 }, { "epoch": 0.11627019463172625, "ewc_loss": 0.005712684243917465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.712684469472151e-06, "grad_norm": 8.31904125213623, "learning_rate": 3.870284018651971e-07, "loss": 0.5398, "mean_token_accuracy": 0.82972651720047, "num_tokens": 34806722.0, "step": 914 }, { "epoch": 0.11639740491031675, "ewc_loss": 0.005714477971196175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.7144779930240475e-06, "grad_norm": 8.319095611572266, "learning_rate": 3.8745231030097495e-07, "loss": 0.496, "mean_token_accuracy": 0.8435949087142944, "num_tokens": 34846408.0, "step": 915 }, { "epoch": 0.11652461518890726, "ewc_loss": 0.005719688721001148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.7196884881705046e-06, "grad_norm": 8.337940216064453, "learning_rate": 3.8787621873675285e-07, "loss": 0.4827, "mean_token_accuracy": 0.8438242077827454, "num_tokens": 34883791.0, "step": 916 }, { "epoch": 0.11665182546749778, "ewc_loss": 0.005740908905863762, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.7409088185522705e-06, "grad_norm": 8.362153053283691, "learning_rate": 3.883001271725307e-07, "loss": 0.4787, "mean_token_accuracy": 0.8463245630264282, "num_tokens": 34922768.0, "step": 917 }, { "epoch": 0.11677903574608828, "ewc_loss": 0.005747443065047264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.747443083237158e-06, "grad_norm": 8.38056755065918, "learning_rate": 3.887240356083086e-07, "loss": 0.459, "mean_token_accuracy": 0.851388692855835, "num_tokens": 34962184.0, "step": 918 }, { "epoch": 0.11690624602467879, "ewc_loss": 0.005754532292485237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.754532139690127e-06, "grad_norm": 8.508649826049805, "learning_rate": 3.8914794404408644e-07, "loss": 0.5035, "mean_token_accuracy": 0.8380805253982544, "num_tokens": 34991455.0, "step": 919 }, { "epoch": 0.1170334563032693, "ewc_loss": 0.005777997896075249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.7779980124905705e-06, "grad_norm": 8.384074211120605, "learning_rate": 3.8957185247986434e-07, "loss": 0.5325, "mean_token_accuracy": 0.8287510871887207, "num_tokens": 35025044.0, "step": 920 }, { "epoch": 0.11716066658185982, "ewc_loss": 0.00574966287240386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.749662705056835e-06, "grad_norm": 8.351861953735352, "learning_rate": 3.899957609156422e-07, "loss": 0.5056, "mean_token_accuracy": 0.8370448350906372, "num_tokens": 35064564.0, "step": 921 }, { "epoch": 0.11728787686045032, "ewc_loss": 0.005774179007858038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.774179044237826e-06, "grad_norm": 8.354227066040039, "learning_rate": 3.904196693514201e-07, "loss": 0.5231, "mean_token_accuracy": 0.8343166708946228, "num_tokens": 35109786.0, "step": 922 }, { "epoch": 0.11741508713904084, "ewc_loss": 0.005779598373919725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.77959826841834e-06, "grad_norm": 8.361978530883789, "learning_rate": 3.9084357778719793e-07, "loss": 0.5865, "mean_token_accuracy": 0.8148424029350281, "num_tokens": 35148351.0, "step": 923 }, { "epoch": 0.11754229741763135, "ewc_loss": 0.005787339992821217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.787339887319831e-06, "grad_norm": 8.384697914123535, "learning_rate": 3.9126748622297583e-07, "loss": 0.5504, "mean_token_accuracy": 0.8268666863441467, "num_tokens": 35183464.0, "step": 924 }, { "epoch": 0.11766950769622185, "ewc_loss": 0.00580988172441721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.809881713503273e-06, "grad_norm": 8.408880233764648, "learning_rate": 3.916913946587537e-07, "loss": 0.5324, "mean_token_accuracy": 0.828195333480835, "num_tokens": 35218561.0, "step": 925 }, { "epoch": 0.11779671797481236, "ewc_loss": 0.005811823066323996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.8118230299442075e-06, "grad_norm": 8.351489067077637, "learning_rate": 3.921153030945316e-07, "loss": 0.5474, "mean_token_accuracy": 0.8257901668548584, "num_tokens": 35257740.0, "step": 926 }, { "epoch": 0.11792392825340288, "ewc_loss": 0.005820086225867271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.8200862440571655e-06, "grad_norm": 8.39977741241455, "learning_rate": 3.925392115303094e-07, "loss": 0.5036, "mean_token_accuracy": 0.8395248055458069, "num_tokens": 35291632.0, "step": 927 }, { "epoch": 0.11805113853199338, "ewc_loss": 0.005832292605191469, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.83229257244966e-06, "grad_norm": 8.357196807861328, "learning_rate": 3.929631199660873e-07, "loss": 0.5079, "mean_token_accuracy": 0.8346760869026184, "num_tokens": 35327686.0, "step": 928 }, { "epoch": 0.1181783488105839, "ewc_loss": 0.005830888636410236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.830888767377473e-06, "grad_norm": 8.369598388671875, "learning_rate": 3.9338702840186517e-07, "loss": 0.5093, "mean_token_accuracy": 0.8346187472343445, "num_tokens": 35364480.0, "step": 929 }, { "epoch": 0.11830555908917441, "ewc_loss": 0.005864213686436415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.864213562745135e-06, "grad_norm": 8.376585960388184, "learning_rate": 3.9381093683764307e-07, "loss": 0.515, "mean_token_accuracy": 0.8372774720191956, "num_tokens": 35406017.0, "step": 930 }, { "epoch": 0.11843276936776491, "ewc_loss": 0.005860316567122936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.860316377948038e-06, "grad_norm": 8.361490249633789, "learning_rate": 3.942348452734209e-07, "loss": 0.5741, "mean_token_accuracy": 0.8181331157684326, "num_tokens": 35448447.0, "step": 931 }, { "epoch": 0.11855997964635542, "ewc_loss": 0.005866023246198893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.866023457201663e-06, "grad_norm": 8.399094581604004, "learning_rate": 3.946587537091988e-07, "loss": 0.5227, "mean_token_accuracy": 0.8317264914512634, "num_tokens": 35488120.0, "step": 932 }, { "epoch": 0.11868718992494594, "ewc_loss": 0.005885713268071413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.8857131080003455e-06, "grad_norm": 8.375649452209473, "learning_rate": 3.9508266214497666e-07, "loss": 0.4959, "mean_token_accuracy": 0.8406159281730652, "num_tokens": 35531759.0, "step": 933 }, { "epoch": 0.11881440020353645, "ewc_loss": 0.005872558802366257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.872558631381253e-06, "grad_norm": 8.403239250183105, "learning_rate": 3.9550657058075456e-07, "loss": 0.4806, "mean_token_accuracy": 0.8478529453277588, "num_tokens": 35571476.0, "step": 934 }, { "epoch": 0.11894161048212695, "ewc_loss": 0.005871917586773634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.871917437616503e-06, "grad_norm": 8.431044578552246, "learning_rate": 3.959304790165324e-07, "loss": 0.5213, "mean_token_accuracy": 0.8352728486061096, "num_tokens": 35608440.0, "step": 935 }, { "epoch": 0.11906882076071747, "ewc_loss": 0.005885286256670952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.885286100237863e-06, "grad_norm": 8.491150856018066, "learning_rate": 3.9635438745231025e-07, "loss": 0.517, "mean_token_accuracy": 0.8368033766746521, "num_tokens": 35641861.0, "step": 936 }, { "epoch": 0.11919603103930798, "ewc_loss": 0.005889695603400469, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.889695785299409e-06, "grad_norm": 8.449736595153809, "learning_rate": 3.9677829588808815e-07, "loss": 0.539, "mean_token_accuracy": 0.8278950452804565, "num_tokens": 35684626.0, "step": 937 }, { "epoch": 0.11932324131789848, "ewc_loss": 0.005880503449589014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.88050352234859e-06, "grad_norm": 8.433516502380371, "learning_rate": 3.9720220432386605e-07, "loss": 0.5444, "mean_token_accuracy": 0.8256617784500122, "num_tokens": 35718125.0, "step": 938 }, { "epoch": 0.119450451596489, "ewc_loss": 0.005902968812733889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.902968950977083e-06, "grad_norm": 8.494565963745117, "learning_rate": 3.976261127596439e-07, "loss": 0.4892, "mean_token_accuracy": 0.8457925319671631, "num_tokens": 35752818.0, "step": 939 }, { "epoch": 0.11957766187507951, "ewc_loss": 0.005904915276914835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.904915269638877e-06, "grad_norm": 8.423140525817871, "learning_rate": 3.9805002119542174e-07, "loss": 0.4684, "mean_token_accuracy": 0.8493688106536865, "num_tokens": 35787097.0, "step": 940 }, { "epoch": 0.11970487215367001, "ewc_loss": 0.005909413564950228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.909413630433846e-06, "grad_norm": 8.497889518737793, "learning_rate": 3.9847392963119964e-07, "loss": 0.5667, "mean_token_accuracy": 0.8228686451911926, "num_tokens": 35825887.0, "step": 941 }, { "epoch": 0.11983208243226053, "ewc_loss": 0.005938577000051737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.938577032793546e-06, "grad_norm": 8.459839820861816, "learning_rate": 3.9889783806697754e-07, "loss": 0.4938, "mean_token_accuracy": 0.845323920249939, "num_tokens": 35863592.0, "step": 942 }, { "epoch": 0.11995929271085104, "ewc_loss": 0.005915387999266386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.915388101129793e-06, "grad_norm": 8.491740226745605, "learning_rate": 3.993217465027554e-07, "loss": 0.5698, "mean_token_accuracy": 0.8226594924926758, "num_tokens": 35904483.0, "step": 943 }, { "epoch": 0.12008650298944154, "ewc_loss": 0.0059416452422738075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.941645213169977e-06, "grad_norm": 8.493232727050781, "learning_rate": 3.9974565493853323e-07, "loss": 0.4564, "mean_token_accuracy": 0.85418701171875, "num_tokens": 35938662.0, "step": 944 }, { "epoch": 0.12021371326803205, "ewc_loss": 0.005937328562140465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.937328751315363e-06, "grad_norm": 8.443857192993164, "learning_rate": 4.0016956337431113e-07, "loss": 0.52, "mean_token_accuracy": 0.8339165449142456, "num_tokens": 35975176.0, "step": 945 }, { "epoch": 0.12034092354662257, "ewc_loss": 0.005938875488936901, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.938875347055728e-06, "grad_norm": 8.459054946899414, "learning_rate": 4.0059347181008903e-07, "loss": 0.5067, "mean_token_accuracy": 0.836523175239563, "num_tokens": 36008893.0, "step": 946 }, { "epoch": 0.12046813382521308, "ewc_loss": 0.0059565347619354725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.956534550932702e-06, "grad_norm": 8.425703048706055, "learning_rate": 4.010173802458669e-07, "loss": 0.4662, "mean_token_accuracy": 0.8520858287811279, "num_tokens": 36049882.0, "step": 947 }, { "epoch": 0.12059534410380358, "ewc_loss": 0.005945817567408085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.94581752011436e-06, "grad_norm": 8.437342643737793, "learning_rate": 4.014412886816447e-07, "loss": 0.5798, "mean_token_accuracy": 0.8196514844894409, "num_tokens": 36090615.0, "step": 948 }, { "epoch": 0.1207225543823941, "ewc_loss": 0.005977225955575705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.977226010145387e-06, "grad_norm": 8.462921142578125, "learning_rate": 4.018651971174226e-07, "loss": 0.5102, "mean_token_accuracy": 0.8355296850204468, "num_tokens": 36128159.0, "step": 949 }, { "epoch": 0.12084976466098461, "ewc_loss": 0.005974020343273878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.974020496068988e-06, "grad_norm": 8.434924125671387, "learning_rate": 4.022891055532005e-07, "loss": 0.5232, "mean_token_accuracy": 0.8333213925361633, "num_tokens": 36172851.0, "step": 950 }, { "epoch": 0.12097697493957511, "ewc_loss": 0.005977297201752663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.977297405479476e-06, "grad_norm": 8.458919525146484, "learning_rate": 4.0271301398897837e-07, "loss": 0.5743, "mean_token_accuracy": 0.821308970451355, "num_tokens": 36214282.0, "step": 951 }, { "epoch": 0.12110418521816563, "ewc_loss": 0.005994390696287155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.994390903651947e-06, "grad_norm": 8.534249305725098, "learning_rate": 4.031369224247562e-07, "loss": 0.495, "mean_token_accuracy": 0.8371138572692871, "num_tokens": 36248619.0, "step": 952 }, { "epoch": 0.12123139549675614, "ewc_loss": 0.005997940897941589, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.997941116220318e-06, "grad_norm": 8.405247688293457, "learning_rate": 4.035608308605341e-07, "loss": 0.5322, "mean_token_accuracy": 0.8294354677200317, "num_tokens": 36289070.0, "step": 953 }, { "epoch": 0.12135860577534664, "ewc_loss": 0.005979105830192566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 5.979105935693951e-06, "grad_norm": 8.478914260864258, "learning_rate": 4.03984739296312e-07, "loss": 0.531, "mean_token_accuracy": 0.8336080312728882, "num_tokens": 36330883.0, "step": 954 }, { "epoch": 0.12148581605393716, "ewc_loss": 0.00601942278444767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.0194229263288435e-06, "grad_norm": 8.49277114868164, "learning_rate": 4.044086477320898e-07, "loss": 0.511, "mean_token_accuracy": 0.8374305963516235, "num_tokens": 36371507.0, "step": 955 }, { "epoch": 0.12161302633252767, "ewc_loss": 0.006014145445078611, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.014145583321806e-06, "grad_norm": 8.536867141723633, "learning_rate": 4.048325561678677e-07, "loss": 0.534, "mean_token_accuracy": 0.829365611076355, "num_tokens": 36407262.0, "step": 956 }, { "epoch": 0.12174023661111817, "ewc_loss": 0.006023249588906765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.023249625286553e-06, "grad_norm": 8.49036693572998, "learning_rate": 4.052564646036456e-07, "loss": 0.5299, "mean_token_accuracy": 0.8329101800918579, "num_tokens": 36448089.0, "step": 957 }, { "epoch": 0.12186744688970869, "ewc_loss": 0.006022205110639334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.022205070621567e-06, "grad_norm": 8.524613380432129, "learning_rate": 4.056803730394235e-07, "loss": 0.5427, "mean_token_accuracy": 0.8251646757125854, "num_tokens": 36484022.0, "step": 958 }, { "epoch": 0.1219946571682992, "ewc_loss": 0.006043111439794302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.043111625331221e-06, "grad_norm": 8.495274543762207, "learning_rate": 4.061042814752013e-07, "loss": 0.5188, "mean_token_accuracy": 0.8357226252555847, "num_tokens": 36519385.0, "step": 959 }, { "epoch": 0.12212186744688971, "ewc_loss": 0.0060304151847958565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.030415079294471e-06, "grad_norm": 8.577933311462402, "learning_rate": 4.065281899109792e-07, "loss": 0.5227, "mean_token_accuracy": 0.8321970701217651, "num_tokens": 36551943.0, "step": 960 }, { "epoch": 0.12224907772548022, "ewc_loss": 0.006063718348741531, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.063718501536641e-06, "grad_norm": 8.48006534576416, "learning_rate": 4.069520983467571e-07, "loss": 0.4689, "mean_token_accuracy": 0.8502171039581299, "num_tokens": 36590927.0, "step": 961 }, { "epoch": 0.12237628800407073, "ewc_loss": 0.006031775381416082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.0317752286209725e-06, "grad_norm": 8.530495643615723, "learning_rate": 4.07376006782535e-07, "loss": 0.5017, "mean_token_accuracy": 0.8373183012008667, "num_tokens": 36625199.0, "step": 962 }, { "epoch": 0.12250349828266124, "ewc_loss": 0.006084012798964977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.084012966312002e-06, "grad_norm": 8.533292770385742, "learning_rate": 4.077999152183128e-07, "loss": 0.4637, "mean_token_accuracy": 0.8493717312812805, "num_tokens": 36658981.0, "step": 963 }, { "epoch": 0.12263070856125174, "ewc_loss": 0.00607262784615159, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.072627911635209e-06, "grad_norm": 8.473840713500977, "learning_rate": 4.082238236540907e-07, "loss": 0.5237, "mean_token_accuracy": 0.836290717124939, "num_tokens": 36704214.0, "step": 964 }, { "epoch": 0.12275791883984226, "ewc_loss": 0.006073392927646637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.0733927966794e-06, "grad_norm": 8.506292343139648, "learning_rate": 4.086477320898686e-07, "loss": 0.493, "mean_token_accuracy": 0.8406989574432373, "num_tokens": 36744782.0, "step": 965 }, { "epoch": 0.12288512911843277, "ewc_loss": 0.006084229797124863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.084229880798375e-06, "grad_norm": 8.500615119934082, "learning_rate": 4.090716405256465e-07, "loss": 0.5287, "mean_token_accuracy": 0.8294767141342163, "num_tokens": 36783645.0, "step": 966 }, { "epoch": 0.12301233939702327, "ewc_loss": 0.006082167383283377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.082167601562105e-06, "grad_norm": 8.60316276550293, "learning_rate": 4.094955489614243e-07, "loss": 0.4966, "mean_token_accuracy": 0.8396202921867371, "num_tokens": 36817539.0, "step": 967 }, { "epoch": 0.12313954967561379, "ewc_loss": 0.006101945880800486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.10194592809421e-06, "grad_norm": 8.54789924621582, "learning_rate": 4.099194573972022e-07, "loss": 0.4982, "mean_token_accuracy": 0.8393788933753967, "num_tokens": 36859144.0, "step": 968 }, { "epoch": 0.1232667599542043, "ewc_loss": 0.006081029307097197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.081029368942836e-06, "grad_norm": 8.542227745056152, "learning_rate": 4.1034336583298007e-07, "loss": 0.5408, "mean_token_accuracy": 0.8277215361595154, "num_tokens": 36900833.0, "step": 969 }, { "epoch": 0.1233939702327948, "ewc_loss": 0.006093968171626329, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.093968295317609e-06, "grad_norm": 8.509620666503906, "learning_rate": 4.1076727426875797e-07, "loss": 0.5302, "mean_token_accuracy": 0.8306743502616882, "num_tokens": 36940935.0, "step": 970 }, { "epoch": 0.12352118051138532, "ewc_loss": 0.006112409755587578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.112409664638108e-06, "grad_norm": 8.564183235168457, "learning_rate": 4.1119118270453577e-07, "loss": 0.5922, "mean_token_accuracy": 0.8105154037475586, "num_tokens": 36979874.0, "step": 971 }, { "epoch": 0.12364839078997583, "ewc_loss": 0.006130042485892773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.13004249316873e-06, "grad_norm": 8.587169647216797, "learning_rate": 4.1161509114031366e-07, "loss": 0.4741, "mean_token_accuracy": 0.8490318655967712, "num_tokens": 37017917.0, "step": 972 }, { "epoch": 0.12377560106856635, "ewc_loss": 0.00610377499833703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.1037749219394755e-06, "grad_norm": 8.539557456970215, "learning_rate": 4.1203899957609156e-07, "loss": 0.5379, "mean_token_accuracy": 0.829188883304596, "num_tokens": 37054914.0, "step": 973 }, { "epoch": 0.12390281134715685, "ewc_loss": 0.006104827858507633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.104827662056778e-06, "grad_norm": 8.644913673400879, "learning_rate": 4.124629080118694e-07, "loss": 0.4883, "mean_token_accuracy": 0.8402925729751587, "num_tokens": 37088277.0, "step": 974 }, { "epoch": 0.12403002162574736, "ewc_loss": 0.006147156003862619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.147156000224641e-06, "grad_norm": 8.547412872314453, "learning_rate": 4.1288681644764726e-07, "loss": 0.5347, "mean_token_accuracy": 0.8288030028343201, "num_tokens": 37126963.0, "step": 975 }, { "epoch": 0.12415723190433788, "ewc_loss": 0.006107218097895384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.107218268880388e-06, "grad_norm": 8.562424659729004, "learning_rate": 4.1331072488342515e-07, "loss": 0.548, "mean_token_accuracy": 0.8273612260818481, "num_tokens": 37161948.0, "step": 976 }, { "epoch": 0.12428444218292838, "ewc_loss": 0.006150261033326387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.150261015136493e-06, "grad_norm": 8.560718536376953, "learning_rate": 4.1373463331920305e-07, "loss": 0.4621, "mean_token_accuracy": 0.8519445657730103, "num_tokens": 37198552.0, "step": 977 }, { "epoch": 0.12441165246151889, "ewc_loss": 0.006145707797259092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.145707629912067e-06, "grad_norm": 8.58554458618164, "learning_rate": 4.141585417549809e-07, "loss": 0.5323, "mean_token_accuracy": 0.829123854637146, "num_tokens": 37235990.0, "step": 978 }, { "epoch": 0.1245388627401094, "ewc_loss": 0.006163309793919325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.1633099903701805e-06, "grad_norm": 8.66746997833252, "learning_rate": 4.1458245019075875e-07, "loss": 0.4916, "mean_token_accuracy": 0.8408505320549011, "num_tokens": 37269664.0, "step": 979 }, { "epoch": 0.1246660730186999, "ewc_loss": 0.006167768035084009, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.1677678786509205e-06, "grad_norm": 8.591779708862305, "learning_rate": 4.1500635862653664e-07, "loss": 0.5232, "mean_token_accuracy": 0.8341238498687744, "num_tokens": 37308647.0, "step": 980 }, { "epoch": 0.12479328329729042, "ewc_loss": 0.006162158213555813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.162158115330385e-06, "grad_norm": 8.554139137268066, "learning_rate": 4.1543026706231454e-07, "loss": 0.4724, "mean_token_accuracy": 0.8486605882644653, "num_tokens": 37345953.0, "step": 981 }, { "epoch": 0.12492049357588093, "ewc_loss": 0.006181071046739817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.181071057653753e-06, "grad_norm": 8.601630210876465, "learning_rate": 4.158541754980924e-07, "loss": 0.4686, "mean_token_accuracy": 0.8520139455795288, "num_tokens": 37382735.0, "step": 982 }, { "epoch": 0.12504770385447145, "ewc_loss": 0.006192459724843502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.192459750309354e-06, "grad_norm": 8.62958812713623, "learning_rate": 4.1627808393387024e-07, "loss": 0.5353, "mean_token_accuracy": 0.8266748785972595, "num_tokens": 37419605.0, "step": 983 }, { "epoch": 0.12517491413306195, "ewc_loss": 0.006195574067533016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.195573860168224e-06, "grad_norm": 8.629398345947266, "learning_rate": 4.1670199236964813e-07, "loss": 0.5278, "mean_token_accuracy": 0.8261998891830444, "num_tokens": 37455395.0, "step": 984 }, { "epoch": 0.12530212441165245, "ewc_loss": 0.006206748075783253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.206747912074206e-06, "grad_norm": 8.607645034790039, "learning_rate": 4.1712590080542603e-07, "loss": 0.4797, "mean_token_accuracy": 0.8463618159294128, "num_tokens": 37492208.0, "step": 985 }, { "epoch": 0.12542933469024298, "ewc_loss": 0.0062117427587509155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.211742856976343e-06, "grad_norm": 8.52033805847168, "learning_rate": 4.175498092412039e-07, "loss": 0.5256, "mean_token_accuracy": 0.8345812559127808, "num_tokens": 37533675.0, "step": 986 }, { "epoch": 0.12555654496883348, "ewc_loss": 0.006225126329809427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.225126526260283e-06, "grad_norm": 8.608586311340332, "learning_rate": 4.179737176769817e-07, "loss": 0.4904, "mean_token_accuracy": 0.8435400724411011, "num_tokens": 37572792.0, "step": 987 }, { "epoch": 0.12568375524742398, "ewc_loss": 0.0062496596947312355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.249659691093257e-06, "grad_norm": 8.590333938598633, "learning_rate": 4.183976261127596e-07, "loss": 0.4868, "mean_token_accuracy": 0.8425536155700684, "num_tokens": 37611481.0, "step": 988 }, { "epoch": 0.1258109655260145, "ewc_loss": 0.00623337272554636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.233372914721258e-06, "grad_norm": 8.640275001525879, "learning_rate": 4.1882153454853747e-07, "loss": 0.5656, "mean_token_accuracy": 0.8199460506439209, "num_tokens": 37648253.0, "step": 989 }, { "epoch": 0.125938175804605, "ewc_loss": 0.006261559668928385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.261559519771254e-06, "grad_norm": 8.595327377319336, "learning_rate": 4.1924544298431537e-07, "loss": 0.4982, "mean_token_accuracy": 0.8414607644081116, "num_tokens": 37687086.0, "step": 990 }, { "epoch": 0.12606538608319554, "ewc_loss": 0.006236493121832609, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.23649293629569e-06, "grad_norm": 8.543804168701172, "learning_rate": 4.196693514200932e-07, "loss": 0.4925, "mean_token_accuracy": 0.8430821299552917, "num_tokens": 37730928.0, "step": 991 }, { "epoch": 0.12619259636178604, "ewc_loss": 0.0062551372684538364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.255137122934684e-06, "grad_norm": 8.597197532653809, "learning_rate": 4.200932598558711e-07, "loss": 0.5177, "mean_token_accuracy": 0.8314274549484253, "num_tokens": 37771857.0, "step": 992 }, { "epoch": 0.12631980664037654, "ewc_loss": 0.0062710377387702465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.27103781880578e-06, "grad_norm": 8.611591339111328, "learning_rate": 4.2051716829164896e-07, "loss": 0.5204, "mean_token_accuracy": 0.8366706371307373, "num_tokens": 37817517.0, "step": 993 }, { "epoch": 0.12644701691896706, "ewc_loss": 0.00626119552180171, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.2611957218905445e-06, "grad_norm": 8.65492057800293, "learning_rate": 4.2094107672742686e-07, "loss": 0.5631, "mean_token_accuracy": 0.82523512840271, "num_tokens": 37855891.0, "step": 994 }, { "epoch": 0.12657422719755757, "ewc_loss": 0.0062659080140292645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.26590781394043e-06, "grad_norm": 8.675098419189453, "learning_rate": 4.2136498516320476e-07, "loss": 0.4839, "mean_token_accuracy": 0.8461703658103943, "num_tokens": 37888719.0, "step": 995 }, { "epoch": 0.12670143747614807, "ewc_loss": 0.006280382163822651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.280381967371795e-06, "grad_norm": 8.585158348083496, "learning_rate": 4.217888935989826e-07, "loss": 0.5347, "mean_token_accuracy": 0.8294896483421326, "num_tokens": 37932968.0, "step": 996 }, { "epoch": 0.1268286477547386, "ewc_loss": 0.006272079888731241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.272080099734012e-06, "grad_norm": 8.640687942504883, "learning_rate": 4.2221280203476045e-07, "loss": 0.5265, "mean_token_accuracy": 0.8298648595809937, "num_tokens": 37974297.0, "step": 997 }, { "epoch": 0.1269558580333291, "ewc_loss": 0.006298430263996124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.298430434981128e-06, "grad_norm": 8.663156509399414, "learning_rate": 4.2263671047053835e-07, "loss": 0.5435, "mean_token_accuracy": 0.8274380564689636, "num_tokens": 38020294.0, "step": 998 }, { "epoch": 0.1270830683119196, "ewc_loss": 0.0062909903936088085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.290990313573275e-06, "grad_norm": 8.610015869140625, "learning_rate": 4.2306061890631625e-07, "loss": 0.5232, "mean_token_accuracy": 0.8336665034294128, "num_tokens": 38063322.0, "step": 999 }, { "epoch": 0.12721027859051012, "ewc_loss": 0.006281181238591671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.281181413214654e-06, "grad_norm": 8.679471015930176, "learning_rate": 4.234845273420941e-07, "loss": 0.6121, "mean_token_accuracy": 0.8138822317123413, "num_tokens": 38101052.0, "step": 1000 }, { "epoch": 0.12733748886910062, "ewc_loss": 0.006315640173852444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.3156403484754264e-06, "grad_norm": 8.63736343383789, "learning_rate": 4.2390843577787194e-07, "loss": 0.5309, "mean_token_accuracy": 0.8300867080688477, "num_tokens": 38145472.0, "step": 1001 }, { "epoch": 0.12746469914769112, "ewc_loss": 0.006290240678936243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.290240889939014e-06, "grad_norm": 8.640230178833008, "learning_rate": 4.2433234421364984e-07, "loss": 0.5076, "mean_token_accuracy": 0.8384510278701782, "num_tokens": 38182662.0, "step": 1002 }, { "epoch": 0.12759190942628165, "ewc_loss": 0.006311722565442324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.3117227000475395e-06, "grad_norm": 8.707191467285156, "learning_rate": 4.2475625264942774e-07, "loss": 0.494, "mean_token_accuracy": 0.8429721593856812, "num_tokens": 38224735.0, "step": 1003 }, { "epoch": 0.12771911970487215, "ewc_loss": 0.0063210418447852135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.321041837509256e-06, "grad_norm": 8.74701976776123, "learning_rate": 4.251801610852056e-07, "loss": 0.5379, "mean_token_accuracy": 0.8291147947311401, "num_tokens": 38255847.0, "step": 1004 }, { "epoch": 0.12784632998346265, "ewc_loss": 0.006325856316834688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.325856247713091e-06, "grad_norm": 8.694070816040039, "learning_rate": 4.2560406952098343e-07, "loss": 0.5511, "mean_token_accuracy": 0.8268263936042786, "num_tokens": 38296921.0, "step": 1005 }, { "epoch": 0.12797354026205318, "ewc_loss": 0.006325747352093458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.325747563096229e-06, "grad_norm": 8.640689849853516, "learning_rate": 4.2602797795676133e-07, "loss": 0.5612, "mean_token_accuracy": 0.8226964473724365, "num_tokens": 38336408.0, "step": 1006 }, { "epoch": 0.12810075054064368, "ewc_loss": 0.006322736851871014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.32273668088601e-06, "grad_norm": 8.626866340637207, "learning_rate": 4.2645188639253923e-07, "loss": 0.4693, "mean_token_accuracy": 0.8473015427589417, "num_tokens": 38372809.0, "step": 1007 }, { "epoch": 0.12822796081923418, "ewc_loss": 0.006347816437482834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.347816452034749e-06, "grad_norm": 8.71854019165039, "learning_rate": 4.26875794828317e-07, "loss": 0.5289, "mean_token_accuracy": 0.8303528428077698, "num_tokens": 38413312.0, "step": 1008 }, { "epoch": 0.1283551710978247, "ewc_loss": 0.0063560726121068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.356072844937444e-06, "grad_norm": 8.659682273864746, "learning_rate": 4.272997032640949e-07, "loss": 0.5496, "mean_token_accuracy": 0.8251247406005859, "num_tokens": 38457710.0, "step": 1009 }, { "epoch": 0.1284823813764152, "ewc_loss": 0.0063396538607776165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.339653737086337e-06, "grad_norm": 8.665486335754395, "learning_rate": 4.277236116998728e-07, "loss": 0.5094, "mean_token_accuracy": 0.838225781917572, "num_tokens": 38499630.0, "step": 1010 }, { "epoch": 0.1286095916550057, "ewc_loss": 0.0063695465214550495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.369546554196859e-06, "grad_norm": 8.668535232543945, "learning_rate": 4.281475201356507e-07, "loss": 0.5167, "mean_token_accuracy": 0.8361667394638062, "num_tokens": 38538133.0, "step": 1011 }, { "epoch": 0.12873680193359624, "ewc_loss": 0.006375172641128302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.375172688422026e-06, "grad_norm": 8.715115547180176, "learning_rate": 4.285714285714285e-07, "loss": 0.55, "mean_token_accuracy": 0.8259090185165405, "num_tokens": 38576816.0, "step": 1012 }, { "epoch": 0.12886401221218674, "ewc_loss": 0.0063953823409974575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.395382570190122e-06, "grad_norm": 8.705638885498047, "learning_rate": 4.289953370072064e-07, "loss": 0.4878, "mean_token_accuracy": 0.8434296250343323, "num_tokens": 38615270.0, "step": 1013 }, { "epoch": 0.12899122249077727, "ewc_loss": 0.006384258158504963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.384258085745387e-06, "grad_norm": 8.709128379821777, "learning_rate": 4.294192454429843e-07, "loss": 0.5398, "mean_token_accuracy": 0.8272790908813477, "num_tokens": 38654173.0, "step": 1014 }, { "epoch": 0.12911843276936777, "ewc_loss": 0.006391901057213545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.391901024471736e-06, "grad_norm": 8.675749778747559, "learning_rate": 4.298431538787622e-07, "loss": 0.4258, "mean_token_accuracy": 0.8648408055305481, "num_tokens": 38695758.0, "step": 1015 }, { "epoch": 0.12924564304795827, "ewc_loss": 0.006389632821083069, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.389632744685514e-06, "grad_norm": 8.790831565856934, "learning_rate": 4.3026706231454e-07, "loss": 0.5718, "mean_token_accuracy": 0.8185831904411316, "num_tokens": 38729524.0, "step": 1016 }, { "epoch": 0.1293728533265488, "ewc_loss": 0.006443278398364782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.443278380174888e-06, "grad_norm": 8.737910270690918, "learning_rate": 4.306909707503179e-07, "loss": 0.5192, "mean_token_accuracy": 0.8377572298049927, "num_tokens": 38767500.0, "step": 1017 }, { "epoch": 0.1295000636051393, "ewc_loss": 0.006396183278411627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.396183380275033e-06, "grad_norm": 8.744110107421875, "learning_rate": 4.311148791860958e-07, "loss": 0.5064, "mean_token_accuracy": 0.8365095853805542, "num_tokens": 38798683.0, "step": 1018 }, { "epoch": 0.1296272738837298, "ewc_loss": 0.006444177124649286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.44417696094024e-06, "grad_norm": 8.748536109924316, "learning_rate": 4.315387876218737e-07, "loss": 0.5557, "mean_token_accuracy": 0.8252147436141968, "num_tokens": 38833653.0, "step": 1019 }, { "epoch": 0.12975448416232033, "ewc_loss": 0.006436958909034729, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.436958756239619e-06, "grad_norm": 8.721237182617188, "learning_rate": 4.319626960576515e-07, "loss": 0.5531, "mean_token_accuracy": 0.8215570449829102, "num_tokens": 38869428.0, "step": 1020 }, { "epoch": 0.12988169444091083, "ewc_loss": 0.006444558966904879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.444558948714985e-06, "grad_norm": 8.734073638916016, "learning_rate": 4.323866044934294e-07, "loss": 0.4895, "mean_token_accuracy": 0.8389244079589844, "num_tokens": 38909502.0, "step": 1021 }, { "epoch": 0.13000890471950133, "ewc_loss": 0.006462761666625738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.462761575676268e-06, "grad_norm": 8.74411678314209, "learning_rate": 4.328105129292073e-07, "loss": 0.5144, "mean_token_accuracy": 0.8371400833129883, "num_tokens": 38951730.0, "step": 1022 }, { "epoch": 0.13013611499809186, "ewc_loss": 0.0064561436884105206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.456143637478817e-06, "grad_norm": 8.756901741027832, "learning_rate": 4.332344213649852e-07, "loss": 0.4918, "mean_token_accuracy": 0.8395594954490662, "num_tokens": 38988981.0, "step": 1023 }, { "epoch": 0.13026332527668236, "ewc_loss": 0.00648276973515749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.482769549620571e-06, "grad_norm": 8.740255355834961, "learning_rate": 4.33658329800763e-07, "loss": 0.5002, "mean_token_accuracy": 0.8413975238800049, "num_tokens": 39026289.0, "step": 1024 }, { "epoch": 0.13039053555527286, "ewc_loss": 0.006474232766777277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.474232577602379e-06, "grad_norm": 8.726595878601074, "learning_rate": 4.340822382365409e-07, "loss": 0.4912, "mean_token_accuracy": 0.8403236269950867, "num_tokens": 39069113.0, "step": 1025 }, { "epoch": 0.13051774583386339, "ewc_loss": 0.006483763922005892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.483764082076959e-06, "grad_norm": 8.718708038330078, "learning_rate": 4.345061466723188e-07, "loss": 0.4751, "mean_token_accuracy": 0.8478521704673767, "num_tokens": 39112991.0, "step": 1026 }, { "epoch": 0.13064495611245389, "ewc_loss": 0.006497021298855543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.497021331597352e-06, "grad_norm": 8.787391662597656, "learning_rate": 4.3493005510809663e-07, "loss": 0.538, "mean_token_accuracy": 0.8294299840927124, "num_tokens": 39149593.0, "step": 1027 }, { "epoch": 0.1307721663910444, "ewc_loss": 0.0065023633651435375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.502363248728216e-06, "grad_norm": 8.788541793823242, "learning_rate": 4.353539635438745e-07, "loss": 0.5015, "mean_token_accuracy": 0.8393784165382385, "num_tokens": 39186570.0, "step": 1028 }, { "epoch": 0.13089937666963491, "ewc_loss": 0.00649635773152113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.496357855212409e-06, "grad_norm": 8.785104751586914, "learning_rate": 4.357778719796524e-07, "loss": 0.5648, "mean_token_accuracy": 0.8175914287567139, "num_tokens": 39220666.0, "step": 1029 }, { "epoch": 0.13102658694822542, "ewc_loss": 0.006506663281470537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.506663339678198e-06, "grad_norm": 8.719993591308594, "learning_rate": 4.362017804154303e-07, "loss": 0.4837, "mean_token_accuracy": 0.8464421629905701, "num_tokens": 39258372.0, "step": 1030 }, { "epoch": 0.13115379722681592, "ewc_loss": 0.006499553099274635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.499552910099737e-06, "grad_norm": 8.737582206726074, "learning_rate": 4.366256888512081e-07, "loss": 0.4788, "mean_token_accuracy": 0.8474994897842407, "num_tokens": 39293647.0, "step": 1031 }, { "epoch": 0.13128100750540644, "ewc_loss": 0.006512068677693605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.512068466690835e-06, "grad_norm": 8.778142929077148, "learning_rate": 4.3704959728698597e-07, "loss": 0.5076, "mean_token_accuracy": 0.8364864587783813, "num_tokens": 39332811.0, "step": 1032 }, { "epoch": 0.13140821778399694, "ewc_loss": 0.006520074792206287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.520074748550542e-06, "grad_norm": 8.774373054504395, "learning_rate": 4.3747350572276386e-07, "loss": 0.5089, "mean_token_accuracy": 0.8349869251251221, "num_tokens": 39365971.0, "step": 1033 }, { "epoch": 0.13153542806258745, "ewc_loss": 0.006512660067528486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.512660092994338e-06, "grad_norm": 8.779874801635742, "learning_rate": 4.3789741415854176e-07, "loss": 0.5222, "mean_token_accuracy": 0.8348445296287537, "num_tokens": 39403178.0, "step": 1034 }, { "epoch": 0.13166263834117797, "ewc_loss": 0.006538103334605694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.5381032072764356e-06, "grad_norm": 8.762660026550293, "learning_rate": 4.383213225943196e-07, "loss": 0.5127, "mean_token_accuracy": 0.8343515396118164, "num_tokens": 39441303.0, "step": 1035 }, { "epoch": 0.13178984861976847, "ewc_loss": 0.006520844530314207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.5208446358155925e-06, "grad_norm": 8.776810646057129, "learning_rate": 4.3874523103009746e-07, "loss": 0.4581, "mean_token_accuracy": 0.8527283072471619, "num_tokens": 39477359.0, "step": 1036 }, { "epoch": 0.13191705889835897, "ewc_loss": 0.006535818800330162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.535819011332933e-06, "grad_norm": 8.795928001403809, "learning_rate": 4.3916913946587536e-07, "loss": 0.4676, "mean_token_accuracy": 0.8468494415283203, "num_tokens": 39511687.0, "step": 1037 }, { "epoch": 0.1320442691769495, "ewc_loss": 0.006542908027768135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.542908067785902e-06, "grad_norm": 8.803021430969238, "learning_rate": 4.3959304790165325e-07, "loss": 0.4832, "mean_token_accuracy": 0.8426418304443359, "num_tokens": 39547453.0, "step": 1038 }, { "epoch": 0.13217147945554, "ewc_loss": 0.00653681019321084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.536810360557865e-06, "grad_norm": 8.756210327148438, "learning_rate": 4.400169563374311e-07, "loss": 0.5, "mean_token_accuracy": 0.838679313659668, "num_tokens": 39588934.0, "step": 1039 }, { "epoch": 0.13229868973413053, "ewc_loss": 0.006538356654345989, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.538356501550879e-06, "grad_norm": 8.810073852539062, "learning_rate": 4.4044086477320895e-07, "loss": 0.5925, "mean_token_accuracy": 0.8166037797927856, "num_tokens": 39627638.0, "step": 1040 }, { "epoch": 0.13242590001272103, "ewc_loss": 0.006575087551027536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.575087354576681e-06, "grad_norm": 8.846203804016113, "learning_rate": 4.4086477320898685e-07, "loss": 0.4993, "mean_token_accuracy": 0.8396193981170654, "num_tokens": 39661696.0, "step": 1041 }, { "epoch": 0.13255311029131153, "ewc_loss": 0.006567453034222126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.567453056049999e-06, "grad_norm": 8.721269607543945, "learning_rate": 4.4128868164476474e-07, "loss": 0.5084, "mean_token_accuracy": 0.84147047996521, "num_tokens": 39702357.0, "step": 1042 }, { "epoch": 0.13268032056990206, "ewc_loss": 0.0065533919259905815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.5533918132132385e-06, "grad_norm": 8.810694694519043, "learning_rate": 4.417125900805426e-07, "loss": 0.4921, "mean_token_accuracy": 0.8430976867675781, "num_tokens": 39744960.0, "step": 1043 }, { "epoch": 0.13280753084849256, "ewc_loss": 0.0066064391285181046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.606439001188846e-06, "grad_norm": 8.774020195007324, "learning_rate": 4.4213649851632044e-07, "loss": 0.5505, "mean_token_accuracy": 0.8283144235610962, "num_tokens": 39785394.0, "step": 1044 }, { "epoch": 0.13293474112708306, "ewc_loss": 0.006585222668945789, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.585222763533238e-06, "grad_norm": 8.820679664611816, "learning_rate": 4.4256040695209834e-07, "loss": 0.5314, "mean_token_accuracy": 0.8338047862052917, "num_tokens": 39822698.0, "step": 1045 }, { "epoch": 0.1330619514056736, "ewc_loss": 0.006613953970372677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.613954155909596e-06, "grad_norm": 8.850312232971191, "learning_rate": 4.429843153878762e-07, "loss": 0.5289, "mean_token_accuracy": 0.8299546837806702, "num_tokens": 39856874.0, "step": 1046 }, { "epoch": 0.1331891616842641, "ewc_loss": 0.00662467023357749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.6246702772332355e-06, "grad_norm": 8.83719539642334, "learning_rate": 4.434082238236541e-07, "loss": 0.5608, "mean_token_accuracy": 0.8210283517837524, "num_tokens": 39899683.0, "step": 1047 }, { "epoch": 0.1333163719628546, "ewc_loss": 0.006632204633206129, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.6322045313427225e-06, "grad_norm": 8.825278282165527, "learning_rate": 4.4383213225943193e-07, "loss": 0.5388, "mean_token_accuracy": 0.8303993940353394, "num_tokens": 39940565.0, "step": 1048 }, { "epoch": 0.13344358224144512, "ewc_loss": 0.00663400162011385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.634001692873426e-06, "grad_norm": 8.835959434509277, "learning_rate": 4.442560406952098e-07, "loss": 0.5383, "mean_token_accuracy": 0.8264076709747314, "num_tokens": 39979010.0, "step": 1049 }, { "epoch": 0.13357079252003562, "ewc_loss": 0.006646871566772461, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.646871497650864e-06, "grad_norm": 8.822970390319824, "learning_rate": 4.4467994913098767e-07, "loss": 0.4871, "mean_token_accuracy": 0.8445752859115601, "num_tokens": 40021151.0, "step": 1050 }, { "epoch": 0.13369800279862612, "ewc_loss": 0.006641810759902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.641810614382848e-06, "grad_norm": 8.84504222869873, "learning_rate": 4.4510385756676557e-07, "loss": 0.5472, "mean_token_accuracy": 0.8248797655105591, "num_tokens": 40059324.0, "step": 1051 }, { "epoch": 0.13382521307721665, "ewc_loss": 0.006661879830062389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.6618799792195205e-06, "grad_norm": 8.815520286560059, "learning_rate": 4.455277660025434e-07, "loss": 0.5024, "mean_token_accuracy": 0.837541401386261, "num_tokens": 40104577.0, "step": 1052 }, { "epoch": 0.13395242335580715, "ewc_loss": 0.00664472347125411, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.644723271165276e-06, "grad_norm": 8.893194198608398, "learning_rate": 4.459516744383213e-07, "loss": 0.5261, "mean_token_accuracy": 0.8283849954605103, "num_tokens": 40139734.0, "step": 1053 }, { "epoch": 0.13407963363439765, "ewc_loss": 0.006681296043097973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.681295872112969e-06, "grad_norm": 8.855525016784668, "learning_rate": 4.4637558287409916e-07, "loss": 0.5238, "mean_token_accuracy": 0.8325609564781189, "num_tokens": 40180724.0, "step": 1054 }, { "epoch": 0.13420684391298818, "ewc_loss": 0.006661929655820131, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.661929546680767e-06, "grad_norm": 8.853079795837402, "learning_rate": 4.4679949130987706e-07, "loss": 0.5219, "mean_token_accuracy": 0.8387815952301025, "num_tokens": 40220475.0, "step": 1055 }, { "epoch": 0.13433405419157868, "ewc_loss": 0.006668872199952602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.668872174486751e-06, "grad_norm": 8.831315040588379, "learning_rate": 4.472233997456549e-07, "loss": 0.5327, "mean_token_accuracy": 0.8297128677368164, "num_tokens": 40258208.0, "step": 1056 }, { "epoch": 0.13446126447016918, "ewc_loss": 0.006681529805064201, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.681529612251325e-06, "grad_norm": 8.894571304321289, "learning_rate": 4.476473081814328e-07, "loss": 0.517, "mean_token_accuracy": 0.8337315917015076, "num_tokens": 40299127.0, "step": 1057 }, { "epoch": 0.1345884747487597, "ewc_loss": 0.006686678621917963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.686678716505412e-06, "grad_norm": 8.831155776977539, "learning_rate": 4.4807121661721065e-07, "loss": 0.4754, "mean_token_accuracy": 0.8453074097633362, "num_tokens": 40339331.0, "step": 1058 }, { "epoch": 0.1347156850273502, "ewc_loss": 0.006678584031760693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.678584213659633e-06, "grad_norm": 8.852323532104492, "learning_rate": 4.4849512505298855e-07, "loss": 0.5196, "mean_token_accuracy": 0.833000659942627, "num_tokens": 40379386.0, "step": 1059 }, { "epoch": 0.1348428953059407, "ewc_loss": 0.006688685156404972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.688685061817523e-06, "grad_norm": 8.871649742126465, "learning_rate": 4.489190334887664e-07, "loss": 0.5571, "mean_token_accuracy": 0.8291006088256836, "num_tokens": 40419868.0, "step": 1060 }, { "epoch": 0.13497010558453124, "ewc_loss": 0.006703183054924011, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.7031828621111345e-06, "grad_norm": 8.925066947937012, "learning_rate": 4.493429419245443e-07, "loss": 0.5254, "mean_token_accuracy": 0.8343065977096558, "num_tokens": 40455346.0, "step": 1061 }, { "epoch": 0.13509731586312174, "ewc_loss": 0.0067086052149534225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.708605269523105e-06, "grad_norm": 8.909102439880371, "learning_rate": 4.4976685036032214e-07, "loss": 0.4839, "mean_token_accuracy": 0.8431966304779053, "num_tokens": 40490087.0, "step": 1062 }, { "epoch": 0.13522452614171224, "ewc_loss": 0.006707744672894478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.7077448875352275e-06, "grad_norm": 8.931010246276855, "learning_rate": 4.5019075879610004e-07, "loss": 0.5416, "mean_token_accuracy": 0.8247320652008057, "num_tokens": 40524799.0, "step": 1063 }, { "epoch": 0.13535173642030277, "ewc_loss": 0.006713054142892361, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.713054062856827e-06, "grad_norm": 8.940129280090332, "learning_rate": 4.506146672318779e-07, "loss": 0.5149, "mean_token_accuracy": 0.835648775100708, "num_tokens": 40563740.0, "step": 1064 }, { "epoch": 0.13547894669889327, "ewc_loss": 0.006718703545629978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.7187033891968895e-06, "grad_norm": 8.848691940307617, "learning_rate": 4.5103857566765573e-07, "loss": 0.5683, "mean_token_accuracy": 0.8249656558036804, "num_tokens": 40604743.0, "step": 1065 }, { "epoch": 0.1356061569774838, "ewc_loss": 0.006714285351336002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.7142855186830275e-06, "grad_norm": 8.949885368347168, "learning_rate": 4.5146248410343363e-07, "loss": 0.4739, "mean_token_accuracy": 0.8484461307525635, "num_tokens": 40640909.0, "step": 1066 }, { "epoch": 0.1357333672560743, "ewc_loss": 0.006751553155481815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.751552973582875e-06, "grad_norm": 8.88390064239502, "learning_rate": 4.5188639253921153e-07, "loss": 0.4819, "mean_token_accuracy": 0.8463665246963501, "num_tokens": 40678930.0, "step": 1067 }, { "epoch": 0.1358605775346648, "ewc_loss": 0.006722495891153812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.722495982103283e-06, "grad_norm": 8.937454223632812, "learning_rate": 4.523103009749894e-07, "loss": 0.4764, "mean_token_accuracy": 0.8456823229789734, "num_tokens": 40715609.0, "step": 1068 }, { "epoch": 0.13598778781325532, "ewc_loss": 0.006753811612725258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.753811703674728e-06, "grad_norm": 8.865582466125488, "learning_rate": 4.527342094107672e-07, "loss": 0.5008, "mean_token_accuracy": 0.8385908603668213, "num_tokens": 40760533.0, "step": 1069 }, { "epoch": 0.13611499809184582, "ewc_loss": 0.006763597019016743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.763596957171103e-06, "grad_norm": 8.948665618896484, "learning_rate": 4.531581178465451e-07, "loss": 0.5277, "mean_token_accuracy": 0.8319048881530762, "num_tokens": 40800039.0, "step": 1070 }, { "epoch": 0.13624220837043632, "ewc_loss": 0.006770261563360691, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.770261734345695e-06, "grad_norm": 8.942712783813477, "learning_rate": 4.53582026282323e-07, "loss": 0.4912, "mean_token_accuracy": 0.8432337045669556, "num_tokens": 40840207.0, "step": 1071 }, { "epoch": 0.13636941864902685, "ewc_loss": 0.006761233787983656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.761233635188546e-06, "grad_norm": 8.95228099822998, "learning_rate": 4.5400593471810087e-07, "loss": 0.5732, "mean_token_accuracy": 0.8172969818115234, "num_tokens": 40879559.0, "step": 1072 }, { "epoch": 0.13649662892761735, "ewc_loss": 0.006756657734513283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.7566575125965755e-06, "grad_norm": 8.911652565002441, "learning_rate": 4.544298431538787e-07, "loss": 0.5008, "mean_token_accuracy": 0.840157151222229, "num_tokens": 40919608.0, "step": 1073 }, { "epoch": 0.13662383920620785, "ewc_loss": 0.00677274726331234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.77274738336564e-06, "grad_norm": 8.973494529724121, "learning_rate": 4.548537515896566e-07, "loss": 0.561, "mean_token_accuracy": 0.8206901550292969, "num_tokens": 40953648.0, "step": 1074 }, { "epoch": 0.13675104948479838, "ewc_loss": 0.006785060279071331, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.785060122638242e-06, "grad_norm": 8.96776008605957, "learning_rate": 4.552776600254345e-07, "loss": 0.533, "mean_token_accuracy": 0.8305390477180481, "num_tokens": 40989675.0, "step": 1075 }, { "epoch": 0.13687825976338888, "ewc_loss": 0.0067916810512542725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.791681244067149e-06, "grad_norm": 8.917040824890137, "learning_rate": 4.5570156846121236e-07, "loss": 0.4838, "mean_token_accuracy": 0.8453344702720642, "num_tokens": 41027992.0, "step": 1076 }, { "epoch": 0.13700547004197938, "ewc_loss": 0.006809308659285307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.809308615629561e-06, "grad_norm": 8.977764129638672, "learning_rate": 4.561254768969902e-07, "loss": 0.4835, "mean_token_accuracy": 0.842264711856842, "num_tokens": 41066925.0, "step": 1077 }, { "epoch": 0.1371326803205699, "ewc_loss": 0.0068117594346404076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.811759249103488e-06, "grad_norm": 8.908035278320312, "learning_rate": 4.565493853327681e-07, "loss": 0.4928, "mean_token_accuracy": 0.8424062132835388, "num_tokens": 41105090.0, "step": 1078 }, { "epoch": 0.1372598905991604, "ewc_loss": 0.00679581006988883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.7958098952658474e-06, "grad_norm": 8.968186378479004, "learning_rate": 4.56973293768546e-07, "loss": 0.5346, "mean_token_accuracy": 0.8332430124282837, "num_tokens": 41141924.0, "step": 1079 }, { "epoch": 0.1373871008777509, "ewc_loss": 0.006821647752076387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.8216477302485146e-06, "grad_norm": 8.91375732421875, "learning_rate": 4.573972022043238e-07, "loss": 0.5964, "mean_token_accuracy": 0.8111985921859741, "num_tokens": 41181512.0, "step": 1080 }, { "epoch": 0.13751431115634144, "ewc_loss": 0.006828750018030405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.828749974374659e-06, "grad_norm": 8.959160804748535, "learning_rate": 4.578211106401017e-07, "loss": 0.5442, "mean_token_accuracy": 0.8268597722053528, "num_tokens": 41224015.0, "step": 1081 }, { "epoch": 0.13764152143493194, "ewc_loss": 0.006853129249066114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.853129434603034e-06, "grad_norm": 8.934422492980957, "learning_rate": 4.582450190758796e-07, "loss": 0.5459, "mean_token_accuracy": 0.8282701969146729, "num_tokens": 41265568.0, "step": 1082 }, { "epoch": 0.13776873171352244, "ewc_loss": 0.006848588585853577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.848588782304432e-06, "grad_norm": 8.942715644836426, "learning_rate": 4.586689275116575e-07, "loss": 0.4589, "mean_token_accuracy": 0.8527233004570007, "num_tokens": 41303595.0, "step": 1083 }, { "epoch": 0.13789594199211297, "ewc_loss": 0.006860238034278154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.860238045192091e-06, "grad_norm": 8.98537826538086, "learning_rate": 4.590928359474353e-07, "loss": 0.5634, "mean_token_accuracy": 0.8227818012237549, "num_tokens": 41340119.0, "step": 1084 }, { "epoch": 0.13802315227070347, "ewc_loss": 0.006876401137560606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.876401130284648e-06, "grad_norm": 8.996471405029297, "learning_rate": 4.595167443832132e-07, "loss": 0.5146, "mean_token_accuracy": 0.8390593528747559, "num_tokens": 41380184.0, "step": 1085 }, { "epoch": 0.13815036254929397, "ewc_loss": 0.0068657975643873215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.865797331556678e-06, "grad_norm": 8.922409057617188, "learning_rate": 4.599406528189911e-07, "loss": 0.4572, "mean_token_accuracy": 0.851803719997406, "num_tokens": 41419038.0, "step": 1086 }, { "epoch": 0.1382775728278845, "ewc_loss": 0.006875763647258282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.8757635744987056e-06, "grad_norm": 8.986620903015137, "learning_rate": 4.60364561254769e-07, "loss": 0.5014, "mean_token_accuracy": 0.8365153670310974, "num_tokens": 41455767.0, "step": 1087 }, { "epoch": 0.138404783106475, "ewc_loss": 0.006908213719725609, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.908213890710613e-06, "grad_norm": 9.001883506774902, "learning_rate": 4.607884696905468e-07, "loss": 0.498, "mean_token_accuracy": 0.8391926884651184, "num_tokens": 41497493.0, "step": 1088 }, { "epoch": 0.1385319933850655, "ewc_loss": 0.006899776868522167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.899776963109616e-06, "grad_norm": 8.94395637512207, "learning_rate": 4.612123781263247e-07, "loss": 0.529, "mean_token_accuracy": 0.8312018513679504, "num_tokens": 41542940.0, "step": 1089 }, { "epoch": 0.13865920366365603, "ewc_loss": 0.006890661083161831, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.890661097713746e-06, "grad_norm": 8.999587059020996, "learning_rate": 4.616362865621026e-07, "loss": 0.4294, "mean_token_accuracy": 0.8583798408508301, "num_tokens": 41579302.0, "step": 1090 }, { "epoch": 0.13878641394224653, "ewc_loss": 0.0069206212647259235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.920621217432199e-06, "grad_norm": 9.042686462402344, "learning_rate": 4.620601949978805e-07, "loss": 0.5286, "mean_token_accuracy": 0.8328357934951782, "num_tokens": 41617555.0, "step": 1091 }, { "epoch": 0.13891362422083706, "ewc_loss": 0.006910054013133049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.910053798492299e-06, "grad_norm": 9.044805526733398, "learning_rate": 4.6248410343365827e-07, "loss": 0.5795, "mean_token_accuracy": 0.8156035542488098, "num_tokens": 41654660.0, "step": 1092 }, { "epoch": 0.13904083449942756, "ewc_loss": 0.006926964968442917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.926964942977065e-06, "grad_norm": 9.011391639709473, "learning_rate": 4.6290801186943617e-07, "loss": 0.4866, "mean_token_accuracy": 0.8439648747444153, "num_tokens": 41693773.0, "step": 1093 }, { "epoch": 0.13916804477801806, "ewc_loss": 0.006917580030858517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.91757986714947e-06, "grad_norm": 8.998295783996582, "learning_rate": 4.6333192030521407e-07, "loss": 0.5008, "mean_token_accuracy": 0.842637300491333, "num_tokens": 41728864.0, "step": 1094 }, { "epoch": 0.13929525505660859, "ewc_loss": 0.006939306389540434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.939306331332773e-06, "grad_norm": 8.988329887390137, "learning_rate": 4.6375582874099196e-07, "loss": 0.475, "mean_token_accuracy": 0.847097635269165, "num_tokens": 41770596.0, "step": 1095 }, { "epoch": 0.1394224653351991, "ewc_loss": 0.006941234692931175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.941234460100532e-06, "grad_norm": 9.01445484161377, "learning_rate": 4.6417973717676976e-07, "loss": 0.4941, "mean_token_accuracy": 0.8450824618339539, "num_tokens": 41806585.0, "step": 1096 }, { "epoch": 0.1395496756137896, "ewc_loss": 0.0069379135966300964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.937913440197008e-06, "grad_norm": 8.979991912841797, "learning_rate": 4.6460364561254766e-07, "loss": 0.5217, "mean_token_accuracy": 0.8316494226455688, "num_tokens": 41845488.0, "step": 1097 }, { "epoch": 0.13967688589238011, "ewc_loss": 0.006974833086133003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.974833013373427e-06, "grad_norm": 9.079938888549805, "learning_rate": 4.6502755404832556e-07, "loss": 0.4879, "mean_token_accuracy": 0.8423018455505371, "num_tokens": 41880674.0, "step": 1098 }, { "epoch": 0.13980409617097062, "ewc_loss": 0.006983492523431778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.983492312429007e-06, "grad_norm": 9.047505378723145, "learning_rate": 4.654514624841034e-07, "loss": 0.5023, "mean_token_accuracy": 0.8378487825393677, "num_tokens": 41913020.0, "step": 1099 }, { "epoch": 0.13993130644956112, "ewc_loss": 0.006949840113520622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.949840098968707e-06, "grad_norm": 9.009134292602539, "learning_rate": 4.6587537091988125e-07, "loss": 0.5079, "mean_token_accuracy": 0.8384613990783691, "num_tokens": 41948483.0, "step": 1100 }, { "epoch": 0.14005851672815164, "ewc_loss": 0.006975318305194378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.975318228796823e-06, "grad_norm": 9.06942081451416, "learning_rate": 4.6629927935565915e-07, "loss": 0.4792, "mean_token_accuracy": 0.846053421497345, "num_tokens": 41985659.0, "step": 1101 }, { "epoch": 0.14018572700674214, "ewc_loss": 0.006987839937210083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.987840151850833e-06, "grad_norm": 8.963232040405273, "learning_rate": 4.6672318779143705e-07, "loss": 0.5764, "mean_token_accuracy": 0.8175629377365112, "num_tokens": 42027746.0, "step": 1102 }, { "epoch": 0.14031293728533265, "ewc_loss": 0.006978786084800959, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.978786132094683e-06, "grad_norm": 9.0798921585083, "learning_rate": 4.671470962272149e-07, "loss": 0.5327, "mean_token_accuracy": 0.8284912109375, "num_tokens": 42060903.0, "step": 1103 }, { "epoch": 0.14044014756392317, "ewc_loss": 0.007026578765362501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.026578714430798e-06, "grad_norm": 9.015740394592285, "learning_rate": 4.6757100466299274e-07, "loss": 0.5276, "mean_token_accuracy": 0.8347828388214111, "num_tokens": 42102722.0, "step": 1104 }, { "epoch": 0.14056735784251367, "ewc_loss": 0.006990180350840092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 6.990180281718494e-06, "grad_norm": 9.012999534606934, "learning_rate": 4.6799491309877064e-07, "loss": 0.5368, "mean_token_accuracy": 0.8267877101898193, "num_tokens": 42143418.0, "step": 1105 }, { "epoch": 0.14069456812110417, "ewc_loss": 0.007030877750366926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.0308778958860785e-06, "grad_norm": 9.062883377075195, "learning_rate": 4.6841882153454854e-07, "loss": 0.4762, "mean_token_accuracy": 0.8439474105834961, "num_tokens": 42177313.0, "step": 1106 }, { "epoch": 0.1408217783996947, "ewc_loss": 0.007037315517663956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.037315299385227e-06, "grad_norm": 9.090085983276367, "learning_rate": 4.688427299703264e-07, "loss": 0.4557, "mean_token_accuracy": 0.8498104214668274, "num_tokens": 42214158.0, "step": 1107 }, { "epoch": 0.1409489886782852, "ewc_loss": 0.007036636583507061, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.036636361590354e-06, "grad_norm": 9.023221015930176, "learning_rate": 4.6926663840610423e-07, "loss": 0.5145, "mean_token_accuracy": 0.836715579032898, "num_tokens": 42254189.0, "step": 1108 }, { "epoch": 0.1410761989568757, "ewc_loss": 0.007019634824246168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.0196347223827615e-06, "grad_norm": 9.021513938903809, "learning_rate": 4.6969054684188213e-07, "loss": 0.5366, "mean_token_accuracy": 0.8258639574050903, "num_tokens": 42296853.0, "step": 1109 }, { "epoch": 0.14120340923546623, "ewc_loss": 0.00705695990473032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.056959930196172e-06, "grad_norm": 9.08816909790039, "learning_rate": 4.7011445527766003e-07, "loss": 0.507, "mean_token_accuracy": 0.8354595899581909, "num_tokens": 42331983.0, "step": 1110 }, { "epoch": 0.14133061951405673, "ewc_loss": 0.007057524286210537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.057524271658622e-06, "grad_norm": 9.02276611328125, "learning_rate": 4.7053836371343787e-07, "loss": 0.5161, "mean_token_accuracy": 0.8372000455856323, "num_tokens": 42370496.0, "step": 1111 }, { "epoch": 0.14145782979264723, "ewc_loss": 0.007047000806778669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.047000963211758e-06, "grad_norm": 9.072637557983398, "learning_rate": 4.709622721492157e-07, "loss": 0.495, "mean_token_accuracy": 0.8422063589096069, "num_tokens": 42406489.0, "step": 1112 }, { "epoch": 0.14158504007123776, "ewc_loss": 0.007053990848362446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.053990884742234e-06, "grad_norm": 9.028650283813477, "learning_rate": 4.713861805849936e-07, "loss": 0.4948, "mean_token_accuracy": 0.840084969997406, "num_tokens": 42440679.0, "step": 1113 }, { "epoch": 0.14171225034982826, "ewc_loss": 0.007051500491797924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.05150068824878e-06, "grad_norm": 9.05534839630127, "learning_rate": 4.718100890207715e-07, "loss": 0.5678, "mean_token_accuracy": 0.8187821507453918, "num_tokens": 42476699.0, "step": 1114 }, { "epoch": 0.1418394606284188, "ewc_loss": 0.007083964999765158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.083965101628564e-06, "grad_norm": 9.070419311523438, "learning_rate": 4.7223399745654936e-07, "loss": 0.5237, "mean_token_accuracy": 0.8325636386871338, "num_tokens": 42513348.0, "step": 1115 }, { "epoch": 0.1419666709070093, "ewc_loss": 0.007077380083501339, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.077379905240377e-06, "grad_norm": 9.068018913269043, "learning_rate": 4.726579058923272e-07, "loss": 0.5154, "mean_token_accuracy": 0.8376848697662354, "num_tokens": 42550633.0, "step": 1116 }, { "epoch": 0.1420938811855998, "ewc_loss": 0.007104905322194099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.104905307642184e-06, "grad_norm": 9.081557273864746, "learning_rate": 4.730818143281051e-07, "loss": 0.5395, "mean_token_accuracy": 0.8309736251831055, "num_tokens": 42593649.0, "step": 1117 }, { "epoch": 0.14222109146419032, "ewc_loss": 0.0071257916279137135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.125791398721049e-06, "grad_norm": 9.132424354553223, "learning_rate": 4.7350572276388295e-07, "loss": 0.4863, "mean_token_accuracy": 0.844138503074646, "num_tokens": 42632379.0, "step": 1118 }, { "epoch": 0.14234830174278082, "ewc_loss": 0.007113007828593254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.113007995940279e-06, "grad_norm": 9.083820343017578, "learning_rate": 4.7392963119966085e-07, "loss": 0.5127, "mean_token_accuracy": 0.8347084522247314, "num_tokens": 42670319.0, "step": 1119 }, { "epoch": 0.14247551202137132, "ewc_loss": 0.007112225983291864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.1122258304967545e-06, "grad_norm": 9.13791275024414, "learning_rate": 4.7435353963543875e-07, "loss": 0.506, "mean_token_accuracy": 0.8398702144622803, "num_tokens": 42712680.0, "step": 1120 }, { "epoch": 0.14260272229996185, "ewc_loss": 0.007138082291930914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.138082310120808e-06, "grad_norm": 9.091802597045898, "learning_rate": 4.747774480712166e-07, "loss": 0.5232, "mean_token_accuracy": 0.8359050154685974, "num_tokens": 42748241.0, "step": 1121 }, { "epoch": 0.14272993257855235, "ewc_loss": 0.007121294271200895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.121294402168132e-06, "grad_norm": 9.128171920776367, "learning_rate": 4.7520135650699444e-07, "loss": 0.5434, "mean_token_accuracy": 0.8277984857559204, "num_tokens": 42783208.0, "step": 1122 }, { "epoch": 0.14285714285714285, "ewc_loss": 0.007155243307352066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.1552431109012105e-06, "grad_norm": 9.115145683288574, "learning_rate": 4.7562526494277234e-07, "loss": 0.5182, "mean_token_accuracy": 0.8361955881118774, "num_tokens": 42818570.0, "step": 1123 }, { "epoch": 0.14298435313573338, "ewc_loss": 0.0071474527940154076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.1474528340331744e-06, "grad_norm": 9.106606483459473, "learning_rate": 4.7604917337855024e-07, "loss": 0.457, "mean_token_accuracy": 0.8509628176689148, "num_tokens": 42852891.0, "step": 1124 }, { "epoch": 0.14311156341432388, "ewc_loss": 0.007159212604165077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.159212600527098e-06, "grad_norm": 9.115203857421875, "learning_rate": 4.764730818143281e-07, "loss": 0.5071, "mean_token_accuracy": 0.8311700224876404, "num_tokens": 42887359.0, "step": 1125 }, { "epoch": 0.14323877369291438, "ewc_loss": 0.0071658967062830925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.165896477090428e-06, "grad_norm": 9.145830154418945, "learning_rate": 4.768969902501059e-07, "loss": 0.5602, "mean_token_accuracy": 0.8217694759368896, "num_tokens": 42929308.0, "step": 1126 }, { "epoch": 0.1433659839715049, "ewc_loss": 0.007186630740761757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.186630682554096e-06, "grad_norm": 9.067960739135742, "learning_rate": 4.773208986858838e-07, "loss": 0.5094, "mean_token_accuracy": 0.8347816467285156, "num_tokens": 42969373.0, "step": 1127 }, { "epoch": 0.1434931942500954, "ewc_loss": 0.007167306262999773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.167306193878176e-06, "grad_norm": 9.1634521484375, "learning_rate": 4.777448071216617e-07, "loss": 0.5137, "mean_token_accuracy": 0.8367838263511658, "num_tokens": 43010133.0, "step": 1128 }, { "epoch": 0.1436204045286859, "ewc_loss": 0.00722142867743969, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.22142885933863e-06, "grad_norm": 9.180391311645508, "learning_rate": 4.781687155574396e-07, "loss": 0.5399, "mean_token_accuracy": 0.8281848430633545, "num_tokens": 43053743.0, "step": 1129 }, { "epoch": 0.14374761480727644, "ewc_loss": 0.0071877907030284405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.187790743046207e-06, "grad_norm": 9.12755012512207, "learning_rate": 4.785926239932175e-07, "loss": 0.4802, "mean_token_accuracy": 0.8452798128128052, "num_tokens": 43094125.0, "step": 1130 }, { "epoch": 0.14387482508586694, "ewc_loss": 0.007205585949122906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.2055859163810965e-06, "grad_norm": 9.188690185546875, "learning_rate": 4.790165324289953e-07, "loss": 0.5072, "mean_token_accuracy": 0.8354777097702026, "num_tokens": 43132399.0, "step": 1131 }, { "epoch": 0.14400203536445744, "ewc_loss": 0.007209449075162411, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.209448995126877e-06, "grad_norm": 9.223776817321777, "learning_rate": 4.794404408647732e-07, "loss": 0.5215, "mean_token_accuracy": 0.8352668285369873, "num_tokens": 43169036.0, "step": 1132 }, { "epoch": 0.14412924564304797, "ewc_loss": 0.007210292387753725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.210292551462771e-06, "grad_norm": 9.170967102050781, "learning_rate": 4.798643493005511e-07, "loss": 0.4508, "mean_token_accuracy": 0.8523489236831665, "num_tokens": 43209397.0, "step": 1133 }, { "epoch": 0.14425645592163847, "ewc_loss": 0.007199307903647423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.199308129202109e-06, "grad_norm": 9.162983894348145, "learning_rate": 4.80288257736329e-07, "loss": 0.5493, "mean_token_accuracy": 0.8268545866012573, "num_tokens": 43246788.0, "step": 1134 }, { "epoch": 0.14438366620022897, "ewc_loss": 0.0072144209407269955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.214420747914119e-06, "grad_norm": 9.183460235595703, "learning_rate": 4.807121661721068e-07, "loss": 0.4602, "mean_token_accuracy": 0.8506007790565491, "num_tokens": 43282774.0, "step": 1135 }, { "epoch": 0.1445108764788195, "ewc_loss": 0.007218570448458195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.218570317490958e-06, "grad_norm": 9.113341331481934, "learning_rate": 4.811360746078847e-07, "loss": 0.5178, "mean_token_accuracy": 0.8385640978813171, "num_tokens": 43322303.0, "step": 1136 }, { "epoch": 0.14463808675741, "ewc_loss": 0.007217052858322859, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.2170528255810495e-06, "grad_norm": 9.205780982971191, "learning_rate": 4.815599830436625e-07, "loss": 0.5104, "mean_token_accuracy": 0.8362557888031006, "num_tokens": 43354611.0, "step": 1137 }, { "epoch": 0.1447652970360005, "ewc_loss": 0.007242872379720211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.242872470669681e-06, "grad_norm": 9.173937797546387, "learning_rate": 4.819838914794405e-07, "loss": 0.492, "mean_token_accuracy": 0.8420190215110779, "num_tokens": 43390640.0, "step": 1138 }, { "epoch": 0.14489250731459102, "ewc_loss": 0.007225680630654097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.2256807470694184e-06, "grad_norm": 9.156805038452148, "learning_rate": 4.824077999152183e-07, "loss": 0.5127, "mean_token_accuracy": 0.832589864730835, "num_tokens": 43424513.0, "step": 1139 }, { "epoch": 0.14501971759318152, "ewc_loss": 0.0072325365617871284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.232536518131383e-06, "grad_norm": 9.189584732055664, "learning_rate": 4.828317083509962e-07, "loss": 0.5185, "mean_token_accuracy": 0.8331106901168823, "num_tokens": 43461699.0, "step": 1140 }, { "epoch": 0.14514692787177205, "ewc_loss": 0.00725482078269124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.254820957314223e-06, "grad_norm": 9.258774757385254, "learning_rate": 4.83255616786774e-07, "loss": 0.4921, "mean_token_accuracy": 0.8460241556167603, "num_tokens": 43494681.0, "step": 1141 }, { "epoch": 0.14527413815036255, "ewc_loss": 0.007276573218405247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.276573342096526e-06, "grad_norm": 9.19938850402832, "learning_rate": 4.83679525222552e-07, "loss": 0.4679, "mean_token_accuracy": 0.849819540977478, "num_tokens": 43531318.0, "step": 1142 }, { "epoch": 0.14540134842895305, "ewc_loss": 0.007244214881211519, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.244214884849498e-06, "grad_norm": 9.176626205444336, "learning_rate": 4.841034336583298e-07, "loss": 0.4835, "mean_token_accuracy": 0.8490703105926514, "num_tokens": 43569991.0, "step": 1143 }, { "epoch": 0.14552855870754358, "ewc_loss": 0.00727257551625371, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.272575658134883e-06, "grad_norm": 9.172306060791016, "learning_rate": 4.845273420941076e-07, "loss": 0.4714, "mean_token_accuracy": 0.8497557640075684, "num_tokens": 43609978.0, "step": 1144 }, { "epoch": 0.14565576898613408, "ewc_loss": 0.007284253370016813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.284253570105648e-06, "grad_norm": 9.266481399536133, "learning_rate": 4.849512505298855e-07, "loss": 0.5207, "mean_token_accuracy": 0.8293027877807617, "num_tokens": 43640419.0, "step": 1145 }, { "epoch": 0.14578297926472458, "ewc_loss": 0.007304253987967968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.304253813344985e-06, "grad_norm": 9.164336204528809, "learning_rate": 4.853751589656634e-07, "loss": 0.4813, "mean_token_accuracy": 0.845633327960968, "num_tokens": 43682408.0, "step": 1146 }, { "epoch": 0.1459101895433151, "ewc_loss": 0.007275609765201807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.275609732459998e-06, "grad_norm": 9.149904251098633, "learning_rate": 4.857990674014413e-07, "loss": 0.5205, "mean_token_accuracy": 0.8322587609291077, "num_tokens": 43726158.0, "step": 1147 }, { "epoch": 0.1460373998219056, "ewc_loss": 0.0073172952979803085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.317295512621058e-06, "grad_norm": 9.211491584777832, "learning_rate": 4.862229758372191e-07, "loss": 0.5507, "mean_token_accuracy": 0.8224825859069824, "num_tokens": 43766466.0, "step": 1148 }, { "epoch": 0.1461646101004961, "ewc_loss": 0.007306298241019249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.306298357434571e-06, "grad_norm": 9.223184585571289, "learning_rate": 4.86646884272997e-07, "loss": 0.5364, "mean_token_accuracy": 0.8273011445999146, "num_tokens": 43807255.0, "step": 1149 }, { "epoch": 0.14629182037908664, "ewc_loss": 0.007306694984436035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.306694897124544e-06, "grad_norm": 9.189146995544434, "learning_rate": 4.870707927087749e-07, "loss": 0.5157, "mean_token_accuracy": 0.8346830606460571, "num_tokens": 43845419.0, "step": 1150 }, { "epoch": 0.14641903065767714, "ewc_loss": 0.007326249498873949, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.326249487960013e-06, "grad_norm": 9.261659622192383, "learning_rate": 4.874947011445528e-07, "loss": 0.5244, "mean_token_accuracy": 0.8352789878845215, "num_tokens": 43882942.0, "step": 1151 }, { "epoch": 0.14654624093626764, "ewc_loss": 0.0073402379639446735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.340237971220631e-06, "grad_norm": 9.183905601501465, "learning_rate": 4.879186095803306e-07, "loss": 0.4434, "mean_token_accuracy": 0.8570513725280762, "num_tokens": 43923109.0, "step": 1152 }, { "epoch": 0.14667345121485817, "ewc_loss": 0.007326407823711634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.326407740038121e-06, "grad_norm": 9.251513481140137, "learning_rate": 4.883425180161085e-07, "loss": 0.4808, "mean_token_accuracy": 0.8436641693115234, "num_tokens": 43959752.0, "step": 1153 }, { "epoch": 0.14680066149344867, "ewc_loss": 0.007342388387769461, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.342388471442973e-06, "grad_norm": 9.238960266113281, "learning_rate": 4.887664264518864e-07, "loss": 0.5288, "mean_token_accuracy": 0.8359091281890869, "num_tokens": 43996247.0, "step": 1154 }, { "epoch": 0.14692787177203917, "ewc_loss": 0.007337468210607767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.337468105106382e-06, "grad_norm": 9.192916870117188, "learning_rate": 4.891903348876643e-07, "loss": 0.5109, "mean_token_accuracy": 0.8395383954048157, "num_tokens": 44035751.0, "step": 1155 }, { "epoch": 0.1470550820506297, "ewc_loss": 0.007327654864639044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.327654657274252e-06, "grad_norm": 9.24500846862793, "learning_rate": 4.896142433234421e-07, "loss": 0.5461, "mean_token_accuracy": 0.8318984508514404, "num_tokens": 44073225.0, "step": 1156 }, { "epoch": 0.1471822923292202, "ewc_loss": 0.007365454453974962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.365454621321987e-06, "grad_norm": 9.245506286621094, "learning_rate": 4.9003815175922e-07, "loss": 0.5485, "mean_token_accuracy": 0.8311057686805725, "num_tokens": 44111761.0, "step": 1157 }, { "epoch": 0.1473095026078107, "ewc_loss": 0.007333132438361645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.33313254386303e-06, "grad_norm": 9.21023178100586, "learning_rate": 4.904620601949979e-07, "loss": 0.4997, "mean_token_accuracy": 0.8399181962013245, "num_tokens": 44151311.0, "step": 1158 }, { "epoch": 0.14743671288640123, "ewc_loss": 0.0073616523295640945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.361652478721226e-06, "grad_norm": 9.265753746032715, "learning_rate": 4.908859686307758e-07, "loss": 0.5455, "mean_token_accuracy": 0.8292182683944702, "num_tokens": 44184623.0, "step": 1159 }, { "epoch": 0.14756392316499173, "ewc_loss": 0.007370122708380222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.370122602878837e-06, "grad_norm": 9.237930297851562, "learning_rate": 4.913098770665536e-07, "loss": 0.5307, "mean_token_accuracy": 0.8341466784477234, "num_tokens": 44221753.0, "step": 1160 }, { "epoch": 0.14769113344358223, "ewc_loss": 0.0073557947762310505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.355794878094457e-06, "grad_norm": 9.204141616821289, "learning_rate": 4.917337855023314e-07, "loss": 0.4838, "mean_token_accuracy": 0.8413404822349548, "num_tokens": 44262542.0, "step": 1161 }, { "epoch": 0.14781834372217276, "ewc_loss": 0.007372713182121515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.372713298536837e-06, "grad_norm": 9.285285949707031, "learning_rate": 4.921576939381094e-07, "loss": 0.5069, "mean_token_accuracy": 0.8424044251441956, "num_tokens": 44300294.0, "step": 1162 }, { "epoch": 0.14794555400076326, "ewc_loss": 0.00740942033007741, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.4094205047003925e-06, "grad_norm": 9.325693130493164, "learning_rate": 4.925816023738872e-07, "loss": 0.5522, "mean_token_accuracy": 0.8239065408706665, "num_tokens": 44338476.0, "step": 1163 }, { "epoch": 0.14807276427935376, "ewc_loss": 0.0074002426117658615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.400242793664802e-06, "grad_norm": 9.308759689331055, "learning_rate": 4.930055108096651e-07, "loss": 0.4546, "mean_token_accuracy": 0.8531520366668701, "num_tokens": 44367437.0, "step": 1164 }, { "epoch": 0.1481999745579443, "ewc_loss": 0.007410254795104265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.410254966089269e-06, "grad_norm": 9.2908296585083, "learning_rate": 4.934294192454429e-07, "loss": 0.5648, "mean_token_accuracy": 0.8221063017845154, "num_tokens": 44402636.0, "step": 1165 }, { "epoch": 0.1483271848365348, "ewc_loss": 0.007415704429149628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.415704203594942e-06, "grad_norm": 9.311450004577637, "learning_rate": 4.938533276812209e-07, "loss": 0.5067, "mean_token_accuracy": 0.8389235734939575, "num_tokens": 44438187.0, "step": 1166 }, { "epoch": 0.14845439511512531, "ewc_loss": 0.007426337338984013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.42633756090072e-06, "grad_norm": 9.232909202575684, "learning_rate": 4.942772361169987e-07, "loss": 0.4303, "mean_token_accuracy": 0.8561967611312866, "num_tokens": 44478345.0, "step": 1167 }, { "epoch": 0.14858160539371582, "ewc_loss": 0.007426311261951923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.426311185554368e-06, "grad_norm": 9.283711433410645, "learning_rate": 4.947011445527766e-07, "loss": 0.5106, "mean_token_accuracy": 0.8367986679077148, "num_tokens": 44520272.0, "step": 1168 }, { "epoch": 0.14870881567230632, "ewc_loss": 0.0074396333657205105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.439633463945938e-06, "grad_norm": 9.236552238464355, "learning_rate": 4.951250529885544e-07, "loss": 0.461, "mean_token_accuracy": 0.8534404635429382, "num_tokens": 44561058.0, "step": 1169 }, { "epoch": 0.14883602595089684, "ewc_loss": 0.007433551829308271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.433551672875183e-06, "grad_norm": 9.339798927307129, "learning_rate": 4.955489614243324e-07, "loss": 0.5949, "mean_token_accuracy": 0.8095171451568604, "num_tokens": 44599371.0, "step": 1170 }, { "epoch": 0.14896323622948734, "ewc_loss": 0.007475196849554777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.475196980522014e-06, "grad_norm": 9.296062469482422, "learning_rate": 4.959728698601102e-07, "loss": 0.5052, "mean_token_accuracy": 0.8309155702590942, "num_tokens": 44640499.0, "step": 1171 }, { "epoch": 0.14909044650807785, "ewc_loss": 0.0074441684409976006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.444168659276329e-06, "grad_norm": 9.22839641571045, "learning_rate": 4.963967782958881e-07, "loss": 0.4913, "mean_token_accuracy": 0.8436141014099121, "num_tokens": 44683164.0, "step": 1172 }, { "epoch": 0.14921765678666837, "ewc_loss": 0.0074628195725381374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.462819667125586e-06, "grad_norm": 9.307975769042969, "learning_rate": 4.968206867316659e-07, "loss": 0.5053, "mean_token_accuracy": 0.836128830909729, "num_tokens": 44721938.0, "step": 1173 }, { "epoch": 0.14934486706525887, "ewc_loss": 0.007476815953850746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.47681588109117e-06, "grad_norm": 9.34388542175293, "learning_rate": 4.972445951674439e-07, "loss": 0.4746, "mean_token_accuracy": 0.8452050685882568, "num_tokens": 44753506.0, "step": 1174 }, { "epoch": 0.14947207734384937, "ewc_loss": 0.007474921643733978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.474921858374728e-06, "grad_norm": 9.316603660583496, "learning_rate": 4.976685036032216e-07, "loss": 0.5661, "mean_token_accuracy": 0.8198848962783813, "num_tokens": 44792170.0, "step": 1175 }, { "epoch": 0.1495992876224399, "ewc_loss": 0.0074925790540874004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.492579243262298e-06, "grad_norm": 9.30681037902832, "learning_rate": 4.980924120389996e-07, "loss": 0.4421, "mean_token_accuracy": 0.8589831590652466, "num_tokens": 44830733.0, "step": 1176 }, { "epoch": 0.1497264979010304, "ewc_loss": 0.007493471726775169, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.4934719123120885e-06, "grad_norm": 9.263440132141113, "learning_rate": 4.985163204747774e-07, "loss": 0.4565, "mean_token_accuracy": 0.8533914685249329, "num_tokens": 44869743.0, "step": 1177 }, { "epoch": 0.1498537081796209, "ewc_loss": 0.007503807544708252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.503807410103036e-06, "grad_norm": 9.296133995056152, "learning_rate": 4.989402289105554e-07, "loss": 0.503, "mean_token_accuracy": 0.8439584970474243, "num_tokens": 44913358.0, "step": 1178 }, { "epoch": 0.14998091845821143, "ewc_loss": 0.00751702394336462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.517023732361849e-06, "grad_norm": 9.407855987548828, "learning_rate": 4.993641373463331e-07, "loss": 0.5627, "mean_token_accuracy": 0.8206925988197327, "num_tokens": 44949963.0, "step": 1179 }, { "epoch": 0.15010812873680193, "ewc_loss": 0.007536727003753185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.536727025581058e-06, "grad_norm": 9.334980964660645, "learning_rate": 4.997880457821111e-07, "loss": 0.5177, "mean_token_accuracy": 0.8330935835838318, "num_tokens": 44988569.0, "step": 1180 }, { "epoch": 0.15023533901539243, "ewc_loss": 0.0075037539936602116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.503754204662982e-06, "grad_norm": 9.259678840637207, "learning_rate": 5.002119542178889e-07, "loss": 0.494, "mean_token_accuracy": 0.8400519490242004, "num_tokens": 45032634.0, "step": 1181 }, { "epoch": 0.15036254929398296, "ewc_loss": 0.007524162996560335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.524162811023416e-06, "grad_norm": 9.329591751098633, "learning_rate": 5.006358626536667e-07, "loss": 0.4624, "mean_token_accuracy": 0.8521193861961365, "num_tokens": 45069685.0, "step": 1182 }, { "epoch": 0.15048975957257346, "ewc_loss": 0.007546962238848209, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.54696202420746e-06, "grad_norm": 9.32010555267334, "learning_rate": 5.010597710894446e-07, "loss": 0.4853, "mean_token_accuracy": 0.8430249094963074, "num_tokens": 45107481.0, "step": 1183 }, { "epoch": 0.15061696985116396, "ewc_loss": 0.0075265089981257915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.526508852606639e-06, "grad_norm": 9.352904319763184, "learning_rate": 5.014836795252225e-07, "loss": 0.4695, "mean_token_accuracy": 0.8486431837081909, "num_tokens": 45145747.0, "step": 1184 }, { "epoch": 0.1507441801297545, "ewc_loss": 0.007552778348326683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.552778242825298e-06, "grad_norm": 9.3345947265625, "learning_rate": 5.019075879610004e-07, "loss": 0.4768, "mean_token_accuracy": 0.8491553664207458, "num_tokens": 45183140.0, "step": 1185 }, { "epoch": 0.150871390408345, "ewc_loss": 0.007537583354860544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.537583314842777e-06, "grad_norm": 9.321589469909668, "learning_rate": 5.023314963967783e-07, "loss": 0.5104, "mean_token_accuracy": 0.8362940549850464, "num_tokens": 45224911.0, "step": 1186 }, { "epoch": 0.1509986006869355, "ewc_loss": 0.007558898068964481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.558898232673528e-06, "grad_norm": 9.372530937194824, "learning_rate": 5.027554048325562e-07, "loss": 0.5664, "mean_token_accuracy": 0.8217922449111938, "num_tokens": 45257388.0, "step": 1187 }, { "epoch": 0.15112581096552602, "ewc_loss": 0.007561345584690571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.561345682915999e-06, "grad_norm": 9.316360473632812, "learning_rate": 5.03179313268334e-07, "loss": 0.5517, "mean_token_accuracy": 0.8242517113685608, "num_tokens": 45297876.0, "step": 1188 }, { "epoch": 0.15125302124411652, "ewc_loss": 0.007562527433037758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.5625275712809525e-06, "grad_norm": 9.383915901184082, "learning_rate": 5.036032217041119e-07, "loss": 0.5261, "mean_token_accuracy": 0.8298499584197998, "num_tokens": 45334040.0, "step": 1189 }, { "epoch": 0.15138023152270705, "ewc_loss": 0.007590663153678179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.590663244627649e-06, "grad_norm": 9.37678050994873, "learning_rate": 5.040271301398897e-07, "loss": 0.5106, "mean_token_accuracy": 0.8338587284088135, "num_tokens": 45375985.0, "step": 1190 }, { "epoch": 0.15150744180129755, "ewc_loss": 0.007561863400042057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.561863185401307e-06, "grad_norm": 9.322171211242676, "learning_rate": 5.044510385756676e-07, "loss": 0.4795, "mean_token_accuracy": 0.8429697751998901, "num_tokens": 45419951.0, "step": 1191 }, { "epoch": 0.15163465207988805, "ewc_loss": 0.007574732415378094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.574732535431394e-06, "grad_norm": 9.386444091796875, "learning_rate": 5.048749470114455e-07, "loss": 0.4926, "mean_token_accuracy": 0.8426300287246704, "num_tokens": 45456988.0, "step": 1192 }, { "epoch": 0.15176186235847858, "ewc_loss": 0.007600720971822739, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.600720891787205e-06, "grad_norm": 9.418204307556152, "learning_rate": 5.052988554472234e-07, "loss": 0.5106, "mean_token_accuracy": 0.8341052532196045, "num_tokens": 45488226.0, "step": 1193 }, { "epoch": 0.15188907263706908, "ewc_loss": 0.007596071343868971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.596071554871742e-06, "grad_norm": 9.382274627685547, "learning_rate": 5.057227638830013e-07, "loss": 0.5082, "mean_token_accuracy": 0.8377578258514404, "num_tokens": 45524661.0, "step": 1194 }, { "epoch": 0.15201628291565958, "ewc_loss": 0.007590934168547392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.590934274048777e-06, "grad_norm": 9.366539001464844, "learning_rate": 5.061466723187792e-07, "loss": 0.5105, "mean_token_accuracy": 0.8372360467910767, "num_tokens": 45563491.0, "step": 1195 }, { "epoch": 0.1521434931942501, "ewc_loss": 0.007610654924064875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.61065484766732e-06, "grad_norm": 9.378430366516113, "learning_rate": 5.065705807545569e-07, "loss": 0.4804, "mean_token_accuracy": 0.847387969493866, "num_tokens": 45596771.0, "step": 1196 }, { "epoch": 0.1522707034728406, "ewc_loss": 0.007626985665410757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.626985734532354e-06, "grad_norm": 9.319953918457031, "learning_rate": 5.069944891903349e-07, "loss": 0.4982, "mean_token_accuracy": 0.839550256729126, "num_tokens": 45640049.0, "step": 1197 }, { "epoch": 0.1523979137514311, "ewc_loss": 0.007617186289280653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.617186383868102e-06, "grad_norm": 9.391615867614746, "learning_rate": 5.074183976261127e-07, "loss": 0.4991, "mean_token_accuracy": 0.8381574153900146, "num_tokens": 45677375.0, "step": 1198 }, { "epoch": 0.15252512403002164, "ewc_loss": 0.00764778908342123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.647789061593357e-06, "grad_norm": 9.324050903320312, "learning_rate": 5.078423060618906e-07, "loss": 0.5025, "mean_token_accuracy": 0.8399147987365723, "num_tokens": 45719253.0, "step": 1199 }, { "epoch": 0.15265233430861214, "ewc_loss": 0.007620532996952534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.6205328696232755e-06, "grad_norm": 9.378524780273438, "learning_rate": 5.082662144976685e-07, "loss": 0.4991, "mean_token_accuracy": 0.8367461562156677, "num_tokens": 45756065.0, "step": 1200 }, { "epoch": 0.15277954458720264, "ewc_loss": 0.007682953029870987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.682952855248004e-06, "grad_norm": 9.425301551818848, "learning_rate": 5.086901229334464e-07, "loss": 0.4875, "mean_token_accuracy": 0.845773458480835, "num_tokens": 45791707.0, "step": 1201 }, { "epoch": 0.15290675486579317, "ewc_loss": 0.00766637921333313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.666379133297596e-06, "grad_norm": 9.353311538696289, "learning_rate": 5.091140313692243e-07, "loss": 0.5029, "mean_token_accuracy": 0.838281512260437, "num_tokens": 45828464.0, "step": 1202 }, { "epoch": 0.15303396514438367, "ewc_loss": 0.0076749613508582115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.674961125303525e-06, "grad_norm": 9.399191856384277, "learning_rate": 5.095379398050022e-07, "loss": 0.486, "mean_token_accuracy": 0.8430352210998535, "num_tokens": 45867046.0, "step": 1203 }, { "epoch": 0.15316117542297417, "ewc_loss": 0.007699149195104837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.699149136897177e-06, "grad_norm": 9.477133750915527, "learning_rate": 5.099618482407799e-07, "loss": 0.517, "mean_token_accuracy": 0.8358893394470215, "num_tokens": 45900530.0, "step": 1204 }, { "epoch": 0.1532883857015647, "ewc_loss": 0.007698931731283665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.698931767663453e-06, "grad_norm": 9.406614303588867, "learning_rate": 5.103857566765578e-07, "loss": 0.4951, "mean_token_accuracy": 0.8425600528717041, "num_tokens": 45940002.0, "step": 1205 }, { "epoch": 0.1534155959801552, "ewc_loss": 0.007668777368962765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.668777470826171e-06, "grad_norm": 9.345622062683105, "learning_rate": 5.108096651123357e-07, "loss": 0.4565, "mean_token_accuracy": 0.852232038974762, "num_tokens": 45975299.0, "step": 1206 }, { "epoch": 0.1535428062587457, "ewc_loss": 0.0077134291641414165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.713429113209713e-06, "grad_norm": 9.402788162231445, "learning_rate": 5.112335735481135e-07, "loss": 0.4959, "mean_token_accuracy": 0.8393778800964355, "num_tokens": 46012693.0, "step": 1207 }, { "epoch": 0.15367001653733622, "ewc_loss": 0.0077210343442857265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.721034307905938e-06, "grad_norm": 9.387974739074707, "learning_rate": 5.116574819838915e-07, "loss": 0.4003, "mean_token_accuracy": 0.8700167536735535, "num_tokens": 46051558.0, "step": 1208 }, { "epoch": 0.15379722681592672, "ewc_loss": 0.007716846186667681, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.716846084804274e-06, "grad_norm": 9.427858352661133, "learning_rate": 5.120813904196693e-07, "loss": 0.4847, "mean_token_accuracy": 0.8447691202163696, "num_tokens": 46087173.0, "step": 1209 }, { "epoch": 0.15392443709451722, "ewc_loss": 0.00774947227910161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.749472388240974e-06, "grad_norm": 9.4890775680542, "learning_rate": 5.125052988554473e-07, "loss": 0.54, "mean_token_accuracy": 0.8312005996704102, "num_tokens": 46120482.0, "step": 1210 }, { "epoch": 0.15405164737310775, "ewc_loss": 0.007753163110464811, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.753163117740769e-06, "grad_norm": 9.423501014709473, "learning_rate": 5.12929207291225e-07, "loss": 0.4881, "mean_token_accuracy": 0.8415518403053284, "num_tokens": 46154478.0, "step": 1211 }, { "epoch": 0.15417885765169825, "ewc_loss": 0.0077314218506217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.731421646894887e-06, "grad_norm": 9.417967796325684, "learning_rate": 5.133531157270029e-07, "loss": 0.4643, "mean_token_accuracy": 0.8504762649536133, "num_tokens": 46195012.0, "step": 1212 }, { "epoch": 0.15430606793028875, "ewc_loss": 0.00774455675855279, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.744556569377892e-06, "grad_norm": 9.371761322021484, "learning_rate": 5.137770241627808e-07, "loss": 0.4292, "mean_token_accuracy": 0.8606215119361877, "num_tokens": 46230641.0, "step": 1213 }, { "epoch": 0.15443327820887928, "ewc_loss": 0.0077591221779584885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.759122127026785e-06, "grad_norm": 9.448098182678223, "learning_rate": 5.142009325985587e-07, "loss": 0.4695, "mean_token_accuracy": 0.8518247604370117, "num_tokens": 46268993.0, "step": 1214 }, { "epoch": 0.15456048848746978, "ewc_loss": 0.0077908397652208805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.790839845256414e-06, "grad_norm": 9.433245658874512, "learning_rate": 5.146248410343365e-07, "loss": 0.5029, "mean_token_accuracy": 0.839571475982666, "num_tokens": 46309310.0, "step": 1215 }, { "epoch": 0.1546876987660603, "ewc_loss": 0.007769170217216015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.769170224491972e-06, "grad_norm": 9.468074798583984, "learning_rate": 5.150487494701145e-07, "loss": 0.4441, "mean_token_accuracy": 0.8568198680877686, "num_tokens": 46352415.0, "step": 1216 }, { "epoch": 0.1548149090446508, "ewc_loss": 0.007788300514221191, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.788300536049064e-06, "grad_norm": 9.478998184204102, "learning_rate": 5.154726579058923e-07, "loss": 0.4996, "mean_token_accuracy": 0.8377582430839539, "num_tokens": 46387838.0, "step": 1217 }, { "epoch": 0.1549421193232413, "ewc_loss": 0.007784602697938681, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.784602530591656e-06, "grad_norm": 9.428297996520996, "learning_rate": 5.158965663416703e-07, "loss": 0.4949, "mean_token_accuracy": 0.84209144115448, "num_tokens": 46432669.0, "step": 1218 }, { "epoch": 0.15506932960183184, "ewc_loss": 0.0077830590307712555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.783059118082747e-06, "grad_norm": 9.437180519104004, "learning_rate": 5.16320474777448e-07, "loss": 0.45, "mean_token_accuracy": 0.8544599413871765, "num_tokens": 46474383.0, "step": 1219 }, { "epoch": 0.15519653988042234, "ewc_loss": 0.007796052377671003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.796052159392275e-06, "grad_norm": 9.498551368713379, "learning_rate": 5.167443832132259e-07, "loss": 0.5006, "mean_token_accuracy": 0.8393751382827759, "num_tokens": 46514263.0, "step": 1220 }, { "epoch": 0.15532375015901284, "ewc_loss": 0.007809750735759735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.809750968590379e-06, "grad_norm": 9.494149208068848, "learning_rate": 5.171682916490038e-07, "loss": 0.4676, "mean_token_accuracy": 0.8494616746902466, "num_tokens": 46549835.0, "step": 1221 }, { "epoch": 0.15545096043760337, "ewc_loss": 0.007804133929312229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.80413392931223e-06, "grad_norm": 9.5128173828125, "learning_rate": 5.175922000847816e-07, "loss": 0.4555, "mean_token_accuracy": 0.8534573316574097, "num_tokens": 46584811.0, "step": 1222 }, { "epoch": 0.15557817071619387, "ewc_loss": 0.007799262646585703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.799262675689533e-06, "grad_norm": 9.450207710266113, "learning_rate": 5.180161085205595e-07, "loss": 0.4768, "mean_token_accuracy": 0.8472306728363037, "num_tokens": 46621441.0, "step": 1223 }, { "epoch": 0.15570538099478437, "ewc_loss": 0.007802758831530809, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.802758773323148e-06, "grad_norm": 9.479708671569824, "learning_rate": 5.184400169563374e-07, "loss": 0.53, "mean_token_accuracy": 0.8311318159103394, "num_tokens": 46661572.0, "step": 1224 }, { "epoch": 0.1558325912733749, "ewc_loss": 0.007817812263965607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.817811820132192e-06, "grad_norm": 9.507649421691895, "learning_rate": 5.188639253921153e-07, "loss": 0.5052, "mean_token_accuracy": 0.8372082114219666, "num_tokens": 46702157.0, "step": 1225 }, { "epoch": 0.1559598015519654, "ewc_loss": 0.007831083610653877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.831084076315165e-06, "grad_norm": 9.524208068847656, "learning_rate": 5.192878338278932e-07, "loss": 0.4817, "mean_token_accuracy": 0.8450473546981812, "num_tokens": 46739119.0, "step": 1226 }, { "epoch": 0.1560870118305559, "ewc_loss": 0.00782082974910736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.820829523552675e-06, "grad_norm": 9.498327255249023, "learning_rate": 5.19711742263671e-07, "loss": 0.5398, "mean_token_accuracy": 0.8274298906326294, "num_tokens": 46783990.0, "step": 1227 }, { "epoch": 0.15621422210914643, "ewc_loss": 0.007838376797735691, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.83837640483398e-06, "grad_norm": 9.538247108459473, "learning_rate": 5.201356506994488e-07, "loss": 0.4773, "mean_token_accuracy": 0.8409522771835327, "num_tokens": 46816131.0, "step": 1228 }, { "epoch": 0.15634143238773693, "ewc_loss": 0.00783712137490511, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.837121302145533e-06, "grad_norm": 9.485755920410156, "learning_rate": 5.205595591352268e-07, "loss": 0.5619, "mean_token_accuracy": 0.8219216465950012, "num_tokens": 46856453.0, "step": 1229 }, { "epoch": 0.15646864266632743, "ewc_loss": 0.00783286802470684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.832867595425341e-06, "grad_norm": 9.4901123046875, "learning_rate": 5.209834675710046e-07, "loss": 0.5003, "mean_token_accuracy": 0.8395565152168274, "num_tokens": 46897961.0, "step": 1230 }, { "epoch": 0.15659585294491796, "ewc_loss": 0.007855298928916454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.855299372749869e-06, "grad_norm": 9.490174293518066, "learning_rate": 5.214073760067825e-07, "loss": 0.5751, "mean_token_accuracy": 0.8189816474914551, "num_tokens": 46931961.0, "step": 1231 }, { "epoch": 0.15672306322350846, "ewc_loss": 0.00787176564335823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.871765774325468e-06, "grad_norm": 9.5540132522583, "learning_rate": 5.218312844425604e-07, "loss": 0.4954, "mean_token_accuracy": 0.8433078527450562, "num_tokens": 46967822.0, "step": 1232 }, { "epoch": 0.15685027350209896, "ewc_loss": 0.007880798541009426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.880798875703476e-06, "grad_norm": 9.557991981506348, "learning_rate": 5.222551928783383e-07, "loss": 0.4758, "mean_token_accuracy": 0.846790075302124, "num_tokens": 47004511.0, "step": 1233 }, { "epoch": 0.1569774837806895, "ewc_loss": 0.00788054708391428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.880546945671085e-06, "grad_norm": 9.516966819763184, "learning_rate": 5.226791013141161e-07, "loss": 0.5289, "mean_token_accuracy": 0.8332498073577881, "num_tokens": 47047244.0, "step": 1234 }, { "epoch": 0.15710469405928, "ewc_loss": 0.007874969393014908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.874969014665112e-06, "grad_norm": 9.517987251281738, "learning_rate": 5.23103009749894e-07, "loss": 0.4869, "mean_token_accuracy": 0.8434597253799438, "num_tokens": 47088206.0, "step": 1235 }, { "epoch": 0.1572319043378705, "ewc_loss": 0.00790139939635992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.901399840193335e-06, "grad_norm": 9.526910781860352, "learning_rate": 5.235269181856718e-07, "loss": 0.4953, "mean_token_accuracy": 0.8406969308853149, "num_tokens": 47132733.0, "step": 1236 }, { "epoch": 0.15735911461646102, "ewc_loss": 0.007893172092735767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.893172551121097e-06, "grad_norm": 9.556156158447266, "learning_rate": 5.239508266214498e-07, "loss": 0.5536, "mean_token_accuracy": 0.8203848004341125, "num_tokens": 47166840.0, "step": 1237 }, { "epoch": 0.15748632489505152, "ewc_loss": 0.007912197150290012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.912197361292783e-06, "grad_norm": 9.552367210388184, "learning_rate": 5.243747350572276e-07, "loss": 0.5125, "mean_token_accuracy": 0.8365429639816284, "num_tokens": 47203444.0, "step": 1238 }, { "epoch": 0.15761353517364202, "ewc_loss": 0.007926004938781261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.9260053098551e-06, "grad_norm": 9.556070327758789, "learning_rate": 5.247986434930056e-07, "loss": 0.5063, "mean_token_accuracy": 0.8416494131088257, "num_tokens": 47240380.0, "step": 1239 }, { "epoch": 0.15774074545223254, "ewc_loss": 0.007928445935249329, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.928445484139957e-06, "grad_norm": 9.590022087097168, "learning_rate": 5.252225519287834e-07, "loss": 0.5669, "mean_token_accuracy": 0.8200805187225342, "num_tokens": 47277886.0, "step": 1240 }, { "epoch": 0.15786795573082305, "ewc_loss": 0.00793504249304533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.935042049211916e-06, "grad_norm": 9.569896697998047, "learning_rate": 5.256464603645613e-07, "loss": 0.4952, "mean_token_accuracy": 0.845004677772522, "num_tokens": 47315261.0, "step": 1241 }, { "epoch": 0.15799516600941357, "ewc_loss": 0.00793201569467783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.932015250844415e-06, "grad_norm": 9.544453620910645, "learning_rate": 5.260703688003391e-07, "loss": 0.4943, "mean_token_accuracy": 0.8415802717208862, "num_tokens": 47357489.0, "step": 1242 }, { "epoch": 0.15812237628800407, "ewc_loss": 0.007934873923659325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.934873792692088e-06, "grad_norm": 9.55034351348877, "learning_rate": 5.26494277236117e-07, "loss": 0.5114, "mean_token_accuracy": 0.8368135690689087, "num_tokens": 47389851.0, "step": 1243 }, { "epoch": 0.15824958656659457, "ewc_loss": 0.007958649657666683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.958649803185835e-06, "grad_norm": 9.564451217651367, "learning_rate": 5.269181856718948e-07, "loss": 0.5372, "mean_token_accuracy": 0.8271585702896118, "num_tokens": 47427491.0, "step": 1244 }, { "epoch": 0.1583767968451851, "ewc_loss": 0.00798096600919962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.980966074683238e-06, "grad_norm": 9.581250190734863, "learning_rate": 5.273420941076727e-07, "loss": 0.4772, "mean_token_accuracy": 0.8473994135856628, "num_tokens": 47462790.0, "step": 1245 }, { "epoch": 0.1585040071237756, "ewc_loss": 0.007970013655722141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.970013939484488e-06, "grad_norm": 9.528823852539062, "learning_rate": 5.277660025434506e-07, "loss": 0.4396, "mean_token_accuracy": 0.8568422198295593, "num_tokens": 47502240.0, "step": 1246 }, { "epoch": 0.1586312174023661, "ewc_loss": 0.00797412171959877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.974122127052397e-06, "grad_norm": 9.564961433410645, "learning_rate": 5.281899109792285e-07, "loss": 0.4648, "mean_token_accuracy": 0.8498642444610596, "num_tokens": 47542493.0, "step": 1247 }, { "epoch": 0.15875842768095663, "ewc_loss": 0.007980507798492908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.980507689353544e-06, "grad_norm": 9.494006156921387, "learning_rate": 5.286138194150064e-07, "loss": 0.5017, "mean_token_accuracy": 0.8414666652679443, "num_tokens": 47583378.0, "step": 1248 }, { "epoch": 0.15888563795954713, "ewc_loss": 0.007985005155205727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 7.985005140653811e-06, "grad_norm": 9.586009979248047, "learning_rate": 5.290377278507841e-07, "loss": 0.5262, "mean_token_accuracy": 0.8363580703735352, "num_tokens": 47622671.0, "step": 1249 }, { "epoch": 0.15901284823813763, "ewc_loss": 0.008027289994060993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.02729027782334e-06, "grad_norm": 9.576651573181152, "learning_rate": 5.294616362865621e-07, "loss": 0.4856, "mean_token_accuracy": 0.8475148677825928, "num_tokens": 47663774.0, "step": 1250 }, { "epoch": 0.15914005851672816, "ewc_loss": 0.008004864677786827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.004864866961725e-06, "grad_norm": 9.58491039276123, "learning_rate": 5.298855447223399e-07, "loss": 0.5567, "mean_token_accuracy": 0.8320462703704834, "num_tokens": 47706143.0, "step": 1251 }, { "epoch": 0.15926726879531866, "ewc_loss": 0.008020524866878986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.020524546736851e-06, "grad_norm": 9.58584976196289, "learning_rate": 5.303094531581178e-07, "loss": 0.5183, "mean_token_accuracy": 0.8353488445281982, "num_tokens": 47741197.0, "step": 1252 }, { "epoch": 0.15939447907390916, "ewc_loss": 0.008026446215808392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.026446266740095e-06, "grad_norm": 9.612488746643066, "learning_rate": 5.307333615938957e-07, "loss": 0.4418, "mean_token_accuracy": 0.8569802641868591, "num_tokens": 47779306.0, "step": 1253 }, { "epoch": 0.1595216893524997, "ewc_loss": 0.008010351099073887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.01035093900282e-06, "grad_norm": 9.596636772155762, "learning_rate": 5.311572700296736e-07, "loss": 0.5273, "mean_token_accuracy": 0.832267701625824, "num_tokens": 47811056.0, "step": 1254 }, { "epoch": 0.1596488996310902, "ewc_loss": 0.008030358701944351, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.030358912947122e-06, "grad_norm": 9.58642292022705, "learning_rate": 5.315811784654515e-07, "loss": 0.4347, "mean_token_accuracy": 0.8550297617912292, "num_tokens": 47847075.0, "step": 1255 }, { "epoch": 0.1597761099096807, "ewc_loss": 0.008048555813729763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.048556082940195e-06, "grad_norm": 9.570103645324707, "learning_rate": 5.320050869012294e-07, "loss": 0.4921, "mean_token_accuracy": 0.8432572484016418, "num_tokens": 47892639.0, "step": 1256 }, { "epoch": 0.15990332018827122, "ewc_loss": 0.008037709631025791, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.03770944912685e-06, "grad_norm": 9.557283401489258, "learning_rate": 5.324289953370071e-07, "loss": 0.5498, "mean_token_accuracy": 0.8267412185668945, "num_tokens": 47933522.0, "step": 1257 }, { "epoch": 0.16003053046686172, "ewc_loss": 0.008062701672315598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.062701454036869e-06, "grad_norm": 9.578531265258789, "learning_rate": 5.328529037727851e-07, "loss": 0.4863, "mean_token_accuracy": 0.8426772952079773, "num_tokens": 47977307.0, "step": 1258 }, { "epoch": 0.16015774074545222, "ewc_loss": 0.00805620290338993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.056203114392702e-06, "grad_norm": 9.632925987243652, "learning_rate": 5.332768122085629e-07, "loss": 0.4836, "mean_token_accuracy": 0.8436102867126465, "num_tokens": 48015129.0, "step": 1259 }, { "epoch": 0.16028495102404275, "ewc_loss": 0.008091825060546398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.091825293377042e-06, "grad_norm": 9.664517402648926, "learning_rate": 5.337007206443408e-07, "loss": 0.5183, "mean_token_accuracy": 0.831836462020874, "num_tokens": 48051904.0, "step": 1260 }, { "epoch": 0.16041216130263325, "ewc_loss": 0.00807876791805029, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.078767677943688e-06, "grad_norm": 9.575836181640625, "learning_rate": 5.341246290801187e-07, "loss": 0.4799, "mean_token_accuracy": 0.8488724231719971, "num_tokens": 48094001.0, "step": 1261 }, { "epoch": 0.16053937158122375, "ewc_loss": 0.008069931529462337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.069931936915964e-06, "grad_norm": 9.5987548828125, "learning_rate": 5.345485375158966e-07, "loss": 0.4719, "mean_token_accuracy": 0.8455383777618408, "num_tokens": 48134124.0, "step": 1262 }, { "epoch": 0.16066658185981428, "ewc_loss": 0.008093651384115219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.093651558738202e-06, "grad_norm": 9.608662605285645, "learning_rate": 5.349724459516745e-07, "loss": 0.4621, "mean_token_accuracy": 0.8501733541488647, "num_tokens": 48170730.0, "step": 1263 }, { "epoch": 0.16079379213840478, "ewc_loss": 0.008106045424938202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.106045243039262e-06, "grad_norm": 9.676233291625977, "learning_rate": 5.353963543874522e-07, "loss": 0.5335, "mean_token_accuracy": 0.8312870264053345, "num_tokens": 48212640.0, "step": 1264 }, { "epoch": 0.1609210024169953, "ewc_loss": 0.008125693537294865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.125693966576364e-06, "grad_norm": 9.662564277648926, "learning_rate": 5.358202628232301e-07, "loss": 0.5119, "mean_token_accuracy": 0.8366222977638245, "num_tokens": 48243136.0, "step": 1265 }, { "epoch": 0.1610482126955858, "ewc_loss": 0.008117558434605598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.117558536469005e-06, "grad_norm": 9.642105102539062, "learning_rate": 5.36244171259008e-07, "loss": 0.4901, "mean_token_accuracy": 0.8432881832122803, "num_tokens": 48282905.0, "step": 1266 }, { "epoch": 0.1611754229741763, "ewc_loss": 0.008133471943438053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.133471965265926e-06, "grad_norm": 9.688911437988281, "learning_rate": 5.366680796947859e-07, "loss": 0.5285, "mean_token_accuracy": 0.8336021304130554, "num_tokens": 48323325.0, "step": 1267 }, { "epoch": 0.16130263325276684, "ewc_loss": 0.008143533952534199, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.14353370515164e-06, "grad_norm": 9.685995101928711, "learning_rate": 5.370919881305637e-07, "loss": 0.5058, "mean_token_accuracy": 0.8420091271400452, "num_tokens": 48358400.0, "step": 1268 }, { "epoch": 0.16142984353135734, "ewc_loss": 0.008135657757520676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.135657481034286e-06, "grad_norm": 9.675324440002441, "learning_rate": 5.375158965663417e-07, "loss": 0.5078, "mean_token_accuracy": 0.8396325707435608, "num_tokens": 48393018.0, "step": 1269 }, { "epoch": 0.16155705380994784, "ewc_loss": 0.008150462992489338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.150463145284448e-06, "grad_norm": 9.672798156738281, "learning_rate": 5.379398050021195e-07, "loss": 0.4944, "mean_token_accuracy": 0.8420442342758179, "num_tokens": 48432952.0, "step": 1270 }, { "epoch": 0.16168426408853837, "ewc_loss": 0.008135342970490456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.135342795867473e-06, "grad_norm": 9.67576789855957, "learning_rate": 5.383637134378975e-07, "loss": 0.4832, "mean_token_accuracy": 0.8420106172561646, "num_tokens": 48466913.0, "step": 1271 }, { "epoch": 0.16181147436712887, "ewc_loss": 0.00815723929554224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.157238880812656e-06, "grad_norm": 9.647733688354492, "learning_rate": 5.387876218736752e-07, "loss": 0.5633, "mean_token_accuracy": 0.8179314732551575, "num_tokens": 48512658.0, "step": 1272 }, { "epoch": 0.16193868464571937, "ewc_loss": 0.008152798749506474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.1527987276786e-06, "grad_norm": 9.655823707580566, "learning_rate": 5.392115303094531e-07, "loss": 0.4603, "mean_token_accuracy": 0.8501253128051758, "num_tokens": 48552283.0, "step": 1273 }, { "epoch": 0.1620658949243099, "ewc_loss": 0.008198418654501438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.19841898191953e-06, "grad_norm": 9.66170883178711, "learning_rate": 5.39635438745231e-07, "loss": 0.4403, "mean_token_accuracy": 0.8574925065040588, "num_tokens": 48590504.0, "step": 1274 }, { "epoch": 0.1621931052029004, "ewc_loss": 0.008185542188584805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.18554235593183e-06, "grad_norm": 9.702615737915039, "learning_rate": 5.400593471810089e-07, "loss": 0.5281, "mean_token_accuracy": 0.8315649628639221, "num_tokens": 48629993.0, "step": 1275 }, { "epoch": 0.1623203154814909, "ewc_loss": 0.00820708554238081, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.207085556932725e-06, "grad_norm": 9.710153579711914, "learning_rate": 5.404832556167867e-07, "loss": 0.532, "mean_token_accuracy": 0.8309192061424255, "num_tokens": 48665074.0, "step": 1276 }, { "epoch": 0.16244752576008142, "ewc_loss": 0.008213695138692856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.213694854930509e-06, "grad_norm": 9.691532135009766, "learning_rate": 5.409071640525647e-07, "loss": 0.4894, "mean_token_accuracy": 0.8470224142074585, "num_tokens": 48705524.0, "step": 1277 }, { "epoch": 0.16257473603867192, "ewc_loss": 0.008209377527236938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.209377483581193e-06, "grad_norm": 9.7319917678833, "learning_rate": 5.413310724883425e-07, "loss": 0.5363, "mean_token_accuracy": 0.8209081888198853, "num_tokens": 48744030.0, "step": 1278 }, { "epoch": 0.16270194631726242, "ewc_loss": 0.008226786740124226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.22678703116253e-06, "grad_norm": 9.628336906433105, "learning_rate": 5.417549809241205e-07, "loss": 0.5275, "mean_token_accuracy": 0.8349016904830933, "num_tokens": 48788401.0, "step": 1279 }, { "epoch": 0.16282915659585295, "ewc_loss": 0.00819981750100851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.199817784770858e-06, "grad_norm": 9.714442253112793, "learning_rate": 5.421788893598982e-07, "loss": 0.5311, "mean_token_accuracy": 0.8307559490203857, "num_tokens": 48823046.0, "step": 1280 }, { "epoch": 0.16295636687444345, "ewc_loss": 0.00825318694114685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.253186933870893e-06, "grad_norm": 9.699302673339844, "learning_rate": 5.42602797795676e-07, "loss": 0.5366, "mean_token_accuracy": 0.8285706043243408, "num_tokens": 48868293.0, "step": 1281 }, { "epoch": 0.16308357715303395, "ewc_loss": 0.008241565898060799, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.24156632006634e-06, "grad_norm": 9.685342788696289, "learning_rate": 5.43026706231454e-07, "loss": 0.5648, "mean_token_accuracy": 0.8202016949653625, "num_tokens": 48908775.0, "step": 1282 }, { "epoch": 0.16321078743162448, "ewc_loss": 0.008233562111854553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.233561857196037e-06, "grad_norm": 9.693596839904785, "learning_rate": 5.434506146672319e-07, "loss": 0.5423, "mean_token_accuracy": 0.8276475667953491, "num_tokens": 48957640.0, "step": 1283 }, { "epoch": 0.16333799771021498, "ewc_loss": 0.008249691687524319, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.24969174573198e-06, "grad_norm": 9.704276084899902, "learning_rate": 5.438745231030097e-07, "loss": 0.533, "mean_token_accuracy": 0.8273066282272339, "num_tokens": 48993601.0, "step": 1284 }, { "epoch": 0.16346520798880548, "ewc_loss": 0.008259872905910015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.259872629423626e-06, "grad_norm": 9.788525581359863, "learning_rate": 5.442984315387876e-07, "loss": 0.4404, "mean_token_accuracy": 0.8576697707176208, "num_tokens": 49028411.0, "step": 1285 }, { "epoch": 0.163592418267396, "ewc_loss": 0.00826321728527546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.263217750936747e-06, "grad_norm": 9.648269653320312, "learning_rate": 5.447223399745655e-07, "loss": 0.5033, "mean_token_accuracy": 0.8421281576156616, "num_tokens": 49067449.0, "step": 1286 }, { "epoch": 0.1637196285459865, "ewc_loss": 0.00824824534356594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.24824564915616e-06, "grad_norm": 9.734519958496094, "learning_rate": 5.451462484103433e-07, "loss": 0.4787, "mean_token_accuracy": 0.8468450903892517, "num_tokens": 49102650.0, "step": 1287 }, { "epoch": 0.163846838824577, "ewc_loss": 0.008297587744891644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.297587555716746e-06, "grad_norm": 9.782185554504395, "learning_rate": 5.455701568461212e-07, "loss": 0.5032, "mean_token_accuracy": 0.8375102281570435, "num_tokens": 49137110.0, "step": 1288 }, { "epoch": 0.16397404910316754, "ewc_loss": 0.008263454772531986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.26345512905391e-06, "grad_norm": 9.686664581298828, "learning_rate": 5.45994065281899e-07, "loss": 0.5742, "mean_token_accuracy": 0.8158959150314331, "num_tokens": 49177765.0, "step": 1289 }, { "epoch": 0.16410125938175804, "ewc_loss": 0.008294476196169853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.29447617434198e-06, "grad_norm": 9.81298828125, "learning_rate": 5.46417973717677e-07, "loss": 0.5147, "mean_token_accuracy": 0.8329997062683105, "num_tokens": 49207860.0, "step": 1290 }, { "epoch": 0.16422846966034857, "ewc_loss": 0.008313694037497044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.313693797390442e-06, "grad_norm": 9.752401351928711, "learning_rate": 5.468418821534548e-07, "loss": 0.4467, "mean_token_accuracy": 0.8552408218383789, "num_tokens": 49239994.0, "step": 1291 }, { "epoch": 0.16435567993893907, "ewc_loss": 0.008304157294332981, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.304157745442353e-06, "grad_norm": 9.738819122314453, "learning_rate": 5.472657905892327e-07, "loss": 0.462, "mean_token_accuracy": 0.8523424863815308, "num_tokens": 49273464.0, "step": 1292 }, { "epoch": 0.16448289021752957, "ewc_loss": 0.008331503719091415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.33150352264056e-06, "grad_norm": 9.764198303222656, "learning_rate": 5.476896990250106e-07, "loss": 0.576, "mean_token_accuracy": 0.8185168504714966, "num_tokens": 49313144.0, "step": 1293 }, { "epoch": 0.1646101004961201, "ewc_loss": 0.008345083333551884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.345083188032731e-06, "grad_norm": 9.760520935058594, "learning_rate": 5.481136074607885e-07, "loss": 0.4623, "mean_token_accuracy": 0.8477658033370972, "num_tokens": 49351392.0, "step": 1294 }, { "epoch": 0.1647373107747106, "ewc_loss": 0.008344807662069798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.344807611138094e-06, "grad_norm": 9.749974250793457, "learning_rate": 5.485375158965663e-07, "loss": 0.4639, "mean_token_accuracy": 0.8510518670082092, "num_tokens": 49396726.0, "step": 1295 }, { "epoch": 0.1648645210533011, "ewc_loss": 0.008348370902240276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.34837101137964e-06, "grad_norm": 9.730863571166992, "learning_rate": 5.489614243323442e-07, "loss": 0.4977, "mean_token_accuracy": 0.8384144306182861, "num_tokens": 49435856.0, "step": 1296 }, { "epoch": 0.16499173133189163, "ewc_loss": 0.008369363844394684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.369363968085963e-06, "grad_norm": 9.806783676147461, "learning_rate": 5.49385332768122e-07, "loss": 0.4384, "mean_token_accuracy": 0.8558106422424316, "num_tokens": 49473695.0, "step": 1297 }, { "epoch": 0.16511894161048213, "ewc_loss": 0.008379610255360603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.379610335396137e-06, "grad_norm": 9.787147521972656, "learning_rate": 5.498092412039e-07, "loss": 0.565, "mean_token_accuracy": 0.8250571489334106, "num_tokens": 49509165.0, "step": 1298 }, { "epoch": 0.16524615188907263, "ewc_loss": 0.008350786752998829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.35078662930755e-06, "grad_norm": 9.672829627990723, "learning_rate": 5.502331496396778e-07, "loss": 0.4694, "mean_token_accuracy": 0.8481627702713013, "num_tokens": 49553790.0, "step": 1299 }, { "epoch": 0.16537336216766316, "ewc_loss": 0.008368081413209438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.368081580556463e-06, "grad_norm": 9.76758098602295, "learning_rate": 5.506570580754557e-07, "loss": 0.4927, "mean_token_accuracy": 0.8422073125839233, "num_tokens": 49593124.0, "step": 1300 }, { "epoch": 0.16550057244625366, "ewc_loss": 0.00838957354426384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.38957384985406e-06, "grad_norm": 9.818998336791992, "learning_rate": 5.510809665112336e-07, "loss": 0.4808, "mean_token_accuracy": 0.8484593629837036, "num_tokens": 49632016.0, "step": 1301 }, { "epoch": 0.16562778272484416, "ewc_loss": 0.008396828547120094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.396828889090102e-06, "grad_norm": 9.80968189239502, "learning_rate": 5.515048749470113e-07, "loss": 0.4947, "mean_token_accuracy": 0.8420684337615967, "num_tokens": 49670689.0, "step": 1302 }, { "epoch": 0.1657549930034347, "ewc_loss": 0.008397338911890984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.397339115617797e-06, "grad_norm": 9.779494285583496, "learning_rate": 5.519287833827893e-07, "loss": 0.4812, "mean_token_accuracy": 0.8442970514297485, "num_tokens": 49709230.0, "step": 1303 }, { "epoch": 0.1658822032820252, "ewc_loss": 0.008408925496041775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.408925168623682e-06, "grad_norm": 9.909521102905273, "learning_rate": 5.523526918185671e-07, "loss": 0.5378, "mean_token_accuracy": 0.8277225494384766, "num_tokens": 49741648.0, "step": 1304 }, { "epoch": 0.1660094135606157, "ewc_loss": 0.008435851894319057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.435851668764371e-06, "grad_norm": 9.800047874450684, "learning_rate": 5.52776600254345e-07, "loss": 0.4558, "mean_token_accuracy": 0.8490667343139648, "num_tokens": 49773906.0, "step": 1305 }, { "epoch": 0.16613662383920622, "ewc_loss": 0.00839501153677702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.39501171867596e-06, "grad_norm": 9.746415138244629, "learning_rate": 5.532005086901229e-07, "loss": 0.5141, "mean_token_accuracy": 0.8402760028839111, "num_tokens": 49812645.0, "step": 1306 }, { "epoch": 0.16626383411779672, "ewc_loss": 0.008429784327745438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.429784429608844e-06, "grad_norm": 9.797709465026855, "learning_rate": 5.536244171259008e-07, "loss": 0.4833, "mean_token_accuracy": 0.84595787525177, "num_tokens": 49854500.0, "step": 1307 }, { "epoch": 0.16639104439638722, "ewc_loss": 0.008446242660284042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.446242645732127e-06, "grad_norm": 9.884993553161621, "learning_rate": 5.540483255616786e-07, "loss": 0.5769, "mean_token_accuracy": 0.8191024661064148, "num_tokens": 49889408.0, "step": 1308 }, { "epoch": 0.16651825467497774, "ewc_loss": 0.008447312749922276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.447313120996114e-06, "grad_norm": 9.784640312194824, "learning_rate": 5.544722339974566e-07, "loss": 0.5006, "mean_token_accuracy": 0.8406210541725159, "num_tokens": 49923581.0, "step": 1309 }, { "epoch": 0.16664546495356825, "ewc_loss": 0.008438115008175373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.438115401077084e-06, "grad_norm": 9.814475059509277, "learning_rate": 5.548961424332343e-07, "loss": 0.4897, "mean_token_accuracy": 0.8402355313301086, "num_tokens": 49959433.0, "step": 1310 }, { "epoch": 0.16677267523215875, "ewc_loss": 0.008477088995277882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.477089068037458e-06, "grad_norm": 9.810522079467773, "learning_rate": 5.553200508690123e-07, "loss": 0.457, "mean_token_accuracy": 0.8538364171981812, "num_tokens": 50000307.0, "step": 1311 }, { "epoch": 0.16689988551074927, "ewc_loss": 0.008457967080175877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.457966941932682e-06, "grad_norm": 9.817611694335938, "learning_rate": 5.557439593047901e-07, "loss": 0.5504, "mean_token_accuracy": 0.8234829306602478, "num_tokens": 50035587.0, "step": 1312 }, { "epoch": 0.16702709578933977, "ewc_loss": 0.008482422679662704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.482422344968654e-06, "grad_norm": 9.816761016845703, "learning_rate": 5.56167867740568e-07, "loss": 0.4883, "mean_token_accuracy": 0.8436017632484436, "num_tokens": 50079425.0, "step": 1313 }, { "epoch": 0.16715430606793028, "ewc_loss": 0.008483503945171833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.483503734169062e-06, "grad_norm": 9.831145286560059, "learning_rate": 5.565917761763459e-07, "loss": 0.4707, "mean_token_accuracy": 0.8495950102806091, "num_tokens": 50110674.0, "step": 1314 }, { "epoch": 0.1672815163465208, "ewc_loss": 0.008484585210680962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.48458512336947e-06, "grad_norm": 9.818624496459961, "learning_rate": 5.570156846121238e-07, "loss": 0.5101, "mean_token_accuracy": 0.8350931406021118, "num_tokens": 50149033.0, "step": 1315 }, { "epoch": 0.1674087266251113, "ewc_loss": 0.00848522037267685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.485219950671308e-06, "grad_norm": 9.82253646850586, "learning_rate": 5.574395930479016e-07, "loss": 0.5297, "mean_token_accuracy": 0.8314992189407349, "num_tokens": 50185728.0, "step": 1316 }, { "epoch": 0.16753593690370183, "ewc_loss": 0.008515307679772377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.51530785439536e-06, "grad_norm": 9.914838790893555, "learning_rate": 5.578635014836796e-07, "loss": 0.4796, "mean_token_accuracy": 0.8468550443649292, "num_tokens": 50219671.0, "step": 1317 }, { "epoch": 0.16766314718229233, "ewc_loss": 0.008508564904332161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.508564860676415e-06, "grad_norm": 9.84076976776123, "learning_rate": 5.582874099194573e-07, "loss": 0.5406, "mean_token_accuracy": 0.8321616649627686, "num_tokens": 50251989.0, "step": 1318 }, { "epoch": 0.16779035746088283, "ewc_loss": 0.008496656082570553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.4966559370514e-06, "grad_norm": 9.828411102294922, "learning_rate": 5.587113183552353e-07, "loss": 0.5249, "mean_token_accuracy": 0.8333433270454407, "num_tokens": 50291079.0, "step": 1319 }, { "epoch": 0.16791756773947336, "ewc_loss": 0.008522906340658665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.522906682628673e-06, "grad_norm": 9.812154769897461, "learning_rate": 5.591352267910131e-07, "loss": 0.4802, "mean_token_accuracy": 0.8465261459350586, "num_tokens": 50332262.0, "step": 1320 }, { "epoch": 0.16804477801806386, "ewc_loss": 0.008526148274540901, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.526148121745791e-06, "grad_norm": 9.87891960144043, "learning_rate": 5.59559135226791e-07, "loss": 0.5104, "mean_token_accuracy": 0.8394380807876587, "num_tokens": 50371278.0, "step": 1321 }, { "epoch": 0.16817198829665436, "ewc_loss": 0.008538839407265186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.53883921081433e-06, "grad_norm": 9.846327781677246, "learning_rate": 5.599830436625689e-07, "loss": 0.4886, "mean_token_accuracy": 0.8433374762535095, "num_tokens": 50409463.0, "step": 1322 }, { "epoch": 0.1682991985752449, "ewc_loss": 0.008540221489965916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.540221642761026e-06, "grad_norm": 9.844474792480469, "learning_rate": 5.604069520983468e-07, "loss": 0.482, "mean_token_accuracy": 0.8454651832580566, "num_tokens": 50446745.0, "step": 1323 }, { "epoch": 0.1684264088538354, "ewc_loss": 0.008546823635697365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.546823664801195e-06, "grad_norm": 9.856328010559082, "learning_rate": 5.608308605341246e-07, "loss": 0.4736, "mean_token_accuracy": 0.8464843034744263, "num_tokens": 50479725.0, "step": 1324 }, { "epoch": 0.1685536191324259, "ewc_loss": 0.008558511734008789, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.55851158121368e-06, "grad_norm": 9.898223876953125, "learning_rate": 5.612547689699024e-07, "loss": 0.537, "mean_token_accuracy": 0.8296910524368286, "num_tokens": 50514734.0, "step": 1325 }, { "epoch": 0.16868082941101642, "ewc_loss": 0.00857541710138321, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.575417268730234e-06, "grad_norm": 9.825493812561035, "learning_rate": 5.616786774056803e-07, "loss": 0.515, "mean_token_accuracy": 0.837580680847168, "num_tokens": 50555165.0, "step": 1326 }, { "epoch": 0.16880803968960692, "ewc_loss": 0.008570054545998573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.570054887968581e-06, "grad_norm": 9.873229026794434, "learning_rate": 5.621025858414582e-07, "loss": 0.5161, "mean_token_accuracy": 0.8349378108978271, "num_tokens": 50598251.0, "step": 1327 }, { "epoch": 0.16893524996819742, "ewc_loss": 0.00858598668128252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.585986506659538e-06, "grad_norm": 9.829952239990234, "learning_rate": 5.625264942772361e-07, "loss": 0.5084, "mean_token_accuracy": 0.8393842577934265, "num_tokens": 50638423.0, "step": 1328 }, { "epoch": 0.16906246024678795, "ewc_loss": 0.008585358038544655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.585358045820612e-06, "grad_norm": 9.857829093933105, "learning_rate": 5.629504027130139e-07, "loss": 0.5669, "mean_token_accuracy": 0.8283222913742065, "num_tokens": 50678226.0, "step": 1329 }, { "epoch": 0.16918967052537845, "ewc_loss": 0.008599293418228626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.599293323641177e-06, "grad_norm": 9.87226676940918, "learning_rate": 5.633743111487919e-07, "loss": 0.5668, "mean_token_accuracy": 0.8172990679740906, "num_tokens": 50717542.0, "step": 1330 }, { "epoch": 0.16931688080396895, "ewc_loss": 0.008609614335000515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.609614269516896e-06, "grad_norm": 9.921074867248535, "learning_rate": 5.637982195845697e-07, "loss": 0.4465, "mean_token_accuracy": 0.8564890623092651, "num_tokens": 50752914.0, "step": 1331 }, { "epoch": 0.16944409108255948, "ewc_loss": 0.008618149906396866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.618149877293035e-06, "grad_norm": 9.894545555114746, "learning_rate": 5.642221280203476e-07, "loss": 0.446, "mean_token_accuracy": 0.8574275970458984, "num_tokens": 50789276.0, "step": 1332 }, { "epoch": 0.16957130136114998, "ewc_loss": 0.00861609447747469, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.616094419267029e-06, "grad_norm": 9.947332382202148, "learning_rate": 5.646460364561254e-07, "loss": 0.4667, "mean_token_accuracy": 0.8477533459663391, "num_tokens": 50820759.0, "step": 1333 }, { "epoch": 0.16969851163974048, "ewc_loss": 0.00862041860818863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.620419066573959e-06, "grad_norm": 9.868196487426758, "learning_rate": 5.650699448919033e-07, "loss": 0.5571, "mean_token_accuracy": 0.8240818977355957, "num_tokens": 50863241.0, "step": 1334 }, { "epoch": 0.169825721918331, "ewc_loss": 0.00861207116395235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.612070814706385e-06, "grad_norm": 9.884315490722656, "learning_rate": 5.654938533276812e-07, "loss": 0.4965, "mean_token_accuracy": 0.8400493860244751, "num_tokens": 50904296.0, "step": 1335 }, { "epoch": 0.1699529321969215, "ewc_loss": 0.008645680733025074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.645680281915702e-06, "grad_norm": 9.903801918029785, "learning_rate": 5.659177617634591e-07, "loss": 0.4823, "mean_token_accuracy": 0.8457850217819214, "num_tokens": 50942180.0, "step": 1336 }, { "epoch": 0.170080142475512, "ewc_loss": 0.008631618693470955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.63161858433159e-06, "grad_norm": 9.88376235961914, "learning_rate": 5.663416701992369e-07, "loss": 0.5091, "mean_token_accuracy": 0.838674783706665, "num_tokens": 50980287.0, "step": 1337 }, { "epoch": 0.17020735275410254, "ewc_loss": 0.0086372597143054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.637259270471986e-06, "grad_norm": 9.974531173706055, "learning_rate": 5.667655786350149e-07, "loss": 0.5722, "mean_token_accuracy": 0.812346339225769, "num_tokens": 51014218.0, "step": 1338 }, { "epoch": 0.17033456303269304, "ewc_loss": 0.008663139306008816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.663138942210935e-06, "grad_norm": 9.890029907226562, "learning_rate": 5.671894870707927e-07, "loss": 0.4883, "mean_token_accuracy": 0.8459409475326538, "num_tokens": 51055915.0, "step": 1339 }, { "epoch": 0.17046177331128357, "ewc_loss": 0.008628983981907368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.628983778180555e-06, "grad_norm": 9.893234252929688, "learning_rate": 5.676133955065705e-07, "loss": 0.4316, "mean_token_accuracy": 0.8576407432556152, "num_tokens": 51090684.0, "step": 1340 }, { "epoch": 0.17058898358987407, "ewc_loss": 0.00866332184523344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.663321750645991e-06, "grad_norm": 9.908535957336426, "learning_rate": 5.680373039423484e-07, "loss": 0.479, "mean_token_accuracy": 0.8453258872032166, "num_tokens": 51130156.0, "step": 1341 }, { "epoch": 0.17071619386846457, "ewc_loss": 0.008681559935212135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.681559847900644e-06, "grad_norm": 9.969313621520996, "learning_rate": 5.684612123781263e-07, "loss": 0.5192, "mean_token_accuracy": 0.8330962657928467, "num_tokens": 51171282.0, "step": 1342 }, { "epoch": 0.1708434041470551, "ewc_loss": 0.008680672384798527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.680672181071714e-06, "grad_norm": 9.92475700378418, "learning_rate": 5.688851208139042e-07, "loss": 0.5356, "mean_token_accuracy": 0.8280595541000366, "num_tokens": 51210616.0, "step": 1343 }, { "epoch": 0.1709706144256456, "ewc_loss": 0.008668841794133186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.668841473991051e-06, "grad_norm": 9.920761108398438, "learning_rate": 5.69309029249682e-07, "loss": 0.5217, "mean_token_accuracy": 0.8376531600952148, "num_tokens": 51253357.0, "step": 1344 }, { "epoch": 0.1710978247042361, "ewc_loss": 0.00867084227502346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.670842362334952e-06, "grad_norm": 9.906644821166992, "learning_rate": 5.697329376854599e-07, "loss": 0.5005, "mean_token_accuracy": 0.8386698961257935, "num_tokens": 51292671.0, "step": 1345 }, { "epoch": 0.17122503498282662, "ewc_loss": 0.00867827981710434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.67828021000605e-06, "grad_norm": 9.986044883728027, "learning_rate": 5.701568461212378e-07, "loss": 0.4404, "mean_token_accuracy": 0.8584893941879272, "num_tokens": 51329145.0, "step": 1346 }, { "epoch": 0.17135224526141712, "ewc_loss": 0.00870811939239502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.708119821676519e-06, "grad_norm": 9.942609786987305, "learning_rate": 5.705807545570157e-07, "loss": 0.4496, "mean_token_accuracy": 0.8537614345550537, "num_tokens": 51371265.0, "step": 1347 }, { "epoch": 0.17147945554000762, "ewc_loss": 0.00868204515427351, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.68204551807139e-06, "grad_norm": 9.974318504333496, "learning_rate": 5.710046629927934e-07, "loss": 0.4867, "mean_token_accuracy": 0.8415695428848267, "num_tokens": 51410262.0, "step": 1348 }, { "epoch": 0.17160666581859815, "ewc_loss": 0.008709973655641079, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.709973371878732e-06, "grad_norm": 9.947151184082031, "learning_rate": 5.714285714285714e-07, "loss": 0.4573, "mean_token_accuracy": 0.8524802923202515, "num_tokens": 51447796.0, "step": 1349 }, { "epoch": 0.17173387609718865, "ewc_loss": 0.008689181879162788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.689182322996203e-06, "grad_norm": 9.971696853637695, "learning_rate": 5.718524798643492e-07, "loss": 0.5215, "mean_token_accuracy": 0.8360422849655151, "num_tokens": 51482677.0, "step": 1350 }, { "epoch": 0.17186108637577915, "ewc_loss": 0.00871333945542574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.713339411769994e-06, "grad_norm": 9.991971015930176, "learning_rate": 5.722763883001272e-07, "loss": 0.5219, "mean_token_accuracy": 0.8356094360351562, "num_tokens": 51524111.0, "step": 1351 }, { "epoch": 0.17198829665436968, "ewc_loss": 0.008705945685505867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.70594521984458e-06, "grad_norm": 9.946978569030762, "learning_rate": 5.72700296735905e-07, "loss": 0.4947, "mean_token_accuracy": 0.8408339023590088, "num_tokens": 51561008.0, "step": 1352 }, { "epoch": 0.17211550693296018, "ewc_loss": 0.008711540140211582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.711540431249887e-06, "grad_norm": 9.956319808959961, "learning_rate": 5.731242051716829e-07, "loss": 0.5485, "mean_token_accuracy": 0.8224732875823975, "num_tokens": 51603091.0, "step": 1353 }, { "epoch": 0.17224271721155068, "ewc_loss": 0.008698392659425735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.698392775841057e-06, "grad_norm": 9.89830207824707, "learning_rate": 5.735481136074608e-07, "loss": 0.4639, "mean_token_accuracy": 0.8490347862243652, "num_tokens": 51637789.0, "step": 1354 }, { "epoch": 0.1723699274901412, "ewc_loss": 0.008729534223675728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.729533874429762e-06, "grad_norm": 9.988018989562988, "learning_rate": 5.739720220432386e-07, "loss": 0.4685, "mean_token_accuracy": 0.8464595079421997, "num_tokens": 51674207.0, "step": 1355 }, { "epoch": 0.1724971377687317, "ewc_loss": 0.008750987239181995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.750987035455182e-06, "grad_norm": 9.967604637145996, "learning_rate": 5.743959304790164e-07, "loss": 0.5648, "mean_token_accuracy": 0.823067307472229, "num_tokens": 51715109.0, "step": 1356 }, { "epoch": 0.1726243480473222, "ewc_loss": 0.008742592297494411, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.742592399357818e-06, "grad_norm": 9.986296653747559, "learning_rate": 5.748198389147944e-07, "loss": 0.5116, "mean_token_accuracy": 0.8387328386306763, "num_tokens": 51753373.0, "step": 1357 }, { "epoch": 0.17275155832591274, "ewc_loss": 0.008742105215787888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.742104910197668e-06, "grad_norm": 9.95651912689209, "learning_rate": 5.752437473505722e-07, "loss": 0.5495, "mean_token_accuracy": 0.8282997012138367, "num_tokens": 51795100.0, "step": 1358 }, { "epoch": 0.17287876860450324, "ewc_loss": 0.008738338015973568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.738337783142924e-06, "grad_norm": 9.954689025878906, "learning_rate": 5.756676557863502e-07, "loss": 0.5229, "mean_token_accuracy": 0.8325068950653076, "num_tokens": 51831840.0, "step": 1359 }, { "epoch": 0.17300597888309374, "ewc_loss": 0.008748738095164299, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.748737855057698e-06, "grad_norm": 9.977264404296875, "learning_rate": 5.76091564222128e-07, "loss": 0.4752, "mean_token_accuracy": 0.8480137586593628, "num_tokens": 51870227.0, "step": 1360 }, { "epoch": 0.17313318916168427, "ewc_loss": 0.008761096745729446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.76109697856009e-06, "grad_norm": 9.991901397705078, "learning_rate": 5.765154726579059e-07, "loss": 0.5009, "mean_token_accuracy": 0.8402191400527954, "num_tokens": 51908251.0, "step": 1361 }, { "epoch": 0.17326039944027477, "ewc_loss": 0.008781462907791138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.781463293416891e-06, "grad_norm": 10.032218933105469, "learning_rate": 5.769393810936838e-07, "loss": 0.4906, "mean_token_accuracy": 0.8431751132011414, "num_tokens": 51944531.0, "step": 1362 }, { "epoch": 0.17338760971886527, "ewc_loss": 0.008782570250332355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.782570148468949e-06, "grad_norm": 9.96409797668457, "learning_rate": 5.773632895294616e-07, "loss": 0.5447, "mean_token_accuracy": 0.8255380392074585, "num_tokens": 51985742.0, "step": 1363 }, { "epoch": 0.1735148199974558, "ewc_loss": 0.008780702948570251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.780702955846209e-06, "grad_norm": 10.043107032775879, "learning_rate": 5.777871979652394e-07, "loss": 0.4809, "mean_token_accuracy": 0.8454508781433105, "num_tokens": 52022036.0, "step": 1364 }, { "epoch": 0.1736420302760463, "ewc_loss": 0.008806689642369747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.806689947959967e-06, "grad_norm": 9.986726760864258, "learning_rate": 5.782111064010173e-07, "loss": 0.5043, "mean_token_accuracy": 0.8377903699874878, "num_tokens": 52066077.0, "step": 1365 }, { "epoch": 0.17376924055463683, "ewc_loss": 0.008789055049419403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.78905484569259e-06, "grad_norm": 9.962079048156738, "learning_rate": 5.786350148367952e-07, "loss": 0.4958, "mean_token_accuracy": 0.8412131071090698, "num_tokens": 52111280.0, "step": 1366 }, { "epoch": 0.17389645083322733, "ewc_loss": 0.0088087422773242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.808742677501868e-06, "grad_norm": 9.991192817687988, "learning_rate": 5.790589232725731e-07, "loss": 0.4937, "mean_token_accuracy": 0.8394418954849243, "num_tokens": 52153148.0, "step": 1367 }, { "epoch": 0.17402366111181783, "ewc_loss": 0.008823472075164318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.823471944197081e-06, "grad_norm": 10.06406021118164, "learning_rate": 5.79482831708351e-07, "loss": 0.4786, "mean_token_accuracy": 0.8459194898605347, "num_tokens": 52186722.0, "step": 1368 }, { "epoch": 0.17415087139040836, "ewc_loss": 0.008847861550748348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.847861863614526e-06, "grad_norm": 10.0590181350708, "learning_rate": 5.799067401441288e-07, "loss": 0.4888, "mean_token_accuracy": 0.8408021926879883, "num_tokens": 52220837.0, "step": 1369 }, { "epoch": 0.17427808166899886, "ewc_loss": 0.00883833970874548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.838339454086963e-06, "grad_norm": 10.026017189025879, "learning_rate": 5.803306485799068e-07, "loss": 0.4733, "mean_token_accuracy": 0.848750114440918, "num_tokens": 52257049.0, "step": 1370 }, { "epoch": 0.17440529194758936, "ewc_loss": 0.008840888738632202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.840888767736033e-06, "grad_norm": 10.015865325927734, "learning_rate": 5.807545570156845e-07, "loss": 0.4503, "mean_token_accuracy": 0.8529432415962219, "num_tokens": 52297632.0, "step": 1371 }, { "epoch": 0.1745325022261799, "ewc_loss": 0.0088551826775074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.855182386469096e-06, "grad_norm": 10.05499267578125, "learning_rate": 5.811784654514624e-07, "loss": 0.5502, "mean_token_accuracy": 0.8253124356269836, "num_tokens": 52331008.0, "step": 1372 }, { "epoch": 0.1746597125047704, "ewc_loss": 0.00887129083275795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.871290447132196e-06, "grad_norm": 10.04951000213623, "learning_rate": 5.816023738872403e-07, "loss": 0.4812, "mean_token_accuracy": 0.8464053869247437, "num_tokens": 52369420.0, "step": 1373 }, { "epoch": 0.1747869227833609, "ewc_loss": 0.00885517057031393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.855170563037973e-06, "grad_norm": 10.016695022583008, "learning_rate": 5.820262823230182e-07, "loss": 0.512, "mean_token_accuracy": 0.8364970684051514, "num_tokens": 52411759.0, "step": 1374 }, { "epoch": 0.17491413306195142, "ewc_loss": 0.008882362395524979, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.88236263563158e-06, "grad_norm": 10.063186645507812, "learning_rate": 5.824501907587961e-07, "loss": 0.4715, "mean_token_accuracy": 0.8487448692321777, "num_tokens": 52446459.0, "step": 1375 }, { "epoch": 0.17504134334054192, "ewc_loss": 0.00889092218130827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.890921890269965e-06, "grad_norm": 10.071800231933594, "learning_rate": 5.82874099194574e-07, "loss": 0.5553, "mean_token_accuracy": 0.8220375776290894, "num_tokens": 52484615.0, "step": 1376 }, { "epoch": 0.17516855361913242, "ewc_loss": 0.008892725221812725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.89272541826358e-06, "grad_norm": 9.987380981445312, "learning_rate": 5.832980076303518e-07, "loss": 0.5661, "mean_token_accuracy": 0.8191061019897461, "num_tokens": 52525195.0, "step": 1377 }, { "epoch": 0.17529576389772294, "ewc_loss": 0.008885224349796772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.885223905963358e-06, "grad_norm": 10.09898853302002, "learning_rate": 5.837219160661297e-07, "loss": 0.4497, "mean_token_accuracy": 0.8543046712875366, "num_tokens": 52558783.0, "step": 1378 }, { "epoch": 0.17542297417631345, "ewc_loss": 0.008942460641264915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.942460226535331e-06, "grad_norm": 10.035115242004395, "learning_rate": 5.841458245019075e-07, "loss": 0.445, "mean_token_accuracy": 0.8551780581474304, "num_tokens": 52599176.0, "step": 1379 }, { "epoch": 0.17555018445490395, "ewc_loss": 0.008889826014637947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.889825949154329e-06, "grad_norm": 10.071212768554688, "learning_rate": 5.845697329376855e-07, "loss": 0.5018, "mean_token_accuracy": 0.8430089950561523, "num_tokens": 52630865.0, "step": 1380 }, { "epoch": 0.17567739473349447, "ewc_loss": 0.008951473981142044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.951474228524603e-06, "grad_norm": 10.097527503967285, "learning_rate": 5.849936413734633e-07, "loss": 0.5532, "mean_token_accuracy": 0.8241570591926575, "num_tokens": 52671702.0, "step": 1381 }, { "epoch": 0.17580460501208497, "ewc_loss": 0.008940854109823704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.940854058892e-06, "grad_norm": 10.055683135986328, "learning_rate": 5.854175498092412e-07, "loss": 0.4577, "mean_token_accuracy": 0.8531271815299988, "num_tokens": 52712288.0, "step": 1382 }, { "epoch": 0.17593181529067548, "ewc_loss": 0.00894879549741745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.948795766627882e-06, "grad_norm": 10.06497573852539, "learning_rate": 5.858414582450191e-07, "loss": 0.5001, "mean_token_accuracy": 0.8415571451187134, "num_tokens": 52756569.0, "step": 1383 }, { "epoch": 0.176059025569266, "ewc_loss": 0.008956482633948326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.956482815847266e-06, "grad_norm": 10.090215682983398, "learning_rate": 5.86265366680797e-07, "loss": 0.5228, "mean_token_accuracy": 0.8351274728775024, "num_tokens": 52796195.0, "step": 1384 }, { "epoch": 0.1761862358478565, "ewc_loss": 0.008948145434260368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.948145477916114e-06, "grad_norm": 10.099381446838379, "learning_rate": 5.866892751165748e-07, "loss": 0.4945, "mean_token_accuracy": 0.8432031869888306, "num_tokens": 52839687.0, "step": 1385 }, { "epoch": 0.176313446126447, "ewc_loss": 0.008969501592218876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.969501323008444e-06, "grad_norm": 10.103466033935547, "learning_rate": 5.871131835523526e-07, "loss": 0.5486, "mean_token_accuracy": 0.8259243965148926, "num_tokens": 52875660.0, "step": 1386 }, { "epoch": 0.17644065640503753, "ewc_loss": 0.008949199691414833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.949199582275469e-06, "grad_norm": 10.10586166381836, "learning_rate": 5.875370919881305e-07, "loss": 0.5052, "mean_token_accuracy": 0.8369793891906738, "num_tokens": 52921220.0, "step": 1387 }, { "epoch": 0.17656786668362803, "ewc_loss": 0.008996081538498402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.996081305667758e-06, "grad_norm": 10.164324760437012, "learning_rate": 5.879610004239084e-07, "loss": 0.5587, "mean_token_accuracy": 0.8259023427963257, "num_tokens": 52960042.0, "step": 1388 }, { "epoch": 0.17669507696221853, "ewc_loss": 0.008983494713902473, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.983494808489922e-06, "grad_norm": 10.07987117767334, "learning_rate": 5.883849088596863e-07, "loss": 0.5424, "mean_token_accuracy": 0.8286414742469788, "num_tokens": 53002817.0, "step": 1389 }, { "epoch": 0.17682228724080906, "ewc_loss": 0.008960654959082603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.960654668044299e-06, "grad_norm": 10.128381729125977, "learning_rate": 5.888088172954641e-07, "loss": 0.5162, "mean_token_accuracy": 0.8366878032684326, "num_tokens": 53037751.0, "step": 1390 }, { "epoch": 0.17694949751939956, "ewc_loss": 0.00900796614587307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.007966582430527e-06, "grad_norm": 10.149300575256348, "learning_rate": 5.892327257312421e-07, "loss": 0.428, "mean_token_accuracy": 0.8638814091682434, "num_tokens": 53074286.0, "step": 1391 }, { "epoch": 0.1770767077979901, "ewc_loss": 0.008981984108686447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.981984137790278e-06, "grad_norm": 10.045902252197266, "learning_rate": 5.896566341670199e-07, "loss": 0.4504, "mean_token_accuracy": 0.8494687080383301, "num_tokens": 53113525.0, "step": 1392 }, { "epoch": 0.1772039180765806, "ewc_loss": 0.00898732803761959, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.987328328657895e-06, "grad_norm": 10.154834747314453, "learning_rate": 5.900805426027977e-07, "loss": 0.4938, "mean_token_accuracy": 0.8415862321853638, "num_tokens": 53150137.0, "step": 1393 }, { "epoch": 0.1773311283551711, "ewc_loss": 0.00902031920850277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.020319339470007e-06, "grad_norm": 10.055654525756836, "learning_rate": 5.905044510385756e-07, "loss": 0.4458, "mean_token_accuracy": 0.8555756211280823, "num_tokens": 53189519.0, "step": 1394 }, { "epoch": 0.17745833863376162, "ewc_loss": 0.008980346843600273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 8.980347047327086e-06, "grad_norm": 10.112987518310547, "learning_rate": 5.909283594743535e-07, "loss": 0.5581, "mean_token_accuracy": 0.8219404816627502, "num_tokens": 53227452.0, "step": 1395 }, { "epoch": 0.17758554891235212, "ewc_loss": 0.009062159806489944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.062159733730368e-06, "grad_norm": 10.120709419250488, "learning_rate": 5.913522679101314e-07, "loss": 0.49, "mean_token_accuracy": 0.8386826515197754, "num_tokens": 53265875.0, "step": 1396 }, { "epoch": 0.17771275919094262, "ewc_loss": 0.009020315483212471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.0203157014912e-06, "grad_norm": 10.10856819152832, "learning_rate": 5.917761763459093e-07, "loss": 0.5501, "mean_token_accuracy": 0.8244185447692871, "num_tokens": 53305477.0, "step": 1397 }, { "epoch": 0.17783996946953315, "ewc_loss": 0.00905772764235735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.057727766048629e-06, "grad_norm": 10.162209510803223, "learning_rate": 5.922000847816871e-07, "loss": 0.469, "mean_token_accuracy": 0.8502697944641113, "num_tokens": 53342800.0, "step": 1398 }, { "epoch": 0.17796717974812365, "ewc_loss": 0.009058608673512936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.058609066414647e-06, "grad_norm": 10.131486892700195, "learning_rate": 5.926239932174651e-07, "loss": 0.5337, "mean_token_accuracy": 0.8307569622993469, "num_tokens": 53387476.0, "step": 1399 }, { "epoch": 0.17809439002671415, "ewc_loss": 0.009063760749995708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.063760444405489e-06, "grad_norm": 10.153576850891113, "learning_rate": 5.930479016532429e-07, "loss": 0.4907, "mean_token_accuracy": 0.8424570560455322, "num_tokens": 53424628.0, "step": 1400 }, { "epoch": 0.17822160030530468, "ewc_loss": 0.009074683301150799, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.074683475773782e-06, "grad_norm": 10.186662673950195, "learning_rate": 5.934718100890207e-07, "loss": 0.4723, "mean_token_accuracy": 0.8459707498550415, "num_tokens": 53464459.0, "step": 1401 }, { "epoch": 0.17834881058389518, "ewc_loss": 0.009047150611877441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.04715034266701e-06, "grad_norm": 10.170918464660645, "learning_rate": 5.938957185247986e-07, "loss": 0.5194, "mean_token_accuracy": 0.8311982154846191, "num_tokens": 53501721.0, "step": 1402 }, { "epoch": 0.17847602086248568, "ewc_loss": 0.009049887768924236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.049887921719346e-06, "grad_norm": 10.192562103271484, "learning_rate": 5.943196269605765e-07, "loss": 0.481, "mean_token_accuracy": 0.8418924808502197, "num_tokens": 53536607.0, "step": 1403 }, { "epoch": 0.1786032311410762, "ewc_loss": 0.009067005477845669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.067005521501414e-06, "grad_norm": 10.203117370605469, "learning_rate": 5.947435353963544e-07, "loss": 0.5319, "mean_token_accuracy": 0.8345040082931519, "num_tokens": 53569809.0, "step": 1404 }, { "epoch": 0.1787304414196667, "ewc_loss": 0.009083639830350876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.083640179596841e-06, "grad_norm": 10.147880554199219, "learning_rate": 5.951674438321323e-07, "loss": 0.5081, "mean_token_accuracy": 0.8380953073501587, "num_tokens": 53604423.0, "step": 1405 }, { "epoch": 0.1788576516982572, "ewc_loss": 0.009091947227716446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.091947504202835e-06, "grad_norm": 10.198975563049316, "learning_rate": 5.955913522679101e-07, "loss": 0.5044, "mean_token_accuracy": 0.8355219960212708, "num_tokens": 53642145.0, "step": 1406 }, { "epoch": 0.17898486197684774, "ewc_loss": 0.009113158099353313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.113157830142882e-06, "grad_norm": 10.161797523498535, "learning_rate": 5.96015260703688e-07, "loss": 0.5801, "mean_token_accuracy": 0.8142701387405396, "num_tokens": 53682998.0, "step": 1407 }, { "epoch": 0.17911207225543824, "ewc_loss": 0.009121380746364594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.12138057174161e-06, "grad_norm": 10.178041458129883, "learning_rate": 5.964391691394659e-07, "loss": 0.5125, "mean_token_accuracy": 0.8381724953651428, "num_tokens": 53729764.0, "step": 1408 }, { "epoch": 0.17923928253402874, "ewc_loss": 0.009143303148448467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.143303032033145e-06, "grad_norm": 10.26089859008789, "learning_rate": 5.968630775752436e-07, "loss": 0.5464, "mean_token_accuracy": 0.826880693435669, "num_tokens": 53764522.0, "step": 1409 }, { "epoch": 0.17936649281261927, "ewc_loss": 0.00913475826382637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.134758329309989e-06, "grad_norm": 10.169135093688965, "learning_rate": 5.972869860110216e-07, "loss": 0.5279, "mean_token_accuracy": 0.8281760215759277, "num_tokens": 53799791.0, "step": 1410 }, { "epoch": 0.17949370309120977, "ewc_loss": 0.009139693342149258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.139693247561809e-06, "grad_norm": 10.232272148132324, "learning_rate": 5.977108944467994e-07, "loss": 0.5667, "mean_token_accuracy": 0.824955940246582, "num_tokens": 53840181.0, "step": 1411 }, { "epoch": 0.17962091336980027, "ewc_loss": 0.009170973673462868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.170973498839885e-06, "grad_norm": 10.143980026245117, "learning_rate": 5.981348028825774e-07, "loss": 0.4723, "mean_token_accuracy": 0.8508672714233398, "num_tokens": 53882813.0, "step": 1412 }, { "epoch": 0.1797481236483908, "ewc_loss": 0.009152105078101158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.152105121756904e-06, "grad_norm": 10.192907333374023, "learning_rate": 5.985587113183552e-07, "loss": 0.5213, "mean_token_accuracy": 0.8344751596450806, "num_tokens": 53922198.0, "step": 1413 }, { "epoch": 0.1798753339269813, "ewc_loss": 0.009187791496515274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.18779187486507e-06, "grad_norm": 10.164992332458496, "learning_rate": 5.989826197541331e-07, "loss": 0.5156, "mean_token_accuracy": 0.8370367288589478, "num_tokens": 53959390.0, "step": 1414 }, { "epoch": 0.18000254420557182, "ewc_loss": 0.009181150235235691, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.181149835058022e-06, "grad_norm": 10.195905685424805, "learning_rate": 5.99406528189911e-07, "loss": 0.5445, "mean_token_accuracy": 0.8317371010780334, "num_tokens": 54005707.0, "step": 1415 }, { "epoch": 0.18012975448416232, "ewc_loss": 0.009198207408189774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.198207408189774e-06, "grad_norm": 10.180023193359375, "learning_rate": 5.998304366256888e-07, "loss": 0.5079, "mean_token_accuracy": 0.8368973731994629, "num_tokens": 54047808.0, "step": 1416 }, { "epoch": 0.18025696476275282, "ewc_loss": 0.009186701849102974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.186702300212346e-06, "grad_norm": 10.200472831726074, "learning_rate": 6.002543450614666e-07, "loss": 0.432, "mean_token_accuracy": 0.8632118105888367, "num_tokens": 54082929.0, "step": 1417 }, { "epoch": 0.18038417504134335, "ewc_loss": 0.00920049101114273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.200491149385925e-06, "grad_norm": 10.169245719909668, "learning_rate": 6.006782534972446e-07, "loss": 0.4935, "mean_token_accuracy": 0.8426351547241211, "num_tokens": 54120199.0, "step": 1418 }, { "epoch": 0.18051138531993385, "ewc_loss": 0.009184357710182667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.184357622871175e-06, "grad_norm": 10.215002059936523, "learning_rate": 6.011021619330224e-07, "loss": 0.5292, "mean_token_accuracy": 0.8301190137863159, "num_tokens": 54158629.0, "step": 1419 }, { "epoch": 0.18063859559852435, "ewc_loss": 0.00923160184174776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.231602234649472e-06, "grad_norm": 10.226677894592285, "learning_rate": 6.015260703688004e-07, "loss": 0.5088, "mean_token_accuracy": 0.8341633081436157, "num_tokens": 54194772.0, "step": 1420 }, { "epoch": 0.18076580587711488, "ewc_loss": 0.009203813038766384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.2038126240368e-06, "grad_norm": 10.191241264343262, "learning_rate": 6.019499788045782e-07, "loss": 0.5168, "mean_token_accuracy": 0.8363956212997437, "num_tokens": 54229825.0, "step": 1421 }, { "epoch": 0.18089301615570538, "ewc_loss": 0.00921760406345129, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.217604201694485e-06, "grad_norm": 10.247845649719238, "learning_rate": 6.023738872403561e-07, "loss": 0.473, "mean_token_accuracy": 0.8463913202285767, "num_tokens": 54260051.0, "step": 1422 }, { "epoch": 0.18102022643429588, "ewc_loss": 0.009248054586350918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.248054993804544e-06, "grad_norm": 10.167701721191406, "learning_rate": 6.02797795676134e-07, "loss": 0.4993, "mean_token_accuracy": 0.8440043926239014, "num_tokens": 54297865.0, "step": 1423 }, { "epoch": 0.1811474367128864, "ewc_loss": 0.009243197739124298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.243197382602375e-06, "grad_norm": 10.156292915344238, "learning_rate": 6.032217041119118e-07, "loss": 0.5146, "mean_token_accuracy": 0.8308391571044922, "num_tokens": 54334618.0, "step": 1424 }, { "epoch": 0.1812746469914769, "ewc_loss": 0.009269805625081062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.269805559597444e-06, "grad_norm": 10.229350090026855, "learning_rate": 6.036456125476896e-07, "loss": 0.5513, "mean_token_accuracy": 0.824205756187439, "num_tokens": 54369761.0, "step": 1425 }, { "epoch": 0.1814018572700674, "ewc_loss": 0.009287552908062935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.287552529713139e-06, "grad_norm": 10.16701889038086, "learning_rate": 6.040695209834675e-07, "loss": 0.4502, "mean_token_accuracy": 0.8525953888893127, "num_tokens": 54411549.0, "step": 1426 }, { "epoch": 0.18152906754865794, "ewc_loss": 0.009274632669985294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.274632247979753e-06, "grad_norm": 10.21630859375, "learning_rate": 6.044934294192454e-07, "loss": 0.535, "mean_token_accuracy": 0.8309880495071411, "num_tokens": 54449137.0, "step": 1427 }, { "epoch": 0.18165627782724844, "ewc_loss": 0.00934700109064579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.347000741399825e-06, "grad_norm": 10.184321403503418, "learning_rate": 6.049173378550233e-07, "loss": 0.4901, "mean_token_accuracy": 0.8456557393074036, "num_tokens": 54494604.0, "step": 1428 }, { "epoch": 0.18178348810583894, "ewc_loss": 0.009318471886217594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.318471711594611e-06, "grad_norm": 10.245443344116211, "learning_rate": 6.053412462908012e-07, "loss": 0.4709, "mean_token_accuracy": 0.8475134372711182, "num_tokens": 54533533.0, "step": 1429 }, { "epoch": 0.18191069838442947, "ewc_loss": 0.009349779225885868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.349779247713741e-06, "grad_norm": 10.227254867553711, "learning_rate": 6.05765154726579e-07, "loss": 0.5431, "mean_token_accuracy": 0.8288599252700806, "num_tokens": 54575211.0, "step": 1430 }, { "epoch": 0.18203790866301997, "ewc_loss": 0.009317781776189804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.317781405115966e-06, "grad_norm": 10.19505786895752, "learning_rate": 6.061890631623569e-07, "loss": 0.4478, "mean_token_accuracy": 0.8546468615531921, "num_tokens": 54618308.0, "step": 1431 }, { "epoch": 0.18216511894161047, "ewc_loss": 0.009352956898510456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.352957022201736e-06, "grad_norm": 10.26461124420166, "learning_rate": 6.066129715981347e-07, "loss": 0.4866, "mean_token_accuracy": 0.8431453704833984, "num_tokens": 54660260.0, "step": 1432 }, { "epoch": 0.182292329220201, "ewc_loss": 0.00932695996016264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.326960025646258e-06, "grad_norm": 10.237062454223633, "learning_rate": 6.070368800339126e-07, "loss": 0.4591, "mean_token_accuracy": 0.851201057434082, "num_tokens": 54692056.0, "step": 1433 }, { "epoch": 0.1824195394987915, "ewc_loss": 0.009348093532025814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.348093954031356e-06, "grad_norm": 10.292241096496582, "learning_rate": 6.074607884696905e-07, "loss": 0.4798, "mean_token_accuracy": 0.8441725373268127, "num_tokens": 54729252.0, "step": 1434 }, { "epoch": 0.182546749777382, "ewc_loss": 0.009369325824081898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.369326107844245e-06, "grad_norm": 10.28661823272705, "learning_rate": 6.078846969054684e-07, "loss": 0.5037, "mean_token_accuracy": 0.837192952632904, "num_tokens": 54763422.0, "step": 1435 }, { "epoch": 0.18267396005597253, "ewc_loss": 0.009354854933917522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.354855137644336e-06, "grad_norm": 10.262594223022461, "learning_rate": 6.083086053412463e-07, "loss": 0.4786, "mean_token_accuracy": 0.8465065360069275, "num_tokens": 54803494.0, "step": 1436 }, { "epoch": 0.18280117033456303, "ewc_loss": 0.009361944161355495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.361943739349954e-06, "grad_norm": 10.34109878540039, "learning_rate": 6.087325137770242e-07, "loss": 0.4677, "mean_token_accuracy": 0.8488557934761047, "num_tokens": 54839349.0, "step": 1437 }, { "epoch": 0.18292838061315353, "ewc_loss": 0.009373503737151623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.373503417009488e-06, "grad_norm": 10.29621696472168, "learning_rate": 6.09156422212802e-07, "loss": 0.5414, "mean_token_accuracy": 0.8309072256088257, "num_tokens": 54881941.0, "step": 1438 }, { "epoch": 0.18305559089174406, "ewc_loss": 0.009336192160844803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.336192306363955e-06, "grad_norm": 10.289496421813965, "learning_rate": 6.095803306485799e-07, "loss": 0.4916, "mean_token_accuracy": 0.8396027088165283, "num_tokens": 54922927.0, "step": 1439 }, { "epoch": 0.18318280117033456, "ewc_loss": 0.009375042282044888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.375042282044888e-06, "grad_norm": 10.29446029663086, "learning_rate": 6.100042390843577e-07, "loss": 0.52, "mean_token_accuracy": 0.8326152563095093, "num_tokens": 54963885.0, "step": 1440 }, { "epoch": 0.1833100114489251, "ewc_loss": 0.009362751618027687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.362751370645128e-06, "grad_norm": 10.289997100830078, "learning_rate": 6.104281475201356e-07, "loss": 0.4691, "mean_token_accuracy": 0.8476988673210144, "num_tokens": 55001284.0, "step": 1441 }, { "epoch": 0.1834372217275156, "ewc_loss": 0.009387487545609474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.387487807543948e-06, "grad_norm": 10.301560401916504, "learning_rate": 6.108520559559135e-07, "loss": 0.5294, "mean_token_accuracy": 0.8314549922943115, "num_tokens": 55034813.0, "step": 1442 }, { "epoch": 0.1835644320061061, "ewc_loss": 0.009379446506500244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.379446964885574e-06, "grad_norm": 10.270184516906738, "learning_rate": 6.112759643916914e-07, "loss": 0.4342, "mean_token_accuracy": 0.8591206669807434, "num_tokens": 55070620.0, "step": 1443 }, { "epoch": 0.18369164228469662, "ewc_loss": 0.009381712414324284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.38171251618769e-06, "grad_norm": 10.314718246459961, "learning_rate": 6.116998728274693e-07, "loss": 0.5167, "mean_token_accuracy": 0.8403515815734863, "num_tokens": 55108394.0, "step": 1444 }, { "epoch": 0.18381885256328712, "ewc_loss": 0.009393040090799332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.393040272698272e-06, "grad_norm": 10.312654495239258, "learning_rate": 6.121237812632472e-07, "loss": 0.5208, "mean_token_accuracy": 0.8329851627349854, "num_tokens": 55148170.0, "step": 1445 }, { "epoch": 0.18394606284187762, "ewc_loss": 0.009387756697833538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.387757017975673e-06, "grad_norm": 10.238649368286133, "learning_rate": 6.125476896990249e-07, "loss": 0.462, "mean_token_accuracy": 0.853449821472168, "num_tokens": 55188102.0, "step": 1446 }, { "epoch": 0.18407327312046814, "ewc_loss": 0.009400570765137672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.400570888828952e-06, "grad_norm": 10.291728019714355, "learning_rate": 6.129715981348028e-07, "loss": 0.5009, "mean_token_accuracy": 0.8436034917831421, "num_tokens": 55225893.0, "step": 1447 }, { "epoch": 0.18420048339905865, "ewc_loss": 0.00941034872084856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.410348866367713e-06, "grad_norm": 10.267735481262207, "learning_rate": 6.133955065705807e-07, "loss": 0.5203, "mean_token_accuracy": 0.8318333625793457, "num_tokens": 55265321.0, "step": 1448 }, { "epoch": 0.18432769367764915, "ewc_loss": 0.009425687603652477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.425687494513113e-06, "grad_norm": 10.323302268981934, "learning_rate": 6.138194150063585e-07, "loss": 0.4805, "mean_token_accuracy": 0.8471753001213074, "num_tokens": 55304179.0, "step": 1449 }, { "epoch": 0.18445490395623967, "ewc_loss": 0.00945227313786745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.452272934140638e-06, "grad_norm": 10.288997650146484, "learning_rate": 6.142433234421365e-07, "loss": 0.5148, "mean_token_accuracy": 0.8345506191253662, "num_tokens": 55343266.0, "step": 1450 }, { "epoch": 0.18458211423483017, "ewc_loss": 0.009427376091480255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.427376426174305e-06, "grad_norm": 10.28865909576416, "learning_rate": 6.146672318779143e-07, "loss": 0.5189, "mean_token_accuracy": 0.8351224660873413, "num_tokens": 55381147.0, "step": 1451 }, { "epoch": 0.18470932451342068, "ewc_loss": 0.009458349086344242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.458349268243182e-06, "grad_norm": 10.349076271057129, "learning_rate": 6.150911403136923e-07, "loss": 0.5192, "mean_token_accuracy": 0.8367218375205994, "num_tokens": 55414855.0, "step": 1452 }, { "epoch": 0.1848365347920112, "ewc_loss": 0.009460704401135445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.460703950026073e-06, "grad_norm": 10.28182315826416, "learning_rate": 6.155150487494701e-07, "loss": 0.4422, "mean_token_accuracy": 0.856967031955719, "num_tokens": 55449699.0, "step": 1453 }, { "epoch": 0.1849637450706017, "ewc_loss": 0.009449637494981289, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.4496372184949e-06, "grad_norm": 10.252260208129883, "learning_rate": 6.159389571852479e-07, "loss": 0.4699, "mean_token_accuracy": 0.8499880433082581, "num_tokens": 55493202.0, "step": 1454 }, { "epoch": 0.1850909553491922, "ewc_loss": 0.009473497048020363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.47349690250121e-06, "grad_norm": 10.282819747924805, "learning_rate": 6.163628656210258e-07, "loss": 0.492, "mean_token_accuracy": 0.8439464569091797, "num_tokens": 55539873.0, "step": 1455 }, { "epoch": 0.18521816562778273, "ewc_loss": 0.00949134211987257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.491342098044697e-06, "grad_norm": 10.357613563537598, "learning_rate": 6.167867740568037e-07, "loss": 0.5516, "mean_token_accuracy": 0.8254607915878296, "num_tokens": 55576338.0, "step": 1456 }, { "epoch": 0.18534537590637323, "ewc_loss": 0.00949582364410162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.495824087935034e-06, "grad_norm": 10.256068229675293, "learning_rate": 6.172106824925815e-07, "loss": 0.472, "mean_token_accuracy": 0.8465123176574707, "num_tokens": 55618330.0, "step": 1457 }, { "epoch": 0.18547258618496373, "ewc_loss": 0.009488029405474663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.48802971834084e-06, "grad_norm": 10.309460639953613, "learning_rate": 6.176345909283595e-07, "loss": 0.4892, "mean_token_accuracy": 0.8431510329246521, "num_tokens": 55657394.0, "step": 1458 }, { "epoch": 0.18559979646355426, "ewc_loss": 0.009497938677668571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.497938663116656e-06, "grad_norm": 10.328414916992188, "learning_rate": 6.180584993641373e-07, "loss": 0.4749, "mean_token_accuracy": 0.8469345569610596, "num_tokens": 55695087.0, "step": 1459 }, { "epoch": 0.18572700674214476, "ewc_loss": 0.009503878653049469, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.503878573013935e-06, "grad_norm": 10.35250186920166, "learning_rate": 6.184824077999153e-07, "loss": 0.4591, "mean_token_accuracy": 0.8520053029060364, "num_tokens": 55730852.0, "step": 1460 }, { "epoch": 0.18585421702073526, "ewc_loss": 0.009519717656075954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.51971742324531e-06, "grad_norm": 10.370821952819824, "learning_rate": 6.189063162356931e-07, "loss": 0.5557, "mean_token_accuracy": 0.8241114616394043, "num_tokens": 55765678.0, "step": 1461 }, { "epoch": 0.1859814272993258, "ewc_loss": 0.009511253796517849, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.511253665550612e-06, "grad_norm": 10.316402435302734, "learning_rate": 6.193302246714709e-07, "loss": 0.4812, "mean_token_accuracy": 0.8481829762458801, "num_tokens": 55804409.0, "step": 1462 }, { "epoch": 0.1861086375779163, "ewc_loss": 0.009525885805487633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.525885616312735e-06, "grad_norm": 10.366753578186035, "learning_rate": 6.197541331072488e-07, "loss": 0.5105, "mean_token_accuracy": 0.8362874984741211, "num_tokens": 55839520.0, "step": 1463 }, { "epoch": 0.1862358478565068, "ewc_loss": 0.009546703658998013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.546703950036317e-06, "grad_norm": 10.362625122070312, "learning_rate": 6.201780415430267e-07, "loss": 0.4901, "mean_token_accuracy": 0.8428542613983154, "num_tokens": 55879089.0, "step": 1464 }, { "epoch": 0.18636305813509732, "ewc_loss": 0.009542213752865791, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.542213774693664e-06, "grad_norm": 10.344768524169922, "learning_rate": 6.206019499788045e-07, "loss": 0.534, "mean_token_accuracy": 0.8301579356193542, "num_tokens": 55916655.0, "step": 1465 }, { "epoch": 0.18649026841368782, "ewc_loss": 0.009551912546157837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.55191262619337e-06, "grad_norm": 10.330897331237793, "learning_rate": 6.210258584145825e-07, "loss": 0.4684, "mean_token_accuracy": 0.8461110591888428, "num_tokens": 55953828.0, "step": 1466 }, { "epoch": 0.18661747869227835, "ewc_loss": 0.00955481268465519, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.554813004797325e-06, "grad_norm": 10.317888259887695, "learning_rate": 6.214497668503603e-07, "loss": 0.517, "mean_token_accuracy": 0.8308992385864258, "num_tokens": 55992037.0, "step": 1467 }, { "epoch": 0.18674468897086885, "ewc_loss": 0.009585010819137096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.585010957380291e-06, "grad_norm": 10.495732307434082, "learning_rate": 6.218736752861383e-07, "loss": 0.4781, "mean_token_accuracy": 0.8479709625244141, "num_tokens": 56020424.0, "step": 1468 }, { "epoch": 0.18687189924945935, "ewc_loss": 0.009621448814868927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.621448953112122e-06, "grad_norm": 10.332141876220703, "learning_rate": 6.22297583721916e-07, "loss": 0.5101, "mean_token_accuracy": 0.842918872833252, "num_tokens": 56060839.0, "step": 1469 }, { "epoch": 0.18699910952804988, "ewc_loss": 0.009563423693180084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.56342410063371e-06, "grad_norm": 10.332494735717773, "learning_rate": 6.227214921576938e-07, "loss": 0.5407, "mean_token_accuracy": 0.8321959376335144, "num_tokens": 56099095.0, "step": 1470 }, { "epoch": 0.18712631980664038, "ewc_loss": 0.009620686061680317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.620685887057334e-06, "grad_norm": 10.365487098693848, "learning_rate": 6.231454005934718e-07, "loss": 0.4364, "mean_token_accuracy": 0.8603891134262085, "num_tokens": 56133805.0, "step": 1471 }, { "epoch": 0.18725353008523088, "ewc_loss": 0.009621361270546913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.621361641620751e-06, "grad_norm": 10.30132007598877, "learning_rate": 6.235693090292496e-07, "loss": 0.5559, "mean_token_accuracy": 0.8275014162063599, "num_tokens": 56173064.0, "step": 1472 }, { "epoch": 0.1873807403638214, "ewc_loss": 0.009612774476408958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.612774192646611e-06, "grad_norm": 10.386942863464355, "learning_rate": 6.239932174650275e-07, "loss": 0.437, "mean_token_accuracy": 0.8614148497581482, "num_tokens": 56213285.0, "step": 1473 }, { "epoch": 0.1875079506424119, "ewc_loss": 0.00965762697160244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.657626833359245e-06, "grad_norm": 10.395952224731445, "learning_rate": 6.244171259008054e-07, "loss": 0.5201, "mean_token_accuracy": 0.8314194083213806, "num_tokens": 56248218.0, "step": 1474 }, { "epoch": 0.1876351609210024, "ewc_loss": 0.009645791724324226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.645791578805074e-06, "grad_norm": 10.359295845031738, "learning_rate": 6.248410343365833e-07, "loss": 0.4638, "mean_token_accuracy": 0.8516383171081543, "num_tokens": 56286098.0, "step": 1475 }, { "epoch": 0.18776237119959294, "ewc_loss": 0.009651755914092064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.651756045059301e-06, "grad_norm": 10.365899085998535, "learning_rate": 6.252649427723612e-07, "loss": 0.4716, "mean_token_accuracy": 0.8507519960403442, "num_tokens": 56329732.0, "step": 1476 }, { "epoch": 0.18788958147818344, "ewc_loss": 0.00965232402086258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.652324479247909e-06, "grad_norm": 10.382208824157715, "learning_rate": 6.25688851208139e-07, "loss": 0.4765, "mean_token_accuracy": 0.8482435345649719, "num_tokens": 56367838.0, "step": 1477 }, { "epoch": 0.18801679175677394, "ewc_loss": 0.009663841687142849, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.66384141065646e-06, "grad_norm": 10.404630661010742, "learning_rate": 6.261127596439168e-07, "loss": 0.5293, "mean_token_accuracy": 0.8299828767776489, "num_tokens": 56413059.0, "step": 1478 }, { "epoch": 0.18814400203536447, "ewc_loss": 0.00965651124715805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.65651088336017e-06, "grad_norm": 10.35327434539795, "learning_rate": 6.265366680796948e-07, "loss": 0.432, "mean_token_accuracy": 0.855688214302063, "num_tokens": 56453806.0, "step": 1479 }, { "epoch": 0.18827121231395497, "ewc_loss": 0.009646471589803696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.646471880842e-06, "grad_norm": 10.379637718200684, "learning_rate": 6.269605765154726e-07, "loss": 0.4802, "mean_token_accuracy": 0.8442482948303223, "num_tokens": 56497448.0, "step": 1480 }, { "epoch": 0.18839842259254547, "ewc_loss": 0.00967920571565628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.67920550465351e-06, "grad_norm": 10.394790649414062, "learning_rate": 6.273844849512505e-07, "loss": 0.4823, "mean_token_accuracy": 0.8439145088195801, "num_tokens": 56537731.0, "step": 1481 }, { "epoch": 0.188525632871136, "ewc_loss": 0.009674650616943836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.674650755187031e-06, "grad_norm": 10.397928237915039, "learning_rate": 6.278083933870284e-07, "loss": 0.4212, "mean_token_accuracy": 0.8646004796028137, "num_tokens": 56580944.0, "step": 1482 }, { "epoch": 0.1886528431497265, "ewc_loss": 0.009674765169620514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.674765351519454e-06, "grad_norm": 10.4392728805542, "learning_rate": 6.282323018228063e-07, "loss": 0.4538, "mean_token_accuracy": 0.8553471565246582, "num_tokens": 56616306.0, "step": 1483 }, { "epoch": 0.188780053428317, "ewc_loss": 0.009704391472041607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.704391231935006e-06, "grad_norm": 10.461405754089355, "learning_rate": 6.286562102585841e-07, "loss": 0.5338, "mean_token_accuracy": 0.8334819078445435, "num_tokens": 56661874.0, "step": 1484 }, { "epoch": 0.18890726370690752, "ewc_loss": 0.009670490399003029, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.670490726421122e-06, "grad_norm": 10.369935035705566, "learning_rate": 6.29080118694362e-07, "loss": 0.4286, "mean_token_accuracy": 0.8597143292427063, "num_tokens": 56696697.0, "step": 1485 }, { "epoch": 0.18903447398549802, "ewc_loss": 0.009687365032732487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.687364581623115e-06, "grad_norm": 10.46349048614502, "learning_rate": 6.295040271301398e-07, "loss": 0.428, "mean_token_accuracy": 0.8598521947860718, "num_tokens": 56732581.0, "step": 1486 }, { "epoch": 0.18916168426408853, "ewc_loss": 0.00970062892884016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.700628652353771e-06, "grad_norm": 10.418600082397461, "learning_rate": 6.299279355659178e-07, "loss": 0.4777, "mean_token_accuracy": 0.8480672240257263, "num_tokens": 56777905.0, "step": 1487 }, { "epoch": 0.18928889454267905, "ewc_loss": 0.009677124209702015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.677124580775853e-06, "grad_norm": 10.410721778869629, "learning_rate": 6.303518440016956e-07, "loss": 0.4972, "mean_token_accuracy": 0.8393299579620361, "num_tokens": 56812915.0, "step": 1488 }, { "epoch": 0.18941610482126955, "ewc_loss": 0.009714906103909016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.714905900182202e-06, "grad_norm": 10.47385025024414, "learning_rate": 6.307757524374735e-07, "loss": 0.4923, "mean_token_accuracy": 0.845255970954895, "num_tokens": 56846907.0, "step": 1489 }, { "epoch": 0.18954331509986008, "ewc_loss": 0.009711180813610554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.71118060988374e-06, "grad_norm": 10.426136016845703, "learning_rate": 6.311996608732514e-07, "loss": 0.52, "mean_token_accuracy": 0.837628960609436, "num_tokens": 56884614.0, "step": 1490 }, { "epoch": 0.18967052537845058, "ewc_loss": 0.009719197638332844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.719197805679869e-06, "grad_norm": 10.479096412658691, "learning_rate": 6.316235693090292e-07, "loss": 0.5116, "mean_token_accuracy": 0.8346410989761353, "num_tokens": 56920508.0, "step": 1491 }, { "epoch": 0.18979773565704108, "ewc_loss": 0.009753457270562649, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.753457561600953e-06, "grad_norm": 10.456979751586914, "learning_rate": 6.320474777448071e-07, "loss": 0.5147, "mean_token_accuracy": 0.8342057466506958, "num_tokens": 56957629.0, "step": 1492 }, { "epoch": 0.1899249459356316, "ewc_loss": 0.009710789658129215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.710789527161978e-06, "grad_norm": 10.385331153869629, "learning_rate": 6.324713861805849e-07, "loss": 0.4726, "mean_token_accuracy": 0.8461419343948364, "num_tokens": 56997265.0, "step": 1493 }, { "epoch": 0.1900521562142221, "ewc_loss": 0.009762289933860302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.76228966464987e-06, "grad_norm": 10.532485961914062, "learning_rate": 6.328952946163628e-07, "loss": 0.5758, "mean_token_accuracy": 0.8211225271224976, "num_tokens": 57031519.0, "step": 1494 }, { "epoch": 0.1901793664928126, "ewc_loss": 0.009769456461071968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.769456482899841e-06, "grad_norm": 10.405298233032227, "learning_rate": 6.333192030521407e-07, "loss": 0.4462, "mean_token_accuracy": 0.8515660166740417, "num_tokens": 57067264.0, "step": 1495 }, { "epoch": 0.19030657677140314, "ewc_loss": 0.009761026129126549, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.761026376509108e-06, "grad_norm": 10.434383392333984, "learning_rate": 6.337431114879186e-07, "loss": 0.5007, "mean_token_accuracy": 0.8402388095855713, "num_tokens": 57104906.0, "step": 1496 }, { "epoch": 0.19043378704999364, "ewc_loss": 0.009780822321772575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.780822438187897e-06, "grad_norm": 10.414787292480469, "learning_rate": 6.341670199236965e-07, "loss": 0.4663, "mean_token_accuracy": 0.8504252433776855, "num_tokens": 57142534.0, "step": 1497 }, { "epoch": 0.19056099732858414, "ewc_loss": 0.009772324934601784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.772325029189233e-06, "grad_norm": 10.396516799926758, "learning_rate": 6.345909283594744e-07, "loss": 0.4484, "mean_token_accuracy": 0.8552085757255554, "num_tokens": 57188735.0, "step": 1498 }, { "epoch": 0.19068820760717467, "ewc_loss": 0.009815930388867855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.815930752665736e-06, "grad_norm": 10.459111213684082, "learning_rate": 6.350148367952522e-07, "loss": 0.5006, "mean_token_accuracy": 0.8337381482124329, "num_tokens": 57224610.0, "step": 1499 }, { "epoch": 0.19081541788576517, "ewc_loss": 0.00983725767582655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.83725749392761e-06, "grad_norm": 10.473698616027832, "learning_rate": 6.354387452310301e-07, "loss": 0.4553, "mean_token_accuracy": 0.8496204614639282, "num_tokens": 57259553.0, "step": 1500 }, { "epoch": 0.19094262816435567, "ewc_loss": 0.009828020818531513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.828020665736403e-06, "grad_norm": 10.470232009887695, "learning_rate": 6.358626536668079e-07, "loss": 0.4596, "mean_token_accuracy": 0.8525118827819824, "num_tokens": 57295819.0, "step": 1501 }, { "epoch": 0.1910698384429462, "ewc_loss": 0.009832970798015594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.832971045398153e-06, "grad_norm": 10.423311233520508, "learning_rate": 6.362865621025858e-07, "loss": 0.4474, "mean_token_accuracy": 0.857118546962738, "num_tokens": 57339110.0, "step": 1502 }, { "epoch": 0.1911970487215367, "ewc_loss": 0.00983116403222084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.83116387942573e-06, "grad_norm": 10.46788215637207, "learning_rate": 6.367104705383637e-07, "loss": 0.4766, "mean_token_accuracy": 0.8464949131011963, "num_tokens": 57377311.0, "step": 1503 }, { "epoch": 0.1913242590001272, "ewc_loss": 0.009856725111603737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.856725228019059e-06, "grad_norm": 10.465092658996582, "learning_rate": 6.371343789741416e-07, "loss": 0.489, "mean_token_accuracy": 0.8393208384513855, "num_tokens": 57419281.0, "step": 1504 }, { "epoch": 0.19145146927871773, "ewc_loss": 0.009852669201791286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.852668881649151e-06, "grad_norm": 10.530004501342773, "learning_rate": 6.375582874099195e-07, "loss": 0.4764, "mean_token_accuracy": 0.8476353883743286, "num_tokens": 57458134.0, "step": 1505 }, { "epoch": 0.19157867955730823, "ewc_loss": 0.009851661510765553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.851661161519587e-06, "grad_norm": 10.4677734375, "learning_rate": 6.379821958456974e-07, "loss": 0.531, "mean_token_accuracy": 0.830483615398407, "num_tokens": 57492931.0, "step": 1506 }, { "epoch": 0.19170588983589873, "ewc_loss": 0.009849880822002888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.849880370893516e-06, "grad_norm": 10.526869773864746, "learning_rate": 6.384061042814751e-07, "loss": 0.5015, "mean_token_accuracy": 0.838324785232544, "num_tokens": 57529581.0, "step": 1507 }, { "epoch": 0.19183310011448926, "ewc_loss": 0.009870612062513828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.87061230262043e-06, "grad_norm": 10.456395149230957, "learning_rate": 6.38830012717253e-07, "loss": 0.4892, "mean_token_accuracy": 0.8436779379844666, "num_tokens": 57570302.0, "step": 1508 }, { "epoch": 0.19196031039307976, "ewc_loss": 0.009832954034209251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.83295376499882e-06, "grad_norm": 10.440227508544922, "learning_rate": 6.392539211530309e-07, "loss": 0.5012, "mean_token_accuracy": 0.8373692035675049, "num_tokens": 57615974.0, "step": 1509 }, { "epoch": 0.19208752067167026, "ewc_loss": 0.009879267774522305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.879267963697203e-06, "grad_norm": 10.465947151184082, "learning_rate": 6.396778295888087e-07, "loss": 0.4657, "mean_token_accuracy": 0.8520839810371399, "num_tokens": 57653737.0, "step": 1510 }, { "epoch": 0.1922147309502608, "ewc_loss": 0.00986715592443943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.867156222753692e-06, "grad_norm": 10.442893981933594, "learning_rate": 6.401017380245867e-07, "loss": 0.4991, "mean_token_accuracy": 0.8386801481246948, "num_tokens": 57693431.0, "step": 1511 }, { "epoch": 0.1923419412288513, "ewc_loss": 0.009872734546661377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.872734153759666e-06, "grad_norm": 10.420129776000977, "learning_rate": 6.405256464603645e-07, "loss": 0.4836, "mean_token_accuracy": 0.8428902626037598, "num_tokens": 57736413.0, "step": 1512 }, { "epoch": 0.1924691515074418, "ewc_loss": 0.00988683570176363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.886835869110655e-06, "grad_norm": 10.528295516967773, "learning_rate": 6.409495548961425e-07, "loss": 0.4679, "mean_token_accuracy": 0.8514495491981506, "num_tokens": 57773293.0, "step": 1513 }, { "epoch": 0.19259636178603232, "ewc_loss": 0.009922290220856667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.922289791575167e-06, "grad_norm": 10.531208038330078, "learning_rate": 6.413734633319203e-07, "loss": 0.4807, "mean_token_accuracy": 0.8463356494903564, "num_tokens": 57809294.0, "step": 1514 }, { "epoch": 0.19272357206462282, "ewc_loss": 0.009892095811665058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.892095476971008e-06, "grad_norm": 10.461431503295898, "learning_rate": 6.417973717676981e-07, "loss": 0.4514, "mean_token_accuracy": 0.8527944087982178, "num_tokens": 57843610.0, "step": 1515 }, { "epoch": 0.19285078234321335, "ewc_loss": 0.009886318817734718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.886319276120048e-06, "grad_norm": 10.463729858398438, "learning_rate": 6.42221280203476e-07, "loss": 0.5119, "mean_token_accuracy": 0.8335180282592773, "num_tokens": 57893098.0, "step": 1516 }, { "epoch": 0.19297799262180385, "ewc_loss": 0.009934461675584316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.934461559168994e-06, "grad_norm": 10.548332214355469, "learning_rate": 6.426451886392539e-07, "loss": 0.5385, "mean_token_accuracy": 0.8325604200363159, "num_tokens": 57933098.0, "step": 1517 }, { "epoch": 0.19310520290039435, "ewc_loss": 0.009954721666872501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.954721463145688e-06, "grad_norm": 10.549966812133789, "learning_rate": 6.430690970750317e-07, "loss": 0.4702, "mean_token_accuracy": 0.8483806848526001, "num_tokens": 57971502.0, "step": 1518 }, { "epoch": 0.19323241317898487, "ewc_loss": 0.009936273097991943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.936273272614926e-06, "grad_norm": 10.563044548034668, "learning_rate": 6.434930055108097e-07, "loss": 0.5202, "mean_token_accuracy": 0.8331229090690613, "num_tokens": 58001557.0, "step": 1519 }, { "epoch": 0.19335962345757537, "ewc_loss": 0.00995917059481144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.959170711226761e-06, "grad_norm": 10.548542022705078, "learning_rate": 6.439169139465875e-07, "loss": 0.5288, "mean_token_accuracy": 0.8313899636268616, "num_tokens": 58038058.0, "step": 1520 }, { "epoch": 0.19348683373616588, "ewc_loss": 0.009966197423636913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.966197467292659e-06, "grad_norm": 10.523962020874023, "learning_rate": 6.443408223823655e-07, "loss": 0.4965, "mean_token_accuracy": 0.8398059010505676, "num_tokens": 58073138.0, "step": 1521 }, { "epoch": 0.1936140440147564, "ewc_loss": 0.009975312277674675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.975312423193827e-06, "grad_norm": 10.548715591430664, "learning_rate": 6.447647308181432e-07, "loss": 0.4877, "mean_token_accuracy": 0.8421800136566162, "num_tokens": 58111793.0, "step": 1522 }, { "epoch": 0.1937412542933469, "ewc_loss": 0.010014223866164684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.001422424451448e-05, "grad_norm": 10.507556915283203, "learning_rate": 6.451886392539211e-07, "loss": 0.4759, "mean_token_accuracy": 0.8458437919616699, "num_tokens": 58152911.0, "step": 1523 }, { "epoch": 0.1938684645719374, "ewc_loss": 0.009993652813136578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 9.993652383855078e-06, "grad_norm": 10.571436882019043, "learning_rate": 6.45612547689699e-07, "loss": 0.4939, "mean_token_accuracy": 0.8422895669937134, "num_tokens": 58192095.0, "step": 1524 }, { "epoch": 0.19399567485052793, "ewc_loss": 0.010018757544457912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0018757166108117e-05, "grad_norm": 10.513009071350098, "learning_rate": 6.460364561254769e-07, "loss": 0.4486, "mean_token_accuracy": 0.8541529178619385, "num_tokens": 58232651.0, "step": 1525 }, { "epoch": 0.19412288512911843, "ewc_loss": 0.010022195056080818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0022195056080818e-05, "grad_norm": 10.560256958007812, "learning_rate": 6.464603645612547e-07, "loss": 0.4459, "mean_token_accuracy": 0.856299102306366, "num_tokens": 58270802.0, "step": 1526 }, { "epoch": 0.19425009540770893, "ewc_loss": 0.010050644166767597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0050644050352275e-05, "grad_norm": 10.551065444946289, "learning_rate": 6.468842729970327e-07, "loss": 0.4779, "mean_token_accuracy": 0.8491832613945007, "num_tokens": 58313179.0, "step": 1527 }, { "epoch": 0.19437730568629946, "ewc_loss": 0.010028469376266003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.002846966002835e-05, "grad_norm": 10.554027557373047, "learning_rate": 6.473081814328105e-07, "loss": 0.5175, "mean_token_accuracy": 0.8391242623329163, "num_tokens": 58354728.0, "step": 1528 }, { "epoch": 0.19450451596488996, "ewc_loss": 0.010047204792499542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0047205250884872e-05, "grad_norm": 10.595659255981445, "learning_rate": 6.477320898685885e-07, "loss": 0.5017, "mean_token_accuracy": 0.8415570259094238, "num_tokens": 58398649.0, "step": 1529 }, { "epoch": 0.19463172624348046, "ewc_loss": 0.010038510896265507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0038510481535923e-05, "grad_norm": 10.553763389587402, "learning_rate": 6.481559983043662e-07, "loss": 0.5126, "mean_token_accuracy": 0.8353432416915894, "num_tokens": 58438261.0, "step": 1530 }, { "epoch": 0.194758936522071, "ewc_loss": 0.010042245499789715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0042245776276104e-05, "grad_norm": 10.577752113342285, "learning_rate": 6.48579906740144e-07, "loss": 0.4571, "mean_token_accuracy": 0.8513373136520386, "num_tokens": 58479141.0, "step": 1531 }, { "epoch": 0.1948861468006615, "ewc_loss": 0.010053133592009544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0053133337351028e-05, "grad_norm": 10.561805725097656, "learning_rate": 6.49003815175922e-07, "loss": 0.4254, "mean_token_accuracy": 0.8629391193389893, "num_tokens": 58515700.0, "step": 1532 }, { "epoch": 0.195013357079252, "ewc_loss": 0.010049979202449322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.004997920972528e-05, "grad_norm": 10.622739791870117, "learning_rate": 6.494277236116998e-07, "loss": 0.4319, "mean_token_accuracy": 0.8615601062774658, "num_tokens": 58549825.0, "step": 1533 }, { "epoch": 0.19514056735784252, "ewc_loss": 0.010095575824379921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0095575817103963e-05, "grad_norm": 10.56966781616211, "learning_rate": 6.498516320474777e-07, "loss": 0.5719, "mean_token_accuracy": 0.8219863176345825, "num_tokens": 58596110.0, "step": 1534 }, { "epoch": 0.19526777763643302, "ewc_loss": 0.010071350261569023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.007135051622754e-05, "grad_norm": 10.604430198669434, "learning_rate": 6.502755404832556e-07, "loss": 0.4822, "mean_token_accuracy": 0.8424047231674194, "num_tokens": 58632410.0, "step": 1535 }, { "epoch": 0.19539498791502352, "ewc_loss": 0.010099509730935097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0099509381689131e-05, "grad_norm": 10.633097648620605, "learning_rate": 6.506994489190335e-07, "loss": 0.5445, "mean_token_accuracy": 0.8313365578651428, "num_tokens": 58671339.0, "step": 1536 }, { "epoch": 0.19552219819361405, "ewc_loss": 0.010071792639791965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0071792530652601e-05, "grad_norm": 10.540936470031738, "learning_rate": 6.511233573548114e-07, "loss": 0.4438, "mean_token_accuracy": 0.8558211922645569, "num_tokens": 58711786.0, "step": 1537 }, { "epoch": 0.19564940847220455, "ewc_loss": 0.010060169734060764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0060170097858645e-05, "grad_norm": 10.63624095916748, "learning_rate": 6.515472657905892e-07, "loss": 0.487, "mean_token_accuracy": 0.8437939286231995, "num_tokens": 58743600.0, "step": 1538 }, { "epoch": 0.19577661875079505, "ewc_loss": 0.010107043199241161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0107043635798618e-05, "grad_norm": 10.595001220703125, "learning_rate": 6.51971174226367e-07, "loss": 0.5442, "mean_token_accuracy": 0.8271766901016235, "num_tokens": 58780622.0, "step": 1539 }, { "epoch": 0.19590382902938558, "ewc_loss": 0.010079790838062763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0079790627059992e-05, "grad_norm": 10.556588172912598, "learning_rate": 6.52395082662145e-07, "loss": 0.5372, "mean_token_accuracy": 0.8288803696632385, "num_tokens": 58819370.0, "step": 1540 }, { "epoch": 0.19603103930797608, "ewc_loss": 0.010107770562171936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0107770322065335e-05, "grad_norm": 10.579480171203613, "learning_rate": 6.528189910979228e-07, "loss": 0.4606, "mean_token_accuracy": 0.8513529896736145, "num_tokens": 58856270.0, "step": 1541 }, { "epoch": 0.1961582495865666, "ewc_loss": 0.010129082016646862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0129081601917278e-05, "grad_norm": 10.561535835266113, "learning_rate": 6.532428995337007e-07, "loss": 0.478, "mean_token_accuracy": 0.8459364771842957, "num_tokens": 58898702.0, "step": 1542 }, { "epoch": 0.1962854598651571, "ewc_loss": 0.010116676799952984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0116677003679797e-05, "grad_norm": 10.600628852844238, "learning_rate": 6.536668079694786e-07, "loss": 0.4993, "mean_token_accuracy": 0.8372145891189575, "num_tokens": 58934602.0, "step": 1543 }, { "epoch": 0.1964126701437476, "ewc_loss": 0.010147777386009693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0147777175006922e-05, "grad_norm": 10.587449073791504, "learning_rate": 6.540907164052565e-07, "loss": 0.4854, "mean_token_accuracy": 0.8437312841415405, "num_tokens": 58977766.0, "step": 1544 }, { "epoch": 0.19653988042233814, "ewc_loss": 0.010134386830031872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.013438668451272e-05, "grad_norm": 10.58064079284668, "learning_rate": 6.545146248410343e-07, "loss": 0.4957, "mean_token_accuracy": 0.8401300311088562, "num_tokens": 59019649.0, "step": 1545 }, { "epoch": 0.19666709070092864, "ewc_loss": 0.010159758850932121, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0159758858208079e-05, "grad_norm": 10.638884544372559, "learning_rate": 6.549385332768122e-07, "loss": 0.5076, "mean_token_accuracy": 0.8388210535049438, "num_tokens": 59060543.0, "step": 1546 }, { "epoch": 0.19679430097951914, "ewc_loss": 0.010179350152611732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.017935028357897e-05, "grad_norm": 10.655401229858398, "learning_rate": 6.5536244171259e-07, "loss": 0.5063, "mean_token_accuracy": 0.8364051580429077, "num_tokens": 59100049.0, "step": 1547 }, { "epoch": 0.19692151125810967, "ewc_loss": 0.010165009647607803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0165009371121414e-05, "grad_norm": 10.608292579650879, "learning_rate": 6.55786350148368e-07, "loss": 0.5131, "mean_token_accuracy": 0.8351407051086426, "num_tokens": 59137370.0, "step": 1548 }, { "epoch": 0.19704872153670017, "ewc_loss": 0.01016603596508503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0166036190639716e-05, "grad_norm": 10.640124320983887, "learning_rate": 6.562102585841458e-07, "loss": 0.4659, "mean_token_accuracy": 0.8450132608413696, "num_tokens": 59176196.0, "step": 1549 }, { "epoch": 0.19717593181529067, "ewc_loss": 0.01017923653125763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0179236596741248e-05, "grad_norm": 10.640838623046875, "learning_rate": 6.566341670199236e-07, "loss": 0.464, "mean_token_accuracy": 0.8514302968978882, "num_tokens": 59216215.0, "step": 1550 }, { "epoch": 0.1973031420938812, "ewc_loss": 0.010176432318985462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0176432624575682e-05, "grad_norm": 10.71075439453125, "learning_rate": 6.570580754557016e-07, "loss": 0.4806, "mean_token_accuracy": 0.8454367518424988, "num_tokens": 59254173.0, "step": 1551 }, { "epoch": 0.1974303523724717, "ewc_loss": 0.010192770510911942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0192770787398331e-05, "grad_norm": 10.61838436126709, "learning_rate": 6.574819838914794e-07, "loss": 0.4521, "mean_token_accuracy": 0.853335976600647, "num_tokens": 59288496.0, "step": 1552 }, { "epoch": 0.1975575626510622, "ewc_loss": 0.010197729803621769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0197730262007099e-05, "grad_norm": 10.710439682006836, "learning_rate": 6.579058923272573e-07, "loss": 0.5115, "mean_token_accuracy": 0.8350812196731567, "num_tokens": 59329043.0, "step": 1553 }, { "epoch": 0.19768477292965272, "ewc_loss": 0.010206827893853188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0206827937508933e-05, "grad_norm": 10.69878101348877, "learning_rate": 6.583298007630351e-07, "loss": 0.5023, "mean_token_accuracy": 0.8405630588531494, "num_tokens": 59360743.0, "step": 1554 }, { "epoch": 0.19781198320824323, "ewc_loss": 0.010199015028774738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0199015378020704e-05, "grad_norm": 10.644542694091797, "learning_rate": 6.58753709198813e-07, "loss": 0.4983, "mean_token_accuracy": 0.8393670320510864, "num_tokens": 59404167.0, "step": 1555 }, { "epoch": 0.19793919348683373, "ewc_loss": 0.010212994180619717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0212994311586954e-05, "grad_norm": 10.686399459838867, "learning_rate": 6.591776176345909e-07, "loss": 0.4662, "mean_token_accuracy": 0.8462942838668823, "num_tokens": 59447770.0, "step": 1556 }, { "epoch": 0.19806640376542425, "ewc_loss": 0.010214336216449738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0214335816272069e-05, "grad_norm": 10.606297492980957, "learning_rate": 6.596015260703688e-07, "loss": 0.4753, "mean_token_accuracy": 0.8507354259490967, "num_tokens": 59494989.0, "step": 1557 }, { "epoch": 0.19819361404401475, "ewc_loss": 0.010208405554294586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.020840591081651e-05, "grad_norm": 10.70700740814209, "learning_rate": 6.600254345061466e-07, "loss": 0.4626, "mean_token_accuracy": 0.8514148592948914, "num_tokens": 59534926.0, "step": 1558 }, { "epoch": 0.19832082432260525, "ewc_loss": 0.010246401652693748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0246401870972477e-05, "grad_norm": 10.701923370361328, "learning_rate": 6.604493429419246e-07, "loss": 0.5177, "mean_token_accuracy": 0.8358314633369446, "num_tokens": 59577832.0, "step": 1559 }, { "epoch": 0.19844803460119578, "ewc_loss": 0.010213026776909828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0213027053396218e-05, "grad_norm": 10.655926704406738, "learning_rate": 6.608732513777023e-07, "loss": 0.5252, "mean_token_accuracy": 0.8395428657531738, "num_tokens": 59624995.0, "step": 1560 }, { "epoch": 0.19857524487978628, "ewc_loss": 0.010240662842988968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.024066295940429e-05, "grad_norm": 10.71573257446289, "learning_rate": 6.612971598134803e-07, "loss": 0.4781, "mean_token_accuracy": 0.8451354503631592, "num_tokens": 59663000.0, "step": 1561 }, { "epoch": 0.19870245515837678, "ewc_loss": 0.01023017056286335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0230171028524637e-05, "grad_norm": 10.70605182647705, "learning_rate": 6.617210682492581e-07, "loss": 0.5331, "mean_token_accuracy": 0.8303512930870056, "num_tokens": 59700241.0, "step": 1562 }, { "epoch": 0.1988296654369673, "ewc_loss": 0.010224349796772003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.022435026243329e-05, "grad_norm": 10.71159553527832, "learning_rate": 6.62144976685036e-07, "loss": 0.4512, "mean_token_accuracy": 0.8564144372940063, "num_tokens": 59737244.0, "step": 1563 }, { "epoch": 0.1989568757155578, "ewc_loss": 0.010236386209726334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0236386515316553e-05, "grad_norm": 10.702031135559082, "learning_rate": 6.625688851208139e-07, "loss": 0.5011, "mean_token_accuracy": 0.8404824733734131, "num_tokens": 59775538.0, "step": 1564 }, { "epoch": 0.19908408599414834, "ewc_loss": 0.010234897024929523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.023489676299505e-05, "grad_norm": 10.691096305847168, "learning_rate": 6.629927935565918e-07, "loss": 0.4954, "mean_token_accuracy": 0.8387423753738403, "num_tokens": 59810373.0, "step": 1565 }, { "epoch": 0.19921129627273884, "ewc_loss": 0.010251933708786964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.025193341774866e-05, "grad_norm": 10.708268165588379, "learning_rate": 6.634167019923696e-07, "loss": 0.527, "mean_token_accuracy": 0.835247278213501, "num_tokens": 59849992.0, "step": 1566 }, { "epoch": 0.19933850655132934, "ewc_loss": 0.010271304287016392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.027130383590702e-05, "grad_norm": 10.727422714233398, "learning_rate": 6.638406104281476e-07, "loss": 0.4881, "mean_token_accuracy": 0.8426077365875244, "num_tokens": 59882552.0, "step": 1567 }, { "epoch": 0.19946571682991987, "ewc_loss": 0.01027683261781931, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0276832654199097e-05, "grad_norm": 10.716983795166016, "learning_rate": 6.642645188639253e-07, "loss": 0.4635, "mean_token_accuracy": 0.8486323356628418, "num_tokens": 59925090.0, "step": 1568 }, { "epoch": 0.19959292710851037, "ewc_loss": 0.010290742851793766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0290742466168012e-05, "grad_norm": 10.67456340789795, "learning_rate": 6.646884272997032e-07, "loss": 0.5123, "mean_token_accuracy": 0.8363911509513855, "num_tokens": 59965406.0, "step": 1569 }, { "epoch": 0.19972013738710087, "ewc_loss": 0.010284199379384518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0284199561283458e-05, "grad_norm": 10.67335033416748, "learning_rate": 6.651123357354811e-07, "loss": 0.4989, "mean_token_accuracy": 0.8397740721702576, "num_tokens": 60001770.0, "step": 1570 }, { "epoch": 0.1998473476656914, "ewc_loss": 0.010306105948984623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.030610565067036e-05, "grad_norm": 10.72637939453125, "learning_rate": 6.655362441712589e-07, "loss": 0.5148, "mean_token_accuracy": 0.8330581188201904, "num_tokens": 60036911.0, "step": 1571 }, { "epoch": 0.1999745579442819, "ewc_loss": 0.01033046655356884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0330466466257349e-05, "grad_norm": 10.678305625915527, "learning_rate": 6.659601526070369e-07, "loss": 0.5431, "mean_token_accuracy": 0.8308118581771851, "num_tokens": 60080210.0, "step": 1572 }, { "epoch": 0.2001017682228724, "ewc_loss": 0.010309294797480106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0309294339094777e-05, "grad_norm": 10.718464851379395, "learning_rate": 6.663840610428147e-07, "loss": 0.5777, "mean_token_accuracy": 0.8226875066757202, "num_tokens": 60111533.0, "step": 1573 }, { "epoch": 0.20022897850146293, "ewc_loss": 0.01037693489342928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0376935279055033e-05, "grad_norm": 10.710732460021973, "learning_rate": 6.668079694785926e-07, "loss": 0.4534, "mean_token_accuracy": 0.8544647693634033, "num_tokens": 60150733.0, "step": 1574 }, { "epoch": 0.20035618878005343, "ewc_loss": 0.010357153601944447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0357153769291472e-05, "grad_norm": 10.733989715576172, "learning_rate": 6.672318779143704e-07, "loss": 0.4734, "mean_token_accuracy": 0.8483583331108093, "num_tokens": 60185034.0, "step": 1575 }, { "epoch": 0.20048339905864393, "ewc_loss": 0.010398836806416512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0398836820968427e-05, "grad_norm": 10.76487922668457, "learning_rate": 6.676557863501483e-07, "loss": 0.5051, "mean_token_accuracy": 0.838955819606781, "num_tokens": 60216865.0, "step": 1576 }, { "epoch": 0.20061060933723446, "ewc_loss": 0.010396186262369156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.039618655340746e-05, "grad_norm": 10.710731506347656, "learning_rate": 6.680796947859262e-07, "loss": 0.4936, "mean_token_accuracy": 0.8408787250518799, "num_tokens": 60257685.0, "step": 1577 }, { "epoch": 0.20073781961582496, "ewc_loss": 0.010414852760732174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0414853022666648e-05, "grad_norm": 10.73580265045166, "learning_rate": 6.685036032217041e-07, "loss": 0.4878, "mean_token_accuracy": 0.8441879749298096, "num_tokens": 60294258.0, "step": 1578 }, { "epoch": 0.20086502989441546, "ewc_loss": 0.010417630895972252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0417630619485863e-05, "grad_norm": 10.730090141296387, "learning_rate": 6.689275116574819e-07, "loss": 0.4777, "mean_token_accuracy": 0.8493698239326477, "num_tokens": 60330319.0, "step": 1579 }, { "epoch": 0.200992240173006, "ewc_loss": 0.010432911105453968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.043291103997035e-05, "grad_norm": 10.707188606262207, "learning_rate": 6.693514200932599e-07, "loss": 0.4949, "mean_token_accuracy": 0.8443465232849121, "num_tokens": 60370482.0, "step": 1580 }, { "epoch": 0.2011194504515965, "ewc_loss": 0.010460161603987217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.046016132022487e-05, "grad_norm": 10.761861801147461, "learning_rate": 6.697753285290377e-07, "loss": 0.5002, "mean_token_accuracy": 0.8392459750175476, "num_tokens": 60412436.0, "step": 1581 }, { "epoch": 0.201246660730187, "ewc_loss": 0.010484529659152031, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0484529411769472e-05, "grad_norm": 10.772846221923828, "learning_rate": 6.701992369648156e-07, "loss": 0.4232, "mean_token_accuracy": 0.8644047975540161, "num_tokens": 60448982.0, "step": 1582 }, { "epoch": 0.20137387100877752, "ewc_loss": 0.010459735058248043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0459734767209738e-05, "grad_norm": 10.727388381958008, "learning_rate": 6.706231454005934e-07, "loss": 0.4411, "mean_token_accuracy": 0.8557104468345642, "num_tokens": 60489878.0, "step": 1583 }, { "epoch": 0.20150108128736802, "ewc_loss": 0.010464680381119251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.046468059939798e-05, "grad_norm": 10.744385719299316, "learning_rate": 6.710470538363713e-07, "loss": 0.4765, "mean_token_accuracy": 0.8499553203582764, "num_tokens": 60528979.0, "step": 1584 }, { "epoch": 0.20162829156595852, "ewc_loss": 0.010495131835341454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0495131391508039e-05, "grad_norm": 10.820680618286133, "learning_rate": 6.714709622721492e-07, "loss": 0.5043, "mean_token_accuracy": 0.8376134037971497, "num_tokens": 60561670.0, "step": 1585 }, { "epoch": 0.20175550184454905, "ewc_loss": 0.010503873229026794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.050387345458148e-05, "grad_norm": 10.770384788513184, "learning_rate": 6.718948707079271e-07, "loss": 0.4795, "mean_token_accuracy": 0.844946563243866, "num_tokens": 60602061.0, "step": 1586 }, { "epoch": 0.20188271212313955, "ewc_loss": 0.010468259453773499, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0468259461049456e-05, "grad_norm": 10.743966102600098, "learning_rate": 6.723187791437049e-07, "loss": 0.4822, "mean_token_accuracy": 0.8452428579330444, "num_tokens": 60639805.0, "step": 1587 }, { "epoch": 0.20200992240173005, "ewc_loss": 0.010504351928830147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0504351848794613e-05, "grad_norm": 10.798186302185059, "learning_rate": 6.727426875794829e-07, "loss": 0.5088, "mean_token_accuracy": 0.8375815153121948, "num_tokens": 60681816.0, "step": 1588 }, { "epoch": 0.20213713268032057, "ewc_loss": 0.010512259788811207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0512259905226529e-05, "grad_norm": 10.740699768066406, "learning_rate": 6.731665960152607e-07, "loss": 0.4933, "mean_token_accuracy": 0.8393352627754211, "num_tokens": 60726343.0, "step": 1589 }, { "epoch": 0.20226434295891108, "ewc_loss": 0.010477829724550247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0477830073796213e-05, "grad_norm": 10.793167114257812, "learning_rate": 6.735905044510385e-07, "loss": 0.5376, "mean_token_accuracy": 0.8233746886253357, "num_tokens": 60764814.0, "step": 1590 }, { "epoch": 0.2023915532375016, "ewc_loss": 0.010513152927160263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.051315302902367e-05, "grad_norm": 10.835600852966309, "learning_rate": 6.740144128868164e-07, "loss": 0.5047, "mean_token_accuracy": 0.83934485912323, "num_tokens": 60802384.0, "step": 1591 }, { "epoch": 0.2025187635160921, "ewc_loss": 0.01050103735178709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0501037650101352e-05, "grad_norm": 10.796347618103027, "learning_rate": 6.744383213225942e-07, "loss": 0.5312, "mean_token_accuracy": 0.8298988342285156, "num_tokens": 60842085.0, "step": 1592 }, { "epoch": 0.2026459737946826, "ewc_loss": 0.01049982849508524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0499828931642696e-05, "grad_norm": 10.798352241516113, "learning_rate": 6.748622297583722e-07, "loss": 0.466, "mean_token_accuracy": 0.8455414772033691, "num_tokens": 60874835.0, "step": 1593 }, { "epoch": 0.20277318407327313, "ewc_loss": 0.010516496375203133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0516496331547387e-05, "grad_norm": 10.79662036895752, "learning_rate": 6.7528613819415e-07, "loss": 0.4934, "mean_token_accuracy": 0.8399492502212524, "num_tokens": 60910682.0, "step": 1594 }, { "epoch": 0.20290039435186363, "ewc_loss": 0.010517836548388004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.05178369267378e-05, "grad_norm": 10.828742027282715, "learning_rate": 6.757100466299279e-07, "loss": 0.4892, "mean_token_accuracy": 0.8456935286521912, "num_tokens": 60951352.0, "step": 1595 }, { "epoch": 0.20302760463045413, "ewc_loss": 0.010506897233426571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0506897524464875e-05, "grad_norm": 10.859352111816406, "learning_rate": 6.761339550657058e-07, "loss": 0.5173, "mean_token_accuracy": 0.8313466310501099, "num_tokens": 60979055.0, "step": 1596 }, { "epoch": 0.20315481490904466, "ewc_loss": 0.010538625530898571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0538625247136224e-05, "grad_norm": 10.79887580871582, "learning_rate": 6.765578635014837e-07, "loss": 0.4958, "mean_token_accuracy": 0.8437532186508179, "num_tokens": 61017437.0, "step": 1597 }, { "epoch": 0.20328202518763516, "ewc_loss": 0.010523230768740177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0523231139814015e-05, "grad_norm": 10.759023666381836, "learning_rate": 6.769817719372614e-07, "loss": 0.4715, "mean_token_accuracy": 0.8484761118888855, "num_tokens": 61052286.0, "step": 1598 }, { "epoch": 0.20340923546622566, "ewc_loss": 0.01056582573801279, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0565825505182147e-05, "grad_norm": 10.833123207092285, "learning_rate": 6.774056803730394e-07, "loss": 0.5166, "mean_token_accuracy": 0.8333941698074341, "num_tokens": 61086110.0, "step": 1599 }, { "epoch": 0.2035364457448162, "ewc_loss": 0.010591364465653896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0591364116407931e-05, "grad_norm": 10.780293464660645, "learning_rate": 6.778295888088172e-07, "loss": 0.4883, "mean_token_accuracy": 0.8429431915283203, "num_tokens": 61124181.0, "step": 1600 }, { "epoch": 0.2036636560234067, "ewc_loss": 0.010558906011283398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0558906069491059e-05, "grad_norm": 10.844965934753418, "learning_rate": 6.782534972445952e-07, "loss": 0.5087, "mean_token_accuracy": 0.8354562520980835, "num_tokens": 61160881.0, "step": 1601 }, { "epoch": 0.2037908663019972, "ewc_loss": 0.010624459013342857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0624458809616044e-05, "grad_norm": 10.782631874084473, "learning_rate": 6.78677405680373e-07, "loss": 0.5065, "mean_token_accuracy": 0.8400034308433533, "num_tokens": 61200755.0, "step": 1602 }, { "epoch": 0.20391807658058772, "ewc_loss": 0.010586787015199661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0586786629573908e-05, "grad_norm": 10.784939765930176, "learning_rate": 6.791013141161509e-07, "loss": 0.4971, "mean_token_accuracy": 0.8392208218574524, "num_tokens": 61240045.0, "step": 1603 }, { "epoch": 0.20404528685917822, "ewc_loss": 0.010646743699908257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0646743248798884e-05, "grad_norm": 10.868887901306152, "learning_rate": 6.795252225519288e-07, "loss": 0.5223, "mean_token_accuracy": 0.8314521312713623, "num_tokens": 61282832.0, "step": 1604 }, { "epoch": 0.20417249713776872, "ewc_loss": 0.0106276273727417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0627627489157021e-05, "grad_norm": 10.788911819458008, "learning_rate": 6.799491309877067e-07, "loss": 0.496, "mean_token_accuracy": 0.8379881381988525, "num_tokens": 61319841.0, "step": 1605 }, { "epoch": 0.20429970741635925, "ewc_loss": 0.010622146539390087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0622146874084137e-05, "grad_norm": 10.872540473937988, "learning_rate": 6.803730394234844e-07, "loss": 0.4653, "mean_token_accuracy": 0.8511703610420227, "num_tokens": 61352732.0, "step": 1606 }, { "epoch": 0.20442691769494975, "ewc_loss": 0.010635659098625183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0635659236868378e-05, "grad_norm": 10.8489351272583, "learning_rate": 6.807969478592624e-07, "loss": 0.4982, "mean_token_accuracy": 0.8428376317024231, "num_tokens": 61388324.0, "step": 1607 }, { "epoch": 0.20455412797354025, "ewc_loss": 0.0106503926217556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.06503930510371e-05, "grad_norm": 10.824588775634766, "learning_rate": 6.812208562950402e-07, "loss": 0.5218, "mean_token_accuracy": 0.839627742767334, "num_tokens": 61431222.0, "step": 1608 }, { "epoch": 0.20468133825213078, "ewc_loss": 0.010646563023328781, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0646563168847933e-05, "grad_norm": 10.854026794433594, "learning_rate": 6.816447647308182e-07, "loss": 0.5521, "mean_token_accuracy": 0.8244813680648804, "num_tokens": 61475617.0, "step": 1609 }, { "epoch": 0.20480854853072128, "ewc_loss": 0.010667385533452034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0667385140550323e-05, "grad_norm": 10.868169784545898, "learning_rate": 6.82068673166596e-07, "loss": 0.4781, "mean_token_accuracy": 0.8473519682884216, "num_tokens": 61511338.0, "step": 1610 }, { "epoch": 0.20493575880931178, "ewc_loss": 0.010657425969839096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.065742617356591e-05, "grad_norm": 10.85924243927002, "learning_rate": 6.824925816023738e-07, "loss": 0.5, "mean_token_accuracy": 0.8380210995674133, "num_tokens": 61549093.0, "step": 1611 }, { "epoch": 0.2050629690879023, "ewc_loss": 0.010663449764251709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0663449756975751e-05, "grad_norm": 10.809918403625488, "learning_rate": 6.829164900381518e-07, "loss": 0.4678, "mean_token_accuracy": 0.8489874601364136, "num_tokens": 61593843.0, "step": 1612 }, { "epoch": 0.2051901793664928, "ewc_loss": 0.010661935433745384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.06619354482973e-05, "grad_norm": 10.913220405578613, "learning_rate": 6.833403984739295e-07, "loss": 0.4792, "mean_token_accuracy": 0.8442472219467163, "num_tokens": 61629362.0, "step": 1613 }, { "epoch": 0.2053173896450833, "ewc_loss": 0.010706337168812752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0706336979637854e-05, "grad_norm": 10.93743896484375, "learning_rate": 6.837643069097074e-07, "loss": 0.56, "mean_token_accuracy": 0.8223986625671387, "num_tokens": 61667037.0, "step": 1614 }, { "epoch": 0.20544459992367384, "ewc_loss": 0.010666959919035435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0666959497029893e-05, "grad_norm": 10.804036140441895, "learning_rate": 6.841882153454853e-07, "loss": 0.478, "mean_token_accuracy": 0.8439479470252991, "num_tokens": 61701976.0, "step": 1615 }, { "epoch": 0.20557181020226434, "ewc_loss": 0.010689061135053635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0689061127777677e-05, "grad_norm": 10.889054298400879, "learning_rate": 6.846121237812632e-07, "loss": 0.4465, "mean_token_accuracy": 0.855478048324585, "num_tokens": 61739068.0, "step": 1616 }, { "epoch": 0.20569902048085487, "ewc_loss": 0.010722619481384754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0722619663283695e-05, "grad_norm": 10.831504821777344, "learning_rate": 6.850360322170411e-07, "loss": 0.4615, "mean_token_accuracy": 0.8514084219932556, "num_tokens": 61781515.0, "step": 1617 }, { "epoch": 0.20582623075944537, "ewc_loss": 0.01068645529448986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0686455425457098e-05, "grad_norm": 10.919774055480957, "learning_rate": 6.85459940652819e-07, "loss": 0.4754, "mean_token_accuracy": 0.8435879349708557, "num_tokens": 61815201.0, "step": 1618 }, { "epoch": 0.20595344103803587, "ewc_loss": 0.010736729949712753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0736729564087e-05, "grad_norm": 10.85789680480957, "learning_rate": 6.858838490885968e-07, "loss": 0.5326, "mean_token_accuracy": 0.8288758993148804, "num_tokens": 61856639.0, "step": 1619 }, { "epoch": 0.2060806513166264, "ewc_loss": 0.01072101853787899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0721018952608574e-05, "grad_norm": 10.920888900756836, "learning_rate": 6.863077575243748e-07, "loss": 0.5451, "mean_token_accuracy": 0.8299935460090637, "num_tokens": 61894313.0, "step": 1620 }, { "epoch": 0.2062078615952169, "ewc_loss": 0.010755535215139389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0755535186035559e-05, "grad_norm": 10.858414649963379, "learning_rate": 6.867316659601525e-07, "loss": 0.4806, "mean_token_accuracy": 0.8427442908287048, "num_tokens": 61930748.0, "step": 1621 }, { "epoch": 0.2063350718738074, "ewc_loss": 0.010726111009716988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.07261112134438e-05, "grad_norm": 10.830870628356934, "learning_rate": 6.871555743959304e-07, "loss": 0.4953, "mean_token_accuracy": 0.842211127281189, "num_tokens": 61972002.0, "step": 1622 }, { "epoch": 0.20646228215239792, "ewc_loss": 0.01075730286538601, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0757303243735805e-05, "grad_norm": 10.886420249938965, "learning_rate": 6.875794828317083e-07, "loss": 0.471, "mean_token_accuracy": 0.8494649529457092, "num_tokens": 62010141.0, "step": 1623 }, { "epoch": 0.20658949243098843, "ewc_loss": 0.010771493427455425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0771493180072866e-05, "grad_norm": 10.823720932006836, "learning_rate": 6.880033912674862e-07, "loss": 0.4596, "mean_token_accuracy": 0.853575587272644, "num_tokens": 62055907.0, "step": 1624 }, { "epoch": 0.20671670270957893, "ewc_loss": 0.01076676044613123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.076676016964484e-05, "grad_norm": 10.895406723022461, "learning_rate": 6.884272997032641e-07, "loss": 0.5663, "mean_token_accuracy": 0.8201919794082642, "num_tokens": 62095885.0, "step": 1625 }, { "epoch": 0.20684391298816945, "ewc_loss": 0.01079208217561245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0792082321131602e-05, "grad_norm": 10.892860412597656, "learning_rate": 6.88851208139042e-07, "loss": 0.4584, "mean_token_accuracy": 0.854236364364624, "num_tokens": 62130497.0, "step": 1626 }, { "epoch": 0.20697112326675995, "ewc_loss": 0.010789618827402592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0789618499984499e-05, "grad_norm": 10.891979217529297, "learning_rate": 6.892751165748198e-07, "loss": 0.5193, "mean_token_accuracy": 0.8333075046539307, "num_tokens": 62170296.0, "step": 1627 }, { "epoch": 0.20709833354535045, "ewc_loss": 0.010799289681017399, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0799290066643152e-05, "grad_norm": 10.90099048614502, "learning_rate": 6.896990250105978e-07, "loss": 0.4552, "mean_token_accuracy": 0.857386589050293, "num_tokens": 62210168.0, "step": 1628 }, { "epoch": 0.20722554382394098, "ewc_loss": 0.010802469216287136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.080246966012055e-05, "grad_norm": 10.91174030303955, "learning_rate": 6.901229334463755e-07, "loss": 0.4815, "mean_token_accuracy": 0.8436627388000488, "num_tokens": 62245251.0, "step": 1629 }, { "epoch": 0.20735275410253148, "ewc_loss": 0.010803982615470886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.08039830593043e-05, "grad_norm": 10.90420150756836, "learning_rate": 6.905468418821534e-07, "loss": 0.4799, "mean_token_accuracy": 0.8497012257575989, "num_tokens": 62286124.0, "step": 1630 }, { "epoch": 0.20747996438112198, "ewc_loss": 0.010802767239511013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.080276706488803e-05, "grad_norm": 10.904279708862305, "learning_rate": 6.909707503179313e-07, "loss": 0.4842, "mean_token_accuracy": 0.8414260745048523, "num_tokens": 62325604.0, "step": 1631 }, { "epoch": 0.2076071746597125, "ewc_loss": 0.010806117206811905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0806117643369362e-05, "grad_norm": 10.913839340209961, "learning_rate": 6.913946587537091e-07, "loss": 0.4802, "mean_token_accuracy": 0.8453749418258667, "num_tokens": 62364467.0, "step": 1632 }, { "epoch": 0.207734384938303, "ewc_loss": 0.010795419104397297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0795419257192407e-05, "grad_norm": 10.917299270629883, "learning_rate": 6.918185671894871e-07, "loss": 0.5533, "mean_token_accuracy": 0.8237698674201965, "num_tokens": 62402075.0, "step": 1633 }, { "epoch": 0.2078615952168935, "ewc_loss": 0.010837082751095295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0837082299985923e-05, "grad_norm": 10.94880199432373, "learning_rate": 6.922424756252649e-07, "loss": 0.5204, "mean_token_accuracy": 0.8404114246368408, "num_tokens": 62437802.0, "step": 1634 }, { "epoch": 0.20798880549548404, "ewc_loss": 0.01081049907952547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0810498679347802e-05, "grad_norm": 10.909064292907715, "learning_rate": 6.926663840610428e-07, "loss": 0.4389, "mean_token_accuracy": 0.85733962059021, "num_tokens": 62476176.0, "step": 1635 }, { "epoch": 0.20811601577407454, "ewc_loss": 0.010831594467163086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0831594408955425e-05, "grad_norm": 10.958365440368652, "learning_rate": 6.930902924968206e-07, "loss": 0.4841, "mean_token_accuracy": 0.8408010005950928, "num_tokens": 62514378.0, "step": 1636 }, { "epoch": 0.20824322605266504, "ewc_loss": 0.010851038619875908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0851038496184628e-05, "grad_norm": 10.915790557861328, "learning_rate": 6.935142009325985e-07, "loss": 0.5063, "mean_token_accuracy": 0.8380424380302429, "num_tokens": 62555302.0, "step": 1637 }, { "epoch": 0.20837043633125557, "ewc_loss": 0.01081835851073265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0818358532560524e-05, "grad_norm": 10.885286331176758, "learning_rate": 6.939381093683764e-07, "loss": 0.5247, "mean_token_accuracy": 0.8326329588890076, "num_tokens": 62594356.0, "step": 1638 }, { "epoch": 0.20849764660984607, "ewc_loss": 0.010854240506887436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0854240827029571e-05, "grad_norm": 10.906695365905762, "learning_rate": 6.943620178041543e-07, "loss": 0.4424, "mean_token_accuracy": 0.8580896258354187, "num_tokens": 62632903.0, "step": 1639 }, { "epoch": 0.2086248568884366, "ewc_loss": 0.010846821591258049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0846821169252507e-05, "grad_norm": 10.92658519744873, "learning_rate": 6.947859262399321e-07, "loss": 0.4929, "mean_token_accuracy": 0.8480898141860962, "num_tokens": 62668979.0, "step": 1640 }, { "epoch": 0.2087520671670271, "ewc_loss": 0.010854760184884071, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0854760148504283e-05, "grad_norm": 10.927142143249512, "learning_rate": 6.952098346757101e-07, "loss": 0.4786, "mean_token_accuracy": 0.8457597494125366, "num_tokens": 62705867.0, "step": 1641 }, { "epoch": 0.2088792774456176, "ewc_loss": 0.010862103663384914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0862103408726398e-05, "grad_norm": 10.959943771362305, "learning_rate": 6.956337431114879e-07, "loss": 0.4753, "mean_token_accuracy": 0.8491199612617493, "num_tokens": 62744559.0, "step": 1642 }, { "epoch": 0.20900648772420813, "ewc_loss": 0.010891460813581944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0891460988204926e-05, "grad_norm": 10.916056632995605, "learning_rate": 6.960576515472658e-07, "loss": 0.5052, "mean_token_accuracy": 0.8390615582466125, "num_tokens": 62785221.0, "step": 1643 }, { "epoch": 0.20913369800279863, "ewc_loss": 0.010865945369005203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0865945114346687e-05, "grad_norm": 10.953765869140625, "learning_rate": 6.964815599830436e-07, "loss": 0.4951, "mean_token_accuracy": 0.8404682874679565, "num_tokens": 62820095.0, "step": 1644 }, { "epoch": 0.20926090828138913, "ewc_loss": 0.010911403223872185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0911403478530701e-05, "grad_norm": 10.931188583374023, "learning_rate": 6.969054684188215e-07, "loss": 0.5461, "mean_token_accuracy": 0.8228410482406616, "num_tokens": 62863414.0, "step": 1645 }, { "epoch": 0.20938811855997966, "ewc_loss": 0.010920397937297821, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0920397471636534e-05, "grad_norm": 10.943268775939941, "learning_rate": 6.973293768545994e-07, "loss": 0.4559, "mean_token_accuracy": 0.853758692741394, "num_tokens": 62904647.0, "step": 1646 }, { "epoch": 0.20951532883857016, "ewc_loss": 0.010905051603913307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0905051567533519e-05, "grad_norm": 10.923727989196777, "learning_rate": 6.977532852903773e-07, "loss": 0.4889, "mean_token_accuracy": 0.8411614894866943, "num_tokens": 62949476.0, "step": 1647 }, { "epoch": 0.20964253911716066, "ewc_loss": 0.010940221138298512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0940220818156376e-05, "grad_norm": 10.972960472106934, "learning_rate": 6.981771937261551e-07, "loss": 0.5426, "mean_token_accuracy": 0.8280373811721802, "num_tokens": 62984784.0, "step": 1648 }, { "epoch": 0.2097697493957512, "ewc_loss": 0.010958748869597912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0958749044220895e-05, "grad_norm": 10.973786354064941, "learning_rate": 6.986011021619331e-07, "loss": 0.4959, "mean_token_accuracy": 0.8416436910629272, "num_tokens": 63024395.0, "step": 1649 }, { "epoch": 0.2098969596743417, "ewc_loss": 0.010928979143500328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0928979463642463e-05, "grad_norm": 10.939373016357422, "learning_rate": 6.990250105977109e-07, "loss": 0.5128, "mean_token_accuracy": 0.8382488489151001, "num_tokens": 63061886.0, "step": 1650 }, { "epoch": 0.2100241699529322, "ewc_loss": 0.010963342152535915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0963341992464848e-05, "grad_norm": 10.997669219970703, "learning_rate": 6.994489190334886e-07, "loss": 0.4808, "mean_token_accuracy": 0.8440515995025635, "num_tokens": 63104150.0, "step": 1651 }, { "epoch": 0.21015138023152272, "ewc_loss": 0.01099314633756876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0993146133841947e-05, "grad_norm": 10.95556926727295, "learning_rate": 6.998728274692666e-07, "loss": 0.4437, "mean_token_accuracy": 0.8566100001335144, "num_tokens": 63144346.0, "step": 1652 }, { "epoch": 0.21027859051011322, "ewc_loss": 0.010967331007122993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0967331036226824e-05, "grad_norm": 10.985786437988281, "learning_rate": 7.002967359050444e-07, "loss": 0.4861, "mean_token_accuracy": 0.849744439125061, "num_tokens": 63180042.0, "step": 1653 }, { "epoch": 0.21040580078870372, "ewc_loss": 0.01099657453596592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.099657492886763e-05, "grad_norm": 10.979500770568848, "learning_rate": 7.007206443408224e-07, "loss": 0.4356, "mean_token_accuracy": 0.857648491859436, "num_tokens": 63216570.0, "step": 1654 }, { "epoch": 0.21053301106729425, "ewc_loss": 0.010991008020937443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0991007911798079e-05, "grad_norm": 11.014620780944824, "learning_rate": 7.011445527766002e-07, "loss": 0.4872, "mean_token_accuracy": 0.8445501327514648, "num_tokens": 63253457.0, "step": 1655 }, { "epoch": 0.21066022134588475, "ewc_loss": 0.010987376794219017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0987377208948601e-05, "grad_norm": 10.958158493041992, "learning_rate": 7.015684612123781e-07, "loss": 0.4725, "mean_token_accuracy": 0.8464361429214478, "num_tokens": 63295430.0, "step": 1656 }, { "epoch": 0.21078743162447525, "ewc_loss": 0.010987496003508568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.0987496352754533e-05, "grad_norm": 10.984831809997559, "learning_rate": 7.01992369648156e-07, "loss": 0.4943, "mean_token_accuracy": 0.8440289497375488, "num_tokens": 63340118.0, "step": 1657 }, { "epoch": 0.21091464190306577, "ewc_loss": 0.011022607795894146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1022607395716477e-05, "grad_norm": 11.07864761352539, "learning_rate": 7.024162780839339e-07, "loss": 0.5256, "mean_token_accuracy": 0.825495719909668, "num_tokens": 63372037.0, "step": 1658 }, { "epoch": 0.21104185218165628, "ewc_loss": 0.011038421653211117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1038421689590905e-05, "grad_norm": 11.009424209594727, "learning_rate": 7.028401865197116e-07, "loss": 0.5165, "mean_token_accuracy": 0.8338913321495056, "num_tokens": 63415508.0, "step": 1659 }, { "epoch": 0.21116906246024678, "ewc_loss": 0.011006597429513931, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1006597560481168e-05, "grad_norm": 10.977277755737305, "learning_rate": 7.032640949554896e-07, "loss": 0.4245, "mean_token_accuracy": 0.8591517806053162, "num_tokens": 63459320.0, "step": 1660 }, { "epoch": 0.2112962727388373, "ewc_loss": 0.011009825393557549, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.100982535717776e-05, "grad_norm": 11.008634567260742, "learning_rate": 7.036880033912674e-07, "loss": 0.5252, "mean_token_accuracy": 0.833853006362915, "num_tokens": 63499624.0, "step": 1661 }, { "epoch": 0.2114234830174278, "ewc_loss": 0.011038192547857761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1038192496926058e-05, "grad_norm": 11.04946231842041, "learning_rate": 7.041119118270454e-07, "loss": 0.4821, "mean_token_accuracy": 0.8439587354660034, "num_tokens": 63536507.0, "step": 1662 }, { "epoch": 0.2115506932960183, "ewc_loss": 0.011032198555767536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1032198926841374e-05, "grad_norm": 11.007936477661133, "learning_rate": 7.045358202628232e-07, "loss": 0.5228, "mean_token_accuracy": 0.8348448276519775, "num_tokens": 63576483.0, "step": 1663 }, { "epoch": 0.21167790357460883, "ewc_loss": 0.011027943342924118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1027943401131779e-05, "grad_norm": 11.034728050231934, "learning_rate": 7.049597286986011e-07, "loss": 0.5122, "mean_token_accuracy": 0.8348782658576965, "num_tokens": 63615568.0, "step": 1664 }, { "epoch": 0.21180511385319933, "ewc_loss": 0.011043349280953407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.104334933188511e-05, "grad_norm": 11.043707847595215, "learning_rate": 7.05383637134379e-07, "loss": 0.4729, "mean_token_accuracy": 0.8470866680145264, "num_tokens": 63650023.0, "step": 1665 }, { "epoch": 0.21193232413178986, "ewc_loss": 0.011045246385037899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1045246537833009e-05, "grad_norm": 10.997713088989258, "learning_rate": 7.058075455701568e-07, "loss": 0.5027, "mean_token_accuracy": 0.8341810703277588, "num_tokens": 63690229.0, "step": 1666 }, { "epoch": 0.21205953441038036, "ewc_loss": 0.011073385365307331, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1073385394411162e-05, "grad_norm": 11.030763626098633, "learning_rate": 7.062314540059346e-07, "loss": 0.4468, "mean_token_accuracy": 0.8583194017410278, "num_tokens": 63728169.0, "step": 1667 }, { "epoch": 0.21218674468897086, "ewc_loss": 0.011065145023167133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1065145372413099e-05, "grad_norm": 11.083873748779297, "learning_rate": 7.066553624417126e-07, "loss": 0.4728, "mean_token_accuracy": 0.8460031747817993, "num_tokens": 63762816.0, "step": 1668 }, { "epoch": 0.2123139549675614, "ewc_loss": 0.011087990365922451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1087990060332231e-05, "grad_norm": 11.045337677001953, "learning_rate": 7.070792708774904e-07, "loss": 0.4943, "mean_token_accuracy": 0.8417240381240845, "num_tokens": 63804288.0, "step": 1669 }, { "epoch": 0.2124411652461519, "ewc_loss": 0.01106687169522047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1066871593357064e-05, "grad_norm": 11.002484321594238, "learning_rate": 7.075031793132684e-07, "loss": 0.515, "mean_token_accuracy": 0.8388233184814453, "num_tokens": 63851365.0, "step": 1670 }, { "epoch": 0.2125683755247424, "ewc_loss": 0.011087965220212936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1087965503975283e-05, "grad_norm": 11.10324478149414, "learning_rate": 7.079270877490462e-07, "loss": 0.4766, "mean_token_accuracy": 0.8490382432937622, "num_tokens": 63886187.0, "step": 1671 }, { "epoch": 0.21269558580333292, "ewc_loss": 0.011124078184366226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.112407790060388e-05, "grad_norm": 11.11866569519043, "learning_rate": 7.08350996184824e-07, "loss": 0.55, "mean_token_accuracy": 0.8255476951599121, "num_tokens": 63927187.0, "step": 1672 }, { "epoch": 0.21282279608192342, "ewc_loss": 0.011087757535278797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1087757229688577e-05, "grad_norm": 11.085519790649414, "learning_rate": 7.08774904620602e-07, "loss": 0.4702, "mean_token_accuracy": 0.8522381782531738, "num_tokens": 63964088.0, "step": 1673 }, { "epoch": 0.21295000636051392, "ewc_loss": 0.011089123785495758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1089124200225342e-05, "grad_norm": 11.017900466918945, "learning_rate": 7.091988130563797e-07, "loss": 0.5153, "mean_token_accuracy": 0.835392951965332, "num_tokens": 64004931.0, "step": 1674 }, { "epoch": 0.21307721663910445, "ewc_loss": 0.0111085195094347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.110851917474065e-05, "grad_norm": 11.116706848144531, "learning_rate": 7.096227214921576e-07, "loss": 0.5089, "mean_token_accuracy": 0.8371049761772156, "num_tokens": 64043966.0, "step": 1675 }, { "epoch": 0.21320442691769495, "ewc_loss": 0.011119437403976917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1119437658635434e-05, "grad_norm": 11.009734153747559, "learning_rate": 7.100466299279355e-07, "loss": 0.4912, "mean_token_accuracy": 0.8434974551200867, "num_tokens": 64078886.0, "step": 1676 }, { "epoch": 0.21333163719628545, "ewc_loss": 0.011123226955533028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.112322661356302e-05, "grad_norm": 11.12407398223877, "learning_rate": 7.104705383637134e-07, "loss": 0.5158, "mean_token_accuracy": 0.8405131101608276, "num_tokens": 64117291.0, "step": 1677 }, { "epoch": 0.21345884747487598, "ewc_loss": 0.011148246005177498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1148245903314091e-05, "grad_norm": 11.050154685974121, "learning_rate": 7.108944467994913e-07, "loss": 0.4715, "mean_token_accuracy": 0.8472495675086975, "num_tokens": 64156369.0, "step": 1678 }, { "epoch": 0.21358605775346648, "ewc_loss": 0.011152663268148899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1152663319080602e-05, "grad_norm": 11.148877143859863, "learning_rate": 7.113183552352692e-07, "loss": 0.5454, "mean_token_accuracy": 0.8219115138053894, "num_tokens": 64193646.0, "step": 1679 }, { "epoch": 0.21371326803205698, "ewc_loss": 0.011175485327839851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.117548526963219e-05, "grad_norm": 11.044771194458008, "learning_rate": 7.11742263671047e-07, "loss": 0.4371, "mean_token_accuracy": 0.8578841090202332, "num_tokens": 64231933.0, "step": 1680 }, { "epoch": 0.2138404783106475, "ewc_loss": 0.011144846677780151, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1144846212118864e-05, "grad_norm": 11.12444019317627, "learning_rate": 7.12166172106825e-07, "loss": 0.57, "mean_token_accuracy": 0.8165662288665771, "num_tokens": 64267418.0, "step": 1681 }, { "epoch": 0.213967688589238, "ewc_loss": 0.011199455708265305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1199455911992118e-05, "grad_norm": 11.086254119873047, "learning_rate": 7.125900805426027e-07, "loss": 0.451, "mean_token_accuracy": 0.8541772961616516, "num_tokens": 64302440.0, "step": 1682 }, { "epoch": 0.2140948988678285, "ewc_loss": 0.011170423589646816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1170423931616824e-05, "grad_norm": 11.085646629333496, "learning_rate": 7.130139889783806e-07, "loss": 0.4902, "mean_token_accuracy": 0.843077540397644, "num_tokens": 64338240.0, "step": 1683 }, { "epoch": 0.21422210914641904, "ewc_loss": 0.011197756044566631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1197756066394504e-05, "grad_norm": 11.0818510055542, "learning_rate": 7.134378974141585e-07, "loss": 0.4808, "mean_token_accuracy": 0.8440494537353516, "num_tokens": 64375734.0, "step": 1684 }, { "epoch": 0.21434931942500954, "ewc_loss": 0.011184272356331348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1184271897946019e-05, "grad_norm": 11.108765602111816, "learning_rate": 7.138618058499364e-07, "loss": 0.4745, "mean_token_accuracy": 0.8482682108879089, "num_tokens": 64411401.0, "step": 1685 }, { "epoch": 0.21447652970360004, "ewc_loss": 0.011220022104680538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1220022315683309e-05, "grad_norm": 11.080961227416992, "learning_rate": 7.142857142857143e-07, "loss": 0.45, "mean_token_accuracy": 0.8561400175094604, "num_tokens": 64454719.0, "step": 1686 }, { "epoch": 0.21460373998219057, "ewc_loss": 0.01120605319738388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1206053386558779e-05, "grad_norm": 11.094816207885742, "learning_rate": 7.147096227214922e-07, "loss": 0.4747, "mean_token_accuracy": 0.8479831218719482, "num_tokens": 64495736.0, "step": 1687 }, { "epoch": 0.21473095026078107, "ewc_loss": 0.01124637108296156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1246371286688372e-05, "grad_norm": 11.137890815734863, "learning_rate": 7.1513353115727e-07, "loss": 0.4551, "mean_token_accuracy": 0.8527122735977173, "num_tokens": 64532843.0, "step": 1688 }, { "epoch": 0.21485816053937157, "ewc_loss": 0.011220999993383884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1221000022487715e-05, "grad_norm": 11.131657600402832, "learning_rate": 7.155574395930479e-07, "loss": 0.5014, "mean_token_accuracy": 0.8354445099830627, "num_tokens": 64568950.0, "step": 1689 }, { "epoch": 0.2149853708179621, "ewc_loss": 0.011232183314859867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1232183169340715e-05, "grad_norm": 11.12601089477539, "learning_rate": 7.159813480288257e-07, "loss": 0.4778, "mean_token_accuracy": 0.8433665037155151, "num_tokens": 64606878.0, "step": 1690 }, { "epoch": 0.2151125810965526, "ewc_loss": 0.011259881779551506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.125988183048321e-05, "grad_norm": 11.178816795349121, "learning_rate": 7.164052564646035e-07, "loss": 0.445, "mean_token_accuracy": 0.8561204075813293, "num_tokens": 64643012.0, "step": 1691 }, { "epoch": 0.21523979137514312, "ewc_loss": 0.011261433362960815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1261433428444434e-05, "grad_norm": 11.083073616027832, "learning_rate": 7.168291649003815e-07, "loss": 0.4663, "mean_token_accuracy": 0.8527405261993408, "num_tokens": 64684444.0, "step": 1692 }, { "epoch": 0.21536700165373363, "ewc_loss": 0.01127324253320694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1273242307652254e-05, "grad_norm": 11.190679550170898, "learning_rate": 7.172530733361593e-07, "loss": 0.5165, "mean_token_accuracy": 0.8358592987060547, "num_tokens": 64722068.0, "step": 1693 }, { "epoch": 0.21549421193232413, "ewc_loss": 0.011283536441624165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1283536878181621e-05, "grad_norm": 11.10556697845459, "learning_rate": 7.176769817719373e-07, "loss": 0.4281, "mean_token_accuracy": 0.867254376411438, "num_tokens": 64761829.0, "step": 1694 }, { "epoch": 0.21562142221091465, "ewc_loss": 0.011260272935032845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1260272913204972e-05, "grad_norm": 11.160658836364746, "learning_rate": 7.181008902077151e-07, "loss": 0.4744, "mean_token_accuracy": 0.847823441028595, "num_tokens": 64799185.0, "step": 1695 }, { "epoch": 0.21574863248950515, "ewc_loss": 0.011273818090558052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1273818017798476e-05, "grad_norm": 11.140164375305176, "learning_rate": 7.18524798643493e-07, "loss": 0.5046, "mean_token_accuracy": 0.8399621248245239, "num_tokens": 64839306.0, "step": 1696 }, { "epoch": 0.21587584276809565, "ewc_loss": 0.011299442499876022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1299442121526226e-05, "grad_norm": 11.206132888793945, "learning_rate": 7.189487070792708e-07, "loss": 0.4792, "mean_token_accuracy": 0.8480233550071716, "num_tokens": 64876011.0, "step": 1697 }, { "epoch": 0.21600305304668618, "ewc_loss": 0.011304125189781189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1304125109745655e-05, "grad_norm": 11.18287467956543, "learning_rate": 7.193726155150487e-07, "loss": 0.4786, "mean_token_accuracy": 0.8492008447647095, "num_tokens": 64918510.0, "step": 1698 }, { "epoch": 0.21613026332527668, "ewc_loss": 0.011265617795288563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1265618013567291e-05, "grad_norm": 11.124110221862793, "learning_rate": 7.197965239508265e-07, "loss": 0.4849, "mean_token_accuracy": 0.8452616930007935, "num_tokens": 64957156.0, "step": 1699 }, { "epoch": 0.21625747360386718, "ewc_loss": 0.01129822712391615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1298227036604658e-05, "grad_norm": 11.17480182647705, "learning_rate": 7.202204323866045e-07, "loss": 0.4835, "mean_token_accuracy": 0.8464818000793457, "num_tokens": 64998761.0, "step": 1700 }, { "epoch": 0.2163846838824577, "ewc_loss": 0.011323420330882072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1323420039843768e-05, "grad_norm": 11.219527244567871, "learning_rate": 7.206443408223823e-07, "loss": 0.4865, "mean_token_accuracy": 0.8423057198524475, "num_tokens": 65038849.0, "step": 1701 }, { "epoch": 0.2165118941610482, "ewc_loss": 0.011293803341686726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1293803254375234e-05, "grad_norm": 11.12479019165039, "learning_rate": 7.210682492581603e-07, "loss": 0.4668, "mean_token_accuracy": 0.8477709889411926, "num_tokens": 65071130.0, "step": 1702 }, { "epoch": 0.2166391044396387, "ewc_loss": 0.011301967315375805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1301966878818348e-05, "grad_norm": 11.158265113830566, "learning_rate": 7.214921576939381e-07, "loss": 0.5085, "mean_token_accuracy": 0.8339571952819824, "num_tokens": 65111619.0, "step": 1703 }, { "epoch": 0.21676631471822924, "ewc_loss": 0.01132469903677702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.132469878939446e-05, "grad_norm": 11.167082786560059, "learning_rate": 7.219160661297159e-07, "loss": 0.4555, "mean_token_accuracy": 0.8517605066299438, "num_tokens": 65147537.0, "step": 1704 }, { "epoch": 0.21689352499681974, "ewc_loss": 0.011336103081703186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1336102943459991e-05, "grad_norm": 11.147008895874023, "learning_rate": 7.223399745654938e-07, "loss": 0.4921, "mean_token_accuracy": 0.8413820266723633, "num_tokens": 65187254.0, "step": 1705 }, { "epoch": 0.21702073527541024, "ewc_loss": 0.011326855979859829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1326856110827066e-05, "grad_norm": 11.198310852050781, "learning_rate": 7.227638830012717e-07, "loss": 0.5035, "mean_token_accuracy": 0.8353396654129028, "num_tokens": 65226130.0, "step": 1706 }, { "epoch": 0.21714794555400077, "ewc_loss": 0.011359790340065956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1359790732967667e-05, "grad_norm": 11.176061630249023, "learning_rate": 7.231877914370495e-07, "loss": 0.4667, "mean_token_accuracy": 0.8516073226928711, "num_tokens": 65261282.0, "step": 1707 }, { "epoch": 0.21727515583259127, "ewc_loss": 0.011359221301972866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1359221389284357e-05, "grad_norm": 11.250753402709961, "learning_rate": 7.236116998728275e-07, "loss": 0.5183, "mean_token_accuracy": 0.8334797620773315, "num_tokens": 65294850.0, "step": 1708 }, { "epoch": 0.21740236611118177, "ewc_loss": 0.011396298184990883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1396297850296833e-05, "grad_norm": 11.183426856994629, "learning_rate": 7.240356083086053e-07, "loss": 0.4103, "mean_token_accuracy": 0.865911066532135, "num_tokens": 65332323.0, "step": 1709 }, { "epoch": 0.2175295763897723, "ewc_loss": 0.011363115161657333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1363114936102647e-05, "grad_norm": 11.200894355773926, "learning_rate": 7.244595167443833e-07, "loss": 0.5041, "mean_token_accuracy": 0.8391700983047485, "num_tokens": 65376033.0, "step": 1710 }, { "epoch": 0.2176567866683628, "ewc_loss": 0.011380117386579514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.138011703005759e-05, "grad_norm": 11.20874309539795, "learning_rate": 7.248834251801611e-07, "loss": 0.4688, "mean_token_accuracy": 0.8512305617332458, "num_tokens": 65417875.0, "step": 1711 }, { "epoch": 0.2177839969469533, "ewc_loss": 0.01138013694435358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.138013703894103e-05, "grad_norm": 11.253625869750977, "learning_rate": 7.253073336159388e-07, "loss": 0.4454, "mean_token_accuracy": 0.8550479412078857, "num_tokens": 65456675.0, "step": 1712 }, { "epoch": 0.21791120722554383, "ewc_loss": 0.011391853913664818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.139185405918397e-05, "grad_norm": 11.208663940429688, "learning_rate": 7.257312420517168e-07, "loss": 0.438, "mean_token_accuracy": 0.8578544855117798, "num_tokens": 65487699.0, "step": 1713 }, { "epoch": 0.21803841750413433, "ewc_loss": 0.011420347727835178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1420347618695814e-05, "grad_norm": 11.334539413452148, "learning_rate": 7.261551504874946e-07, "loss": 0.4651, "mean_token_accuracy": 0.8469487428665161, "num_tokens": 65526143.0, "step": 1714 }, { "epoch": 0.21816562778272486, "ewc_loss": 0.011385061778128147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.138506195275113e-05, "grad_norm": 11.168130874633789, "learning_rate": 7.265790589232725e-07, "loss": 0.4625, "mean_token_accuracy": 0.851620078086853, "num_tokens": 65562969.0, "step": 1715 }, { "epoch": 0.21829283806131536, "ewc_loss": 0.011385601945221424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1385602192603983e-05, "grad_norm": 11.244617462158203, "learning_rate": 7.270029673590504e-07, "loss": 0.4951, "mean_token_accuracy": 0.8432294130325317, "num_tokens": 65605999.0, "step": 1716 }, { "epoch": 0.21842004833990586, "ewc_loss": 0.011435147374868393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1435146916483063e-05, "grad_norm": 11.249335289001465, "learning_rate": 7.274268757948283e-07, "loss": 0.54, "mean_token_accuracy": 0.8290014863014221, "num_tokens": 65644877.0, "step": 1717 }, { "epoch": 0.2185472586184964, "ewc_loss": 0.011408731341362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.140873155236477e-05, "grad_norm": 11.267475128173828, "learning_rate": 7.278507842306062e-07, "loss": 0.4606, "mean_token_accuracy": 0.8525354862213135, "num_tokens": 65678823.0, "step": 1718 }, { "epoch": 0.2186744688970869, "ewc_loss": 0.01142556220293045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.142556266131578e-05, "grad_norm": 11.24706745147705, "learning_rate": 7.282746926663841e-07, "loss": 0.5405, "mean_token_accuracy": 0.8297483921051025, "num_tokens": 65717185.0, "step": 1719 }, { "epoch": 0.2188016791756774, "ewc_loss": 0.011423653922975063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1423653631936759e-05, "grad_norm": 11.32953929901123, "learning_rate": 7.286986011021618e-07, "loss": 0.461, "mean_token_accuracy": 0.8520806431770325, "num_tokens": 65754422.0, "step": 1720 }, { "epoch": 0.21892888945426792, "ewc_loss": 0.011425379663705826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1425379852880724e-05, "grad_norm": 11.18369197845459, "learning_rate": 7.291225095379398e-07, "loss": 0.497, "mean_token_accuracy": 0.8420898914337158, "num_tokens": 65799390.0, "step": 1721 }, { "epoch": 0.21905609973285842, "ewc_loss": 0.01141246035695076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.141246048064204e-05, "grad_norm": 11.258301734924316, "learning_rate": 7.295464179737176e-07, "loss": 0.4896, "mean_token_accuracy": 0.8464692831039429, "num_tokens": 65837566.0, "step": 1722 }, { "epoch": 0.21918331001144892, "ewc_loss": 0.011443286202847958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.144328598456923e-05, "grad_norm": 11.230189323425293, "learning_rate": 7.299703264094955e-07, "loss": 0.4741, "mean_token_accuracy": 0.8485912084579468, "num_tokens": 65874537.0, "step": 1723 }, { "epoch": 0.21931052029003945, "ewc_loss": 0.011456654407083988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.145665464719059e-05, "grad_norm": 11.246023178100586, "learning_rate": 7.303942348452734e-07, "loss": 0.5007, "mean_token_accuracy": 0.8396563529968262, "num_tokens": 65915616.0, "step": 1724 }, { "epoch": 0.21943773056862995, "ewc_loss": 0.011491985060274601, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.149198487837566e-05, "grad_norm": 11.25373649597168, "learning_rate": 7.308181432810513e-07, "loss": 0.4491, "mean_token_accuracy": 0.8526420593261719, "num_tokens": 65954263.0, "step": 1725 }, { "epoch": 0.21956494084722045, "ewc_loss": 0.011480645276606083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1480645298433956e-05, "grad_norm": 11.216552734375, "learning_rate": 7.312420517168292e-07, "loss": 0.4822, "mean_token_accuracy": 0.8429367542266846, "num_tokens": 65992880.0, "step": 1726 }, { "epoch": 0.21969215112581097, "ewc_loss": 0.011490032076835632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1490032193250954e-05, "grad_norm": 11.217874526977539, "learning_rate": 7.31665960152607e-07, "loss": 0.5003, "mean_token_accuracy": 0.8359469175338745, "num_tokens": 66033958.0, "step": 1727 }, { "epoch": 0.21981936140440148, "ewc_loss": 0.01150611974298954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1506119335535914e-05, "grad_norm": 11.311470031738281, "learning_rate": 7.320898685883848e-07, "loss": 0.4716, "mean_token_accuracy": 0.8467923402786255, "num_tokens": 66072448.0, "step": 1728 }, { "epoch": 0.21994657168299198, "ewc_loss": 0.011521164327859879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1521164196892641e-05, "grad_norm": 11.228349685668945, "learning_rate": 7.325137770241628e-07, "loss": 0.5828, "mean_token_accuracy": 0.8166890144348145, "num_tokens": 66112648.0, "step": 1729 }, { "epoch": 0.2200737819615825, "ewc_loss": 0.011534180492162704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1534180885064416e-05, "grad_norm": 11.276569366455078, "learning_rate": 7.329376854599406e-07, "loss": 0.4939, "mean_token_accuracy": 0.8415390849113464, "num_tokens": 66154791.0, "step": 1730 }, { "epoch": 0.220200992240173, "ewc_loss": 0.011549851857125759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1549851478775963e-05, "grad_norm": 11.280734062194824, "learning_rate": 7.333615938957184e-07, "loss": 0.4824, "mean_token_accuracy": 0.8448140621185303, "num_tokens": 66192450.0, "step": 1731 }, { "epoch": 0.2203282025187635, "ewc_loss": 0.011544513516128063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1544513654371258e-05, "grad_norm": 11.274833679199219, "learning_rate": 7.337855023314964e-07, "loss": 0.4769, "mean_token_accuracy": 0.8452935218811035, "num_tokens": 66231173.0, "step": 1732 }, { "epoch": 0.22045541279735403, "ewc_loss": 0.011552807874977589, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1552808246051427e-05, "grad_norm": 11.290910720825195, "learning_rate": 7.342094107672742e-07, "loss": 0.5296, "mean_token_accuracy": 0.8291382789611816, "num_tokens": 66272956.0, "step": 1733 }, { "epoch": 0.22058262307594453, "ewc_loss": 0.011568121612071991, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1568121408345178e-05, "grad_norm": 11.286615371704102, "learning_rate": 7.346333192030522e-07, "loss": 0.4841, "mean_token_accuracy": 0.8493243455886841, "num_tokens": 66309450.0, "step": 1734 }, { "epoch": 0.22070983335453503, "ewc_loss": 0.011568678542971611, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1568678928597365e-05, "grad_norm": 11.3283109664917, "learning_rate": 7.350572276388299e-07, "loss": 0.5338, "mean_token_accuracy": 0.8275008797645569, "num_tokens": 66350947.0, "step": 1735 }, { "epoch": 0.22083704363312556, "ewc_loss": 0.011594319716095924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1594319403229747e-05, "grad_norm": 11.297779083251953, "learning_rate": 7.354811360746078e-07, "loss": 0.4915, "mean_token_accuracy": 0.8406003713607788, "num_tokens": 66388364.0, "step": 1736 }, { "epoch": 0.22096425391171606, "ewc_loss": 0.011591052636504173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1591052498260979e-05, "grad_norm": 11.288505554199219, "learning_rate": 7.359050445103857e-07, "loss": 0.5059, "mean_token_accuracy": 0.8368948101997375, "num_tokens": 66425029.0, "step": 1737 }, { "epoch": 0.22109146419030656, "ewc_loss": 0.011614766903221607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1614766663115006e-05, "grad_norm": 11.323405265808105, "learning_rate": 7.363289529461636e-07, "loss": 0.482, "mean_token_accuracy": 0.8451079726219177, "num_tokens": 66463924.0, "step": 1738 }, { "epoch": 0.2212186744688971, "ewc_loss": 0.011608770117163658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1608770364546217e-05, "grad_norm": 11.304152488708496, "learning_rate": 7.367528613819415e-07, "loss": 0.4369, "mean_token_accuracy": 0.8584502935409546, "num_tokens": 66504824.0, "step": 1739 }, { "epoch": 0.2213458847474876, "ewc_loss": 0.011612216010689735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1612216439971235e-05, "grad_norm": 11.292954444885254, "learning_rate": 7.371767698177194e-07, "loss": 0.4559, "mean_token_accuracy": 0.8526148796081543, "num_tokens": 66539308.0, "step": 1740 }, { "epoch": 0.22147309502607812, "ewc_loss": 0.01162809506058693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1628095307969488e-05, "grad_norm": 11.305456161499023, "learning_rate": 7.376006782534972e-07, "loss": 0.4518, "mean_token_accuracy": 0.8550670146942139, "num_tokens": 66572785.0, "step": 1741 }, { "epoch": 0.22160030530466862, "ewc_loss": 0.011637401767075062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.163740216725273e-05, "grad_norm": 11.286751747131348, "learning_rate": 7.380245866892751e-07, "loss": 0.4694, "mean_token_accuracy": 0.8477567434310913, "num_tokens": 66607890.0, "step": 1742 }, { "epoch": 0.22172751558325912, "ewc_loss": 0.011665953323245049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1665953024930786e-05, "grad_norm": 11.33385944366455, "learning_rate": 7.384484951250529e-07, "loss": 0.4718, "mean_token_accuracy": 0.8499903082847595, "num_tokens": 66650553.0, "step": 1743 }, { "epoch": 0.22185472586184965, "ewc_loss": 0.011655544862151146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1655544767563697e-05, "grad_norm": 11.297502517700195, "learning_rate": 7.388724035608308e-07, "loss": 0.466, "mean_token_accuracy": 0.8505991697311401, "num_tokens": 66685330.0, "step": 1744 }, { "epoch": 0.22198193614044015, "ewc_loss": 0.01165006309747696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1650063242996112e-05, "grad_norm": 11.35104751586914, "learning_rate": 7.392963119966087e-07, "loss": 0.4495, "mean_token_accuracy": 0.853792667388916, "num_tokens": 66716816.0, "step": 1745 }, { "epoch": 0.22210914641903065, "ewc_loss": 0.011672521941363811, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1672522305161692e-05, "grad_norm": 11.343219757080078, "learning_rate": 7.397202204323866e-07, "loss": 0.4943, "mean_token_accuracy": 0.8424131274223328, "num_tokens": 66754030.0, "step": 1746 }, { "epoch": 0.22223635669762118, "ewc_loss": 0.011674356646835804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1674356755975168e-05, "grad_norm": 11.32797908782959, "learning_rate": 7.401441288681645e-07, "loss": 0.4515, "mean_token_accuracy": 0.8546382784843445, "num_tokens": 66793650.0, "step": 1747 }, { "epoch": 0.22236356697621168, "ewc_loss": 0.011684720404446125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.168472044810187e-05, "grad_norm": 11.288288116455078, "learning_rate": 7.405680373039424e-07, "loss": 0.4353, "mean_token_accuracy": 0.8619227409362793, "num_tokens": 66833709.0, "step": 1748 }, { "epoch": 0.22249077725480218, "ewc_loss": 0.011671553365886211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1671553693304304e-05, "grad_norm": 11.342915534973145, "learning_rate": 7.409919457397202e-07, "loss": 0.5043, "mean_token_accuracy": 0.8375469446182251, "num_tokens": 66866456.0, "step": 1749 }, { "epoch": 0.2226179875333927, "ewc_loss": 0.011716106906533241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1716107110260054e-05, "grad_norm": 11.31043529510498, "learning_rate": 7.414158541754981e-07, "loss": 0.53, "mean_token_accuracy": 0.8289625644683838, "num_tokens": 66905875.0, "step": 1750 }, { "epoch": 0.2227451978119832, "ewc_loss": 0.0116927744820714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1692774933180772e-05, "grad_norm": 11.352070808410645, "learning_rate": 7.418397626112759e-07, "loss": 0.4846, "mean_token_accuracy": 0.8522859215736389, "num_tokens": 66936009.0, "step": 1751 }, { "epoch": 0.2228724080905737, "ewc_loss": 0.011742791160941124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.174279077531537e-05, "grad_norm": 11.304351806640625, "learning_rate": 7.422636710470537e-07, "loss": 0.4888, "mean_token_accuracy": 0.8434777855873108, "num_tokens": 66978111.0, "step": 1752 }, { "epoch": 0.22299961836916424, "ewc_loss": 0.011727613396942616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1727613127732184e-05, "grad_norm": 11.341005325317383, "learning_rate": 7.426875794828317e-07, "loss": 0.5017, "mean_token_accuracy": 0.8428605198860168, "num_tokens": 67015938.0, "step": 1753 }, { "epoch": 0.22312682864775474, "ewc_loss": 0.011759576387703419, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1759576409531292e-05, "grad_norm": 11.33017635345459, "learning_rate": 7.431114879186095e-07, "loss": 0.5223, "mean_token_accuracy": 0.8331508636474609, "num_tokens": 67055456.0, "step": 1754 }, { "epoch": 0.22325403892634524, "ewc_loss": 0.011751048266887665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1751048077712767e-05, "grad_norm": 11.363568305969238, "learning_rate": 7.435353963543875e-07, "loss": 0.4794, "mean_token_accuracy": 0.8464071154594421, "num_tokens": 67096298.0, "step": 1755 }, { "epoch": 0.22338124920493577, "ewc_loss": 0.0118111502379179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1811150216090027e-05, "grad_norm": 11.374720573425293, "learning_rate": 7.439593047901653e-07, "loss": 0.4355, "mean_token_accuracy": 0.8590805530548096, "num_tokens": 67135280.0, "step": 1756 }, { "epoch": 0.22350845948352627, "ewc_loss": 0.011786910705268383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1786910363298375e-05, "grad_norm": 11.354400634765625, "learning_rate": 7.443832132259431e-07, "loss": 0.4998, "mean_token_accuracy": 0.8385438919067383, "num_tokens": 67171936.0, "step": 1757 }, { "epoch": 0.22363566976211677, "ewc_loss": 0.011800584383308887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1800584616139531e-05, "grad_norm": 11.416980743408203, "learning_rate": 7.44807121661721e-07, "loss": 0.4883, "mean_token_accuracy": 0.8421709537506104, "num_tokens": 67209854.0, "step": 1758 }, { "epoch": 0.2237628800407073, "ewc_loss": 0.011829575523734093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1829575669253245e-05, "grad_norm": 11.404767990112305, "learning_rate": 7.452310300974989e-07, "loss": 0.4432, "mean_token_accuracy": 0.8525835275650024, "num_tokens": 67244369.0, "step": 1759 }, { "epoch": 0.2238900903192978, "ewc_loss": 0.011784781701862812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1784782145696227e-05, "grad_norm": 11.334197998046875, "learning_rate": 7.456549385332767e-07, "loss": 0.5216, "mean_token_accuracy": 0.8343517184257507, "num_tokens": 67283994.0, "step": 1760 }, { "epoch": 0.2240173005978883, "ewc_loss": 0.011835025623440742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1835025361506268e-05, "grad_norm": 11.409000396728516, "learning_rate": 7.460788469690547e-07, "loss": 0.4069, "mean_token_accuracy": 0.8652253150939941, "num_tokens": 67323230.0, "step": 1761 }, { "epoch": 0.22414451087647883, "ewc_loss": 0.011838801205158234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.183880158350803e-05, "grad_norm": 11.448850631713867, "learning_rate": 7.465027554048325e-07, "loss": 0.5605, "mean_token_accuracy": 0.8241946697235107, "num_tokens": 67366198.0, "step": 1762 }, { "epoch": 0.22427172115506933, "ewc_loss": 0.011813283897936344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1813283890660387e-05, "grad_norm": 11.399812698364258, "learning_rate": 7.469266638406105e-07, "loss": 0.5444, "mean_token_accuracy": 0.829265832901001, "num_tokens": 67410685.0, "step": 1763 }, { "epoch": 0.22439893143365983, "ewc_loss": 0.011830787174403667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1830787116196007e-05, "grad_norm": 11.44825553894043, "learning_rate": 7.473505722763883e-07, "loss": 0.4666, "mean_token_accuracy": 0.8505357503890991, "num_tokens": 67445818.0, "step": 1764 }, { "epoch": 0.22452614171225035, "ewc_loss": 0.011823556385934353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1823556633316912e-05, "grad_norm": 11.425453186035156, "learning_rate": 7.477744807121661e-07, "loss": 0.5086, "mean_token_accuracy": 0.8375741839408875, "num_tokens": 67483873.0, "step": 1765 }, { "epoch": 0.22465335199084085, "ewc_loss": 0.011810525320470333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.181052539322991e-05, "grad_norm": 11.41148567199707, "learning_rate": 7.48198389147944e-07, "loss": 0.4868, "mean_token_accuracy": 0.8478261232376099, "num_tokens": 67519524.0, "step": 1766 }, { "epoch": 0.22478056226943138, "ewc_loss": 0.011796507984399796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1796508260886185e-05, "grad_norm": 11.378096580505371, "learning_rate": 7.486222975837219e-07, "loss": 0.4714, "mean_token_accuracy": 0.8465681076049805, "num_tokens": 67560920.0, "step": 1767 }, { "epoch": 0.22490777254802188, "ewc_loss": 0.011813451535999775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1813451237685513e-05, "grad_norm": 11.388077735900879, "learning_rate": 7.490462060194997e-07, "loss": 0.4419, "mean_token_accuracy": 0.8541232347488403, "num_tokens": 67594948.0, "step": 1768 }, { "epoch": 0.22503498282661238, "ewc_loss": 0.011841426603496075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1841426385217346e-05, "grad_norm": 11.405641555786133, "learning_rate": 7.494701144552777e-07, "loss": 0.5303, "mean_token_accuracy": 0.8301271200180054, "num_tokens": 67628867.0, "step": 1769 }, { "epoch": 0.2251621931052029, "ewc_loss": 0.011841568164527416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1841568266390823e-05, "grad_norm": 11.369749069213867, "learning_rate": 7.498940228910555e-07, "loss": 0.4903, "mean_token_accuracy": 0.8412503004074097, "num_tokens": 67669762.0, "step": 1770 }, { "epoch": 0.2252894033837934, "ewc_loss": 0.011870421469211578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1870421076309867e-05, "grad_norm": 11.433340072631836, "learning_rate": 7.503179313268335e-07, "loss": 0.4785, "mean_token_accuracy": 0.8438314199447632, "num_tokens": 67704750.0, "step": 1771 }, { "epoch": 0.2254166136623839, "ewc_loss": 0.011892410926520824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1892410839209333e-05, "grad_norm": 11.373743057250977, "learning_rate": 7.507418397626113e-07, "loss": 0.5159, "mean_token_accuracy": 0.8350380659103394, "num_tokens": 67745181.0, "step": 1772 }, { "epoch": 0.22554382394097444, "ewc_loss": 0.011886273510754108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.188627356896177e-05, "grad_norm": 11.416301727294922, "learning_rate": 7.51165748198389e-07, "loss": 0.5028, "mean_token_accuracy": 0.8371667861938477, "num_tokens": 67785929.0, "step": 1773 }, { "epoch": 0.22567103421956494, "ewc_loss": 0.011910676024854183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1910676221305039e-05, "grad_norm": 11.436124801635742, "learning_rate": 7.51589656634167e-07, "loss": 0.4581, "mean_token_accuracy": 0.8479307889938354, "num_tokens": 67821887.0, "step": 1774 }, { "epoch": 0.22579824449815544, "ewc_loss": 0.011906602419912815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1906602594535798e-05, "grad_norm": 11.410723686218262, "learning_rate": 7.520135650699448e-07, "loss": 0.4923, "mean_token_accuracy": 0.8414221405982971, "num_tokens": 67859608.0, "step": 1775 }, { "epoch": 0.22592545477674597, "ewc_loss": 0.011912113055586815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.191211322293384e-05, "grad_norm": 11.41484260559082, "learning_rate": 7.524374735057227e-07, "loss": 0.5018, "mean_token_accuracy": 0.8401467204093933, "num_tokens": 67895287.0, "step": 1776 }, { "epoch": 0.22605266505533647, "ewc_loss": 0.011951209045946598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1951208762184251e-05, "grad_norm": 11.46627426147461, "learning_rate": 7.528613819415006e-07, "loss": 0.4595, "mean_token_accuracy": 0.8559792637825012, "num_tokens": 67930358.0, "step": 1777 }, { "epoch": 0.22617987533392697, "ewc_loss": 0.011922978796064854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1922978956135921e-05, "grad_norm": 11.398819923400879, "learning_rate": 7.532852903772785e-07, "loss": 0.5006, "mean_token_accuracy": 0.8388281464576721, "num_tokens": 67966112.0, "step": 1778 }, { "epoch": 0.2263070856125175, "ewc_loss": 0.011958347633481026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1958347386098467e-05, "grad_norm": 11.496186256408691, "learning_rate": 7.537091988130564e-07, "loss": 0.5086, "mean_token_accuracy": 0.834649920463562, "num_tokens": 68001297.0, "step": 1779 }, { "epoch": 0.226434295891108, "ewc_loss": 0.01198558695614338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1985586752416566e-05, "grad_norm": 11.443612098693848, "learning_rate": 7.541331072488342e-07, "loss": 0.4899, "mean_token_accuracy": 0.8430771827697754, "num_tokens": 68037346.0, "step": 1780 }, { "epoch": 0.2265615061696985, "ewc_loss": 0.011943567544221878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1943567187699955e-05, "grad_norm": 11.410872459411621, "learning_rate": 7.54557015684612e-07, "loss": 0.4879, "mean_token_accuracy": 0.8428922295570374, "num_tokens": 68076566.0, "step": 1781 }, { "epoch": 0.22668871644828903, "ewc_loss": 0.01198719535022974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1987195648544002e-05, "grad_norm": 11.465088844299316, "learning_rate": 7.5498092412039e-07, "loss": 0.4665, "mean_token_accuracy": 0.8529967665672302, "num_tokens": 68116834.0, "step": 1782 }, { "epoch": 0.22681592672687953, "ewc_loss": 0.012005534954369068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2005534699710552e-05, "grad_norm": 11.423832893371582, "learning_rate": 7.554048325561678e-07, "loss": 0.4412, "mean_token_accuracy": 0.8578931093215942, "num_tokens": 68155573.0, "step": 1783 }, { "epoch": 0.22694313700547003, "ewc_loss": 0.011979435570538044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1979435839748476e-05, "grad_norm": 11.48856258392334, "learning_rate": 7.558287409919457e-07, "loss": 0.5349, "mean_token_accuracy": 0.832036018371582, "num_tokens": 68193378.0, "step": 1784 }, { "epoch": 0.22707034728406056, "ewc_loss": 0.01200446393340826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2004464224446565e-05, "grad_norm": 11.45003890991211, "learning_rate": 7.562526494277236e-07, "loss": 0.5066, "mean_token_accuracy": 0.8346165418624878, "num_tokens": 68230183.0, "step": 1785 }, { "epoch": 0.22719755756265106, "ewc_loss": 0.01199156604707241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1991565770586021e-05, "grad_norm": 11.416117668151855, "learning_rate": 7.566765578635015e-07, "loss": 0.4764, "mean_token_accuracy": 0.848227322101593, "num_tokens": 68270526.0, "step": 1786 }, { "epoch": 0.22732476784124156, "ewc_loss": 0.011998367495834827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.1998367881460581e-05, "grad_norm": 11.477773666381836, "learning_rate": 7.571004662992794e-07, "loss": 0.5019, "mean_token_accuracy": 0.8432669639587402, "num_tokens": 68308690.0, "step": 1787 }, { "epoch": 0.2274519781198321, "ewc_loss": 0.012047110125422478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2047110431012698e-05, "grad_norm": 11.46584701538086, "learning_rate": 7.575243747350572e-07, "loss": 0.5443, "mean_token_accuracy": 0.8288313150405884, "num_tokens": 68348660.0, "step": 1788 }, { "epoch": 0.2275791883984226, "ewc_loss": 0.012037787586450577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2037787200824823e-05, "grad_norm": 11.494304656982422, "learning_rate": 7.57948283170835e-07, "loss": 0.4932, "mean_token_accuracy": 0.8460025787353516, "num_tokens": 68389129.0, "step": 1789 }, { "epoch": 0.2277063986770131, "ewc_loss": 0.012062040157616138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.20620397865423e-05, "grad_norm": 11.447732925415039, "learning_rate": 7.58372191606613e-07, "loss": 0.4846, "mean_token_accuracy": 0.8451551198959351, "num_tokens": 68423234.0, "step": 1790 }, { "epoch": 0.22783360895560362, "ewc_loss": 0.012052730657160282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2052731108269654e-05, "grad_norm": 11.509414672851562, "learning_rate": 7.587961000423908e-07, "loss": 0.4922, "mean_token_accuracy": 0.8440569639205933, "num_tokens": 68460834.0, "step": 1791 }, { "epoch": 0.22796081923419412, "ewc_loss": 0.012059740722179413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2059740583936218e-05, "grad_norm": 11.476848602294922, "learning_rate": 7.592200084781686e-07, "loss": 0.5358, "mean_token_accuracy": 0.8336132764816284, "num_tokens": 68495605.0, "step": 1792 }, { "epoch": 0.22808802951278465, "ewc_loss": 0.012048404663801193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.204840464197332e-05, "grad_norm": 11.498549461364746, "learning_rate": 7.596439169139466e-07, "loss": 0.4825, "mean_token_accuracy": 0.8456066846847534, "num_tokens": 68530838.0, "step": 1793 }, { "epoch": 0.22821523979137515, "ewc_loss": 0.012092335149645805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2092335055058356e-05, "grad_norm": 11.475353240966797, "learning_rate": 7.600678253497244e-07, "loss": 0.4765, "mean_token_accuracy": 0.8488498330116272, "num_tokens": 68571225.0, "step": 1794 }, { "epoch": 0.22834245006996565, "ewc_loss": 0.012093762867152691, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.209376296174014e-05, "grad_norm": 11.561251640319824, "learning_rate": 7.604917337855023e-07, "loss": 0.4583, "mean_token_accuracy": 0.8551763296127319, "num_tokens": 68608284.0, "step": 1795 }, { "epoch": 0.22846966034855618, "ewc_loss": 0.012105349451303482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2105349014746025e-05, "grad_norm": 11.460660934448242, "learning_rate": 7.609156422212801e-07, "loss": 0.4397, "mean_token_accuracy": 0.8583230376243591, "num_tokens": 68645943.0, "step": 1796 }, { "epoch": 0.22859687062714668, "ewc_loss": 0.012085267342627048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2085267371730879e-05, "grad_norm": 11.510079383850098, "learning_rate": 7.61339550657058e-07, "loss": 0.4628, "mean_token_accuracy": 0.8499848246574402, "num_tokens": 68684245.0, "step": 1797 }, { "epoch": 0.22872408090573718, "ewc_loss": 0.012110518291592598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2110518582630903e-05, "grad_norm": 11.514233589172363, "learning_rate": 7.617634590928359e-07, "loss": 0.4766, "mean_token_accuracy": 0.849455714225769, "num_tokens": 68717532.0, "step": 1798 }, { "epoch": 0.2288512911843277, "ewc_loss": 0.012105708010494709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2105708265153226e-05, "grad_norm": 11.49263858795166, "learning_rate": 7.621873675286138e-07, "loss": 0.4956, "mean_token_accuracy": 0.8432055711746216, "num_tokens": 68759515.0, "step": 1799 }, { "epoch": 0.2289785014629182, "ewc_loss": 0.01212230697274208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2122306543460581e-05, "grad_norm": 11.468011856079102, "learning_rate": 7.626112759643916e-07, "loss": 0.5024, "mean_token_accuracy": 0.8382395505905151, "num_tokens": 68796969.0, "step": 1800 }, { "epoch": 0.2291057117415087, "ewc_loss": 0.012128341943025589, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2128341950301547e-05, "grad_norm": 11.540891647338867, "learning_rate": 7.630351844001696e-07, "loss": 0.4836, "mean_token_accuracy": 0.8443586826324463, "num_tokens": 68831138.0, "step": 1801 }, { "epoch": 0.22923292202009923, "ewc_loss": 0.012147056870162487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2147056622779928e-05, "grad_norm": 11.521855354309082, "learning_rate": 7.634590928359474e-07, "loss": 0.4741, "mean_token_accuracy": 0.8500065803527832, "num_tokens": 68868741.0, "step": 1802 }, { "epoch": 0.22936013229868973, "ewc_loss": 0.012126367539167404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2126367437303998e-05, "grad_norm": 11.440393447875977, "learning_rate": 7.638830012717253e-07, "loss": 0.4937, "mean_token_accuracy": 0.8404624462127686, "num_tokens": 68909191.0, "step": 1803 }, { "epoch": 0.22948734257728023, "ewc_loss": 0.012148796580731869, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.214879648614442e-05, "grad_norm": 11.573708534240723, "learning_rate": 7.643069097075031e-07, "loss": 0.469, "mean_token_accuracy": 0.8473615646362305, "num_tokens": 68949261.0, "step": 1804 }, { "epoch": 0.22961455285587076, "ewc_loss": 0.012140616774559021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2140616490796674e-05, "grad_norm": 11.47916030883789, "learning_rate": 7.64730818143281e-07, "loss": 0.4432, "mean_token_accuracy": 0.8530887961387634, "num_tokens": 68990434.0, "step": 1805 }, { "epoch": 0.22974176313446126, "ewc_loss": 0.012122317217290401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2122317457397003e-05, "grad_norm": 11.525010108947754, "learning_rate": 7.651547265790589e-07, "loss": 0.5264, "mean_token_accuracy": 0.8324760794639587, "num_tokens": 69029482.0, "step": 1806 }, { "epoch": 0.22986897341305176, "ewc_loss": 0.012165537104010582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2165537555119954e-05, "grad_norm": 11.452993392944336, "learning_rate": 7.655786350148368e-07, "loss": 0.506, "mean_token_accuracy": 0.8377771377563477, "num_tokens": 69069766.0, "step": 1807 }, { "epoch": 0.2299961836916423, "ewc_loss": 0.012116821482777596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2116821380914189e-05, "grad_norm": 11.483573913574219, "learning_rate": 7.660025434506146e-07, "loss": 0.4925, "mean_token_accuracy": 0.842795193195343, "num_tokens": 69114592.0, "step": 1808 }, { "epoch": 0.2301233939702328, "ewc_loss": 0.012170115485787392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2170115041953977e-05, "grad_norm": 11.50284481048584, "learning_rate": 7.664264518863926e-07, "loss": 0.481, "mean_token_accuracy": 0.8465561866760254, "num_tokens": 69152599.0, "step": 1809 }, { "epoch": 0.2302506042488233, "ewc_loss": 0.012136452831327915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2136452824051958e-05, "grad_norm": 11.559889793395996, "learning_rate": 7.668503603221704e-07, "loss": 0.5334, "mean_token_accuracy": 0.8271223306655884, "num_tokens": 69193157.0, "step": 1810 }, { "epoch": 0.23037781452741382, "ewc_loss": 0.01216805912554264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2168058674433269e-05, "grad_norm": 11.452980995178223, "learning_rate": 7.672742687579483e-07, "loss": 0.4457, "mean_token_accuracy": 0.8541589379310608, "num_tokens": 69230681.0, "step": 1811 }, { "epoch": 0.23050502480600432, "ewc_loss": 0.012144822627305984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2144822903792374e-05, "grad_norm": 11.535825729370117, "learning_rate": 7.676981771937261e-07, "loss": 0.4239, "mean_token_accuracy": 0.858272910118103, "num_tokens": 69261292.0, "step": 1812 }, { "epoch": 0.23063223508459482, "ewc_loss": 0.012213723734021187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2213723493914586e-05, "grad_norm": 11.527734756469727, "learning_rate": 7.681220856295039e-07, "loss": 0.4551, "mean_token_accuracy": 0.8534281849861145, "num_tokens": 69301795.0, "step": 1813 }, { "epoch": 0.23075944536318535, "ewc_loss": 0.012170812115073204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2170811714895535e-05, "grad_norm": 11.542118072509766, "learning_rate": 7.685459940652819e-07, "loss": 0.478, "mean_token_accuracy": 0.8441800475120544, "num_tokens": 69337038.0, "step": 1814 }, { "epoch": 0.23088665564177585, "ewc_loss": 0.012191874906420708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2191874702693895e-05, "grad_norm": 11.450959205627441, "learning_rate": 7.689699025010597e-07, "loss": 0.4756, "mean_token_accuracy": 0.8462185263633728, "num_tokens": 69379871.0, "step": 1815 }, { "epoch": 0.23101386592036638, "ewc_loss": 0.012202938087284565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.220293779624626e-05, "grad_norm": 11.57317066192627, "learning_rate": 7.693938109368376e-07, "loss": 0.4969, "mean_token_accuracy": 0.8390171527862549, "num_tokens": 69418021.0, "step": 1816 }, { "epoch": 0.23114107619895688, "ewc_loss": 0.01225489005446434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2254889952600934e-05, "grad_norm": 11.592658042907715, "learning_rate": 7.698177193726155e-07, "loss": 0.4746, "mean_token_accuracy": 0.8452825546264648, "num_tokens": 69455574.0, "step": 1817 }, { "epoch": 0.23126828647754738, "ewc_loss": 0.012224974110722542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2224973943375517e-05, "grad_norm": 11.555922508239746, "learning_rate": 7.702416278083933e-07, "loss": 0.499, "mean_token_accuracy": 0.840886116027832, "num_tokens": 69489822.0, "step": 1818 }, { "epoch": 0.2313954967561379, "ewc_loss": 0.01223418302834034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2234182577230968e-05, "grad_norm": 11.56700611114502, "learning_rate": 7.706655362441712e-07, "loss": 0.4702, "mean_token_accuracy": 0.8473623991012573, "num_tokens": 69532821.0, "step": 1819 }, { "epoch": 0.2315227070347284, "ewc_loss": 0.012237380258738995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2237380360602401e-05, "grad_norm": 11.574020385742188, "learning_rate": 7.710894446799491e-07, "loss": 0.4538, "mean_token_accuracy": 0.8513413667678833, "num_tokens": 69565674.0, "step": 1820 }, { "epoch": 0.2316499173133189, "ewc_loss": 0.012272483669221401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2272483218112029e-05, "grad_norm": 11.587913513183594, "learning_rate": 7.715133531157269e-07, "loss": 0.453, "mean_token_accuracy": 0.856295108795166, "num_tokens": 69606118.0, "step": 1821 }, { "epoch": 0.23177712759190944, "ewc_loss": 0.012276084162294865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2276083907636348e-05, "grad_norm": 11.573840141296387, "learning_rate": 7.719372615515049e-07, "loss": 0.5249, "mean_token_accuracy": 0.8331923484802246, "num_tokens": 69650102.0, "step": 1822 }, { "epoch": 0.23190433787049994, "ewc_loss": 0.012258635833859444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2258636161277536e-05, "grad_norm": 11.59312629699707, "learning_rate": 7.723611699872827e-07, "loss": 0.4699, "mean_token_accuracy": 0.8477238416671753, "num_tokens": 69693440.0, "step": 1823 }, { "epoch": 0.23203154814909044, "ewc_loss": 0.012261087074875832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2261087249498814e-05, "grad_norm": 11.560918807983398, "learning_rate": 7.727850784230606e-07, "loss": 0.4887, "mean_token_accuracy": 0.8387184739112854, "num_tokens": 69732651.0, "step": 1824 }, { "epoch": 0.23215875842768097, "ewc_loss": 0.01228141225874424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2281412637094036e-05, "grad_norm": 11.64747428894043, "learning_rate": 7.732089868588385e-07, "loss": 0.5032, "mean_token_accuracy": 0.8416160345077515, "num_tokens": 69769915.0, "step": 1825 }, { "epoch": 0.23228596870627147, "ewc_loss": 0.012299750931560993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2299750778765883e-05, "grad_norm": 11.528173446655273, "learning_rate": 7.736328952946163e-07, "loss": 0.4256, "mean_token_accuracy": 0.862841010093689, "num_tokens": 69813943.0, "step": 1826 }, { "epoch": 0.23241317898486197, "ewc_loss": 0.012267231941223145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2267231795703992e-05, "grad_norm": 11.653331756591797, "learning_rate": 7.740568037303942e-07, "loss": 0.4557, "mean_token_accuracy": 0.8500105142593384, "num_tokens": 69847039.0, "step": 1827 }, { "epoch": 0.2325403892634525, "ewc_loss": 0.012318208813667297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2318208973738365e-05, "grad_norm": 11.568612098693848, "learning_rate": 7.744807121661721e-07, "loss": 0.4211, "mean_token_accuracy": 0.8651862144470215, "num_tokens": 69884588.0, "step": 1828 }, { "epoch": 0.232667599542043, "ewc_loss": 0.012281938455998898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.228193832503166e-05, "grad_norm": 11.618677139282227, "learning_rate": 7.749046206019499e-07, "loss": 0.4851, "mean_token_accuracy": 0.8441789150238037, "num_tokens": 69925552.0, "step": 1829 }, { "epoch": 0.2327948098206335, "ewc_loss": 0.01231382880359888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2313828847254626e-05, "grad_norm": 11.602465629577637, "learning_rate": 7.753285290377279e-07, "loss": 0.4183, "mean_token_accuracy": 0.8630166053771973, "num_tokens": 69961776.0, "step": 1830 }, { "epoch": 0.23292202009922403, "ewc_loss": 0.01232551783323288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2325517673161812e-05, "grad_norm": 11.65878963470459, "learning_rate": 7.757524374735057e-07, "loss": 0.5251, "mean_token_accuracy": 0.8309548497200012, "num_tokens": 69999775.0, "step": 1831 }, { "epoch": 0.23304923037781453, "ewc_loss": 0.012335495091974735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2335494830040261e-05, "grad_norm": 11.599736213684082, "learning_rate": 7.761763459092836e-07, "loss": 0.4355, "mean_token_accuracy": 0.8614736199378967, "num_tokens": 70037294.0, "step": 1832 }, { "epoch": 0.23317644065640503, "ewc_loss": 0.012326451949775219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2326451724220533e-05, "grad_norm": 11.629446983337402, "learning_rate": 7.766002543450614e-07, "loss": 0.4722, "mean_token_accuracy": 0.8482538461685181, "num_tokens": 70075014.0, "step": 1833 }, { "epoch": 0.23330365093499555, "ewc_loss": 0.012361605651676655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.236160551343346e-05, "grad_norm": 11.667048454284668, "learning_rate": 7.770241627808392e-07, "loss": 0.4279, "mean_token_accuracy": 0.8620029091835022, "num_tokens": 70112186.0, "step": 1834 }, { "epoch": 0.23343086121358606, "ewc_loss": 0.012350283563137054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2350283213891089e-05, "grad_norm": 11.63790512084961, "learning_rate": 7.774480712166172e-07, "loss": 0.5065, "mean_token_accuracy": 0.8370102643966675, "num_tokens": 70153285.0, "step": 1835 }, { "epoch": 0.23355807149217656, "ewc_loss": 0.012348310090601444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2348310519882943e-05, "grad_norm": 11.660578727722168, "learning_rate": 7.77871979652395e-07, "loss": 0.488, "mean_token_accuracy": 0.8440637588500977, "num_tokens": 70193382.0, "step": 1836 }, { "epoch": 0.23368528177076708, "ewc_loss": 0.012372102588415146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2372102901281323e-05, "grad_norm": 11.636241912841797, "learning_rate": 7.782958880881729e-07, "loss": 0.4784, "mean_token_accuracy": 0.8460359573364258, "num_tokens": 70227279.0, "step": 1837 }, { "epoch": 0.23381249204935758, "ewc_loss": 0.01238061673939228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.238061668118462e-05, "grad_norm": 11.709064483642578, "learning_rate": 7.787197965239508e-07, "loss": 0.4401, "mean_token_accuracy": 0.8543820381164551, "num_tokens": 70264395.0, "step": 1838 }, { "epoch": 0.23393970232794808, "ewc_loss": 0.0123978853225708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2397885257087182e-05, "grad_norm": 11.69606876373291, "learning_rate": 7.791437049597287e-07, "loss": 0.5563, "mean_token_accuracy": 0.8237185478210449, "num_tokens": 70295016.0, "step": 1839 }, { "epoch": 0.2340669126065386, "ewc_loss": 0.012384699657559395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2384699402900878e-05, "grad_norm": 11.651540756225586, "learning_rate": 7.795676133955065e-07, "loss": 0.4996, "mean_token_accuracy": 0.8404430150985718, "num_tokens": 70332774.0, "step": 1840 }, { "epoch": 0.2341941228851291, "ewc_loss": 0.012409632094204426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2409632290655281e-05, "grad_norm": 11.657635688781738, "learning_rate": 7.799915218312844e-07, "loss": 0.484, "mean_token_accuracy": 0.8446564674377441, "num_tokens": 70366880.0, "step": 1841 }, { "epoch": 0.23432133316371964, "ewc_loss": 0.012428482063114643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2428482477844227e-05, "grad_norm": 11.681742668151855, "learning_rate": 7.804154302670622e-07, "loss": 0.4599, "mean_token_accuracy": 0.8514472246170044, "num_tokens": 70405235.0, "step": 1842 }, { "epoch": 0.23444854344231014, "ewc_loss": 0.012431158684194088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2431159120751545e-05, "grad_norm": 11.664671897888184, "learning_rate": 7.808393387028402e-07, "loss": 0.4697, "mean_token_accuracy": 0.8468689918518066, "num_tokens": 70446503.0, "step": 1843 }, { "epoch": 0.23457575372090064, "ewc_loss": 0.012439452111721039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.243945189344231e-05, "grad_norm": 11.672840118408203, "learning_rate": 7.81263247138618e-07, "loss": 0.52, "mean_token_accuracy": 0.832452654838562, "num_tokens": 70483514.0, "step": 1844 }, { "epoch": 0.23470296399949117, "ewc_loss": 0.012461595237255096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2461595360946376e-05, "grad_norm": 11.713607788085938, "learning_rate": 7.816871555743959e-07, "loss": 0.4817, "mean_token_accuracy": 0.8439074158668518, "num_tokens": 70520944.0, "step": 1845 }, { "epoch": 0.23483017427808167, "ewc_loss": 0.012439167127013206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2439167221600655e-05, "grad_norm": 11.647308349609375, "learning_rate": 7.821110640101738e-07, "loss": 0.5043, "mean_token_accuracy": 0.8363344669342041, "num_tokens": 70564399.0, "step": 1846 }, { "epoch": 0.23495738455667217, "ewc_loss": 0.01245193649083376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2451936527213547e-05, "grad_norm": 11.670485496520996, "learning_rate": 7.825349724459517e-07, "loss": 0.4777, "mean_token_accuracy": 0.8465208411216736, "num_tokens": 70604000.0, "step": 1847 }, { "epoch": 0.2350845948352627, "ewc_loss": 0.0124733941629529, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2473394235712476e-05, "grad_norm": 11.669500350952148, "learning_rate": 7.829588808817294e-07, "loss": 0.4705, "mean_token_accuracy": 0.8481160402297974, "num_tokens": 70645133.0, "step": 1848 }, { "epoch": 0.2352118051138532, "ewc_loss": 0.012461039237678051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2461039659683593e-05, "grad_norm": 11.660828590393066, "learning_rate": 7.833827893175074e-07, "loss": 0.5045, "mean_token_accuracy": 0.8325310945510864, "num_tokens": 70675824.0, "step": 1849 }, { "epoch": 0.2353390153924437, "ewc_loss": 0.012494408525526524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.249440811079694e-05, "grad_norm": 11.679319381713867, "learning_rate": 7.838066977532852e-07, "loss": 0.4674, "mean_token_accuracy": 0.8491721153259277, "num_tokens": 70714103.0, "step": 1850 }, { "epoch": 0.23546622567103423, "ewc_loss": 0.01250580232590437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.250580226042075e-05, "grad_norm": 11.688112258911133, "learning_rate": 7.842306061890632e-07, "loss": 0.5175, "mean_token_accuracy": 0.8360562324523926, "num_tokens": 70754265.0, "step": 1851 }, { "epoch": 0.23559343594962473, "ewc_loss": 0.01251341961324215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2513419278548099e-05, "grad_norm": 11.676060676574707, "learning_rate": 7.84654514624841e-07, "loss": 0.4842, "mean_token_accuracy": 0.841384768486023, "num_tokens": 70791368.0, "step": 1852 }, { "epoch": 0.23572064622821523, "ewc_loss": 0.012486852705478668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2486852938309312e-05, "grad_norm": 11.661371231079102, "learning_rate": 7.850784230606188e-07, "loss": 0.5297, "mean_token_accuracy": 0.833185076713562, "num_tokens": 70827397.0, "step": 1853 }, { "epoch": 0.23584785650680576, "ewc_loss": 0.012530232779681683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2530233107099775e-05, "grad_norm": 11.690608978271484, "learning_rate": 7.855023314963968e-07, "loss": 0.5003, "mean_token_accuracy": 0.8417885899543762, "num_tokens": 70872503.0, "step": 1854 }, { "epoch": 0.23597506678539626, "ewc_loss": 0.012558993883430958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.255899405805394e-05, "grad_norm": 11.677393913269043, "learning_rate": 7.859262399321746e-07, "loss": 0.4996, "mean_token_accuracy": 0.8379917144775391, "num_tokens": 70914376.0, "step": 1855 }, { "epoch": 0.23610227706398676, "ewc_loss": 0.01253579929471016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2535799214674626e-05, "grad_norm": 11.682008743286133, "learning_rate": 7.863501483679524e-07, "loss": 0.4564, "mean_token_accuracy": 0.8531503081321716, "num_tokens": 70954004.0, "step": 1856 }, { "epoch": 0.2362294873425773, "ewc_loss": 0.012557271867990494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2557271475088783e-05, "grad_norm": 11.732419967651367, "learning_rate": 7.867740568037303e-07, "loss": 0.4571, "mean_token_accuracy": 0.8491787910461426, "num_tokens": 70991765.0, "step": 1857 }, { "epoch": 0.2363566976211678, "ewc_loss": 0.012565038166940212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2565038559841923e-05, "grad_norm": 11.745306015014648, "learning_rate": 7.871979652395082e-07, "loss": 0.4863, "mean_token_accuracy": 0.8405678272247314, "num_tokens": 71025478.0, "step": 1858 }, { "epoch": 0.2364839078997583, "ewc_loss": 0.01257812138646841, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2578121641126927e-05, "grad_norm": 11.715812683105469, "learning_rate": 7.876218736752861e-07, "loss": 0.441, "mean_token_accuracy": 0.8591705560684204, "num_tokens": 71067287.0, "step": 1859 }, { "epoch": 0.23661111817834882, "ewc_loss": 0.012556609697639942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2556609362945892e-05, "grad_norm": 11.738237380981445, "learning_rate": 7.88045782111064e-07, "loss": 0.5108, "mean_token_accuracy": 0.8382138013839722, "num_tokens": 71108095.0, "step": 1860 }, { "epoch": 0.23673832845693932, "ewc_loss": 0.01258566603064537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2585665899678133e-05, "grad_norm": 11.73409366607666, "learning_rate": 7.884696905468418e-07, "loss": 0.4442, "mean_token_accuracy": 0.8578635454177856, "num_tokens": 71150297.0, "step": 1861 }, { "epoch": 0.23686553873552982, "ewc_loss": 0.012584106996655464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2584107025759295e-05, "grad_norm": 11.730156898498535, "learning_rate": 7.888935989826198e-07, "loss": 0.4917, "mean_token_accuracy": 0.8387565612792969, "num_tokens": 71189205.0, "step": 1862 }, { "epoch": 0.23699274901412035, "ewc_loss": 0.012587396427989006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2587396668095607e-05, "grad_norm": 11.783839225769043, "learning_rate": 7.893175074183976e-07, "loss": 0.4811, "mean_token_accuracy": 0.8438929319381714, "num_tokens": 71228141.0, "step": 1863 }, { "epoch": 0.23711995929271085, "ewc_loss": 0.012616131454706192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2616131243703421e-05, "grad_norm": 11.767593383789062, "learning_rate": 7.897414158541754e-07, "loss": 0.4818, "mean_token_accuracy": 0.845295786857605, "num_tokens": 71267854.0, "step": 1864 }, { "epoch": 0.23724716957130135, "ewc_loss": 0.01257410179823637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.257410167454509e-05, "grad_norm": 11.769550323486328, "learning_rate": 7.901653242899533e-07, "loss": 0.5168, "mean_token_accuracy": 0.8395932912826538, "num_tokens": 71305030.0, "step": 1865 }, { "epoch": 0.23737437984989188, "ewc_loss": 0.01259827334433794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.259827331523411e-05, "grad_norm": 11.730993270874023, "learning_rate": 7.905892327257312e-07, "loss": 0.472, "mean_token_accuracy": 0.8480026721954346, "num_tokens": 71342202.0, "step": 1866 }, { "epoch": 0.23750159012848238, "ewc_loss": 0.012580606155097485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.258060638065217e-05, "grad_norm": 11.78137493133545, "learning_rate": 7.910131411615091e-07, "loss": 0.4724, "mean_token_accuracy": 0.8503526449203491, "num_tokens": 71379702.0, "step": 1867 }, { "epoch": 0.2376288004070729, "ewc_loss": 0.012606021016836166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2606021300598513e-05, "grad_norm": 11.776863098144531, "learning_rate": 7.91437049597287e-07, "loss": 0.4747, "mean_token_accuracy": 0.8473943471908569, "num_tokens": 71417785.0, "step": 1868 }, { "epoch": 0.2377560106856634, "ewc_loss": 0.012590684927999973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2590684491442516e-05, "grad_norm": 11.800322532653809, "learning_rate": 7.918609580330648e-07, "loss": 0.518, "mean_token_accuracy": 0.8338843584060669, "num_tokens": 71461358.0, "step": 1869 }, { "epoch": 0.2378832209642539, "ewc_loss": 0.01259091217070818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.259091186511796e-05, "grad_norm": 11.75363540649414, "learning_rate": 7.922848664688428e-07, "loss": 0.515, "mean_token_accuracy": 0.8379940390586853, "num_tokens": 71500059.0, "step": 1870 }, { "epoch": 0.23801043124284443, "ewc_loss": 0.012598470784723759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2598470675584394e-05, "grad_norm": 11.824610710144043, "learning_rate": 7.927087749046205e-07, "loss": 0.426, "mean_token_accuracy": 0.8626359701156616, "num_tokens": 71537043.0, "step": 1871 }, { "epoch": 0.23813764152143493, "ewc_loss": 0.012614131905138493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2614132174348924e-05, "grad_norm": 11.782757759094238, "learning_rate": 7.931326833403983e-07, "loss": 0.4613, "mean_token_accuracy": 0.849163293838501, "num_tokens": 71570671.0, "step": 1872 }, { "epoch": 0.23826485180002543, "ewc_loss": 0.012602749280631542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2602748938661534e-05, "grad_norm": 11.782325744628906, "learning_rate": 7.935565917761763e-07, "loss": 0.4716, "mean_token_accuracy": 0.8486692905426025, "num_tokens": 71614813.0, "step": 1873 }, { "epoch": 0.23839206207861596, "ewc_loss": 0.012622815556824207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.262281512026675e-05, "grad_norm": 11.801219940185547, "learning_rate": 7.939805002119541e-07, "loss": 0.4869, "mean_token_accuracy": 0.8410029411315918, "num_tokens": 71650441.0, "step": 1874 }, { "epoch": 0.23851927235720646, "ewc_loss": 0.012647468596696854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.264746879314771e-05, "grad_norm": 11.79133129119873, "learning_rate": 7.944044086477321e-07, "loss": 0.4928, "mean_token_accuracy": 0.8424073457717896, "num_tokens": 71690425.0, "step": 1875 }, { "epoch": 0.23864648263579696, "ewc_loss": 0.01263880543410778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2638805856113322e-05, "grad_norm": 11.774648666381836, "learning_rate": 7.948283170835099e-07, "loss": 0.4405, "mean_token_accuracy": 0.8567514419555664, "num_tokens": 71727349.0, "step": 1876 }, { "epoch": 0.2387736929143875, "ewc_loss": 0.012697175145149231, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2697175407083705e-05, "grad_norm": 11.81020736694336, "learning_rate": 7.952522255192878e-07, "loss": 0.4059, "mean_token_accuracy": 0.8686563372612, "num_tokens": 71770386.0, "step": 1877 }, { "epoch": 0.238900903192978, "ewc_loss": 0.012662663124501705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2662662811635528e-05, "grad_norm": 11.810643196105957, "learning_rate": 7.956761339550657e-07, "loss": 0.4389, "mean_token_accuracy": 0.8582828044891357, "num_tokens": 71808946.0, "step": 1878 }, { "epoch": 0.2390281134715685, "ewc_loss": 0.012658638879656792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2658639207074884e-05, "grad_norm": 11.791154861450195, "learning_rate": 7.961000423908435e-07, "loss": 0.4796, "mean_token_accuracy": 0.851800799369812, "num_tokens": 71848605.0, "step": 1879 }, { "epoch": 0.23915532375015902, "ewc_loss": 0.012690726667642593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2690727089648135e-05, "grad_norm": 11.831435203552246, "learning_rate": 7.965239508266214e-07, "loss": 0.4581, "mean_token_accuracy": 0.8508530259132385, "num_tokens": 71886771.0, "step": 1880 }, { "epoch": 0.23928253402874952, "ewc_loss": 0.012668631039559841, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2668630915868562e-05, "grad_norm": 11.772930145263672, "learning_rate": 7.969478592623993e-07, "loss": 0.4352, "mean_token_accuracy": 0.8573909401893616, "num_tokens": 71924905.0, "step": 1881 }, { "epoch": 0.23940974430734002, "ewc_loss": 0.012669989839196205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.266998970095301e-05, "grad_norm": 11.83371353149414, "learning_rate": 7.973717676981771e-07, "loss": 0.4618, "mean_token_accuracy": 0.851680338382721, "num_tokens": 71964513.0, "step": 1882 }, { "epoch": 0.23953695458593055, "ewc_loss": 0.012682079337537289, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2682079614023678e-05, "grad_norm": 11.855463027954102, "learning_rate": 7.977956761339551e-07, "loss": 0.4318, "mean_token_accuracy": 0.8596727848052979, "num_tokens": 72000918.0, "step": 1883 }, { "epoch": 0.23966416486452105, "ewc_loss": 0.01268830243498087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2688302376773208e-05, "grad_norm": 11.81168270111084, "learning_rate": 7.982195845697329e-07, "loss": 0.5621, "mean_token_accuracy": 0.8208482265472412, "num_tokens": 72040840.0, "step": 1884 }, { "epoch": 0.23979137514311155, "ewc_loss": 0.012664873152971268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2664872883760836e-05, "grad_norm": 11.835641860961914, "learning_rate": 7.986434930055108e-07, "loss": 0.4349, "mean_token_accuracy": 0.856788694858551, "num_tokens": 72076757.0, "step": 1885 }, { "epoch": 0.23991858542170208, "ewc_loss": 0.012719285674393177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2719286132778507e-05, "grad_norm": 11.84221076965332, "learning_rate": 7.990674014412886e-07, "loss": 0.492, "mean_token_accuracy": 0.8446254730224609, "num_tokens": 72120760.0, "step": 1886 }, { "epoch": 0.24004579570029258, "ewc_loss": 0.012708759866654873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2708759641100187e-05, "grad_norm": 11.829183578491211, "learning_rate": 7.994913098770665e-07, "loss": 0.4784, "mean_token_accuracy": 0.844088613986969, "num_tokens": 72159985.0, "step": 1887 }, { "epoch": 0.24017300597888308, "ewc_loss": 0.012721403501927853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2721403436444234e-05, "grad_norm": 11.854864120483398, "learning_rate": 7.999152183128444e-07, "loss": 0.512, "mean_token_accuracy": 0.835132360458374, "num_tokens": 72196377.0, "step": 1888 }, { "epoch": 0.2403002162574736, "ewc_loss": 0.012725182808935642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.27251823869301e-05, "grad_norm": 11.782598495483398, "learning_rate": 8.003391267486223e-07, "loss": 0.4608, "mean_token_accuracy": 0.8515506982803345, "num_tokens": 72236585.0, "step": 1889 }, { "epoch": 0.2404274265360641, "ewc_loss": 0.012722979299724102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2722979590762407e-05, "grad_norm": 11.799784660339355, "learning_rate": 8.007630351844001e-07, "loss": 0.4735, "mean_token_accuracy": 0.8462462425231934, "num_tokens": 72268450.0, "step": 1890 }, { "epoch": 0.24055463681465464, "ewc_loss": 0.012737764976918697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2737765246129129e-05, "grad_norm": 11.842704772949219, "learning_rate": 8.011869436201781e-07, "loss": 0.4576, "mean_token_accuracy": 0.8518913984298706, "num_tokens": 72304092.0, "step": 1891 }, { "epoch": 0.24068184709324514, "ewc_loss": 0.012767643667757511, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2767643966071773e-05, "grad_norm": 11.763023376464844, "learning_rate": 8.016108520559559e-07, "loss": 0.4839, "mean_token_accuracy": 0.8427836298942566, "num_tokens": 72347312.0, "step": 1892 }, { "epoch": 0.24080905737183564, "ewc_loss": 0.012776450254023075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.277645060326904e-05, "grad_norm": 11.877752304077148, "learning_rate": 8.020347604917338e-07, "loss": 0.4465, "mean_token_accuracy": 0.8550742864608765, "num_tokens": 72382135.0, "step": 1893 }, { "epoch": 0.24093626765042617, "ewc_loss": 0.01283468957990408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2834689187002368e-05, "grad_norm": 11.86928939819336, "learning_rate": 8.024586689275116e-07, "loss": 0.4736, "mean_token_accuracy": 0.8480091691017151, "num_tokens": 72421526.0, "step": 1894 }, { "epoch": 0.24106347792901667, "ewc_loss": 0.012802733108401299, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2802733181160875e-05, "grad_norm": 11.8391695022583, "learning_rate": 8.028825773632894e-07, "loss": 0.4739, "mean_token_accuracy": 0.8489208817481995, "num_tokens": 72458592.0, "step": 1895 }, { "epoch": 0.24119068820760717, "ewc_loss": 0.012804443947970867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.280444394069491e-05, "grad_norm": 11.890799522399902, "learning_rate": 8.033064857990674e-07, "loss": 0.4055, "mean_token_accuracy": 0.8664870858192444, "num_tokens": 72494734.0, "step": 1896 }, { "epoch": 0.2413178984861977, "ewc_loss": 0.012816524133086205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2816523849323858e-05, "grad_norm": 11.806103706359863, "learning_rate": 8.037303942348452e-07, "loss": 0.4553, "mean_token_accuracy": 0.8536058068275452, "num_tokens": 72531560.0, "step": 1897 }, { "epoch": 0.2414451087647882, "ewc_loss": 0.012810859829187393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2810859516321216e-05, "grad_norm": 11.88453197479248, "learning_rate": 8.041543026706231e-07, "loss": 0.4537, "mean_token_accuracy": 0.8536015748977661, "num_tokens": 72570144.0, "step": 1898 }, { "epoch": 0.2415723190433787, "ewc_loss": 0.012862786650657654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2862787116318941e-05, "grad_norm": 11.847530364990234, "learning_rate": 8.04578211106401e-07, "loss": 0.4636, "mean_token_accuracy": 0.8477222919464111, "num_tokens": 72606738.0, "step": 1899 }, { "epoch": 0.24169952932196923, "ewc_loss": 0.0128342155367136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2834215340262745e-05, "grad_norm": 11.839522361755371, "learning_rate": 8.050021195421789e-07, "loss": 0.5072, "mean_token_accuracy": 0.837294340133667, "num_tokens": 72646230.0, "step": 1900 }, { "epoch": 0.24182673960055973, "ewc_loss": 0.012835226021707058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2835225788876414e-05, "grad_norm": 11.910049438476562, "learning_rate": 8.054260279779567e-07, "loss": 0.4759, "mean_token_accuracy": 0.8435872197151184, "num_tokens": 72681811.0, "step": 1901 }, { "epoch": 0.24195394987915023, "ewc_loss": 0.012885643169283867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2885642718174495e-05, "grad_norm": 11.797220230102539, "learning_rate": 8.058499364137346e-07, "loss": 0.5124, "mean_token_accuracy": 0.835808515548706, "num_tokens": 72721805.0, "step": 1902 }, { "epoch": 0.24208116015774075, "ewc_loss": 0.01285417191684246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2854171473009046e-05, "grad_norm": 11.868437767028809, "learning_rate": 8.062738448495124e-07, "loss": 0.425, "mean_token_accuracy": 0.8614009618759155, "num_tokens": 72763142.0, "step": 1903 }, { "epoch": 0.24220837043633126, "ewc_loss": 0.012922994792461395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2922994756081607e-05, "grad_norm": 11.829606056213379, "learning_rate": 8.066977532852904e-07, "loss": 0.4167, "mean_token_accuracy": 0.8682218790054321, "num_tokens": 72802047.0, "step": 1904 }, { "epoch": 0.24233558071492176, "ewc_loss": 0.012879536487162113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2879536370746791e-05, "grad_norm": 11.886667251586914, "learning_rate": 8.071216617210682e-07, "loss": 0.4954, "mean_token_accuracy": 0.8412469625473022, "num_tokens": 72834470.0, "step": 1905 }, { "epoch": 0.24246279099351228, "ewc_loss": 0.01292335893958807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2923358553962316e-05, "grad_norm": 11.82734489440918, "learning_rate": 8.075455701568461e-07, "loss": 0.4644, "mean_token_accuracy": 0.85108482837677, "num_tokens": 72873101.0, "step": 1906 }, { "epoch": 0.24259000127210278, "ewc_loss": 0.012904426082968712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2904426512250211e-05, "grad_norm": 11.857231140136719, "learning_rate": 8.07969478592624e-07, "loss": 0.4443, "mean_token_accuracy": 0.8594135046005249, "num_tokens": 72915504.0, "step": 1907 }, { "epoch": 0.24271721155069328, "ewc_loss": 0.012918485328555107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2918485481350217e-05, "grad_norm": 11.872148513793945, "learning_rate": 8.083933870284019e-07, "loss": 0.4183, "mean_token_accuracy": 0.8658091425895691, "num_tokens": 72947179.0, "step": 1908 }, { "epoch": 0.2428444218292838, "ewc_loss": 0.012950564734637737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2950565178471152e-05, "grad_norm": 11.951253890991211, "learning_rate": 8.088172954641796e-07, "loss": 0.459, "mean_token_accuracy": 0.8521853685379028, "num_tokens": 72983313.0, "step": 1909 }, { "epoch": 0.2429716321078743, "ewc_loss": 0.012953247874975204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2953248187841382e-05, "grad_norm": 11.898183822631836, "learning_rate": 8.092412038999576e-07, "loss": 0.5091, "mean_token_accuracy": 0.8372098207473755, "num_tokens": 73014218.0, "step": 1910 }, { "epoch": 0.24309884238646481, "ewc_loss": 0.012928832322359085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2928831893077586e-05, "grad_norm": 11.90000057220459, "learning_rate": 8.096651123357354e-07, "loss": 0.5187, "mean_token_accuracy": 0.8321608304977417, "num_tokens": 73048282.0, "step": 1911 }, { "epoch": 0.24322605266505534, "ewc_loss": 0.012961934320628643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2961934771738015e-05, "grad_norm": 11.853548049926758, "learning_rate": 8.100890207715134e-07, "loss": 0.5043, "mean_token_accuracy": 0.8395146131515503, "num_tokens": 73084390.0, "step": 1912 }, { "epoch": 0.24335326294364584, "ewc_loss": 0.012970131821930408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2970132047485095e-05, "grad_norm": 11.914061546325684, "learning_rate": 8.105129292072912e-07, "loss": 0.4717, "mean_token_accuracy": 0.8488998413085938, "num_tokens": 73123296.0, "step": 1913 }, { "epoch": 0.24348047322223634, "ewc_loss": 0.013014025054872036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3014025171287358e-05, "grad_norm": 11.971630096435547, "learning_rate": 8.10936837643069e-07, "loss": 0.5171, "mean_token_accuracy": 0.835808277130127, "num_tokens": 73157533.0, "step": 1914 }, { "epoch": 0.24360768350082687, "ewc_loss": 0.013004432432353497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3004432730667759e-05, "grad_norm": 11.879753112792969, "learning_rate": 8.11360746078847e-07, "loss": 0.4505, "mean_token_accuracy": 0.8537031412124634, "num_tokens": 73195314.0, "step": 1915 }, { "epoch": 0.24373489377941737, "ewc_loss": 0.012993362732231617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.2993362361157779e-05, "grad_norm": 11.869317054748535, "learning_rate": 8.117846545146248e-07, "loss": 0.4192, "mean_token_accuracy": 0.8620620965957642, "num_tokens": 73236763.0, "step": 1916 }, { "epoch": 0.2438621040580079, "ewc_loss": 0.013041645288467407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.30416456158855e-05, "grad_norm": 11.939310073852539, "learning_rate": 8.122085629504026e-07, "loss": 0.5082, "mean_token_accuracy": 0.8348221778869629, "num_tokens": 73276719.0, "step": 1917 }, { "epoch": 0.2439893143365984, "ewc_loss": 0.013056976720690727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3056976968073286e-05, "grad_norm": 11.868988990783691, "learning_rate": 8.126324713861805e-07, "loss": 0.4737, "mean_token_accuracy": 0.8498880863189697, "num_tokens": 73318594.0, "step": 1918 }, { "epoch": 0.2441165246151889, "ewc_loss": 0.013024562038481236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3024561667407397e-05, "grad_norm": 11.886700630187988, "learning_rate": 8.130563798219584e-07, "loss": 0.4482, "mean_token_accuracy": 0.857265830039978, "num_tokens": 73358760.0, "step": 1919 }, { "epoch": 0.24424373489377943, "ewc_loss": 0.01308212336152792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3082123587082606e-05, "grad_norm": 11.871085166931152, "learning_rate": 8.134802882577363e-07, "loss": 0.4919, "mean_token_accuracy": 0.8460448980331421, "num_tokens": 73398623.0, "step": 1920 }, { "epoch": 0.24437094517236993, "ewc_loss": 0.013061854988336563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3061854588158894e-05, "grad_norm": 11.988149642944336, "learning_rate": 8.139041966935142e-07, "loss": 0.4733, "mean_token_accuracy": 0.8494042158126831, "num_tokens": 73446369.0, "step": 1921 }, { "epoch": 0.24449815545096043, "ewc_loss": 0.013105560094118118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3105560356052592e-05, "grad_norm": 11.899585723876953, "learning_rate": 8.14328105129292e-07, "loss": 0.5338, "mean_token_accuracy": 0.8345806002616882, "num_tokens": 73483152.0, "step": 1922 }, { "epoch": 0.24462536572955096, "ewc_loss": 0.013067950494587421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3067950931144878e-05, "grad_norm": 11.965278625488281, "learning_rate": 8.1475201356507e-07, "loss": 0.488, "mean_token_accuracy": 0.8446956872940063, "num_tokens": 73519179.0, "step": 1923 }, { "epoch": 0.24475257600814146, "ewc_loss": 0.013114380650222301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3114380635670386e-05, "grad_norm": 11.867424964904785, "learning_rate": 8.151759220008477e-07, "loss": 0.4603, "mean_token_accuracy": 0.8501300811767578, "num_tokens": 73561947.0, "step": 1924 }, { "epoch": 0.24487978628673196, "ewc_loss": 0.013068335130810738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3068334737909026e-05, "grad_norm": 11.966448783874512, "learning_rate": 8.155998304366256e-07, "loss": 0.4753, "mean_token_accuracy": 0.8520728349685669, "num_tokens": 73595841.0, "step": 1925 }, { "epoch": 0.2450069965653225, "ewc_loss": 0.01312527060508728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3125270925229415e-05, "grad_norm": 11.982604026794434, "learning_rate": 8.160237388724035e-07, "loss": 0.5009, "mean_token_accuracy": 0.836991548538208, "num_tokens": 73629716.0, "step": 1926 }, { "epoch": 0.245134206843913, "ewc_loss": 0.013085459358990192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3085459613648709e-05, "grad_norm": 11.961519241333008, "learning_rate": 8.164476473081814e-07, "loss": 0.488, "mean_token_accuracy": 0.8433490991592407, "num_tokens": 73667285.0, "step": 1927 }, { "epoch": 0.2452614171225035, "ewc_loss": 0.013118579983711243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3118579772708472e-05, "grad_norm": 11.962017059326172, "learning_rate": 8.168715557439593e-07, "loss": 0.4325, "mean_token_accuracy": 0.8606914281845093, "num_tokens": 73708059.0, "step": 1928 }, { "epoch": 0.24538862740109402, "ewc_loss": 0.013091550208628178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3091550499666482e-05, "grad_norm": 11.909066200256348, "learning_rate": 8.172954641797372e-07, "loss": 0.4443, "mean_token_accuracy": 0.8549286127090454, "num_tokens": 73748553.0, "step": 1929 }, { "epoch": 0.24551583767968452, "ewc_loss": 0.013114014640450478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3114015018800274e-05, "grad_norm": 12.027928352355957, "learning_rate": 8.17719372615515e-07, "loss": 0.5449, "mean_token_accuracy": 0.8260848522186279, "num_tokens": 73796295.0, "step": 1930 }, { "epoch": 0.24564304795827502, "ewc_loss": 0.013118312694132328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.311831238126615e-05, "grad_norm": 11.941526412963867, "learning_rate": 8.18143281051293e-07, "loss": 0.525, "mean_token_accuracy": 0.8347783088684082, "num_tokens": 73835960.0, "step": 1931 }, { "epoch": 0.24577025823686555, "ewc_loss": 0.01309917587786913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3099175703246146e-05, "grad_norm": 11.983266830444336, "learning_rate": 8.185671894870707e-07, "loss": 0.5406, "mean_token_accuracy": 0.8284996747970581, "num_tokens": 73875302.0, "step": 1932 }, { "epoch": 0.24589746851545605, "ewc_loss": 0.013160379603505135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3160379239707254e-05, "grad_norm": 12.035028457641602, "learning_rate": 8.189910979228485e-07, "loss": 0.4315, "mean_token_accuracy": 0.8589519262313843, "num_tokens": 73907703.0, "step": 1933 }, { "epoch": 0.24602467879404655, "ewc_loss": 0.013138179667294025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3138179383531678e-05, "grad_norm": 12.012659072875977, "learning_rate": 8.194150063586265e-07, "loss": 0.5261, "mean_token_accuracy": 0.8344296813011169, "num_tokens": 73945449.0, "step": 1934 }, { "epoch": 0.24615188907263708, "ewc_loss": 0.01314372569322586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.314372548222309e-05, "grad_norm": 12.02491569519043, "learning_rate": 8.198389147944043e-07, "loss": 0.5141, "mean_token_accuracy": 0.8321437835693359, "num_tokens": 73982461.0, "step": 1935 }, { "epoch": 0.24627909935122758, "ewc_loss": 0.013121151365339756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3121150914230384e-05, "grad_norm": 11.942059516906738, "learning_rate": 8.202628232301823e-07, "loss": 0.4872, "mean_token_accuracy": 0.8484033346176147, "num_tokens": 74018252.0, "step": 1936 }, { "epoch": 0.24640630962981808, "ewc_loss": 0.01313647348433733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3136473171471152e-05, "grad_norm": 12.044960021972656, "learning_rate": 8.206867316659601e-07, "loss": 0.5077, "mean_token_accuracy": 0.835716962814331, "num_tokens": 74065843.0, "step": 1937 }, { "epoch": 0.2465335199084086, "ewc_loss": 0.0131679717451334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3167971701477654e-05, "grad_norm": 11.952363014221191, "learning_rate": 8.21110640101738e-07, "loss": 0.3981, "mean_token_accuracy": 0.867840588092804, "num_tokens": 74106389.0, "step": 1938 }, { "epoch": 0.2466607301869991, "ewc_loss": 0.01313558965921402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3135590052115731e-05, "grad_norm": 12.051657676696777, "learning_rate": 8.215345485375159e-07, "loss": 0.5182, "mean_token_accuracy": 0.835205078125, "num_tokens": 74150786.0, "step": 1939 }, { "epoch": 0.2467879404655896, "ewc_loss": 0.013194904662668705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3194904568081256e-05, "grad_norm": 12.093504905700684, "learning_rate": 8.219584569732937e-07, "loss": 0.4884, "mean_token_accuracy": 0.840093731880188, "num_tokens": 74190474.0, "step": 1940 }, { "epoch": 0.24691515074418013, "ewc_loss": 0.013145878911018372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3145879165676888e-05, "grad_norm": 12.009723663330078, "learning_rate": 8.223823654090715e-07, "loss": 0.4384, "mean_token_accuracy": 0.8579214811325073, "num_tokens": 74226437.0, "step": 1941 }, { "epoch": 0.24704236102277063, "ewc_loss": 0.013139386661350727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3139386283000931e-05, "grad_norm": 12.030317306518555, "learning_rate": 8.228062738448495e-07, "loss": 0.4673, "mean_token_accuracy": 0.8470085859298706, "num_tokens": 74267292.0, "step": 1942 }, { "epoch": 0.24716957130136116, "ewc_loss": 0.013181604444980621, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3181604117562529e-05, "grad_norm": 12.0780029296875, "learning_rate": 8.232301822806273e-07, "loss": 0.4923, "mean_token_accuracy": 0.8420701026916504, "num_tokens": 74306738.0, "step": 1943 }, { "epoch": 0.24729678157995166, "ewc_loss": 0.01316328439861536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3163284165784717e-05, "grad_norm": 12.026376724243164, "learning_rate": 8.236540907164053e-07, "loss": 0.4816, "mean_token_accuracy": 0.8478597402572632, "num_tokens": 74343151.0, "step": 1944 }, { "epoch": 0.24742399185854216, "ewc_loss": 0.013166997581720352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3166997632652055e-05, "grad_norm": 12.05552864074707, "learning_rate": 8.240779991521831e-07, "loss": 0.4587, "mean_token_accuracy": 0.8508179187774658, "num_tokens": 74382755.0, "step": 1945 }, { "epoch": 0.2475512021371327, "ewc_loss": 0.013175415806472301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3175415915611666e-05, "grad_norm": 12.004059791564941, "learning_rate": 8.24501907587961e-07, "loss": 0.4951, "mean_token_accuracy": 0.8412001132965088, "num_tokens": 74426129.0, "step": 1946 }, { "epoch": 0.2476784124157232, "ewc_loss": 0.013190013356506824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3190013305575121e-05, "grad_norm": 12.061578750610352, "learning_rate": 8.249258160237388e-07, "loss": 0.5067, "mean_token_accuracy": 0.8421053290367126, "num_tokens": 74460904.0, "step": 1947 }, { "epoch": 0.2478056226943137, "ewc_loss": 0.013216778635978699, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3216778825153597e-05, "grad_norm": 12.083060264587402, "learning_rate": 8.253497244595167e-07, "loss": 0.4954, "mean_token_accuracy": 0.842060923576355, "num_tokens": 74497993.0, "step": 1948 }, { "epoch": 0.24793283297290422, "ewc_loss": 0.013167830184102058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3167829820304178e-05, "grad_norm": 11.956731796264648, "learning_rate": 8.257736328952945e-07, "loss": 0.5129, "mean_token_accuracy": 0.8365620374679565, "num_tokens": 74541108.0, "step": 1949 }, { "epoch": 0.24806004325149472, "ewc_loss": 0.013225856237113476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3225856491771992e-05, "grad_norm": 12.075614929199219, "learning_rate": 8.261975413310725e-07, "loss": 0.4833, "mean_token_accuracy": 0.846144437789917, "num_tokens": 74579389.0, "step": 1950 }, { "epoch": 0.24818725353008522, "ewc_loss": 0.013275466859340668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.32754666992696e-05, "grad_norm": 12.016340255737305, "learning_rate": 8.266214497668503e-07, "loss": 0.4361, "mean_token_accuracy": 0.8621736764907837, "num_tokens": 74621546.0, "step": 1951 }, { "epoch": 0.24831446380867575, "ewc_loss": 0.013202753849327564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3202753507357556e-05, "grad_norm": 12.035871505737305, "learning_rate": 8.270453582026283e-07, "loss": 0.5169, "mean_token_accuracy": 0.832760214805603, "num_tokens": 74660178.0, "step": 1952 }, { "epoch": 0.24844167408726625, "ewc_loss": 0.013287854380905628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3287854017107747e-05, "grad_norm": 12.128716468811035, "learning_rate": 8.274692666384061e-07, "loss": 0.4631, "mean_token_accuracy": 0.8508771061897278, "num_tokens": 74693867.0, "step": 1953 }, { "epoch": 0.24856888436585675, "ewc_loss": 0.013273987919092178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3273987860884517e-05, "grad_norm": 12.03231430053711, "learning_rate": 8.27893175074184e-07, "loss": 0.4765, "mean_token_accuracy": 0.8452613353729248, "num_tokens": 74727561.0, "step": 1954 }, { "epoch": 0.24869609464444728, "ewc_loss": 0.013234340585768223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.323434025835013e-05, "grad_norm": 12.110921859741211, "learning_rate": 8.283170835099618e-07, "loss": 0.4785, "mean_token_accuracy": 0.8472462892532349, "num_tokens": 74763719.0, "step": 1955 }, { "epoch": 0.24882330492303778, "ewc_loss": 0.013288892805576324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3288892660057172e-05, "grad_norm": 12.035872459411621, "learning_rate": 8.287409919457396e-07, "loss": 0.4431, "mean_token_accuracy": 0.8560314178466797, "num_tokens": 74802666.0, "step": 1956 }, { "epoch": 0.24895051520162828, "ewc_loss": 0.013246686197817326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3246685739431996e-05, "grad_norm": 12.035141944885254, "learning_rate": 8.291649003815175e-07, "loss": 0.4933, "mean_token_accuracy": 0.8423302173614502, "num_tokens": 74840307.0, "step": 1957 }, { "epoch": 0.2490777254802188, "ewc_loss": 0.013341555371880531, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.334155513177393e-05, "grad_norm": 12.114551544189453, "learning_rate": 8.295888088172954e-07, "loss": 0.5046, "mean_token_accuracy": 0.840674638748169, "num_tokens": 74880181.0, "step": 1958 }, { "epoch": 0.2492049357588093, "ewc_loss": 0.013306397944688797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3306397704582196e-05, "grad_norm": 12.024083137512207, "learning_rate": 8.300127172530733e-07, "loss": 0.4132, "mean_token_accuracy": 0.8636617064476013, "num_tokens": 74917148.0, "step": 1959 }, { "epoch": 0.2493321460373998, "ewc_loss": 0.01330680213868618, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3306802429724485e-05, "grad_norm": 12.084453582763672, "learning_rate": 8.304366256888512e-07, "loss": 0.4497, "mean_token_accuracy": 0.8533236980438232, "num_tokens": 74953124.0, "step": 1960 }, { "epoch": 0.24945935631599034, "ewc_loss": 0.013355187140405178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3355187547858804e-05, "grad_norm": 12.160188674926758, "learning_rate": 8.308605341246291e-07, "loss": 0.4639, "mean_token_accuracy": 0.8497527837753296, "num_tokens": 74989726.0, "step": 1961 }, { "epoch": 0.24958656659458084, "ewc_loss": 0.013342603109776974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3342602869670372e-05, "grad_norm": 12.125385284423828, "learning_rate": 8.312844425604068e-07, "loss": 0.4651, "mean_token_accuracy": 0.8559634685516357, "num_tokens": 75024849.0, "step": 1962 }, { "epoch": 0.24971377687317134, "ewc_loss": 0.013325928710401058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.332592910330277e-05, "grad_norm": 12.042869567871094, "learning_rate": 8.317083509961848e-07, "loss": 0.4929, "mean_token_accuracy": 0.8444492816925049, "num_tokens": 75066464.0, "step": 1963 }, { "epoch": 0.24984098715176187, "ewc_loss": 0.01336654182523489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3366541679715738e-05, "grad_norm": 12.201502799987793, "learning_rate": 8.321322594319626e-07, "loss": 0.4447, "mean_token_accuracy": 0.8553497791290283, "num_tokens": 75106829.0, "step": 1964 }, { "epoch": 0.24996819743035237, "ewc_loss": 0.013379271142184734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3379270967561752e-05, "grad_norm": 12.132375717163086, "learning_rate": 8.325561678677405e-07, "loss": 0.4976, "mean_token_accuracy": 0.8414309620857239, "num_tokens": 75148227.0, "step": 1965 }, { "epoch": 0.2500954077089429, "ewc_loss": 0.013344562612473965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3344562830752693e-05, "grad_norm": 12.120616912841797, "learning_rate": 8.329800763035184e-07, "loss": 0.5116, "mean_token_accuracy": 0.8349983096122742, "num_tokens": 75188719.0, "step": 1966 }, { "epoch": 0.25022261798753337, "ewc_loss": 0.013367339968681335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3367340216063894e-05, "grad_norm": 12.23047161102295, "learning_rate": 8.334039847392963e-07, "loss": 0.4623, "mean_token_accuracy": 0.8487727046012878, "num_tokens": 75222260.0, "step": 1967 }, { "epoch": 0.2503498282661239, "ewc_loss": 0.01338085625320673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3380856216826942e-05, "grad_norm": 12.063455581665039, "learning_rate": 8.338278931750742e-07, "loss": 0.4367, "mean_token_accuracy": 0.8585834503173828, "num_tokens": 75255489.0, "step": 1968 }, { "epoch": 0.2504770385447144, "ewc_loss": 0.013345147483050823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3345147635845933e-05, "grad_norm": 12.124614715576172, "learning_rate": 8.342518016108521e-07, "loss": 0.4634, "mean_token_accuracy": 0.8463515043258667, "num_tokens": 75290548.0, "step": 1969 }, { "epoch": 0.2506042488233049, "ewc_loss": 0.013424897566437721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3424897588265594e-05, "grad_norm": 12.224324226379395, "learning_rate": 8.346757100466298e-07, "loss": 0.4637, "mean_token_accuracy": 0.8517095446586609, "num_tokens": 75323912.0, "step": 1970 }, { "epoch": 0.2507314591018954, "ewc_loss": 0.013415808789432049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3415809007710777e-05, "grad_norm": 12.088432312011719, "learning_rate": 8.350996184824078e-07, "loss": 0.505, "mean_token_accuracy": 0.8357679843902588, "num_tokens": 75361206.0, "step": 1971 }, { "epoch": 0.25085866938048595, "ewc_loss": 0.013405718840658665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3405719073489308e-05, "grad_norm": 12.13259220123291, "learning_rate": 8.355235269181856e-07, "loss": 0.4858, "mean_token_accuracy": 0.8435503244400024, "num_tokens": 75397292.0, "step": 1972 }, { "epoch": 0.2509858796590764, "ewc_loss": 0.013440740294754505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3440740076475777e-05, "grad_norm": 12.15066909790039, "learning_rate": 8.359474353539635e-07, "loss": 0.4435, "mean_token_accuracy": 0.8572182655334473, "num_tokens": 75437616.0, "step": 1973 }, { "epoch": 0.25111308993766696, "ewc_loss": 0.013453678227961063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3453678548103198e-05, "grad_norm": 12.199966430664062, "learning_rate": 8.363713437897414e-07, "loss": 0.5351, "mean_token_accuracy": 0.8305422067642212, "num_tokens": 75486172.0, "step": 1974 }, { "epoch": 0.2512403002162575, "ewc_loss": 0.01345413364470005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3454133295454085e-05, "grad_norm": 12.172826766967773, "learning_rate": 8.367952522255193e-07, "loss": 0.4694, "mean_token_accuracy": 0.8448318243026733, "num_tokens": 75526637.0, "step": 1975 }, { "epoch": 0.25136751049484796, "ewc_loss": 0.013406993821263313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3406994185061194e-05, "grad_norm": 12.08626937866211, "learning_rate": 8.372191606612972e-07, "loss": 0.4709, "mean_token_accuracy": 0.8498826026916504, "num_tokens": 75564632.0, "step": 1976 }, { "epoch": 0.2514947207734385, "ewc_loss": 0.013470345176756382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3470345038513187e-05, "grad_norm": 12.242023468017578, "learning_rate": 8.376430690970749e-07, "loss": 0.4389, "mean_token_accuracy": 0.8589217662811279, "num_tokens": 75595553.0, "step": 1977 }, { "epoch": 0.251621931052029, "ewc_loss": 0.0134885273873806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3488527656591032e-05, "grad_norm": 12.096843719482422, "learning_rate": 8.380669775328528e-07, "loss": 0.4332, "mean_token_accuracy": 0.861548900604248, "num_tokens": 75635847.0, "step": 1978 }, { "epoch": 0.25174914133061954, "ewc_loss": 0.013456505723297596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.345650616713101e-05, "grad_norm": 12.229851722717285, "learning_rate": 8.384908859686307e-07, "loss": 0.4314, "mean_token_accuracy": 0.8615484833717346, "num_tokens": 75677403.0, "step": 1979 }, { "epoch": 0.25187635160921, "ewc_loss": 0.013522396795451641, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3522397239285056e-05, "grad_norm": 12.151074409484863, "learning_rate": 8.389147944044086e-07, "loss": 0.4731, "mean_token_accuracy": 0.846799373626709, "num_tokens": 75720703.0, "step": 1980 }, { "epoch": 0.25200356188780054, "ewc_loss": 0.013450855389237404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3450855476548895e-05, "grad_norm": 12.198342323303223, "learning_rate": 8.393387028401864e-07, "loss": 0.4968, "mean_token_accuracy": 0.8412562012672424, "num_tokens": 75758000.0, "step": 1981 }, { "epoch": 0.25213077216639107, "ewc_loss": 0.013520478270947933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3520478205464315e-05, "grad_norm": 12.168485641479492, "learning_rate": 8.397626112759644e-07, "loss": 0.5244, "mean_token_accuracy": 0.8333500027656555, "num_tokens": 75791113.0, "step": 1982 }, { "epoch": 0.25225798244498154, "ewc_loss": 0.013495361432433128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3495361599780153e-05, "grad_norm": 12.23773193359375, "learning_rate": 8.401865197117422e-07, "loss": 0.4704, "mean_token_accuracy": 0.8494448065757751, "num_tokens": 75821733.0, "step": 1983 }, { "epoch": 0.25238519272357207, "ewc_loss": 0.013516462408006191, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3516462786355987e-05, "grad_norm": 12.040870666503906, "learning_rate": 8.406104281475202e-07, "loss": 0.4475, "mean_token_accuracy": 0.8540909290313721, "num_tokens": 75861404.0, "step": 1984 }, { "epoch": 0.2525124030021626, "ewc_loss": 0.013466731645166874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3466731616063043e-05, "grad_norm": 12.161377906799316, "learning_rate": 8.410343365832979e-07, "loss": 0.4483, "mean_token_accuracy": 0.8544606566429138, "num_tokens": 75903059.0, "step": 1985 }, { "epoch": 0.2526396132807531, "ewc_loss": 0.013552747666835785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.355274798697792e-05, "grad_norm": 12.098591804504395, "learning_rate": 8.414582450190758e-07, "loss": 0.4608, "mean_token_accuracy": 0.8519070148468018, "num_tokens": 75947125.0, "step": 1986 }, { "epoch": 0.2527668235593436, "ewc_loss": 0.013494526036083698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3494525774149224e-05, "grad_norm": 12.15963077545166, "learning_rate": 8.418821534548537e-07, "loss": 0.5455, "mean_token_accuracy": 0.8273711800575256, "num_tokens": 75987519.0, "step": 1987 }, { "epoch": 0.25289403383793413, "ewc_loss": 0.013553974218666553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.355397398583591e-05, "grad_norm": 12.148017883300781, "learning_rate": 8.423060618906316e-07, "loss": 0.4721, "mean_token_accuracy": 0.8485779166221619, "num_tokens": 76027728.0, "step": 1988 }, { "epoch": 0.2530212441165246, "ewc_loss": 0.013508782722055912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3508783013094217e-05, "grad_norm": 12.122323989868164, "learning_rate": 8.427299703264095e-07, "loss": 0.4705, "mean_token_accuracy": 0.8503372669219971, "num_tokens": 76070942.0, "step": 1989 }, { "epoch": 0.25314845439511513, "ewc_loss": 0.013567972928285599, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3567972928285599e-05, "grad_norm": 12.169004440307617, "learning_rate": 8.431538787621874e-07, "loss": 0.4962, "mean_token_accuracy": 0.842050313949585, "num_tokens": 76113330.0, "step": 1990 }, { "epoch": 0.25327566467370566, "ewc_loss": 0.013538096100091934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3538096027332358e-05, "grad_norm": 12.155852317810059, "learning_rate": 8.435777871979652e-07, "loss": 0.4488, "mean_token_accuracy": 0.8556441068649292, "num_tokens": 76153511.0, "step": 1991 }, { "epoch": 0.25340287495229613, "ewc_loss": 0.013570141047239304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3570141163654625e-05, "grad_norm": 12.189703941345215, "learning_rate": 8.440016956337432e-07, "loss": 0.474, "mean_token_accuracy": 0.8491147756576538, "num_tokens": 76191505.0, "step": 1992 }, { "epoch": 0.25353008523088666, "ewc_loss": 0.013558574952185154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3558575119532179e-05, "grad_norm": 12.201952934265137, "learning_rate": 8.444256040695209e-07, "loss": 0.49, "mean_token_accuracy": 0.8449344635009766, "num_tokens": 76232479.0, "step": 1993 }, { "epoch": 0.2536572955094772, "ewc_loss": 0.013579977676272392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.35799773488543e-05, "grad_norm": 12.273432731628418, "learning_rate": 8.448495125052988e-07, "loss": 0.4842, "mean_token_accuracy": 0.8452047109603882, "num_tokens": 76270192.0, "step": 1994 }, { "epoch": 0.25378450578806766, "ewc_loss": 0.013571596704423428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3571596355177462e-05, "grad_norm": 12.197848320007324, "learning_rate": 8.452734209410767e-07, "loss": 0.4242, "mean_token_accuracy": 0.863332211971283, "num_tokens": 76303981.0, "step": 1995 }, { "epoch": 0.2539117160666582, "ewc_loss": 0.013539438135921955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3539438441512175e-05, "grad_norm": 12.23548698425293, "learning_rate": 8.456973293768545e-07, "loss": 0.4488, "mean_token_accuracy": 0.8540077209472656, "num_tokens": 76338496.0, "step": 1996 }, { "epoch": 0.2540389263452487, "ewc_loss": 0.013601107522845268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.360110763926059e-05, "grad_norm": 12.253417015075684, "learning_rate": 8.461212378126325e-07, "loss": 0.5028, "mean_token_accuracy": 0.8399996757507324, "num_tokens": 76377904.0, "step": 1997 }, { "epoch": 0.2541661366238392, "ewc_loss": 0.013561706990003586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3561707419285085e-05, "grad_norm": 12.201644897460938, "learning_rate": 8.465451462484103e-07, "loss": 0.5257, "mean_token_accuracy": 0.83392333984375, "num_tokens": 76413905.0, "step": 1998 }, { "epoch": 0.2542933469024297, "ewc_loss": 0.013591967523097992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3591967217507772e-05, "grad_norm": 12.230923652648926, "learning_rate": 8.469690546841882e-07, "loss": 0.4749, "mean_token_accuracy": 0.8469228744506836, "num_tokens": 76451591.0, "step": 1999 }, { "epoch": 0.25442055718102025, "ewc_loss": 0.01361511368304491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3615113857667893e-05, "grad_norm": 12.204500198364258, "learning_rate": 8.47392963119966e-07, "loss": 0.4548, "mean_token_accuracy": 0.8530957102775574, "num_tokens": 76486191.0, "step": 2000 }, { "epoch": 0.2545477674596107, "ewc_loss": 0.013628937304019928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.362893726764014e-05, "grad_norm": 12.263635635375977, "learning_rate": 8.478168715557439e-07, "loss": 0.4742, "mean_token_accuracy": 0.8497443795204163, "num_tokens": 76523317.0, "step": 2001 }, { "epoch": 0.25467497773820125, "ewc_loss": 0.013642044737935066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3642044905282091e-05, "grad_norm": 12.195551872253418, "learning_rate": 8.482407799915217e-07, "loss": 0.4807, "mean_token_accuracy": 0.845645546913147, "num_tokens": 76562661.0, "step": 2002 }, { "epoch": 0.2548021880167918, "ewc_loss": 0.013625865802168846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3625865904032253e-05, "grad_norm": 12.253323554992676, "learning_rate": 8.486646884272997e-07, "loss": 0.475, "mean_token_accuracy": 0.8452749848365784, "num_tokens": 76600914.0, "step": 2003 }, { "epoch": 0.25492939829538225, "ewc_loss": 0.013662188313901424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3662188393936958e-05, "grad_norm": 12.219103813171387, "learning_rate": 8.490885968630775e-07, "loss": 0.4952, "mean_token_accuracy": 0.8400648832321167, "num_tokens": 76636310.0, "step": 2004 }, { "epoch": 0.2550566085739728, "ewc_loss": 0.013664137572050095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3664137441082858e-05, "grad_norm": 12.259051322937012, "learning_rate": 8.495125052988555e-07, "loss": 0.4796, "mean_token_accuracy": 0.8486875891685486, "num_tokens": 76680889.0, "step": 2005 }, { "epoch": 0.2551838188525633, "ewc_loss": 0.013683455064892769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3683455108548515e-05, "grad_norm": 12.19045352935791, "learning_rate": 8.499364137346333e-07, "loss": 0.4723, "mean_token_accuracy": 0.852568507194519, "num_tokens": 76722469.0, "step": 2006 }, { "epoch": 0.2553110291311538, "ewc_loss": 0.013659107498824596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3659107935382053e-05, "grad_norm": 12.267535209655762, "learning_rate": 8.503603221704112e-07, "loss": 0.5082, "mean_token_accuracy": 0.837643027305603, "num_tokens": 76757668.0, "step": 2007 }, { "epoch": 0.2554382394097443, "ewc_loss": 0.013720154762268066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3720155038754456e-05, "grad_norm": 12.192365646362305, "learning_rate": 8.50784230606189e-07, "loss": 0.4687, "mean_token_accuracy": 0.8501105904579163, "num_tokens": 76795607.0, "step": 2008 }, { "epoch": 0.25556544968833483, "ewc_loss": 0.013702754862606525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3702754586120136e-05, "grad_norm": 12.28986644744873, "learning_rate": 8.512081390419669e-07, "loss": 0.4979, "mean_token_accuracy": 0.8486766219139099, "num_tokens": 76837161.0, "step": 2009 }, { "epoch": 0.2556926599669253, "ewc_loss": 0.013748583383858204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3748583114647772e-05, "grad_norm": 12.21728801727295, "learning_rate": 8.516320474777447e-07, "loss": 0.4762, "mean_token_accuracy": 0.8480059504508972, "num_tokens": 76872198.0, "step": 2010 }, { "epoch": 0.25581987024551583, "ewc_loss": 0.013709116727113724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3709116501559038e-05, "grad_norm": 12.230307579040527, "learning_rate": 8.520559559135227e-07, "loss": 0.442, "mean_token_accuracy": 0.8604609966278076, "num_tokens": 76908976.0, "step": 2011 }, { "epoch": 0.25594708052410636, "ewc_loss": 0.013764253817498684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.376425370835932e-05, "grad_norm": 12.292516708374023, "learning_rate": 8.524798643493005e-07, "loss": 0.4933, "mean_token_accuracy": 0.8465037941932678, "num_tokens": 76947298.0, "step": 2012 }, { "epoch": 0.25607429080269684, "ewc_loss": 0.0137667590752244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3766759366262704e-05, "grad_norm": 12.200885772705078, "learning_rate": 8.529037727850785e-07, "loss": 0.4488, "mean_token_accuracy": 0.8534867763519287, "num_tokens": 76985112.0, "step": 2013 }, { "epoch": 0.25620150108128736, "ewc_loss": 0.013752619735896587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.375261945213424e-05, "grad_norm": 12.269926071166992, "learning_rate": 8.533276812208563e-07, "loss": 0.4218, "mean_token_accuracy": 0.863330602645874, "num_tokens": 77020309.0, "step": 2014 }, { "epoch": 0.2563287113598779, "ewc_loss": 0.01381661742925644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3816617865813896e-05, "grad_norm": 12.229564666748047, "learning_rate": 8.53751589656634e-07, "loss": 0.4624, "mean_token_accuracy": 0.8490040302276611, "num_tokens": 77063245.0, "step": 2015 }, { "epoch": 0.25645592163846836, "ewc_loss": 0.013818595558404922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.381859510729555e-05, "grad_norm": 12.31717586517334, "learning_rate": 8.54175498092412e-07, "loss": 0.4593, "mean_token_accuracy": 0.8519405722618103, "num_tokens": 77096791.0, "step": 2016 }, { "epoch": 0.2565831319170589, "ewc_loss": 0.013822421431541443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3822421351505909e-05, "grad_norm": 12.263118743896484, "learning_rate": 8.545994065281898e-07, "loss": 0.4379, "mean_token_accuracy": 0.8595253825187683, "num_tokens": 77129281.0, "step": 2017 }, { "epoch": 0.2567103421956494, "ewc_loss": 0.013842943124473095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3842943189956713e-05, "grad_norm": 12.323362350463867, "learning_rate": 8.550233149639677e-07, "loss": 0.526, "mean_token_accuracy": 0.8301554322242737, "num_tokens": 77168785.0, "step": 2018 }, { "epoch": 0.2568375524742399, "ewc_loss": 0.013864049687981606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3864049833500758e-05, "grad_norm": 12.236516952514648, "learning_rate": 8.554472233997456e-07, "loss": 0.4934, "mean_token_accuracy": 0.8393276333808899, "num_tokens": 77208360.0, "step": 2019 }, { "epoch": 0.2569647627528304, "ewc_loss": 0.013804083690047264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3804084119328763e-05, "grad_norm": 12.309318542480469, "learning_rate": 8.558711318355235e-07, "loss": 0.5261, "mean_token_accuracy": 0.8309464454650879, "num_tokens": 77245539.0, "step": 2020 }, { "epoch": 0.25709197303142095, "ewc_loss": 0.013862458989024162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3862459127267357e-05, "grad_norm": 12.267396926879883, "learning_rate": 8.562950402713014e-07, "loss": 0.4288, "mean_token_accuracy": 0.8672081232070923, "num_tokens": 77279118.0, "step": 2021 }, { "epoch": 0.2572191833100114, "ewc_loss": 0.013861853629350662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3861853403795976e-05, "grad_norm": 12.414143562316895, "learning_rate": 8.567189487070793e-07, "loss": 0.5166, "mean_token_accuracy": 0.8376284837722778, "num_tokens": 77316259.0, "step": 2022 }, { "epoch": 0.25734639358860195, "ewc_loss": 0.01389244757592678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3892447896068916e-05, "grad_norm": 12.21265983581543, "learning_rate": 8.57142857142857e-07, "loss": 0.5345, "mean_token_accuracy": 0.8307534456253052, "num_tokens": 77359712.0, "step": 2023 }, { "epoch": 0.2574736038671925, "ewc_loss": 0.013856293633580208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3856293662684038e-05, "grad_norm": 12.28869915008545, "learning_rate": 8.57566765578635e-07, "loss": 0.437, "mean_token_accuracy": 0.8617696166038513, "num_tokens": 77402269.0, "step": 2024 }, { "epoch": 0.25760081414578295, "ewc_loss": 0.01393820345401764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.393820366502041e-05, "grad_norm": 12.308343887329102, "learning_rate": 8.579906740144128e-07, "loss": 0.4653, "mean_token_accuracy": 0.8517146110534668, "num_tokens": 77445791.0, "step": 2025 }, { "epoch": 0.2577280244243735, "ewc_loss": 0.01387636736035347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3876367120246869e-05, "grad_norm": 12.291706085205078, "learning_rate": 8.584145824501907e-07, "loss": 0.4984, "mean_token_accuracy": 0.8388892412185669, "num_tokens": 77485376.0, "step": 2026 }, { "epoch": 0.257855234702964, "ewc_loss": 0.013926568441092968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3926568499300629e-05, "grad_norm": 12.352724075317383, "learning_rate": 8.588384908859686e-07, "loss": 0.4957, "mean_token_accuracy": 0.8442597985267639, "num_tokens": 77524344.0, "step": 2027 }, { "epoch": 0.25798244498155454, "ewc_loss": 0.013897082768380642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.389708268106915e-05, "grad_norm": 12.32453727722168, "learning_rate": 8.592623993217465e-07, "loss": 0.4893, "mean_token_accuracy": 0.8451278209686279, "num_tokens": 77562746.0, "step": 2028 }, { "epoch": 0.258109655260145, "ewc_loss": 0.013916675932705402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3916675925429445e-05, "grad_norm": 12.362007141113281, "learning_rate": 8.596863077575244e-07, "loss": 0.5118, "mean_token_accuracy": 0.838961124420166, "num_tokens": 77601261.0, "step": 2029 }, { "epoch": 0.25823686553873554, "ewc_loss": 0.01390567421913147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3905673768022098e-05, "grad_norm": 12.311685562133789, "learning_rate": 8.601102161933023e-07, "loss": 0.4925, "mean_token_accuracy": 0.8431625366210938, "num_tokens": 77640054.0, "step": 2030 }, { "epoch": 0.25836407581732607, "ewc_loss": 0.013924644328653812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3924644008511677e-05, "grad_norm": 12.301393508911133, "learning_rate": 8.6053412462908e-07, "loss": 0.4669, "mean_token_accuracy": 0.8510668277740479, "num_tokens": 77680098.0, "step": 2031 }, { "epoch": 0.25849128609591654, "ewc_loss": 0.013946374878287315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3946374565421138e-05, "grad_norm": 12.483000755310059, "learning_rate": 8.60958033064858e-07, "loss": 0.4606, "mean_token_accuracy": 0.8536473512649536, "num_tokens": 77712517.0, "step": 2032 }, { "epoch": 0.25861849637450707, "ewc_loss": 0.013949803076684475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3949803360446822e-05, "grad_norm": 12.359227180480957, "learning_rate": 8.613819415006358e-07, "loss": 0.5227, "mean_token_accuracy": 0.8388880491256714, "num_tokens": 77751335.0, "step": 2033 }, { "epoch": 0.2587457066530976, "ewc_loss": 0.013914725743234158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3914725968788844e-05, "grad_norm": 12.473104476928711, "learning_rate": 8.618058499364137e-07, "loss": 0.5091, "mean_token_accuracy": 0.8373361825942993, "num_tokens": 77792300.0, "step": 2034 }, { "epoch": 0.25887291693168807, "ewc_loss": 0.013954401947557926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3954401765658986e-05, "grad_norm": 12.326900482177734, "learning_rate": 8.622297583721916e-07, "loss": 0.4676, "mean_token_accuracy": 0.8530761003494263, "num_tokens": 77836391.0, "step": 2035 }, { "epoch": 0.2590001272102786, "ewc_loss": 0.013887225650250912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3887225577491336e-05, "grad_norm": 12.341496467590332, "learning_rate": 8.626536668079695e-07, "loss": 0.4789, "mean_token_accuracy": 0.8445723056793213, "num_tokens": 77871600.0, "step": 2036 }, { "epoch": 0.2591273374888691, "ewc_loss": 0.013981418684124947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.398141830577515e-05, "grad_norm": 12.439516067504883, "learning_rate": 8.630775752437474e-07, "loss": 0.4676, "mean_token_accuracy": 0.8513150215148926, "num_tokens": 77904623.0, "step": 2037 }, { "epoch": 0.2592545477674596, "ewc_loss": 0.013944526202976704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3944526472187135e-05, "grad_norm": 12.356438636779785, "learning_rate": 8.635014836795251e-07, "loss": 0.5057, "mean_token_accuracy": 0.8399879336357117, "num_tokens": 77941145.0, "step": 2038 }, { "epoch": 0.2593817580460501, "ewc_loss": 0.013937460258603096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3937460607849061e-05, "grad_norm": 12.34203052520752, "learning_rate": 8.63925392115303e-07, "loss": 0.4441, "mean_token_accuracy": 0.8548209071159363, "num_tokens": 77975087.0, "step": 2039 }, { "epoch": 0.25950896832464065, "ewc_loss": 0.013998000882565975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3998001122672576e-05, "grad_norm": 12.410650253295898, "learning_rate": 8.643493005510809e-07, "loss": 0.4515, "mean_token_accuracy": 0.854022741317749, "num_tokens": 78009526.0, "step": 2040 }, { "epoch": 0.2596361786032311, "ewc_loss": 0.013989460654556751, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3989460967422929e-05, "grad_norm": 12.336174011230469, "learning_rate": 8.647732089868588e-07, "loss": 0.5288, "mean_token_accuracy": 0.8314497470855713, "num_tokens": 78050918.0, "step": 2041 }, { "epoch": 0.25976338888182166, "ewc_loss": 0.013992153108119965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.3992153071740177e-05, "grad_norm": 12.31937026977539, "learning_rate": 8.651971174226366e-07, "loss": 0.4452, "mean_token_accuracy": 0.8570675849914551, "num_tokens": 78085547.0, "step": 2042 }, { "epoch": 0.2598905991604122, "ewc_loss": 0.014048008248209953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4048007869860157e-05, "grad_norm": 12.468954086303711, "learning_rate": 8.656210258584146e-07, "loss": 0.5147, "mean_token_accuracy": 0.834693193435669, "num_tokens": 78122916.0, "step": 2043 }, { "epoch": 0.26001780943900266, "ewc_loss": 0.01406425703316927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4064256902202033e-05, "grad_norm": 12.357136726379395, "learning_rate": 8.660449342941924e-07, "loss": 0.5077, "mean_token_accuracy": 0.8349677920341492, "num_tokens": 78161800.0, "step": 2044 }, { "epoch": 0.2601450197175932, "ewc_loss": 0.014009389095008373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4009388905833475e-05, "grad_norm": 12.382204055786133, "learning_rate": 8.664688427299704e-07, "loss": 0.4858, "mean_token_accuracy": 0.843926191329956, "num_tokens": 78199961.0, "step": 2045 }, { "epoch": 0.2602722299961837, "ewc_loss": 0.014059213921427727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4059213754080702e-05, "grad_norm": 12.361077308654785, "learning_rate": 8.668927511657481e-07, "loss": 0.4436, "mean_token_accuracy": 0.8556631803512573, "num_tokens": 78231332.0, "step": 2046 }, { "epoch": 0.2603994402747742, "ewc_loss": 0.014058833010494709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4058832675800659e-05, "grad_norm": 12.434534072875977, "learning_rate": 8.67316659601526e-07, "loss": 0.4996, "mean_token_accuracy": 0.8399518728256226, "num_tokens": 78265054.0, "step": 2047 }, { "epoch": 0.2605266505533647, "ewc_loss": 0.014093409292399883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.409340893587796e-05, "grad_norm": 12.355259895324707, "learning_rate": 8.677405680373039e-07, "loss": 0.4566, "mean_token_accuracy": 0.8550381660461426, "num_tokens": 78305445.0, "step": 2048 }, { "epoch": 0.26065386083195524, "ewc_loss": 0.014046718366444111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4046718206373043e-05, "grad_norm": 12.388561248779297, "learning_rate": 8.681644764730818e-07, "loss": 0.4861, "mean_token_accuracy": 0.8437750339508057, "num_tokens": 78339174.0, "step": 2049 }, { "epoch": 0.2607810711105457, "ewc_loss": 0.014137150719761848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.413715108355973e-05, "grad_norm": 12.423752784729004, "learning_rate": 8.685883849088596e-07, "loss": 0.4282, "mean_token_accuracy": 0.8617712259292603, "num_tokens": 78377630.0, "step": 2050 }, { "epoch": 0.26090828138913624, "ewc_loss": 0.014116568490862846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4116568308963906e-05, "grad_norm": 12.374224662780762, "learning_rate": 8.690122933446376e-07, "loss": 0.4848, "mean_token_accuracy": 0.8480895757675171, "num_tokens": 78419908.0, "step": 2051 }, { "epoch": 0.26103549166772677, "ewc_loss": 0.0141239482909441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4123947948974092e-05, "grad_norm": 12.377008438110352, "learning_rate": 8.694362017804154e-07, "loss": 0.4511, "mean_token_accuracy": 0.856820285320282, "num_tokens": 78459561.0, "step": 2052 }, { "epoch": 0.26116270194631724, "ewc_loss": 0.01415390893816948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4153908523439895e-05, "grad_norm": 12.406469345092773, "learning_rate": 8.698601102161933e-07, "loss": 0.4829, "mean_token_accuracy": 0.8446494340896606, "num_tokens": 78502500.0, "step": 2053 }, { "epoch": 0.26128991222490777, "ewc_loss": 0.014140007086098194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4140006896923296e-05, "grad_norm": 12.379804611206055, "learning_rate": 8.702840186519711e-07, "loss": 0.4402, "mean_token_accuracy": 0.8590071797370911, "num_tokens": 78545411.0, "step": 2054 }, { "epoch": 0.2614171225034983, "ewc_loss": 0.014163519255816936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.416351915395353e-05, "grad_norm": 12.404970169067383, "learning_rate": 8.70707927087749e-07, "loss": 0.4396, "mean_token_accuracy": 0.8594251871109009, "num_tokens": 78585878.0, "step": 2055 }, { "epoch": 0.2615443327820888, "ewc_loss": 0.014171959832310677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4171960174280684e-05, "grad_norm": 12.45464038848877, "learning_rate": 8.711318355235269e-07, "loss": 0.5286, "mean_token_accuracy": 0.8293654322624207, "num_tokens": 78621233.0, "step": 2056 }, { "epoch": 0.2616715430606793, "ewc_loss": 0.014199844561517239, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.419984437234234e-05, "grad_norm": 12.467642784118652, "learning_rate": 8.715557439593047e-07, "loss": 0.4489, "mean_token_accuracy": 0.8532188534736633, "num_tokens": 78654644.0, "step": 2057 }, { "epoch": 0.26179875333926983, "ewc_loss": 0.014173062518239021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4173062481859233e-05, "grad_norm": 12.441023826599121, "learning_rate": 8.719796523950826e-07, "loss": 0.526, "mean_token_accuracy": 0.8309076428413391, "num_tokens": 78692851.0, "step": 2058 }, { "epoch": 0.2619259636178603, "ewc_loss": 0.01418372429907322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4183724488248117e-05, "grad_norm": 12.392349243164062, "learning_rate": 8.724035608308605e-07, "loss": 0.4198, "mean_token_accuracy": 0.8648945093154907, "num_tokens": 78725329.0, "step": 2059 }, { "epoch": 0.26205317389645083, "ewc_loss": 0.014186845161020756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.41868449645699e-05, "grad_norm": 12.457953453063965, "learning_rate": 8.728274692666384e-07, "loss": 0.4767, "mean_token_accuracy": 0.8481172919273376, "num_tokens": 78766645.0, "step": 2060 }, { "epoch": 0.26218038417504136, "ewc_loss": 0.0142102912068367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4210290828486905e-05, "grad_norm": 12.412444114685059, "learning_rate": 8.732513777024162e-07, "loss": 0.4703, "mean_token_accuracy": 0.8505579829216003, "num_tokens": 78801576.0, "step": 2061 }, { "epoch": 0.26230759445363183, "ewc_loss": 0.014160825870931149, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.416082614014158e-05, "grad_norm": 12.42557144165039, "learning_rate": 8.736752861381941e-07, "loss": 0.5142, "mean_token_accuracy": 0.8351682424545288, "num_tokens": 78845577.0, "step": 2062 }, { "epoch": 0.26243480473222236, "ewc_loss": 0.014216090552508831, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.421609067620011e-05, "grad_norm": 12.434623718261719, "learning_rate": 8.740991945739719e-07, "loss": 0.4797, "mean_token_accuracy": 0.8467831611633301, "num_tokens": 78884229.0, "step": 2063 }, { "epoch": 0.2625620150108129, "ewc_loss": 0.014168967492878437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4168967936711852e-05, "grad_norm": 12.366791725158691, "learning_rate": 8.745231030097499e-07, "loss": 0.4661, "mean_token_accuracy": 0.8506754636764526, "num_tokens": 78926948.0, "step": 2064 }, { "epoch": 0.26268922528940336, "ewc_loss": 0.0142283346503973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4228334293875378e-05, "grad_norm": 12.503667831420898, "learning_rate": 8.749470114455277e-07, "loss": 0.4424, "mean_token_accuracy": 0.8573787212371826, "num_tokens": 78963582.0, "step": 2065 }, { "epoch": 0.2628164355679939, "ewc_loss": 0.014233066700398922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4233066394808702e-05, "grad_norm": 12.482117652893066, "learning_rate": 8.753709198813056e-07, "loss": 0.55, "mean_token_accuracy": 0.8248345851898193, "num_tokens": 79007345.0, "step": 2066 }, { "epoch": 0.2629436458465844, "ewc_loss": 0.014193348586559296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4193348761182278e-05, "grad_norm": 12.42556095123291, "learning_rate": 8.757948283170835e-07, "loss": 0.4475, "mean_token_accuracy": 0.8588581085205078, "num_tokens": 79049505.0, "step": 2067 }, { "epoch": 0.2630708561251749, "ewc_loss": 0.014237688854336739, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4237688446883112e-05, "grad_norm": 12.5933198928833, "learning_rate": 8.762187367528613e-07, "loss": 0.442, "mean_token_accuracy": 0.8550006151199341, "num_tokens": 79087419.0, "step": 2068 }, { "epoch": 0.2631980664037654, "ewc_loss": 0.014255544170737267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.425554455636302e-05, "grad_norm": 12.43038272857666, "learning_rate": 8.766426451886392e-07, "loss": 0.4901, "mean_token_accuracy": 0.8408753871917725, "num_tokens": 79131444.0, "step": 2069 }, { "epoch": 0.26332527668235595, "ewc_loss": 0.0141910957172513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4191095942805987e-05, "grad_norm": 12.531970024108887, "learning_rate": 8.770665536244171e-07, "loss": 0.5168, "mean_token_accuracy": 0.837668776512146, "num_tokens": 79171976.0, "step": 2070 }, { "epoch": 0.2634524869609464, "ewc_loss": 0.014270932413637638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4270932297222316e-05, "grad_norm": 12.540428161621094, "learning_rate": 8.774904620601949e-07, "loss": 0.4448, "mean_token_accuracy": 0.8558573126792908, "num_tokens": 79206273.0, "step": 2071 }, { "epoch": 0.26357969723953695, "ewc_loss": 0.01419002190232277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4190021829563193e-05, "grad_norm": 12.489262580871582, "learning_rate": 8.779143704959729e-07, "loss": 0.4469, "mean_token_accuracy": 0.8553977012634277, "num_tokens": 79245564.0, "step": 2072 }, { "epoch": 0.2637069075181275, "ewc_loss": 0.014229181222617626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4229181033442728e-05, "grad_norm": 12.517560958862305, "learning_rate": 8.783382789317507e-07, "loss": 0.4758, "mean_token_accuracy": 0.8478782176971436, "num_tokens": 79281857.0, "step": 2073 }, { "epoch": 0.26383411779671795, "ewc_loss": 0.014255840331315994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4255840142141096e-05, "grad_norm": 12.650904655456543, "learning_rate": 8.787621873675286e-07, "loss": 0.511, "mean_token_accuracy": 0.8325632810592651, "num_tokens": 79312482.0, "step": 2074 }, { "epoch": 0.2639613280753085, "ewc_loss": 0.014257315546274185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4257315342547372e-05, "grad_norm": 12.475111961364746, "learning_rate": 8.791860958033065e-07, "loss": 0.45, "mean_token_accuracy": 0.8555936217308044, "num_tokens": 79348157.0, "step": 2075 }, { "epoch": 0.264088538353899, "ewc_loss": 0.014217591844499111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4217592251952738e-05, "grad_norm": 12.53930950164795, "learning_rate": 8.796100042390842e-07, "loss": 0.436, "mean_token_accuracy": 0.8602615594863892, "num_tokens": 79387540.0, "step": 2076 }, { "epoch": 0.2642157486324895, "ewc_loss": 0.014266754500567913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4266754078562371e-05, "grad_norm": 12.553947448730469, "learning_rate": 8.800339126748622e-07, "loss": 0.454, "mean_token_accuracy": 0.8522303104400635, "num_tokens": 79421424.0, "step": 2077 }, { "epoch": 0.26434295891108, "ewc_loss": 0.014262023381888866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.426202379661845e-05, "grad_norm": 12.563748359680176, "learning_rate": 8.8045782111064e-07, "loss": 0.5245, "mean_token_accuracy": 0.8325083255767822, "num_tokens": 79458733.0, "step": 2078 }, { "epoch": 0.26447016918967053, "ewc_loss": 0.014270037412643433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4270037354435772e-05, "grad_norm": 12.496116638183594, "learning_rate": 8.808817295464179e-07, "loss": 0.4767, "mean_token_accuracy": 0.8481016755104065, "num_tokens": 79495394.0, "step": 2079 }, { "epoch": 0.26459737946826106, "ewc_loss": 0.014270973391830921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4270973224483896e-05, "grad_norm": 12.569413185119629, "learning_rate": 8.813056379821958e-07, "loss": 0.4472, "mean_token_accuracy": 0.8575844764709473, "num_tokens": 79529799.0, "step": 2080 }, { "epoch": 0.26472458974685154, "ewc_loss": 0.014307010918855667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4307011042546947e-05, "grad_norm": 12.510077476501465, "learning_rate": 8.817295464179737e-07, "loss": 0.4729, "mean_token_accuracy": 0.8477810621261597, "num_tokens": 79564664.0, "step": 2081 }, { "epoch": 0.26485180002544206, "ewc_loss": 0.01430362369865179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4303624084277544e-05, "grad_norm": 12.473958969116211, "learning_rate": 8.821534548537515e-07, "loss": 0.5511, "mean_token_accuracy": 0.8229351043701172, "num_tokens": 79606104.0, "step": 2082 }, { "epoch": 0.2649790103040326, "ewc_loss": 0.014366892166435719, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4366892173711676e-05, "grad_norm": 12.572588920593262, "learning_rate": 8.825773632895295e-07, "loss": 0.4466, "mean_token_accuracy": 0.857699453830719, "num_tokens": 79647688.0, "step": 2083 }, { "epoch": 0.26510622058262306, "ewc_loss": 0.01435501966625452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4355019629874732e-05, "grad_norm": 12.540081977844238, "learning_rate": 8.830012717253072e-07, "loss": 0.4636, "mean_token_accuracy": 0.8536120057106018, "num_tokens": 79685500.0, "step": 2084 }, { "epoch": 0.2652334308612136, "ewc_loss": 0.014325493946671486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4325493793876376e-05, "grad_norm": 12.474178314208984, "learning_rate": 8.834251801610852e-07, "loss": 0.4925, "mean_token_accuracy": 0.8417856693267822, "num_tokens": 79729912.0, "step": 2085 }, { "epoch": 0.2653606411398041, "ewc_loss": 0.014335552230477333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4335551895783283e-05, "grad_norm": 12.557950019836426, "learning_rate": 8.83849088596863e-07, "loss": 0.4492, "mean_token_accuracy": 0.8560048341751099, "num_tokens": 79767487.0, "step": 2086 }, { "epoch": 0.2654878514183946, "ewc_loss": 0.01437645684927702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.437645641999552e-05, "grad_norm": 12.542616844177246, "learning_rate": 8.842729970326409e-07, "loss": 0.4283, "mean_token_accuracy": 0.8642721176147461, "num_tokens": 79804802.0, "step": 2087 }, { "epoch": 0.2656150616969851, "ewc_loss": 0.014371751807630062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4371751603903249e-05, "grad_norm": 12.54255199432373, "learning_rate": 8.846969054684188e-07, "loss": 0.5463, "mean_token_accuracy": 0.8280608654022217, "num_tokens": 79845685.0, "step": 2088 }, { "epoch": 0.26574227197557565, "ewc_loss": 0.014377656392753124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4377656043507159e-05, "grad_norm": 12.556649208068848, "learning_rate": 8.851208139041967e-07, "loss": 0.4359, "mean_token_accuracy": 0.8592821359634399, "num_tokens": 79884692.0, "step": 2089 }, { "epoch": 0.2658694822541661, "ewc_loss": 0.014392106793820858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4392107004823629e-05, "grad_norm": 12.576570510864258, "learning_rate": 8.855447223399745e-07, "loss": 0.4951, "mean_token_accuracy": 0.8396041393280029, "num_tokens": 79919780.0, "step": 2090 }, { "epoch": 0.26599669253275665, "ewc_loss": 0.014401379972696304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4401380212802906e-05, "grad_norm": 12.539743423461914, "learning_rate": 8.859686307757524e-07, "loss": 0.5211, "mean_token_accuracy": 0.8334133625030518, "num_tokens": 79958317.0, "step": 2091 }, { "epoch": 0.2661239028113472, "ewc_loss": 0.014385282061994076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4385282156581525e-05, "grad_norm": 12.52277660369873, "learning_rate": 8.863925392115302e-07, "loss": 0.4531, "mean_token_accuracy": 0.8533399105072021, "num_tokens": 79994126.0, "step": 2092 }, { "epoch": 0.26625111308993765, "ewc_loss": 0.014445322565734386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4445322449319065e-05, "grad_norm": 12.551522254943848, "learning_rate": 8.868164476473082e-07, "loss": 0.4242, "mean_token_accuracy": 0.8646503686904907, "num_tokens": 80033157.0, "step": 2093 }, { "epoch": 0.2663783233685282, "ewc_loss": 0.014396623708307743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4396623555512633e-05, "grad_norm": 12.506830215454102, "learning_rate": 8.87240356083086e-07, "loss": 0.4523, "mean_token_accuracy": 0.8561995029449463, "num_tokens": 80073354.0, "step": 2094 }, { "epoch": 0.2665055336471187, "ewc_loss": 0.01444710697978735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4447106877923943e-05, "grad_norm": 12.613503456115723, "learning_rate": 8.876642645188639e-07, "loss": 0.5135, "mean_token_accuracy": 0.8357128500938416, "num_tokens": 80111819.0, "step": 2095 }, { "epoch": 0.2666327439257092, "ewc_loss": 0.01442989893257618, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.44298992381664e-05, "grad_norm": 12.512619018554688, "learning_rate": 8.880881729546418e-07, "loss": 0.4283, "mean_token_accuracy": 0.8614404201507568, "num_tokens": 80147951.0, "step": 2096 }, { "epoch": 0.2667599542042997, "ewc_loss": 0.014415946789085865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.44159466799465e-05, "grad_norm": 12.523294448852539, "learning_rate": 8.885120813904197e-07, "loss": 0.442, "mean_token_accuracy": 0.8564013242721558, "num_tokens": 80191319.0, "step": 2097 }, { "epoch": 0.26688716448289024, "ewc_loss": 0.014469425193965435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4469424968410749e-05, "grad_norm": 12.585094451904297, "learning_rate": 8.889359898261976e-07, "loss": 0.529, "mean_token_accuracy": 0.8346270322799683, "num_tokens": 80226300.0, "step": 2098 }, { "epoch": 0.2670143747614807, "ewc_loss": 0.014503599144518375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4503599231829867e-05, "grad_norm": 12.599091529846191, "learning_rate": 8.893598982619753e-07, "loss": 0.4542, "mean_token_accuracy": 0.8539203405380249, "num_tokens": 80265465.0, "step": 2099 }, { "epoch": 0.26714158504007124, "ewc_loss": 0.014479400590062141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4479400306299794e-05, "grad_norm": 12.571945190429688, "learning_rate": 8.897838066977532e-07, "loss": 0.4562, "mean_token_accuracy": 0.8532869815826416, "num_tokens": 80312696.0, "step": 2100 }, { "epoch": 0.26726879531866177, "ewc_loss": 0.014496969059109688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4496969015453942e-05, "grad_norm": 12.598705291748047, "learning_rate": 8.902077151335311e-07, "loss": 0.4774, "mean_token_accuracy": 0.8458227515220642, "num_tokens": 80353507.0, "step": 2101 }, { "epoch": 0.26739600559725224, "ewc_loss": 0.014503403566777706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4503403690468986e-05, "grad_norm": 12.64376163482666, "learning_rate": 8.90631623569309e-07, "loss": 0.5601, "mean_token_accuracy": 0.8190615177154541, "num_tokens": 80390795.0, "step": 2102 }, { "epoch": 0.26752321587584277, "ewc_loss": 0.014483271166682243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.448327111575054e-05, "grad_norm": 12.573884010314941, "learning_rate": 8.910555320050868e-07, "loss": 0.4692, "mean_token_accuracy": 0.8496943712234497, "num_tokens": 80427599.0, "step": 2103 }, { "epoch": 0.2676504261544333, "ewc_loss": 0.014494192786514759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4494193237624131e-05, "grad_norm": 12.593392372131348, "learning_rate": 8.914794404408648e-07, "loss": 0.4723, "mean_token_accuracy": 0.8494606018066406, "num_tokens": 80468084.0, "step": 2104 }, { "epoch": 0.26777763643302377, "ewc_loss": 0.014478865079581738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4478865523415152e-05, "grad_norm": 12.555636405944824, "learning_rate": 8.919033488766426e-07, "loss": 0.4163, "mean_token_accuracy": 0.8658795952796936, "num_tokens": 80507218.0, "step": 2105 }, { "epoch": 0.2679048467116143, "ewc_loss": 0.014490004628896713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4490005014522467e-05, "grad_norm": 12.630045890808105, "learning_rate": 8.923272573124204e-07, "loss": 0.5, "mean_token_accuracy": 0.8414599895477295, "num_tokens": 80541730.0, "step": 2106 }, { "epoch": 0.2680320569902048, "ewc_loss": 0.014505112543702126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4505112631013617e-05, "grad_norm": 12.610334396362305, "learning_rate": 8.927511657481983e-07, "loss": 0.5113, "mean_token_accuracy": 0.8366543054580688, "num_tokens": 80584677.0, "step": 2107 }, { "epoch": 0.2681592672687953, "ewc_loss": 0.014494115486741066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4494115021079779e-05, "grad_norm": 12.63097858428955, "learning_rate": 8.931750741839762e-07, "loss": 0.4455, "mean_token_accuracy": 0.8589719533920288, "num_tokens": 80616332.0, "step": 2108 }, { "epoch": 0.2682864775473858, "ewc_loss": 0.014527072198688984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4527072380587924e-05, "grad_norm": 12.700511932373047, "learning_rate": 8.935989826197541e-07, "loss": 0.4542, "mean_token_accuracy": 0.8564516305923462, "num_tokens": 80654435.0, "step": 2109 }, { "epoch": 0.26841368782597635, "ewc_loss": 0.014515153132379055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.451515345252119e-05, "grad_norm": 12.57844066619873, "learning_rate": 8.94022891055532e-07, "loss": 0.4309, "mean_token_accuracy": 0.8652622699737549, "num_tokens": 80691246.0, "step": 2110 }, { "epoch": 0.2685408981045668, "ewc_loss": 0.014509644359350204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4509644643112551e-05, "grad_norm": 12.627920150756836, "learning_rate": 8.944467994913098e-07, "loss": 0.4821, "mean_token_accuracy": 0.8415374159812927, "num_tokens": 80730510.0, "step": 2111 }, { "epoch": 0.26866810838315736, "ewc_loss": 0.014543509110808372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4543508768838365e-05, "grad_norm": 12.636617660522461, "learning_rate": 8.948707079270878e-07, "loss": 0.4006, "mean_token_accuracy": 0.8699673414230347, "num_tokens": 80769369.0, "step": 2112 }, { "epoch": 0.2687953186617479, "ewc_loss": 0.014524122700095177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4524122889270075e-05, "grad_norm": 12.596875190734863, "learning_rate": 8.952946163628656e-07, "loss": 0.4555, "mean_token_accuracy": 0.8524158000946045, "num_tokens": 80802168.0, "step": 2113 }, { "epoch": 0.26892252894033836, "ewc_loss": 0.014535252004861832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.453525237593567e-05, "grad_norm": 12.58098030090332, "learning_rate": 8.957185247986434e-07, "loss": 0.4054, "mean_token_accuracy": 0.8690012693405151, "num_tokens": 80839389.0, "step": 2114 }, { "epoch": 0.2690497392189289, "ewc_loss": 0.014547084458172321, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4547084902005736e-05, "grad_norm": 12.509760856628418, "learning_rate": 8.961424332344213e-07, "loss": 0.4603, "mean_token_accuracy": 0.854017972946167, "num_tokens": 80881139.0, "step": 2115 }, { "epoch": 0.2691769494975194, "ewc_loss": 0.01458396203815937, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4583962183678523e-05, "grad_norm": 12.675947189331055, "learning_rate": 8.965663416701992e-07, "loss": 0.4632, "mean_token_accuracy": 0.8500317931175232, "num_tokens": 80919809.0, "step": 2116 }, { "epoch": 0.2693041597761099, "ewc_loss": 0.014618358574807644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4618358363804873e-05, "grad_norm": 12.595836639404297, "learning_rate": 8.969902501059771e-07, "loss": 0.439, "mean_token_accuracy": 0.8612732887268066, "num_tokens": 80956860.0, "step": 2117 }, { "epoch": 0.2694313700547004, "ewc_loss": 0.014569582417607307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4569582162948791e-05, "grad_norm": 12.622783660888672, "learning_rate": 8.97414158541755e-07, "loss": 0.406, "mean_token_accuracy": 0.868331253528595, "num_tokens": 80994322.0, "step": 2118 }, { "epoch": 0.26955858033329094, "ewc_loss": 0.014639447443187237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4639447726949584e-05, "grad_norm": 12.649641990661621, "learning_rate": 8.978380669775328e-07, "loss": 0.4888, "mean_token_accuracy": 0.8430496454238892, "num_tokens": 81032792.0, "step": 2119 }, { "epoch": 0.2696857906118814, "ewc_loss": 0.01460608746856451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4606087461288553e-05, "grad_norm": 12.626220703125, "learning_rate": 8.982619754133107e-07, "loss": 0.4896, "mean_token_accuracy": 0.8441433906555176, "num_tokens": 81070011.0, "step": 2120 }, { "epoch": 0.26981300089047194, "ewc_loss": 0.01460281852632761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.460281873733038e-05, "grad_norm": 12.539050102233887, "learning_rate": 8.986858838490886e-07, "loss": 0.4919, "mean_token_accuracy": 0.8430880308151245, "num_tokens": 81113145.0, "step": 2121 }, { "epoch": 0.26994021116906247, "ewc_loss": 0.014621017500758171, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4621017726312857e-05, "grad_norm": 12.671870231628418, "learning_rate": 8.991097922848663e-07, "loss": 0.4604, "mean_token_accuracy": 0.8536757230758667, "num_tokens": 81154175.0, "step": 2122 }, { "epoch": 0.27006742144765294, "ewc_loss": 0.014665289781987667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.466529010940576e-05, "grad_norm": 12.630247116088867, "learning_rate": 8.995337007206443e-07, "loss": 0.484, "mean_token_accuracy": 0.8467215895652771, "num_tokens": 81193319.0, "step": 2123 }, { "epoch": 0.2701946317262435, "ewc_loss": 0.01461971178650856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4619711691921111e-05, "grad_norm": 12.632678031921387, "learning_rate": 8.999576091564221e-07, "loss": 0.4515, "mean_token_accuracy": 0.855525016784668, "num_tokens": 81229057.0, "step": 2124 }, { "epoch": 0.270321842004834, "ewc_loss": 0.014669115655124187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4669115444121417e-05, "grad_norm": 12.653864860534668, "learning_rate": 9.003815175922001e-07, "loss": 0.4593, "mean_token_accuracy": 0.8519155383110046, "num_tokens": 81262380.0, "step": 2125 }, { "epoch": 0.2704490522834245, "ewc_loss": 0.014674695208668709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4674695194116794e-05, "grad_norm": 12.652957916259766, "learning_rate": 9.008054260279779e-07, "loss": 0.4642, "mean_token_accuracy": 0.8521689176559448, "num_tokens": 81304080.0, "step": 2126 }, { "epoch": 0.270576262562015, "ewc_loss": 0.01470243837684393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4702438420499675e-05, "grad_norm": 12.687347412109375, "learning_rate": 9.012293344637558e-07, "loss": 0.4944, "mean_token_accuracy": 0.844611644744873, "num_tokens": 81340226.0, "step": 2127 }, { "epoch": 0.27070347284060553, "ewc_loss": 0.014678467065095901, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4678466868645046e-05, "grad_norm": 12.647103309631348, "learning_rate": 9.016532428995337e-07, "loss": 0.4406, "mean_token_accuracy": 0.8576743006706238, "num_tokens": 81378046.0, "step": 2128 }, { "epoch": 0.27083068311919606, "ewc_loss": 0.014685733243823051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.468573282181751e-05, "grad_norm": 12.641593933105469, "learning_rate": 9.020771513353115e-07, "loss": 0.4849, "mean_token_accuracy": 0.8450803756713867, "num_tokens": 81419486.0, "step": 2129 }, { "epoch": 0.27095789339778653, "ewc_loss": 0.014721800573170185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4721800653205719e-05, "grad_norm": 12.662357330322266, "learning_rate": 9.025010597710894e-07, "loss": 0.4843, "mean_token_accuracy": 0.8448084592819214, "num_tokens": 81457488.0, "step": 2130 }, { "epoch": 0.27108510367637706, "ewc_loss": 0.014729980379343033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4729980648553465e-05, "grad_norm": 12.671982765197754, "learning_rate": 9.029249682068673e-07, "loss": 0.4532, "mean_token_accuracy": 0.8550363779067993, "num_tokens": 81494812.0, "step": 2131 }, { "epoch": 0.2712123139549676, "ewc_loss": 0.01473470963537693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4734710021002684e-05, "grad_norm": 12.647457122802734, "learning_rate": 9.033488766426451e-07, "loss": 0.4702, "mean_token_accuracy": 0.8482628464698792, "num_tokens": 81531421.0, "step": 2132 }, { "epoch": 0.27133952423355806, "ewc_loss": 0.014747658744454384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4747658497071825e-05, "grad_norm": 12.64688777923584, "learning_rate": 9.037727850784231e-07, "loss": 0.4355, "mean_token_accuracy": 0.857460618019104, "num_tokens": 81567221.0, "step": 2133 }, { "epoch": 0.2714667345121486, "ewc_loss": 0.014745427295565605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4745427506568376e-05, "grad_norm": 12.694779396057129, "learning_rate": 9.041966935142009e-07, "loss": 0.4697, "mean_token_accuracy": 0.8502384424209595, "num_tokens": 81605832.0, "step": 2134 }, { "epoch": 0.2715939447907391, "ewc_loss": 0.014767736196517944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4767736502108164e-05, "grad_norm": 12.689303398132324, "learning_rate": 9.046206019499788e-07, "loss": 0.4245, "mean_token_accuracy": 0.8653514981269836, "num_tokens": 81646919.0, "step": 2135 }, { "epoch": 0.2717211550693296, "ewc_loss": 0.014727652072906494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4727652342116926e-05, "grad_norm": 12.630210876464844, "learning_rate": 9.050445103857567e-07, "loss": 0.5033, "mean_token_accuracy": 0.8438559770584106, "num_tokens": 81687214.0, "step": 2136 }, { "epoch": 0.2718483653479201, "ewc_loss": 0.014771297574043274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4771297173865605e-05, "grad_norm": 12.714020729064941, "learning_rate": 9.054684188215344e-07, "loss": 0.4084, "mean_token_accuracy": 0.8691741824150085, "num_tokens": 81721437.0, "step": 2137 }, { "epoch": 0.27197557562651065, "ewc_loss": 0.014799267053604126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4799266864429228e-05, "grad_norm": 12.691913604736328, "learning_rate": 9.058923272573124e-07, "loss": 0.4818, "mean_token_accuracy": 0.8459348678588867, "num_tokens": 81756595.0, "step": 2138 }, { "epoch": 0.2721027859051011, "ewc_loss": 0.014787545427680016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4787545296712779e-05, "grad_norm": 12.653044700622559, "learning_rate": 9.063162356930902e-07, "loss": 0.4468, "mean_token_accuracy": 0.8579918742179871, "num_tokens": 81796198.0, "step": 2139 }, { "epoch": 0.27222999618369165, "ewc_loss": 0.014823933131992817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4823933270236012e-05, "grad_norm": 12.751115798950195, "learning_rate": 9.067401441288681e-07, "loss": 0.4777, "mean_token_accuracy": 0.8496411442756653, "num_tokens": 81833002.0, "step": 2140 }, { "epoch": 0.2723572064622822, "ewc_loss": 0.014834712259471416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4834712601441424e-05, "grad_norm": 12.637350082397461, "learning_rate": 9.07164052564646e-07, "loss": 0.4495, "mean_token_accuracy": 0.8513622283935547, "num_tokens": 81872089.0, "step": 2141 }, { "epoch": 0.27248441674087265, "ewc_loss": 0.014792256988584995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4792257388762664e-05, "grad_norm": 12.737350463867188, "learning_rate": 9.075879610004239e-07, "loss": 0.4847, "mean_token_accuracy": 0.8438743352890015, "num_tokens": 81908356.0, "step": 2142 }, { "epoch": 0.2726116270194632, "ewc_loss": 0.01487190555781126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4871905477775726e-05, "grad_norm": 12.717363357543945, "learning_rate": 9.080118694362017e-07, "loss": 0.4893, "mean_token_accuracy": 0.8412249684333801, "num_tokens": 81950251.0, "step": 2143 }, { "epoch": 0.2727388372980537, "ewc_loss": 0.014806322753429413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4806322724325582e-05, "grad_norm": 12.741666793823242, "learning_rate": 9.084357778719796e-07, "loss": 0.4824, "mean_token_accuracy": 0.8433279991149902, "num_tokens": 81987157.0, "step": 2144 }, { "epoch": 0.2728660475766442, "ewc_loss": 0.014865466393530369, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4865466255287174e-05, "grad_norm": 12.683852195739746, "learning_rate": 9.088596863077574e-07, "loss": 0.4663, "mean_token_accuracy": 0.8546035885810852, "num_tokens": 82023633.0, "step": 2145 }, { "epoch": 0.2729932578552347, "ewc_loss": 0.014826228842139244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4826228834863286e-05, "grad_norm": 12.690059661865234, "learning_rate": 9.092835947435354e-07, "loss": 0.4694, "mean_token_accuracy": 0.8464484214782715, "num_tokens": 82061311.0, "step": 2146 }, { "epoch": 0.27312046813382523, "ewc_loss": 0.014918088912963867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4918088709237054e-05, "grad_norm": 12.785216331481934, "learning_rate": 9.097075031793132e-07, "loss": 0.5038, "mean_token_accuracy": 0.8396371603012085, "num_tokens": 82098370.0, "step": 2147 }, { "epoch": 0.2732476784124157, "ewc_loss": 0.014874867163598537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4874866792524699e-05, "grad_norm": 12.705668449401855, "learning_rate": 9.101314116150911e-07, "loss": 0.431, "mean_token_accuracy": 0.8650564551353455, "num_tokens": 82134931.0, "step": 2148 }, { "epoch": 0.27337488869100623, "ewc_loss": 0.014891493134200573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.489149326516781e-05, "grad_norm": 12.766267776489258, "learning_rate": 9.10555320050869e-07, "loss": 0.4403, "mean_token_accuracy": 0.8592076301574707, "num_tokens": 82171391.0, "step": 2149 }, { "epoch": 0.27350209896959676, "ewc_loss": 0.014915277250111103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4915277461113874e-05, "grad_norm": 12.702559471130371, "learning_rate": 9.109792284866469e-07, "loss": 0.4593, "mean_token_accuracy": 0.8562018275260925, "num_tokens": 82214064.0, "step": 2150 }, { "epoch": 0.27362930924818724, "ewc_loss": 0.01489451713860035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4894517335051205e-05, "grad_norm": 12.718281745910645, "learning_rate": 9.114031369224247e-07, "loss": 0.4754, "mean_token_accuracy": 0.8476098775863647, "num_tokens": 82252774.0, "step": 2151 }, { "epoch": 0.27375651952677776, "ewc_loss": 0.014953775331377983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.495377546234522e-05, "grad_norm": 12.835103034973145, "learning_rate": 9.118270453582026e-07, "loss": 0.5303, "mean_token_accuracy": 0.8353033065795898, "num_tokens": 82291645.0, "step": 2152 }, { "epoch": 0.2738837298053683, "ewc_loss": 0.014951072633266449, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4951072444091551e-05, "grad_norm": 12.71554183959961, "learning_rate": 9.122509537939804e-07, "loss": 0.462, "mean_token_accuracy": 0.8466678857803345, "num_tokens": 82333643.0, "step": 2153 }, { "epoch": 0.27401094008395877, "ewc_loss": 0.014933041296899319, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4933041711628903e-05, "grad_norm": 12.795358657836914, "learning_rate": 9.126748622297584e-07, "loss": 0.5005, "mean_token_accuracy": 0.8366433382034302, "num_tokens": 82370792.0, "step": 2154 }, { "epoch": 0.2741381503625493, "ewc_loss": 0.014980319887399673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4980319974711165e-05, "grad_norm": 12.746315956115723, "learning_rate": 9.130987706655362e-07, "loss": 0.4906, "mean_token_accuracy": 0.8429681062698364, "num_tokens": 82408626.0, "step": 2155 }, { "epoch": 0.2742653606411398, "ewc_loss": 0.01493249274790287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4932492376829032e-05, "grad_norm": 12.756540298461914, "learning_rate": 9.135226791013141e-07, "loss": 0.5099, "mean_token_accuracy": 0.8433453440666199, "num_tokens": 82448401.0, "step": 2156 }, { "epoch": 0.2743925709197303, "ewc_loss": 0.014995096251368523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.499509653513087e-05, "grad_norm": 12.791008949279785, "learning_rate": 9.13946587537092e-07, "loss": 0.4668, "mean_token_accuracy": 0.8465355634689331, "num_tokens": 82476644.0, "step": 2157 }, { "epoch": 0.2745197811983208, "ewc_loss": 0.014989720657467842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.498972051194869e-05, "grad_norm": 12.777445793151855, "learning_rate": 9.143704959728699e-07, "loss": 0.4829, "mean_token_accuracy": 0.841818630695343, "num_tokens": 82512722.0, "step": 2158 }, { "epoch": 0.27464699147691135, "ewc_loss": 0.014999903738498688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.4999904124124441e-05, "grad_norm": 12.825346946716309, "learning_rate": 9.147944044086476e-07, "loss": 0.503, "mean_token_accuracy": 0.839947521686554, "num_tokens": 82545393.0, "step": 2159 }, { "epoch": 0.2747742017555018, "ewc_loss": 0.015039700083434582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5039699974295218e-05, "grad_norm": 12.77763557434082, "learning_rate": 9.152183128444255e-07, "loss": 0.5, "mean_token_accuracy": 0.845841646194458, "num_tokens": 82586329.0, "step": 2160 }, { "epoch": 0.27490141203409235, "ewc_loss": 0.014999683015048504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.499968311691191e-05, "grad_norm": 12.78937816619873, "learning_rate": 9.156422212802034e-07, "loss": 0.5067, "mean_token_accuracy": 0.8358232378959656, "num_tokens": 82625563.0, "step": 2161 }, { "epoch": 0.2750286223126829, "ewc_loss": 0.015034997835755348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5034997886687052e-05, "grad_norm": 12.767109870910645, "learning_rate": 9.160661297159813e-07, "loss": 0.4449, "mean_token_accuracy": 0.8567212820053101, "num_tokens": 82660030.0, "step": 2162 }, { "epoch": 0.27515583259127335, "ewc_loss": 0.015008910559117794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5008910850156099e-05, "grad_norm": 12.824947357177734, "learning_rate": 9.164900381517592e-07, "loss": 0.4675, "mean_token_accuracy": 0.8499528765678406, "num_tokens": 82698794.0, "step": 2163 }, { "epoch": 0.2752830428698639, "ewc_loss": 0.015042092651128769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5042092854855582e-05, "grad_norm": 12.728955268859863, "learning_rate": 9.16913946587537e-07, "loss": 0.4649, "mean_token_accuracy": 0.8490256071090698, "num_tokens": 82740015.0, "step": 2164 }, { "epoch": 0.2754102531484544, "ewc_loss": 0.015042493119835854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5042493032524362e-05, "grad_norm": 12.832844734191895, "learning_rate": 9.17337855023315e-07, "loss": 0.4785, "mean_token_accuracy": 0.8455030918121338, "num_tokens": 82775765.0, "step": 2165 }, { "epoch": 0.2755374634270449, "ewc_loss": 0.015070077031850815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5070077097334433e-05, "grad_norm": 12.786483764648438, "learning_rate": 9.177617634590928e-07, "loss": 0.429, "mean_token_accuracy": 0.8613017201423645, "num_tokens": 82814324.0, "step": 2166 }, { "epoch": 0.2756646737056354, "ewc_loss": 0.015090025030076504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5090025044628419e-05, "grad_norm": 12.833725929260254, "learning_rate": 9.181856718948706e-07, "loss": 0.4164, "mean_token_accuracy": 0.8642662763595581, "num_tokens": 82851007.0, "step": 2167 }, { "epoch": 0.27579188398422594, "ewc_loss": 0.015053529292345047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5053528841235675e-05, "grad_norm": 12.831707000732422, "learning_rate": 9.186095803306485e-07, "loss": 0.4602, "mean_token_accuracy": 0.8510432839393616, "num_tokens": 82886846.0, "step": 2168 }, { "epoch": 0.2759190942628164, "ewc_loss": 0.015073100104928017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5073100257723127e-05, "grad_norm": 12.855157852172852, "learning_rate": 9.190334887664264e-07, "loss": 0.4983, "mean_token_accuracy": 0.840952455997467, "num_tokens": 82929607.0, "step": 2169 }, { "epoch": 0.27604630454140694, "ewc_loss": 0.015050864778459072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5050864931254182e-05, "grad_norm": 12.917223930358887, "learning_rate": 9.194573972022043e-07, "loss": 0.5567, "mean_token_accuracy": 0.8296220302581787, "num_tokens": 82965089.0, "step": 2170 }, { "epoch": 0.27617351481999747, "ewc_loss": 0.015049062669277191, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5049062312755268e-05, "grad_norm": 12.833966255187988, "learning_rate": 9.198813056379822e-07, "loss": 0.5305, "mean_token_accuracy": 0.8296035528182983, "num_tokens": 83008345.0, "step": 2171 }, { "epoch": 0.27630072509858794, "ewc_loss": 0.015033205039799213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5033205272629857e-05, "grad_norm": 12.874180793762207, "learning_rate": 9.2030521407376e-07, "loss": 0.4841, "mean_token_accuracy": 0.8431103229522705, "num_tokens": 83043763.0, "step": 2172 }, { "epoch": 0.27642793537717847, "ewc_loss": 0.015060015954077244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5060016266943421e-05, "grad_norm": 12.829380989074707, "learning_rate": 9.20729122509538e-07, "loss": 0.4283, "mean_token_accuracy": 0.862281084060669, "num_tokens": 83081389.0, "step": 2173 }, { "epoch": 0.276555145655769, "ewc_loss": 0.015036729164421558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5036729564599227e-05, "grad_norm": 12.884476661682129, "learning_rate": 9.211530309453158e-07, "loss": 0.4671, "mean_token_accuracy": 0.8480230569839478, "num_tokens": 83120547.0, "step": 2174 }, { "epoch": 0.27668235593435947, "ewc_loss": 0.015078570693731308, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.507857086835429e-05, "grad_norm": 12.911306381225586, "learning_rate": 9.215769393810936e-07, "loss": 0.4734, "mean_token_accuracy": 0.8497522473335266, "num_tokens": 83157229.0, "step": 2175 }, { "epoch": 0.27680956621295, "ewc_loss": 0.015031483955681324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5031483599159401e-05, "grad_norm": 12.781240463256836, "learning_rate": 9.220008478168715e-07, "loss": 0.5532, "mean_token_accuracy": 0.8243883848190308, "num_tokens": 83198217.0, "step": 2176 }, { "epoch": 0.2769367764915405, "ewc_loss": 0.015074113383889198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5074113434820902e-05, "grad_norm": 12.859488487243652, "learning_rate": 9.224247562526494e-07, "loss": 0.4738, "mean_token_accuracy": 0.846394956111908, "num_tokens": 83238750.0, "step": 2177 }, { "epoch": 0.277063986770131, "ewc_loss": 0.015096156857907772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5096156857907772e-05, "grad_norm": 12.88062858581543, "learning_rate": 9.228486646884273e-07, "loss": 0.4614, "mean_token_accuracy": 0.8508873581886292, "num_tokens": 83280138.0, "step": 2178 }, { "epoch": 0.2771911970487215, "ewc_loss": 0.015107898972928524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5107899344002362e-05, "grad_norm": 12.859678268432617, "learning_rate": 9.232725731242052e-07, "loss": 0.5151, "mean_token_accuracy": 0.8347841501235962, "num_tokens": 83320633.0, "step": 2179 }, { "epoch": 0.27731840732731206, "ewc_loss": 0.015109866857528687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5109866581042297e-05, "grad_norm": 12.90982723236084, "learning_rate": 9.23696481559983e-07, "loss": 0.4978, "mean_token_accuracy": 0.838827908039093, "num_tokens": 83358787.0, "step": 2180 }, { "epoch": 0.2774456176059026, "ewc_loss": 0.015118729323148727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5118729606911074e-05, "grad_norm": 12.895682334899902, "learning_rate": 9.24120389995761e-07, "loss": 0.4406, "mean_token_accuracy": 0.8579394221305847, "num_tokens": 83400609.0, "step": 2181 }, { "epoch": 0.27757282788449306, "ewc_loss": 0.015100765973329544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5100766177056357e-05, "grad_norm": 12.887127876281738, "learning_rate": 9.245442984315387e-07, "loss": 0.4568, "mean_token_accuracy": 0.8521329760551453, "num_tokens": 83436058.0, "step": 2182 }, { "epoch": 0.2777000381630836, "ewc_loss": 0.01512238010764122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5122380318643991e-05, "grad_norm": 12.849930763244629, "learning_rate": 9.249682068673165e-07, "loss": 0.4913, "mean_token_accuracy": 0.8407802581787109, "num_tokens": 83474542.0, "step": 2183 }, { "epoch": 0.2778272484416741, "ewc_loss": 0.015134000219404697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5134000022953842e-05, "grad_norm": 12.984222412109375, "learning_rate": 9.253921153030945e-07, "loss": 0.5283, "mean_token_accuracy": 0.8320216536521912, "num_tokens": 83515551.0, "step": 2184 }, { "epoch": 0.2779544587202646, "ewc_loss": 0.015141700394451618, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5141700714593753e-05, "grad_norm": 12.825108528137207, "learning_rate": 9.258160237388723e-07, "loss": 0.4424, "mean_token_accuracy": 0.8592773079872131, "num_tokens": 83551826.0, "step": 2185 }, { "epoch": 0.2780816689988551, "ewc_loss": 0.015107006765902042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5107007129699923e-05, "grad_norm": 12.95866584777832, "learning_rate": 9.262399321746503e-07, "loss": 0.4604, "mean_token_accuracy": 0.8524503707885742, "num_tokens": 83586598.0, "step": 2186 }, { "epoch": 0.27820887927744564, "ewc_loss": 0.015158625319600105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5158625501499046e-05, "grad_norm": 12.930466651916504, "learning_rate": 9.266638406104281e-07, "loss": 0.5103, "mean_token_accuracy": 0.8362548351287842, "num_tokens": 83625996.0, "step": 2187 }, { "epoch": 0.2783360895560361, "ewc_loss": 0.015094385482370853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5094385162228718e-05, "grad_norm": 12.84423542022705, "learning_rate": 9.27087749046206e-07, "loss": 0.4794, "mean_token_accuracy": 0.8478508591651917, "num_tokens": 83657322.0, "step": 2188 }, { "epoch": 0.27846329983462664, "ewc_loss": 0.015133806504309177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5133806300582364e-05, "grad_norm": 12.926480293273926, "learning_rate": 9.275116574819839e-07, "loss": 0.4742, "mean_token_accuracy": 0.8476510047912598, "num_tokens": 83693578.0, "step": 2189 }, { "epoch": 0.27859051011321717, "ewc_loss": 0.015171968378126621, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5171968698268756e-05, "grad_norm": 12.896369934082031, "learning_rate": 9.279355659177617e-07, "loss": 0.479, "mean_token_accuracy": 0.8481056690216064, "num_tokens": 83727672.0, "step": 2190 }, { "epoch": 0.27871772039180764, "ewc_loss": 0.01517676655203104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5176766282820608e-05, "grad_norm": 12.932967185974121, "learning_rate": 9.283594743535395e-07, "loss": 0.523, "mean_token_accuracy": 0.835661768913269, "num_tokens": 83761405.0, "step": 2191 }, { "epoch": 0.2788449306703982, "ewc_loss": 0.015186152420938015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5186152268142905e-05, "grad_norm": 12.862316131591797, "learning_rate": 9.287833827893175e-07, "loss": 0.4621, "mean_token_accuracy": 0.8481671214103699, "num_tokens": 83801564.0, "step": 2192 }, { "epoch": 0.2789721409489887, "ewc_loss": 0.015194036066532135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5194035768217873e-05, "grad_norm": 12.939946174621582, "learning_rate": 9.292072912250953e-07, "loss": 0.4919, "mean_token_accuracy": 0.8415722250938416, "num_tokens": 83834260.0, "step": 2193 }, { "epoch": 0.2790993512275792, "ewc_loss": 0.0152659360319376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.526593587186653e-05, "grad_norm": 12.902490615844727, "learning_rate": 9.296311996608733e-07, "loss": 0.4195, "mean_token_accuracy": 0.8637423515319824, "num_tokens": 83873518.0, "step": 2194 }, { "epoch": 0.2792265615061697, "ewc_loss": 0.015223262831568718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5223263289954048e-05, "grad_norm": 12.86803150177002, "learning_rate": 9.300551080966511e-07, "loss": 0.534, "mean_token_accuracy": 0.8299422264099121, "num_tokens": 83913902.0, "step": 2195 }, { "epoch": 0.27935377178476023, "ewc_loss": 0.015245798043906689, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5245797840179875e-05, "grad_norm": 12.910931587219238, "learning_rate": 9.30479016532429e-07, "loss": 0.4601, "mean_token_accuracy": 0.847554087638855, "num_tokens": 83947952.0, "step": 2196 }, { "epoch": 0.2794809820633507, "ewc_loss": 0.01528554130345583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.528554093965795e-05, "grad_norm": 12.959392547607422, "learning_rate": 9.309029249682068e-07, "loss": 0.4692, "mean_token_accuracy": 0.8505388498306274, "num_tokens": 83986057.0, "step": 2197 }, { "epoch": 0.27960819234194123, "ewc_loss": 0.015286299400031567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5286299458239228e-05, "grad_norm": 12.903026580810547, "learning_rate": 9.313268334039847e-07, "loss": 0.4796, "mean_token_accuracy": 0.8457092642784119, "num_tokens": 84023881.0, "step": 2198 }, { "epoch": 0.27973540262053176, "ewc_loss": 0.015282404609024525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5282405001926236e-05, "grad_norm": 12.96944522857666, "learning_rate": 9.317507418397625e-07, "loss": 0.4129, "mean_token_accuracy": 0.8680404424667358, "num_tokens": 84061480.0, "step": 2199 }, { "epoch": 0.27986261289912223, "ewc_loss": 0.01529227290302515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5292273019440472e-05, "grad_norm": 12.954870223999023, "learning_rate": 9.321746502755404e-07, "loss": 0.5155, "mean_token_accuracy": 0.8373785614967346, "num_tokens": 84101782.0, "step": 2200 }, { "epoch": 0.27998982317771276, "ewc_loss": 0.015288744121789932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5288744179997593e-05, "grad_norm": 12.951828002929688, "learning_rate": 9.325985587113183e-07, "loss": 0.4743, "mean_token_accuracy": 0.8472238183021545, "num_tokens": 84136125.0, "step": 2201 }, { "epoch": 0.2801170334563033, "ewc_loss": 0.015299477614462376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5299478036467917e-05, "grad_norm": 12.99428653717041, "learning_rate": 9.330224671470962e-07, "loss": 0.4874, "mean_token_accuracy": 0.8474857807159424, "num_tokens": 84169748.0, "step": 2202 }, { "epoch": 0.28024424373489376, "ewc_loss": 0.015335055999457836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.533505565021187e-05, "grad_norm": 12.991003036499023, "learning_rate": 9.334463755828741e-07, "loss": 0.5023, "mean_token_accuracy": 0.8354731202125549, "num_tokens": 84201292.0, "step": 2203 }, { "epoch": 0.2803714540134843, "ewc_loss": 0.015309760347008705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5309760783566162e-05, "grad_norm": 12.93194580078125, "learning_rate": 9.338702840186519e-07, "loss": 0.5023, "mean_token_accuracy": 0.8369333744049072, "num_tokens": 84240652.0, "step": 2204 }, { "epoch": 0.2804986642920748, "ewc_loss": 0.015355692245066166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.53556920849951e-05, "grad_norm": 12.91310977935791, "learning_rate": 9.342941924544298e-07, "loss": 0.4935, "mean_token_accuracy": 0.8414711952209473, "num_tokens": 84280252.0, "step": 2205 }, { "epoch": 0.2806258745706653, "ewc_loss": 0.015382951125502586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5382951460196637e-05, "grad_norm": 13.006098747253418, "learning_rate": 9.347181008902076e-07, "loss": 0.4312, "mean_token_accuracy": 0.8641813397407532, "num_tokens": 84316505.0, "step": 2206 }, { "epoch": 0.2807530848492558, "ewc_loss": 0.015384742990136147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.538474316475913e-05, "grad_norm": 12.98299789428711, "learning_rate": 9.351420093259855e-07, "loss": 0.435, "mean_token_accuracy": 0.8606253862380981, "num_tokens": 84349990.0, "step": 2207 }, { "epoch": 0.28088029512784635, "ewc_loss": 0.015360872261226177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5360872566816397e-05, "grad_norm": 12.91600227355957, "learning_rate": 9.355659177617634e-07, "loss": 0.4245, "mean_token_accuracy": 0.8611031770706177, "num_tokens": 84381066.0, "step": 2208 }, { "epoch": 0.2810075054064368, "ewc_loss": 0.015390561893582344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5390562111861072e-05, "grad_norm": 12.979333877563477, "learning_rate": 9.359898261975413e-07, "loss": 0.4161, "mean_token_accuracy": 0.8647922277450562, "num_tokens": 84422540.0, "step": 2209 }, { "epoch": 0.28113471568502735, "ewc_loss": 0.01542904507368803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.542904465168249e-05, "grad_norm": 12.96568775177002, "learning_rate": 9.364137346333192e-07, "loss": 0.5442, "mean_token_accuracy": 0.827860951423645, "num_tokens": 84462158.0, "step": 2210 }, { "epoch": 0.2812619259636179, "ewc_loss": 0.0154199730604887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5419973351527005e-05, "grad_norm": 13.02001953125, "learning_rate": 9.368376430690971e-07, "loss": 0.4334, "mean_token_accuracy": 0.8590037822723389, "num_tokens": 84495887.0, "step": 2211 }, { "epoch": 0.28138913624220835, "ewc_loss": 0.015439938753843307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5439938579220325e-05, "grad_norm": 12.995201110839844, "learning_rate": 9.372615515048749e-07, "loss": 0.5061, "mean_token_accuracy": 0.8392241597175598, "num_tokens": 84530176.0, "step": 2212 }, { "epoch": 0.2815163465207989, "ewc_loss": 0.015431837178766727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.543183680041693e-05, "grad_norm": 12.991424560546875, "learning_rate": 9.376854599406528e-07, "loss": 0.5076, "mean_token_accuracy": 0.8398855328559875, "num_tokens": 84574450.0, "step": 2213 }, { "epoch": 0.2816435567993894, "ewc_loss": 0.015454309061169624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5454308595508337e-05, "grad_norm": 12.984065055847168, "learning_rate": 9.381093683764306e-07, "loss": 0.4756, "mean_token_accuracy": 0.8477356433868408, "num_tokens": 84610661.0, "step": 2214 }, { "epoch": 0.2817707670779799, "ewc_loss": 0.015439402312040329, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.543940197734628e-05, "grad_norm": 12.960885047912598, "learning_rate": 9.385332768122085e-07, "loss": 0.4568, "mean_token_accuracy": 0.8514411449432373, "num_tokens": 84651259.0, "step": 2215 }, { "epoch": 0.2818979773565704, "ewc_loss": 0.015448124147951603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.544812403153628e-05, "grad_norm": 12.99124526977539, "learning_rate": 9.389571852479864e-07, "loss": 0.4614, "mean_token_accuracy": 0.8523657321929932, "num_tokens": 84692389.0, "step": 2216 }, { "epoch": 0.28202518763516093, "ewc_loss": 0.01548960991203785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.548960972286295e-05, "grad_norm": 13.021879196166992, "learning_rate": 9.393810936837643e-07, "loss": 0.4653, "mean_token_accuracy": 0.8506121635437012, "num_tokens": 84731905.0, "step": 2217 }, { "epoch": 0.2821523979137514, "ewc_loss": 0.015480623580515385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5480623915209435e-05, "grad_norm": 12.970352172851562, "learning_rate": 9.398050021195422e-07, "loss": 0.4934, "mean_token_accuracy": 0.8460274338722229, "num_tokens": 84773727.0, "step": 2218 }, { "epoch": 0.28227960819234194, "ewc_loss": 0.015436431393027306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.543643156765029e-05, "grad_norm": 13.030126571655273, "learning_rate": 9.402289105553201e-07, "loss": 0.4384, "mean_token_accuracy": 0.8577964305877686, "num_tokens": 84816771.0, "step": 2219 }, { "epoch": 0.28240681847093246, "ewc_loss": 0.015487054362893105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.548705404275097e-05, "grad_norm": 12.968040466308594, "learning_rate": 9.406528189910978e-07, "loss": 0.4635, "mean_token_accuracy": 0.8524121046066284, "num_tokens": 84853172.0, "step": 2220 }, { "epoch": 0.28253402874952294, "ewc_loss": 0.01545729860663414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5457299014087766e-05, "grad_norm": 13.022124290466309, "learning_rate": 9.410767274268757e-07, "loss": 0.4436, "mean_token_accuracy": 0.8541243076324463, "num_tokens": 84891508.0, "step": 2221 }, { "epoch": 0.28266123902811346, "ewc_loss": 0.015499343164265156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5499343135161325e-05, "grad_norm": 13.032045364379883, "learning_rate": 9.415006358626536e-07, "loss": 0.5416, "mean_token_accuracy": 0.8266539573669434, "num_tokens": 84931204.0, "step": 2222 }, { "epoch": 0.282788449306704, "ewc_loss": 0.015492030419409275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.549203079775907e-05, "grad_norm": 13.05672550201416, "learning_rate": 9.419245442984314e-07, "loss": 0.523, "mean_token_accuracy": 0.8334733247756958, "num_tokens": 84966782.0, "step": 2223 }, { "epoch": 0.28291565958529447, "ewc_loss": 0.015484749339520931, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5484749383176677e-05, "grad_norm": 12.945755004882812, "learning_rate": 9.423484527342094e-07, "loss": 0.4683, "mean_token_accuracy": 0.847068190574646, "num_tokens": 85001512.0, "step": 2224 }, { "epoch": 0.283042869863885, "ewc_loss": 0.015503650531172752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5503650502068922e-05, "grad_norm": 13.068111419677734, "learning_rate": 9.427723611699872e-07, "loss": 0.4768, "mean_token_accuracy": 0.849037230014801, "num_tokens": 85036082.0, "step": 2225 }, { "epoch": 0.2831700801424755, "ewc_loss": 0.015531235374510288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5531235476373695e-05, "grad_norm": 13.01383113861084, "learning_rate": 9.431962696057652e-07, "loss": 0.4832, "mean_token_accuracy": 0.8455092906951904, "num_tokens": 85079433.0, "step": 2226 }, { "epoch": 0.283297290421066, "ewc_loss": 0.015512161888182163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5512161553488113e-05, "grad_norm": 13.024042129516602, "learning_rate": 9.43620178041543e-07, "loss": 0.4342, "mean_token_accuracy": 0.8618674874305725, "num_tokens": 85116940.0, "step": 2227 }, { "epoch": 0.2834245006996565, "ewc_loss": 0.01551718171685934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.55171819642419e-05, "grad_norm": 12.99355411529541, "learning_rate": 9.440440864773208e-07, "loss": 0.4878, "mean_token_accuracy": 0.8459346294403076, "num_tokens": 85153328.0, "step": 2228 }, { "epoch": 0.28355171097824705, "ewc_loss": 0.015535208396613598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.553520814923104e-05, "grad_norm": 13.065271377563477, "learning_rate": 9.444679949130987e-07, "loss": 0.4889, "mean_token_accuracy": 0.8418079614639282, "num_tokens": 85194059.0, "step": 2229 }, { "epoch": 0.2836789212568376, "ewc_loss": 0.015548285096883774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.554828486405313e-05, "grad_norm": 12.965754508972168, "learning_rate": 9.448919033488766e-07, "loss": 0.51, "mean_token_accuracy": 0.8399542570114136, "num_tokens": 85230820.0, "step": 2230 }, { "epoch": 0.28380613153542805, "ewc_loss": 0.015514111146330833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5514111510128714e-05, "grad_norm": 13.052739143371582, "learning_rate": 9.453158117846544e-07, "loss": 0.4911, "mean_token_accuracy": 0.8438466191291809, "num_tokens": 85268864.0, "step": 2231 }, { "epoch": 0.2839333418140186, "ewc_loss": 0.015600712038576603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.560071177664213e-05, "grad_norm": 13.037452697753906, "learning_rate": 9.457397202204324e-07, "loss": 0.5182, "mean_token_accuracy": 0.8353893756866455, "num_tokens": 85311377.0, "step": 2232 }, { "epoch": 0.2840605520926091, "ewc_loss": 0.015524734742939472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5524734408245422e-05, "grad_norm": 13.01198673248291, "learning_rate": 9.461636286562102e-07, "loss": 0.4318, "mean_token_accuracy": 0.8632246851921082, "num_tokens": 85347082.0, "step": 2233 }, { "epoch": 0.2841877623711996, "ewc_loss": 0.015584440901875496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.558444091642741e-05, "grad_norm": 13.076248168945312, "learning_rate": 9.465875370919882e-07, "loss": 0.5206, "mean_token_accuracy": 0.837335467338562, "num_tokens": 85384696.0, "step": 2234 }, { "epoch": 0.2843149726497901, "ewc_loss": 0.015577088110148907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.557708856125828e-05, "grad_norm": 13.12686824798584, "learning_rate": 9.470114455277659e-07, "loss": 0.5064, "mean_token_accuracy": 0.8374106287956238, "num_tokens": 85420192.0, "step": 2235 }, { "epoch": 0.28444218292838064, "ewc_loss": 0.015572217293083668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5572217307635583e-05, "grad_norm": 13.036750793457031, "learning_rate": 9.474353539635438e-07, "loss": 0.4562, "mean_token_accuracy": 0.8515883684158325, "num_tokens": 85462281.0, "step": 2236 }, { "epoch": 0.2845693932069711, "ewc_loss": 0.01557657215744257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5576571968267672e-05, "grad_norm": 13.085848808288574, "learning_rate": 9.478592623993217e-07, "loss": 0.4624, "mean_token_accuracy": 0.8501999378204346, "num_tokens": 85495191.0, "step": 2237 }, { "epoch": 0.28469660348556164, "ewc_loss": 0.015586190856993198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5586190784233622e-05, "grad_norm": 13.087467193603516, "learning_rate": 9.482831708350996e-07, "loss": 0.4599, "mean_token_accuracy": 0.8533587455749512, "num_tokens": 85533803.0, "step": 2238 }, { "epoch": 0.28482381376415217, "ewc_loss": 0.015586985275149345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5586985682602972e-05, "grad_norm": 13.07793140411377, "learning_rate": 9.487070792708775e-07, "loss": 0.5068, "mean_token_accuracy": 0.8407497406005859, "num_tokens": 85573424.0, "step": 2239 }, { "epoch": 0.28495102404274264, "ewc_loss": 0.015627803280949593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.562780380481854e-05, "grad_norm": 13.101541519165039, "learning_rate": 9.491309877066554e-07, "loss": 0.49, "mean_token_accuracy": 0.8441770672798157, "num_tokens": 85616423.0, "step": 2240 }, { "epoch": 0.28507823432133317, "ewc_loss": 0.015621586702764034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.562158649903722e-05, "grad_norm": 13.129585266113281, "learning_rate": 9.495548961424332e-07, "loss": 0.5161, "mean_token_accuracy": 0.8359214067459106, "num_tokens": 85658410.0, "step": 2241 }, { "epoch": 0.2852054445999237, "ewc_loss": 0.015620888210833073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.562088800710626e-05, "grad_norm": 13.161063194274902, "learning_rate": 9.499788045782111e-07, "loss": 0.4741, "mean_token_accuracy": 0.8477307558059692, "num_tokens": 85695767.0, "step": 2242 }, { "epoch": 0.28533265487851417, "ewc_loss": 0.015602081082761288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5602081475663e-05, "grad_norm": 13.056024551391602, "learning_rate": 9.504027130139889e-07, "loss": 0.4897, "mean_token_accuracy": 0.8436658382415771, "num_tokens": 85737771.0, "step": 2243 }, { "epoch": 0.2854598651571047, "ewc_loss": 0.01563013158738613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.563013211125508e-05, "grad_norm": 13.207996368408203, "learning_rate": 9.508266214497667e-07, "loss": 0.4363, "mean_token_accuracy": 0.8618483543395996, "num_tokens": 85774789.0, "step": 2244 }, { "epoch": 0.2855870754356952, "ewc_loss": 0.015652380883693695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.565238017064985e-05, "grad_norm": 13.011068344116211, "learning_rate": 9.512505298855447e-07, "loss": 0.4755, "mean_token_accuracy": 0.8502930402755737, "num_tokens": 85817833.0, "step": 2245 }, { "epoch": 0.2857142857142857, "ewc_loss": 0.015602502040565014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5602501662215218e-05, "grad_norm": 13.163098335266113, "learning_rate": 9.516744383213225e-07, "loss": 0.4191, "mean_token_accuracy": 0.8634750247001648, "num_tokens": 85854488.0, "step": 2246 }, { "epoch": 0.2858414959928762, "ewc_loss": 0.015689564868807793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5689565771026537e-05, "grad_norm": 13.102503776550293, "learning_rate": 9.520983467571005e-07, "loss": 0.4979, "mean_token_accuracy": 0.8423011898994446, "num_tokens": 85893746.0, "step": 2247 }, { "epoch": 0.28596870627146675, "ewc_loss": 0.015601063147187233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5601062841597013e-05, "grad_norm": 13.103726387023926, "learning_rate": 9.525222551928783e-07, "loss": 0.5376, "mean_token_accuracy": 0.8348484039306641, "num_tokens": 85932798.0, "step": 2248 }, { "epoch": 0.2860959165500572, "ewc_loss": 0.015661094337701797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5661094948882237e-05, "grad_norm": 13.098007202148438, "learning_rate": 9.529461636286562e-07, "loss": 0.451, "mean_token_accuracy": 0.8550750017166138, "num_tokens": 85964446.0, "step": 2249 }, { "epoch": 0.28622312682864776, "ewc_loss": 0.015654709190130234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5654708477086388e-05, "grad_norm": 13.05636215209961, "learning_rate": 9.533700720644341e-07, "loss": 0.4286, "mean_token_accuracy": 0.8625198602676392, "num_tokens": 86000587.0, "step": 2250 }, { "epoch": 0.2863503371072383, "ewc_loss": 0.015659771859645844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.565977254358586e-05, "grad_norm": 13.079755783081055, "learning_rate": 9.537939805002118e-07, "loss": 0.4807, "mean_token_accuracy": 0.8463059067726135, "num_tokens": 86042891.0, "step": 2251 }, { "epoch": 0.28647754738582876, "ewc_loss": 0.015713199973106384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5713199900346808e-05, "grad_norm": 13.164274215698242, "learning_rate": 9.542178889359898e-07, "loss": 0.458, "mean_token_accuracy": 0.8532431125640869, "num_tokens": 86079973.0, "step": 2252 }, { "epoch": 0.2866047576644193, "ewc_loss": 0.01570880226790905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5708801583969034e-05, "grad_norm": 13.136764526367188, "learning_rate": 9.546417973717677e-07, "loss": 0.481, "mean_token_accuracy": 0.8475881814956665, "num_tokens": 86119767.0, "step": 2253 }, { "epoch": 0.2867319679430098, "ewc_loss": 0.015714965760707855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.571496613905765e-05, "grad_norm": 13.063323020935059, "learning_rate": 9.550657058075455e-07, "loss": 0.5017, "mean_token_accuracy": 0.8412477970123291, "num_tokens": 86163176.0, "step": 2254 }, { "epoch": 0.2868591782216003, "ewc_loss": 0.01575729250907898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.575729220348876e-05, "grad_norm": 13.235050201416016, "learning_rate": 9.554896142433234e-07, "loss": 0.4968, "mean_token_accuracy": 0.8401882648468018, "num_tokens": 86201523.0, "step": 2255 }, { "epoch": 0.2869863885001908, "ewc_loss": 0.01573268510401249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.573268491483759e-05, "grad_norm": 13.083259582519531, "learning_rate": 9.559135226791012e-07, "loss": 0.5155, "mean_token_accuracy": 0.8316934704780579, "num_tokens": 86240277.0, "step": 2256 }, { "epoch": 0.28711359877878134, "ewc_loss": 0.0157316867262125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5731686289655045e-05, "grad_norm": 13.164502143859863, "learning_rate": 9.563374311148793e-07, "loss": 0.5051, "mean_token_accuracy": 0.8397306203842163, "num_tokens": 86289714.0, "step": 2257 }, { "epoch": 0.2872408090573718, "ewc_loss": 0.01576712727546692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5767127479193732e-05, "grad_norm": 13.130732536315918, "learning_rate": 9.56761339550657e-07, "loss": 0.4559, "mean_token_accuracy": 0.8549535274505615, "num_tokens": 86320786.0, "step": 2258 }, { "epoch": 0.28736801933596234, "ewc_loss": 0.015739688649773598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5739688024041243e-05, "grad_norm": 13.125412940979004, "learning_rate": 9.57185247986435e-07, "loss": 0.4459, "mean_token_accuracy": 0.8574937582015991, "num_tokens": 86358002.0, "step": 2259 }, { "epoch": 0.28749522961455287, "ewc_loss": 0.015795765444636345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5795765648363158e-05, "grad_norm": 13.172987937927246, "learning_rate": 9.576091564222128e-07, "loss": 0.4142, "mean_token_accuracy": 0.8642817139625549, "num_tokens": 86395334.0, "step": 2260 }, { "epoch": 0.28762243989314334, "ewc_loss": 0.015758760273456573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.575876012793742e-05, "grad_norm": 13.110847473144531, "learning_rate": 9.580330648579906e-07, "loss": 0.4918, "mean_token_accuracy": 0.8401589393615723, "num_tokens": 86432616.0, "step": 2261 }, { "epoch": 0.2877496501717339, "ewc_loss": 0.015727223828434944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5727224308648147e-05, "grad_norm": 13.101658821105957, "learning_rate": 9.584569732937685e-07, "loss": 0.486, "mean_token_accuracy": 0.8432148694992065, "num_tokens": 86472293.0, "step": 2262 }, { "epoch": 0.2878768604503244, "ewc_loss": 0.015791965648531914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.579196577949915e-05, "grad_norm": 13.162043571472168, "learning_rate": 9.588808817295463e-07, "loss": 0.4772, "mean_token_accuracy": 0.8438735604286194, "num_tokens": 86506304.0, "step": 2263 }, { "epoch": 0.2880040707289149, "ewc_loss": 0.01577957160770893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.577957118570339e-05, "grad_norm": 13.078690528869629, "learning_rate": 9.593047901653242e-07, "loss": 0.4747, "mean_token_accuracy": 0.8459247350692749, "num_tokens": 86540520.0, "step": 2264 }, { "epoch": 0.2881312810075054, "ewc_loss": 0.015839822590351105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5839823390706442e-05, "grad_norm": 13.18597412109375, "learning_rate": 9.597286986011022e-07, "loss": 0.4356, "mean_token_accuracy": 0.8590083718299866, "num_tokens": 86576956.0, "step": 2265 }, { "epoch": 0.28825849128609593, "ewc_loss": 0.015841111540794373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5841111235204153e-05, "grad_norm": 13.091238021850586, "learning_rate": 9.601526070368799e-07, "loss": 0.5201, "mean_token_accuracy": 0.8353040218353271, "num_tokens": 86613799.0, "step": 2266 }, { "epoch": 0.2883857015646864, "ewc_loss": 0.015829604119062424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.582960430823732e-05, "grad_norm": 13.172151565551758, "learning_rate": 9.60576515472658e-07, "loss": 0.4589, "mean_token_accuracy": 0.852531373500824, "num_tokens": 86650338.0, "step": 2267 }, { "epoch": 0.28851291184327693, "ewc_loss": 0.015881894156336784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5881894796621054e-05, "grad_norm": 13.132814407348633, "learning_rate": 9.610004239084358e-07, "loss": 0.4719, "mean_token_accuracy": 0.8471061587333679, "num_tokens": 86691605.0, "step": 2268 }, { "epoch": 0.28864012212186746, "ewc_loss": 0.015846313908696175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5846313544898294e-05, "grad_norm": 13.217361450195312, "learning_rate": 9.614243323442136e-07, "loss": 0.5397, "mean_token_accuracy": 0.8271902799606323, "num_tokens": 86728767.0, "step": 2269 }, { "epoch": 0.28876733240045793, "ewc_loss": 0.01589876599609852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5898765923338942e-05, "grad_norm": 13.175174713134766, "learning_rate": 9.618482407799915e-07, "loss": 0.4527, "mean_token_accuracy": 0.8529199361801147, "num_tokens": 86765240.0, "step": 2270 }, { "epoch": 0.28889454267904846, "ewc_loss": 0.01588546670973301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.588546729180962e-05, "grad_norm": 13.163941383361816, "learning_rate": 9.622721492157693e-07, "loss": 0.42, "mean_token_accuracy": 0.8655073642730713, "num_tokens": 86804042.0, "step": 2271 }, { "epoch": 0.289021752957639, "ewc_loss": 0.01589772291481495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.589772364241071e-05, "grad_norm": 13.245749473571777, "learning_rate": 9.626960576515472e-07, "loss": 0.4711, "mean_token_accuracy": 0.8492452502250671, "num_tokens": 86835220.0, "step": 2272 }, { "epoch": 0.28914896323622946, "ewc_loss": 0.01592588610947132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5925885236356407e-05, "grad_norm": 13.170352935791016, "learning_rate": 9.63119966087325e-07, "loss": 0.4569, "mean_token_accuracy": 0.8511273860931396, "num_tokens": 86873414.0, "step": 2273 }, { "epoch": 0.28927617351482, "ewc_loss": 0.015914570540189743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5914571122266352e-05, "grad_norm": 13.150273323059082, "learning_rate": 9.635438745231029e-07, "loss": 0.4471, "mean_token_accuracy": 0.8572249412536621, "num_tokens": 86914272.0, "step": 2274 }, { "epoch": 0.2894033837934105, "ewc_loss": 0.015940748155117035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.594074819877278e-05, "grad_norm": 13.127717018127441, "learning_rate": 9.63967782958881e-07, "loss": 0.4879, "mean_token_accuracy": 0.8420060873031616, "num_tokens": 86952425.0, "step": 2275 }, { "epoch": 0.289530594072001, "ewc_loss": 0.015956077724695206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5956078641465865e-05, "grad_norm": 13.231739044189453, "learning_rate": 9.643916913946588e-07, "loss": 0.4466, "mean_token_accuracy": 0.8569761514663696, "num_tokens": 86987842.0, "step": 2276 }, { "epoch": 0.2896578043505915, "ewc_loss": 0.015964198857545853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5964198610163294e-05, "grad_norm": 13.192774772644043, "learning_rate": 9.648155998304366e-07, "loss": 0.4199, "mean_token_accuracy": 0.8650720119476318, "num_tokens": 87026750.0, "step": 2277 }, { "epoch": 0.28978501462918205, "ewc_loss": 0.015969660133123398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5969661035342142e-05, "grad_norm": 13.202753067016602, "learning_rate": 9.652395082662145e-07, "loss": 0.524, "mean_token_accuracy": 0.8319334983825684, "num_tokens": 87069685.0, "step": 2278 }, { "epoch": 0.2899122249077726, "ewc_loss": 0.01596194878220558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5961948520271108e-05, "grad_norm": 13.219972610473633, "learning_rate": 9.656634167019923e-07, "loss": 0.4795, "mean_token_accuracy": 0.8426449298858643, "num_tokens": 87107054.0, "step": 2279 }, { "epoch": 0.29003943518636305, "ewc_loss": 0.01602100394666195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6021003830246627e-05, "grad_norm": 13.347208976745605, "learning_rate": 9.660873251377701e-07, "loss": 0.4861, "mean_token_accuracy": 0.8442389965057373, "num_tokens": 87146254.0, "step": 2280 }, { "epoch": 0.2901666454649536, "ewc_loss": 0.015997791662812233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.599779170646798e-05, "grad_norm": 13.252788543701172, "learning_rate": 9.66511233573548e-07, "loss": 0.5107, "mean_token_accuracy": 0.841096818447113, "num_tokens": 87182404.0, "step": 2281 }, { "epoch": 0.2902938557435441, "ewc_loss": 0.015936769545078278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5936770068947226e-05, "grad_norm": 13.1520414352417, "learning_rate": 9.669351420093258e-07, "loss": 0.4424, "mean_token_accuracy": 0.8577525615692139, "num_tokens": 87216551.0, "step": 2282 }, { "epoch": 0.2904210660221346, "ewc_loss": 0.015978621318936348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.597862137714401e-05, "grad_norm": 13.279358863830566, "learning_rate": 9.67359050445104e-07, "loss": 0.4671, "mean_token_accuracy": 0.8497762680053711, "num_tokens": 87249402.0, "step": 2283 }, { "epoch": 0.2905482763007251, "ewc_loss": 0.01601277105510235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.601277108420618e-05, "grad_norm": 13.121003150939941, "learning_rate": 9.677829588808817e-07, "loss": 0.4531, "mean_token_accuracy": 0.8516486883163452, "num_tokens": 87281846.0, "step": 2284 }, { "epoch": 0.29067548657931563, "ewc_loss": 0.015968715772032738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.5968715160852298e-05, "grad_norm": 13.178945541381836, "learning_rate": 9.682068673166596e-07, "loss": 0.5247, "mean_token_accuracy": 0.8368241786956787, "num_tokens": 87315765.0, "step": 2285 }, { "epoch": 0.2908026968579061, "ewc_loss": 0.01610289327800274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6102892914204858e-05, "grad_norm": 13.278037071228027, "learning_rate": 9.686307757524374e-07, "loss": 0.4666, "mean_token_accuracy": 0.8490803241729736, "num_tokens": 87353668.0, "step": 2286 }, { "epoch": 0.29092990713649663, "ewc_loss": 0.01606169529259205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6061694623203948e-05, "grad_norm": 13.161894798278809, "learning_rate": 9.690546841882153e-07, "loss": 0.3978, "mean_token_accuracy": 0.8714944124221802, "num_tokens": 87390747.0, "step": 2287 }, { "epoch": 0.29105711741508716, "ewc_loss": 0.016032325103878975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6032325220294297e-05, "grad_norm": 13.180928230285645, "learning_rate": 9.694785926239931e-07, "loss": 0.4795, "mean_token_accuracy": 0.8494030237197876, "num_tokens": 87430686.0, "step": 2288 }, { "epoch": 0.29118432769367764, "ewc_loss": 0.016126960515975952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6126959963003173e-05, "grad_norm": 13.243826866149902, "learning_rate": 9.69902501059771e-07, "loss": 0.5191, "mean_token_accuracy": 0.8349023461341858, "num_tokens": 87465290.0, "step": 2289 }, { "epoch": 0.29131153797226816, "ewc_loss": 0.01609981246292591, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.609981154615525e-05, "grad_norm": 13.235063552856445, "learning_rate": 9.703264094955488e-07, "loss": 0.4883, "mean_token_accuracy": 0.8431195020675659, "num_tokens": 87504553.0, "step": 2290 }, { "epoch": 0.2914387482508587, "ewc_loss": 0.016107574105262756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6107574992929585e-05, "grad_norm": 13.178988456726074, "learning_rate": 9.707503179313269e-07, "loss": 0.4579, "mean_token_accuracy": 0.8544882535934448, "num_tokens": 87544896.0, "step": 2291 }, { "epoch": 0.29156595852944917, "ewc_loss": 0.016119608655571938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6119609426823445e-05, "grad_norm": 13.212382316589355, "learning_rate": 9.711742263671047e-07, "loss": 0.4576, "mean_token_accuracy": 0.8531988263130188, "num_tokens": 87582075.0, "step": 2292 }, { "epoch": 0.2916931688080397, "ewc_loss": 0.016150472685694695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6150472220033407e-05, "grad_norm": 13.240909576416016, "learning_rate": 9.715981348028826e-07, "loss": 0.4511, "mean_token_accuracy": 0.8551439642906189, "num_tokens": 87623117.0, "step": 2293 }, { "epoch": 0.2918203790866302, "ewc_loss": 0.016191136091947556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6191135728149675e-05, "grad_norm": 13.355195045471191, "learning_rate": 9.720220432386604e-07, "loss": 0.4661, "mean_token_accuracy": 0.8530863523483276, "num_tokens": 87656263.0, "step": 2294 }, { "epoch": 0.2919475893652207, "ewc_loss": 0.016142915934324265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6142916138051078e-05, "grad_norm": 13.218024253845215, "learning_rate": 9.724459516744383e-07, "loss": 0.4385, "mean_token_accuracy": 0.8590909242630005, "num_tokens": 87695852.0, "step": 2295 }, { "epoch": 0.2920747996438112, "ewc_loss": 0.0160792488604784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.607924968993757e-05, "grad_norm": 13.174707412719727, "learning_rate": 9.728698601102161e-07, "loss": 0.438, "mean_token_accuracy": 0.8590394258499146, "num_tokens": 87734331.0, "step": 2296 }, { "epoch": 0.29220200992240175, "ewc_loss": 0.01616886630654335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6168865840882063e-05, "grad_norm": 13.312957763671875, "learning_rate": 9.73293768545994e-07, "loss": 0.472, "mean_token_accuracy": 0.8456056118011475, "num_tokens": 87768447.0, "step": 2297 }, { "epoch": 0.2923292202009922, "ewc_loss": 0.016166718676686287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.616671943338588e-05, "grad_norm": 13.267602920532227, "learning_rate": 9.737176769817718e-07, "loss": 0.4987, "mean_token_accuracy": 0.845011830329895, "num_tokens": 87813213.0, "step": 2298 }, { "epoch": 0.29245643047958275, "ewc_loss": 0.016134856268763542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6134856196003966e-05, "grad_norm": 13.276408195495605, "learning_rate": 9.741415854175499e-07, "loss": 0.535, "mean_token_accuracy": 0.8336260914802551, "num_tokens": 87851336.0, "step": 2299 }, { "epoch": 0.2925836407581733, "ewc_loss": 0.016132649034261703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6132649761857465e-05, "grad_norm": 13.193011283874512, "learning_rate": 9.745654938533277e-07, "loss": 0.4887, "mean_token_accuracy": 0.8433858156204224, "num_tokens": 87888474.0, "step": 2300 }, { "epoch": 0.29271085103676375, "ewc_loss": 0.016150761395692825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.615076143934857e-05, "grad_norm": 13.339428901672363, "learning_rate": 9.749894022891056e-07, "loss": 0.5132, "mean_token_accuracy": 0.8383822441101074, "num_tokens": 87928232.0, "step": 2301 }, { "epoch": 0.2928380613153543, "ewc_loss": 0.01618027873337269, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.618027818039991e-05, "grad_norm": 13.153454780578613, "learning_rate": 9.754133107248834e-07, "loss": 0.4307, "mean_token_accuracy": 0.8598496317863464, "num_tokens": 87966982.0, "step": 2302 }, { "epoch": 0.2929652715939448, "ewc_loss": 0.016152050346136093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6152051102835685e-05, "grad_norm": 13.313947677612305, "learning_rate": 9.758372191606612e-07, "loss": 0.5174, "mean_token_accuracy": 0.8340871930122375, "num_tokens": 88003550.0, "step": 2303 }, { "epoch": 0.2930924818725353, "ewc_loss": 0.016230467706918716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6230467736022547e-05, "grad_norm": 13.226091384887695, "learning_rate": 9.76261127596439e-07, "loss": 0.4276, "mean_token_accuracy": 0.862973690032959, "num_tokens": 88038038.0, "step": 2304 }, { "epoch": 0.2932196921511258, "ewc_loss": 0.016167806461453438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.61678071890492e-05, "grad_norm": 13.320189476013184, "learning_rate": 9.76685036032217e-07, "loss": 0.5001, "mean_token_accuracy": 0.8372560143470764, "num_tokens": 88074627.0, "step": 2305 }, { "epoch": 0.29334690242971634, "ewc_loss": 0.01625976525247097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6259764379356056e-05, "grad_norm": 13.245223045349121, "learning_rate": 9.771089444679948e-07, "loss": 0.4426, "mean_token_accuracy": 0.8572534918785095, "num_tokens": 88116772.0, "step": 2306 }, { "epoch": 0.2934741127083068, "ewc_loss": 0.016214365139603615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6214364222832955e-05, "grad_norm": 13.301011085510254, "learning_rate": 9.775328529037728e-07, "loss": 0.4877, "mean_token_accuracy": 0.8433687090873718, "num_tokens": 88152888.0, "step": 2307 }, { "epoch": 0.29360132298689734, "ewc_loss": 0.016250494867563248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6250494809355587e-05, "grad_norm": 13.361879348754883, "learning_rate": 9.779567613395507e-07, "loss": 0.5176, "mean_token_accuracy": 0.8378593325614929, "num_tokens": 88184544.0, "step": 2308 }, { "epoch": 0.29372853326548787, "ewc_loss": 0.01623438112437725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6234380382229574e-05, "grad_norm": 13.269608497619629, "learning_rate": 9.783806697753285e-07, "loss": 0.4608, "mean_token_accuracy": 0.8528844118118286, "num_tokens": 88221603.0, "step": 2309 }, { "epoch": 0.29385574354407834, "ewc_loss": 0.016248436644673347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6248437532340176e-05, "grad_norm": 13.325888633728027, "learning_rate": 9.788045782111064e-07, "loss": 0.4352, "mean_token_accuracy": 0.8598183393478394, "num_tokens": 88265109.0, "step": 2310 }, { "epoch": 0.29398295382266887, "ewc_loss": 0.016257386654615402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6257386960205622e-05, "grad_norm": 13.279876708984375, "learning_rate": 9.792284866468842e-07, "loss": 0.4258, "mean_token_accuracy": 0.86099773645401, "num_tokens": 88302756.0, "step": 2311 }, { "epoch": 0.2941101641012594, "ewc_loss": 0.016207998618483543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6207997759920545e-05, "grad_norm": 13.317380905151367, "learning_rate": 9.79652395082662e-07, "loss": 0.4256, "mean_token_accuracy": 0.8577460050582886, "num_tokens": 88335605.0, "step": 2312 }, { "epoch": 0.29423737437984987, "ewc_loss": 0.016273997724056244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6273997971438803e-05, "grad_norm": 13.29171371459961, "learning_rate": 9.8007630351844e-07, "loss": 0.4957, "mean_token_accuracy": 0.8438183665275574, "num_tokens": 88369447.0, "step": 2313 }, { "epoch": 0.2943645846584404, "ewc_loss": 0.01625197008252144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6251970009761862e-05, "grad_norm": 13.291825294494629, "learning_rate": 9.805002119542178e-07, "loss": 0.5252, "mean_token_accuracy": 0.831856906414032, "num_tokens": 88404442.0, "step": 2314 }, { "epoch": 0.2944917949370309, "ewc_loss": 0.016312118619680405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6312118532368913e-05, "grad_norm": 13.308959007263184, "learning_rate": 9.809241203899958e-07, "loss": 0.4706, "mean_token_accuracy": 0.8510032892227173, "num_tokens": 88445870.0, "step": 2315 }, { "epoch": 0.2946190052156214, "ewc_loss": 0.016309482976794243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6309482816723175e-05, "grad_norm": 13.329060554504395, "learning_rate": 9.813480288257737e-07, "loss": 0.485, "mean_token_accuracy": 0.8416692018508911, "num_tokens": 88480994.0, "step": 2316 }, { "epoch": 0.2947462154942119, "ewc_loss": 0.016316616907715797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6316616893163882e-05, "grad_norm": 13.248295783996582, "learning_rate": 9.817719372615515e-07, "loss": 0.4408, "mean_token_accuracy": 0.8580677509307861, "num_tokens": 88519666.0, "step": 2317 }, { "epoch": 0.29487342577280246, "ewc_loss": 0.016312962397933006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.631296254345216e-05, "grad_norm": 13.362229347229004, "learning_rate": 9.821958456973294e-07, "loss": 0.4151, "mean_token_accuracy": 0.8648424744606018, "num_tokens": 88553855.0, "step": 2318 }, { "epoch": 0.29500063605139293, "ewc_loss": 0.01637711003422737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6377109204768203e-05, "grad_norm": 13.294302940368652, "learning_rate": 9.826197541331072e-07, "loss": 0.4309, "mean_token_accuracy": 0.8622801303863525, "num_tokens": 88589053.0, "step": 2319 }, { "epoch": 0.29512784632998346, "ewc_loss": 0.01634078659117222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6340785805368796e-05, "grad_norm": 13.31486988067627, "learning_rate": 9.83043662568885e-07, "loss": 0.4888, "mean_token_accuracy": 0.8446817398071289, "num_tokens": 88627067.0, "step": 2320 }, { "epoch": 0.295255056608574, "ewc_loss": 0.01637873612344265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6378735381294973e-05, "grad_norm": 13.352311134338379, "learning_rate": 9.83467571004663e-07, "loss": 0.4331, "mean_token_accuracy": 0.8602496385574341, "num_tokens": 88668030.0, "step": 2321 }, { "epoch": 0.29538226688716446, "ewc_loss": 0.016356391832232475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6356390915461816e-05, "grad_norm": 13.262405395507812, "learning_rate": 9.838914794404407e-07, "loss": 0.4317, "mean_token_accuracy": 0.8620027303695679, "num_tokens": 88708889.0, "step": 2322 }, { "epoch": 0.295509477165755, "ewc_loss": 0.016385871917009354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6385871276725084e-05, "grad_norm": 13.372967720031738, "learning_rate": 9.843153878762188e-07, "loss": 0.4988, "mean_token_accuracy": 0.8432937860488892, "num_tokens": 88745689.0, "step": 2323 }, { "epoch": 0.2956366874443455, "ewc_loss": 0.01640365645289421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.640365553612355e-05, "grad_norm": 13.308679580688477, "learning_rate": 9.847392963119966e-07, "loss": 0.45, "mean_token_accuracy": 0.8568452596664429, "num_tokens": 88784667.0, "step": 2324 }, { "epoch": 0.295763897722936, "ewc_loss": 0.016395332291722298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6395331840612926e-05, "grad_norm": 13.363302230834961, "learning_rate": 9.851632047477745e-07, "loss": 0.4636, "mean_token_accuracy": 0.8502978086471558, "num_tokens": 88827334.0, "step": 2325 }, { "epoch": 0.2958911080015265, "ewc_loss": 0.016394350677728653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6394351405324414e-05, "grad_norm": 13.327396392822266, "learning_rate": 9.855871131835523e-07, "loss": 0.397, "mean_token_accuracy": 0.8700763583183289, "num_tokens": 88863744.0, "step": 2326 }, { "epoch": 0.29601831828011704, "ewc_loss": 0.01635773852467537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6357738786609843e-05, "grad_norm": 13.30174732208252, "learning_rate": 9.860110216193302e-07, "loss": 0.4292, "mean_token_accuracy": 0.8630789518356323, "num_tokens": 88903693.0, "step": 2327 }, { "epoch": 0.2961455285587075, "ewc_loss": 0.016380831599235535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6380830857087858e-05, "grad_norm": 13.347646713256836, "learning_rate": 9.86434930055108e-07, "loss": 0.4247, "mean_token_accuracy": 0.862349808216095, "num_tokens": 88936668.0, "step": 2328 }, { "epoch": 0.29627273883729804, "ewc_loss": 0.01640254072844982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6402540495619178e-05, "grad_norm": 13.31421184539795, "learning_rate": 9.868588384908859e-07, "loss": 0.4342, "mean_token_accuracy": 0.857590913772583, "num_tokens": 88970920.0, "step": 2329 }, { "epoch": 0.2963999491158886, "ewc_loss": 0.016374636441469193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6374637198168784e-05, "grad_norm": 13.33220100402832, "learning_rate": 9.872827469266637e-07, "loss": 0.4369, "mean_token_accuracy": 0.8562982678413391, "num_tokens": 89010856.0, "step": 2330 }, { "epoch": 0.2965271593944791, "ewc_loss": 0.016417361795902252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.641736162127927e-05, "grad_norm": 13.326687812805176, "learning_rate": 9.877066553624418e-07, "loss": 0.4456, "mean_token_accuracy": 0.8560352921485901, "num_tokens": 89047109.0, "step": 2331 }, { "epoch": 0.2966543696730696, "ewc_loss": 0.016404103487730026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6404103007516824e-05, "grad_norm": 13.29448413848877, "learning_rate": 9.881305637982196e-07, "loss": 0.4543, "mean_token_accuracy": 0.8545552492141724, "num_tokens": 89087859.0, "step": 2332 }, { "epoch": 0.2967815799516601, "ewc_loss": 0.01642025262117386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6420251995441504e-05, "grad_norm": 13.378293991088867, "learning_rate": 9.885544722339975e-07, "loss": 0.4367, "mean_token_accuracy": 0.8573349118232727, "num_tokens": 89123601.0, "step": 2333 }, { "epoch": 0.29690879023025063, "ewc_loss": 0.01643841154873371, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6438410966657102e-05, "grad_norm": 13.341032981872559, "learning_rate": 9.889783806697753e-07, "loss": 0.4192, "mean_token_accuracy": 0.8657649159431458, "num_tokens": 89160133.0, "step": 2334 }, { "epoch": 0.2970360005088411, "ewc_loss": 0.01642211340367794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.642211282160133e-05, "grad_norm": 13.324057579040527, "learning_rate": 9.894022891055532e-07, "loss": 0.413, "mean_token_accuracy": 0.867870569229126, "num_tokens": 89196731.0, "step": 2335 }, { "epoch": 0.29716321078743163, "ewc_loss": 0.016465838998556137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.646583950787317e-05, "grad_norm": 13.36482048034668, "learning_rate": 9.89826197541331e-07, "loss": 0.4017, "mean_token_accuracy": 0.8698976635932922, "num_tokens": 89230159.0, "step": 2336 }, { "epoch": 0.29729042106602216, "ewc_loss": 0.016421707347035408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.642170718696434e-05, "grad_norm": 13.376700401306152, "learning_rate": 9.902501059771089e-07, "loss": 0.4664, "mean_token_accuracy": 0.8485066294670105, "num_tokens": 89264648.0, "step": 2337 }, { "epoch": 0.29741763134461263, "ewc_loss": 0.016490936279296875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6490936104673892e-05, "grad_norm": 13.383551597595215, "learning_rate": 9.906740144128867e-07, "loss": 0.4662, "mean_token_accuracy": 0.8481549024581909, "num_tokens": 89302505.0, "step": 2338 }, { "epoch": 0.29754484162320316, "ewc_loss": 0.01646866463124752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6468664398416877e-05, "grad_norm": 13.379528999328613, "learning_rate": 9.910979228486648e-07, "loss": 0.4252, "mean_token_accuracy": 0.8609700202941895, "num_tokens": 89338079.0, "step": 2339 }, { "epoch": 0.2976720519017937, "ewc_loss": 0.016515197232365608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6515197785338387e-05, "grad_norm": 13.42723560333252, "learning_rate": 9.915218312844426e-07, "loss": 0.4903, "mean_token_accuracy": 0.847962498664856, "num_tokens": 89376717.0, "step": 2340 }, { "epoch": 0.29779926218038416, "ewc_loss": 0.01651386357843876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6513864466105588e-05, "grad_norm": 13.366710662841797, "learning_rate": 9.919457397202205e-07, "loss": 0.4798, "mean_token_accuracy": 0.8452503085136414, "num_tokens": 89416164.0, "step": 2341 }, { "epoch": 0.2979264724589747, "ewc_loss": 0.01649174466729164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.649174555495847e-05, "grad_norm": 13.426027297973633, "learning_rate": 9.923696481559983e-07, "loss": 0.4333, "mean_token_accuracy": 0.86073899269104, "num_tokens": 89446491.0, "step": 2342 }, { "epoch": 0.2980536827375652, "ewc_loss": 0.016519863158464432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6519863493158482e-05, "grad_norm": 13.33735179901123, "learning_rate": 9.927935565917761e-07, "loss": 0.4816, "mean_token_accuracy": 0.8430908918380737, "num_tokens": 89483637.0, "step": 2343 }, { "epoch": 0.2981808930161557, "ewc_loss": 0.016531117260456085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.653111758059822e-05, "grad_norm": 13.37739372253418, "learning_rate": 9.93217465027554e-07, "loss": 0.4741, "mean_token_accuracy": 0.851422905921936, "num_tokens": 89526532.0, "step": 2344 }, { "epoch": 0.2983081032947462, "ewc_loss": 0.01650831662118435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6508316548424773e-05, "grad_norm": 13.350072860717773, "learning_rate": 9.936413734633318e-07, "loss": 0.5357, "mean_token_accuracy": 0.8341864943504333, "num_tokens": 89558353.0, "step": 2345 }, { "epoch": 0.29843531357333675, "ewc_loss": 0.01654013991355896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6540139768039808e-05, "grad_norm": 13.383685111999512, "learning_rate": 9.940652818991097e-07, "loss": 0.428, "mean_token_accuracy": 0.8623740673065186, "num_tokens": 89598727.0, "step": 2346 }, { "epoch": 0.2985625238519272, "ewc_loss": 0.01663154736161232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6631547623546794e-05, "grad_norm": 13.500410079956055, "learning_rate": 9.944891903348877e-07, "loss": 0.4956, "mean_token_accuracy": 0.8414719104766846, "num_tokens": 89640093.0, "step": 2347 }, { "epoch": 0.29868973413051775, "ewc_loss": 0.016610488295555115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6610489183221944e-05, "grad_norm": 13.420228958129883, "learning_rate": 9.949130987706656e-07, "loss": 0.4411, "mean_token_accuracy": 0.8577425479888916, "num_tokens": 89681401.0, "step": 2348 }, { "epoch": 0.2988169444091083, "ewc_loss": 0.016612837091088295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.661283749854192e-05, "grad_norm": 13.629432678222656, "learning_rate": 9.953370072064432e-07, "loss": 0.4337, "mean_token_accuracy": 0.8587164878845215, "num_tokens": 89720366.0, "step": 2349 }, { "epoch": 0.29894415468769875, "ewc_loss": 0.01661243848502636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6612439139862545e-05, "grad_norm": 13.405516624450684, "learning_rate": 9.957609156422213e-07, "loss": 0.5109, "mean_token_accuracy": 0.8367594480514526, "num_tokens": 89762234.0, "step": 2350 }, { "epoch": 0.2990713649662893, "ewc_loss": 0.01653820089995861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6538200725335628e-05, "grad_norm": 13.749051094055176, "learning_rate": 9.961848240779991e-07, "loss": 0.4576, "mean_token_accuracy": 0.8540748953819275, "num_tokens": 89792208.0, "step": 2351 }, { "epoch": 0.2991985752448798, "ewc_loss": 0.016646189615130424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6646190488245338e-05, "grad_norm": 13.4006929397583, "learning_rate": 9.96608732513777e-07, "loss": 0.4343, "mean_token_accuracy": 0.8645193576812744, "num_tokens": 89833246.0, "step": 2352 }, { "epoch": 0.2993257855234703, "ewc_loss": 0.016447030007839203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6447029338451102e-05, "grad_norm": 13.38206672668457, "learning_rate": 9.970326409495548e-07, "loss": 0.4934, "mean_token_accuracy": 0.8409097194671631, "num_tokens": 89873271.0, "step": 2353 }, { "epoch": 0.2994529958020608, "ewc_loss": 0.016633622348308563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.663362309045624e-05, "grad_norm": 13.525476455688477, "learning_rate": 9.974565493853327e-07, "loss": 0.5085, "mean_token_accuracy": 0.8365013003349304, "num_tokens": 89913069.0, "step": 2354 }, { "epoch": 0.29958020608065133, "ewc_loss": 0.016575463116168976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6575462723267265e-05, "grad_norm": 13.417603492736816, "learning_rate": 9.978804578211107e-07, "loss": 0.4884, "mean_token_accuracy": 0.8428868651390076, "num_tokens": 89948915.0, "step": 2355 }, { "epoch": 0.2997074163592418, "ewc_loss": 0.01656521111726761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.656521089898888e-05, "grad_norm": 13.399345397949219, "learning_rate": 9.983043662568886e-07, "loss": 0.4869, "mean_token_accuracy": 0.8454707264900208, "num_tokens": 89982123.0, "step": 2356 }, { "epoch": 0.29983462663783234, "ewc_loss": 0.016567254438996315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6567253624089062e-05, "grad_norm": 13.354045867919922, "learning_rate": 9.987282746926662e-07, "loss": 0.4368, "mean_token_accuracy": 0.8588844537734985, "num_tokens": 90018259.0, "step": 2357 }, { "epoch": 0.29996183691642286, "ewc_loss": 0.016626859083771706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6626858268864453e-05, "grad_norm": 13.506240844726562, "learning_rate": 9.991521831284443e-07, "loss": 0.4589, "mean_token_accuracy": 0.8539094924926758, "num_tokens": 90059041.0, "step": 2358 }, { "epoch": 0.30008904719501334, "ewc_loss": 0.01664072461426258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6640724425087683e-05, "grad_norm": 13.429643630981445, "learning_rate": 9.995760915642221e-07, "loss": 0.4078, "mean_token_accuracy": 0.8681272268295288, "num_tokens": 90089463.0, "step": 2359 }, { "epoch": 0.30021625747360386, "ewc_loss": 0.01659133844077587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6591338862781413e-05, "grad_norm": 13.401180267333984, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8605650067329407, "num_tokens": 90125691.0, "step": 2360 }, { "epoch": 0.3003434677521944, "ewc_loss": 0.016641778871417046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.664177943894174e-05, "grad_norm": 13.498373031616211, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.849335789680481, "num_tokens": 90161778.0, "step": 2361 }, { "epoch": 0.30047067803078487, "ewc_loss": 0.016701197251677513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6701196727808565e-05, "grad_norm": 13.451842308044434, "learning_rate": 1e-06, "loss": 0.5107, "mean_token_accuracy": 0.8368611335754395, "num_tokens": 90202243.0, "step": 2362 }, { "epoch": 0.3005978883093754, "ewc_loss": 0.016676101833581924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6676101949997246e-05, "grad_norm": 13.531424522399902, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8435074687004089, "num_tokens": 90248924.0, "step": 2363 }, { "epoch": 0.3007250985879659, "ewc_loss": 0.016678379848599434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6678379324730486e-05, "grad_norm": 13.474045753479004, "learning_rate": 1e-06, "loss": 0.5756, "mean_token_accuracy": 0.8228678703308105, "num_tokens": 90281988.0, "step": 2364 }, { "epoch": 0.3008523088665564, "ewc_loss": 0.016649479046463966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.664947922108695e-05, "grad_norm": 13.461675643920898, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8457869291305542, "num_tokens": 90317050.0, "step": 2365 }, { "epoch": 0.3009795191451469, "ewc_loss": 0.016699964180588722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.669996345299296e-05, "grad_norm": 13.457512855529785, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8592474460601807, "num_tokens": 90358064.0, "step": 2366 }, { "epoch": 0.30110672942373745, "ewc_loss": 0.016714489087462425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6714489902369678e-05, "grad_norm": 13.477814674377441, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8398895263671875, "num_tokens": 90400692.0, "step": 2367 }, { "epoch": 0.3012339397023279, "ewc_loss": 0.016692418605089188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6692418284947053e-05, "grad_norm": 13.417945861816406, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8432717323303223, "num_tokens": 90432207.0, "step": 2368 }, { "epoch": 0.30136114998091845, "ewc_loss": 0.016729313880205154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6729314666008577e-05, "grad_norm": 13.512282371520996, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8481036424636841, "num_tokens": 90466081.0, "step": 2369 }, { "epoch": 0.301488360259509, "ewc_loss": 0.01672050729393959, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6720507119316608e-05, "grad_norm": 13.438225746154785, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8573535680770874, "num_tokens": 90501963.0, "step": 2370 }, { "epoch": 0.30161557053809945, "ewc_loss": 0.016701936721801758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6701937056495808e-05, "grad_norm": 13.444366455078125, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8621453046798706, "num_tokens": 90540651.0, "step": 2371 }, { "epoch": 0.30174278081669, "ewc_loss": 0.01676148734986782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6761487131589092e-05, "grad_norm": 13.461027145385742, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8430911302566528, "num_tokens": 90585105.0, "step": 2372 }, { "epoch": 0.3018699910952805, "ewc_loss": 0.016765736043453217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6765736290835775e-05, "grad_norm": 13.427145004272461, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8483808636665344, "num_tokens": 90626773.0, "step": 2373 }, { "epoch": 0.301997201373871, "ewc_loss": 0.01675659976899624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6756599507061765e-05, "grad_norm": 13.45167350769043, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8655705451965332, "num_tokens": 90665974.0, "step": 2374 }, { "epoch": 0.3021244116524615, "ewc_loss": 0.01677209697663784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6772097296779975e-05, "grad_norm": 13.474018096923828, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8464855551719666, "num_tokens": 90706801.0, "step": 2375 }, { "epoch": 0.30225162193105204, "ewc_loss": 0.01678844355046749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.678844273556024e-05, "grad_norm": 13.452264785766602, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8473924398422241, "num_tokens": 90743635.0, "step": 2376 }, { "epoch": 0.3023788322096425, "ewc_loss": 0.01675419509410858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6754194803070277e-05, "grad_norm": 13.390058517456055, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8515101671218872, "num_tokens": 90782797.0, "step": 2377 }, { "epoch": 0.30250604248823304, "ewc_loss": 0.016806453466415405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.680645436863415e-05, "grad_norm": 13.46319580078125, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8431118130683899, "num_tokens": 90822661.0, "step": 2378 }, { "epoch": 0.30263325276682357, "ewc_loss": 0.016793686896562576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.679368688201066e-05, "grad_norm": 13.398202896118164, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8334805369377136, "num_tokens": 90857804.0, "step": 2379 }, { "epoch": 0.3027604630454141, "ewc_loss": 0.016769183799624443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6769183275755495e-05, "grad_norm": 13.459080696105957, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8486816883087158, "num_tokens": 90895778.0, "step": 2380 }, { "epoch": 0.30288767332400457, "ewc_loss": 0.016833249479532242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6833249901537783e-05, "grad_norm": 13.457079887390137, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8574584722518921, "num_tokens": 90934463.0, "step": 2381 }, { "epoch": 0.3030148836025951, "ewc_loss": 0.01680302992463112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6803029211587273e-05, "grad_norm": 13.485614776611328, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8555593490600586, "num_tokens": 90967888.0, "step": 2382 }, { "epoch": 0.3031420938811856, "ewc_loss": 0.016833074390888214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.683307345956564e-05, "grad_norm": 13.500497817993164, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8583371639251709, "num_tokens": 91005200.0, "step": 2383 }, { "epoch": 0.3032693041597761, "ewc_loss": 0.016823410987854004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6823410987854004e-05, "grad_norm": 13.525330543518066, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8346921801567078, "num_tokens": 91037840.0, "step": 2384 }, { "epoch": 0.3033965144383666, "ewc_loss": 0.01682247966527939, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.682247966527939e-05, "grad_norm": 13.537400245666504, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8371767997741699, "num_tokens": 91073336.0, "step": 2385 }, { "epoch": 0.30352372471695716, "ewc_loss": 0.016797702759504318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.679770321061369e-05, "grad_norm": 13.474868774414062, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8512016534805298, "num_tokens": 91107001.0, "step": 2386 }, { "epoch": 0.30365093499554763, "ewc_loss": 0.01679762452840805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6797624994069338e-05, "grad_norm": 13.460144996643066, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8578190803527832, "num_tokens": 91143937.0, "step": 2387 }, { "epoch": 0.30377814527413816, "ewc_loss": 0.016817064955830574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6817064533825032e-05, "grad_norm": 13.525086402893066, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8416515588760376, "num_tokens": 91185391.0, "step": 2388 }, { "epoch": 0.3039053555527287, "ewc_loss": 0.01680833473801613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6808335203677416e-05, "grad_norm": 13.47343921661377, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8497011065483093, "num_tokens": 91227653.0, "step": 2389 }, { "epoch": 0.30403256583131916, "ewc_loss": 0.01679403893649578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.679403976595495e-05, "grad_norm": 13.490814208984375, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8706789016723633, "num_tokens": 91263610.0, "step": 2390 }, { "epoch": 0.3041597761099097, "ewc_loss": 0.016794651746749878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.679465094639454e-05, "grad_norm": 13.430459976196289, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8464882969856262, "num_tokens": 91296880.0, "step": 2391 }, { "epoch": 0.3042869863885002, "ewc_loss": 0.016768168658018112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6768168279668316e-05, "grad_norm": 13.494597434997559, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.838828444480896, "num_tokens": 91336242.0, "step": 2392 }, { "epoch": 0.3044141966670907, "ewc_loss": 0.01684202067553997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.684202106844168e-05, "grad_norm": 13.480645179748535, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8590444326400757, "num_tokens": 91376527.0, "step": 2393 }, { "epoch": 0.3045414069456812, "ewc_loss": 0.016822509467601776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6822508769109845e-05, "grad_norm": 13.563000679016113, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8408258557319641, "num_tokens": 91408415.0, "step": 2394 }, { "epoch": 0.30466861722427174, "ewc_loss": 0.01687382161617279, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6873822460183874e-05, "grad_norm": 13.531702041625977, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8520632982254028, "num_tokens": 91449189.0, "step": 2395 }, { "epoch": 0.3047958275028622, "ewc_loss": 0.016789473593235016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.678947410255205e-05, "grad_norm": 13.47978401184082, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8470587134361267, "num_tokens": 91488599.0, "step": 2396 }, { "epoch": 0.30492303778145274, "ewc_loss": 0.01683896966278553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.683896880422253e-05, "grad_norm": 13.52731990814209, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8550986051559448, "num_tokens": 91525138.0, "step": 2397 }, { "epoch": 0.30505024806004327, "ewc_loss": 0.016805117949843407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6805117411422543e-05, "grad_norm": 13.527441024780273, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8535337448120117, "num_tokens": 91567444.0, "step": 2398 }, { "epoch": 0.30517745833863374, "ewc_loss": 0.016799038276076317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6799038348835893e-05, "grad_norm": 13.49072551727295, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8481539487838745, "num_tokens": 91602926.0, "step": 2399 }, { "epoch": 0.3053046686172243, "ewc_loss": 0.01680769957602024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.680770037637558e-05, "grad_norm": 13.565671920776367, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8626430630683899, "num_tokens": 91635901.0, "step": 2400 }, { "epoch": 0.3054318788958148, "ewc_loss": 0.016831854358315468, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6831854736665264e-05, "grad_norm": 13.50022029876709, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8526471853256226, "num_tokens": 91679542.0, "step": 2401 }, { "epoch": 0.3055590891744053, "ewc_loss": 0.016825949773192406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6825950297061354e-05, "grad_norm": 13.574213027954102, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8459829688072205, "num_tokens": 91723022.0, "step": 2402 }, { "epoch": 0.3056862994529958, "ewc_loss": 0.016844388097524643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6844387573655695e-05, "grad_norm": 13.570307731628418, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.846832275390625, "num_tokens": 91760664.0, "step": 2403 }, { "epoch": 0.30581350973158633, "ewc_loss": 0.01678362488746643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6783624232630245e-05, "grad_norm": 13.51480770111084, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8470643162727356, "num_tokens": 91805570.0, "step": 2404 }, { "epoch": 0.3059407200101768, "ewc_loss": 0.016778063029050827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6778063582023606e-05, "grad_norm": 13.503514289855957, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.842828631401062, "num_tokens": 91843342.0, "step": 2405 }, { "epoch": 0.30606793028876733, "ewc_loss": 0.016784708946943283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.678470835031476e-05, "grad_norm": 13.51404857635498, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8416271209716797, "num_tokens": 91883872.0, "step": 2406 }, { "epoch": 0.30619514056735786, "ewc_loss": 0.016795901581645012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.679590241110418e-05, "grad_norm": 13.513155937194824, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.850726842880249, "num_tokens": 91923174.0, "step": 2407 }, { "epoch": 0.30632235084594833, "ewc_loss": 0.01679336093366146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6793361282907426e-05, "grad_norm": 13.514409065246582, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8497353792190552, "num_tokens": 91963462.0, "step": 2408 }, { "epoch": 0.30644956112453886, "ewc_loss": 0.016807058826088905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6807058273116127e-05, "grad_norm": 13.577467918395996, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8549931049346924, "num_tokens": 92000907.0, "step": 2409 }, { "epoch": 0.3065767714031294, "ewc_loss": 0.01677575521171093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6775755284470506e-05, "grad_norm": 13.557655334472656, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8529601097106934, "num_tokens": 92036110.0, "step": 2410 }, { "epoch": 0.30670398168171986, "ewc_loss": 0.01677641086280346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6776410120655783e-05, "grad_norm": 13.432760238647461, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8466047048568726, "num_tokens": 92080647.0, "step": 2411 }, { "epoch": 0.3068311919603104, "ewc_loss": 0.016773367300629616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.677336695138365e-05, "grad_norm": 13.593413352966309, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8552169799804688, "num_tokens": 92115407.0, "step": 2412 }, { "epoch": 0.3069584022389009, "ewc_loss": 0.016810858622193336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6810858141980134e-05, "grad_norm": 13.53641128540039, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8515290021896362, "num_tokens": 92149660.0, "step": 2413 }, { "epoch": 0.3070856125174914, "ewc_loss": 0.016754690557718277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6754691387177445e-05, "grad_norm": 13.503557205200195, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8541141152381897, "num_tokens": 92191581.0, "step": 2414 }, { "epoch": 0.3072128227960819, "ewc_loss": 0.016798052936792374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.679805245657917e-05, "grad_norm": 13.577826499938965, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8460930585861206, "num_tokens": 92230435.0, "step": 2415 }, { "epoch": 0.30734003307467245, "ewc_loss": 0.01679036393761635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6790363588370383e-05, "grad_norm": 13.579065322875977, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8387669324874878, "num_tokens": 92269545.0, "step": 2416 }, { "epoch": 0.3074672433532629, "ewc_loss": 0.01677767001092434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.677767068031244e-05, "grad_norm": 13.482076644897461, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.857884407043457, "num_tokens": 92307447.0, "step": 2417 }, { "epoch": 0.30759445363185345, "ewc_loss": 0.01677871122956276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6778711142251268e-05, "grad_norm": 13.529552459716797, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8495556712150574, "num_tokens": 92349441.0, "step": 2418 }, { "epoch": 0.307721663910444, "ewc_loss": 0.016800597310066223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6800597222754732e-05, "grad_norm": 13.508145332336426, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.855688214302063, "num_tokens": 92380405.0, "step": 2419 }, { "epoch": 0.30784887418903445, "ewc_loss": 0.016808494925498962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6808495274744928e-05, "grad_norm": 13.503924369812012, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8491613268852234, "num_tokens": 92419596.0, "step": 2420 }, { "epoch": 0.307976084467625, "ewc_loss": 0.016848845407366753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6848845916683786e-05, "grad_norm": 13.5629301071167, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8445955514907837, "num_tokens": 92454590.0, "step": 2421 }, { "epoch": 0.3081032947462155, "ewc_loss": 0.01684071309864521, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6840713215060532e-05, "grad_norm": 13.47497272491455, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8463300466537476, "num_tokens": 92491640.0, "step": 2422 }, { "epoch": 0.308230505024806, "ewc_loss": 0.016821732744574547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.682173206063453e-05, "grad_norm": 13.480057716369629, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8422626256942749, "num_tokens": 92528682.0, "step": 2423 }, { "epoch": 0.3083577153033965, "ewc_loss": 0.016876880079507828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6876880181371234e-05, "grad_norm": 13.531780242919922, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8575854897499084, "num_tokens": 92566368.0, "step": 2424 }, { "epoch": 0.30848492558198704, "ewc_loss": 0.016852067783474922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6852067346917465e-05, "grad_norm": 13.467142105102539, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8283444046974182, "num_tokens": 92601380.0, "step": 2425 }, { "epoch": 0.3086121358605775, "ewc_loss": 0.016853252425789833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6853253328008577e-05, "grad_norm": 13.431432723999023, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8661075830459595, "num_tokens": 92639724.0, "step": 2426 }, { "epoch": 0.30873934613916804, "ewc_loss": 0.016879433766007423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6879434042493813e-05, "grad_norm": 13.475322723388672, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8567083477973938, "num_tokens": 92677450.0, "step": 2427 }, { "epoch": 0.30886655641775856, "ewc_loss": 0.016913369297981262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6913369108806364e-05, "grad_norm": 13.466280937194824, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8584259152412415, "num_tokens": 92714681.0, "step": 2428 }, { "epoch": 0.3089937666963491, "ewc_loss": 0.016912082210183144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6912083083298057e-05, "grad_norm": 13.506824493408203, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8476836681365967, "num_tokens": 92749925.0, "step": 2429 }, { "epoch": 0.30912097697493957, "ewc_loss": 0.016923436895012856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.692343721515499e-05, "grad_norm": 13.491896629333496, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8541135191917419, "num_tokens": 92791720.0, "step": 2430 }, { "epoch": 0.3092481872535301, "ewc_loss": 0.016956660896539688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6956661056610756e-05, "grad_norm": 13.649762153625488, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8428208827972412, "num_tokens": 92826179.0, "step": 2431 }, { "epoch": 0.3093753975321206, "ewc_loss": 0.016932837665081024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6932837752392516e-05, "grad_norm": 13.468910217285156, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8501271605491638, "num_tokens": 92872751.0, "step": 2432 }, { "epoch": 0.3095026078107111, "ewc_loss": 0.01695948652923107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6959485947154462e-05, "grad_norm": 13.873351097106934, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8449369668960571, "num_tokens": 92907470.0, "step": 2433 }, { "epoch": 0.3096298180893016, "ewc_loss": 0.01697535626590252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.69753566297004e-05, "grad_norm": 13.513131141662598, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8559046983718872, "num_tokens": 92943321.0, "step": 2434 }, { "epoch": 0.30975702836789215, "ewc_loss": 0.0168668981641531, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6866897567524575e-05, "grad_norm": 13.938785552978516, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.852655291557312, "num_tokens": 92982136.0, "step": 2435 }, { "epoch": 0.3098842386464826, "ewc_loss": 0.016992652788758278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6992653399938717e-05, "grad_norm": 13.570773124694824, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8405638933181763, "num_tokens": 93015974.0, "step": 2436 }, { "epoch": 0.31001144892507315, "ewc_loss": 0.016722818836569786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6722819054848514e-05, "grad_norm": 13.471534729003906, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8544313311576843, "num_tokens": 93054010.0, "step": 2437 }, { "epoch": 0.3101386592036637, "ewc_loss": 0.016920221969485283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.692022124188952e-05, "grad_norm": 13.475147247314453, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8581773042678833, "num_tokens": 93090226.0, "step": 2438 }, { "epoch": 0.31026586948225415, "ewc_loss": 0.016901161521673203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6901161870919168e-05, "grad_norm": 13.68656063079834, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8476501703262329, "num_tokens": 93130242.0, "step": 2439 }, { "epoch": 0.3103930797608447, "ewc_loss": 0.01692654937505722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6926549506024458e-05, "grad_norm": 13.572603225708008, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8464102745056152, "num_tokens": 93170990.0, "step": 2440 }, { "epoch": 0.3105202900394352, "ewc_loss": 0.01684766449034214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6847665392560884e-05, "grad_norm": 13.521292686462402, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8437020778656006, "num_tokens": 93205478.0, "step": 2441 }, { "epoch": 0.3106475003180257, "ewc_loss": 0.01688239723443985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.688239717623219e-05, "grad_norm": 13.487272262573242, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8434487581253052, "num_tokens": 93240952.0, "step": 2442 }, { "epoch": 0.3107747105966162, "ewc_loss": 0.016874166205525398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6874166249181144e-05, "grad_norm": 13.598531723022461, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8399257659912109, "num_tokens": 93278882.0, "step": 2443 }, { "epoch": 0.31090192087520674, "ewc_loss": 0.01693117246031761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.693117337708827e-05, "grad_norm": 13.471080780029297, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8427258729934692, "num_tokens": 93324573.0, "step": 2444 }, { "epoch": 0.3110291311537972, "ewc_loss": 0.01689305528998375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6893054635147564e-05, "grad_norm": 13.471790313720703, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.865034818649292, "num_tokens": 93361049.0, "step": 2445 }, { "epoch": 0.31115634143238774, "ewc_loss": 0.016946988180279732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.69469876709627e-05, "grad_norm": 13.490349769592285, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8472906351089478, "num_tokens": 93401748.0, "step": 2446 }, { "epoch": 0.31128355171097827, "ewc_loss": 0.016930418089032173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.69304184964858e-05, "grad_norm": 13.613862991333008, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.861823320388794, "num_tokens": 93434857.0, "step": 2447 }, { "epoch": 0.31141076198956874, "ewc_loss": 0.016956087201833725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.695608807494864e-05, "grad_norm": 13.44132137298584, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8484506607055664, "num_tokens": 93474681.0, "step": 2448 }, { "epoch": 0.31153797226815927, "ewc_loss": 0.016920803114771843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6920803318498656e-05, "grad_norm": 13.619239807128906, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8489527702331543, "num_tokens": 93510627.0, "step": 2449 }, { "epoch": 0.3116651825467498, "ewc_loss": 0.01701446995139122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7014470358844846e-05, "grad_norm": 13.465130805969238, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8407180905342102, "num_tokens": 93551598.0, "step": 2450 }, { "epoch": 0.31179239282534027, "ewc_loss": 0.016938377171754837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6938376575126313e-05, "grad_norm": 13.609405517578125, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8434566259384155, "num_tokens": 93586622.0, "step": 2451 }, { "epoch": 0.3119196031039308, "ewc_loss": 0.01701425202190876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.701425208011642e-05, "grad_norm": 13.524270057678223, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8394380211830139, "num_tokens": 93626496.0, "step": 2452 }, { "epoch": 0.3120468133825213, "ewc_loss": 0.016947826370596886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6947826225077733e-05, "grad_norm": 13.502325057983398, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8518481850624084, "num_tokens": 93665641.0, "step": 2453 }, { "epoch": 0.3121740236611118, "ewc_loss": 0.016988176852464676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.698817686701659e-05, "grad_norm": 13.542527198791504, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8559513092041016, "num_tokens": 93702135.0, "step": 2454 }, { "epoch": 0.3123012339397023, "ewc_loss": 0.016979588195681572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6979587599053048e-05, "grad_norm": 13.510588645935059, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8573741316795349, "num_tokens": 93738226.0, "step": 2455 }, { "epoch": 0.31242844421829286, "ewc_loss": 0.016995998099446297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.699599852145184e-05, "grad_norm": 13.56083869934082, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8535915613174438, "num_tokens": 93780273.0, "step": 2456 }, { "epoch": 0.31255565449688333, "ewc_loss": 0.016983630135655403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6983629393507726e-05, "grad_norm": 13.48698902130127, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8370805978775024, "num_tokens": 93821977.0, "step": 2457 }, { "epoch": 0.31268286477547386, "ewc_loss": 0.01698494143784046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6984940884867683e-05, "grad_norm": 13.50965690612793, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8585320711135864, "num_tokens": 93856084.0, "step": 2458 }, { "epoch": 0.3128100750540644, "ewc_loss": 0.016971133649349213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6971132936305366e-05, "grad_norm": 13.500460624694824, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.853350043296814, "num_tokens": 93893491.0, "step": 2459 }, { "epoch": 0.31293728533265486, "ewc_loss": 0.017038820311427116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7038821169990115e-05, "grad_norm": 13.575762748718262, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8383378982543945, "num_tokens": 93940240.0, "step": 2460 }, { "epoch": 0.3130644956112454, "ewc_loss": 0.017051037400960922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.705103750282433e-05, "grad_norm": 13.572911262512207, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8421541452407837, "num_tokens": 93974504.0, "step": 2461 }, { "epoch": 0.3131917058898359, "ewc_loss": 0.017010102048516273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7010102965286933e-05, "grad_norm": 13.54771900177002, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.851078987121582, "num_tokens": 94009208.0, "step": 2462 }, { "epoch": 0.3133189161684264, "ewc_loss": 0.01701481081545353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.701481050986331e-05, "grad_norm": 13.469582557678223, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8506633043289185, "num_tokens": 94049249.0, "step": 2463 }, { "epoch": 0.3134461264470169, "ewc_loss": 0.017054228112101555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7054228010238148e-05, "grad_norm": 13.618362426757812, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8503232002258301, "num_tokens": 94085553.0, "step": 2464 }, { "epoch": 0.31357333672560744, "ewc_loss": 0.017066715285182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.706671537249349e-05, "grad_norm": 13.578913688659668, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8307681083679199, "num_tokens": 94127531.0, "step": 2465 }, { "epoch": 0.3137005470041979, "ewc_loss": 0.017009476199746132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.700947541394271e-05, "grad_norm": 13.487006187438965, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8617957830429077, "num_tokens": 94171191.0, "step": 2466 }, { "epoch": 0.31382775728278844, "ewc_loss": 0.01703541725873947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.703541784081608e-05, "grad_norm": 13.557134628295898, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8552970290184021, "num_tokens": 94212090.0, "step": 2467 }, { "epoch": 0.313954967561379, "ewc_loss": 0.01704639010131359, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.704638998489827e-05, "grad_norm": 13.527813911437988, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8685197830200195, "num_tokens": 94246835.0, "step": 2468 }, { "epoch": 0.31408217783996945, "ewc_loss": 0.017035799100995064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7035799828590825e-05, "grad_norm": 13.472894668579102, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8482979536056519, "num_tokens": 94284581.0, "step": 2469 }, { "epoch": 0.31420938811856, "ewc_loss": 0.017054475843906403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.705447539279703e-05, "grad_norm": 13.62136173248291, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8606407642364502, "num_tokens": 94323050.0, "step": 2470 }, { "epoch": 0.3143365983971505, "ewc_loss": 0.01709807850420475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7098078387789428e-05, "grad_norm": 13.528435707092285, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8449625968933105, "num_tokens": 94364702.0, "step": 2471 }, { "epoch": 0.314463808675741, "ewc_loss": 0.01701728254556656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.701728251646273e-05, "grad_norm": 13.563177108764648, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8565439581871033, "num_tokens": 94404735.0, "step": 2472 }, { "epoch": 0.3145910189543315, "ewc_loss": 0.017062190920114517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7062191545846872e-05, "grad_norm": 13.609221458435059, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8349897265434265, "num_tokens": 94444683.0, "step": 2473 }, { "epoch": 0.31471822923292203, "ewc_loss": 0.017036162316799164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.703616180748213e-05, "grad_norm": 13.646700859069824, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8528376817703247, "num_tokens": 94484392.0, "step": 2474 }, { "epoch": 0.3148454395115125, "ewc_loss": 0.016994422301650047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6994421457638964e-05, "grad_norm": 13.489434242248535, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8583987951278687, "num_tokens": 94524860.0, "step": 2475 }, { "epoch": 0.31497264979010303, "ewc_loss": 0.016982989385724068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.698298910923768e-05, "grad_norm": 13.635238647460938, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8472567796707153, "num_tokens": 94565076.0, "step": 2476 }, { "epoch": 0.31509986006869356, "ewc_loss": 0.017049850896000862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7049851521733217e-05, "grad_norm": 13.584020614624023, "learning_rate": 1e-06, "loss": 0.542, "mean_token_accuracy": 0.8277328014373779, "num_tokens": 94606191.0, "step": 2477 }, { "epoch": 0.31522707034728403, "ewc_loss": 0.016970215365290642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.697021616564598e-05, "grad_norm": 13.472735404968262, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8448812961578369, "num_tokens": 94648021.0, "step": 2478 }, { "epoch": 0.31535428062587456, "ewc_loss": 0.017033929005265236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7033928088494577e-05, "grad_norm": 13.654189109802246, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.861859142780304, "num_tokens": 94686305.0, "step": 2479 }, { "epoch": 0.3154814909044651, "ewc_loss": 0.01706678792834282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7066788132069632e-05, "grad_norm": 13.577160835266113, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8481972813606262, "num_tokens": 94728606.0, "step": 2480 }, { "epoch": 0.3156087011830556, "ewc_loss": 0.016987690702080727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.6987691196845844e-05, "grad_norm": 13.493274688720703, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8535206913948059, "num_tokens": 94770162.0, "step": 2481 }, { "epoch": 0.3157359114616461, "ewc_loss": 0.017000021412968636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7000022126012482e-05, "grad_norm": 13.608177185058594, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8574779033660889, "num_tokens": 94807735.0, "step": 2482 }, { "epoch": 0.3158631217402366, "ewc_loss": 0.01702001690864563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7020016457536258e-05, "grad_norm": 13.590568542480469, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8451527953147888, "num_tokens": 94840366.0, "step": 2483 }, { "epoch": 0.31599033201882715, "ewc_loss": 0.016982994973659515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.698299456620589e-05, "grad_norm": 13.524377822875977, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8471486568450928, "num_tokens": 94874293.0, "step": 2484 }, { "epoch": 0.3161175422974176, "ewc_loss": 0.01701180636882782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7011805539368652e-05, "grad_norm": 13.605581283569336, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8498106002807617, "num_tokens": 94909347.0, "step": 2485 }, { "epoch": 0.31624475257600815, "ewc_loss": 0.01705222576856613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7052225302904844e-05, "grad_norm": 13.48830795288086, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8607774972915649, "num_tokens": 94955120.0, "step": 2486 }, { "epoch": 0.3163719628545987, "ewc_loss": 0.017012834548950195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.701283508737106e-05, "grad_norm": 13.49676513671875, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8642659783363342, "num_tokens": 94992319.0, "step": 2487 }, { "epoch": 0.31649917313318915, "ewc_loss": 0.017035705968737602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.703570524114184e-05, "grad_norm": 13.572066307067871, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8601109385490417, "num_tokens": 95023464.0, "step": 2488 }, { "epoch": 0.3166263834117797, "ewc_loss": 0.017058011144399643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7058011508197524e-05, "grad_norm": 13.546337127685547, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8640475869178772, "num_tokens": 95055963.0, "step": 2489 }, { "epoch": 0.3167535936903702, "ewc_loss": 0.017069879919290543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.706988041405566e-05, "grad_norm": 13.652176856994629, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8320144414901733, "num_tokens": 95089628.0, "step": 2490 }, { "epoch": 0.3168808039689607, "ewc_loss": 0.017102893441915512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7102893252740614e-05, "grad_norm": 13.593114852905273, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8649954795837402, "num_tokens": 95128557.0, "step": 2491 }, { "epoch": 0.3170080142475512, "ewc_loss": 0.01704558916389942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.704558962956071e-05, "grad_norm": 13.5614595413208, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8388370871543884, "num_tokens": 95168324.0, "step": 2492 }, { "epoch": 0.31713522452614173, "ewc_loss": 0.01708533614873886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.708533636701759e-05, "grad_norm": 13.561444282531738, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.8311547040939331, "num_tokens": 95208139.0, "step": 2493 }, { "epoch": 0.3172624348047322, "ewc_loss": 0.017130982130765915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.713098208711017e-05, "grad_norm": 13.583918571472168, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8453672528266907, "num_tokens": 95247337.0, "step": 2494 }, { "epoch": 0.31738964508332274, "ewc_loss": 0.0171034075319767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7103408026741818e-05, "grad_norm": 13.588966369628906, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8437696695327759, "num_tokens": 95289208.0, "step": 2495 }, { "epoch": 0.31751685536191326, "ewc_loss": 0.017086973413825035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.708697345748078e-05, "grad_norm": 13.49604606628418, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8483284115791321, "num_tokens": 95329717.0, "step": 2496 }, { "epoch": 0.31764406564050374, "ewc_loss": 0.01712098717689514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7120986740337685e-05, "grad_norm": 13.61538028717041, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8311485052108765, "num_tokens": 95371052.0, "step": 2497 }, { "epoch": 0.31777127591909426, "ewc_loss": 0.01715654693543911, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7156546164187603e-05, "grad_norm": 13.528586387634277, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.871436595916748, "num_tokens": 95412486.0, "step": 2498 }, { "epoch": 0.3178984861976848, "ewc_loss": 0.017097817733883858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.709781827230472e-05, "grad_norm": 13.65611743927002, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8673737049102783, "num_tokens": 95446167.0, "step": 2499 }, { "epoch": 0.31802569647627527, "ewc_loss": 0.01713010109961033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7130101696238853e-05, "grad_norm": 13.467179298400879, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8459265828132629, "num_tokens": 95485763.0, "step": 2500 }, { "epoch": 0.3181529067548658, "ewc_loss": 0.017079927027225494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7079926692531444e-05, "grad_norm": 13.723906517028809, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8406491875648499, "num_tokens": 95526407.0, "step": 2501 }, { "epoch": 0.3182801170334563, "ewc_loss": 0.01717345416545868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.717345367069356e-05, "grad_norm": 13.607102394104004, "learning_rate": 1e-06, "loss": 0.5732, "mean_token_accuracy": 0.8181341886520386, "num_tokens": 95567479.0, "step": 2502 }, { "epoch": 0.3184073273120468, "ewc_loss": 0.01704026199877262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7040261809597723e-05, "grad_norm": 13.581896781921387, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8565924167633057, "num_tokens": 95606588.0, "step": 2503 }, { "epoch": 0.3185345375906373, "ewc_loss": 0.017112400382757187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.711240111035295e-05, "grad_norm": 13.607545852661133, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.855099081993103, "num_tokens": 95653293.0, "step": 2504 }, { "epoch": 0.31866174786922785, "ewc_loss": 0.017092369496822357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7092368580051698e-05, "grad_norm": 13.601682662963867, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8415385484695435, "num_tokens": 95692424.0, "step": 2505 }, { "epoch": 0.3187889581478183, "ewc_loss": 0.01709194853901863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.709194839349948e-05, "grad_norm": 13.609681129455566, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8603289127349854, "num_tokens": 95733679.0, "step": 2506 }, { "epoch": 0.31891616842640885, "ewc_loss": 0.017111938446760178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7111939087044448e-05, "grad_norm": 13.643525123596191, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.845456600189209, "num_tokens": 95768211.0, "step": 2507 }, { "epoch": 0.3190433787049994, "ewc_loss": 0.017106184735894203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7106185623561032e-05, "grad_norm": 13.576387405395508, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8489319682121277, "num_tokens": 95805974.0, "step": 2508 }, { "epoch": 0.31917058898358985, "ewc_loss": 0.017086585983633995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7086586012737826e-05, "grad_norm": 13.657111167907715, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8490913510322571, "num_tokens": 95845463.0, "step": 2509 }, { "epoch": 0.3192977992621804, "ewc_loss": 0.017080269753932953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7080270481528714e-05, "grad_norm": 13.539944648742676, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8462983965873718, "num_tokens": 95888148.0, "step": 2510 }, { "epoch": 0.3194250095407709, "ewc_loss": 0.017041215673089027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.704121496004518e-05, "grad_norm": 13.569212913513184, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.872072696685791, "num_tokens": 95925432.0, "step": 2511 }, { "epoch": 0.3195522198193614, "ewc_loss": 0.017100032418966293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7100031982408836e-05, "grad_norm": 13.57509708404541, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8568296432495117, "num_tokens": 95970695.0, "step": 2512 }, { "epoch": 0.3196794300979519, "ewc_loss": 0.01710132695734501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.710132710286416e-05, "grad_norm": 13.655829429626465, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8516861200332642, "num_tokens": 96006093.0, "step": 2513 }, { "epoch": 0.31980664037654244, "ewc_loss": 0.0171199943870306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.711999357212335e-05, "grad_norm": 13.527925491333008, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8507923483848572, "num_tokens": 96045945.0, "step": 2514 }, { "epoch": 0.3199338506551329, "ewc_loss": 0.017100010067224503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7100010154535994e-05, "grad_norm": 13.629621505737305, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8666196465492249, "num_tokens": 96076748.0, "step": 2515 }, { "epoch": 0.32006106093372344, "ewc_loss": 0.017150280997157097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.715028156468179e-05, "grad_norm": 13.588661193847656, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8529529571533203, "num_tokens": 96116964.0, "step": 2516 }, { "epoch": 0.32018827121231397, "ewc_loss": 0.017115062102675438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7115062291850336e-05, "grad_norm": 13.54648494720459, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8536974787712097, "num_tokens": 96152073.0, "step": 2517 }, { "epoch": 0.32031548149090444, "ewc_loss": 0.017169160768389702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7169160855701193e-05, "grad_norm": 13.607748985290527, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8501542806625366, "num_tokens": 96191608.0, "step": 2518 }, { "epoch": 0.32044269176949497, "ewc_loss": 0.01715734414756298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.715734470053576e-05, "grad_norm": 13.491161346435547, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.868553876876831, "num_tokens": 96228124.0, "step": 2519 }, { "epoch": 0.3205699020480855, "ewc_loss": 0.017163824290037155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.716382394079119e-05, "grad_norm": 13.662345886230469, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8620271682739258, "num_tokens": 96265870.0, "step": 2520 }, { "epoch": 0.32069711232667597, "ewc_loss": 0.017202511429786682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7202512026415206e-05, "grad_norm": 13.483277320861816, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8615937232971191, "num_tokens": 96309623.0, "step": 2521 }, { "epoch": 0.3208243226052665, "ewc_loss": 0.017164409160614014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.716440965537913e-05, "grad_norm": 13.660478591918945, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8431456089019775, "num_tokens": 96344036.0, "step": 2522 }, { "epoch": 0.320951532883857, "ewc_loss": 0.01723066158592701, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.723066088743508e-05, "grad_norm": 13.634780883789062, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8626691699028015, "num_tokens": 96377627.0, "step": 2523 }, { "epoch": 0.3210787431624475, "ewc_loss": 0.017135700210928917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7135700545622967e-05, "grad_norm": 13.609709739685059, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.852004885673523, "num_tokens": 96413568.0, "step": 2524 }, { "epoch": 0.32120595344103803, "ewc_loss": 0.01719842292368412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7198422938236035e-05, "grad_norm": 13.505568504333496, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8483930230140686, "num_tokens": 96454682.0, "step": 2525 }, { "epoch": 0.32133316371962856, "ewc_loss": 0.017169415950775146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.716941551421769e-05, "grad_norm": 13.580927848815918, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8405903577804565, "num_tokens": 96490456.0, "step": 2526 }, { "epoch": 0.32146037399821903, "ewc_loss": 0.01724499836564064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7244998161913827e-05, "grad_norm": 13.612174987792969, "learning_rate": 1e-06, "loss": 0.5109, "mean_token_accuracy": 0.845089852809906, "num_tokens": 96526891.0, "step": 2527 }, { "epoch": 0.32158758427680956, "ewc_loss": 0.01720881089568138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7208811186719686e-05, "grad_norm": 13.55623722076416, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8459365367889404, "num_tokens": 96575226.0, "step": 2528 }, { "epoch": 0.3217147945554001, "ewc_loss": 0.017223231494426727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7223232134710997e-05, "grad_norm": 13.549556732177734, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8409669399261475, "num_tokens": 96606383.0, "step": 2529 }, { "epoch": 0.3218420048339906, "ewc_loss": 0.017235441133379936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7235441191587597e-05, "grad_norm": 13.61444091796875, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8535162210464478, "num_tokens": 96639826.0, "step": 2530 }, { "epoch": 0.3219692151125811, "ewc_loss": 0.017256174236536026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7256174032809213e-05, "grad_norm": 13.527280807495117, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8555052280426025, "num_tokens": 96679151.0, "step": 2531 }, { "epoch": 0.3220964253911716, "ewc_loss": 0.017254704609513283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7254704289371148e-05, "grad_norm": 13.605386734008789, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8576341867446899, "num_tokens": 96719365.0, "step": 2532 }, { "epoch": 0.32222363566976214, "ewc_loss": 0.017301077023148537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7301077605225146e-05, "grad_norm": 13.615330696105957, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8486162424087524, "num_tokens": 96758318.0, "step": 2533 }, { "epoch": 0.3223508459483526, "ewc_loss": 0.017246369272470474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.72463696799241e-05, "grad_norm": 13.538028717041016, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8457432985305786, "num_tokens": 96791159.0, "step": 2534 }, { "epoch": 0.32247805622694314, "ewc_loss": 0.017328547313809395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7328547983197495e-05, "grad_norm": 13.636761665344238, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8538566827774048, "num_tokens": 96825352.0, "step": 2535 }, { "epoch": 0.32260526650553367, "ewc_loss": 0.01730925962328911, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7309259419562295e-05, "grad_norm": 13.578794479370117, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8494433164596558, "num_tokens": 96859700.0, "step": 2536 }, { "epoch": 0.32273247678412414, "ewc_loss": 0.01727086305618286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7270862372242846e-05, "grad_norm": 13.572001457214355, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8495006561279297, "num_tokens": 96897077.0, "step": 2537 }, { "epoch": 0.3228596870627147, "ewc_loss": 0.017320046201348305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7320046026725322e-05, "grad_norm": 13.570262908935547, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.84630286693573, "num_tokens": 96936936.0, "step": 2538 }, { "epoch": 0.3229868973413052, "ewc_loss": 0.017311831936240196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.731183147057891e-05, "grad_norm": 13.529497146606445, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8574941158294678, "num_tokens": 96975702.0, "step": 2539 }, { "epoch": 0.3231141076198957, "ewc_loss": 0.017302272841334343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7302272681263275e-05, "grad_norm": 13.5596923828125, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8502781391143799, "num_tokens": 97010585.0, "step": 2540 }, { "epoch": 0.3232413178984862, "ewc_loss": 0.017346231266856194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7346230379189365e-05, "grad_norm": 13.557217597961426, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.8753092288970947, "num_tokens": 97047210.0, "step": 2541 }, { "epoch": 0.32336852817707673, "ewc_loss": 0.01731042005121708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7310419934801757e-05, "grad_norm": 13.581432342529297, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8460096716880798, "num_tokens": 97084835.0, "step": 2542 }, { "epoch": 0.3234957384556672, "ewc_loss": 0.017342042177915573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7342043065582402e-05, "grad_norm": 13.475007057189941, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8486325740814209, "num_tokens": 97125829.0, "step": 2543 }, { "epoch": 0.32362294873425773, "ewc_loss": 0.01731831766664982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7318317986791953e-05, "grad_norm": 13.605302810668945, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8445284366607666, "num_tokens": 97162874.0, "step": 2544 }, { "epoch": 0.32375015901284826, "ewc_loss": 0.01739111728966236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7391117580700666e-05, "grad_norm": 13.584555625915527, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.854572057723999, "num_tokens": 97197559.0, "step": 2545 }, { "epoch": 0.32387736929143873, "ewc_loss": 0.017295442521572113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.729544237605296e-05, "grad_norm": 13.538692474365234, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8382947444915771, "num_tokens": 97227028.0, "step": 2546 }, { "epoch": 0.32400457957002926, "ewc_loss": 0.0173740703612566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7374070012010634e-05, "grad_norm": 13.564014434814453, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8470507860183716, "num_tokens": 97270209.0, "step": 2547 }, { "epoch": 0.3241317898486198, "ewc_loss": 0.017383437603712082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7383437807438895e-05, "grad_norm": 13.571257591247559, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8542701005935669, "num_tokens": 97309176.0, "step": 2548 }, { "epoch": 0.32425900012721026, "ewc_loss": 0.01738365739583969, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7383657905156724e-05, "grad_norm": 13.601861953735352, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8314580917358398, "num_tokens": 97340464.0, "step": 2549 }, { "epoch": 0.3243862104058008, "ewc_loss": 0.01738409698009491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.738409628160298e-05, "grad_norm": 13.531208992004395, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.854107677936554, "num_tokens": 97378435.0, "step": 2550 }, { "epoch": 0.3245134206843913, "ewc_loss": 0.017386464402079582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7386464605806395e-05, "grad_norm": 13.533829689025879, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8473052978515625, "num_tokens": 97417886.0, "step": 2551 }, { "epoch": 0.3246406309629818, "ewc_loss": 0.017404651269316673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7404650861863047e-05, "grad_norm": 13.60496711730957, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8545637130737305, "num_tokens": 97450552.0, "step": 2552 }, { "epoch": 0.3247678412415723, "ewc_loss": 0.017426935955882072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7426935301045887e-05, "grad_norm": 13.556544303894043, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8576267957687378, "num_tokens": 97488586.0, "step": 2553 }, { "epoch": 0.32489505152016285, "ewc_loss": 0.01739548146724701, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7395481336279772e-05, "grad_norm": 13.587864875793457, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.846317708492279, "num_tokens": 97528893.0, "step": 2554 }, { "epoch": 0.3250222617987533, "ewc_loss": 0.01742965541779995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7429654690204188e-05, "grad_norm": 13.594429969787598, "learning_rate": 1e-06, "loss": 0.5378, "mean_token_accuracy": 0.826775312423706, "num_tokens": 97568654.0, "step": 2555 }, { "epoch": 0.32514947207734385, "ewc_loss": 0.017382200807332993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7382200894644484e-05, "grad_norm": 13.543188095092773, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8554520606994629, "num_tokens": 97607068.0, "step": 2556 }, { "epoch": 0.3252766823559344, "ewc_loss": 0.017427973449230194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7427973943995312e-05, "grad_norm": 13.59250259399414, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8729396462440491, "num_tokens": 97647505.0, "step": 2557 }, { "epoch": 0.32540389263452485, "ewc_loss": 0.01740432344377041, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.740432344377041e-05, "grad_norm": 13.540068626403809, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8445438146591187, "num_tokens": 97690505.0, "step": 2558 }, { "epoch": 0.3255311029131154, "ewc_loss": 0.017388347536325455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7388347259839065e-05, "grad_norm": 13.656488418579102, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8389396667480469, "num_tokens": 97723955.0, "step": 2559 }, { "epoch": 0.3256583131917059, "ewc_loss": 0.017428996041417122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7428996216040105e-05, "grad_norm": 13.551631927490234, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.84581458568573, "num_tokens": 97763822.0, "step": 2560 }, { "epoch": 0.3257855234702964, "ewc_loss": 0.017396848648786545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.739684921631124e-05, "grad_norm": 13.574995040893555, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8467487692832947, "num_tokens": 97803827.0, "step": 2561 }, { "epoch": 0.3259127337488869, "ewc_loss": 0.01744426228106022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7444262994104065e-05, "grad_norm": 13.588356018066406, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8504093289375305, "num_tokens": 97841056.0, "step": 2562 }, { "epoch": 0.32603994402747744, "ewc_loss": 0.017416369169950485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.741636879160069e-05, "grad_norm": 13.592775344848633, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8680570125579834, "num_tokens": 97881714.0, "step": 2563 }, { "epoch": 0.3261671543060679, "ewc_loss": 0.01742192544043064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7421925804228522e-05, "grad_norm": 13.624979972839355, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8480483293533325, "num_tokens": 97925797.0, "step": 2564 }, { "epoch": 0.32629436458465844, "ewc_loss": 0.017417319118976593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.741731830406934e-05, "grad_norm": 13.587027549743652, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8554949760437012, "num_tokens": 97963638.0, "step": 2565 }, { "epoch": 0.32642157486324896, "ewc_loss": 0.017399994656443596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.739999424898997e-05, "grad_norm": 13.639111518859863, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8446874618530273, "num_tokens": 98002043.0, "step": 2566 }, { "epoch": 0.32654878514183944, "ewc_loss": 0.017423123121261597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7423122699256055e-05, "grad_norm": 13.670172691345215, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.85419762134552, "num_tokens": 98041031.0, "step": 2567 }, { "epoch": 0.32667599542042997, "ewc_loss": 0.017382852733135223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7382852092850953e-05, "grad_norm": 13.628666877746582, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8583517074584961, "num_tokens": 98075510.0, "step": 2568 }, { "epoch": 0.3268032056990205, "ewc_loss": 0.017370518296957016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.737051752570551e-05, "grad_norm": 13.623434066772461, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8569477796554565, "num_tokens": 98114162.0, "step": 2569 }, { "epoch": 0.32693041597761097, "ewc_loss": 0.017371224239468575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7371225112583488e-05, "grad_norm": 13.667459487915039, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8468643426895142, "num_tokens": 98148722.0, "step": 2570 }, { "epoch": 0.3270576262562015, "ewc_loss": 0.017374154180288315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7374153685523197e-05, "grad_norm": 13.592867851257324, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8307649493217468, "num_tokens": 98188255.0, "step": 2571 }, { "epoch": 0.327184836534792, "ewc_loss": 0.017340971156954765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7340971680823714e-05, "grad_norm": 13.55100154876709, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8612636923789978, "num_tokens": 98228597.0, "step": 2572 }, { "epoch": 0.3273120468133825, "ewc_loss": 0.017406899482011795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.740689913276583e-05, "grad_norm": 13.646553039550781, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8423901796340942, "num_tokens": 98273405.0, "step": 2573 }, { "epoch": 0.327439257091973, "ewc_loss": 0.01735912822186947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7359128833049908e-05, "grad_norm": 13.593140602111816, "learning_rate": 1e-06, "loss": 0.5277, "mean_token_accuracy": 0.8365862369537354, "num_tokens": 98315091.0, "step": 2574 }, { "epoch": 0.32756646737056355, "ewc_loss": 0.01737380214035511, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7373802620568313e-05, "grad_norm": 13.635157585144043, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8522583246231079, "num_tokens": 98356967.0, "step": 2575 }, { "epoch": 0.327693677649154, "ewc_loss": 0.017389195039868355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7389194908901118e-05, "grad_norm": 13.619617462158203, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8417129516601562, "num_tokens": 98394455.0, "step": 2576 }, { "epoch": 0.32782088792774455, "ewc_loss": 0.017372824251651764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7372824004269205e-05, "grad_norm": 13.623538970947266, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8592255115509033, "num_tokens": 98430455.0, "step": 2577 }, { "epoch": 0.3279480982063351, "ewc_loss": 0.01740933582186699, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.740933657856658e-05, "grad_norm": 13.588129997253418, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8374004364013672, "num_tokens": 98472218.0, "step": 2578 }, { "epoch": 0.3280753084849256, "ewc_loss": 0.01736145094037056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7361451682518236e-05, "grad_norm": 13.60006046295166, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8576407432556152, "num_tokens": 98509540.0, "step": 2579 }, { "epoch": 0.3282025187635161, "ewc_loss": 0.017400849610567093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7400849174009636e-05, "grad_norm": 13.604438781738281, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8501648902893066, "num_tokens": 98544761.0, "step": 2580 }, { "epoch": 0.3283297290421066, "ewc_loss": 0.017389073967933655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.738907303661108e-05, "grad_norm": 13.591403007507324, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8636109828948975, "num_tokens": 98580522.0, "step": 2581 }, { "epoch": 0.32845693932069714, "ewc_loss": 0.017393948510289192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7393947928212583e-05, "grad_norm": 13.638087272644043, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.85986328125, "num_tokens": 98612988.0, "step": 2582 }, { "epoch": 0.3285841495992876, "ewc_loss": 0.017419228330254555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7419228242943063e-05, "grad_norm": 13.610661506652832, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8395774960517883, "num_tokens": 98654213.0, "step": 2583 }, { "epoch": 0.32871135987787814, "ewc_loss": 0.01740361377596855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7403614037903026e-05, "grad_norm": 13.622868537902832, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8536894917488098, "num_tokens": 98697168.0, "step": 2584 }, { "epoch": 0.32883857015646867, "ewc_loss": 0.017431315034627914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7431315427529626e-05, "grad_norm": 13.592618942260742, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8561791777610779, "num_tokens": 98735664.0, "step": 2585 }, { "epoch": 0.32896578043505914, "ewc_loss": 0.017405929043889046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.740592961141374e-05, "grad_norm": 13.592321395874023, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8455733060836792, "num_tokens": 98777257.0, "step": 2586 }, { "epoch": 0.32909299071364967, "ewc_loss": 0.017399899661540985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7399899661540985e-05, "grad_norm": 13.583491325378418, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8551669120788574, "num_tokens": 98812443.0, "step": 2587 }, { "epoch": 0.3292202009922402, "ewc_loss": 0.017430905252695084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7430904335924424e-05, "grad_norm": 13.62889289855957, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8592295050621033, "num_tokens": 98848231.0, "step": 2588 }, { "epoch": 0.32934741127083067, "ewc_loss": 0.017458392307162285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.745839290379081e-05, "grad_norm": 13.610374450683594, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8546088337898254, "num_tokens": 98886487.0, "step": 2589 }, { "epoch": 0.3294746215494212, "ewc_loss": 0.017427412793040276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7427411876269616e-05, "grad_norm": 13.643019676208496, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8415317535400391, "num_tokens": 98924750.0, "step": 2590 }, { "epoch": 0.3296018318280117, "ewc_loss": 0.017457712441682816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7457712601753883e-05, "grad_norm": 13.60708999633789, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8290500640869141, "num_tokens": 98964151.0, "step": 2591 }, { "epoch": 0.3297290421066022, "ewc_loss": 0.017453519627451897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.745351983117871e-05, "grad_norm": 13.60112190246582, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8626753091812134, "num_tokens": 99000277.0, "step": 2592 }, { "epoch": 0.3298562523851927, "ewc_loss": 0.01745547540485859, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.745547524478752e-05, "grad_norm": 13.672006607055664, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8414217233657837, "num_tokens": 99042335.0, "step": 2593 }, { "epoch": 0.32998346266378326, "ewc_loss": 0.017440466210246086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7440466763218865e-05, "grad_norm": 13.613570213317871, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8493430018424988, "num_tokens": 99082964.0, "step": 2594 }, { "epoch": 0.33011067294237373, "ewc_loss": 0.017417510971426964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7417511116946116e-05, "grad_norm": 13.617237091064453, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8743221759796143, "num_tokens": 99123440.0, "step": 2595 }, { "epoch": 0.33023788322096426, "ewc_loss": 0.017462192103266716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7462192772654817e-05, "grad_norm": 13.64152717590332, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8495721817016602, "num_tokens": 99167047.0, "step": 2596 }, { "epoch": 0.3303650934995548, "ewc_loss": 0.017446964979171753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.744696419336833e-05, "grad_norm": 13.628889083862305, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8486707210540771, "num_tokens": 99202594.0, "step": 2597 }, { "epoch": 0.33049230377814526, "ewc_loss": 0.01742321252822876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.742321182973683e-05, "grad_norm": 13.594964981079102, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8576105833053589, "num_tokens": 99244732.0, "step": 2598 }, { "epoch": 0.3306195140567358, "ewc_loss": 0.01742948219180107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.742948188621085e-05, "grad_norm": 13.600838661193848, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8622231483459473, "num_tokens": 99287161.0, "step": 2599 }, { "epoch": 0.3307467243353263, "ewc_loss": 0.017466183751821518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7466183635406196e-05, "grad_norm": 13.671562194824219, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8446821570396423, "num_tokens": 99322650.0, "step": 2600 }, { "epoch": 0.3308739346139168, "ewc_loss": 0.017441783100366592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7441783711547032e-05, "grad_norm": 13.647726058959961, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8520265817642212, "num_tokens": 99358529.0, "step": 2601 }, { "epoch": 0.3310011448925073, "ewc_loss": 0.017407383769750595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7407382983947173e-05, "grad_norm": 13.603891372680664, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8605055809020996, "num_tokens": 99394010.0, "step": 2602 }, { "epoch": 0.33112835517109784, "ewc_loss": 0.017426246777176857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7426245904061943e-05, "grad_norm": 13.583222389221191, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8544784784317017, "num_tokens": 99434764.0, "step": 2603 }, { "epoch": 0.3312555654496883, "ewc_loss": 0.017478933557868004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.747893293213565e-05, "grad_norm": 13.606410026550293, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8529412746429443, "num_tokens": 99481322.0, "step": 2604 }, { "epoch": 0.33138277572827884, "ewc_loss": 0.017473630607128143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7473630578024313e-05, "grad_norm": 13.628849029541016, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8421703577041626, "num_tokens": 99517719.0, "step": 2605 }, { "epoch": 0.3315099860068694, "ewc_loss": 0.01747274212539196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7472742911195382e-05, "grad_norm": 13.676207542419434, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8533337712287903, "num_tokens": 99553662.0, "step": 2606 }, { "epoch": 0.33163719628545985, "ewc_loss": 0.01748061738908291, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7480617316323332e-05, "grad_norm": 13.674235343933105, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8511984348297119, "num_tokens": 99589195.0, "step": 2607 }, { "epoch": 0.3317644065640504, "ewc_loss": 0.01744190789759159, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7441907402826473e-05, "grad_norm": 13.61531925201416, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8538326621055603, "num_tokens": 99626092.0, "step": 2608 }, { "epoch": 0.3318916168426409, "ewc_loss": 0.017441971227526665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7441971067455597e-05, "grad_norm": 13.657334327697754, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8268818259239197, "num_tokens": 99667087.0, "step": 2609 }, { "epoch": 0.3320188271212314, "ewc_loss": 0.017491905018687248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7491904145572335e-05, "grad_norm": 13.648857116699219, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8459917306900024, "num_tokens": 99705929.0, "step": 2610 }, { "epoch": 0.3321460373998219, "ewc_loss": 0.017459698021411896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.745969711919315e-05, "grad_norm": 13.653564453125, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8480609655380249, "num_tokens": 99742044.0, "step": 2611 }, { "epoch": 0.33227324767841243, "ewc_loss": 0.01749059371650219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.749059447320178e-05, "grad_norm": 13.646588325500488, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8544822931289673, "num_tokens": 99772621.0, "step": 2612 }, { "epoch": 0.3324004579570029, "ewc_loss": 0.01750139705836773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.750139745126944e-05, "grad_norm": 13.648740768432617, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8572288751602173, "num_tokens": 99814405.0, "step": 2613 }, { "epoch": 0.33252766823559343, "ewc_loss": 0.017501557245850563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7501557522336952e-05, "grad_norm": 13.584592819213867, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8612377643585205, "num_tokens": 99852197.0, "step": 2614 }, { "epoch": 0.33265487851418396, "ewc_loss": 0.017504120245575905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.750412047840655e-05, "grad_norm": 13.629034996032715, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8472418189048767, "num_tokens": 99894254.0, "step": 2615 }, { "epoch": 0.33278208879277443, "ewc_loss": 0.017534686252474785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7534686776343733e-05, "grad_norm": 13.612160682678223, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8455934524536133, "num_tokens": 99936230.0, "step": 2616 }, { "epoch": 0.33290929907136496, "ewc_loss": 0.01753203757107258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.753203832777217e-05, "grad_norm": 13.65528392791748, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8526925444602966, "num_tokens": 99974964.0, "step": 2617 }, { "epoch": 0.3330365093499555, "ewc_loss": 0.01750955916941166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.750955925672315e-05, "grad_norm": 13.610279083251953, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8421779274940491, "num_tokens": 100018750.0, "step": 2618 }, { "epoch": 0.33316371962854596, "ewc_loss": 0.017526037991046906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7526037481729873e-05, "grad_norm": 13.706846237182617, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8467217087745667, "num_tokens": 100054064.0, "step": 2619 }, { "epoch": 0.3332909299071365, "ewc_loss": 0.017489826306700706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7489826859673485e-05, "grad_norm": 13.628031730651855, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8566639423370361, "num_tokens": 100092549.0, "step": 2620 }, { "epoch": 0.333418140185727, "ewc_loss": 0.017502257600426674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7502257833257318e-05, "grad_norm": 13.664437294006348, "learning_rate": 1e-06, "loss": 0.5454, "mean_token_accuracy": 0.8254133462905884, "num_tokens": 100123973.0, "step": 2621 }, { "epoch": 0.3335453504643175, "ewc_loss": 0.01754206232726574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.754206277837511e-05, "grad_norm": 13.638455390930176, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8509425520896912, "num_tokens": 100162142.0, "step": 2622 }, { "epoch": 0.333672560742908, "ewc_loss": 0.017557621002197266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.755762059474364e-05, "grad_norm": 13.708724021911621, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8462446928024292, "num_tokens": 100199352.0, "step": 2623 }, { "epoch": 0.33379977102149855, "ewc_loss": 0.017570069059729576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7570069758221507e-05, "grad_norm": 13.692790031433105, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8362734317779541, "num_tokens": 100237446.0, "step": 2624 }, { "epoch": 0.333926981300089, "ewc_loss": 0.017518209293484688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.751820855133701e-05, "grad_norm": 13.635448455810547, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.856721043586731, "num_tokens": 100272986.0, "step": 2625 }, { "epoch": 0.33405419157867955, "ewc_loss": 0.01758512668311596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.758512735250406e-05, "grad_norm": 13.681640625, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8420368432998657, "num_tokens": 100304998.0, "step": 2626 }, { "epoch": 0.3341814018572701, "ewc_loss": 0.0175495482981205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7549547919770703e-05, "grad_norm": 13.601738929748535, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8481602072715759, "num_tokens": 100338473.0, "step": 2627 }, { "epoch": 0.33430861213586055, "ewc_loss": 0.017574215307831764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7574215235072188e-05, "grad_norm": 13.671945571899414, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8336306810379028, "num_tokens": 100372644.0, "step": 2628 }, { "epoch": 0.3344358224144511, "ewc_loss": 0.017602108418941498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.760210761858616e-05, "grad_norm": 13.580323219299316, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8313639163970947, "num_tokens": 100413248.0, "step": 2629 }, { "epoch": 0.3345630326930416, "ewc_loss": 0.01755521632730961, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.755521589075215e-05, "grad_norm": 13.681950569152832, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8461248874664307, "num_tokens": 100448022.0, "step": 2630 }, { "epoch": 0.33469024297163213, "ewc_loss": 0.017663482576608658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.76634821400512e-05, "grad_norm": 13.642518043518066, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8386799097061157, "num_tokens": 100493222.0, "step": 2631 }, { "epoch": 0.3348174532502226, "ewc_loss": 0.01756158098578453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.756158053467516e-05, "grad_norm": 13.636734008789062, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8562902212142944, "num_tokens": 100534736.0, "step": 2632 }, { "epoch": 0.33494466352881314, "ewc_loss": 0.01766231283545494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.766231252986472e-05, "grad_norm": 13.645453453063965, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.858795702457428, "num_tokens": 100576754.0, "step": 2633 }, { "epoch": 0.33507187380740366, "ewc_loss": 0.017648547887802124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.764854823704809e-05, "grad_norm": 13.720324516296387, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8429581522941589, "num_tokens": 100611407.0, "step": 2634 }, { "epoch": 0.33519908408599414, "ewc_loss": 0.017635200172662735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.763520049280487e-05, "grad_norm": 13.626036643981934, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8569589257240295, "num_tokens": 100643531.0, "step": 2635 }, { "epoch": 0.33532629436458466, "ewc_loss": 0.017668580636382103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.766858076734934e-05, "grad_norm": 13.684320449829102, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8513252139091492, "num_tokens": 100678612.0, "step": 2636 }, { "epoch": 0.3354535046431752, "ewc_loss": 0.017678437754511833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7678437870927155e-05, "grad_norm": 13.635994911193848, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8550249338150024, "num_tokens": 100718873.0, "step": 2637 }, { "epoch": 0.33558071492176567, "ewc_loss": 0.017665212973952293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7665213817963377e-05, "grad_norm": 13.633346557617188, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8493317365646362, "num_tokens": 100759883.0, "step": 2638 }, { "epoch": 0.3357079252003562, "ewc_loss": 0.017693577334284782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7693577319732867e-05, "grad_norm": 13.717645645141602, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8305257558822632, "num_tokens": 100800022.0, "step": 2639 }, { "epoch": 0.3358351354789467, "ewc_loss": 0.017627527937293053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7627527995500714e-05, "grad_norm": 13.656105995178223, "learning_rate": 1e-06, "loss": 0.5212, "mean_token_accuracy": 0.8357191681861877, "num_tokens": 100837299.0, "step": 2640 }, { "epoch": 0.3359623457575372, "ewc_loss": 0.01765596494078636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7655964256846346e-05, "grad_norm": 13.708507537841797, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8469474911689758, "num_tokens": 100868433.0, "step": 2641 }, { "epoch": 0.3360895560361277, "ewc_loss": 0.017688283696770668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.768828406056855e-05, "grad_norm": 13.663823127746582, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8361858129501343, "num_tokens": 100907768.0, "step": 2642 }, { "epoch": 0.33621676631471825, "ewc_loss": 0.01764701120555401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7647011191002093e-05, "grad_norm": 13.668105125427246, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8279089331626892, "num_tokens": 100942723.0, "step": 2643 }, { "epoch": 0.3363439765933087, "ewc_loss": 0.017685700207948685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7685701095615514e-05, "grad_norm": 13.628137588500977, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8560978770256042, "num_tokens": 100984997.0, "step": 2644 }, { "epoch": 0.33647118687189925, "ewc_loss": 0.017689445987343788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7689446394797415e-05, "grad_norm": 13.680808067321777, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8656405210494995, "num_tokens": 101026290.0, "step": 2645 }, { "epoch": 0.3365983971504898, "ewc_loss": 0.01766028255224228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7660282537690364e-05, "grad_norm": 13.652589797973633, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8439136743545532, "num_tokens": 101066766.0, "step": 2646 }, { "epoch": 0.33672560742908025, "ewc_loss": 0.017662160098552704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7662159734754823e-05, "grad_norm": 13.66496467590332, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8550884127616882, "num_tokens": 101100833.0, "step": 2647 }, { "epoch": 0.3368528177076708, "ewc_loss": 0.017708664759993553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7708664017845877e-05, "grad_norm": 13.697360038757324, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8610401153564453, "num_tokens": 101145552.0, "step": 2648 }, { "epoch": 0.3369800279862613, "ewc_loss": 0.017689133062958717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7689133528620005e-05, "grad_norm": 13.638618469238281, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8529172539710999, "num_tokens": 101186644.0, "step": 2649 }, { "epoch": 0.3371072382648518, "ewc_loss": 0.017670733854174614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.767073445080314e-05, "grad_norm": 13.667525291442871, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8482557535171509, "num_tokens": 101223356.0, "step": 2650 }, { "epoch": 0.3372344485434423, "ewc_loss": 0.017685234546661377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7685235434328206e-05, "grad_norm": 13.658096313476562, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8633866310119629, "num_tokens": 101260527.0, "step": 2651 }, { "epoch": 0.33736165882203284, "ewc_loss": 0.01767977699637413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7679776647128165e-05, "grad_norm": 13.631531715393066, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8503134250640869, "num_tokens": 101305403.0, "step": 2652 }, { "epoch": 0.3374888691006233, "ewc_loss": 0.01766851171851158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7668511645752005e-05, "grad_norm": 13.637166023254395, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.836419939994812, "num_tokens": 101345858.0, "step": 2653 }, { "epoch": 0.33761607937921384, "ewc_loss": 0.017694728448987007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7694728740025312e-05, "grad_norm": 13.740785598754883, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8268146514892578, "num_tokens": 101380321.0, "step": 2654 }, { "epoch": 0.33774328965780437, "ewc_loss": 0.017662959173321724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.766295827110298e-05, "grad_norm": 13.62994384765625, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8592904806137085, "num_tokens": 101419120.0, "step": 2655 }, { "epoch": 0.33787049993639484, "ewc_loss": 0.017662959173321724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.766295827110298e-05, "grad_norm": 13.667430877685547, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8527238965034485, "num_tokens": 101453272.0, "step": 2656 }, { "epoch": 0.33799771021498537, "ewc_loss": 0.017691830173134804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.769182927091606e-05, "grad_norm": 13.774497032165527, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8400189280509949, "num_tokens": 101486879.0, "step": 2657 }, { "epoch": 0.3381249204935759, "ewc_loss": 0.017677901312708855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.767790126905311e-05, "grad_norm": 13.695512771606445, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8435209393501282, "num_tokens": 101523691.0, "step": 2658 }, { "epoch": 0.33825213077216637, "ewc_loss": 0.017704641446471214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7704642232274637e-05, "grad_norm": 13.721696853637695, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.852433443069458, "num_tokens": 101555279.0, "step": 2659 }, { "epoch": 0.3383793410507569, "ewc_loss": 0.017688894644379616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.768889524100814e-05, "grad_norm": 13.721790313720703, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8670722246170044, "num_tokens": 101591780.0, "step": 2660 }, { "epoch": 0.3385065513293474, "ewc_loss": 0.017681343480944633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7681342797004618e-05, "grad_norm": 13.681585311889648, "learning_rate": 1e-06, "loss": 0.3995, "mean_token_accuracy": 0.8670125007629395, "num_tokens": 101631041.0, "step": 2661 }, { "epoch": 0.3386337616079379, "ewc_loss": 0.017630694434046745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7630694856052287e-05, "grad_norm": 13.653372764587402, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8410087823867798, "num_tokens": 101666315.0, "step": 2662 }, { "epoch": 0.33876097188652843, "ewc_loss": 0.017674176022410393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7674175978754647e-05, "grad_norm": 13.68056583404541, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8643708229064941, "num_tokens": 101704046.0, "step": 2663 }, { "epoch": 0.33888818216511896, "ewc_loss": 0.01768294721841812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7682947145658545e-05, "grad_norm": 13.64148998260498, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8615180850028992, "num_tokens": 101743076.0, "step": 2664 }, { "epoch": 0.33901539244370943, "ewc_loss": 0.01770118810236454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7701187971397303e-05, "grad_norm": 13.67708683013916, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8637347221374512, "num_tokens": 101784006.0, "step": 2665 }, { "epoch": 0.33914260272229996, "ewc_loss": 0.017725037410855293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7725036741467193e-05, "grad_norm": 13.675692558288574, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8579965829849243, "num_tokens": 101821845.0, "step": 2666 }, { "epoch": 0.3392698130008905, "ewc_loss": 0.01769554615020752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7695545466267504e-05, "grad_norm": 13.646180152893066, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8562881350517273, "num_tokens": 101861121.0, "step": 2667 }, { "epoch": 0.33939702327948096, "ewc_loss": 0.01771591044962406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7715910871629603e-05, "grad_norm": 13.742420196533203, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.847842276096344, "num_tokens": 101894330.0, "step": 2668 }, { "epoch": 0.3395242335580715, "ewc_loss": 0.017723888158798218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7723888959153555e-05, "grad_norm": 13.686836242675781, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8364970684051514, "num_tokens": 101934835.0, "step": 2669 }, { "epoch": 0.339651443836662, "ewc_loss": 0.01768382452428341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7683823898551054e-05, "grad_norm": 13.69809341430664, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8338295817375183, "num_tokens": 101970943.0, "step": 2670 }, { "epoch": 0.3397786541152525, "ewc_loss": 0.017710458487272263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7710459360387176e-05, "grad_norm": 13.767668724060059, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8396227359771729, "num_tokens": 102001211.0, "step": 2671 }, { "epoch": 0.339905864393843, "ewc_loss": 0.017704887315630913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7704887795844115e-05, "grad_norm": 13.657541275024414, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8579314351081848, "num_tokens": 102039399.0, "step": 2672 }, { "epoch": 0.34003307467243354, "ewc_loss": 0.017678890377283096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7678890799288638e-05, "grad_norm": 13.753432273864746, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.84040367603302, "num_tokens": 102070635.0, "step": 2673 }, { "epoch": 0.340160284951024, "ewc_loss": 0.01775936596095562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7759366528480314e-05, "grad_norm": 13.740330696105957, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8557143807411194, "num_tokens": 102104590.0, "step": 2674 }, { "epoch": 0.34028749522961454, "ewc_loss": 0.017703279852867126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.770327980921138e-05, "grad_norm": 13.709786415100098, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8570275902748108, "num_tokens": 102149267.0, "step": 2675 }, { "epoch": 0.3404147055082051, "ewc_loss": 0.017696009948849678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7696009308565408e-05, "grad_norm": 13.670683860778809, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8349132537841797, "num_tokens": 102188089.0, "step": 2676 }, { "epoch": 0.34054191578679555, "ewc_loss": 0.017718689516186714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.771869028743822e-05, "grad_norm": 13.763665199279785, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8583669066429138, "num_tokens": 102224335.0, "step": 2677 }, { "epoch": 0.3406691260653861, "ewc_loss": 0.017700986936688423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.770098606357351e-05, "grad_norm": 13.597103118896484, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.85492342710495, "num_tokens": 102263421.0, "step": 2678 }, { "epoch": 0.3407963363439766, "ewc_loss": 0.01772109977900982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.772110044839792e-05, "grad_norm": 13.719199180603027, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8344004154205322, "num_tokens": 102301133.0, "step": 2679 }, { "epoch": 0.34092354662256713, "ewc_loss": 0.017753947526216507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.775394775904715e-05, "grad_norm": 13.673657417297363, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8551884293556213, "num_tokens": 102338516.0, "step": 2680 }, { "epoch": 0.3410507569011576, "ewc_loss": 0.017720818519592285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.772081850504037e-05, "grad_norm": 13.672994613647461, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8409753441810608, "num_tokens": 102376010.0, "step": 2681 }, { "epoch": 0.34117796717974813, "ewc_loss": 0.017729077488183975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7729076716932468e-05, "grad_norm": 13.665654182434082, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8727134466171265, "num_tokens": 102417080.0, "step": 2682 }, { "epoch": 0.34130517745833866, "ewc_loss": 0.01771528273820877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.771528332028538e-05, "grad_norm": 13.665321350097656, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8534853458404541, "num_tokens": 102458042.0, "step": 2683 }, { "epoch": 0.34143238773692913, "ewc_loss": 0.017761562019586563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7761562048690394e-05, "grad_norm": 13.79477310180664, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8593340516090393, "num_tokens": 102497152.0, "step": 2684 }, { "epoch": 0.34155959801551966, "ewc_loss": 0.017722763121128082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.772276300471276e-05, "grad_norm": 13.645489692687988, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8494827747344971, "num_tokens": 102531486.0, "step": 2685 }, { "epoch": 0.3416868082941102, "ewc_loss": 0.01770230010151863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.770229937392287e-05, "grad_norm": 13.781974792480469, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8498009443283081, "num_tokens": 102571552.0, "step": 2686 }, { "epoch": 0.34181401857270066, "ewc_loss": 0.01776616834104061, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.776616772986017e-05, "grad_norm": 13.700627326965332, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8572161197662354, "num_tokens": 102607338.0, "step": 2687 }, { "epoch": 0.3419412288512912, "ewc_loss": 0.01767418161034584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7674181435722858e-05, "grad_norm": 13.693755149841309, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8516257405281067, "num_tokens": 102654445.0, "step": 2688 }, { "epoch": 0.3420684391298817, "ewc_loss": 0.017716096714138985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7716096408548765e-05, "grad_norm": 13.698356628417969, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8602049350738525, "num_tokens": 102695265.0, "step": 2689 }, { "epoch": 0.3421956494084722, "ewc_loss": 0.017672909423708916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.767290996212978e-05, "grad_norm": 13.719155311584473, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.855535626411438, "num_tokens": 102728621.0, "step": 2690 }, { "epoch": 0.3423228596870627, "ewc_loss": 0.017725849524140358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7725849829730578e-05, "grad_norm": 13.756657600402832, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.845471978187561, "num_tokens": 102766378.0, "step": 2691 }, { "epoch": 0.34245006996565325, "ewc_loss": 0.01771254651248455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7712545741233043e-05, "grad_norm": 13.635725975036621, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8500756025314331, "num_tokens": 102806331.0, "step": 2692 }, { "epoch": 0.3425772802442437, "ewc_loss": 0.017728477716445923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.77284782694187e-05, "grad_norm": 13.748245239257812, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8538130521774292, "num_tokens": 102845314.0, "step": 2693 }, { "epoch": 0.34270449052283425, "ewc_loss": 0.01774754375219345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7747543097357266e-05, "grad_norm": 13.751049995422363, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.850996196269989, "num_tokens": 102888614.0, "step": 2694 }, { "epoch": 0.3428317008014248, "ewc_loss": 0.01770220510661602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7702204786473885e-05, "grad_norm": 13.69819450378418, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8464421033859253, "num_tokens": 102925988.0, "step": 2695 }, { "epoch": 0.34295891108001525, "ewc_loss": 0.017766684293746948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.776668432285078e-05, "grad_norm": 13.766576766967773, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8601606488227844, "num_tokens": 102963150.0, "step": 2696 }, { "epoch": 0.3430861213586058, "ewc_loss": 0.017743758857250214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.774375959939789e-05, "grad_norm": 13.758087158203125, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8589801788330078, "num_tokens": 102999164.0, "step": 2697 }, { "epoch": 0.3432133316371963, "ewc_loss": 0.01772169955074787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7721698895911686e-05, "grad_norm": 13.758551597595215, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8601385951042175, "num_tokens": 103034337.0, "step": 2698 }, { "epoch": 0.3433405419157868, "ewc_loss": 0.01776687428355217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7766873497748747e-05, "grad_norm": 13.750828742980957, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8583264946937561, "num_tokens": 103075062.0, "step": 2699 }, { "epoch": 0.3434677521943773, "ewc_loss": 0.017720239236950874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7720240066410042e-05, "grad_norm": 13.776782989501953, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8472769260406494, "num_tokens": 103113209.0, "step": 2700 }, { "epoch": 0.34359496247296784, "ewc_loss": 0.01773940585553646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.77394049387658e-05, "grad_norm": 13.673064231872559, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8559104204177856, "num_tokens": 103153117.0, "step": 2701 }, { "epoch": 0.3437221727515583, "ewc_loss": 0.01772342249751091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7723423297866248e-05, "grad_norm": 13.883337020874023, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8509097695350647, "num_tokens": 103192234.0, "step": 2702 }, { "epoch": 0.34384938303014884, "ewc_loss": 0.017752651125192642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7752650819602422e-05, "grad_norm": 13.654458045959473, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8483777642250061, "num_tokens": 103224958.0, "step": 2703 }, { "epoch": 0.34397659330873936, "ewc_loss": 0.017675058916211128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7675058188615367e-05, "grad_norm": 13.784880638122559, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8536217212677002, "num_tokens": 103259679.0, "step": 2704 }, { "epoch": 0.34410380358732984, "ewc_loss": 0.01780950091779232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7809501514420845e-05, "grad_norm": 13.724491119384766, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8434473276138306, "num_tokens": 103299987.0, "step": 2705 }, { "epoch": 0.34423101386592037, "ewc_loss": 0.01771698147058487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7716982256388292e-05, "grad_norm": 13.76314640045166, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8383527994155884, "num_tokens": 103344457.0, "step": 2706 }, { "epoch": 0.3443582241445109, "ewc_loss": 0.0177591685205698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7759168258635327e-05, "grad_norm": 13.6643648147583, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8567811250686646, "num_tokens": 103385173.0, "step": 2707 }, { "epoch": 0.34448543442310137, "ewc_loss": 0.0177445150911808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7744514480000362e-05, "grad_norm": 13.776098251342773, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8362598419189453, "num_tokens": 103425972.0, "step": 2708 }, { "epoch": 0.3446126447016919, "ewc_loss": 0.017773527652025223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7773527360986918e-05, "grad_norm": 13.676788330078125, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.847551167011261, "num_tokens": 103466819.0, "step": 2709 }, { "epoch": 0.3447398549802824, "ewc_loss": 0.017775267362594604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.777526813384611e-05, "grad_norm": 13.734535217285156, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8407928943634033, "num_tokens": 103509660.0, "step": 2710 }, { "epoch": 0.3448670652588729, "ewc_loss": 0.017784062772989273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7784062947612256e-05, "grad_norm": 13.710559844970703, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8461986184120178, "num_tokens": 103544557.0, "step": 2711 }, { "epoch": 0.3449942755374634, "ewc_loss": 0.017763422802090645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.776342287485022e-05, "grad_norm": 13.690908432006836, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8609265089035034, "num_tokens": 103585132.0, "step": 2712 }, { "epoch": 0.34512148581605395, "ewc_loss": 0.017792843282222748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.779284320946317e-05, "grad_norm": 13.686761856079102, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8599283695220947, "num_tokens": 103626353.0, "step": 2713 }, { "epoch": 0.3452486960946444, "ewc_loss": 0.017758861184120178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7758860849426128e-05, "grad_norm": 13.718286514282227, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8549147844314575, "num_tokens": 103659035.0, "step": 2714 }, { "epoch": 0.34537590637323495, "ewc_loss": 0.017851131036877632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7851130905910395e-05, "grad_norm": 13.693978309631348, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8598737716674805, "num_tokens": 103699497.0, "step": 2715 }, { "epoch": 0.3455031166518255, "ewc_loss": 0.017810354009270668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7810354620451108e-05, "grad_norm": 13.72887897491455, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8398376107215881, "num_tokens": 103733685.0, "step": 2716 }, { "epoch": 0.34563032693041595, "ewc_loss": 0.0178254134953022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7825414033723064e-05, "grad_norm": 13.707674026489258, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.834998607635498, "num_tokens": 103769399.0, "step": 2717 }, { "epoch": 0.3457575372090065, "ewc_loss": 0.017803123220801353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7803124137572013e-05, "grad_norm": 13.643379211425781, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8588575124740601, "num_tokens": 103809078.0, "step": 2718 }, { "epoch": 0.345884747487597, "ewc_loss": 0.017845774069428444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7845773982116953e-05, "grad_norm": 13.735910415649414, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8687927722930908, "num_tokens": 103846764.0, "step": 2719 }, { "epoch": 0.3460119577661875, "ewc_loss": 0.017824698239564896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7824697351898067e-05, "grad_norm": 13.67676830291748, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8611564040184021, "num_tokens": 103883089.0, "step": 2720 }, { "epoch": 0.346139168044778, "ewc_loss": 0.017859311774373055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.785931090125814e-05, "grad_norm": 13.72838306427002, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8690177202224731, "num_tokens": 103915775.0, "step": 2721 }, { "epoch": 0.34626637832336854, "ewc_loss": 0.017893671989440918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7893671611091122e-05, "grad_norm": 13.743343353271484, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8473504185676575, "num_tokens": 103953703.0, "step": 2722 }, { "epoch": 0.346393588601959, "ewc_loss": 0.01785598322749138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7855983969639055e-05, "grad_norm": 13.704047203063965, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8428550958633423, "num_tokens": 103990775.0, "step": 2723 }, { "epoch": 0.34652079888054954, "ewc_loss": 0.0178386177867651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7838618077803403e-05, "grad_norm": 13.742635726928711, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8553104996681213, "num_tokens": 104025927.0, "step": 2724 }, { "epoch": 0.34664800915914007, "ewc_loss": 0.01787419803440571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.787419751053676e-05, "grad_norm": 13.700276374816895, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8431152701377869, "num_tokens": 104065860.0, "step": 2725 }, { "epoch": 0.34677521943773054, "ewc_loss": 0.017801204696297646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7801205103751272e-05, "grad_norm": 13.701111793518066, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8547506928443909, "num_tokens": 104102117.0, "step": 2726 }, { "epoch": 0.34690242971632107, "ewc_loss": 0.01789969392120838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7899694285006262e-05, "grad_norm": 13.747843742370605, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8483384847640991, "num_tokens": 104140896.0, "step": 2727 }, { "epoch": 0.3470296399949116, "ewc_loss": 0.01786843314766884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7868433133116923e-05, "grad_norm": 13.764742851257324, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8503549098968506, "num_tokens": 104181355.0, "step": 2728 }, { "epoch": 0.3471568502735021, "ewc_loss": 0.017874451354146004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7874452169053257e-05, "grad_norm": 13.735128402709961, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8541398644447327, "num_tokens": 104218564.0, "step": 2729 }, { "epoch": 0.3472840605520926, "ewc_loss": 0.017844023182988167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7844022295321338e-05, "grad_norm": 13.799701690673828, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.856555700302124, "num_tokens": 104256553.0, "step": 2730 }, { "epoch": 0.3474112708306831, "ewc_loss": 0.017861418426036835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.786141910997685e-05, "grad_norm": 13.820318222045898, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.86562180519104, "num_tokens": 104291150.0, "step": 2731 }, { "epoch": 0.34753848110927366, "ewc_loss": 0.017798207700252533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.779820740921423e-05, "grad_norm": 13.702349662780762, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8450522422790527, "num_tokens": 104327977.0, "step": 2732 }, { "epoch": 0.34766569138786413, "ewc_loss": 0.017843710258603096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7843709429143928e-05, "grad_norm": 13.80893325805664, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8483303785324097, "num_tokens": 104365167.0, "step": 2733 }, { "epoch": 0.34779290166645466, "ewc_loss": 0.017862191423773766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7862192180473357e-05, "grad_norm": 13.747074127197266, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.857292890548706, "num_tokens": 104400304.0, "step": 2734 }, { "epoch": 0.3479201119450452, "ewc_loss": 0.017800873145461082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7800874047679827e-05, "grad_norm": 13.743067741394043, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.836146354675293, "num_tokens": 104437974.0, "step": 2735 }, { "epoch": 0.34804732222363566, "ewc_loss": 0.017899125814437866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7899124941322953e-05, "grad_norm": 13.823674201965332, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8558439612388611, "num_tokens": 104474474.0, "step": 2736 }, { "epoch": 0.3481745325022262, "ewc_loss": 0.017835207283496857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7835207472671755e-05, "grad_norm": 13.709239959716797, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.838829755783081, "num_tokens": 104511004.0, "step": 2737 }, { "epoch": 0.3483017427808167, "ewc_loss": 0.017864953726530075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.786495340638794e-05, "grad_norm": 13.752372741699219, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8459891080856323, "num_tokens": 104551889.0, "step": 2738 }, { "epoch": 0.3484289530594072, "ewc_loss": 0.017909759655594826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7909760572365485e-05, "grad_norm": 13.766469955444336, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8363819122314453, "num_tokens": 104591962.0, "step": 2739 }, { "epoch": 0.3485561633379977, "ewc_loss": 0.017851969227194786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.785196946002543e-05, "grad_norm": 13.667780876159668, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8497593402862549, "num_tokens": 104636201.0, "step": 2740 }, { "epoch": 0.34868337361658824, "ewc_loss": 0.017860742285847664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.786074244591873e-05, "grad_norm": 13.739038467407227, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8519457578659058, "num_tokens": 104673448.0, "step": 2741 }, { "epoch": 0.3488105838951787, "ewc_loss": 0.01793070323765278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.793070259736851e-05, "grad_norm": 13.749126434326172, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8605354428291321, "num_tokens": 104708879.0, "step": 2742 }, { "epoch": 0.34893779417376924, "ewc_loss": 0.01789381541311741, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7893815311254002e-05, "grad_norm": 13.742810249328613, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8628868460655212, "num_tokens": 104747073.0, "step": 2743 }, { "epoch": 0.3490650044523598, "ewc_loss": 0.017901325598359108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7901325918501243e-05, "grad_norm": 13.794478416442871, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8520054817199707, "num_tokens": 104786950.0, "step": 2744 }, { "epoch": 0.34919221473095025, "ewc_loss": 0.017919940873980522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.791994145605713e-05, "grad_norm": 13.774837493896484, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8571510314941406, "num_tokens": 104828306.0, "step": 2745 }, { "epoch": 0.3493194250095408, "ewc_loss": 0.017865467816591263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7865468180389144e-05, "grad_norm": 13.719943046569824, "learning_rate": 1e-06, "loss": 0.5013, "mean_token_accuracy": 0.8439732789993286, "num_tokens": 104867030.0, "step": 2746 }, { "epoch": 0.3494466352881313, "ewc_loss": 0.017880814149975777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.788081499398686e-05, "grad_norm": 13.760491371154785, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8564363718032837, "num_tokens": 104909182.0, "step": 2747 }, { "epoch": 0.3495738455667218, "ewc_loss": 0.01787256821990013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7872567696031183e-05, "grad_norm": 13.772761344909668, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8478855490684509, "num_tokens": 104950648.0, "step": 2748 }, { "epoch": 0.3497010558453123, "ewc_loss": 0.01788480579853058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.788480585673824e-05, "grad_norm": 13.791837692260742, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.857933521270752, "num_tokens": 104994780.0, "step": 2749 }, { "epoch": 0.34982826612390283, "ewc_loss": 0.01784735545516014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7847354683908634e-05, "grad_norm": 13.778928756713867, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8600085973739624, "num_tokens": 105029469.0, "step": 2750 }, { "epoch": 0.3499554764024933, "ewc_loss": 0.017856955528259277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.785695530998055e-05, "grad_norm": 13.750972747802734, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8470999598503113, "num_tokens": 105066797.0, "step": 2751 }, { "epoch": 0.35008268668108383, "ewc_loss": 0.017845654860138893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7845655747805722e-05, "grad_norm": 13.72603702545166, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8668095469474792, "num_tokens": 105100869.0, "step": 2752 }, { "epoch": 0.35020989695967436, "ewc_loss": 0.017855502665042877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7855501937447116e-05, "grad_norm": 13.752413749694824, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8658301830291748, "num_tokens": 105135409.0, "step": 2753 }, { "epoch": 0.35033710723826483, "ewc_loss": 0.017885355278849602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.788535519153811e-05, "grad_norm": 13.715556144714355, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8591448068618774, "num_tokens": 105172839.0, "step": 2754 }, { "epoch": 0.35046431751685536, "ewc_loss": 0.017858272418379784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7858272258308716e-05, "grad_norm": 13.725942611694336, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8587677478790283, "num_tokens": 105216656.0, "step": 2755 }, { "epoch": 0.3505915277954459, "ewc_loss": 0.017899977043271065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7899976228363812e-05, "grad_norm": 13.830399513244629, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8480989336967468, "num_tokens": 105253740.0, "step": 2756 }, { "epoch": 0.35071873807403636, "ewc_loss": 0.017883989959955215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7883989130496047e-05, "grad_norm": 13.73109245300293, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8570380210876465, "num_tokens": 105290073.0, "step": 2757 }, { "epoch": 0.3508459483526269, "ewc_loss": 0.017873937264084816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7873937395052053e-05, "grad_norm": 13.787677764892578, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8592984676361084, "num_tokens": 105325933.0, "step": 2758 }, { "epoch": 0.3509731586312174, "ewc_loss": 0.017861144617199898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7861144442576915e-05, "grad_norm": 13.741395950317383, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8567897081375122, "num_tokens": 105362962.0, "step": 2759 }, { "epoch": 0.3511003689098079, "ewc_loss": 0.017872385680675507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.787238579709083e-05, "grad_norm": 13.733047485351562, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8534592390060425, "num_tokens": 105410074.0, "step": 2760 }, { "epoch": 0.3512275791883984, "ewc_loss": 0.017866916954517365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7866916095954366e-05, "grad_norm": 13.706154823303223, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8386069536209106, "num_tokens": 105450264.0, "step": 2761 }, { "epoch": 0.35135478946698895, "ewc_loss": 0.017896294593811035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7896294593811035e-05, "grad_norm": 13.791963577270508, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8435940742492676, "num_tokens": 105486566.0, "step": 2762 }, { "epoch": 0.3514819997455794, "ewc_loss": 0.017901454120874405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.790145324775949e-05, "grad_norm": 13.6754732131958, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8561241030693054, "num_tokens": 105527185.0, "step": 2763 }, { "epoch": 0.35160921002416995, "ewc_loss": 0.017880095168948174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7880094674183056e-05, "grad_norm": 13.783330917358398, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8525820374488831, "num_tokens": 105557401.0, "step": 2764 }, { "epoch": 0.3517364203027605, "ewc_loss": 0.017968621104955673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7968621250474826e-05, "grad_norm": 13.733247756958008, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8588999509811401, "num_tokens": 105586877.0, "step": 2765 }, { "epoch": 0.35186363058135095, "ewc_loss": 0.017897628247737885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7897627913043834e-05, "grad_norm": 13.726339340209961, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8574533462524414, "num_tokens": 105626988.0, "step": 2766 }, { "epoch": 0.3519908408599415, "ewc_loss": 0.017964079976081848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.796407923393417e-05, "grad_norm": 13.75228500366211, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8441099524497986, "num_tokens": 105665796.0, "step": 2767 }, { "epoch": 0.352118051138532, "ewc_loss": 0.017971834167838097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7971833585761487e-05, "grad_norm": 13.736953735351562, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8447110056877136, "num_tokens": 105708892.0, "step": 2768 }, { "epoch": 0.3522452614171225, "ewc_loss": 0.01797041855752468, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.797041841200553e-05, "grad_norm": 13.719340324401855, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8476429581642151, "num_tokens": 105748221.0, "step": 2769 }, { "epoch": 0.352372471695713, "ewc_loss": 0.01796353980898857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.796353899408132e-05, "grad_norm": 13.782346725463867, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8510470390319824, "num_tokens": 105784568.0, "step": 2770 }, { "epoch": 0.35249968197430354, "ewc_loss": 0.01797708310186863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.797708318918012e-05, "grad_norm": 13.7847318649292, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8410502672195435, "num_tokens": 105827223.0, "step": 2771 }, { "epoch": 0.352626892252894, "ewc_loss": 0.017979441210627556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7979440599447116e-05, "grad_norm": 13.74282455444336, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8711565732955933, "num_tokens": 105871014.0, "step": 2772 }, { "epoch": 0.35275410253148454, "ewc_loss": 0.017982708290219307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7982707504415885e-05, "grad_norm": 13.746846199035645, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8494021892547607, "num_tokens": 105908995.0, "step": 2773 }, { "epoch": 0.35288131281007507, "ewc_loss": 0.017971783876419067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7971784473047592e-05, "grad_norm": 13.776369094848633, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8465502262115479, "num_tokens": 105948243.0, "step": 2774 }, { "epoch": 0.35300852308866554, "ewc_loss": 0.017977792769670486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7977792595047504e-05, "grad_norm": 13.80984878540039, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8551692962646484, "num_tokens": 105983997.0, "step": 2775 }, { "epoch": 0.35313573336725607, "ewc_loss": 0.017959751188755035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7959751858143136e-05, "grad_norm": 13.762616157531738, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8400208950042725, "num_tokens": 106020336.0, "step": 2776 }, { "epoch": 0.3532629436458466, "ewc_loss": 0.0179978609085083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7997861505136825e-05, "grad_norm": 13.82138729095459, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8548445105552673, "num_tokens": 106053300.0, "step": 2777 }, { "epoch": 0.35339015392443707, "ewc_loss": 0.017965711653232574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7965710867429152e-05, "grad_norm": 13.753536224365234, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8669015765190125, "num_tokens": 106091790.0, "step": 2778 }, { "epoch": 0.3535173642030276, "ewc_loss": 0.017964795231819153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.796479591575917e-05, "grad_norm": 13.774308204650879, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8562581539154053, "num_tokens": 106123768.0, "step": 2779 }, { "epoch": 0.3536445744816181, "ewc_loss": 0.018028970807790756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.802897168090567e-05, "grad_norm": 13.865461349487305, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8609435558319092, "num_tokens": 106158219.0, "step": 2780 }, { "epoch": 0.35377178476020865, "ewc_loss": 0.018006931990385056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.800693280529231e-05, "grad_norm": 13.782323837280273, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8489013910293579, "num_tokens": 106195795.0, "step": 2781 }, { "epoch": 0.3538989950387991, "ewc_loss": 0.017992569133639336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.7992568245972507e-05, "grad_norm": 13.792247772216797, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8542767763137817, "num_tokens": 106234973.0, "step": 2782 }, { "epoch": 0.35402620531738965, "ewc_loss": 0.0180128812789917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8012880900641903e-05, "grad_norm": 13.834310531616211, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.8380579948425293, "num_tokens": 106267297.0, "step": 2783 }, { "epoch": 0.3541534155959802, "ewc_loss": 0.018043484538793564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8043485397356562e-05, "grad_norm": 13.83720874786377, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8604980707168579, "num_tokens": 106309070.0, "step": 2784 }, { "epoch": 0.35428062587457065, "ewc_loss": 0.01797349564731121, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.797349614207633e-05, "grad_norm": 13.753100395202637, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8574972748756409, "num_tokens": 106345784.0, "step": 2785 }, { "epoch": 0.3544078361531612, "ewc_loss": 0.01800197921693325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8001979697146453e-05, "grad_norm": 13.765721321105957, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8608325719833374, "num_tokens": 106383172.0, "step": 2786 }, { "epoch": 0.3545350464317517, "ewc_loss": 0.018008744344115257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.800874451873824e-05, "grad_norm": 13.781210899353027, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8508648872375488, "num_tokens": 106416916.0, "step": 2787 }, { "epoch": 0.3546622567103422, "ewc_loss": 0.01804261840879917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.804261773941107e-05, "grad_norm": 13.711602210998535, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8374290466308594, "num_tokens": 106456969.0, "step": 2788 }, { "epoch": 0.3547894669889327, "ewc_loss": 0.01802460476756096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8024604287347756e-05, "grad_norm": 13.825648307800293, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8506060838699341, "num_tokens": 106490476.0, "step": 2789 }, { "epoch": 0.35491667726752324, "ewc_loss": 0.018074342980980873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8074342733598314e-05, "grad_norm": 13.692703247070312, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.853410005569458, "num_tokens": 106525990.0, "step": 2790 }, { "epoch": 0.3550438875461137, "ewc_loss": 0.01803668960928917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8036689652944915e-05, "grad_norm": 13.819649696350098, "learning_rate": 1e-06, "loss": 0.5591, "mean_token_accuracy": 0.8283470869064331, "num_tokens": 106562953.0, "step": 2791 }, { "epoch": 0.35517109782470424, "ewc_loss": 0.01813092641532421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8130926036974415e-05, "grad_norm": 13.707895278930664, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8550410270690918, "num_tokens": 106599142.0, "step": 2792 }, { "epoch": 0.35529830810329477, "ewc_loss": 0.01807430386543274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.807430453482084e-05, "grad_norm": 13.768033981323242, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8648420572280884, "num_tokens": 106636743.0, "step": 2793 }, { "epoch": 0.35542551838188524, "ewc_loss": 0.018138566985726357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.813856761145871e-05, "grad_norm": 13.719993591308594, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.857551097869873, "num_tokens": 106678992.0, "step": 2794 }, { "epoch": 0.35555272866047577, "ewc_loss": 0.018130620941519737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.813062044675462e-05, "grad_norm": 13.777027130126953, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8433179259300232, "num_tokens": 106719884.0, "step": 2795 }, { "epoch": 0.3556799389390663, "ewc_loss": 0.018116561695933342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8116561477654614e-05, "grad_norm": 13.666723251342773, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8488169312477112, "num_tokens": 106755224.0, "step": 2796 }, { "epoch": 0.35580714921765677, "ewc_loss": 0.018100913614034653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8100914530805312e-05, "grad_norm": 13.778676986694336, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8646227121353149, "num_tokens": 106789366.0, "step": 2797 }, { "epoch": 0.3559343594962473, "ewc_loss": 0.01818600483238697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8186005036113784e-05, "grad_norm": 13.790806770324707, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8438515663146973, "num_tokens": 106826382.0, "step": 2798 }, { "epoch": 0.3560615697748378, "ewc_loss": 0.018103552982211113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8103553884429857e-05, "grad_norm": 13.745118141174316, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8441170454025269, "num_tokens": 106860027.0, "step": 2799 }, { "epoch": 0.3561887800534283, "ewc_loss": 0.01814855821430683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8148557501262985e-05, "grad_norm": 13.845812797546387, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8605861663818359, "num_tokens": 106898684.0, "step": 2800 }, { "epoch": 0.35631599033201883, "ewc_loss": 0.018102282658219337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.810228241083678e-05, "grad_norm": 13.682714462280273, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8546262979507446, "num_tokens": 106934496.0, "step": 2801 }, { "epoch": 0.35644320061060936, "ewc_loss": 0.018135199323296547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8135198843083344e-05, "grad_norm": 13.760705947875977, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8665359020233154, "num_tokens": 106974335.0, "step": 2802 }, { "epoch": 0.35657041088919983, "ewc_loss": 0.018141141161322594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.814114148146473e-05, "grad_norm": 13.817973136901855, "learning_rate": 1e-06, "loss": 0.5642, "mean_token_accuracy": 0.8279879093170166, "num_tokens": 107013914.0, "step": 2803 }, { "epoch": 0.35669762116779036, "ewc_loss": 0.018139179795980453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8139180610887706e-05, "grad_norm": 13.714194297790527, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8564237356185913, "num_tokens": 107053747.0, "step": 2804 }, { "epoch": 0.3568248314463809, "ewc_loss": 0.018114373087882996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8114373233402148e-05, "grad_norm": 13.859232902526855, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8425394296646118, "num_tokens": 107095245.0, "step": 2805 }, { "epoch": 0.35695204172497136, "ewc_loss": 0.018157651647925377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8157652448280714e-05, "grad_norm": 13.786529541015625, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8535986542701721, "num_tokens": 107133627.0, "step": 2806 }, { "epoch": 0.3570792520035619, "ewc_loss": 0.018104229122400284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8104228729498573e-05, "grad_norm": 13.819435119628906, "learning_rate": 1e-06, "loss": 0.5288, "mean_token_accuracy": 0.8292480707168579, "num_tokens": 107170128.0, "step": 2807 }, { "epoch": 0.3572064622821524, "ewc_loss": 0.01813945733010769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8139457097277045e-05, "grad_norm": 13.865226745605469, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8630402088165283, "num_tokens": 107209137.0, "step": 2808 }, { "epoch": 0.3573336725607429, "ewc_loss": 0.018082667142152786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.808266642910894e-05, "grad_norm": 13.738222122192383, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8474326133728027, "num_tokens": 107244866.0, "step": 2809 }, { "epoch": 0.3574608828393334, "ewc_loss": 0.018140142783522606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8140142856282182e-05, "grad_norm": 13.885741233825684, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8523731827735901, "num_tokens": 107287399.0, "step": 2810 }, { "epoch": 0.35758809311792394, "ewc_loss": 0.018103888258337975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.810388857848011e-05, "grad_norm": 13.783061981201172, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.843061089515686, "num_tokens": 107325627.0, "step": 2811 }, { "epoch": 0.3577153033965144, "ewc_loss": 0.018059926107525826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.805992542358581e-05, "grad_norm": 13.811239242553711, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8684911727905273, "num_tokens": 107359375.0, "step": 2812 }, { "epoch": 0.35784251367510495, "ewc_loss": 0.018114501610398293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.81145023816498e-05, "grad_norm": 13.74185848236084, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8419046401977539, "num_tokens": 107400286.0, "step": 2813 }, { "epoch": 0.3579697239536955, "ewc_loss": 0.01808122545480728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8081225789501332e-05, "grad_norm": 13.79446029663086, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8435879945755005, "num_tokens": 107431768.0, "step": 2814 }, { "epoch": 0.35809693423228595, "ewc_loss": 0.018092291429638863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.80922907020431e-05, "grad_norm": 13.786270141601562, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8622719049453735, "num_tokens": 107470875.0, "step": 2815 }, { "epoch": 0.3582241445108765, "ewc_loss": 0.01813650317490101, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8136503058485687e-05, "grad_norm": 13.822962760925293, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8443825244903564, "num_tokens": 107510988.0, "step": 2816 }, { "epoch": 0.358351354789467, "ewc_loss": 0.01810361072421074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8103610273101367e-05, "grad_norm": 13.872855186462402, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8608594536781311, "num_tokens": 107552423.0, "step": 2817 }, { "epoch": 0.3584785650680575, "ewc_loss": 0.0181175097823143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.811750917113386e-05, "grad_norm": 13.76656723022461, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8457623720169067, "num_tokens": 107591693.0, "step": 2818 }, { "epoch": 0.358605775346648, "ewc_loss": 0.018091300502419472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8091301171807572e-05, "grad_norm": 13.875197410583496, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8581841588020325, "num_tokens": 107626017.0, "step": 2819 }, { "epoch": 0.35873298562523853, "ewc_loss": 0.018147261813282967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.814726238080766e-05, "grad_norm": 13.859668731689453, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8685932755470276, "num_tokens": 107664927.0, "step": 2820 }, { "epoch": 0.358860195903829, "ewc_loss": 0.018074974417686462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8074973922921345e-05, "grad_norm": 13.752900123596191, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8456867337226868, "num_tokens": 107702487.0, "step": 2821 }, { "epoch": 0.35898740618241953, "ewc_loss": 0.018063897266983986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8063898096443154e-05, "grad_norm": 13.725478172302246, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8660750389099121, "num_tokens": 107741263.0, "step": 2822 }, { "epoch": 0.35911461646101006, "ewc_loss": 0.018102414906024933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8102415197063237e-05, "grad_norm": 13.752057075500488, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8558867573738098, "num_tokens": 107781079.0, "step": 2823 }, { "epoch": 0.35924182673960053, "ewc_loss": 0.018085358664393425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8085358533426188e-05, "grad_norm": 13.728703498840332, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8717030882835388, "num_tokens": 107819309.0, "step": 2824 }, { "epoch": 0.35936903701819106, "ewc_loss": 0.018135178834199905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8135178834199905e-05, "grad_norm": 13.825309753417969, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8541170954704285, "num_tokens": 107853251.0, "step": 2825 }, { "epoch": 0.3594962472967816, "ewc_loss": 0.01812753453850746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8127535440726206e-05, "grad_norm": 13.739580154418945, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8728609085083008, "num_tokens": 107893476.0, "step": 2826 }, { "epoch": 0.35962345757537206, "ewc_loss": 0.01811375468969345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8113754777004942e-05, "grad_norm": 13.805405616760254, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8627604246139526, "num_tokens": 107931204.0, "step": 2827 }, { "epoch": 0.3597506678539626, "ewc_loss": 0.018164347857236862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8164348148275167e-05, "grad_norm": 13.851231575012207, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.835906982421875, "num_tokens": 107966471.0, "step": 2828 }, { "epoch": 0.3598778781325531, "ewc_loss": 0.018091775476932526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8091775928041898e-05, "grad_norm": 13.74179458618164, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8541324734687805, "num_tokens": 108005328.0, "step": 2829 }, { "epoch": 0.36000508841114365, "ewc_loss": 0.018107373267412186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8107373762177303e-05, "grad_norm": 13.814541816711426, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8417599201202393, "num_tokens": 108046295.0, "step": 2830 }, { "epoch": 0.3601322986897341, "ewc_loss": 0.0181344635784626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8134463971364312e-05, "grad_norm": 13.823005676269531, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8559800386428833, "num_tokens": 108078164.0, "step": 2831 }, { "epoch": 0.36025950896832465, "ewc_loss": 0.018144266679883003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.814426650526002e-05, "grad_norm": 13.833807945251465, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8481884598731995, "num_tokens": 108117105.0, "step": 2832 }, { "epoch": 0.3603867192469152, "ewc_loss": 0.018141252920031548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8141252439818345e-05, "grad_norm": 13.82213020324707, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8498939871788025, "num_tokens": 108155301.0, "step": 2833 }, { "epoch": 0.36051392952550565, "ewc_loss": 0.018136072903871536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.813607377698645e-05, "grad_norm": 13.772988319396973, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8642603158950806, "num_tokens": 108193091.0, "step": 2834 }, { "epoch": 0.3606411398040962, "ewc_loss": 0.018131384626030922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.813138442230411e-05, "grad_norm": 13.829484939575195, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8515287637710571, "num_tokens": 108233719.0, "step": 2835 }, { "epoch": 0.3607683500826867, "ewc_loss": 0.018146276473999023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.814627648855094e-05, "grad_norm": 13.776839256286621, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8587710857391357, "num_tokens": 108265775.0, "step": 2836 }, { "epoch": 0.3608955603612772, "ewc_loss": 0.01808927208185196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8089271179633215e-05, "grad_norm": 13.7548828125, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8632224202156067, "num_tokens": 108307953.0, "step": 2837 }, { "epoch": 0.3610227706398677, "ewc_loss": 0.018111659213900566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8111659301212057e-05, "grad_norm": 13.729565620422363, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8491572737693787, "num_tokens": 108351998.0, "step": 2838 }, { "epoch": 0.36114998091845824, "ewc_loss": 0.018178680911660194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8178681784775108e-05, "grad_norm": 13.847307205200195, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8413877487182617, "num_tokens": 108395968.0, "step": 2839 }, { "epoch": 0.3612771911970487, "ewc_loss": 0.018139488995075226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.813948983908631e-05, "grad_norm": 13.812162399291992, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8348907232284546, "num_tokens": 108433045.0, "step": 2840 }, { "epoch": 0.36140440147563924, "ewc_loss": 0.018118752166628838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8118751540896483e-05, "grad_norm": 13.829309463500977, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8482532501220703, "num_tokens": 108470964.0, "step": 2841 }, { "epoch": 0.36153161175422976, "ewc_loss": 0.018155250698328018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8155251382268034e-05, "grad_norm": 13.741121292114258, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8539153933525085, "num_tokens": 108512572.0, "step": 2842 }, { "epoch": 0.36165882203282024, "ewc_loss": 0.01809815876185894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.809815876185894e-05, "grad_norm": 13.842109680175781, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8381054401397705, "num_tokens": 108548526.0, "step": 2843 }, { "epoch": 0.36178603231141077, "ewc_loss": 0.018147077411413193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.81470768438885e-05, "grad_norm": 13.777063369750977, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8503709435462952, "num_tokens": 108588447.0, "step": 2844 }, { "epoch": 0.3619132425900013, "ewc_loss": 0.018077993765473366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.807799344533123e-05, "grad_norm": 13.769536018371582, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8431098461151123, "num_tokens": 108628426.0, "step": 2845 }, { "epoch": 0.36204045286859177, "ewc_loss": 0.0181041918694973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8104192349710502e-05, "grad_norm": 13.780987739562988, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8479094505310059, "num_tokens": 108661355.0, "step": 2846 }, { "epoch": 0.3621676631471823, "ewc_loss": 0.01814844273030758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8148442904930562e-05, "grad_norm": 13.793584823608398, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8687329888343811, "num_tokens": 108698505.0, "step": 2847 }, { "epoch": 0.3622948734257728, "ewc_loss": 0.01816423237323761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.816423173295334e-05, "grad_norm": 13.865523338317871, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.857895016670227, "num_tokens": 108737409.0, "step": 2848 }, { "epoch": 0.3624220837043633, "ewc_loss": 0.018147224560379982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8147224182030186e-05, "grad_norm": 13.733656883239746, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.853946328163147, "num_tokens": 108772969.0, "step": 2849 }, { "epoch": 0.3625492939829538, "ewc_loss": 0.018135348334908485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8135348000214435e-05, "grad_norm": 13.839851379394531, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8592507839202881, "num_tokens": 108805129.0, "step": 2850 }, { "epoch": 0.36267650426154435, "ewc_loss": 0.018189193680882454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.81891937245382e-05, "grad_norm": 13.750110626220703, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.838977575302124, "num_tokens": 108841197.0, "step": 2851 }, { "epoch": 0.3628037145401348, "ewc_loss": 0.01817280985414982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8172810086980462e-05, "grad_norm": 13.843156814575195, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8666619062423706, "num_tokens": 108876442.0, "step": 2852 }, { "epoch": 0.36293092481872535, "ewc_loss": 0.018223010003566742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.822301055653952e-05, "grad_norm": 13.800357818603516, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8386474847793579, "num_tokens": 108911432.0, "step": 2853 }, { "epoch": 0.3630581350973159, "ewc_loss": 0.018200011923909187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8200011254521087e-05, "grad_norm": 13.840360641479492, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.838535487651825, "num_tokens": 108952977.0, "step": 2854 }, { "epoch": 0.36318534537590635, "ewc_loss": 0.018244700506329536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.82447001861874e-05, "grad_norm": 13.821040153503418, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8295849561691284, "num_tokens": 108995602.0, "step": 2855 }, { "epoch": 0.3633125556544969, "ewc_loss": 0.018190259113907814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8190259652328677e-05, "grad_norm": 13.7871675491333, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8556130528450012, "num_tokens": 109040226.0, "step": 2856 }, { "epoch": 0.3634397659330874, "ewc_loss": 0.018245451152324677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8245451428811066e-05, "grad_norm": 13.857333183288574, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8544996380805969, "num_tokens": 109082898.0, "step": 2857 }, { "epoch": 0.3635669762116779, "ewc_loss": 0.01821461319923401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8214614101452753e-05, "grad_norm": 13.757034301757812, "learning_rate": 1e-06, "loss": 0.486, "mean_token_accuracy": 0.8438977003097534, "num_tokens": 109122460.0, "step": 2858 }, { "epoch": 0.3636941864902684, "ewc_loss": 0.018213192000985146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.821319165173918e-05, "grad_norm": 13.780673027038574, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.852561354637146, "num_tokens": 109161680.0, "step": 2859 }, { "epoch": 0.36382139676885894, "ewc_loss": 0.0182032473385334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8203247236669995e-05, "grad_norm": 13.728036880493164, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8719627857208252, "num_tokens": 109198943.0, "step": 2860 }, { "epoch": 0.3639486070474494, "ewc_loss": 0.018267560750246048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8267561245011166e-05, "grad_norm": 13.805463790893555, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8606827855110168, "num_tokens": 109235719.0, "step": 2861 }, { "epoch": 0.36407581732603994, "ewc_loss": 0.018284335732460022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8284335965290666e-05, "grad_norm": 13.864151954650879, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8455997705459595, "num_tokens": 109272681.0, "step": 2862 }, { "epoch": 0.36420302760463047, "ewc_loss": 0.018257247284054756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.825724757509306e-05, "grad_norm": 13.775532722473145, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8412586450576782, "num_tokens": 109306114.0, "step": 2863 }, { "epoch": 0.36433023788322094, "ewc_loss": 0.01825745590031147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.825745675887447e-05, "grad_norm": 13.838881492614746, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8444747924804688, "num_tokens": 109337728.0, "step": 2864 }, { "epoch": 0.36445744816181147, "ewc_loss": 0.018277831375598907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8277831259183586e-05, "grad_norm": 13.811017990112305, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8427852392196655, "num_tokens": 109377809.0, "step": 2865 }, { "epoch": 0.364584658440402, "ewc_loss": 0.018275724723935127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.827572486945428e-05, "grad_norm": 13.801636695861816, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.866635799407959, "num_tokens": 109413499.0, "step": 2866 }, { "epoch": 0.36471186871899247, "ewc_loss": 0.018258102238178253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8258102500112727e-05, "grad_norm": 13.795485496520996, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8433106541633606, "num_tokens": 109460961.0, "step": 2867 }, { "epoch": 0.364839078997583, "ewc_loss": 0.01828727126121521, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.828727181418799e-05, "grad_norm": 13.78693675994873, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8636746406555176, "num_tokens": 109500615.0, "step": 2868 }, { "epoch": 0.36496628927617353, "ewc_loss": 0.018263855949044228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8263855963596143e-05, "grad_norm": 13.804396629333496, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8562266826629639, "num_tokens": 109533921.0, "step": 2869 }, { "epoch": 0.365093499554764, "ewc_loss": 0.018308894708752632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8308894141227938e-05, "grad_norm": 13.859201431274414, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8427709341049194, "num_tokens": 109567069.0, "step": 2870 }, { "epoch": 0.36522070983335453, "ewc_loss": 0.018275247886776924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.827524829423055e-05, "grad_norm": 13.847609519958496, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8655310869216919, "num_tokens": 109603696.0, "step": 2871 }, { "epoch": 0.36534792011194506, "ewc_loss": 0.01830134727060795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8301347154192626e-05, "grad_norm": 13.747718811035156, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8703523278236389, "num_tokens": 109644590.0, "step": 2872 }, { "epoch": 0.36547513039053553, "ewc_loss": 0.018267419189214706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.826741936383769e-05, "grad_norm": 13.846661567687988, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8484907150268555, "num_tokens": 109678869.0, "step": 2873 }, { "epoch": 0.36560234066912606, "ewc_loss": 0.018315674737095833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8315675333724357e-05, "grad_norm": 13.787202835083008, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.846916675567627, "num_tokens": 109722517.0, "step": 2874 }, { "epoch": 0.3657295509477166, "ewc_loss": 0.018258968368172646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8258968339068815e-05, "grad_norm": 13.829216957092285, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8498433828353882, "num_tokens": 109757869.0, "step": 2875 }, { "epoch": 0.36585676122630706, "ewc_loss": 0.018356477841734886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8356477085035294e-05, "grad_norm": 13.8743257522583, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8482401371002197, "num_tokens": 109793162.0, "step": 2876 }, { "epoch": 0.3659839715048976, "ewc_loss": 0.018258094787597656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8258095224155113e-05, "grad_norm": 13.798608779907227, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8591156005859375, "num_tokens": 109832942.0, "step": 2877 }, { "epoch": 0.3661111817834881, "ewc_loss": 0.01830335333943367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8303353499504738e-05, "grad_norm": 13.899942398071289, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8515350818634033, "num_tokens": 109876230.0, "step": 2878 }, { "epoch": 0.3662383920620786, "ewc_loss": 0.0182992834597826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8299282601219602e-05, "grad_norm": 13.823640823364258, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8431985378265381, "num_tokens": 109911844.0, "step": 2879 }, { "epoch": 0.3663656023406691, "ewc_loss": 0.018281085416674614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.828108543122653e-05, "grad_norm": 13.895395278930664, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8418639302253723, "num_tokens": 109957619.0, "step": 2880 }, { "epoch": 0.36649281261925964, "ewc_loss": 0.018328310921788216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8328310034121387e-05, "grad_norm": 13.800921440124512, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8419814109802246, "num_tokens": 109999117.0, "step": 2881 }, { "epoch": 0.3666200228978502, "ewc_loss": 0.01826179213821888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8261791410623118e-05, "grad_norm": 13.823376655578613, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8594143986701965, "num_tokens": 110039557.0, "step": 2882 }, { "epoch": 0.36674723317644065, "ewc_loss": 0.018290894106030464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.829089342209045e-05, "grad_norm": 13.833477020263672, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8442869782447815, "num_tokens": 110074418.0, "step": 2883 }, { "epoch": 0.3668744434550312, "ewc_loss": 0.018275048583745956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.827504820539616e-05, "grad_norm": 13.839751243591309, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8576421737670898, "num_tokens": 110109255.0, "step": 2884 }, { "epoch": 0.3670016537336217, "ewc_loss": 0.01829705759882927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8297057977179065e-05, "grad_norm": 13.774951934814453, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8592991828918457, "num_tokens": 110150180.0, "step": 2885 }, { "epoch": 0.3671288640122122, "ewc_loss": 0.018272656947374344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8272656234330498e-05, "grad_norm": 13.852136611938477, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.8361213207244873, "num_tokens": 110193653.0, "step": 2886 }, { "epoch": 0.3672560742908027, "ewc_loss": 0.018314827233552933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8314827684662305e-05, "grad_norm": 13.852082252502441, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.859850287437439, "num_tokens": 110231020.0, "step": 2887 }, { "epoch": 0.36738328456939323, "ewc_loss": 0.018239330500364304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8239330529468134e-05, "grad_norm": 13.772884368896484, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8680551052093506, "num_tokens": 110269397.0, "step": 2888 }, { "epoch": 0.3675104948479837, "ewc_loss": 0.018305080011487007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8305079720448703e-05, "grad_norm": 13.813499450683594, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8674136400222778, "num_tokens": 110311188.0, "step": 2889 }, { "epoch": 0.36763770512657423, "ewc_loss": 0.018311094492673874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.831109511840623e-05, "grad_norm": 13.868358612060547, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8567513823509216, "num_tokens": 110343801.0, "step": 2890 }, { "epoch": 0.36776491540516476, "ewc_loss": 0.018303245306015015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.830324617912993e-05, "grad_norm": 13.79171085357666, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8364333510398865, "num_tokens": 110385830.0, "step": 2891 }, { "epoch": 0.36789212568375523, "ewc_loss": 0.01823490858078003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8234908566228114e-05, "grad_norm": 13.83321475982666, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8531714677810669, "num_tokens": 110421494.0, "step": 2892 }, { "epoch": 0.36801933596234576, "ewc_loss": 0.018361037597060204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8361037291469984e-05, "grad_norm": 13.832880973815918, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8523328900337219, "num_tokens": 110463132.0, "step": 2893 }, { "epoch": 0.3681465462409363, "ewc_loss": 0.0182654932141304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8265493054059334e-05, "grad_norm": 13.793042182922363, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8551936149597168, "num_tokens": 110498330.0, "step": 2894 }, { "epoch": 0.36827375651952676, "ewc_loss": 0.018312137573957443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.831213739933446e-05, "grad_norm": 13.81572151184082, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8608452081680298, "num_tokens": 110534438.0, "step": 2895 }, { "epoch": 0.3684009667981173, "ewc_loss": 0.018286744132637978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.828674430726096e-05, "grad_norm": 13.789575576782227, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8481413125991821, "num_tokens": 110572059.0, "step": 2896 }, { "epoch": 0.3685281770767078, "ewc_loss": 0.01827070116996765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8270700820721686e-05, "grad_norm": 13.759978294372559, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.855504035949707, "num_tokens": 110611012.0, "step": 2897 }, { "epoch": 0.3686553873552983, "ewc_loss": 0.018312079831957817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8312079191673547e-05, "grad_norm": 13.795363426208496, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8609946966171265, "num_tokens": 110647429.0, "step": 2898 }, { "epoch": 0.3687825976338888, "ewc_loss": 0.018317028880119324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8317028661840595e-05, "grad_norm": 13.845572471618652, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.847174882888794, "num_tokens": 110685052.0, "step": 2899 }, { "epoch": 0.36890980791247935, "ewc_loss": 0.01832970604300499, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8329705198993906e-05, "grad_norm": 13.796257972717285, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.831953763961792, "num_tokens": 110725222.0, "step": 2900 }, { "epoch": 0.3690370181910698, "ewc_loss": 0.01832173764705658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8321738025406376e-05, "grad_norm": 13.822916984558105, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.842598021030426, "num_tokens": 110764314.0, "step": 2901 }, { "epoch": 0.36916422846966035, "ewc_loss": 0.018374113366007805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8374114006292075e-05, "grad_norm": 13.846861839294434, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8650768995285034, "num_tokens": 110803585.0, "step": 2902 }, { "epoch": 0.3692914387482509, "ewc_loss": 0.01834554225206375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8345541320741177e-05, "grad_norm": 13.838027000427246, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8470339775085449, "num_tokens": 110846532.0, "step": 2903 }, { "epoch": 0.36941864902684135, "ewc_loss": 0.018363166600465775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8363167328061536e-05, "grad_norm": 13.803709030151367, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8506155014038086, "num_tokens": 110889807.0, "step": 2904 }, { "epoch": 0.3695458593054319, "ewc_loss": 0.018340863287448883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8340862879995257e-05, "grad_norm": 13.892472267150879, "learning_rate": 1e-06, "loss": 0.5201, "mean_token_accuracy": 0.8336892127990723, "num_tokens": 110930567.0, "step": 2905 }, { "epoch": 0.3696730695840224, "ewc_loss": 0.018397971987724304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8397971871308982e-05, "grad_norm": 13.807015419006348, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8588649034500122, "num_tokens": 110968725.0, "step": 2906 }, { "epoch": 0.3698002798626129, "ewc_loss": 0.01828005164861679, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8280052245245315e-05, "grad_norm": 13.820086479187012, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.8767304420471191, "num_tokens": 111002724.0, "step": 2907 }, { "epoch": 0.3699274901412034, "ewc_loss": 0.018346190452575684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8346190699958242e-05, "grad_norm": 13.850055694580078, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8519067764282227, "num_tokens": 111036668.0, "step": 2908 }, { "epoch": 0.37005470041979394, "ewc_loss": 0.018338067457079887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8338067093282007e-05, "grad_norm": 13.817606925964355, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8443226218223572, "num_tokens": 111077712.0, "step": 2909 }, { "epoch": 0.3701819106983844, "ewc_loss": 0.018344059586524963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8344058844377287e-05, "grad_norm": 13.776371002197266, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8433505296707153, "num_tokens": 111118856.0, "step": 2910 }, { "epoch": 0.37030912097697494, "ewc_loss": 0.018329579383134842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8329579688725062e-05, "grad_norm": 13.842751502990723, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8442775011062622, "num_tokens": 111156712.0, "step": 2911 }, { "epoch": 0.37043633125556547, "ewc_loss": 0.018352460116147995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8352460756432265e-05, "grad_norm": 13.794408798217773, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8537894487380981, "num_tokens": 111195542.0, "step": 2912 }, { "epoch": 0.37056354153415594, "ewc_loss": 0.018335776403546333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8335776985622942e-05, "grad_norm": 13.82270336151123, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8349575996398926, "num_tokens": 111237827.0, "step": 2913 }, { "epoch": 0.37069075181274647, "ewc_loss": 0.01838475465774536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8384755094302818e-05, "grad_norm": 13.817075729370117, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.859958827495575, "num_tokens": 111278802.0, "step": 2914 }, { "epoch": 0.370817962091337, "ewc_loss": 0.01834256947040558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8342569092055783e-05, "grad_norm": 13.851553916931152, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8502187728881836, "num_tokens": 111313795.0, "step": 2915 }, { "epoch": 0.37094517236992747, "ewc_loss": 0.01838049478828907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8380495021119714e-05, "grad_norm": 13.820759773254395, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.859325647354126, "num_tokens": 111353798.0, "step": 2916 }, { "epoch": 0.371072382648518, "ewc_loss": 0.018332144245505333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.833214446378406e-05, "grad_norm": 13.838215827941895, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8532806634902954, "num_tokens": 111390613.0, "step": 2917 }, { "epoch": 0.3711995929271085, "ewc_loss": 0.01837306283414364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8373062630416825e-05, "grad_norm": 13.831811904907227, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8532522916793823, "num_tokens": 111432043.0, "step": 2918 }, { "epoch": 0.371326803205699, "ewc_loss": 0.018354708328843117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8354709027335048e-05, "grad_norm": 13.814557075500488, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8587698936462402, "num_tokens": 111476575.0, "step": 2919 }, { "epoch": 0.3714540134842895, "ewc_loss": 0.018382901325821877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8382901544100605e-05, "grad_norm": 13.810226440429688, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8524537682533264, "num_tokens": 111515304.0, "step": 2920 }, { "epoch": 0.37158122376288005, "ewc_loss": 0.018358344212174416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8358343368163332e-05, "grad_norm": 13.828012466430664, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8619697093963623, "num_tokens": 111550246.0, "step": 2921 }, { "epoch": 0.3717084340414705, "ewc_loss": 0.018358327448368073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.83583269972587e-05, "grad_norm": 13.802115440368652, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8537164926528931, "num_tokens": 111592189.0, "step": 2922 }, { "epoch": 0.37183564432006105, "ewc_loss": 0.01837770827114582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.837770832935348e-05, "grad_norm": 13.827637672424316, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8566097021102905, "num_tokens": 111638177.0, "step": 2923 }, { "epoch": 0.3719628545986516, "ewc_loss": 0.01839613914489746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8396138329990208e-05, "grad_norm": 13.788752555847168, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.855026125907898, "num_tokens": 111674365.0, "step": 2924 }, { "epoch": 0.37209006487724205, "ewc_loss": 0.018392685800790787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8392685888102278e-05, "grad_norm": 13.898481369018555, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8484031558036804, "num_tokens": 111714571.0, "step": 2925 }, { "epoch": 0.3722172751558326, "ewc_loss": 0.018400685861706734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8400685803499073e-05, "grad_norm": 13.835250854492188, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8535038828849792, "num_tokens": 111755214.0, "step": 2926 }, { "epoch": 0.3723444854344231, "ewc_loss": 0.018341440707445145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8341441318625584e-05, "grad_norm": 13.83652400970459, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8260235786437988, "num_tokens": 111796516.0, "step": 2927 }, { "epoch": 0.3724716957130136, "ewc_loss": 0.018366457894444466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.836645787989255e-05, "grad_norm": 13.813822746276855, "learning_rate": 1e-06, "loss": 0.5008, "mean_token_accuracy": 0.8390228748321533, "num_tokens": 111832615.0, "step": 2928 }, { "epoch": 0.3725989059916041, "ewc_loss": 0.018403593450784683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8403594367555343e-05, "grad_norm": 13.882144927978516, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8437641859054565, "num_tokens": 111874682.0, "step": 2929 }, { "epoch": 0.37272611627019464, "ewc_loss": 0.018375307321548462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.83753072633408e-05, "grad_norm": 13.883143424987793, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8405978083610535, "num_tokens": 111916369.0, "step": 2930 }, { "epoch": 0.37285332654878517, "ewc_loss": 0.01832655444741249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8326554709346965e-05, "grad_norm": 13.801787376403809, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8446910381317139, "num_tokens": 111950671.0, "step": 2931 }, { "epoch": 0.37298053682737564, "ewc_loss": 0.018333042040467262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.833304122556001e-05, "grad_norm": 13.849793434143066, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8344811797142029, "num_tokens": 111988846.0, "step": 2932 }, { "epoch": 0.37310774710596617, "ewc_loss": 0.0183434896171093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8343489500693977e-05, "grad_norm": 13.769303321838379, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8616036772727966, "num_tokens": 112022853.0, "step": 2933 }, { "epoch": 0.3732349573845567, "ewc_loss": 0.018357690423727036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.835769035096746e-05, "grad_norm": 13.839446067810059, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8479582071304321, "num_tokens": 112065631.0, "step": 2934 }, { "epoch": 0.37336216766314717, "ewc_loss": 0.01842643693089485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.842643723648507e-05, "grad_norm": 13.864017486572266, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8422114253044128, "num_tokens": 112107236.0, "step": 2935 }, { "epoch": 0.3734893779417377, "ewc_loss": 0.018384724855422974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8384724171482958e-05, "grad_norm": 13.847131729125977, "learning_rate": 1e-06, "loss": 0.5263, "mean_token_accuracy": 0.8301229476928711, "num_tokens": 112153914.0, "step": 2936 }, { "epoch": 0.3736165882203282, "ewc_loss": 0.018411219120025635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8411219571135007e-05, "grad_norm": 13.861469268798828, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8544866442680359, "num_tokens": 112187596.0, "step": 2937 }, { "epoch": 0.3737437984989187, "ewc_loss": 0.018428996205329895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.842899655457586e-05, "grad_norm": 13.820279121398926, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8498852849006653, "num_tokens": 112222409.0, "step": 2938 }, { "epoch": 0.37387100877750923, "ewc_loss": 0.018432265147566795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8432265278534032e-05, "grad_norm": 13.853043556213379, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8399068117141724, "num_tokens": 112257699.0, "step": 2939 }, { "epoch": 0.37399821905609976, "ewc_loss": 0.01845192350447178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8451923097018152e-05, "grad_norm": 13.820365905761719, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8606460690498352, "num_tokens": 112292431.0, "step": 2940 }, { "epoch": 0.37412542933469023, "ewc_loss": 0.018437035381793976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.843703466875013e-05, "grad_norm": 13.837727546691895, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8525960445404053, "num_tokens": 112331241.0, "step": 2941 }, { "epoch": 0.37425263961328076, "ewc_loss": 0.01846052147448063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8460521459928714e-05, "grad_norm": 13.773655891418457, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8488593697547913, "num_tokens": 112368486.0, "step": 2942 }, { "epoch": 0.3743798498918713, "ewc_loss": 0.018512431532144547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8512431779527105e-05, "grad_norm": 13.979849815368652, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8431251049041748, "num_tokens": 112409638.0, "step": 2943 }, { "epoch": 0.37450706017046176, "ewc_loss": 0.01853286474943161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8532864487497136e-05, "grad_norm": 13.795801162719727, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8476849794387817, "num_tokens": 112448257.0, "step": 2944 }, { "epoch": 0.3746342704490523, "ewc_loss": 0.018530121073126793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.853012145147659e-05, "grad_norm": 14.100334167480469, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8663403391838074, "num_tokens": 112488739.0, "step": 2945 }, { "epoch": 0.3747614807276428, "ewc_loss": 0.01854388788342476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8543887563282624e-05, "grad_norm": 13.811102867126465, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8372139930725098, "num_tokens": 112528621.0, "step": 2946 }, { "epoch": 0.3748886910062333, "ewc_loss": 0.018428193405270576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8428192561259493e-05, "grad_norm": 14.104850769042969, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8467203378677368, "num_tokens": 112564615.0, "step": 2947 }, { "epoch": 0.3750159012848238, "ewc_loss": 0.01859297603368759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8592976630316116e-05, "grad_norm": 13.935276985168457, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8409345149993896, "num_tokens": 112598190.0, "step": 2948 }, { "epoch": 0.37514311156341434, "ewc_loss": 0.018375050276517868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.83750507858349e-05, "grad_norm": 13.798545837402344, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8515396118164062, "num_tokens": 112632078.0, "step": 2949 }, { "epoch": 0.3752703218420048, "ewc_loss": 0.01848394051194191, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8483940948499367e-05, "grad_norm": 13.946250915527344, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8292795419692993, "num_tokens": 112667848.0, "step": 2950 }, { "epoch": 0.37539753212059535, "ewc_loss": 0.018487341701984406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8487342458683997e-05, "grad_norm": 13.880057334899902, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8501411080360413, "num_tokens": 112707037.0, "step": 2951 }, { "epoch": 0.3755247423991859, "ewc_loss": 0.01845565438270569, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8455653844284825e-05, "grad_norm": 13.797091484069824, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8613132238388062, "num_tokens": 112751921.0, "step": 2952 }, { "epoch": 0.37565195267777635, "ewc_loss": 0.018530357629060745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.853035792009905e-05, "grad_norm": 13.985538482666016, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8559582233428955, "num_tokens": 112793061.0, "step": 2953 }, { "epoch": 0.3757791629563669, "ewc_loss": 0.018491823226213455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.849182262958493e-05, "grad_norm": 13.840816497802734, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8616692423820496, "num_tokens": 112833852.0, "step": 2954 }, { "epoch": 0.3759063732349574, "ewc_loss": 0.018405571579933167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8405571609036997e-05, "grad_norm": 13.878581047058105, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8483273983001709, "num_tokens": 112866718.0, "step": 2955 }, { "epoch": 0.3760335835135479, "ewc_loss": 0.01846453547477722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.846453596954234e-05, "grad_norm": 13.761116981506348, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.852886438369751, "num_tokens": 112909420.0, "step": 2956 }, { "epoch": 0.3761607937921384, "ewc_loss": 0.01846415549516678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8464155800757e-05, "grad_norm": 13.861063957214355, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8543105125427246, "num_tokens": 112944570.0, "step": 2957 }, { "epoch": 0.37628800407072893, "ewc_loss": 0.018537063151597977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.853706271504052e-05, "grad_norm": 13.915390014648438, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8648937344551086, "num_tokens": 112984107.0, "step": 2958 }, { "epoch": 0.3764152143493194, "ewc_loss": 0.01847195439040661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.847195380833e-05, "grad_norm": 13.780265808105469, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8749052286148071, "num_tokens": 113023066.0, "step": 2959 }, { "epoch": 0.37654242462790993, "ewc_loss": 0.018457939848303795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8457940313965082e-05, "grad_norm": 13.89109992980957, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8501651287078857, "num_tokens": 113060093.0, "step": 2960 }, { "epoch": 0.37666963490650046, "ewc_loss": 0.018496574833989143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8496575648896396e-05, "grad_norm": 13.92126750946045, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.843931257724762, "num_tokens": 113095940.0, "step": 2961 }, { "epoch": 0.37679684518509093, "ewc_loss": 0.01847151853144169, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8471519069862552e-05, "grad_norm": 13.871736526489258, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8632537722587585, "num_tokens": 113130699.0, "step": 2962 }, { "epoch": 0.37692405546368146, "ewc_loss": 0.01847834140062332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8478342099115252e-05, "grad_norm": 13.835226058959961, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8521980047225952, "num_tokens": 113169145.0, "step": 2963 }, { "epoch": 0.377051265742272, "ewc_loss": 0.018485553562641144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8485554392100312e-05, "grad_norm": 13.844673156738281, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8663371205329895, "num_tokens": 113201987.0, "step": 2964 }, { "epoch": 0.37717847602086246, "ewc_loss": 0.01850346475839615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8503464161767624e-05, "grad_norm": 13.89437484741211, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.857999324798584, "num_tokens": 113239588.0, "step": 2965 }, { "epoch": 0.377305686299453, "ewc_loss": 0.018529701977968216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.852970126492437e-05, "grad_norm": 13.806720733642578, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8548012375831604, "num_tokens": 113277075.0, "step": 2966 }, { "epoch": 0.3774328965780435, "ewc_loss": 0.018513834103941917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.851383422035724e-05, "grad_norm": 13.89934253692627, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8804821968078613, "num_tokens": 113313708.0, "step": 2967 }, { "epoch": 0.377560106856634, "ewc_loss": 0.018551615998148918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.855161644925829e-05, "grad_norm": 13.856461524963379, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8336259126663208, "num_tokens": 113344027.0, "step": 2968 }, { "epoch": 0.3776873171352245, "ewc_loss": 0.018502768129110336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8502767488826066e-05, "grad_norm": 13.900825500488281, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8571142554283142, "num_tokens": 113388444.0, "step": 2969 }, { "epoch": 0.37781452741381505, "ewc_loss": 0.01855490542948246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.85549051820999e-05, "grad_norm": 13.812376022338867, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8476485013961792, "num_tokens": 113428660.0, "step": 2970 }, { "epoch": 0.3779417376924055, "ewc_loss": 0.018542582169175148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8542581528890878e-05, "grad_norm": 13.897982597351074, "learning_rate": 1e-06, "loss": 0.528, "mean_token_accuracy": 0.8318710327148438, "num_tokens": 113469160.0, "step": 2971 }, { "epoch": 0.37806894797099605, "ewc_loss": 0.018580907955765724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.858090763562359e-05, "grad_norm": 13.808805465698242, "learning_rate": 1e-06, "loss": 0.5303, "mean_token_accuracy": 0.8278312683105469, "num_tokens": 113504291.0, "step": 2972 }, { "epoch": 0.3781961582495866, "ewc_loss": 0.018537625670433044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8537624782766216e-05, "grad_norm": 13.873880386352539, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.853873610496521, "num_tokens": 113548085.0, "step": 2973 }, { "epoch": 0.37832336852817705, "ewc_loss": 0.018587937578558922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8587938029668294e-05, "grad_norm": 13.835861206054688, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.856459379196167, "num_tokens": 113581664.0, "step": 2974 }, { "epoch": 0.3784505788067676, "ewc_loss": 0.018580522388219833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8580522009870037e-05, "grad_norm": 13.892338752746582, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8657783269882202, "num_tokens": 113618552.0, "step": 2975 }, { "epoch": 0.3785777890853581, "ewc_loss": 0.01859033666551113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.859033727669157e-05, "grad_norm": 13.896721839904785, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8323521614074707, "num_tokens": 113653406.0, "step": 2976 }, { "epoch": 0.3787049993639486, "ewc_loss": 0.018582886084914207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8582886696094647e-05, "grad_norm": 13.8724365234375, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8609912395477295, "num_tokens": 113693803.0, "step": 2977 }, { "epoch": 0.3788322096425391, "ewc_loss": 0.01857006922364235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8570070096757263e-05, "grad_norm": 13.840968132019043, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8668986558914185, "num_tokens": 113734637.0, "step": 2978 }, { "epoch": 0.37895941992112964, "ewc_loss": 0.018604803830385208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.860480369941797e-05, "grad_norm": 13.882339477539062, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.845292329788208, "num_tokens": 113776579.0, "step": 2979 }, { "epoch": 0.37908663019972016, "ewc_loss": 0.018574723973870277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8574723071651533e-05, "grad_norm": 13.799708366394043, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.842154324054718, "num_tokens": 113818857.0, "step": 2980 }, { "epoch": 0.37921384047831064, "ewc_loss": 0.018544694408774376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8544695194577798e-05, "grad_norm": 13.906017303466797, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8442491292953491, "num_tokens": 113853248.0, "step": 2981 }, { "epoch": 0.37934105075690117, "ewc_loss": 0.018610099330544472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8610098777571693e-05, "grad_norm": 13.901344299316406, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8586301803588867, "num_tokens": 113894206.0, "step": 2982 }, { "epoch": 0.3794682610354917, "ewc_loss": 0.018584247678518295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.85842473001685e-05, "grad_norm": 13.901510238647461, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8603284358978271, "num_tokens": 113931309.0, "step": 2983 }, { "epoch": 0.37959547131408217, "ewc_loss": 0.018585199490189552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8585198631626554e-05, "grad_norm": 13.828861236572266, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8418899178504944, "num_tokens": 113976559.0, "step": 2984 }, { "epoch": 0.3797226815926727, "ewc_loss": 0.01855679415166378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8556793293100782e-05, "grad_norm": 13.911911964416504, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8514145612716675, "num_tokens": 114010814.0, "step": 2985 }, { "epoch": 0.3798498918712632, "ewc_loss": 0.01862405240535736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8624052245286293e-05, "grad_norm": 13.92440414428711, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8717019557952881, "num_tokens": 114049389.0, "step": 2986 }, { "epoch": 0.3799771021498537, "ewc_loss": 0.01854345202445984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8543452824815176e-05, "grad_norm": 13.89630126953125, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8707998394966125, "num_tokens": 114081463.0, "step": 2987 }, { "epoch": 0.3801043124284442, "ewc_loss": 0.018556252121925354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8556251234258525e-05, "grad_norm": 13.864921569824219, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8680210113525391, "num_tokens": 114117175.0, "step": 2988 }, { "epoch": 0.38023152270703475, "ewc_loss": 0.01856626570224762, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8566264770925045e-05, "grad_norm": 13.933452606201172, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8418067693710327, "num_tokens": 114153764.0, "step": 2989 }, { "epoch": 0.3803587329856252, "ewc_loss": 0.018563847988843918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8563847334007733e-05, "grad_norm": 13.866924285888672, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8520365953445435, "num_tokens": 114191487.0, "step": 2990 }, { "epoch": 0.38048594326421575, "ewc_loss": 0.018543604761362076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8543605619925074e-05, "grad_norm": 13.900291442871094, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.828049898147583, "num_tokens": 114233880.0, "step": 2991 }, { "epoch": 0.3806131535428063, "ewc_loss": 0.018605176359415054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8605176592245698e-05, "grad_norm": 13.922004699707031, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.851099967956543, "num_tokens": 114269238.0, "step": 2992 }, { "epoch": 0.38074036382139675, "ewc_loss": 0.018591657280921936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8591657862998545e-05, "grad_norm": 13.892789840698242, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8644975423812866, "num_tokens": 114307645.0, "step": 2993 }, { "epoch": 0.3808675740999873, "ewc_loss": 0.018553784117102623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8553784684627317e-05, "grad_norm": 13.903022766113281, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8559421300888062, "num_tokens": 114341800.0, "step": 2994 }, { "epoch": 0.3809947843785778, "ewc_loss": 0.01859372854232788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.859372787293978e-05, "grad_norm": 13.933429718017578, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8612021207809448, "num_tokens": 114378363.0, "step": 2995 }, { "epoch": 0.3811219946571683, "ewc_loss": 0.01859738491475582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8597384041640908e-05, "grad_norm": 13.88498306274414, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8418859243392944, "num_tokens": 114414489.0, "step": 2996 }, { "epoch": 0.3812492049357588, "ewc_loss": 0.01859392039477825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8593920685816556e-05, "grad_norm": 13.960759162902832, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8479049205780029, "num_tokens": 114449197.0, "step": 2997 }, { "epoch": 0.38137641521434934, "ewc_loss": 0.018604852259159088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8604852812131867e-05, "grad_norm": 13.903020858764648, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8502969145774841, "num_tokens": 114478654.0, "step": 2998 }, { "epoch": 0.3815036254929398, "ewc_loss": 0.01859726943075657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8597269445308484e-05, "grad_norm": 13.871230125427246, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8607299327850342, "num_tokens": 114521807.0, "step": 2999 }, { "epoch": 0.38163083577153034, "ewc_loss": 0.018595287576317787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.859528674685862e-05, "grad_norm": 13.889424324035645, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8663357496261597, "num_tokens": 114558297.0, "step": 3000 }, { "epoch": 0.38175804605012087, "ewc_loss": 0.018641391769051552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8641392671270296e-05, "grad_norm": 13.898359298706055, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8590611219406128, "num_tokens": 114599630.0, "step": 3001 }, { "epoch": 0.38188525632871134, "ewc_loss": 0.018634969368577003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8634969819686376e-05, "grad_norm": 13.87154769897461, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8504043817520142, "num_tokens": 114638697.0, "step": 3002 }, { "epoch": 0.38201246660730187, "ewc_loss": 0.01861407235264778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8614071450429037e-05, "grad_norm": 13.921082496643066, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8533320426940918, "num_tokens": 114676106.0, "step": 3003 }, { "epoch": 0.3821396768858924, "ewc_loss": 0.018652748316526413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8652748622116633e-05, "grad_norm": 13.893077850341797, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8510263562202454, "num_tokens": 114712026.0, "step": 3004 }, { "epoch": 0.38226688716448287, "ewc_loss": 0.018625156953930855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8625156371854246e-05, "grad_norm": 13.8836030960083, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8310210704803467, "num_tokens": 114752348.0, "step": 3005 }, { "epoch": 0.3823940974430734, "ewc_loss": 0.018677707761526108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8677706975722685e-05, "grad_norm": 13.918079376220703, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8567671775817871, "num_tokens": 114789922.0, "step": 3006 }, { "epoch": 0.38252130772166393, "ewc_loss": 0.01865525357425213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8655253370525315e-05, "grad_norm": 13.860820770263672, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8468659520149231, "num_tokens": 114838834.0, "step": 3007 }, { "epoch": 0.3826485180002544, "ewc_loss": 0.018680108711123466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8680108041735366e-05, "grad_norm": 13.953157424926758, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8322215676307678, "num_tokens": 114873320.0, "step": 3008 }, { "epoch": 0.38277572827884493, "ewc_loss": 0.018660886213183403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8660886780708097e-05, "grad_norm": 13.926397323608398, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8649044036865234, "num_tokens": 114908293.0, "step": 3009 }, { "epoch": 0.38290293855743546, "ewc_loss": 0.01864914409816265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8649143385118805e-05, "grad_norm": 13.929815292358398, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8427125215530396, "num_tokens": 114950083.0, "step": 3010 }, { "epoch": 0.38303014883602593, "ewc_loss": 0.018651336431503296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.865133708633948e-05, "grad_norm": 13.888848304748535, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8570477962493896, "num_tokens": 114982353.0, "step": 3011 }, { "epoch": 0.38315735911461646, "ewc_loss": 0.0186192337423563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.86192337423563e-05, "grad_norm": 13.93561840057373, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8584486246109009, "num_tokens": 115017668.0, "step": 3012 }, { "epoch": 0.383284569393207, "ewc_loss": 0.018669551238417625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8669550627237186e-05, "grad_norm": 13.843632698059082, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8642102479934692, "num_tokens": 115055266.0, "step": 3013 }, { "epoch": 0.38341177967179746, "ewc_loss": 0.018582113087177277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.858211362559814e-05, "grad_norm": 13.877117156982422, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8479607105255127, "num_tokens": 115089728.0, "step": 3014 }, { "epoch": 0.383538989950388, "ewc_loss": 0.018693340942263603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8693341189646162e-05, "grad_norm": 13.926132202148438, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8379985094070435, "num_tokens": 115135192.0, "step": 3015 }, { "epoch": 0.3836662002289785, "ewc_loss": 0.018660133704543114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.866013371909503e-05, "grad_norm": 13.862494468688965, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.8360768556594849, "num_tokens": 115170350.0, "step": 3016 }, { "epoch": 0.383793410507569, "ewc_loss": 0.018678676337003708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8678676497074775e-05, "grad_norm": 13.966438293457031, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8592821359634399, "num_tokens": 115202724.0, "step": 3017 }, { "epoch": 0.3839206207861595, "ewc_loss": 0.01870499923825264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8704999092733487e-05, "grad_norm": 13.96337604522705, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8529717922210693, "num_tokens": 115239274.0, "step": 3018 }, { "epoch": 0.38404783106475004, "ewc_loss": 0.018654393032193184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8654392988537438e-05, "grad_norm": 13.935458183288574, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8614017963409424, "num_tokens": 115278330.0, "step": 3019 }, { "epoch": 0.3841750413433405, "ewc_loss": 0.018677381798624992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.867738137661945e-05, "grad_norm": 13.83255672454834, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8482331037521362, "num_tokens": 115321258.0, "step": 3020 }, { "epoch": 0.38430225162193105, "ewc_loss": 0.01866498589515686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8664984963834286e-05, "grad_norm": 13.915322303771973, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8546682000160217, "num_tokens": 115362609.0, "step": 3021 }, { "epoch": 0.3844294619005216, "ewc_loss": 0.018670126795768738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.867012724687811e-05, "grad_norm": 13.86347770690918, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.8260862231254578, "num_tokens": 115405814.0, "step": 3022 }, { "epoch": 0.38455667217911205, "ewc_loss": 0.018686454743146896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8686454495764337e-05, "grad_norm": 13.948158264160156, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8467360734939575, "num_tokens": 115440856.0, "step": 3023 }, { "epoch": 0.3846838824577026, "ewc_loss": 0.018675804138183594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8675804312806576e-05, "grad_norm": 13.868144989013672, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8582828044891357, "num_tokens": 115482582.0, "step": 3024 }, { "epoch": 0.3848110927362931, "ewc_loss": 0.018684756010770798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8684755559661426e-05, "grad_norm": 13.889530181884766, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.843937873840332, "num_tokens": 115527268.0, "step": 3025 }, { "epoch": 0.3849383030148836, "ewc_loss": 0.0187094584107399, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8709459254750982e-05, "grad_norm": 13.962153434753418, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8471433520317078, "num_tokens": 115563757.0, "step": 3026 }, { "epoch": 0.3850655132934741, "ewc_loss": 0.018663588911294937, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8663587979972363e-05, "grad_norm": 13.85682487487793, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8566524386405945, "num_tokens": 115599629.0, "step": 3027 }, { "epoch": 0.38519272357206463, "ewc_loss": 0.01866164803504944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.866164711827878e-05, "grad_norm": 13.954726219177246, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8568577766418457, "num_tokens": 115634261.0, "step": 3028 }, { "epoch": 0.3853199338506551, "ewc_loss": 0.018729211762547493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8729211660684086e-05, "grad_norm": 13.912079811096191, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8658790588378906, "num_tokens": 115671150.0, "step": 3029 }, { "epoch": 0.38544714412924563, "ewc_loss": 0.018666986376047134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8666985852178186e-05, "grad_norm": 13.889360427856445, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8669619560241699, "num_tokens": 115714328.0, "step": 3030 }, { "epoch": 0.38557435440783616, "ewc_loss": 0.018705181777477264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8705180991673842e-05, "grad_norm": 13.86727237701416, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.868110179901123, "num_tokens": 115747118.0, "step": 3031 }, { "epoch": 0.3857015646864267, "ewc_loss": 0.01868540421128273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.868540493887849e-05, "grad_norm": 13.878026962280273, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8326205611228943, "num_tokens": 115790309.0, "step": 3032 }, { "epoch": 0.38582877496501716, "ewc_loss": 0.018706701695919037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8706701666815206e-05, "grad_norm": 13.86578369140625, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8649377822875977, "num_tokens": 115825630.0, "step": 3033 }, { "epoch": 0.3859559852436077, "ewc_loss": 0.01868809200823307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.868809158622753e-05, "grad_norm": 13.898432731628418, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8602187037467957, "num_tokens": 115858848.0, "step": 3034 }, { "epoch": 0.3860831955221982, "ewc_loss": 0.018717102706432343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.871710264822468e-05, "grad_norm": 13.885408401489258, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8622076511383057, "num_tokens": 115900024.0, "step": 3035 }, { "epoch": 0.3862104058007887, "ewc_loss": 0.018698440864682198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8698441635933705e-05, "grad_norm": 13.921915054321289, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.828596830368042, "num_tokens": 115938899.0, "step": 3036 }, { "epoch": 0.3863376160793792, "ewc_loss": 0.018696723505854607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8696722690947354e-05, "grad_norm": 13.906700134277344, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8616079092025757, "num_tokens": 115972199.0, "step": 3037 }, { "epoch": 0.38646482635796975, "ewc_loss": 0.018718037754297256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8718037608778104e-05, "grad_norm": 13.872823715209961, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8623061180114746, "num_tokens": 116011506.0, "step": 3038 }, { "epoch": 0.3865920366365602, "ewc_loss": 0.01869232952594757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.869232983153779e-05, "grad_norm": 13.915249824523926, "learning_rate": 1e-06, "loss": 0.5364, "mean_token_accuracy": 0.8302064538002014, "num_tokens": 116051428.0, "step": 3039 }, { "epoch": 0.38671924691515075, "ewc_loss": 0.018702905625104904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8702905435930006e-05, "grad_norm": 13.87480354309082, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8619053363800049, "num_tokens": 116090307.0, "step": 3040 }, { "epoch": 0.3868464571937413, "ewc_loss": 0.01868402771651745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8684027963900007e-05, "grad_norm": 13.966642379760742, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8434181213378906, "num_tokens": 116123381.0, "step": 3041 }, { "epoch": 0.38697366747233175, "ewc_loss": 0.01874387264251709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8743872715276666e-05, "grad_norm": 13.898895263671875, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.856381893157959, "num_tokens": 116157755.0, "step": 3042 }, { "epoch": 0.3871008777509223, "ewc_loss": 0.018706677481532097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.870667801995296e-05, "grad_norm": 13.91994571685791, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8535926938056946, "num_tokens": 116197371.0, "step": 3043 }, { "epoch": 0.3872280880295128, "ewc_loss": 0.018753521144390106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8753520635073073e-05, "grad_norm": 13.866000175476074, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8384402394294739, "num_tokens": 116240544.0, "step": 3044 }, { "epoch": 0.3873552983081033, "ewc_loss": 0.018697338178753853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8697337509365752e-05, "grad_norm": 13.905121803283691, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8504034280776978, "num_tokens": 116272079.0, "step": 3045 }, { "epoch": 0.3874825085866938, "ewc_loss": 0.0187666155397892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.87666155397892e-05, "grad_norm": 13.882064819335938, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.850878119468689, "num_tokens": 116309641.0, "step": 3046 }, { "epoch": 0.38760971886528434, "ewc_loss": 0.018758807331323624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8758806618279777e-05, "grad_norm": 13.903572082519531, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8476430177688599, "num_tokens": 116344470.0, "step": 3047 }, { "epoch": 0.3877369291438748, "ewc_loss": 0.018773281946778297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8773282135953195e-05, "grad_norm": 13.92188549041748, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8573588132858276, "num_tokens": 116380177.0, "step": 3048 }, { "epoch": 0.38786413942246534, "ewc_loss": 0.01877603307366371, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8776032447931357e-05, "grad_norm": 13.88747501373291, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8634644746780396, "num_tokens": 116416482.0, "step": 3049 }, { "epoch": 0.38799134970105587, "ewc_loss": 0.01878095604479313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8780956452246755e-05, "grad_norm": 13.919408798217773, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8407676815986633, "num_tokens": 116456103.0, "step": 3050 }, { "epoch": 0.38811855997964634, "ewc_loss": 0.018770482391119003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.877048271126114e-05, "grad_norm": 13.846664428710938, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8590476512908936, "num_tokens": 116496810.0, "step": 3051 }, { "epoch": 0.38824577025823687, "ewc_loss": 0.01875893399119377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8758933947538026e-05, "grad_norm": 13.89447021484375, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8494745492935181, "num_tokens": 116532999.0, "step": 3052 }, { "epoch": 0.3883729805368274, "ewc_loss": 0.01879543997347355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8795439245877787e-05, "grad_norm": 13.885054588317871, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8614655137062073, "num_tokens": 116565700.0, "step": 3053 }, { "epoch": 0.38850019081541787, "ewc_loss": 0.018814878538250923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8814878785633482e-05, "grad_norm": 13.871390342712402, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8567953109741211, "num_tokens": 116602798.0, "step": 3054 }, { "epoch": 0.3886274010940084, "ewc_loss": 0.018824510276317596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8824510334525257e-05, "grad_norm": 13.891912460327148, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8634549379348755, "num_tokens": 116638743.0, "step": 3055 }, { "epoch": 0.3887546113725989, "ewc_loss": 0.018824616447091103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8824615835910663e-05, "grad_norm": 13.90054988861084, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8656474351882935, "num_tokens": 116672290.0, "step": 3056 }, { "epoch": 0.3888818216511894, "ewc_loss": 0.018826456740498543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.882645665318705e-05, "grad_norm": 13.890876770019531, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8547903299331665, "num_tokens": 116712593.0, "step": 3057 }, { "epoch": 0.3890090319297799, "ewc_loss": 0.018848763778805733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8848762920242734e-05, "grad_norm": 13.905205726623535, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8510863184928894, "num_tokens": 116748663.0, "step": 3058 }, { "epoch": 0.38913624220837045, "ewc_loss": 0.018884573131799698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.888457336463034e-05, "grad_norm": 13.959335327148438, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8497604131698608, "num_tokens": 116796018.0, "step": 3059 }, { "epoch": 0.3892634524869609, "ewc_loss": 0.018866851925849915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8866852769860998e-05, "grad_norm": 13.908551216125488, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.850488007068634, "num_tokens": 116837287.0, "step": 3060 }, { "epoch": 0.38939066276555145, "ewc_loss": 0.018844205886125565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8844206351786852e-05, "grad_norm": 13.947033882141113, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8372166752815247, "num_tokens": 116872163.0, "step": 3061 }, { "epoch": 0.389517873044142, "ewc_loss": 0.018866892904043198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8866892787627876e-05, "grad_norm": 13.919686317443848, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8611109256744385, "num_tokens": 116913445.0, "step": 3062 }, { "epoch": 0.38964508332273246, "ewc_loss": 0.018826527521014214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.882652759377379e-05, "grad_norm": 13.871633529663086, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8434876203536987, "num_tokens": 116954469.0, "step": 3063 }, { "epoch": 0.389772293601323, "ewc_loss": 0.01883798837661743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8837989046005532e-05, "grad_norm": 13.924980163574219, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8524385094642639, "num_tokens": 116994339.0, "step": 3064 }, { "epoch": 0.3898995038799135, "ewc_loss": 0.01884480193257332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.884480116132181e-05, "grad_norm": 13.921802520751953, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8648905754089355, "num_tokens": 117030867.0, "step": 3065 }, { "epoch": 0.390026714158504, "ewc_loss": 0.01882915385067463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.882915421447251e-05, "grad_norm": 13.954010009765625, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8622898459434509, "num_tokens": 117068309.0, "step": 3066 }, { "epoch": 0.3901539244370945, "ewc_loss": 0.01880786381661892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8807862943504006e-05, "grad_norm": 13.889483451843262, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.849096953868866, "num_tokens": 117105278.0, "step": 3067 }, { "epoch": 0.39028113471568504, "ewc_loss": 0.01879291981458664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8792919945553876e-05, "grad_norm": 13.905576705932617, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8453453779220581, "num_tokens": 117143096.0, "step": 3068 }, { "epoch": 0.3904083449942755, "ewc_loss": 0.018817663192749023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.881766365841031e-05, "grad_norm": 13.895934104919434, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8537262678146362, "num_tokens": 117179638.0, "step": 3069 }, { "epoch": 0.39053555527286604, "ewc_loss": 0.018847528845071793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.884752964542713e-05, "grad_norm": 13.940886497497559, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8662344813346863, "num_tokens": 117211560.0, "step": 3070 }, { "epoch": 0.39066276555145657, "ewc_loss": 0.018848637118935585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.884863740997389e-05, "grad_norm": 13.946565628051758, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8457472324371338, "num_tokens": 117250622.0, "step": 3071 }, { "epoch": 0.39078997583004704, "ewc_loss": 0.018836265429854393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.883626464405097e-05, "grad_norm": 13.893075942993164, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8729478716850281, "num_tokens": 117287600.0, "step": 3072 }, { "epoch": 0.39091718610863757, "ewc_loss": 0.018795449286699295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.879545015981421e-05, "grad_norm": 13.89260196685791, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8382713198661804, "num_tokens": 117328188.0, "step": 3073 }, { "epoch": 0.3910443963872281, "ewc_loss": 0.018837086856365204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8837086827261373e-05, "grad_norm": 13.856677055358887, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8613929748535156, "num_tokens": 117367282.0, "step": 3074 }, { "epoch": 0.39117160666581857, "ewc_loss": 0.01880250684916973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8802506019710563e-05, "grad_norm": 13.927600860595703, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8639898300170898, "num_tokens": 117403376.0, "step": 3075 }, { "epoch": 0.3912988169444091, "ewc_loss": 0.018872177228331566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8872176951845177e-05, "grad_norm": 13.901261329650879, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8433469533920288, "num_tokens": 117438661.0, "step": 3076 }, { "epoch": 0.39142602722299963, "ewc_loss": 0.01879000850021839, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.87900077435188e-05, "grad_norm": 13.89180850982666, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8578109741210938, "num_tokens": 117477351.0, "step": 3077 }, { "epoch": 0.3915532375015901, "ewc_loss": 0.01884232647716999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.884232733573299e-05, "grad_norm": 13.9098482131958, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8510736227035522, "num_tokens": 117512773.0, "step": 3078 }, { "epoch": 0.39168044778018063, "ewc_loss": 0.018868321552872658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.886832069430966e-05, "grad_norm": 13.936131477355957, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.838498055934906, "num_tokens": 117555370.0, "step": 3079 }, { "epoch": 0.39180765805877116, "ewc_loss": 0.01885918341577053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.885918391053565e-05, "grad_norm": 13.891632080078125, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8611690998077393, "num_tokens": 117594208.0, "step": 3080 }, { "epoch": 0.3919348683373617, "ewc_loss": 0.01883627660572529, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8836277376976795e-05, "grad_norm": 13.944416999816895, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8556708097457886, "num_tokens": 117632715.0, "step": 3081 }, { "epoch": 0.39206207861595216, "ewc_loss": 0.018857264891266823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8857264876714908e-05, "grad_norm": 13.899236679077148, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.848816990852356, "num_tokens": 117669221.0, "step": 3082 }, { "epoch": 0.3921892888945427, "ewc_loss": 0.0188166294246912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8816628653439693e-05, "grad_norm": 13.917208671569824, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8477210998535156, "num_tokens": 117709559.0, "step": 3083 }, { "epoch": 0.3923164991731332, "ewc_loss": 0.018834374845027924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8834374714060687e-05, "grad_norm": 13.851378440856934, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8609140515327454, "num_tokens": 117745240.0, "step": 3084 }, { "epoch": 0.3924437094517237, "ewc_loss": 0.018818391487002373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.881839125417173e-05, "grad_norm": 13.906817436218262, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8546164035797119, "num_tokens": 117787897.0, "step": 3085 }, { "epoch": 0.3925709197303142, "ewc_loss": 0.018857911229133606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8857910617953166e-05, "grad_norm": 13.910606384277344, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8501874208450317, "num_tokens": 117826862.0, "step": 3086 }, { "epoch": 0.39269813000890474, "ewc_loss": 0.018868014216423035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8868015104089864e-05, "grad_norm": 13.977127075195312, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8484793901443481, "num_tokens": 117866932.0, "step": 3087 }, { "epoch": 0.3928253402874952, "ewc_loss": 0.018863176926970482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.886317659227643e-05, "grad_norm": 13.924385070800781, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8541218042373657, "num_tokens": 117905538.0, "step": 3088 }, { "epoch": 0.39295255056608575, "ewc_loss": 0.01884336955845356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.884336961666122e-05, "grad_norm": 13.939708709716797, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8420352935791016, "num_tokens": 117945123.0, "step": 3089 }, { "epoch": 0.3930797608446763, "ewc_loss": 0.018838895484805107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8838894902728498e-05, "grad_norm": 13.918983459472656, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.856061577796936, "num_tokens": 117988743.0, "step": 3090 }, { "epoch": 0.39320697112326675, "ewc_loss": 0.018857646733522415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8857646864489652e-05, "grad_norm": 13.936357498168945, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8685864210128784, "num_tokens": 118026078.0, "step": 3091 }, { "epoch": 0.3933341814018573, "ewc_loss": 0.018824096769094467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8824097423930652e-05, "grad_norm": 13.931234359741211, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8656010627746582, "num_tokens": 118060105.0, "step": 3092 }, { "epoch": 0.3934613916804478, "ewc_loss": 0.018841005861759186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8841006749426015e-05, "grad_norm": 13.991125106811523, "learning_rate": 1e-06, "loss": 0.3662, "mean_token_accuracy": 0.8804504871368408, "num_tokens": 118089677.0, "step": 3093 }, { "epoch": 0.3935886019590383, "ewc_loss": 0.018859371542930603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8859371266444214e-05, "grad_norm": 13.930394172668457, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8669055700302124, "num_tokens": 118128449.0, "step": 3094 }, { "epoch": 0.3937158122376288, "ewc_loss": 0.018802162259817123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8802162230713293e-05, "grad_norm": 13.97648811340332, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.862830638885498, "num_tokens": 118168411.0, "step": 3095 }, { "epoch": 0.39384302251621933, "ewc_loss": 0.018827468156814575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8827468011295423e-05, "grad_norm": 13.957817077636719, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8647893667221069, "num_tokens": 118206097.0, "step": 3096 }, { "epoch": 0.3939702327948098, "ewc_loss": 0.018840434029698372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8840433767763898e-05, "grad_norm": 13.975221633911133, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8672564029693604, "num_tokens": 118244429.0, "step": 3097 }, { "epoch": 0.39409744307340033, "ewc_loss": 0.01880122534930706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8801225451170467e-05, "grad_norm": 13.973675727844238, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8597513437271118, "num_tokens": 118279125.0, "step": 3098 }, { "epoch": 0.39422465335199086, "ewc_loss": 0.018793245777487755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.879324554465711e-05, "grad_norm": 13.990341186523438, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8516455292701721, "num_tokens": 118313156.0, "step": 3099 }, { "epoch": 0.39435186363058133, "ewc_loss": 0.018791697919368744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8791697584674694e-05, "grad_norm": 14.031179428100586, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8394792079925537, "num_tokens": 118349868.0, "step": 3100 }, { "epoch": 0.39447907390917186, "ewc_loss": 0.018813898786902428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.881389835034497e-05, "grad_norm": 13.948952674865723, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8452171087265015, "num_tokens": 118394998.0, "step": 3101 }, { "epoch": 0.3946062841877624, "ewc_loss": 0.018770696595311165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8770697352010757e-05, "grad_norm": 13.953506469726562, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8655362129211426, "num_tokens": 118436002.0, "step": 3102 }, { "epoch": 0.39473349446635286, "ewc_loss": 0.018803248181939125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.880324816738721e-05, "grad_norm": 13.954170227050781, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8594265580177307, "num_tokens": 118475094.0, "step": 3103 }, { "epoch": 0.3948607047449434, "ewc_loss": 0.01881803572177887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8818036551238038e-05, "grad_norm": 13.965704917907715, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8517879843711853, "num_tokens": 118514164.0, "step": 3104 }, { "epoch": 0.3949879150235339, "ewc_loss": 0.018817802891135216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8817803720594384e-05, "grad_norm": 13.92615032196045, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8501861095428467, "num_tokens": 118554271.0, "step": 3105 }, { "epoch": 0.3951151253021244, "ewc_loss": 0.018831059336662292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8831058696378022e-05, "grad_norm": 13.990382194519043, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8546288013458252, "num_tokens": 118597123.0, "step": 3106 }, { "epoch": 0.3952423355807149, "ewc_loss": 0.018862441182136536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.88624417205574e-05, "grad_norm": 13.930365562438965, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8488953113555908, "num_tokens": 118636683.0, "step": 3107 }, { "epoch": 0.39536954585930545, "ewc_loss": 0.018764328211545944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8764329070108943e-05, "grad_norm": 13.944531440734863, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8570793271064758, "num_tokens": 118665770.0, "step": 3108 }, { "epoch": 0.3954967561378959, "ewc_loss": 0.0188450925052166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.884509219962638e-05, "grad_norm": 13.970969200134277, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8643726110458374, "num_tokens": 118701652.0, "step": 3109 }, { "epoch": 0.39562396641648645, "ewc_loss": 0.018801802769303322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.880180207081139e-05, "grad_norm": 13.896934509277344, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8605321645736694, "num_tokens": 118741756.0, "step": 3110 }, { "epoch": 0.395751176695077, "ewc_loss": 0.018856128677725792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.885612800833769e-05, "grad_norm": 13.939095497131348, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8545217514038086, "num_tokens": 118784320.0, "step": 3111 }, { "epoch": 0.39587838697366745, "ewc_loss": 0.018828170374035835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.882817014120519e-05, "grad_norm": 13.923807144165039, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8484461307525635, "num_tokens": 118821456.0, "step": 3112 }, { "epoch": 0.396005597252258, "ewc_loss": 0.018838629126548767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.883862933027558e-05, "grad_norm": 13.950143814086914, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8523820042610168, "num_tokens": 118861746.0, "step": 3113 }, { "epoch": 0.3961328075308485, "ewc_loss": 0.018866801634430885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.88668018381577e-05, "grad_norm": 13.930763244628906, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8589901924133301, "num_tokens": 118900346.0, "step": 3114 }, { "epoch": 0.396260017809439, "ewc_loss": 0.018847335129976273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.884733501356095e-05, "grad_norm": 13.90896224975586, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8546635508537292, "num_tokens": 118939707.0, "step": 3115 }, { "epoch": 0.3963872280880295, "ewc_loss": 0.018884003162384033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.888400402094703e-05, "grad_norm": 14.01419448852539, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8541874289512634, "num_tokens": 118971154.0, "step": 3116 }, { "epoch": 0.39651443836662004, "ewc_loss": 0.018914714455604553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8914714019047096e-05, "grad_norm": 13.967418670654297, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.8395781517028809, "num_tokens": 119012344.0, "step": 3117 }, { "epoch": 0.3966416486452105, "ewc_loss": 0.018873678520321846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8873679437092505e-05, "grad_norm": 13.92270565032959, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8535453081130981, "num_tokens": 119047677.0, "step": 3118 }, { "epoch": 0.39676885892380104, "ewc_loss": 0.018886849284172058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8886848920374177e-05, "grad_norm": 13.948702812194824, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8688575029373169, "num_tokens": 119086357.0, "step": 3119 }, { "epoch": 0.39689606920239157, "ewc_loss": 0.018929725512862206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.892972613859456e-05, "grad_norm": 13.928337097167969, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8619176149368286, "num_tokens": 119122854.0, "step": 3120 }, { "epoch": 0.39702327948098204, "ewc_loss": 0.018944235518574715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8944236217066646e-05, "grad_norm": 14.058891296386719, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8612045049667358, "num_tokens": 119155449.0, "step": 3121 }, { "epoch": 0.39715048975957257, "ewc_loss": 0.01896112784743309, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8961127352667972e-05, "grad_norm": 13.989700317382812, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8492549061775208, "num_tokens": 119197287.0, "step": 3122 }, { "epoch": 0.3972777000381631, "ewc_loss": 0.0188889317214489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8888931663241237e-05, "grad_norm": 13.970383644104004, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8429279327392578, "num_tokens": 119235749.0, "step": 3123 }, { "epoch": 0.39740491031675357, "ewc_loss": 0.018948420882225037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8948421711684205e-05, "grad_norm": 14.031947135925293, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8504791259765625, "num_tokens": 119267282.0, "step": 3124 }, { "epoch": 0.3975321205953441, "ewc_loss": 0.018934844061732292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.893484477477614e-05, "grad_norm": 14.03154182434082, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8459841012954712, "num_tokens": 119304387.0, "step": 3125 }, { "epoch": 0.3976593308739346, "ewc_loss": 0.018919847905635834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8919847207143903e-05, "grad_norm": 13.927484512329102, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8445993065834045, "num_tokens": 119341470.0, "step": 3126 }, { "epoch": 0.3977865411525251, "ewc_loss": 0.01891295611858368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.891295687528327e-05, "grad_norm": 13.974739074707031, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8430545926094055, "num_tokens": 119381067.0, "step": 3127 }, { "epoch": 0.3979137514311156, "ewc_loss": 0.018933817744255066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8933817045763135e-05, "grad_norm": 13.916132926940918, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8548144102096558, "num_tokens": 119415680.0, "step": 3128 }, { "epoch": 0.39804096170970615, "ewc_loss": 0.018958400934934616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8958400687552057e-05, "grad_norm": 14.004876136779785, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8452901840209961, "num_tokens": 119454300.0, "step": 3129 }, { "epoch": 0.3981681719882967, "ewc_loss": 0.018982643261551857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8982644178322516e-05, "grad_norm": 13.965144157409668, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8666098117828369, "num_tokens": 119485599.0, "step": 3130 }, { "epoch": 0.39829538226688715, "ewc_loss": 0.019010595977306366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9010596588486806e-05, "grad_norm": 14.059247970581055, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.839496910572052, "num_tokens": 119525820.0, "step": 3131 }, { "epoch": 0.3984225925454777, "ewc_loss": 0.018989551812410355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.898955088108778e-05, "grad_norm": 13.894489288330078, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8604438900947571, "num_tokens": 119568216.0, "step": 3132 }, { "epoch": 0.3985498028240682, "ewc_loss": 0.018959231674671173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8959231965709478e-05, "grad_norm": 14.049627304077148, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8418471813201904, "num_tokens": 119602914.0, "step": 3133 }, { "epoch": 0.3986770131026587, "ewc_loss": 0.018998393788933754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8998392988578416e-05, "grad_norm": 13.913328170776367, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8497888445854187, "num_tokens": 119637024.0, "step": 3134 }, { "epoch": 0.3988042233812492, "ewc_loss": 0.018945278599858284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8945278497994877e-05, "grad_norm": 14.003954887390137, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8441998362541199, "num_tokens": 119679265.0, "step": 3135 }, { "epoch": 0.39893143365983974, "ewc_loss": 0.01901101879775524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9011018594028428e-05, "grad_norm": 13.970634460449219, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8452862501144409, "num_tokens": 119716254.0, "step": 3136 }, { "epoch": 0.3990586439384302, "ewc_loss": 0.018991172313690186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.899117160064634e-05, "grad_norm": 13.961892127990723, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8465222120285034, "num_tokens": 119761325.0, "step": 3137 }, { "epoch": 0.39918585421702074, "ewc_loss": 0.01897517591714859, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8975175407831557e-05, "grad_norm": 13.992222785949707, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8768662214279175, "num_tokens": 119795468.0, "step": 3138 }, { "epoch": 0.39931306449561127, "ewc_loss": 0.019017385318875313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.901738505694084e-05, "grad_norm": 13.969636917114258, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8443554639816284, "num_tokens": 119833804.0, "step": 3139 }, { "epoch": 0.39944027477420174, "ewc_loss": 0.01896270550787449, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.896270623547025e-05, "grad_norm": 14.036651611328125, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8495060801506042, "num_tokens": 119870852.0, "step": 3140 }, { "epoch": 0.39956748505279227, "ewc_loss": 0.019014671444892883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9014671124750748e-05, "grad_norm": 13.981973648071289, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8559520840644836, "num_tokens": 119909258.0, "step": 3141 }, { "epoch": 0.3996946953313828, "ewc_loss": 0.018996883183717728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8996883227373473e-05, "grad_norm": 14.064270973205566, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8423531651496887, "num_tokens": 119940701.0, "step": 3142 }, { "epoch": 0.39982190560997327, "ewc_loss": 0.01901697739958763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9016977603314444e-05, "grad_norm": 14.085296630859375, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8537570834159851, "num_tokens": 119976780.0, "step": 3143 }, { "epoch": 0.3999491158885638, "ewc_loss": 0.01901470683515072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.901470750453882e-05, "grad_norm": 14.003314018249512, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.845504105091095, "num_tokens": 120017543.0, "step": 3144 }, { "epoch": 0.40007632616715433, "ewc_loss": 0.018963640555739403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8963641196023673e-05, "grad_norm": 14.088022232055664, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8509373664855957, "num_tokens": 120050036.0, "step": 3145 }, { "epoch": 0.4002035364457448, "ewc_loss": 0.01903037168085575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9030370822292753e-05, "grad_norm": 13.937299728393555, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8677893877029419, "num_tokens": 120084582.0, "step": 3146 }, { "epoch": 0.40033074672433533, "ewc_loss": 0.018966786563396454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8966786228702404e-05, "grad_norm": 14.085986137390137, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.849531888961792, "num_tokens": 120120218.0, "step": 3147 }, { "epoch": 0.40045795700292586, "ewc_loss": 0.019049301743507385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9049301045015454e-05, "grad_norm": 13.924762725830078, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8747954964637756, "num_tokens": 120161650.0, "step": 3148 }, { "epoch": 0.40058516728151633, "ewc_loss": 0.01894904114305973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8949041987070814e-05, "grad_norm": 13.969480514526367, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8627731204032898, "num_tokens": 120197409.0, "step": 3149 }, { "epoch": 0.40071237756010686, "ewc_loss": 0.019067566841840744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9067567336605862e-05, "grad_norm": 13.98612117767334, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8413079977035522, "num_tokens": 120240700.0, "step": 3150 }, { "epoch": 0.4008395878386974, "ewc_loss": 0.019021715968847275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.902171607071068e-05, "grad_norm": 13.967838287353516, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8438308238983154, "num_tokens": 120286741.0, "step": 3151 }, { "epoch": 0.40096679811728786, "ewc_loss": 0.019031282514333725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.903128213598393e-05, "grad_norm": 14.015639305114746, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8568107485771179, "num_tokens": 120323236.0, "step": 3152 }, { "epoch": 0.4010940083958784, "ewc_loss": 0.01906622387468815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.906622310343664e-05, "grad_norm": 14.042609214782715, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8524059653282166, "num_tokens": 120362185.0, "step": 3153 }, { "epoch": 0.4012212186744689, "ewc_loss": 0.019027963280677795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9027962480322458e-05, "grad_norm": 14.021035194396973, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8449883460998535, "num_tokens": 120397524.0, "step": 3154 }, { "epoch": 0.4013484289530594, "ewc_loss": 0.01901889778673649, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.901889845612459e-05, "grad_norm": 13.980713844299316, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8577472567558289, "num_tokens": 120439723.0, "step": 3155 }, { "epoch": 0.4014756392316499, "ewc_loss": 0.019014829769730568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9014829376828857e-05, "grad_norm": 14.013644218444824, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8575167059898376, "num_tokens": 120483693.0, "step": 3156 }, { "epoch": 0.40160284951024044, "ewc_loss": 0.019038818776607513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.903881820908282e-05, "grad_norm": 13.980718612670898, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8632526993751526, "num_tokens": 120520558.0, "step": 3157 }, { "epoch": 0.4017300597888309, "ewc_loss": 0.01897994801402092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.897994843602646e-05, "grad_norm": 14.040173530578613, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8631959557533264, "num_tokens": 120557901.0, "step": 3158 }, { "epoch": 0.40185727006742145, "ewc_loss": 0.018999923020601273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.89999227586668e-05, "grad_norm": 13.973485946655273, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8600250482559204, "num_tokens": 120593851.0, "step": 3159 }, { "epoch": 0.401984480346012, "ewc_loss": 0.01899540424346924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.899540438898839e-05, "grad_norm": 14.064532279968262, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8468981981277466, "num_tokens": 120624533.0, "step": 3160 }, { "epoch": 0.40211169062460245, "ewc_loss": 0.019038010388612747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9038010577787645e-05, "grad_norm": 14.062872886657715, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8486756086349487, "num_tokens": 120660820.0, "step": 3161 }, { "epoch": 0.402238900903193, "ewc_loss": 0.01898588240146637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8985881979460828e-05, "grad_norm": 14.029924392700195, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8651807308197021, "num_tokens": 120701485.0, "step": 3162 }, { "epoch": 0.4023661111817835, "ewc_loss": 0.018967092037200928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.89670918189222e-05, "grad_norm": 13.965004920959473, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8632745742797852, "num_tokens": 120736677.0, "step": 3163 }, { "epoch": 0.402493321460374, "ewc_loss": 0.018977267667651176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8977267245645635e-05, "grad_norm": 14.067028045654297, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8510072827339172, "num_tokens": 120773933.0, "step": 3164 }, { "epoch": 0.4026205317389645, "ewc_loss": 0.019004682078957558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9004681234946474e-05, "grad_norm": 14.034749984741211, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8584389090538025, "num_tokens": 120804671.0, "step": 3165 }, { "epoch": 0.40274774201755503, "ewc_loss": 0.018951160833239555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8951161109725945e-05, "grad_norm": 14.068110466003418, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8578585982322693, "num_tokens": 120837042.0, "step": 3166 }, { "epoch": 0.4028749522961455, "ewc_loss": 0.019036447629332542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.903644806589e-05, "grad_norm": 14.079780578613281, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8503073453903198, "num_tokens": 120875127.0, "step": 3167 }, { "epoch": 0.40300216257473603, "ewc_loss": 0.018954504281282425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8954504412249662e-05, "grad_norm": 14.082682609558105, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8534634113311768, "num_tokens": 120915937.0, "step": 3168 }, { "epoch": 0.40312937285332656, "ewc_loss": 0.019015558063983917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.901555879157968e-05, "grad_norm": 14.063109397888184, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8386349678039551, "num_tokens": 120953334.0, "step": 3169 }, { "epoch": 0.40325658313191703, "ewc_loss": 0.018959036096930504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8959035514853895e-05, "grad_norm": 14.018667221069336, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8533456325531006, "num_tokens": 120986674.0, "step": 3170 }, { "epoch": 0.40338379341050756, "ewc_loss": 0.019058311358094215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9058310499531217e-05, "grad_norm": 14.062769889831543, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8601818084716797, "num_tokens": 121026305.0, "step": 3171 }, { "epoch": 0.4035110036890981, "ewc_loss": 0.019020039588212967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9020038962480612e-05, "grad_norm": 13.966890335083008, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8679167628288269, "num_tokens": 121061644.0, "step": 3172 }, { "epoch": 0.40363821396768856, "ewc_loss": 0.019040273502469063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9040273400605656e-05, "grad_norm": 13.99831771850586, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8501860499382019, "num_tokens": 121102860.0, "step": 3173 }, { "epoch": 0.4037654242462791, "ewc_loss": 0.019096961244940758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.909696038637776e-05, "grad_norm": 14.08981704711914, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8679362535476685, "num_tokens": 121136566.0, "step": 3174 }, { "epoch": 0.4038926345248696, "ewc_loss": 0.019097277894616127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.909727870952338e-05, "grad_norm": 14.068207740783691, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8639988899230957, "num_tokens": 121173759.0, "step": 3175 }, { "epoch": 0.4040198448034601, "ewc_loss": 0.019047990441322327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9047989553655498e-05, "grad_norm": 14.014276504516602, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8544062972068787, "num_tokens": 121219881.0, "step": 3176 }, { "epoch": 0.4041470550820506, "ewc_loss": 0.019083131104707718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.90831306099426e-05, "grad_norm": 14.135313034057617, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8531321883201599, "num_tokens": 121258719.0, "step": 3177 }, { "epoch": 0.40427426536064115, "ewc_loss": 0.01905730739235878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9057308236369863e-05, "grad_norm": 14.031878471374512, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8310614228248596, "num_tokens": 121298013.0, "step": 3178 }, { "epoch": 0.4044014756392316, "ewc_loss": 0.018998894840478897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.8998895029653795e-05, "grad_norm": 14.090154647827148, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8663110136985779, "num_tokens": 121329615.0, "step": 3179 }, { "epoch": 0.40452868591782215, "ewc_loss": 0.019074229523539543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.907423029479105e-05, "grad_norm": 14.036624908447266, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.864240288734436, "num_tokens": 121365199.0, "step": 3180 }, { "epoch": 0.4046558961964127, "ewc_loss": 0.019038397818803787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.90383980225306e-05, "grad_norm": 14.079754829406738, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8504315614700317, "num_tokens": 121402393.0, "step": 3181 }, { "epoch": 0.4047831064750032, "ewc_loss": 0.019016508013010025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.901650830404833e-05, "grad_norm": 14.042952537536621, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.85478276014328, "num_tokens": 121439678.0, "step": 3182 }, { "epoch": 0.4049103167535937, "ewc_loss": 0.019077118486166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.907711884996388e-05, "grad_norm": 14.04578971862793, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8488666415214539, "num_tokens": 121480781.0, "step": 3183 }, { "epoch": 0.4050375270321842, "ewc_loss": 0.019041480496525764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.904148120956961e-05, "grad_norm": 13.990032196044922, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8609601855278015, "num_tokens": 121517614.0, "step": 3184 }, { "epoch": 0.40516473731077474, "ewc_loss": 0.019019629806280136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9019629689864814e-05, "grad_norm": 13.981022834777832, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8518101572990417, "num_tokens": 121552225.0, "step": 3185 }, { "epoch": 0.4052919475893652, "ewc_loss": 0.019061481580138206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9061480998061597e-05, "grad_norm": 14.083468437194824, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8478518724441528, "num_tokens": 121587177.0, "step": 3186 }, { "epoch": 0.40541915786795574, "ewc_loss": 0.019061336293816566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9061335478909314e-05, "grad_norm": 14.069645881652832, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8365025520324707, "num_tokens": 121619547.0, "step": 3187 }, { "epoch": 0.40554636814654627, "ewc_loss": 0.019045842811465263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9045843146159314e-05, "grad_norm": 13.953582763671875, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.861444354057312, "num_tokens": 121657396.0, "step": 3188 }, { "epoch": 0.40567357842513674, "ewc_loss": 0.019055131822824478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.905513090605382e-05, "grad_norm": 14.057417869567871, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8369451761245728, "num_tokens": 121691867.0, "step": 3189 }, { "epoch": 0.40580078870372727, "ewc_loss": 0.019116710871458054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.911671097332146e-05, "grad_norm": 14.012317657470703, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8471791744232178, "num_tokens": 121723546.0, "step": 3190 }, { "epoch": 0.4059279989823178, "ewc_loss": 0.01909775659441948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9097757103736512e-05, "grad_norm": 14.022687911987305, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8534903526306152, "num_tokens": 121764589.0, "step": 3191 }, { "epoch": 0.40605520926090827, "ewc_loss": 0.019096223637461662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9096223695669323e-05, "grad_norm": 13.972245216369629, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8433489799499512, "num_tokens": 121803281.0, "step": 3192 }, { "epoch": 0.4061824195394988, "ewc_loss": 0.01911357417702675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9113575035589747e-05, "grad_norm": 14.028647422790527, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8403059244155884, "num_tokens": 121841710.0, "step": 3193 }, { "epoch": 0.4063096298180893, "ewc_loss": 0.019154788926243782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.915478969749529e-05, "grad_norm": 14.037263870239258, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8522687554359436, "num_tokens": 121881979.0, "step": 3194 }, { "epoch": 0.4064368400966798, "ewc_loss": 0.019110340625047684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9110340872430243e-05, "grad_norm": 14.019953727722168, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8589742183685303, "num_tokens": 121921599.0, "step": 3195 }, { "epoch": 0.4065640503752703, "ewc_loss": 0.019165465608239174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.91654653463047e-05, "grad_norm": 14.072933197021484, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8634422421455383, "num_tokens": 121960545.0, "step": 3196 }, { "epoch": 0.40669126065386085, "ewc_loss": 0.019153829663991928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9153829271090217e-05, "grad_norm": 14.074522018432617, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8357717990875244, "num_tokens": 121999259.0, "step": 3197 }, { "epoch": 0.4068184709324513, "ewc_loss": 0.01912664994597435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9126649931422435e-05, "grad_norm": 14.02922248840332, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8677930235862732, "num_tokens": 122039066.0, "step": 3198 }, { "epoch": 0.40694568121104185, "ewc_loss": 0.019102513790130615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9102513761026785e-05, "grad_norm": 14.004059791564941, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8620204925537109, "num_tokens": 122085035.0, "step": 3199 }, { "epoch": 0.4070728914896324, "ewc_loss": 0.019094770774245262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.909477032313589e-05, "grad_norm": 13.946402549743652, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8678202629089355, "num_tokens": 122128007.0, "step": 3200 }, { "epoch": 0.40720010176822286, "ewc_loss": 0.019117996096611023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9117995179840364e-05, "grad_norm": 14.161527633666992, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8520150780677795, "num_tokens": 122169623.0, "step": 3201 }, { "epoch": 0.4073273120468134, "ewc_loss": 0.01913733221590519, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.913733285618946e-05, "grad_norm": 14.06867504119873, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8494658470153809, "num_tokens": 122201755.0, "step": 3202 }, { "epoch": 0.4074545223254039, "ewc_loss": 0.019075393676757812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.907539444800932e-05, "grad_norm": 14.117490768432617, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8644255995750427, "num_tokens": 122233680.0, "step": 3203 }, { "epoch": 0.4075817326039944, "ewc_loss": 0.019136875867843628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.913687628984917e-05, "grad_norm": 14.07841682434082, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8511735796928406, "num_tokens": 122265107.0, "step": 3204 }, { "epoch": 0.4077089428825849, "ewc_loss": 0.019069792702794075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.90697919606464e-05, "grad_norm": 14.080692291259766, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8589115142822266, "num_tokens": 122302750.0, "step": 3205 }, { "epoch": 0.40783615316117544, "ewc_loss": 0.01910462975502014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.910462924570311e-05, "grad_norm": 13.957499504089355, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8447647094726562, "num_tokens": 122346820.0, "step": 3206 }, { "epoch": 0.4079633634397659, "ewc_loss": 0.01909773424267769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9097733456874266e-05, "grad_norm": 14.060074806213379, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8587201833724976, "num_tokens": 122383254.0, "step": 3207 }, { "epoch": 0.40809057371835644, "ewc_loss": 0.019119804725050926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.911980507429689e-05, "grad_norm": 14.052298545837402, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8659594058990479, "num_tokens": 122416693.0, "step": 3208 }, { "epoch": 0.40821778399694697, "ewc_loss": 0.019100429490208626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.910042919917032e-05, "grad_norm": 14.062702178955078, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8599934577941895, "num_tokens": 122452403.0, "step": 3209 }, { "epoch": 0.40834499427553744, "ewc_loss": 0.019144367426633835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.914436688821297e-05, "grad_norm": 14.090871810913086, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8435196876525879, "num_tokens": 122489685.0, "step": 3210 }, { "epoch": 0.40847220455412797, "ewc_loss": 0.01911635510623455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9116354451398365e-05, "grad_norm": 14.031388282775879, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8374783992767334, "num_tokens": 122529079.0, "step": 3211 }, { "epoch": 0.4085994148327185, "ewc_loss": 0.019108930602669716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9108931155642495e-05, "grad_norm": 14.017373085021973, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8476499319076538, "num_tokens": 122569933.0, "step": 3212 }, { "epoch": 0.40872662511130897, "ewc_loss": 0.01914365403354168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.914365384436678e-05, "grad_norm": 14.056321144104004, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8348606824874878, "num_tokens": 122607884.0, "step": 3213 }, { "epoch": 0.4088538353898995, "ewc_loss": 0.01913142390549183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9131424778606743e-05, "grad_norm": 14.054991722106934, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8556621670722961, "num_tokens": 122651968.0, "step": 3214 }, { "epoch": 0.40898104566849003, "ewc_loss": 0.019113684073090553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.911368417495396e-05, "grad_norm": 14.043996810913086, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8586587905883789, "num_tokens": 122688680.0, "step": 3215 }, { "epoch": 0.4091082559470805, "ewc_loss": 0.01913660392165184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9136603441438638e-05, "grad_norm": 14.079777717590332, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8596605062484741, "num_tokens": 122728271.0, "step": 3216 }, { "epoch": 0.40923546622567103, "ewc_loss": 0.019131822511553764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.913182313728612e-05, "grad_norm": 14.03471851348877, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8537166118621826, "num_tokens": 122759691.0, "step": 3217 }, { "epoch": 0.40936267650426156, "ewc_loss": 0.019094767048954964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9094766685157083e-05, "grad_norm": 14.035582542419434, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8524458408355713, "num_tokens": 122800185.0, "step": 3218 }, { "epoch": 0.40948988678285203, "ewc_loss": 0.019141338765621185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9141338270856068e-05, "grad_norm": 14.019938468933105, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8640760183334351, "num_tokens": 122839548.0, "step": 3219 }, { "epoch": 0.40961709706144256, "ewc_loss": 0.01910572685301304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9105726096313447e-05, "grad_norm": 14.08153247833252, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8400436639785767, "num_tokens": 122879621.0, "step": 3220 }, { "epoch": 0.4097443073400331, "ewc_loss": 0.019128860905766487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9128860003547743e-05, "grad_norm": 13.995078086853027, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.849095344543457, "num_tokens": 122921495.0, "step": 3221 }, { "epoch": 0.40987151761862356, "ewc_loss": 0.019070759415626526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9070759663009085e-05, "grad_norm": 14.050018310546875, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8606969118118286, "num_tokens": 122957197.0, "step": 3222 }, { "epoch": 0.4099987278972141, "ewc_loss": 0.019141925498843193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9141925804433413e-05, "grad_norm": 14.045540809631348, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8349555730819702, "num_tokens": 122999999.0, "step": 3223 }, { "epoch": 0.4101259381758046, "ewc_loss": 0.01906944252550602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9069442714680918e-05, "grad_norm": 14.000093460083008, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8588658571243286, "num_tokens": 123040402.0, "step": 3224 }, { "epoch": 0.4102531484543951, "ewc_loss": 0.019130980595946312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9130980945192277e-05, "grad_norm": 14.058956146240234, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8590664863586426, "num_tokens": 123080086.0, "step": 3225 }, { "epoch": 0.4103803587329856, "ewc_loss": 0.01912260614335537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9122606317978352e-05, "grad_norm": 14.00848388671875, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8561620116233826, "num_tokens": 123120058.0, "step": 3226 }, { "epoch": 0.41050756901157615, "ewc_loss": 0.019079795107245445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9079794583376497e-05, "grad_norm": 14.059426307678223, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8553510904312134, "num_tokens": 123162003.0, "step": 3227 }, { "epoch": 0.4106347792901666, "ewc_loss": 0.019138440489768982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.913844062073622e-05, "grad_norm": 14.077810287475586, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8217750191688538, "num_tokens": 123202249.0, "step": 3228 }, { "epoch": 0.41076198956875715, "ewc_loss": 0.01909755729138851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9097557014902122e-05, "grad_norm": 14.069781303405762, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8487157225608826, "num_tokens": 123237401.0, "step": 3229 }, { "epoch": 0.4108891998473477, "ewc_loss": 0.019104011356830597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9104010789305903e-05, "grad_norm": 14.034977912902832, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8707388043403625, "num_tokens": 123269364.0, "step": 3230 }, { "epoch": 0.4110164101259382, "ewc_loss": 0.019095687195658684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9095687093795277e-05, "grad_norm": 14.072489738464355, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8563780784606934, "num_tokens": 123307743.0, "step": 3231 }, { "epoch": 0.4111436204045287, "ewc_loss": 0.019116269424557686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.91162689588964e-05, "grad_norm": 14.023783683776855, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8589677214622498, "num_tokens": 123340102.0, "step": 3232 }, { "epoch": 0.4112708306831192, "ewc_loss": 0.01908988133072853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.908988087961916e-05, "grad_norm": 14.057910919189453, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8491523265838623, "num_tokens": 123374956.0, "step": 3233 }, { "epoch": 0.41139804096170973, "ewc_loss": 0.019142812117934227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.914281165227294e-05, "grad_norm": 13.980524063110352, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8643032312393188, "num_tokens": 123418070.0, "step": 3234 }, { "epoch": 0.4115252512403002, "ewc_loss": 0.019105851650238037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.910585160658229e-05, "grad_norm": 14.0838041305542, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8519993424415588, "num_tokens": 123453183.0, "step": 3235 }, { "epoch": 0.41165246151889073, "ewc_loss": 0.019188448786735535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.91884482774185e-05, "grad_norm": 14.013002395629883, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8552747368812561, "num_tokens": 123491526.0, "step": 3236 }, { "epoch": 0.41177967179748126, "ewc_loss": 0.01916029304265976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9160292140441015e-05, "grad_norm": 14.065893173217773, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8563383221626282, "num_tokens": 123529743.0, "step": 3237 }, { "epoch": 0.41190688207607173, "ewc_loss": 0.019192742183804512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9192742911400273e-05, "grad_norm": 13.972153663635254, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8569386601448059, "num_tokens": 123573402.0, "step": 3238 }, { "epoch": 0.41203409235466226, "ewc_loss": 0.01918046921491623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9180468370905146e-05, "grad_norm": 14.14665412902832, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8517476320266724, "num_tokens": 123604167.0, "step": 3239 }, { "epoch": 0.4121613026332528, "ewc_loss": 0.019245868548750877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9245868315920234e-05, "grad_norm": 13.968440055847168, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.85111004114151, "num_tokens": 123648104.0, "step": 3240 }, { "epoch": 0.41228851291184326, "ewc_loss": 0.019141199067234993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9141198208671995e-05, "grad_norm": 14.006129264831543, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8444294929504395, "num_tokens": 123689678.0, "step": 3241 }, { "epoch": 0.4124157231904338, "ewc_loss": 0.019233809784054756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9233810235164128e-05, "grad_norm": 14.01653003692627, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.861763596534729, "num_tokens": 123729257.0, "step": 3242 }, { "epoch": 0.4125429334690243, "ewc_loss": 0.01917116902768612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.917116969707422e-05, "grad_norm": 13.987542152404785, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8575119972229004, "num_tokens": 123772251.0, "step": 3243 }, { "epoch": 0.4126701437476148, "ewc_loss": 0.019243942573666573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.924394200614188e-05, "grad_norm": 14.108383178710938, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8672319650650024, "num_tokens": 123810345.0, "step": 3244 }, { "epoch": 0.4127973540262053, "ewc_loss": 0.019234856590628624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9234856154071167e-05, "grad_norm": 14.072376251220703, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8513456583023071, "num_tokens": 123855276.0, "step": 3245 }, { "epoch": 0.41292456430479585, "ewc_loss": 0.01919921115040779, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9199211237719283e-05, "grad_norm": 14.072467803955078, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8572870492935181, "num_tokens": 123896613.0, "step": 3246 }, { "epoch": 0.4130517745833863, "ewc_loss": 0.019163651391863823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9163651813869365e-05, "grad_norm": 14.028542518615723, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8389718532562256, "num_tokens": 123940713.0, "step": 3247 }, { "epoch": 0.41317898486197685, "ewc_loss": 0.019158802926540375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.915880238811951e-05, "grad_norm": 14.065031051635742, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8443821668624878, "num_tokens": 123981485.0, "step": 3248 }, { "epoch": 0.4133061951405674, "ewc_loss": 0.019176596775650978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.91765975614544e-05, "grad_norm": 14.060348510742188, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8546626567840576, "num_tokens": 124018159.0, "step": 3249 }, { "epoch": 0.41343340541915785, "ewc_loss": 0.019140880554914474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9140879885526374e-05, "grad_norm": 14.028340339660645, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8646892309188843, "num_tokens": 124049911.0, "step": 3250 }, { "epoch": 0.4135606156977484, "ewc_loss": 0.019186461344361305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.918646194098983e-05, "grad_norm": 14.099125862121582, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8754578828811646, "num_tokens": 124084260.0, "step": 3251 }, { "epoch": 0.4136878259763389, "ewc_loss": 0.01917142979800701, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9171429812558927e-05, "grad_norm": 14.038719177246094, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8579612970352173, "num_tokens": 124122741.0, "step": 3252 }, { "epoch": 0.4138150362549294, "ewc_loss": 0.01916215941309929, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9162160242558457e-05, "grad_norm": 14.089371681213379, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8508859872817993, "num_tokens": 124162793.0, "step": 3253 }, { "epoch": 0.4139422465335199, "ewc_loss": 0.019200481474399567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9200480892322958e-05, "grad_norm": 14.076826095581055, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8475684523582458, "num_tokens": 124206889.0, "step": 3254 }, { "epoch": 0.41406945681211044, "ewc_loss": 0.01914345845580101, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9143459212500602e-05, "grad_norm": 14.047700881958008, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8660213351249695, "num_tokens": 124244551.0, "step": 3255 }, { "epoch": 0.4141966670907009, "ewc_loss": 0.019177842885255814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.917784356919583e-05, "grad_norm": 14.098962783813477, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8580392599105835, "num_tokens": 124290485.0, "step": 3256 }, { "epoch": 0.41432387736929144, "ewc_loss": 0.01918994076550007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9189941667718813e-05, "grad_norm": 14.089415550231934, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8615499138832092, "num_tokens": 124323763.0, "step": 3257 }, { "epoch": 0.41445108764788197, "ewc_loss": 0.019133172929286957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.913317282742355e-05, "grad_norm": 14.024849891662598, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8538581132888794, "num_tokens": 124363318.0, "step": 3258 }, { "epoch": 0.41457829792647244, "ewc_loss": 0.01916445419192314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9164453988196328e-05, "grad_norm": 14.066654205322266, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8779836297035217, "num_tokens": 124400845.0, "step": 3259 }, { "epoch": 0.41470550820506297, "ewc_loss": 0.01918737217783928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9187371435691603e-05, "grad_norm": 14.066549301147461, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8608507513999939, "num_tokens": 124442546.0, "step": 3260 }, { "epoch": 0.4148327184836535, "ewc_loss": 0.019161079078912735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.916107976285275e-05, "grad_norm": 14.097129821777344, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8493501543998718, "num_tokens": 124477737.0, "step": 3261 }, { "epoch": 0.41495992876224397, "ewc_loss": 0.0191761814057827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.917618101288099e-05, "grad_norm": 14.042519569396973, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8420767784118652, "num_tokens": 124518081.0, "step": 3262 }, { "epoch": 0.4150871390408345, "ewc_loss": 0.019131965935230255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9131966837449e-05, "grad_norm": 14.02145004272461, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8439216017723083, "num_tokens": 124556361.0, "step": 3263 }, { "epoch": 0.415214349319425, "ewc_loss": 0.01918238028883934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9182380128768273e-05, "grad_norm": 14.029732704162598, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8539543151855469, "num_tokens": 124588490.0, "step": 3264 }, { "epoch": 0.4153415595980155, "ewc_loss": 0.01919601671397686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9196017092326656e-05, "grad_norm": 14.108802795410156, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.853061854839325, "num_tokens": 124624656.0, "step": 3265 }, { "epoch": 0.415468769876606, "ewc_loss": 0.019199185073375702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9199185771867633e-05, "grad_norm": 14.05122184753418, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8414551019668579, "num_tokens": 124660681.0, "step": 3266 }, { "epoch": 0.41559598015519655, "ewc_loss": 0.019203757867217064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9203758711228147e-05, "grad_norm": 14.046365737915039, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8553541898727417, "num_tokens": 124703448.0, "step": 3267 }, { "epoch": 0.415723190433787, "ewc_loss": 0.01924220845103264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.92422085092403e-05, "grad_norm": 14.082671165466309, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8521643877029419, "num_tokens": 124742078.0, "step": 3268 }, { "epoch": 0.41585040071237755, "ewc_loss": 0.01924048364162445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9240484107285738e-05, "grad_norm": 14.112126350402832, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8413253426551819, "num_tokens": 124779894.0, "step": 3269 }, { "epoch": 0.4159776109909681, "ewc_loss": 0.01924973540008068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9249735487392172e-05, "grad_norm": 14.079456329345703, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8492485284805298, "num_tokens": 124817918.0, "step": 3270 }, { "epoch": 0.41610482126955856, "ewc_loss": 0.019251596182584763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9251596313552e-05, "grad_norm": 14.115262985229492, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8711498379707336, "num_tokens": 124854839.0, "step": 3271 }, { "epoch": 0.4162320315481491, "ewc_loss": 0.019279954954981804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.927995435835328e-05, "grad_norm": 14.048300743103027, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8379102945327759, "num_tokens": 124886779.0, "step": 3272 }, { "epoch": 0.4163592418267396, "ewc_loss": 0.0192353967577219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.923539639392402e-05, "grad_norm": 14.073122024536133, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8630689978599548, "num_tokens": 124927919.0, "step": 3273 }, { "epoch": 0.4164864521053301, "ewc_loss": 0.019312117248773575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9312117728986777e-05, "grad_norm": 14.056722640991211, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.851699948310852, "num_tokens": 124966475.0, "step": 3274 }, { "epoch": 0.4166136623839206, "ewc_loss": 0.01926390267908573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9263901776866987e-05, "grad_norm": 14.112585067749023, "learning_rate": 1e-06, "loss": 0.5368, "mean_token_accuracy": 0.8299437165260315, "num_tokens": 125004110.0, "step": 3275 }, { "epoch": 0.41674087266251114, "ewc_loss": 0.019317220896482468, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9317221813253127e-05, "grad_norm": 14.072938919067383, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8686223030090332, "num_tokens": 125043164.0, "step": 3276 }, { "epoch": 0.4168680829411016, "ewc_loss": 0.019233563914895058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.923356467159465e-05, "grad_norm": 13.963682174682617, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8573331236839294, "num_tokens": 125084255.0, "step": 3277 }, { "epoch": 0.41699529321969214, "ewc_loss": 0.019281012937426567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9281013010186143e-05, "grad_norm": 14.166558265686035, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8676677346229553, "num_tokens": 125124971.0, "step": 3278 }, { "epoch": 0.41712250349828267, "ewc_loss": 0.019337909296154976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9337909179739654e-05, "grad_norm": 14.047539710998535, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8504800796508789, "num_tokens": 125161470.0, "step": 3279 }, { "epoch": 0.4172497137768732, "ewc_loss": 0.019257627427577972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9257628082414158e-05, "grad_norm": 14.125147819519043, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8388077020645142, "num_tokens": 125199117.0, "step": 3280 }, { "epoch": 0.41737692405546367, "ewc_loss": 0.019304269924759865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.930427060869988e-05, "grad_norm": 14.039407730102539, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.8363902568817139, "num_tokens": 125238136.0, "step": 3281 }, { "epoch": 0.4175041343340542, "ewc_loss": 0.019298851490020752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9298851839266717e-05, "grad_norm": 14.133509635925293, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8515477776527405, "num_tokens": 125281041.0, "step": 3282 }, { "epoch": 0.41763134461264473, "ewc_loss": 0.019302159547805786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9302158762002364e-05, "grad_norm": 14.057676315307617, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8571322560310364, "num_tokens": 125314739.0, "step": 3283 }, { "epoch": 0.4177585548912352, "ewc_loss": 0.019282247871160507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.928224810399115e-05, "grad_norm": 14.109458923339844, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8478931188583374, "num_tokens": 125353129.0, "step": 3284 }, { "epoch": 0.41788576516982573, "ewc_loss": 0.019321536645293236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9321536456118338e-05, "grad_norm": 14.128355026245117, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8526004552841187, "num_tokens": 125394092.0, "step": 3285 }, { "epoch": 0.41801297544841626, "ewc_loss": 0.019274389371275902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9274390069767833e-05, "grad_norm": 14.068021774291992, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8479282855987549, "num_tokens": 125438262.0, "step": 3286 }, { "epoch": 0.41814018572700673, "ewc_loss": 0.019280267879366875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.928026722453069e-05, "grad_norm": 14.119110107421875, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8521828651428223, "num_tokens": 125477842.0, "step": 3287 }, { "epoch": 0.41826739600559726, "ewc_loss": 0.019284479320049286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.92844800039893e-05, "grad_norm": 14.03946304321289, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8658298254013062, "num_tokens": 125517236.0, "step": 3288 }, { "epoch": 0.4183946062841878, "ewc_loss": 0.01927470788359642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9274708392913453e-05, "grad_norm": 14.08850383758545, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8472151756286621, "num_tokens": 125561598.0, "step": 3289 }, { "epoch": 0.41852181656277826, "ewc_loss": 0.019331706687808037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9331706425873563e-05, "grad_norm": 14.173782348632812, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.851804256439209, "num_tokens": 125599104.0, "step": 3290 }, { "epoch": 0.4186490268413688, "ewc_loss": 0.019286347553133965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9286348106106743e-05, "grad_norm": 14.09463119506836, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8550819158554077, "num_tokens": 125635286.0, "step": 3291 }, { "epoch": 0.4187762371199593, "ewc_loss": 0.019255811348557472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9255810912000015e-05, "grad_norm": 14.106193542480469, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8583056926727295, "num_tokens": 125669534.0, "step": 3292 }, { "epoch": 0.4189034473985498, "ewc_loss": 0.019290650263428688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.929065001604613e-05, "grad_norm": 14.091795921325684, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8540551662445068, "num_tokens": 125707966.0, "step": 3293 }, { "epoch": 0.4190306576771403, "ewc_loss": 0.019241947680711746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9241948393755592e-05, "grad_norm": 14.061507225036621, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8494994640350342, "num_tokens": 125741448.0, "step": 3294 }, { "epoch": 0.41915786795573085, "ewc_loss": 0.019306445494294167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.930644612002652e-05, "grad_norm": 14.174002647399902, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8474273085594177, "num_tokens": 125776887.0, "step": 3295 }, { "epoch": 0.4192850782343213, "ewc_loss": 0.019290609285235405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.929060999827925e-05, "grad_norm": 14.09157943725586, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8584141731262207, "num_tokens": 125807899.0, "step": 3296 }, { "epoch": 0.41941228851291185, "ewc_loss": 0.019269617274403572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.926961704157293e-05, "grad_norm": 14.079061508178711, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8620879650115967, "num_tokens": 125847711.0, "step": 3297 }, { "epoch": 0.4195394987915024, "ewc_loss": 0.019306669011712074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9306669855723158e-05, "grad_norm": 14.112265586853027, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8678218126296997, "num_tokens": 125883248.0, "step": 3298 }, { "epoch": 0.41966670907009285, "ewc_loss": 0.019322169944643974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9322169464430772e-05, "grad_norm": 14.08543586730957, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8426631689071655, "num_tokens": 125925771.0, "step": 3299 }, { "epoch": 0.4197939193486834, "ewc_loss": 0.019262973219156265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.926297409227118e-05, "grad_norm": 14.032925605773926, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8421432971954346, "num_tokens": 125968815.0, "step": 3300 }, { "epoch": 0.4199211296272739, "ewc_loss": 0.019297439604997635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9297440303489566e-05, "grad_norm": 14.05649471282959, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8488976359367371, "num_tokens": 126007761.0, "step": 3301 }, { "epoch": 0.4200483399058644, "ewc_loss": 0.019335826858878136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9335826436872594e-05, "grad_norm": 14.15017032623291, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8347102403640747, "num_tokens": 126047872.0, "step": 3302 }, { "epoch": 0.4201755501844549, "ewc_loss": 0.019319934770464897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9319933926453814e-05, "grad_norm": 13.993423461914062, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8607409000396729, "num_tokens": 126081273.0, "step": 3303 }, { "epoch": 0.42030276046304543, "ewc_loss": 0.01930304616689682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9303046428831294e-05, "grad_norm": 14.057291030883789, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8384418487548828, "num_tokens": 126121590.0, "step": 3304 }, { "epoch": 0.4204299707416359, "ewc_loss": 0.019396495074033737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.939649519044906e-05, "grad_norm": 14.065491676330566, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8373550176620483, "num_tokens": 126168396.0, "step": 3305 }, { "epoch": 0.42055718102022643, "ewc_loss": 0.01932602748274803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9326027540955693e-05, "grad_norm": 14.083361625671387, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8491560220718384, "num_tokens": 126205220.0, "step": 3306 }, { "epoch": 0.42068439129881696, "ewc_loss": 0.019366394728422165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9366394553799182e-05, "grad_norm": 14.109652519226074, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8468104600906372, "num_tokens": 126239131.0, "step": 3307 }, { "epoch": 0.42081160157740743, "ewc_loss": 0.019360974431037903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9360973965376616e-05, "grad_norm": 14.080968856811523, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8561374545097351, "num_tokens": 126271931.0, "step": 3308 }, { "epoch": 0.42093881185599796, "ewc_loss": 0.019345803186297417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9345803593751043e-05, "grad_norm": 14.022035598754883, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8633211851119995, "num_tokens": 126307730.0, "step": 3309 }, { "epoch": 0.4210660221345885, "ewc_loss": 0.0193773303180933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.93773303180933e-05, "grad_norm": 14.089567184448242, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8572077751159668, "num_tokens": 126344528.0, "step": 3310 }, { "epoch": 0.42119323241317896, "ewc_loss": 0.01942279189825058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.942279232025612e-05, "grad_norm": 14.075236320495605, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8622058629989624, "num_tokens": 126377260.0, "step": 3311 }, { "epoch": 0.4213204426917695, "ewc_loss": 0.01936514489352703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9365144908078946e-05, "grad_norm": 14.09196662902832, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8501004576683044, "num_tokens": 126409723.0, "step": 3312 }, { "epoch": 0.42144765297036, "ewc_loss": 0.019429683685302734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9429684471106157e-05, "grad_norm": 14.065560340881348, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8578436374664307, "num_tokens": 126450929.0, "step": 3313 }, { "epoch": 0.4215748632489505, "ewc_loss": 0.019427459686994553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.942745984706562e-05, "grad_norm": 14.053973197937012, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8636337518692017, "num_tokens": 126486878.0, "step": 3314 }, { "epoch": 0.421702073527541, "ewc_loss": 0.01941831409931183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.941831396834459e-05, "grad_norm": 14.13982105255127, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8424298763275146, "num_tokens": 126522884.0, "step": 3315 }, { "epoch": 0.42182928380613155, "ewc_loss": 0.019432559609413147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9432560293353163e-05, "grad_norm": 14.052435874938965, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8558131456375122, "num_tokens": 126560848.0, "step": 3316 }, { "epoch": 0.421956494084722, "ewc_loss": 0.01937822252511978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.937822344189044e-05, "grad_norm": 14.091959953308105, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8565747737884521, "num_tokens": 126592947.0, "step": 3317 }, { "epoch": 0.42208370436331255, "ewc_loss": 0.01947791501879692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9477914975141175e-05, "grad_norm": 14.10309886932373, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8646736145019531, "num_tokens": 126630631.0, "step": 3318 }, { "epoch": 0.4222109146419031, "ewc_loss": 0.01944109983742237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.944110044860281e-05, "grad_norm": 14.059965133666992, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8498491644859314, "num_tokens": 126673395.0, "step": 3319 }, { "epoch": 0.42233812492049355, "ewc_loss": 0.019438518211245537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9438517483649775e-05, "grad_norm": 14.091047286987305, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8707317113876343, "num_tokens": 126711504.0, "step": 3320 }, { "epoch": 0.4224653351990841, "ewc_loss": 0.019462229683995247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.94622298295144e-05, "grad_norm": 14.091609001159668, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8674412965774536, "num_tokens": 126753217.0, "step": 3321 }, { "epoch": 0.4225925454776746, "ewc_loss": 0.01941690221428871, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.941690243256744e-05, "grad_norm": 14.039690017700195, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8604470491409302, "num_tokens": 126792982.0, "step": 3322 }, { "epoch": 0.4227197557562651, "ewc_loss": 0.019394470378756523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9394470655242912e-05, "grad_norm": 14.111624717712402, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8481541275978088, "num_tokens": 126823771.0, "step": 3323 }, { "epoch": 0.4228469660348556, "ewc_loss": 0.019511383026838303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9511382561177015e-05, "grad_norm": 14.129457473754883, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8585985898971558, "num_tokens": 126867137.0, "step": 3324 }, { "epoch": 0.42297417631344614, "ewc_loss": 0.01943635568022728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9436356524238363e-05, "grad_norm": 14.097798347473145, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8597042560577393, "num_tokens": 126902680.0, "step": 3325 }, { "epoch": 0.4231013865920366, "ewc_loss": 0.01945594884455204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9455948859103955e-05, "grad_norm": 14.083028793334961, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8661162853240967, "num_tokens": 126941037.0, "step": 3326 }, { "epoch": 0.42322859687062714, "ewc_loss": 0.01942615583539009, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9426155631663278e-05, "grad_norm": 14.144943237304688, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8582674264907837, "num_tokens": 126980009.0, "step": 3327 }, { "epoch": 0.42335580714921767, "ewc_loss": 0.019452210515737534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.945221083587967e-05, "grad_norm": 14.103255271911621, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8667576313018799, "num_tokens": 127020022.0, "step": 3328 }, { "epoch": 0.42348301742780814, "ewc_loss": 0.01940854825079441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9408547814236954e-05, "grad_norm": 14.167081832885742, "learning_rate": 1e-06, "loss": 0.5148, "mean_token_accuracy": 0.8356512784957886, "num_tokens": 127060500.0, "step": 3329 }, { "epoch": 0.42361022770639867, "ewc_loss": 0.0194266177713871, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.942661765497178e-05, "grad_norm": 14.168852806091309, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8508961200714111, "num_tokens": 127095886.0, "step": 3330 }, { "epoch": 0.4237374379849892, "ewc_loss": 0.019419977441430092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9419976524659432e-05, "grad_norm": 14.15312385559082, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8455743193626404, "num_tokens": 127134334.0, "step": 3331 }, { "epoch": 0.4238646482635797, "ewc_loss": 0.01936240680515766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.936240732902661e-05, "grad_norm": 14.071560859680176, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8471882343292236, "num_tokens": 127174288.0, "step": 3332 }, { "epoch": 0.4239918585421702, "ewc_loss": 0.019401097670197487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.940109723364003e-05, "grad_norm": 14.175238609313965, "learning_rate": 1e-06, "loss": 0.486, "mean_token_accuracy": 0.8523842692375183, "num_tokens": 127210311.0, "step": 3333 }, { "epoch": 0.4241190688207607, "ewc_loss": 0.019389627501368523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.938962668646127e-05, "grad_norm": 14.084685325622559, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8757756352424622, "num_tokens": 127252813.0, "step": 3334 }, { "epoch": 0.42424627909935125, "ewc_loss": 0.019353318959474564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9353319657966495e-05, "grad_norm": 14.142975807189941, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8424515724182129, "num_tokens": 127285847.0, "step": 3335 }, { "epoch": 0.4243734893779417, "ewc_loss": 0.01939603127539158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9396031348151155e-05, "grad_norm": 14.138619422912598, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8687705993652344, "num_tokens": 127325360.0, "step": 3336 }, { "epoch": 0.42450069965653225, "ewc_loss": 0.019367286935448647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9367287677596323e-05, "grad_norm": 14.097556114196777, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8583803772926331, "num_tokens": 127366446.0, "step": 3337 }, { "epoch": 0.4246279099351228, "ewc_loss": 0.01941562630236149, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.941562550200615e-05, "grad_norm": 14.172014236450195, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8644642233848572, "num_tokens": 127402608.0, "step": 3338 }, { "epoch": 0.42475512021371326, "ewc_loss": 0.019359175115823746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.935917498485651e-05, "grad_norm": 14.048850059509277, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8605510592460632, "num_tokens": 127445355.0, "step": 3339 }, { "epoch": 0.4248823304923038, "ewc_loss": 0.019337471574544907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.93374708032934e-05, "grad_norm": 14.09967041015625, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8500697016716003, "num_tokens": 127480523.0, "step": 3340 }, { "epoch": 0.4250095407708943, "ewc_loss": 0.01942002959549427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9420029275352135e-05, "grad_norm": 14.142637252807617, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.873948335647583, "num_tokens": 127520787.0, "step": 3341 }, { "epoch": 0.4251367510494848, "ewc_loss": 0.019374892115592957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.937489287229255e-05, "grad_norm": 14.143780708312988, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8500537872314453, "num_tokens": 127565576.0, "step": 3342 }, { "epoch": 0.4252639613280753, "ewc_loss": 0.019415654242038727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9415654605836608e-05, "grad_norm": 14.11638069152832, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8659226894378662, "num_tokens": 127599144.0, "step": 3343 }, { "epoch": 0.42539117160666584, "ewc_loss": 0.019382202997803688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.93822033907054e-05, "grad_norm": 14.085518836975098, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8732001185417175, "num_tokens": 127637361.0, "step": 3344 }, { "epoch": 0.4255183818852563, "ewc_loss": 0.01939772628247738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.939772664627526e-05, "grad_norm": 14.167591094970703, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8444145917892456, "num_tokens": 127675139.0, "step": 3345 }, { "epoch": 0.42564559216384684, "ewc_loss": 0.01938551478087902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9385513951419853e-05, "grad_norm": 14.065777778625488, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8615208268165588, "num_tokens": 127718822.0, "step": 3346 }, { "epoch": 0.42577280244243737, "ewc_loss": 0.019399579614400864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9399580196477473e-05, "grad_norm": 14.146924018859863, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.859218955039978, "num_tokens": 127763332.0, "step": 3347 }, { "epoch": 0.42590001272102784, "ewc_loss": 0.019409699365496635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.94096992345294e-05, "grad_norm": 14.10350513458252, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8672171831130981, "num_tokens": 127800346.0, "step": 3348 }, { "epoch": 0.42602722299961837, "ewc_loss": 0.019354965537786484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9354965843376704e-05, "grad_norm": 14.085346221923828, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8530792593955994, "num_tokens": 127845142.0, "step": 3349 }, { "epoch": 0.4261544332782089, "ewc_loss": 0.01935533434152603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9355335098225623e-05, "grad_norm": 14.178585052490234, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8384827375411987, "num_tokens": 127881131.0, "step": 3350 }, { "epoch": 0.4262816435567994, "ewc_loss": 0.01935611292719841, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.935611362569034e-05, "grad_norm": 14.053669929504395, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8695188760757446, "num_tokens": 127917941.0, "step": 3351 }, { "epoch": 0.4264088538353899, "ewc_loss": 0.019349541515111923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.934954161697533e-05, "grad_norm": 14.20085620880127, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8418489098548889, "num_tokens": 127963345.0, "step": 3352 }, { "epoch": 0.42653606411398043, "ewc_loss": 0.019357461482286453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.935746149683837e-05, "grad_norm": 14.018166542053223, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8638578057289124, "num_tokens": 128004839.0, "step": 3353 }, { "epoch": 0.4266632743925709, "ewc_loss": 0.01931164786219597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9311648429720663e-05, "grad_norm": 14.1181058883667, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8482612371444702, "num_tokens": 128042709.0, "step": 3354 }, { "epoch": 0.42679048467116143, "ewc_loss": 0.01942005753517151, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9420058379182592e-05, "grad_norm": 14.132298469543457, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8414164781570435, "num_tokens": 128085069.0, "step": 3355 }, { "epoch": 0.42691769494975196, "ewc_loss": 0.01932682655751705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.932682607730385e-05, "grad_norm": 14.005901336669922, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8492041826248169, "num_tokens": 128124300.0, "step": 3356 }, { "epoch": 0.42704490522834243, "ewc_loss": 0.019345087930560112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.934508873091545e-05, "grad_norm": 14.075255393981934, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8535365462303162, "num_tokens": 128164410.0, "step": 3357 }, { "epoch": 0.42717211550693296, "ewc_loss": 0.019403230398893356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.940323090821039e-05, "grad_norm": 14.08934211730957, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8657292723655701, "num_tokens": 128206524.0, "step": 3358 }, { "epoch": 0.4272993257855235, "ewc_loss": 0.019364962354302406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.936496300913859e-05, "grad_norm": 14.043869018554688, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.846798837184906, "num_tokens": 128246387.0, "step": 3359 }, { "epoch": 0.42742653606411396, "ewc_loss": 0.019401665776968002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.940166657732334e-05, "grad_norm": 14.141929626464844, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8658362030982971, "num_tokens": 128287794.0, "step": 3360 }, { "epoch": 0.4275537463427045, "ewc_loss": 0.019399570301175117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9399571101530455e-05, "grad_norm": 14.151021957397461, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8523732423782349, "num_tokens": 128328467.0, "step": 3361 }, { "epoch": 0.427680956621295, "ewc_loss": 0.019404368475079536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9404367776587605e-05, "grad_norm": 14.11352252960205, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8644638061523438, "num_tokens": 128364319.0, "step": 3362 }, { "epoch": 0.4278081668998855, "ewc_loss": 0.01937256194651127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9372562746866606e-05, "grad_norm": 14.130367279052734, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8506752252578735, "num_tokens": 128403892.0, "step": 3363 }, { "epoch": 0.427935377178476, "ewc_loss": 0.019397351890802383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.939735193445813e-05, "grad_norm": 14.090413093566895, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8351627588272095, "num_tokens": 128446760.0, "step": 3364 }, { "epoch": 0.42806258745706655, "ewc_loss": 0.01939360611140728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9393606635276228e-05, "grad_norm": 14.162609100341797, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8306021690368652, "num_tokens": 128485498.0, "step": 3365 }, { "epoch": 0.428189797735657, "ewc_loss": 0.019388120621442795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9388120563235134e-05, "grad_norm": 14.111139297485352, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8439720869064331, "num_tokens": 128527871.0, "step": 3366 }, { "epoch": 0.42831700801424755, "ewc_loss": 0.01938353106379509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9383531252969988e-05, "grad_norm": 14.139060974121094, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8424689769744873, "num_tokens": 128567792.0, "step": 3367 }, { "epoch": 0.4284442182928381, "ewc_loss": 0.019428675994277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9428676750976592e-05, "grad_norm": 14.117555618286133, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8454784154891968, "num_tokens": 128607217.0, "step": 3368 }, { "epoch": 0.42857142857142855, "ewc_loss": 0.01938292384147644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9382923710509203e-05, "grad_norm": 14.1475830078125, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8606875538825989, "num_tokens": 128642787.0, "step": 3369 }, { "epoch": 0.4286986388500191, "ewc_loss": 0.01939467340707779, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9394672563066706e-05, "grad_norm": 14.11083984375, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8656641244888306, "num_tokens": 128680462.0, "step": 3370 }, { "epoch": 0.4288258491286096, "ewc_loss": 0.019392354413866997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.939235517056659e-05, "grad_norm": 14.126852989196777, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8511271476745605, "num_tokens": 128715101.0, "step": 3371 }, { "epoch": 0.4289530594072001, "ewc_loss": 0.01944885030388832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.944885116245132e-05, "grad_norm": 14.186952590942383, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8463611602783203, "num_tokens": 128749601.0, "step": 3372 }, { "epoch": 0.4290802696857906, "ewc_loss": 0.019427483901381493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9427483493927866e-05, "grad_norm": 14.07359504699707, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8501479625701904, "num_tokens": 128788744.0, "step": 3373 }, { "epoch": 0.42920747996438113, "ewc_loss": 0.019447287544608116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.944728683156427e-05, "grad_norm": 14.200703620910645, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8497742414474487, "num_tokens": 128828102.0, "step": 3374 }, { "epoch": 0.4293346902429716, "ewc_loss": 0.019429022446274757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9429022358963266e-05, "grad_norm": 14.067004203796387, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8492405414581299, "num_tokens": 128863610.0, "step": 3375 }, { "epoch": 0.42946190052156213, "ewc_loss": 0.01943853311240673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9438533854554407e-05, "grad_norm": 14.19184398651123, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8684408068656921, "num_tokens": 128898836.0, "step": 3376 }, { "epoch": 0.42958911080015266, "ewc_loss": 0.019481869414448738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9481869458104484e-05, "grad_norm": 14.07994270324707, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8558118939399719, "num_tokens": 128934821.0, "step": 3377 }, { "epoch": 0.42971632107874314, "ewc_loss": 0.019484153017401695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9484152289805934e-05, "grad_norm": 14.231078147888184, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8529317378997803, "num_tokens": 128966914.0, "step": 3378 }, { "epoch": 0.42984353135733366, "ewc_loss": 0.01954733580350876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9547336705727503e-05, "grad_norm": 14.121928215026855, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8607110381126404, "num_tokens": 129008305.0, "step": 3379 }, { "epoch": 0.4299707416359242, "ewc_loss": 0.01945331320166588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9453313143458217e-05, "grad_norm": 14.151758193969727, "learning_rate": 1e-06, "loss": 0.3883, "mean_token_accuracy": 0.8752456903457642, "num_tokens": 129043331.0, "step": 3380 }, { "epoch": 0.4300979519145147, "ewc_loss": 0.019542785361409187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.954278559423983e-05, "grad_norm": 14.1474027633667, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8605730533599854, "num_tokens": 129083513.0, "step": 3381 }, { "epoch": 0.4302251621931052, "ewc_loss": 0.019479408860206604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9479408365441486e-05, "grad_norm": 14.094329833984375, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8542840480804443, "num_tokens": 129113463.0, "step": 3382 }, { "epoch": 0.4303523724716957, "ewc_loss": 0.019505038857460022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.950503974512685e-05, "grad_norm": 14.09873104095459, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8307427167892456, "num_tokens": 129153080.0, "step": 3383 }, { "epoch": 0.43047958275028625, "ewc_loss": 0.019539423286914825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9539424101822078e-05, "grad_norm": 14.092208862304688, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8516874313354492, "num_tokens": 129193338.0, "step": 3384 }, { "epoch": 0.4306067930288767, "ewc_loss": 0.01956944726407528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9569446521927603e-05, "grad_norm": 14.102134704589844, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8444675803184509, "num_tokens": 129234816.0, "step": 3385 }, { "epoch": 0.43073400330746725, "ewc_loss": 0.019552282989025116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9552282537915744e-05, "grad_norm": 14.103509902954102, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8569269180297852, "num_tokens": 129275228.0, "step": 3386 }, { "epoch": 0.4308612135860578, "ewc_loss": 0.019570009782910347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9570010408642702e-05, "grad_norm": 14.089376449584961, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.836631178855896, "num_tokens": 129309747.0, "step": 3387 }, { "epoch": 0.43098842386464825, "ewc_loss": 0.019527466967701912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.952746788447257e-05, "grad_norm": 14.103631973266602, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8578166365623474, "num_tokens": 129342580.0, "step": 3388 }, { "epoch": 0.4311156341432388, "ewc_loss": 0.019601892679929733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.960189183591865e-05, "grad_norm": 14.177201271057129, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.849475622177124, "num_tokens": 129376599.0, "step": 3389 }, { "epoch": 0.4312428444218293, "ewc_loss": 0.019581811502575874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9581812011892907e-05, "grad_norm": 14.177654266357422, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8532871007919312, "num_tokens": 129414783.0, "step": 3390 }, { "epoch": 0.4313700547004198, "ewc_loss": 0.01954638957977295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9546389012248255e-05, "grad_norm": 14.072948455810547, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8536167740821838, "num_tokens": 129450287.0, "step": 3391 }, { "epoch": 0.4314972649790103, "ewc_loss": 0.01958184503018856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.958184475370217e-05, "grad_norm": 14.22410774230957, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8364420533180237, "num_tokens": 129486733.0, "step": 3392 }, { "epoch": 0.43162447525760084, "ewc_loss": 0.01961018145084381, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.961018097063061e-05, "grad_norm": 14.11970043182373, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8546376824378967, "num_tokens": 129524392.0, "step": 3393 }, { "epoch": 0.4317516855361913, "ewc_loss": 0.01956179179251194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.956179221451748e-05, "grad_norm": 14.121731758117676, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8409722447395325, "num_tokens": 129558452.0, "step": 3394 }, { "epoch": 0.43187889581478184, "ewc_loss": 0.019633861258625984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.963386057468597e-05, "grad_norm": 14.168853759765625, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8533902168273926, "num_tokens": 129597076.0, "step": 3395 }, { "epoch": 0.43200610609337237, "ewc_loss": 0.019606629386544228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9606630303314887e-05, "grad_norm": 14.093371391296387, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8485661149024963, "num_tokens": 129635861.0, "step": 3396 }, { "epoch": 0.43213331637196284, "ewc_loss": 0.019582495093345642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.958249595190864e-05, "grad_norm": 14.149012565612793, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8493725061416626, "num_tokens": 129673760.0, "step": 3397 }, { "epoch": 0.43226052665055337, "ewc_loss": 0.01962355338037014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9623554180725478e-05, "grad_norm": 14.121382713317871, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8552845120429993, "num_tokens": 129710730.0, "step": 3398 }, { "epoch": 0.4323877369291439, "ewc_loss": 0.0196171086281538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9617109501268715e-05, "grad_norm": 14.223121643066406, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8505702018737793, "num_tokens": 129748724.0, "step": 3399 }, { "epoch": 0.43251494720773437, "ewc_loss": 0.019677171483635902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9677170712384395e-05, "grad_norm": 14.180541038513184, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8673111200332642, "num_tokens": 129787478.0, "step": 3400 }, { "epoch": 0.4326421574863249, "ewc_loss": 0.019581280648708344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9581280866987072e-05, "grad_norm": 14.117831230163574, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8645206689834595, "num_tokens": 129829852.0, "step": 3401 }, { "epoch": 0.4327693677649154, "ewc_loss": 0.019615279510617256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9615279597928748e-05, "grad_norm": 14.16585922241211, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.853483259677887, "num_tokens": 129869279.0, "step": 3402 }, { "epoch": 0.4328965780435059, "ewc_loss": 0.019660139456391335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9660139514598995e-05, "grad_norm": 14.216455459594727, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8491973280906677, "num_tokens": 129903347.0, "step": 3403 }, { "epoch": 0.4330237883220964, "ewc_loss": 0.019604148343205452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.960414920176845e-05, "grad_norm": 14.168998718261719, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8642271161079407, "num_tokens": 129940028.0, "step": 3404 }, { "epoch": 0.43315099860068695, "ewc_loss": 0.01960703171789646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.960703229997307e-05, "grad_norm": 14.201349258422852, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8638485670089722, "num_tokens": 129976438.0, "step": 3405 }, { "epoch": 0.4332782088792774, "ewc_loss": 0.019607098773121834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9607099602581002e-05, "grad_norm": 14.183464050292969, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8595371246337891, "num_tokens": 130015698.0, "step": 3406 }, { "epoch": 0.43340541915786795, "ewc_loss": 0.01958322338759899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.958322354767006e-05, "grad_norm": 14.156157493591309, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8690138459205627, "num_tokens": 130056000.0, "step": 3407 }, { "epoch": 0.4335326294364585, "ewc_loss": 0.01959417387843132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9594173863879405e-05, "grad_norm": 14.21341323852539, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.85202956199646, "num_tokens": 130096990.0, "step": 3408 }, { "epoch": 0.43365983971504896, "ewc_loss": 0.019594186916947365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.959418659680523e-05, "grad_norm": 14.145256042480469, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8639801144599915, "num_tokens": 130128556.0, "step": 3409 }, { "epoch": 0.4337870499936395, "ewc_loss": 0.019561471417546272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9561472072382458e-05, "grad_norm": 14.092442512512207, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8609005808830261, "num_tokens": 130170607.0, "step": 3410 }, { "epoch": 0.43391426027223, "ewc_loss": 0.0195953119546175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9595312551246025e-05, "grad_norm": 14.202789306640625, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8502354621887207, "num_tokens": 130212511.0, "step": 3411 }, { "epoch": 0.4340414705508205, "ewc_loss": 0.01955643855035305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9556438928702846e-05, "grad_norm": 14.08137321472168, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8475171327590942, "num_tokens": 130249246.0, "step": 3412 }, { "epoch": 0.434168680829411, "ewc_loss": 0.01956130936741829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9561310182325542e-05, "grad_norm": 14.119629859924316, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.874706506729126, "num_tokens": 130291250.0, "step": 3413 }, { "epoch": 0.43429589110800154, "ewc_loss": 0.019584575667977333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9584575056796893e-05, "grad_norm": 14.187901496887207, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8342617750167847, "num_tokens": 130327094.0, "step": 3414 }, { "epoch": 0.434423101386592, "ewc_loss": 0.01959182322025299, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9591823729570024e-05, "grad_norm": 14.148019790649414, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8465431332588196, "num_tokens": 130366482.0, "step": 3415 }, { "epoch": 0.43455031166518254, "ewc_loss": 0.019553959369659424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9553959646145813e-05, "grad_norm": 14.125690460205078, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8543967008590698, "num_tokens": 130407416.0, "step": 3416 }, { "epoch": 0.43467752194377307, "ewc_loss": 0.019614635035395622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9614635675679892e-05, "grad_norm": 14.187131881713867, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8603788614273071, "num_tokens": 130447125.0, "step": 3417 }, { "epoch": 0.43480473222236354, "ewc_loss": 0.019572222605347633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9572222299757414e-05, "grad_norm": 14.134322166442871, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8607012033462524, "num_tokens": 130480955.0, "step": 3418 }, { "epoch": 0.43493194250095407, "ewc_loss": 0.019568171352148056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9568171410355717e-05, "grad_norm": 14.117803573608398, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8668766021728516, "num_tokens": 130511771.0, "step": 3419 }, { "epoch": 0.4350591527795446, "ewc_loss": 0.019569875672459602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.956987580342684e-05, "grad_norm": 14.119118690490723, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8532389402389526, "num_tokens": 130550080.0, "step": 3420 }, { "epoch": 0.4351863630581351, "ewc_loss": 0.01961345598101616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.961345515155699e-05, "grad_norm": 14.145015716552734, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8433312177658081, "num_tokens": 130588109.0, "step": 3421 }, { "epoch": 0.4353135733367256, "ewc_loss": 0.019590023905038834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9590024749049917e-05, "grad_norm": 14.092480659484863, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8400533199310303, "num_tokens": 130626832.0, "step": 3422 }, { "epoch": 0.43544078361531613, "ewc_loss": 0.019609550014138222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9609549781307578e-05, "grad_norm": 14.197607040405273, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8590576648712158, "num_tokens": 130662202.0, "step": 3423 }, { "epoch": 0.4355679938939066, "ewc_loss": 0.019620615988969803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.962061651283875e-05, "grad_norm": 14.11144733428955, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8506233096122742, "num_tokens": 130700747.0, "step": 3424 }, { "epoch": 0.43569520417249713, "ewc_loss": 0.019593022763729095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.959302244358696e-05, "grad_norm": 14.159111022949219, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8526349663734436, "num_tokens": 130742849.0, "step": 3425 }, { "epoch": 0.43582241445108766, "ewc_loss": 0.019588910043239594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9588909708545543e-05, "grad_norm": 14.053775787353516, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8682574033737183, "num_tokens": 130780156.0, "step": 3426 }, { "epoch": 0.43594962472967813, "ewc_loss": 0.019581768661737442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9581768356147222e-05, "grad_norm": 14.141417503356934, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8544107675552368, "num_tokens": 130825824.0, "step": 3427 }, { "epoch": 0.43607683500826866, "ewc_loss": 0.019618969410657883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9618970327428542e-05, "grad_norm": 14.121752738952637, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8636379241943359, "num_tokens": 130863603.0, "step": 3428 }, { "epoch": 0.4362040452868592, "ewc_loss": 0.019577326253056526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9577326384023763e-05, "grad_norm": 14.072735786437988, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8565112948417664, "num_tokens": 130906689.0, "step": 3429 }, { "epoch": 0.4363312555654497, "ewc_loss": 0.01964053139090538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.964053080882877e-05, "grad_norm": 14.198784828186035, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8520069718360901, "num_tokens": 130947785.0, "step": 3430 }, { "epoch": 0.4364584658440402, "ewc_loss": 0.019666137173771858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9666136722662486e-05, "grad_norm": 14.205352783203125, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8656411170959473, "num_tokens": 130985489.0, "step": 3431 }, { "epoch": 0.4365856761226307, "ewc_loss": 0.019595002755522728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9595003323047422e-05, "grad_norm": 14.112862586975098, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8639603853225708, "num_tokens": 131025274.0, "step": 3432 }, { "epoch": 0.43671288640122125, "ewc_loss": 0.019570432603359222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9570432414184324e-05, "grad_norm": 14.171486854553223, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8534504175186157, "num_tokens": 131060132.0, "step": 3433 }, { "epoch": 0.4368400966798117, "ewc_loss": 0.01960655488073826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.960655572474934e-05, "grad_norm": 14.083374977111816, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8444598913192749, "num_tokens": 131105109.0, "step": 3434 }, { "epoch": 0.43696730695840225, "ewc_loss": 0.019616801291704178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9616802092059515e-05, "grad_norm": 14.253169059753418, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8700342178344727, "num_tokens": 131142094.0, "step": 3435 }, { "epoch": 0.4370945172369928, "ewc_loss": 0.01960122399032116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.960122426680755e-05, "grad_norm": 14.110901832580566, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8544918298721313, "num_tokens": 131174648.0, "step": 3436 }, { "epoch": 0.43722172751558325, "ewc_loss": 0.019570806995034218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9570807126001455e-05, "grad_norm": 14.141881942749023, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8705410957336426, "num_tokens": 131213447.0, "step": 3437 }, { "epoch": 0.4373489377941738, "ewc_loss": 0.019598446786403656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9598446669988334e-05, "grad_norm": 14.186437606811523, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8511804938316345, "num_tokens": 131249622.0, "step": 3438 }, { "epoch": 0.4374761480727643, "ewc_loss": 0.01959841325879097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.959841392817907e-05, "grad_norm": 14.203768730163574, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8714756965637207, "num_tokens": 131285883.0, "step": 3439 }, { "epoch": 0.4376033583513548, "ewc_loss": 0.01960170455276966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9601704480010085e-05, "grad_norm": 14.191864013671875, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8594889640808105, "num_tokens": 131322774.0, "step": 3440 }, { "epoch": 0.4377305686299453, "ewc_loss": 0.019585376605391502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9585377231123857e-05, "grad_norm": 14.10799789428711, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8672598004341125, "num_tokens": 131364513.0, "step": 3441 }, { "epoch": 0.43785777890853583, "ewc_loss": 0.01955883391201496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9558834537747316e-05, "grad_norm": 14.178154945373535, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8460181355476379, "num_tokens": 131402299.0, "step": 3442 }, { "epoch": 0.4379849891871263, "ewc_loss": 0.01961134374141693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9611343304859474e-05, "grad_norm": 14.14601993560791, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8628867864608765, "num_tokens": 131447830.0, "step": 3443 }, { "epoch": 0.43811219946571683, "ewc_loss": 0.019553955644369125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9553956008167006e-05, "grad_norm": 14.178152084350586, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8485828042030334, "num_tokens": 131481058.0, "step": 3444 }, { "epoch": 0.43823940974430736, "ewc_loss": 0.019636396318674088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.963639624591451e-05, "grad_norm": 14.211703300476074, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.85169517993927, "num_tokens": 131518376.0, "step": 3445 }, { "epoch": 0.43836662002289783, "ewc_loss": 0.01957523822784424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9575238184188493e-05, "grad_norm": 14.156623840332031, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8409346342086792, "num_tokens": 131553702.0, "step": 3446 }, { "epoch": 0.43849383030148836, "ewc_loss": 0.019613469019532204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.961346970347222e-05, "grad_norm": 14.217150688171387, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8634428977966309, "num_tokens": 131587026.0, "step": 3447 }, { "epoch": 0.4386210405800789, "ewc_loss": 0.019597548991441727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9597549908212386e-05, "grad_norm": 14.173466682434082, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8618197441101074, "num_tokens": 131621145.0, "step": 3448 }, { "epoch": 0.43874825085866936, "ewc_loss": 0.019641460850834846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9641460312413983e-05, "grad_norm": 14.187411308288574, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8613103628158569, "num_tokens": 131661618.0, "step": 3449 }, { "epoch": 0.4388754611372599, "ewc_loss": 0.019649667665362358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9649667592602782e-05, "grad_norm": 14.207792282104492, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8492564558982849, "num_tokens": 131693840.0, "step": 3450 }, { "epoch": 0.4390026714158504, "ewc_loss": 0.01962941884994507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.962941860256251e-05, "grad_norm": 14.155147552490234, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8580856323242188, "num_tokens": 131730856.0, "step": 3451 }, { "epoch": 0.4391298816944409, "ewc_loss": 0.019657118245959282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9657118173199706e-05, "grad_norm": 14.187398910522461, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.8328554630279541, "num_tokens": 131769172.0, "step": 3452 }, { "epoch": 0.4392570919730314, "ewc_loss": 0.019683392718434334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9683393475133926e-05, "grad_norm": 14.135078430175781, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8581900000572205, "num_tokens": 131810481.0, "step": 3453 }, { "epoch": 0.43938430225162195, "ewc_loss": 0.019646847620606422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9646848159027286e-05, "grad_norm": 14.168911933898926, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8578238487243652, "num_tokens": 131850742.0, "step": 3454 }, { "epoch": 0.4395115125302124, "ewc_loss": 0.019689586013555527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9689585315063596e-05, "grad_norm": 14.157771110534668, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8542618155479431, "num_tokens": 131886675.0, "step": 3455 }, { "epoch": 0.43963872280880295, "ewc_loss": 0.019673245027661324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9673245333251543e-05, "grad_norm": 14.158385276794434, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8617328405380249, "num_tokens": 131926831.0, "step": 3456 }, { "epoch": 0.4397659330873935, "ewc_loss": 0.01969541609287262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9695416995091364e-05, "grad_norm": 14.157670974731445, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8540767431259155, "num_tokens": 131962265.0, "step": 3457 }, { "epoch": 0.43989314336598395, "ewc_loss": 0.01969229429960251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9692293790285476e-05, "grad_norm": 14.163681983947754, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8508315086364746, "num_tokens": 131999906.0, "step": 3458 }, { "epoch": 0.4400203536445745, "ewc_loss": 0.019715428352355957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9715427697519772e-05, "grad_norm": 14.170503616333008, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8722906708717346, "num_tokens": 132031330.0, "step": 3459 }, { "epoch": 0.440147563923165, "ewc_loss": 0.01971699483692646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9716995666385628e-05, "grad_norm": 14.184944152832031, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.85439532995224, "num_tokens": 132065049.0, "step": 3460 }, { "epoch": 0.4402747742017555, "ewc_loss": 0.019746365025639534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.974636506929528e-05, "grad_norm": 14.200739860534668, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8598147630691528, "num_tokens": 132100993.0, "step": 3461 }, { "epoch": 0.440401984480346, "ewc_loss": 0.019697941839694977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9697941752383485e-05, "grad_norm": 14.18056583404541, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8537242412567139, "num_tokens": 132138169.0, "step": 3462 }, { "epoch": 0.44052919475893654, "ewc_loss": 0.019722137600183487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9722137949429452e-05, "grad_norm": 14.136842727661133, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8630506992340088, "num_tokens": 132170498.0, "step": 3463 }, { "epoch": 0.440656405037527, "ewc_loss": 0.019733566790819168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.973356665985193e-05, "grad_norm": 14.200590133666992, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8378470540046692, "num_tokens": 132213583.0, "step": 3464 }, { "epoch": 0.44078361531611754, "ewc_loss": 0.019764255732297897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9764256649068557e-05, "grad_norm": 14.125092506408691, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8513409495353699, "num_tokens": 132255427.0, "step": 3465 }, { "epoch": 0.44091082559470807, "ewc_loss": 0.01973196119070053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.97319604922086e-05, "grad_norm": 14.247498512268066, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8542537689208984, "num_tokens": 132295795.0, "step": 3466 }, { "epoch": 0.44103803587329854, "ewc_loss": 0.01977018639445305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9770186554524116e-05, "grad_norm": 14.159501075744629, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8383450508117676, "num_tokens": 132336881.0, "step": 3467 }, { "epoch": 0.44116524615188907, "ewc_loss": 0.01970144733786583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9701446944964118e-05, "grad_norm": 14.158531188964844, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8558424711227417, "num_tokens": 132375136.0, "step": 3468 }, { "epoch": 0.4412924564304796, "ewc_loss": 0.019772719591856003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9772720406763256e-05, "grad_norm": 14.20228385925293, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8530651330947876, "num_tokens": 132412893.0, "step": 3469 }, { "epoch": 0.44141966670907007, "ewc_loss": 0.019766787067055702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.976678686332889e-05, "grad_norm": 14.227015495300293, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8495328426361084, "num_tokens": 132444360.0, "step": 3470 }, { "epoch": 0.4415468769876606, "ewc_loss": 0.019737087190151215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9737086404347792e-05, "grad_norm": 14.200145721435547, "learning_rate": 1e-06, "loss": 0.5487, "mean_token_accuracy": 0.8292059898376465, "num_tokens": 132485022.0, "step": 3471 }, { "epoch": 0.4416740872662511, "ewc_loss": 0.019749630242586136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9749630155274644e-05, "grad_norm": 14.192525863647461, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8410972356796265, "num_tokens": 132522800.0, "step": 3472 }, { "epoch": 0.4418012975448416, "ewc_loss": 0.01979735493659973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9797354980255477e-05, "grad_norm": 14.197168350219727, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8579010963439941, "num_tokens": 132560527.0, "step": 3473 }, { "epoch": 0.4419285078234321, "ewc_loss": 0.01979242078959942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9792420062003657e-05, "grad_norm": 14.19719409942627, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8526227474212646, "num_tokens": 132601075.0, "step": 3474 }, { "epoch": 0.44205571810202265, "ewc_loss": 0.01975071243941784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9750712453969754e-05, "grad_norm": 14.201419830322266, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.845802903175354, "num_tokens": 132639851.0, "step": 3475 }, { "epoch": 0.4421829283806131, "ewc_loss": 0.019786110147833824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9786109987762757e-05, "grad_norm": 14.13321590423584, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.857459306716919, "num_tokens": 132679849.0, "step": 3476 }, { "epoch": 0.44231013865920366, "ewc_loss": 0.019793374463915825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.979337503144052e-05, "grad_norm": 14.245123863220215, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8598997592926025, "num_tokens": 132721020.0, "step": 3477 }, { "epoch": 0.4424373489377942, "ewc_loss": 0.019773798063397408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9773797248490155e-05, "grad_norm": 14.180889129638672, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8424762487411499, "num_tokens": 132760933.0, "step": 3478 }, { "epoch": 0.44256455921638466, "ewc_loss": 0.01973653770983219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.973653706954792e-05, "grad_norm": 14.173493385314941, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8663647174835205, "num_tokens": 132800627.0, "step": 3479 }, { "epoch": 0.4426917694949752, "ewc_loss": 0.019767332822084427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9767332560149953e-05, "grad_norm": 14.208321571350098, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8547406196594238, "num_tokens": 132833232.0, "step": 3480 }, { "epoch": 0.4428189797735657, "ewc_loss": 0.01974070817232132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9740708012250252e-05, "grad_norm": 14.247900009155273, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8536231517791748, "num_tokens": 132874670.0, "step": 3481 }, { "epoch": 0.44294619005215624, "ewc_loss": 0.01974727213382721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.974727274500765e-05, "grad_norm": 14.17786979675293, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8565479516983032, "num_tokens": 132916187.0, "step": 3482 }, { "epoch": 0.4430734003307467, "ewc_loss": 0.019708866253495216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9708866602741182e-05, "grad_norm": 14.238329887390137, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8579931259155273, "num_tokens": 132956948.0, "step": 3483 }, { "epoch": 0.44320061060933724, "ewc_loss": 0.019705861806869507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9705861632246524e-05, "grad_norm": 14.144256591796875, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8601813316345215, "num_tokens": 132995213.0, "step": 3484 }, { "epoch": 0.44332782088792777, "ewc_loss": 0.01973438449203968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9734385205083527e-05, "grad_norm": 14.267431259155273, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8521209359169006, "num_tokens": 133038850.0, "step": 3485 }, { "epoch": 0.44345503116651824, "ewc_loss": 0.01975199207663536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9751991203520447e-05, "grad_norm": 14.223523139953613, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.856870174407959, "num_tokens": 133080766.0, "step": 3486 }, { "epoch": 0.44358224144510877, "ewc_loss": 0.019674211740493774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9674211216624826e-05, "grad_norm": 14.181896209716797, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8515698909759521, "num_tokens": 133120845.0, "step": 3487 }, { "epoch": 0.4437094517236993, "ewc_loss": 0.01969439908862114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9694398361025378e-05, "grad_norm": 14.213850975036621, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8529244661331177, "num_tokens": 133166011.0, "step": 3488 }, { "epoch": 0.4438366620022898, "ewc_loss": 0.019709870219230652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.970987068489194e-05, "grad_norm": 14.213820457458496, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8537575006484985, "num_tokens": 133203507.0, "step": 3489 }, { "epoch": 0.4439638722808803, "ewc_loss": 0.019671646878123283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9671646441565827e-05, "grad_norm": 14.206487655639648, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8319169878959656, "num_tokens": 133240554.0, "step": 3490 }, { "epoch": 0.44409108255947083, "ewc_loss": 0.019676703959703445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9676703232107684e-05, "grad_norm": 14.183257102966309, "learning_rate": 1e-06, "loss": 0.5109, "mean_token_accuracy": 0.8382911682128906, "num_tokens": 133274987.0, "step": 3491 }, { "epoch": 0.4442182928380613, "ewc_loss": 0.01968679577112198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9686794985318556e-05, "grad_norm": 14.240653038024902, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.8277773857116699, "num_tokens": 133319287.0, "step": 3492 }, { "epoch": 0.44434550311665183, "ewc_loss": 0.01968522183597088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.968522155948449e-05, "grad_norm": 14.1650972366333, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8599956631660461, "num_tokens": 133353670.0, "step": 3493 }, { "epoch": 0.44447271339524236, "ewc_loss": 0.019698332995176315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9698332835105248e-05, "grad_norm": 14.309684753417969, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8565621376037598, "num_tokens": 133391950.0, "step": 3494 }, { "epoch": 0.44459992367383283, "ewc_loss": 0.01971765235066414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.971765232156031e-05, "grad_norm": 14.12631893157959, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8603265881538391, "num_tokens": 133424758.0, "step": 3495 }, { "epoch": 0.44472713395242336, "ewc_loss": 0.019679918885231018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9679919205373153e-05, "grad_norm": 14.280010223388672, "learning_rate": 1e-06, "loss": 0.3611, "mean_token_accuracy": 0.883103609085083, "num_tokens": 133459869.0, "step": 3496 }, { "epoch": 0.4448543442310139, "ewc_loss": 0.019783472642302513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9783472453127615e-05, "grad_norm": 14.233431816101074, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8528376817703247, "num_tokens": 133494598.0, "step": 3497 }, { "epoch": 0.44498155450960436, "ewc_loss": 0.01968451589345932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9684515791595913e-05, "grad_norm": 14.229138374328613, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8643775582313538, "num_tokens": 133536403.0, "step": 3498 }, { "epoch": 0.4451087647881949, "ewc_loss": 0.019756658002734184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9756658730329946e-05, "grad_norm": 14.26500415802002, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8336522579193115, "num_tokens": 133567319.0, "step": 3499 }, { "epoch": 0.4452359750667854, "ewc_loss": 0.01970958337187767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.970958328456618e-05, "grad_norm": 14.181035995483398, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8478561043739319, "num_tokens": 133604829.0, "step": 3500 }, { "epoch": 0.4453631853453759, "ewc_loss": 0.019754059612751007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.975405939447228e-05, "grad_norm": 14.28298568725586, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8551913499832153, "num_tokens": 133638373.0, "step": 3501 }, { "epoch": 0.4454903956239664, "ewc_loss": 0.019750745967030525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9750745195779018e-05, "grad_norm": 14.202201843261719, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8657190203666687, "num_tokens": 133674110.0, "step": 3502 }, { "epoch": 0.44561760590255695, "ewc_loss": 0.019707757979631424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9707758838194422e-05, "grad_norm": 14.221110343933105, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8557007908821106, "num_tokens": 133711005.0, "step": 3503 }, { "epoch": 0.4457448161811474, "ewc_loss": 0.0197740588337183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9774059182964265e-05, "grad_norm": 14.231972694396973, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8669542074203491, "num_tokens": 133746258.0, "step": 3504 }, { "epoch": 0.44587202645973795, "ewc_loss": 0.019761674106121063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.976167368411552e-05, "grad_norm": 14.189016342163086, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.86310875415802, "num_tokens": 133790695.0, "step": 3505 }, { "epoch": 0.4459992367383285, "ewc_loss": 0.01974787376821041, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.974787301151082e-05, "grad_norm": 14.129891395568848, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8515157699584961, "num_tokens": 133830181.0, "step": 3506 }, { "epoch": 0.44612644701691895, "ewc_loss": 0.019763432443141937, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.976343264686875e-05, "grad_norm": 14.178295135498047, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8586249351501465, "num_tokens": 133866849.0, "step": 3507 }, { "epoch": 0.4462536572955095, "ewc_loss": 0.019802121445536613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9802120732492767e-05, "grad_norm": 14.210684776306152, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8544373512268066, "num_tokens": 133906812.0, "step": 3508 }, { "epoch": 0.4463808675741, "ewc_loss": 0.01976930722594261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9769307982642204e-05, "grad_norm": 14.176983833312988, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8485328555107117, "num_tokens": 133948956.0, "step": 3509 }, { "epoch": 0.4465080778526905, "ewc_loss": 0.019797028973698616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9797029381152242e-05, "grad_norm": 14.201475143432617, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8377654552459717, "num_tokens": 133988764.0, "step": 3510 }, { "epoch": 0.446635288131281, "ewc_loss": 0.01981074921786785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.981075001822319e-05, "grad_norm": 14.198975563049316, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8601415157318115, "num_tokens": 134028148.0, "step": 3511 }, { "epoch": 0.44676249840987153, "ewc_loss": 0.019817855209112167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.981785499083344e-05, "grad_norm": 14.283598899841309, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8309741020202637, "num_tokens": 134065909.0, "step": 3512 }, { "epoch": 0.446889708688462, "ewc_loss": 0.019833127036690712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.983312722586561e-05, "grad_norm": 14.273996353149414, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.861332893371582, "num_tokens": 134108342.0, "step": 3513 }, { "epoch": 0.44701691896705253, "ewc_loss": 0.019773205742239952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9773206076934002e-05, "grad_norm": 14.202134132385254, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.854331910610199, "num_tokens": 134145928.0, "step": 3514 }, { "epoch": 0.44714412924564306, "ewc_loss": 0.01980271004140377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9802710085059516e-05, "grad_norm": 14.274420738220215, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8517404198646545, "num_tokens": 134185955.0, "step": 3515 }, { "epoch": 0.44727133952423354, "ewc_loss": 0.01977827586233616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9778275600401685e-05, "grad_norm": 14.18723201751709, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8609668612480164, "num_tokens": 134225829.0, "step": 3516 }, { "epoch": 0.44739854980282406, "ewc_loss": 0.019780848175287247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.97808476514183e-05, "grad_norm": 14.328500747680664, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.845471978187561, "num_tokens": 134261381.0, "step": 3517 }, { "epoch": 0.4475257600814146, "ewc_loss": 0.019801830872893333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9801831513177603e-05, "grad_norm": 14.185752868652344, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8623200058937073, "num_tokens": 134294697.0, "step": 3518 }, { "epoch": 0.44765297036000506, "ewc_loss": 0.01973273605108261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.973273538169451e-05, "grad_norm": 14.304436683654785, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8752931356430054, "num_tokens": 134334349.0, "step": 3519 }, { "epoch": 0.4477801806385956, "ewc_loss": 0.01981097273528576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.981097193493042e-05, "grad_norm": 14.172369003295898, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8443942070007324, "num_tokens": 134373381.0, "step": 3520 }, { "epoch": 0.4479073909171861, "ewc_loss": 0.019732138141989708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9732138753170148e-05, "grad_norm": 14.206059455871582, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8605448603630066, "num_tokens": 134407952.0, "step": 3521 }, { "epoch": 0.4480346011957766, "ewc_loss": 0.019836172461509705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9836172214127146e-05, "grad_norm": 14.240791320800781, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8448480367660522, "num_tokens": 134458862.0, "step": 3522 }, { "epoch": 0.4481618114743671, "ewc_loss": 0.01977340690791607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9773406165768392e-05, "grad_norm": 14.232757568359375, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8524703979492188, "num_tokens": 134494007.0, "step": 3523 }, { "epoch": 0.44828902175295765, "ewc_loss": 0.01978340931236744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.978340878849849e-05, "grad_norm": 14.253816604614258, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8406974077224731, "num_tokens": 134534195.0, "step": 3524 }, { "epoch": 0.4484162320315481, "ewc_loss": 0.019795197993516922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9795197658822872e-05, "grad_norm": 14.267553329467773, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8638121485710144, "num_tokens": 134570241.0, "step": 3525 }, { "epoch": 0.44854344231013865, "ewc_loss": 0.019784141331911087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.978414184122812e-05, "grad_norm": 14.233017921447754, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8528991937637329, "num_tokens": 134611904.0, "step": 3526 }, { "epoch": 0.4486706525887292, "ewc_loss": 0.019803492352366447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.980349225050304e-05, "grad_norm": 14.323415756225586, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8454536199569702, "num_tokens": 134644426.0, "step": 3527 }, { "epoch": 0.44879786286731965, "ewc_loss": 0.019805582240223885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9805582269327715e-05, "grad_norm": 14.305109977722168, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.863929033279419, "num_tokens": 134678979.0, "step": 3528 }, { "epoch": 0.4489250731459102, "ewc_loss": 0.019739210605621338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9739210983971134e-05, "grad_norm": 14.259018898010254, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8576574921607971, "num_tokens": 134717270.0, "step": 3529 }, { "epoch": 0.4490522834245007, "ewc_loss": 0.019782654941082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9782655726885423e-05, "grad_norm": 14.300972938537598, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.855330228805542, "num_tokens": 134755464.0, "step": 3530 }, { "epoch": 0.44917949370309124, "ewc_loss": 0.019766081124544144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9766081095440313e-05, "grad_norm": 14.204724311828613, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8471639156341553, "num_tokens": 134793737.0, "step": 3531 }, { "epoch": 0.4493067039816817, "ewc_loss": 0.019769269973039627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.976926978386473e-05, "grad_norm": 14.258584976196289, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8497878313064575, "num_tokens": 134834332.0, "step": 3532 }, { "epoch": 0.44943391426027224, "ewc_loss": 0.01980908028781414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9809080185950734e-05, "grad_norm": 14.257726669311523, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8655776977539062, "num_tokens": 134871027.0, "step": 3533 }, { "epoch": 0.44956112453886277, "ewc_loss": 0.01980910822749138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9809107470791787e-05, "grad_norm": 14.3059720993042, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8542467355728149, "num_tokens": 134908529.0, "step": 3534 }, { "epoch": 0.44968833481745324, "ewc_loss": 0.019824311137199402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9824310584226623e-05, "grad_norm": 14.278202056884766, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8639249205589294, "num_tokens": 134944177.0, "step": 3535 }, { "epoch": 0.44981554509604377, "ewc_loss": 0.019781535491347313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.978153522941284e-05, "grad_norm": 14.3064603805542, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8535642027854919, "num_tokens": 134977928.0, "step": 3536 }, { "epoch": 0.4499427553746343, "ewc_loss": 0.019792955368757248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.97929548448883e-05, "grad_norm": 14.18521499633789, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8552079200744629, "num_tokens": 135017554.0, "step": 3537 }, { "epoch": 0.45006996565322477, "ewc_loss": 0.01974686235189438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9746861653402448e-05, "grad_norm": 14.27285099029541, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8614540100097656, "num_tokens": 135058694.0, "step": 3538 }, { "epoch": 0.4501971759318153, "ewc_loss": 0.019870053976774216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9870054529746994e-05, "grad_norm": 14.288080215454102, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8569430112838745, "num_tokens": 135094037.0, "step": 3539 }, { "epoch": 0.4503243862104058, "ewc_loss": 0.019776878878474236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.977687861653976e-05, "grad_norm": 14.217491149902344, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8485692739486694, "num_tokens": 135132758.0, "step": 3540 }, { "epoch": 0.4504515964889963, "ewc_loss": 0.01979813538491726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9798135326709598e-05, "grad_norm": 14.369399070739746, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8680418133735657, "num_tokens": 135169398.0, "step": 3541 }, { "epoch": 0.4505788067675868, "ewc_loss": 0.019827330484986305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.982733010663651e-05, "grad_norm": 14.17642879486084, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8407271504402161, "num_tokens": 135204312.0, "step": 3542 }, { "epoch": 0.45070601704617735, "ewc_loss": 0.0197389367967844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.97389363165712e-05, "grad_norm": 14.267078399658203, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8522837162017822, "num_tokens": 135246287.0, "step": 3543 }, { "epoch": 0.4508332273247678, "ewc_loss": 0.019884852692484856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.988485200854484e-05, "grad_norm": 14.29403018951416, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8578860759735107, "num_tokens": 135280970.0, "step": 3544 }, { "epoch": 0.45096043760335836, "ewc_loss": 0.01975812390446663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.97581230167998e-05, "grad_norm": 14.176974296569824, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8770654201507568, "num_tokens": 135317873.0, "step": 3545 }, { "epoch": 0.4510876478819489, "ewc_loss": 0.019831158220767975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9831159079330973e-05, "grad_norm": 14.277067184448242, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8624812364578247, "num_tokens": 135354994.0, "step": 3546 }, { "epoch": 0.45121485816053936, "ewc_loss": 0.019821379333734512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.982138019229751e-05, "grad_norm": 14.242937088012695, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8497970700263977, "num_tokens": 135393050.0, "step": 3547 }, { "epoch": 0.4513420684391299, "ewc_loss": 0.019782779738307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9782779418164864e-05, "grad_norm": 14.231815338134766, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8599147796630859, "num_tokens": 135431704.0, "step": 3548 }, { "epoch": 0.4514692787177204, "ewc_loss": 0.019848648458719254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9848648662446067e-05, "grad_norm": 14.274539947509766, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.83284592628479, "num_tokens": 135471306.0, "step": 3549 }, { "epoch": 0.4515964889963109, "ewc_loss": 0.01982029341161251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9820294255623594e-05, "grad_norm": 14.222973823547363, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8430781364440918, "num_tokens": 135509647.0, "step": 3550 }, { "epoch": 0.4517236992749014, "ewc_loss": 0.01980622299015522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9806222553597763e-05, "grad_norm": 14.171451568603516, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8631740808486938, "num_tokens": 135550469.0, "step": 3551 }, { "epoch": 0.45185090955349194, "ewc_loss": 0.019849443808197975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9849443560815416e-05, "grad_norm": 14.314149856567383, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8550105094909668, "num_tokens": 135589290.0, "step": 3552 }, { "epoch": 0.4519781198320824, "ewc_loss": 0.019897008314728737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9897008314728737e-05, "grad_norm": 14.257641792297363, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8501731157302856, "num_tokens": 135624416.0, "step": 3553 }, { "epoch": 0.45210533011067294, "ewc_loss": 0.019824128597974777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.982412868528627e-05, "grad_norm": 14.326964378356934, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8579700589179993, "num_tokens": 135662316.0, "step": 3554 }, { "epoch": 0.45223254038926347, "ewc_loss": 0.019846947863698006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.984694790735375e-05, "grad_norm": 14.154228210449219, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8567178249359131, "num_tokens": 135701739.0, "step": 3555 }, { "epoch": 0.45235975066785394, "ewc_loss": 0.01981201022863388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9812010577879846e-05, "grad_norm": 14.40899658203125, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8518755435943604, "num_tokens": 135741372.0, "step": 3556 }, { "epoch": 0.45248696094644447, "ewc_loss": 0.019875474274158478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.987547511816956e-05, "grad_norm": 14.182928085327148, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8778095245361328, "num_tokens": 135779945.0, "step": 3557 }, { "epoch": 0.452614171225035, "ewc_loss": 0.019788814708590508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.978881482500583e-05, "grad_norm": 14.345237731933594, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8496644496917725, "num_tokens": 135813179.0, "step": 3558 }, { "epoch": 0.4527413815036255, "ewc_loss": 0.0199190191924572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9919019905501045e-05, "grad_norm": 14.317383766174316, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8578779101371765, "num_tokens": 135855147.0, "step": 3559 }, { "epoch": 0.452868591782216, "ewc_loss": 0.019756317138671875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.975631676032208e-05, "grad_norm": 14.202584266662598, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8639975190162659, "num_tokens": 135892990.0, "step": 3560 }, { "epoch": 0.45299580206080653, "ewc_loss": 0.01982787624001503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9827875803457573e-05, "grad_norm": 14.279451370239258, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.873954176902771, "num_tokens": 135926911.0, "step": 3561 }, { "epoch": 0.453123012339397, "ewc_loss": 0.019836392253637314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9836392311844975e-05, "grad_norm": 14.254624366760254, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8643377423286438, "num_tokens": 135963530.0, "step": 3562 }, { "epoch": 0.45325022261798753, "ewc_loss": 0.01981198415160179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9811983293038793e-05, "grad_norm": 14.349251747131348, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8457284569740295, "num_tokens": 136000677.0, "step": 3563 }, { "epoch": 0.45337743289657806, "ewc_loss": 0.019864704459905624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9864704881911166e-05, "grad_norm": 14.312455177307129, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8460076451301575, "num_tokens": 136037283.0, "step": 3564 }, { "epoch": 0.45350464317516853, "ewc_loss": 0.019798969849944115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9798970242845826e-05, "grad_norm": 14.3284912109375, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8380892276763916, "num_tokens": 136071454.0, "step": 3565 }, { "epoch": 0.45363185345375906, "ewc_loss": 0.019808808341622353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9808809156529605e-05, "grad_norm": 14.242374420166016, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8445329070091248, "num_tokens": 136105638.0, "step": 3566 }, { "epoch": 0.4537590637323496, "ewc_loss": 0.01979987695813179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.979987609956879e-05, "grad_norm": 14.212850570678711, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.85890793800354, "num_tokens": 136147968.0, "step": 3567 }, { "epoch": 0.45388627401094006, "ewc_loss": 0.01990107260644436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.990107193705626e-05, "grad_norm": 14.320141792297363, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.855993390083313, "num_tokens": 136190468.0, "step": 3568 }, { "epoch": 0.4540134842895306, "ewc_loss": 0.019879652187228203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9879651517840102e-05, "grad_norm": 14.238784790039062, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8596710562705994, "num_tokens": 136229363.0, "step": 3569 }, { "epoch": 0.4541406945681211, "ewc_loss": 0.019817369058728218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9817369320662692e-05, "grad_norm": 14.22606372833252, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8491231203079224, "num_tokens": 136268834.0, "step": 3570 }, { "epoch": 0.4542679048467116, "ewc_loss": 0.019867273047566414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9867273294948973e-05, "grad_norm": 14.23515510559082, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8327484726905823, "num_tokens": 136306761.0, "step": 3571 }, { "epoch": 0.4543951151253021, "ewc_loss": 0.01987369917333126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.98736997845117e-05, "grad_norm": 14.277487754821777, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8546836376190186, "num_tokens": 136347736.0, "step": 3572 }, { "epoch": 0.45452232540389265, "ewc_loss": 0.019917434081435204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9917433746741153e-05, "grad_norm": 14.234258651733398, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8580834865570068, "num_tokens": 136381009.0, "step": 3573 }, { "epoch": 0.4546495356824831, "ewc_loss": 0.019912246614694595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.991224598896224e-05, "grad_norm": 14.270185470581055, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8649867177009583, "num_tokens": 136418050.0, "step": 3574 }, { "epoch": 0.45477674596107365, "ewc_loss": 0.019950054585933685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9950055502704345e-05, "grad_norm": 14.272666931152344, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8377222418785095, "num_tokens": 136459109.0, "step": 3575 }, { "epoch": 0.4549039562396642, "ewc_loss": 0.019943948835134506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.994394915527664e-05, "grad_norm": 14.251045227050781, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8520288467407227, "num_tokens": 136497993.0, "step": 3576 }, { "epoch": 0.45503116651825465, "ewc_loss": 0.019923219457268715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9923219952033833e-05, "grad_norm": 14.320671081542969, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8574982285499573, "num_tokens": 136534446.0, "step": 3577 }, { "epoch": 0.4551583767968452, "ewc_loss": 0.01997169852256775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9971697838627733e-05, "grad_norm": 14.268594741821289, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8400691747665405, "num_tokens": 136571117.0, "step": 3578 }, { "epoch": 0.4552855870754357, "ewc_loss": 0.019912945106625557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9912944480893202e-05, "grad_norm": 14.277213096618652, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.862817645072937, "num_tokens": 136601706.0, "step": 3579 }, { "epoch": 0.4554127973540262, "ewc_loss": 0.01995575800538063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.995575803448446e-05, "grad_norm": 14.296119689941406, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8557894229888916, "num_tokens": 136639735.0, "step": 3580 }, { "epoch": 0.4555400076326167, "ewc_loss": 0.01995864324271679, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9958642951678485e-05, "grad_norm": 14.229979515075684, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.848102331161499, "num_tokens": 136675647.0, "step": 3581 }, { "epoch": 0.45566721791120723, "ewc_loss": 0.019924012944102287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.992401303141378e-05, "grad_norm": 14.234529495239258, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8627500534057617, "num_tokens": 136713361.0, "step": 3582 }, { "epoch": 0.45579442818979776, "ewc_loss": 0.01998738758265972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.998738844122272e-05, "grad_norm": 14.333166122436523, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8544498682022095, "num_tokens": 136751440.0, "step": 3583 }, { "epoch": 0.45592163846838824, "ewc_loss": 0.01994401402771473, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.994401463889517e-05, "grad_norm": 14.280280113220215, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8509032130241394, "num_tokens": 136786078.0, "step": 3584 }, { "epoch": 0.45604884874697876, "ewc_loss": 0.01994628645479679, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9946286556660198e-05, "grad_norm": 14.32725715637207, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8628051280975342, "num_tokens": 136828313.0, "step": 3585 }, { "epoch": 0.4561760590255693, "ewc_loss": 0.019947683438658714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.994768354052212e-05, "grad_norm": 14.256603240966797, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8539674282073975, "num_tokens": 136862549.0, "step": 3586 }, { "epoch": 0.45630326930415976, "ewc_loss": 0.019970204681158066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9970204448327422e-05, "grad_norm": 14.371767044067383, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8584544658660889, "num_tokens": 136906423.0, "step": 3587 }, { "epoch": 0.4564304795827503, "ewc_loss": 0.019970053806900978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9970053472206928e-05, "grad_norm": 14.336770057678223, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8431974649429321, "num_tokens": 136941550.0, "step": 3588 }, { "epoch": 0.4565576898613408, "ewc_loss": 0.019939016550779343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9939016056014225e-05, "grad_norm": 14.273336410522461, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8587762117385864, "num_tokens": 136980234.0, "step": 3589 }, { "epoch": 0.4566849001399313, "ewc_loss": 0.019968025386333466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9968025299021974e-05, "grad_norm": 14.306539535522461, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.859332263469696, "num_tokens": 137023991.0, "step": 3590 }, { "epoch": 0.4568121104185218, "ewc_loss": 0.019956685602664948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.995668571908027e-05, "grad_norm": 14.332324981689453, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8462704420089722, "num_tokens": 137063261.0, "step": 3591 }, { "epoch": 0.45693932069711235, "ewc_loss": 0.01991581916809082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9915818484150805e-05, "grad_norm": 14.288983345031738, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.864406168460846, "num_tokens": 137103935.0, "step": 3592 }, { "epoch": 0.4570665309757028, "ewc_loss": 0.01991724595427513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9917246390832588e-05, "grad_norm": 14.243035316467285, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8556151390075684, "num_tokens": 137140194.0, "step": 3593 }, { "epoch": 0.45719374125429335, "ewc_loss": 0.019924333319067955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9924333173548803e-05, "grad_norm": 14.298605918884277, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8457949161529541, "num_tokens": 137179086.0, "step": 3594 }, { "epoch": 0.4573209515328839, "ewc_loss": 0.019923817366361618, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9923816580558196e-05, "grad_norm": 14.270018577575684, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8607137799263, "num_tokens": 137216887.0, "step": 3595 }, { "epoch": 0.45744816181147435, "ewc_loss": 0.01990913227200508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.990913187910337e-05, "grad_norm": 14.311155319213867, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8707677125930786, "num_tokens": 137255029.0, "step": 3596 }, { "epoch": 0.4575753720900649, "ewc_loss": 0.01995999924838543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.995999991777353e-05, "grad_norm": 14.306053161621094, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8593978881835938, "num_tokens": 137291399.0, "step": 3597 }, { "epoch": 0.4577025823686554, "ewc_loss": 0.0198685172945261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9868517483700998e-05, "grad_norm": 14.288886070251465, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8600538372993469, "num_tokens": 137326569.0, "step": 3598 }, { "epoch": 0.4578297926472459, "ewc_loss": 0.019953057169914246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9953056835220195e-05, "grad_norm": 14.271537780761719, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8517237305641174, "num_tokens": 137370035.0, "step": 3599 }, { "epoch": 0.4579570029258364, "ewc_loss": 0.01990053802728653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9900537154171616e-05, "grad_norm": 14.271594047546387, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8516098856925964, "num_tokens": 137407969.0, "step": 3600 }, { "epoch": 0.45808421320442694, "ewc_loss": 0.019895773380994797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.989577322092373e-05, "grad_norm": 14.236714363098145, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8539677858352661, "num_tokens": 137447256.0, "step": 3601 }, { "epoch": 0.4582114234830174, "ewc_loss": 0.019914276897907257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9914277800126e-05, "grad_norm": 14.294422149658203, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8404271602630615, "num_tokens": 137479762.0, "step": 3602 }, { "epoch": 0.45833863376160794, "ewc_loss": 0.019913695752620697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9913695723516867e-05, "grad_norm": 14.156256675720215, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8485942482948303, "num_tokens": 137521823.0, "step": 3603 }, { "epoch": 0.45846584404019847, "ewc_loss": 0.01993277482688427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.993277510337066e-05, "grad_norm": 14.384467124938965, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8535683155059814, "num_tokens": 137558174.0, "step": 3604 }, { "epoch": 0.45859305431878894, "ewc_loss": 0.019991081207990646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9991080989711918e-05, "grad_norm": 14.171408653259277, "learning_rate": 1e-06, "loss": 0.5364, "mean_token_accuracy": 0.8314724564552307, "num_tokens": 137597265.0, "step": 3605 }, { "epoch": 0.45872026459737947, "ewc_loss": 0.019883740693330765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9883740606019273e-05, "grad_norm": 14.307637214660645, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8634800910949707, "num_tokens": 137635954.0, "step": 3606 }, { "epoch": 0.45884747487597, "ewc_loss": 0.02002968266606331, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0029681763844565e-05, "grad_norm": 14.305585861206055, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8577169179916382, "num_tokens": 137676137.0, "step": 3607 }, { "epoch": 0.45897468515456047, "ewc_loss": 0.019930627197027206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.993062687688507e-05, "grad_norm": 14.243377685546875, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8583652973175049, "num_tokens": 137716723.0, "step": 3608 }, { "epoch": 0.459101895433151, "ewc_loss": 0.019958913326263428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9958913981099613e-05, "grad_norm": 14.295364379882812, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8374271988868713, "num_tokens": 137752435.0, "step": 3609 }, { "epoch": 0.4592291057117415, "ewc_loss": 0.01995077170431614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9950772184529342e-05, "grad_norm": 14.284276008605957, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8472061157226562, "num_tokens": 137786684.0, "step": 3610 }, { "epoch": 0.459356315990332, "ewc_loss": 0.01995164155960083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9951641661464237e-05, "grad_norm": 14.240158081054688, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8467166423797607, "num_tokens": 137830668.0, "step": 3611 }, { "epoch": 0.4594835262689225, "ewc_loss": 0.019991550594568253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9991550288978033e-05, "grad_norm": 14.292831420898438, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8614135980606079, "num_tokens": 137865262.0, "step": 3612 }, { "epoch": 0.45961073654751305, "ewc_loss": 0.02000090852379799, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0000908989459276e-05, "grad_norm": 14.292341232299805, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8626022338867188, "num_tokens": 137898867.0, "step": 3613 }, { "epoch": 0.4597379468261035, "ewc_loss": 0.019977957010269165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9977956981165335e-05, "grad_norm": 14.282273292541504, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8563153743743896, "num_tokens": 137935108.0, "step": 3614 }, { "epoch": 0.45986515710469406, "ewc_loss": 0.019984478130936623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9984478058177046e-05, "grad_norm": 14.279679298400879, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.866675853729248, "num_tokens": 137977714.0, "step": 3615 }, { "epoch": 0.4599923673832846, "ewc_loss": 0.019965985789895058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.99659862119006e-05, "grad_norm": 14.31196117401123, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8678597807884216, "num_tokens": 138006345.0, "step": 3616 }, { "epoch": 0.46011957766187506, "ewc_loss": 0.020002389326691628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0002389646833763e-05, "grad_norm": 14.247281074523926, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8556877374649048, "num_tokens": 138045106.0, "step": 3617 }, { "epoch": 0.4602467879404656, "ewc_loss": 0.019977111369371414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9977111151092686e-05, "grad_norm": 14.31741714477539, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8503369092941284, "num_tokens": 138079993.0, "step": 3618 }, { "epoch": 0.4603739982190561, "ewc_loss": 0.020027026534080505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0027026039315388e-05, "grad_norm": 14.224242210388184, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8592157959938049, "num_tokens": 138118555.0, "step": 3619 }, { "epoch": 0.4605012084976466, "ewc_loss": 0.019993579015135765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9993578462162986e-05, "grad_norm": 14.292028427124023, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8635387420654297, "num_tokens": 138155096.0, "step": 3620 }, { "epoch": 0.4606284187762371, "ewc_loss": 0.020076368004083633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0076367945875973e-05, "grad_norm": 14.215509414672852, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8575684428215027, "num_tokens": 138194529.0, "step": 3621 }, { "epoch": 0.46075562905482764, "ewc_loss": 0.01999705471098423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 1.9997054550913163e-05, "grad_norm": 14.27354907989502, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8309034109115601, "num_tokens": 138235520.0, "step": 3622 }, { "epoch": 0.4608828393334181, "ewc_loss": 0.020124569535255432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0124569346080534e-05, "grad_norm": 14.303929328918457, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.866054892539978, "num_tokens": 138271391.0, "step": 3623 }, { "epoch": 0.46101004961200864, "ewc_loss": 0.020005542784929276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0005541955470107e-05, "grad_norm": 14.224075317382812, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8501540422439575, "num_tokens": 138313923.0, "step": 3624 }, { "epoch": 0.46113725989059917, "ewc_loss": 0.020055588334798813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0055587810929865e-05, "grad_norm": 14.272231101989746, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8478150963783264, "num_tokens": 138355992.0, "step": 3625 }, { "epoch": 0.46126447016918964, "ewc_loss": 0.02008983865380287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0089839381398633e-05, "grad_norm": 14.257242202758789, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8757697343826294, "num_tokens": 138393074.0, "step": 3626 }, { "epoch": 0.4613916804477802, "ewc_loss": 0.020036207512021065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0036208297824487e-05, "grad_norm": 14.332874298095703, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8628622889518738, "num_tokens": 138427854.0, "step": 3627 }, { "epoch": 0.4615188907263707, "ewc_loss": 0.020101312547922134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0101311747566797e-05, "grad_norm": 14.301802635192871, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8649647235870361, "num_tokens": 138467359.0, "step": 3628 }, { "epoch": 0.4616461010049612, "ewc_loss": 0.020062264055013657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0062263502040878e-05, "grad_norm": 14.285229682922363, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8439263105392456, "num_tokens": 138505355.0, "step": 3629 }, { "epoch": 0.4617733112835517, "ewc_loss": 0.020029639825224876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0029639927088283e-05, "grad_norm": 14.232933044433594, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8605780601501465, "num_tokens": 138543703.0, "step": 3630 }, { "epoch": 0.46190052156214223, "ewc_loss": 0.02003926783800125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.003926783800125e-05, "grad_norm": 14.2722806930542, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8489995002746582, "num_tokens": 138579860.0, "step": 3631 }, { "epoch": 0.46202773184073276, "ewc_loss": 0.020040282979607582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.004028283408843e-05, "grad_norm": 14.276603698730469, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8652433156967163, "num_tokens": 138615755.0, "step": 3632 }, { "epoch": 0.46215494211932323, "ewc_loss": 0.02001851424574852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0018514987896197e-05, "grad_norm": 14.299140930175781, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.851632833480835, "num_tokens": 138647607.0, "step": 3633 }, { "epoch": 0.46228215239791376, "ewc_loss": 0.02004777267575264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.004777343245223e-05, "grad_norm": 14.282479286193848, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8618119955062866, "num_tokens": 138679284.0, "step": 3634 }, { "epoch": 0.4624093626765043, "ewc_loss": 0.020031647756695747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0031648091389798e-05, "grad_norm": 14.28162956237793, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.86539226770401, "num_tokens": 138719690.0, "step": 3635 }, { "epoch": 0.46253657295509476, "ewc_loss": 0.020060285925865173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0060286260559224e-05, "grad_norm": 14.218422889709473, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8435719013214111, "num_tokens": 138758031.0, "step": 3636 }, { "epoch": 0.4626637832336853, "ewc_loss": 0.02007933333516121, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0079332898603752e-05, "grad_norm": 14.352466583251953, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8553079962730408, "num_tokens": 138797283.0, "step": 3637 }, { "epoch": 0.4627909935122758, "ewc_loss": 0.0201104786247015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0110479454160668e-05, "grad_norm": 14.242472648620605, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8513278365135193, "num_tokens": 138834672.0, "step": 3638 }, { "epoch": 0.4629182037908663, "ewc_loss": 0.02006158046424389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0061579562025145e-05, "grad_norm": 14.291829109191895, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8661902546882629, "num_tokens": 138871105.0, "step": 3639 }, { "epoch": 0.4630454140694568, "ewc_loss": 0.020080305635929108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.008030605793465e-05, "grad_norm": 14.279050827026367, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8573377132415771, "num_tokens": 138903516.0, "step": 3640 }, { "epoch": 0.46317262434804735, "ewc_loss": 0.020062796771526337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0062796465936117e-05, "grad_norm": 14.266529083251953, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8540406227111816, "num_tokens": 138942021.0, "step": 3641 }, { "epoch": 0.4632998346266378, "ewc_loss": 0.02009204775094986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0092047634534538e-05, "grad_norm": 14.262406349182129, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8642966747283936, "num_tokens": 138978210.0, "step": 3642 }, { "epoch": 0.46342704490522835, "ewc_loss": 0.020105283707380295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.010528442042414e-05, "grad_norm": 14.292218208312988, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8585401177406311, "num_tokens": 139019122.0, "step": 3643 }, { "epoch": 0.4635542551838189, "ewc_loss": 0.02008000575006008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0080005924683064e-05, "grad_norm": 14.243823051452637, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8664715886116028, "num_tokens": 139062187.0, "step": 3644 }, { "epoch": 0.46368146546240935, "ewc_loss": 0.02008851245045662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0088513338123448e-05, "grad_norm": 14.283428192138672, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8752878904342651, "num_tokens": 139102960.0, "step": 3645 }, { "epoch": 0.4638086757409999, "ewc_loss": 0.020097576081752777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0097575543331914e-05, "grad_norm": 14.281488418579102, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8568456172943115, "num_tokens": 139137302.0, "step": 3646 }, { "epoch": 0.4639358860195904, "ewc_loss": 0.020099705085158348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0099705579923466e-05, "grad_norm": 14.222975730895996, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8371747732162476, "num_tokens": 139177195.0, "step": 3647 }, { "epoch": 0.4640630962981809, "ewc_loss": 0.020056994631886482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0056993889738806e-05, "grad_norm": 14.321394920349121, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8526278734207153, "num_tokens": 139214722.0, "step": 3648 }, { "epoch": 0.4641903065767714, "ewc_loss": 0.020140742883086205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0140743799856864e-05, "grad_norm": 14.269998550415039, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8552991151809692, "num_tokens": 139248569.0, "step": 3649 }, { "epoch": 0.46431751685536193, "ewc_loss": 0.02008313685655594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0083136405446567e-05, "grad_norm": 14.27891731262207, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8630790114402771, "num_tokens": 139289141.0, "step": 3650 }, { "epoch": 0.4644447271339524, "ewc_loss": 0.020109228789806366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.010922798945103e-05, "grad_norm": 14.214344024658203, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8728564381599426, "num_tokens": 139330791.0, "step": 3651 }, { "epoch": 0.46457193741254293, "ewc_loss": 0.02007320150732994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.00732010853244e-05, "grad_norm": 14.324725151062012, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.8249642848968506, "num_tokens": 139373009.0, "step": 3652 }, { "epoch": 0.46469914769113346, "ewc_loss": 0.020114516839385033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.011451761063654e-05, "grad_norm": 14.274853706359863, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8546236753463745, "num_tokens": 139408894.0, "step": 3653 }, { "epoch": 0.46482635796972394, "ewc_loss": 0.020080458372831345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0080458853044547e-05, "grad_norm": 14.302632331848145, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8546782732009888, "num_tokens": 139444040.0, "step": 3654 }, { "epoch": 0.46495356824831446, "ewc_loss": 0.020100055262446404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0100054825888947e-05, "grad_norm": 14.173774719238281, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8621060848236084, "num_tokens": 139479675.0, "step": 3655 }, { "epoch": 0.465080778526905, "ewc_loss": 0.02006528340280056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0065283024450764e-05, "grad_norm": 14.325974464416504, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8636798858642578, "num_tokens": 139521792.0, "step": 3656 }, { "epoch": 0.46520798880549546, "ewc_loss": 0.020145593211054802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0145593225606717e-05, "grad_norm": 14.270309448242188, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8630931377410889, "num_tokens": 139563471.0, "step": 3657 }, { "epoch": 0.465335199084086, "ewc_loss": 0.02002790756523609, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0027908249176107e-05, "grad_norm": 14.271821022033691, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8562791347503662, "num_tokens": 139604702.0, "step": 3658 }, { "epoch": 0.4654624093626765, "ewc_loss": 0.020119937136769295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0119936380069703e-05, "grad_norm": 14.342552185058594, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8541100025177002, "num_tokens": 139644573.0, "step": 3659 }, { "epoch": 0.465589619641267, "ewc_loss": 0.020069638267159462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0069637685082853e-05, "grad_norm": 14.295122146606445, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.841500461101532, "num_tokens": 139680924.0, "step": 3660 }, { "epoch": 0.4657168299198575, "ewc_loss": 0.020080000162124634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0080000467714854e-05, "grad_norm": 14.239595413208008, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8491690754890442, "num_tokens": 139724369.0, "step": 3661 }, { "epoch": 0.46584404019844805, "ewc_loss": 0.020065514370799065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0065514036105014e-05, "grad_norm": 14.27669906616211, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8564870357513428, "num_tokens": 139769412.0, "step": 3662 }, { "epoch": 0.4659712504770385, "ewc_loss": 0.020110728219151497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0110728655708954e-05, "grad_norm": 14.31852912902832, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8315393328666687, "num_tokens": 139808066.0, "step": 3663 }, { "epoch": 0.46609846075562905, "ewc_loss": 0.020086131989955902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0086132280994207e-05, "grad_norm": 14.328634262084961, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8618050217628479, "num_tokens": 139851169.0, "step": 3664 }, { "epoch": 0.4662256710342196, "ewc_loss": 0.020105812698602676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.010581192735117e-05, "grad_norm": 14.314746856689453, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8659754395484924, "num_tokens": 139890550.0, "step": 3665 }, { "epoch": 0.46635288131281005, "ewc_loss": 0.020036041736602783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.003604095079936e-05, "grad_norm": 14.282783508300781, "learning_rate": 1e-06, "loss": 0.5303, "mean_token_accuracy": 0.8296523690223694, "num_tokens": 139931496.0, "step": 3666 }, { "epoch": 0.4664800915914006, "ewc_loss": 0.020081225782632828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0081226466572843e-05, "grad_norm": 14.291573524475098, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8465014696121216, "num_tokens": 139965872.0, "step": 3667 }, { "epoch": 0.4666073018699911, "ewc_loss": 0.02009965106844902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.009965101024136e-05, "grad_norm": 14.360267639160156, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8412482142448425, "num_tokens": 140000394.0, "step": 3668 }, { "epoch": 0.4667345121485816, "ewc_loss": 0.02009293995797634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.009294075833168e-05, "grad_norm": 14.242674827575684, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8391271829605103, "num_tokens": 140039967.0, "step": 3669 }, { "epoch": 0.4668617224271721, "ewc_loss": 0.020089417695999146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.008941737585701e-05, "grad_norm": 14.31244945526123, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8569588661193848, "num_tokens": 140075739.0, "step": 3670 }, { "epoch": 0.46698893270576264, "ewc_loss": 0.02014959417283535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0149595002294518e-05, "grad_norm": 14.267431259155273, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8603352308273315, "num_tokens": 140110016.0, "step": 3671 }, { "epoch": 0.4671161429843531, "ewc_loss": 0.020104385912418365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.010438583965879e-05, "grad_norm": 14.318275451660156, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8459146618843079, "num_tokens": 140147905.0, "step": 3672 }, { "epoch": 0.46724335326294364, "ewc_loss": 0.020148852840065956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.014885285461787e-05, "grad_norm": 14.303982734680176, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8504283428192139, "num_tokens": 140182284.0, "step": 3673 }, { "epoch": 0.46737056354153417, "ewc_loss": 0.02012050338089466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.012050390476361e-05, "grad_norm": 14.281160354614258, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8482636213302612, "num_tokens": 140220669.0, "step": 3674 }, { "epoch": 0.46749777382012464, "ewc_loss": 0.02013222686946392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0132227291469462e-05, "grad_norm": 14.282803535461426, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8418896794319153, "num_tokens": 140258399.0, "step": 3675 }, { "epoch": 0.46762498409871517, "ewc_loss": 0.02014542743563652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0145427697570994e-05, "grad_norm": 14.276928901672363, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8693768382072449, "num_tokens": 140296015.0, "step": 3676 }, { "epoch": 0.4677521943773057, "ewc_loss": 0.02019350416958332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.019350358750671e-05, "grad_norm": 14.37088680267334, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8451083898544312, "num_tokens": 140338618.0, "step": 3677 }, { "epoch": 0.46787940465589617, "ewc_loss": 0.02014770545065403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0147705072304234e-05, "grad_norm": 14.245834350585938, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8600391745567322, "num_tokens": 140379151.0, "step": 3678 }, { "epoch": 0.4680066149344867, "ewc_loss": 0.020170308649539948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0170307834632695e-05, "grad_norm": 14.339410781860352, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8415191173553467, "num_tokens": 140417740.0, "step": 3679 }, { "epoch": 0.4681338252130772, "ewc_loss": 0.020195210352540016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0195209799567237e-05, "grad_norm": 14.340998649597168, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8532989621162415, "num_tokens": 140452454.0, "step": 3680 }, { "epoch": 0.46826103549166775, "ewc_loss": 0.020216505974531174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0216506527503952e-05, "grad_norm": 14.375903129577637, "learning_rate": 1e-06, "loss": 0.5304, "mean_token_accuracy": 0.8322039246559143, "num_tokens": 140492981.0, "step": 3681 }, { "epoch": 0.4683882457702582, "ewc_loss": 0.020173266530036926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0173267330392264e-05, "grad_norm": 14.269607543945312, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8449692726135254, "num_tokens": 140534131.0, "step": 3682 }, { "epoch": 0.46851545604884876, "ewc_loss": 0.020168554037809372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0168554328847677e-05, "grad_norm": 14.371910095214844, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8484445810317993, "num_tokens": 140576761.0, "step": 3683 }, { "epoch": 0.4686426663274393, "ewc_loss": 0.020203016698360443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0203016902087256e-05, "grad_norm": 14.335545539855957, "learning_rate": 1e-06, "loss": 0.5385, "mean_token_accuracy": 0.835692286491394, "num_tokens": 140609538.0, "step": 3684 }, { "epoch": 0.46876987660602976, "ewc_loss": 0.02017422765493393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0174227756797336e-05, "grad_norm": 14.395760536193848, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.850818395614624, "num_tokens": 140641802.0, "step": 3685 }, { "epoch": 0.4688970868846203, "ewc_loss": 0.02021339163184166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0213392417645082e-05, "grad_norm": 14.281013488769531, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8433854579925537, "num_tokens": 140685832.0, "step": 3686 }, { "epoch": 0.4690242971632108, "ewc_loss": 0.02015094645321369, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0150946511421353e-05, "grad_norm": 14.294805526733398, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.843475341796875, "num_tokens": 140725333.0, "step": 3687 }, { "epoch": 0.4691515074418013, "ewc_loss": 0.020210592076182365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.021059117396362e-05, "grad_norm": 14.289839744567871, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8536401987075806, "num_tokens": 140760838.0, "step": 3688 }, { "epoch": 0.4692787177203918, "ewc_loss": 0.02019374631345272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0193745513097383e-05, "grad_norm": 14.310321807861328, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8446913361549377, "num_tokens": 140797367.0, "step": 3689 }, { "epoch": 0.46940592799898234, "ewc_loss": 0.020245766267180443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.024576679104939e-05, "grad_norm": 14.336067199707031, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8529213666915894, "num_tokens": 140839778.0, "step": 3690 }, { "epoch": 0.4695331382775728, "ewc_loss": 0.020230041816830635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0230041627655737e-05, "grad_norm": 14.300593376159668, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8501664400100708, "num_tokens": 140880340.0, "step": 3691 }, { "epoch": 0.46966034855616334, "ewc_loss": 0.0202109906822443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.02109913516324e-05, "grad_norm": 14.430039405822754, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8478754162788391, "num_tokens": 140915451.0, "step": 3692 }, { "epoch": 0.46978755883475387, "ewc_loss": 0.020264919847249985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.026492074946873e-05, "grad_norm": 14.290522575378418, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8521801233291626, "num_tokens": 140956967.0, "step": 3693 }, { "epoch": 0.46991476911334434, "ewc_loss": 0.020185578614473343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0185578250675462e-05, "grad_norm": 14.313911437988281, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8690657615661621, "num_tokens": 140996640.0, "step": 3694 }, { "epoch": 0.47004197939193487, "ewc_loss": 0.020274624228477478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0274625057936646e-05, "grad_norm": 14.351896286010742, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.853850245475769, "num_tokens": 141036094.0, "step": 3695 }, { "epoch": 0.4701691896705254, "ewc_loss": 0.020234165713191032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0234165276633576e-05, "grad_norm": 14.32596206665039, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8435940742492676, "num_tokens": 141069033.0, "step": 3696 }, { "epoch": 0.4702963999491159, "ewc_loss": 0.020244743674993515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0244744519004598e-05, "grad_norm": 14.4213228225708, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.870978593826294, "num_tokens": 141111075.0, "step": 3697 }, { "epoch": 0.4704236102277064, "ewc_loss": 0.020252350717782974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0252351532690227e-05, "grad_norm": 14.310587882995605, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8701044321060181, "num_tokens": 141145270.0, "step": 3698 }, { "epoch": 0.47055082050629693, "ewc_loss": 0.020203042775392532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0203042367938906e-05, "grad_norm": 14.346442222595215, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8692976236343384, "num_tokens": 141182132.0, "step": 3699 }, { "epoch": 0.4706780307848874, "ewc_loss": 0.020245525985956192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0245526684448123e-05, "grad_norm": 14.314655303955078, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8548058867454529, "num_tokens": 141217455.0, "step": 3700 }, { "epoch": 0.47080524106347793, "ewc_loss": 0.020206710323691368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0206709450576454e-05, "grad_norm": 14.319204330444336, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8539738655090332, "num_tokens": 141249680.0, "step": 3701 }, { "epoch": 0.47093245134206846, "ewc_loss": 0.020269762724637985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0269762899260968e-05, "grad_norm": 14.336207389831543, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8549970984458923, "num_tokens": 141292085.0, "step": 3702 }, { "epoch": 0.47105966162065893, "ewc_loss": 0.020213423296809196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0213423340464942e-05, "grad_norm": 14.275050163269043, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8632807731628418, "num_tokens": 141331455.0, "step": 3703 }, { "epoch": 0.47118687189924946, "ewc_loss": 0.020271874964237213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0271874745958485e-05, "grad_norm": 14.44837474822998, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8546779155731201, "num_tokens": 141365623.0, "step": 3704 }, { "epoch": 0.47131408217784, "ewc_loss": 0.020251836627721786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0251836758689024e-05, "grad_norm": 14.354618072509766, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8453556299209595, "num_tokens": 141399033.0, "step": 3705 }, { "epoch": 0.47144129245643046, "ewc_loss": 0.020210139453411102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0210140064591542e-05, "grad_norm": 14.409390449523926, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8610147833824158, "num_tokens": 141430436.0, "step": 3706 }, { "epoch": 0.471568502735021, "ewc_loss": 0.020249977707862854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.02499777515186e-05, "grad_norm": 14.287684440612793, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8721702098846436, "num_tokens": 141463169.0, "step": 3707 }, { "epoch": 0.4716957130136115, "ewc_loss": 0.020168431103229523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0168430637568235e-05, "grad_norm": 14.30820369720459, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8722963333129883, "num_tokens": 141495524.0, "step": 3708 }, { "epoch": 0.471822923292202, "ewc_loss": 0.020261142402887344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0261142708477564e-05, "grad_norm": 14.318214416503906, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.86632239818573, "num_tokens": 141533099.0, "step": 3709 }, { "epoch": 0.4719501335707925, "ewc_loss": 0.020191559568047523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.019155908783432e-05, "grad_norm": 14.286293029785156, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.86046302318573, "num_tokens": 141576389.0, "step": 3710 }, { "epoch": 0.47207734384938305, "ewc_loss": 0.020251499488949776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0251500245649368e-05, "grad_norm": 14.308670997619629, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8577407598495483, "num_tokens": 141611870.0, "step": 3711 }, { "epoch": 0.4722045541279735, "ewc_loss": 0.020208168774843216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0208168280078098e-05, "grad_norm": 14.323904991149902, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8665904402732849, "num_tokens": 141648539.0, "step": 3712 }, { "epoch": 0.47233176440656405, "ewc_loss": 0.020265869796276093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.026587026193738e-05, "grad_norm": 14.306410789489746, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8595879673957825, "num_tokens": 141688549.0, "step": 3713 }, { "epoch": 0.4724589746851546, "ewc_loss": 0.020246991887688637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0246990970917977e-05, "grad_norm": 14.307605743408203, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8642889857292175, "num_tokens": 141725294.0, "step": 3714 }, { "epoch": 0.47258618496374505, "ewc_loss": 0.020272234454751015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0272234905860387e-05, "grad_norm": 14.35937213897705, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8400535583496094, "num_tokens": 141756761.0, "step": 3715 }, { "epoch": 0.4727133952423356, "ewc_loss": 0.020237112417817116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.023711203946732e-05, "grad_norm": 14.299662590026855, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8401223421096802, "num_tokens": 141790417.0, "step": 3716 }, { "epoch": 0.4728406055209261, "ewc_loss": 0.020264601334929466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0264600607333705e-05, "grad_norm": 14.310964584350586, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8668457269668579, "num_tokens": 141830749.0, "step": 3717 }, { "epoch": 0.4729678157995166, "ewc_loss": 0.020334405824542046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.033440614468418e-05, "grad_norm": 14.366640090942383, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8592220544815063, "num_tokens": 141868433.0, "step": 3718 }, { "epoch": 0.4730950260781071, "ewc_loss": 0.020277461037039757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0277460862416774e-05, "grad_norm": 14.29432201385498, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8728817701339722, "num_tokens": 141907140.0, "step": 3719 }, { "epoch": 0.47322223635669763, "ewc_loss": 0.020274154841899872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.027415575867053e-05, "grad_norm": 14.336642265319824, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8695942759513855, "num_tokens": 141947436.0, "step": 3720 }, { "epoch": 0.4733494466352881, "ewc_loss": 0.020262958481907845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0262958059902303e-05, "grad_norm": 14.290042877197266, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8496081829071045, "num_tokens": 141986876.0, "step": 3721 }, { "epoch": 0.47347665691387864, "ewc_loss": 0.020275864750146866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0275865608709864e-05, "grad_norm": 14.393655776977539, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8607363104820251, "num_tokens": 142025806.0, "step": 3722 }, { "epoch": 0.47360386719246916, "ewc_loss": 0.020318180322647095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.031818075920455e-05, "grad_norm": 14.384013175964355, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8551576137542725, "num_tokens": 142057932.0, "step": 3723 }, { "epoch": 0.47373107747105964, "ewc_loss": 0.020251739770174026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0251740352250636e-05, "grad_norm": 14.318893432617188, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8589898347854614, "num_tokens": 142088880.0, "step": 3724 }, { "epoch": 0.47385828774965016, "ewc_loss": 0.020297562703490257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.029756251431536e-05, "grad_norm": 14.390562057495117, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8691454529762268, "num_tokens": 142124959.0, "step": 3725 }, { "epoch": 0.4739854980282407, "ewc_loss": 0.020273583009839058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0273582777008414e-05, "grad_norm": 14.340557098388672, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8417474627494812, "num_tokens": 142160667.0, "step": 3726 }, { "epoch": 0.47411270830683117, "ewc_loss": 0.020277446135878563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0277446310501546e-05, "grad_norm": 14.278096199035645, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8553952574729919, "num_tokens": 142205017.0, "step": 3727 }, { "epoch": 0.4742399185854217, "ewc_loss": 0.020238865166902542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0238865545252338e-05, "grad_norm": 14.321438789367676, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.86335289478302, "num_tokens": 142241479.0, "step": 3728 }, { "epoch": 0.4743671288640122, "ewc_loss": 0.02030717208981514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0307172235334292e-05, "grad_norm": 14.36622142791748, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8572263121604919, "num_tokens": 142280881.0, "step": 3729 }, { "epoch": 0.4744943391426027, "ewc_loss": 0.02026120014488697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0261200916138478e-05, "grad_norm": 14.287484169006348, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8642405867576599, "num_tokens": 142328248.0, "step": 3730 }, { "epoch": 0.4746215494211932, "ewc_loss": 0.02028444968163967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0284449419705197e-05, "grad_norm": 14.362872123718262, "learning_rate": 1e-06, "loss": 0.5148, "mean_token_accuracy": 0.8400866985321045, "num_tokens": 142364872.0, "step": 3731 }, { "epoch": 0.47474875969978375, "ewc_loss": 0.020306767895817757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0306768419686705e-05, "grad_norm": 14.373559951782227, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8379528522491455, "num_tokens": 142400494.0, "step": 3732 }, { "epoch": 0.4748759699783743, "ewc_loss": 0.020256757736206055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0256757125025615e-05, "grad_norm": 14.317540168762207, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8534674644470215, "num_tokens": 142433739.0, "step": 3733 }, { "epoch": 0.47500318025696475, "ewc_loss": 0.0202573724091053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0257371943444014e-05, "grad_norm": 14.344874382019043, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8654699921607971, "num_tokens": 142469777.0, "step": 3734 }, { "epoch": 0.4751303905355553, "ewc_loss": 0.02028769813477993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.028769813477993e-05, "grad_norm": 14.302763938903809, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8465890288352966, "num_tokens": 142510052.0, "step": 3735 }, { "epoch": 0.4752576008141458, "ewc_loss": 0.020264215767383575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0264214981580153e-05, "grad_norm": 14.377317428588867, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8449958562850952, "num_tokens": 142545689.0, "step": 3736 }, { "epoch": 0.4753848110927363, "ewc_loss": 0.020291095599532127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0291096006985754e-05, "grad_norm": 14.330883026123047, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8559274673461914, "num_tokens": 142585916.0, "step": 3737 }, { "epoch": 0.4755120213713268, "ewc_loss": 0.02029101364314556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0291014152462594e-05, "grad_norm": 14.447912216186523, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8631267547607422, "num_tokens": 142623019.0, "step": 3738 }, { "epoch": 0.47563923164991734, "ewc_loss": 0.02029559761285782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.029559800575953e-05, "grad_norm": 14.286481857299805, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.853214681148529, "num_tokens": 142660986.0, "step": 3739 }, { "epoch": 0.4757664419285078, "ewc_loss": 0.020261626690626144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0261626559658907e-05, "grad_norm": 14.38783073425293, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8593649864196777, "num_tokens": 142702876.0, "step": 3740 }, { "epoch": 0.47589365220709834, "ewc_loss": 0.020295048132538795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.029504867095966e-05, "grad_norm": 14.32226848602295, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8454239964485168, "num_tokens": 142744082.0, "step": 3741 }, { "epoch": 0.47602086248568887, "ewc_loss": 0.020248234272003174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0248235159670003e-05, "grad_norm": 14.457425117492676, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.837496280670166, "num_tokens": 142777511.0, "step": 3742 }, { "epoch": 0.47614807276427934, "ewc_loss": 0.02031591162085533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.031591247941833e-05, "grad_norm": 14.32445240020752, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8515428304672241, "num_tokens": 142818882.0, "step": 3743 }, { "epoch": 0.47627528304286987, "ewc_loss": 0.02024737372994423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0247372958692722e-05, "grad_norm": 14.338579177856445, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8593026399612427, "num_tokens": 142852771.0, "step": 3744 }, { "epoch": 0.4764024933214604, "ewc_loss": 0.020280461758375168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0280462194932625e-05, "grad_norm": 14.297739028930664, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8574958443641663, "num_tokens": 142888398.0, "step": 3745 }, { "epoch": 0.47652970360005087, "ewc_loss": 0.020255591720342636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0255591152817942e-05, "grad_norm": 14.32259750366211, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8721867799758911, "num_tokens": 142924789.0, "step": 3746 }, { "epoch": 0.4766569138786414, "ewc_loss": 0.020329365506768227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0329365725046955e-05, "grad_norm": 14.379101753234863, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8640749454498291, "num_tokens": 142958843.0, "step": 3747 }, { "epoch": 0.4767841241572319, "ewc_loss": 0.02029292657971382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.029292591032572e-05, "grad_norm": 14.32093334197998, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8671197891235352, "num_tokens": 142996408.0, "step": 3748 }, { "epoch": 0.4769113344358224, "ewc_loss": 0.020291903987526894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0291903638280928e-05, "grad_norm": 14.388559341430664, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8495226502418518, "num_tokens": 143040031.0, "step": 3749 }, { "epoch": 0.4770385447144129, "ewc_loss": 0.02033533714711666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0335337467258796e-05, "grad_norm": 14.393148422241211, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8337770104408264, "num_tokens": 143081648.0, "step": 3750 }, { "epoch": 0.47716575499300345, "ewc_loss": 0.020245181396603584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.024518107646145e-05, "grad_norm": 14.291498184204102, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.8351690769195557, "num_tokens": 143118913.0, "step": 3751 }, { "epoch": 0.4772929652715939, "ewc_loss": 0.02027214877307415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.027214941335842e-05, "grad_norm": 14.325650215148926, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8650447130203247, "num_tokens": 143153858.0, "step": 3752 }, { "epoch": 0.47742017555018446, "ewc_loss": 0.020354552194476128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.035455145232845e-05, "grad_norm": 14.359138488769531, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8671127557754517, "num_tokens": 143190977.0, "step": 3753 }, { "epoch": 0.477547385828775, "ewc_loss": 0.020328043028712273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0328043319750577e-05, "grad_norm": 14.37617301940918, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8491756916046143, "num_tokens": 143228009.0, "step": 3754 }, { "epoch": 0.47767459610736546, "ewc_loss": 0.020333318039774895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.033331838902086e-05, "grad_norm": 14.361014366149902, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8454091548919678, "num_tokens": 143269820.0, "step": 3755 }, { "epoch": 0.477801806385956, "ewc_loss": 0.020288536325097084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0288536688894965e-05, "grad_norm": 14.346410751342773, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8602014183998108, "num_tokens": 143303441.0, "step": 3756 }, { "epoch": 0.4779290166645465, "ewc_loss": 0.020321279764175415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0321280317148194e-05, "grad_norm": 14.395605087280273, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8477444648742676, "num_tokens": 143338748.0, "step": 3757 }, { "epoch": 0.478056226943137, "ewc_loss": 0.020327255129814148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0327255697338842e-05, "grad_norm": 14.361802101135254, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8575463891029358, "num_tokens": 143373900.0, "step": 3758 }, { "epoch": 0.4781834372217275, "ewc_loss": 0.020276958122849464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0276958821341395e-05, "grad_norm": 14.390238761901855, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8513354063034058, "num_tokens": 143411190.0, "step": 3759 }, { "epoch": 0.47831064750031804, "ewc_loss": 0.02032790519297123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0327905076555908e-05, "grad_norm": 14.311047554016113, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.852421760559082, "num_tokens": 143449470.0, "step": 3760 }, { "epoch": 0.4784378577789085, "ewc_loss": 0.020288877189159393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0288876839913428e-05, "grad_norm": 14.39240550994873, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8622731566429138, "num_tokens": 143485868.0, "step": 3761 }, { "epoch": 0.47856506805749904, "ewc_loss": 0.02036106213927269, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.036106161540374e-05, "grad_norm": 14.374603271484375, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8651925325393677, "num_tokens": 143520334.0, "step": 3762 }, { "epoch": 0.47869227833608957, "ewc_loss": 0.020299328491091728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0299328753026202e-05, "grad_norm": 14.373453140258789, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8487333655357361, "num_tokens": 143557752.0, "step": 3763 }, { "epoch": 0.47881948861468004, "ewc_loss": 0.02033277414739132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.03327745111892e-05, "grad_norm": 14.281179428100586, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8559629321098328, "num_tokens": 143593385.0, "step": 3764 }, { "epoch": 0.4789466988932706, "ewc_loss": 0.02036169543862343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0361694623716176e-05, "grad_norm": 14.420014381408691, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.8361635208129883, "num_tokens": 143631975.0, "step": 3765 }, { "epoch": 0.4790739091718611, "ewc_loss": 0.020371394231915474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0371393475215882e-05, "grad_norm": 14.326192855834961, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8382846117019653, "num_tokens": 143668382.0, "step": 3766 }, { "epoch": 0.4792011194504516, "ewc_loss": 0.020378801971673965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0378802219056524e-05, "grad_norm": 14.427406311035156, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8411206007003784, "num_tokens": 143706623.0, "step": 3767 }, { "epoch": 0.4793283297290421, "ewc_loss": 0.020418548956513405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0418548956513405e-05, "grad_norm": 14.295235633850098, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8596356511116028, "num_tokens": 143745664.0, "step": 3768 }, { "epoch": 0.47945554000763263, "ewc_loss": 0.020371438935399055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.037143894995097e-05, "grad_norm": 14.417171478271484, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8461930155754089, "num_tokens": 143780761.0, "step": 3769 }, { "epoch": 0.4795827502862231, "ewc_loss": 0.020463690161705017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0463690816541202e-05, "grad_norm": 14.416950225830078, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8597162365913391, "num_tokens": 143825298.0, "step": 3770 }, { "epoch": 0.47970996056481363, "ewc_loss": 0.02039591409265995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.039591345237568e-05, "grad_norm": 14.41706657409668, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8754074573516846, "num_tokens": 143859855.0, "step": 3771 }, { "epoch": 0.47983717084340416, "ewc_loss": 0.020404910668730736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0404910173965618e-05, "grad_norm": 14.35684585571289, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8694848418235779, "num_tokens": 143898553.0, "step": 3772 }, { "epoch": 0.47996438112199463, "ewc_loss": 0.020406238734722137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0406238036230206e-05, "grad_norm": 14.422033309936523, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8530224561691284, "num_tokens": 143935362.0, "step": 3773 }, { "epoch": 0.48009159140058516, "ewc_loss": 0.020390309393405914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0390309146023355e-05, "grad_norm": 14.33633041381836, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8569631576538086, "num_tokens": 143978692.0, "step": 3774 }, { "epoch": 0.4802188016791757, "ewc_loss": 0.02036201022565365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0362011127872393e-05, "grad_norm": 14.381400108337402, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8635637760162354, "num_tokens": 144020951.0, "step": 3775 }, { "epoch": 0.48034601195776616, "ewc_loss": 0.020407401025295258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0407400370459072e-05, "grad_norm": 14.355965614318848, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8652244210243225, "num_tokens": 144058341.0, "step": 3776 }, { "epoch": 0.4804732222363567, "ewc_loss": 0.020384641364216805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0384641175041907e-05, "grad_norm": 14.380048751831055, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8482539653778076, "num_tokens": 144098232.0, "step": 3777 }, { "epoch": 0.4806004325149472, "ewc_loss": 0.020419642329216003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0419642169144936e-05, "grad_norm": 14.377959251403809, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8509239554405212, "num_tokens": 144137537.0, "step": 3778 }, { "epoch": 0.4807276427935377, "ewc_loss": 0.020395657047629356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.039565697486978e-05, "grad_norm": 14.437872886657715, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8610794544219971, "num_tokens": 144178459.0, "step": 3779 }, { "epoch": 0.4808548530721282, "ewc_loss": 0.02039744332432747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0397443222464062e-05, "grad_norm": 14.387983322143555, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8504049181938171, "num_tokens": 144219257.0, "step": 3780 }, { "epoch": 0.48098206335071875, "ewc_loss": 0.020367451012134552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0367451725178398e-05, "grad_norm": 14.35153865814209, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8623040914535522, "num_tokens": 144262232.0, "step": 3781 }, { "epoch": 0.4811092736293093, "ewc_loss": 0.02035803720355034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0358036636025645e-05, "grad_norm": 14.38390827178955, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8341482877731323, "num_tokens": 144297144.0, "step": 3782 }, { "epoch": 0.48123648390789975, "ewc_loss": 0.020413000136613846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0412999219843186e-05, "grad_norm": 14.400018692016602, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8564355373382568, "num_tokens": 144336078.0, "step": 3783 }, { "epoch": 0.4813636941864903, "ewc_loss": 0.02036602795124054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0366027456475422e-05, "grad_norm": 14.30705451965332, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8482568264007568, "num_tokens": 144380162.0, "step": 3784 }, { "epoch": 0.4814909044650808, "ewc_loss": 0.02038843184709549, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0388431948958896e-05, "grad_norm": 14.408756256103516, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8411851525306702, "num_tokens": 144416735.0, "step": 3785 }, { "epoch": 0.4816181147436713, "ewc_loss": 0.020401544868946075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0401545043569058e-05, "grad_norm": 14.285550117492676, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8451511859893799, "num_tokens": 144452780.0, "step": 3786 }, { "epoch": 0.4817453250222618, "ewc_loss": 0.02036806382238865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.036806290561799e-05, "grad_norm": 14.29923152923584, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.861600935459137, "num_tokens": 144494155.0, "step": 3787 }, { "epoch": 0.48187253530085233, "ewc_loss": 0.020438246428966522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0438246792764403e-05, "grad_norm": 14.387862205505371, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8646715879440308, "num_tokens": 144533989.0, "step": 3788 }, { "epoch": 0.4819997455794428, "ewc_loss": 0.02045006863772869, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0450068404898047e-05, "grad_norm": 14.372284889221191, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8519501090049744, "num_tokens": 144577409.0, "step": 3789 }, { "epoch": 0.48212695585803333, "ewc_loss": 0.020401224493980408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0401224901434034e-05, "grad_norm": 14.382261276245117, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8481745719909668, "num_tokens": 144618501.0, "step": 3790 }, { "epoch": 0.48225416613662386, "ewc_loss": 0.020377663895487785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0377663531689905e-05, "grad_norm": 14.346331596374512, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8628730773925781, "num_tokens": 144659454.0, "step": 3791 }, { "epoch": 0.48238137641521434, "ewc_loss": 0.020376482978463173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0376483007567003e-05, "grad_norm": 14.398682594299316, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.860440731048584, "num_tokens": 144695580.0, "step": 3792 }, { "epoch": 0.48250858669380486, "ewc_loss": 0.020380988717079163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0380988644319586e-05, "grad_norm": 14.370455741882324, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8395427465438843, "num_tokens": 144735586.0, "step": 3793 }, { "epoch": 0.4826357969723954, "ewc_loss": 0.02034907042980194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0349070837255567e-05, "grad_norm": 14.303023338317871, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8568379282951355, "num_tokens": 144776371.0, "step": 3794 }, { "epoch": 0.48276300725098586, "ewc_loss": 0.020352205261588097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0352204955997877e-05, "grad_norm": 14.385812759399414, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8502801656723022, "num_tokens": 144816724.0, "step": 3795 }, { "epoch": 0.4828902175295764, "ewc_loss": 0.020424943417310715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.042494270426687e-05, "grad_norm": 14.384734153747559, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8626540303230286, "num_tokens": 144854591.0, "step": 3796 }, { "epoch": 0.4830174278081669, "ewc_loss": 0.020318569615483284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.031857002293691e-05, "grad_norm": 14.358550071716309, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8572494983673096, "num_tokens": 144893694.0, "step": 3797 }, { "epoch": 0.4831446380867574, "ewc_loss": 0.020355578511953354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0355579181341454e-05, "grad_norm": 14.352007865905762, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.852281391620636, "num_tokens": 144932985.0, "step": 3798 }, { "epoch": 0.4832718483653479, "ewc_loss": 0.02032577618956566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.032577685895376e-05, "grad_norm": 14.354434967041016, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8552732467651367, "num_tokens": 144967823.0, "step": 3799 }, { "epoch": 0.48339905864393845, "ewc_loss": 0.02036210335791111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0362103896331973e-05, "grad_norm": 14.291707992553711, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8751275539398193, "num_tokens": 145007750.0, "step": 3800 }, { "epoch": 0.4835262689225289, "ewc_loss": 0.02035379223525524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0353792933747172e-05, "grad_norm": 14.389386177062988, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8499423265457153, "num_tokens": 145048596.0, "step": 3801 }, { "epoch": 0.48365347920111945, "ewc_loss": 0.020384592935442924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.038459206232801e-05, "grad_norm": 14.410334587097168, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8450107574462891, "num_tokens": 145080868.0, "step": 3802 }, { "epoch": 0.48378068947971, "ewc_loss": 0.02036646381020546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0366464013932273e-05, "grad_norm": 14.397730827331543, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8653308153152466, "num_tokens": 145115865.0, "step": 3803 }, { "epoch": 0.48390789975830045, "ewc_loss": 0.020382745191454887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0382745788083412e-05, "grad_norm": 14.29642105102539, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8693623542785645, "num_tokens": 145151748.0, "step": 3804 }, { "epoch": 0.484035110036891, "ewc_loss": 0.02034788206219673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.034788121818565e-05, "grad_norm": 14.376652717590332, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8561787009239197, "num_tokens": 145187074.0, "step": 3805 }, { "epoch": 0.4841623203154815, "ewc_loss": 0.020450059324502945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.045005930995103e-05, "grad_norm": 14.34195327758789, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8734744787216187, "num_tokens": 145224855.0, "step": 3806 }, { "epoch": 0.484289530594072, "ewc_loss": 0.020388007164001465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0388006305438466e-05, "grad_norm": 14.372888565063477, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8658884763717651, "num_tokens": 145266538.0, "step": 3807 }, { "epoch": 0.4844167408726625, "ewc_loss": 0.0204313974827528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0431398297660053e-05, "grad_norm": 14.319923400878906, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8608148097991943, "num_tokens": 145303560.0, "step": 3808 }, { "epoch": 0.48454395115125304, "ewc_loss": 0.020419025793671608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0419025531737134e-05, "grad_norm": 14.411940574645996, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8572546243667603, "num_tokens": 145345985.0, "step": 3809 }, { "epoch": 0.4846711614298435, "ewc_loss": 0.020490510389208794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0490509996307082e-05, "grad_norm": 14.373231887817383, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.846222996711731, "num_tokens": 145382132.0, "step": 3810 }, { "epoch": 0.48479837170843404, "ewc_loss": 0.02042010985314846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0420109649421647e-05, "grad_norm": 14.412684440612793, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.846520721912384, "num_tokens": 145418992.0, "step": 3811 }, { "epoch": 0.48492558198702457, "ewc_loss": 0.020474765449762344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.047476482402999e-05, "grad_norm": 14.444960594177246, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8554259538650513, "num_tokens": 145455892.0, "step": 3812 }, { "epoch": 0.48505279226561504, "ewc_loss": 0.020431919023394585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0431918528629467e-05, "grad_norm": 14.34382152557373, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8546373844146729, "num_tokens": 145497217.0, "step": 3813 }, { "epoch": 0.48518000254420557, "ewc_loss": 0.020401744171977043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0401743313414045e-05, "grad_norm": 14.352100372314453, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.862776517868042, "num_tokens": 145533822.0, "step": 3814 }, { "epoch": 0.4853072128227961, "ewc_loss": 0.020475612953305244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0475612473092042e-05, "grad_norm": 14.389747619628906, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.875725507736206, "num_tokens": 145572967.0, "step": 3815 }, { "epoch": 0.48543442310138657, "ewc_loss": 0.02042161114513874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0421610315679573e-05, "grad_norm": 14.315962791442871, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8705745935440063, "num_tokens": 145609436.0, "step": 3816 }, { "epoch": 0.4855616333799771, "ewc_loss": 0.02042406052350998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.042406049440615e-05, "grad_norm": 14.436722755432129, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.868320643901825, "num_tokens": 145645083.0, "step": 3817 }, { "epoch": 0.4856888436585676, "ewc_loss": 0.020475516095757484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0475516066653654e-05, "grad_norm": 14.359356880187988, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8377761244773865, "num_tokens": 145680249.0, "step": 3818 }, { "epoch": 0.4858160539371581, "ewc_loss": 0.020439917221665382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0439916625036858e-05, "grad_norm": 14.439128875732422, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.861173152923584, "num_tokens": 145718569.0, "step": 3819 }, { "epoch": 0.4859432642157486, "ewc_loss": 0.020462581887841225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.046258123300504e-05, "grad_norm": 14.344709396362305, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8407310843467712, "num_tokens": 145751868.0, "step": 3820 }, { "epoch": 0.48607047449433916, "ewc_loss": 0.020426271483302116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0426270566531457e-05, "grad_norm": 14.452126502990723, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8611444234848022, "num_tokens": 145787881.0, "step": 3821 }, { "epoch": 0.48619768477292963, "ewc_loss": 0.0204766858369112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0476685676840134e-05, "grad_norm": 14.343822479248047, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8638328313827515, "num_tokens": 145824010.0, "step": 3822 }, { "epoch": 0.48632489505152016, "ewc_loss": 0.02047259919345379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0472598407650366e-05, "grad_norm": 14.423473358154297, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8629602193832397, "num_tokens": 145860981.0, "step": 3823 }, { "epoch": 0.4864521053301107, "ewc_loss": 0.020522475242614746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0522475097095594e-05, "grad_norm": 14.354125022888184, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8618056178092957, "num_tokens": 145900687.0, "step": 3824 }, { "epoch": 0.48657931560870116, "ewc_loss": 0.02046969346702099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0469693481572904e-05, "grad_norm": 14.386163711547852, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8476405739784241, "num_tokens": 145938888.0, "step": 3825 }, { "epoch": 0.4867065258872917, "ewc_loss": 0.020465834066271782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.046583358605858e-05, "grad_norm": 14.257073402404785, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8525981903076172, "num_tokens": 145987060.0, "step": 3826 }, { "epoch": 0.4868337361658822, "ewc_loss": 0.020478473976254463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.047847374342382e-05, "grad_norm": 14.370545387268066, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8518274426460266, "num_tokens": 146026447.0, "step": 3827 }, { "epoch": 0.4869609464444727, "ewc_loss": 0.02054723910987377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0547238818835467e-05, "grad_norm": 14.355311393737793, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8588254451751709, "num_tokens": 146067099.0, "step": 3828 }, { "epoch": 0.4870881567230632, "ewc_loss": 0.020491482689976692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.049148315563798e-05, "grad_norm": 14.402923583984375, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8433235883712769, "num_tokens": 146110767.0, "step": 3829 }, { "epoch": 0.48721536700165374, "ewc_loss": 0.02048456110060215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0484561900957488e-05, "grad_norm": 14.334037780761719, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.848670244216919, "num_tokens": 146147603.0, "step": 3830 }, { "epoch": 0.48734257728024427, "ewc_loss": 0.02045249752700329, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.045249675575178e-05, "grad_norm": 14.38288402557373, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.867207407951355, "num_tokens": 146188821.0, "step": 3831 }, { "epoch": 0.48746978755883474, "ewc_loss": 0.020522085949778557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0522085833363235e-05, "grad_norm": 14.352254867553711, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8535254001617432, "num_tokens": 146231886.0, "step": 3832 }, { "epoch": 0.48759699783742527, "ewc_loss": 0.020477106794714928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0477107682381757e-05, "grad_norm": 14.40487289428711, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8345823287963867, "num_tokens": 146271965.0, "step": 3833 }, { "epoch": 0.4877242081160158, "ewc_loss": 0.02044435404241085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0444353140192106e-05, "grad_norm": 14.323044776916504, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.8432950377464294, "num_tokens": 146316686.0, "step": 3834 }, { "epoch": 0.4878514183946063, "ewc_loss": 0.020416071638464928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0416071492945775e-05, "grad_norm": 14.402813911437988, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8471345901489258, "num_tokens": 146355098.0, "step": 3835 }, { "epoch": 0.4879786286731968, "ewc_loss": 0.02050933614373207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0509336536633782e-05, "grad_norm": 14.425544738769531, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8621822595596313, "num_tokens": 146390666.0, "step": 3836 }, { "epoch": 0.48810583895178733, "ewc_loss": 0.020415211096405983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0415211110957898e-05, "grad_norm": 14.375287055969238, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8630584478378296, "num_tokens": 146426740.0, "step": 3837 }, { "epoch": 0.4882330492303778, "ewc_loss": 0.020456761121749878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0456760466913693e-05, "grad_norm": 14.453405380249023, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8472766876220703, "num_tokens": 146462668.0, "step": 3838 }, { "epoch": 0.48836025950896833, "ewc_loss": 0.020446525886654854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.044652501353994e-05, "grad_norm": 14.395195007324219, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8496213555335999, "num_tokens": 146500160.0, "step": 3839 }, { "epoch": 0.48848746978755886, "ewc_loss": 0.020396683365106583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.039668288489338e-05, "grad_norm": 14.326384544372559, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8516881465911865, "num_tokens": 146542963.0, "step": 3840 }, { "epoch": 0.48861468006614933, "ewc_loss": 0.020440828055143356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0440827938728034e-05, "grad_norm": 14.423310279846191, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8636842966079712, "num_tokens": 146580649.0, "step": 3841 }, { "epoch": 0.48874189034473986, "ewc_loss": 0.02045348472893238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0453484466997907e-05, "grad_norm": 14.348331451416016, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8630971312522888, "num_tokens": 146620823.0, "step": 3842 }, { "epoch": 0.4888691006233304, "ewc_loss": 0.020417379215359688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0417379346326925e-05, "grad_norm": 14.347987174987793, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8675147294998169, "num_tokens": 146663700.0, "step": 3843 }, { "epoch": 0.48899631090192086, "ewc_loss": 0.02043912000954151, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0439119907678105e-05, "grad_norm": 14.379683494567871, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8511736989021301, "num_tokens": 146698211.0, "step": 3844 }, { "epoch": 0.4891235211805114, "ewc_loss": 0.020446987822651863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.044698703684844e-05, "grad_norm": 14.353598594665527, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8539040684700012, "num_tokens": 146738380.0, "step": 3845 }, { "epoch": 0.4892507314591019, "ewc_loss": 0.020430196076631546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.043019594566431e-05, "grad_norm": 14.372213363647461, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8591078519821167, "num_tokens": 146775413.0, "step": 3846 }, { "epoch": 0.4893779417376924, "ewc_loss": 0.020448962226510048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.044896245934069e-05, "grad_norm": 14.350851058959961, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8579090237617493, "num_tokens": 146814774.0, "step": 3847 }, { "epoch": 0.4895051520162829, "ewc_loss": 0.020424243062734604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0424242393346503e-05, "grad_norm": 14.356008529663086, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8660075068473816, "num_tokens": 146853957.0, "step": 3848 }, { "epoch": 0.48963236229487345, "ewc_loss": 0.020470861345529556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.047086127276998e-05, "grad_norm": 14.378281593322754, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8493346571922302, "num_tokens": 146889060.0, "step": 3849 }, { "epoch": 0.4897595725734639, "ewc_loss": 0.020452525466680527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0452525859582238e-05, "grad_norm": 14.404239654541016, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8659231066703796, "num_tokens": 146929849.0, "step": 3850 }, { "epoch": 0.48988678285205445, "ewc_loss": 0.020422261208295822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0422261513886042e-05, "grad_norm": 14.331050872802734, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8578929305076599, "num_tokens": 146968376.0, "step": 3851 }, { "epoch": 0.490013993130645, "ewc_loss": 0.020449768751859665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0449768271646462e-05, "grad_norm": 14.478190422058105, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8413868546485901, "num_tokens": 147002871.0, "step": 3852 }, { "epoch": 0.49014120340923545, "ewc_loss": 0.020474309101700783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.04743082576897e-05, "grad_norm": 14.369142532348633, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8528599739074707, "num_tokens": 147039412.0, "step": 3853 }, { "epoch": 0.490268413687826, "ewc_loss": 0.020419837906956673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.041983862000052e-05, "grad_norm": 14.403696060180664, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.860419511795044, "num_tokens": 147080307.0, "step": 3854 }, { "epoch": 0.4903956239664165, "ewc_loss": 0.020480908453464508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0480909370235167e-05, "grad_norm": 14.401996612548828, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8646950125694275, "num_tokens": 147120131.0, "step": 3855 }, { "epoch": 0.490522834245007, "ewc_loss": 0.020419839769601822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0419840438989922e-05, "grad_norm": 14.424553871154785, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8495035171508789, "num_tokens": 147155760.0, "step": 3856 }, { "epoch": 0.4906500445235975, "ewc_loss": 0.020451655611395836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0451656382647343e-05, "grad_norm": 14.309959411621094, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8609330654144287, "num_tokens": 147195406.0, "step": 3857 }, { "epoch": 0.49077725480218803, "ewc_loss": 0.02042417600750923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0424176909727976e-05, "grad_norm": 14.371957778930664, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8700346946716309, "num_tokens": 147235688.0, "step": 3858 }, { "epoch": 0.4909044650807785, "ewc_loss": 0.020484963431954384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.048496389761567e-05, "grad_norm": 14.41494369506836, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8645996451377869, "num_tokens": 147271575.0, "step": 3859 }, { "epoch": 0.49103167535936904, "ewc_loss": 0.020429225638508797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.042922642431222e-05, "grad_norm": 14.368263244628906, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.853695273399353, "num_tokens": 147315473.0, "step": 3860 }, { "epoch": 0.49115888563795956, "ewc_loss": 0.020480558276176453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0480558305280283e-05, "grad_norm": 14.418973922729492, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8608564734458923, "num_tokens": 147359058.0, "step": 3861 }, { "epoch": 0.49128609591655004, "ewc_loss": 0.020477265119552612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0477265934459865e-05, "grad_norm": 14.39683723449707, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.861674427986145, "num_tokens": 147394443.0, "step": 3862 }, { "epoch": 0.49141330619514056, "ewc_loss": 0.02044055052101612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0440549633349292e-05, "grad_norm": 14.394927024841309, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8515774607658386, "num_tokens": 147436225.0, "step": 3863 }, { "epoch": 0.4915405164737311, "ewc_loss": 0.020465148612856865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0465147827053443e-05, "grad_norm": 14.396744728088379, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8670369386672974, "num_tokens": 147473449.0, "step": 3864 }, { "epoch": 0.49166772675232157, "ewc_loss": 0.020456068217754364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.045606743195094e-05, "grad_norm": 14.466106414794922, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8547984957695007, "num_tokens": 147510330.0, "step": 3865 }, { "epoch": 0.4917949370309121, "ewc_loss": 0.02046428620815277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0464285626076162e-05, "grad_norm": 14.409266471862793, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8682196736335754, "num_tokens": 147544683.0, "step": 3866 }, { "epoch": 0.4919221473095026, "ewc_loss": 0.02044203132390976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0442032109713182e-05, "grad_norm": 14.467682838439941, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8746135830879211, "num_tokens": 147589330.0, "step": 3867 }, { "epoch": 0.4920493575880931, "ewc_loss": 0.020435627549886703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0435627448023297e-05, "grad_norm": 14.425050735473633, "learning_rate": 1e-06, "loss": 0.5532, "mean_token_accuracy": 0.8259106278419495, "num_tokens": 147626864.0, "step": 3868 }, { "epoch": 0.4921765678666836, "ewc_loss": 0.02039291523396969, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0392915757838637e-05, "grad_norm": 14.411405563354492, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8390443325042725, "num_tokens": 147672006.0, "step": 3869 }, { "epoch": 0.49230377814527415, "ewc_loss": 0.02044372633099556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0443725588847883e-05, "grad_norm": 14.447028160095215, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8471307158470154, "num_tokens": 147717388.0, "step": 3870 }, { "epoch": 0.4924309884238646, "ewc_loss": 0.020357225090265274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0357225366751663e-05, "grad_norm": 14.347660064697266, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8448234796524048, "num_tokens": 147759827.0, "step": 3871 }, { "epoch": 0.49255819870245515, "ewc_loss": 0.02035546489059925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.035546458500903e-05, "grad_norm": 14.411148071289062, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.8381437659263611, "num_tokens": 147793742.0, "step": 3872 }, { "epoch": 0.4926854089810457, "ewc_loss": 0.020430052652955055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.043005224550143e-05, "grad_norm": 14.373054504394531, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8668473362922668, "num_tokens": 147829881.0, "step": 3873 }, { "epoch": 0.49281261925963615, "ewc_loss": 0.020391210913658142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0391211364767514e-05, "grad_norm": 14.414515495300293, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8418793678283691, "num_tokens": 147875051.0, "step": 3874 }, { "epoch": 0.4929398295382267, "ewc_loss": 0.02044123224914074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.044123175437562e-05, "grad_norm": 14.41112232208252, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8496972322463989, "num_tokens": 147912786.0, "step": 3875 }, { "epoch": 0.4930670398168172, "ewc_loss": 0.02041519246995449, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0415192921063863e-05, "grad_norm": 14.401642799377441, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8361129760742188, "num_tokens": 147954981.0, "step": 3876 }, { "epoch": 0.4931942500954077, "ewc_loss": 0.020423097535967827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0423098249011673e-05, "grad_norm": 14.375249862670898, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8504437804222107, "num_tokens": 147992179.0, "step": 3877 }, { "epoch": 0.4933214603739982, "ewc_loss": 0.02046106569468975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0461066014831886e-05, "grad_norm": 14.459668159484863, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8623772859573364, "num_tokens": 148029866.0, "step": 3878 }, { "epoch": 0.49344867065258874, "ewc_loss": 0.020426848903298378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0426849005161785e-05, "grad_norm": 14.274669647216797, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8570587038993835, "num_tokens": 148070057.0, "step": 3879 }, { "epoch": 0.4935758809311792, "ewc_loss": 0.020493295043706894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.049329486908391e-05, "grad_norm": 14.477747917175293, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8595249056816101, "num_tokens": 148108461.0, "step": 3880 }, { "epoch": 0.49370309120976974, "ewc_loss": 0.020505782216787338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0505782231339253e-05, "grad_norm": 14.347543716430664, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8492491841316223, "num_tokens": 148147609.0, "step": 3881 }, { "epoch": 0.49383030148836027, "ewc_loss": 0.02045290730893612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0452907847356983e-05, "grad_norm": 14.483966827392578, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8511725664138794, "num_tokens": 148190475.0, "step": 3882 }, { "epoch": 0.4939575117669508, "ewc_loss": 0.02052069641649723, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0520696125458926e-05, "grad_norm": 14.398174285888672, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8680740594863892, "num_tokens": 148221187.0, "step": 3883 }, { "epoch": 0.49408472204554127, "ewc_loss": 0.020450390875339508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0450390366022475e-05, "grad_norm": 14.381059646606445, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8650192022323608, "num_tokens": 148260916.0, "step": 3884 }, { "epoch": 0.4942119323241318, "ewc_loss": 0.020553313195705414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0553312424453907e-05, "grad_norm": 14.440913200378418, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8621358275413513, "num_tokens": 148295591.0, "step": 3885 }, { "epoch": 0.4943391426027223, "ewc_loss": 0.020500456914305687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.050045623036567e-05, "grad_norm": 14.36424732208252, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8514038920402527, "num_tokens": 148333445.0, "step": 3886 }, { "epoch": 0.4944663528813128, "ewc_loss": 0.02052135579288006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0521356418612413e-05, "grad_norm": 14.461353302001953, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8658987283706665, "num_tokens": 148367951.0, "step": 3887 }, { "epoch": 0.4945935631599033, "ewc_loss": 0.020576320588588715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0576320821419358e-05, "grad_norm": 14.412481307983398, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8484030961990356, "num_tokens": 148408491.0, "step": 3888 }, { "epoch": 0.49472077343849385, "ewc_loss": 0.020498279482126236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0498278900049627e-05, "grad_norm": 14.44230842590332, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.855495810508728, "num_tokens": 148443883.0, "step": 3889 }, { "epoch": 0.4948479837170843, "ewc_loss": 0.020588483661413193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.058848440356087e-05, "grad_norm": 14.440017700195312, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.857305645942688, "num_tokens": 148480511.0, "step": 3890 }, { "epoch": 0.49497519399567486, "ewc_loss": 0.02050287276506424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.050287184829358e-05, "grad_norm": 14.423457145690918, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8511592745780945, "num_tokens": 148514759.0, "step": 3891 }, { "epoch": 0.4951024042742654, "ewc_loss": 0.0205671526491642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0567153114825487e-05, "grad_norm": 14.411702156066895, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8582043647766113, "num_tokens": 148553209.0, "step": 3892 }, { "epoch": 0.49522961455285586, "ewc_loss": 0.020542772486805916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0542773199849762e-05, "grad_norm": 14.501815795898438, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8466383814811707, "num_tokens": 148590387.0, "step": 3893 }, { "epoch": 0.4953568248314464, "ewc_loss": 0.02051669918000698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0516699805739336e-05, "grad_norm": 14.368454933166504, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8544548153877258, "num_tokens": 148625201.0, "step": 3894 }, { "epoch": 0.4954840351100369, "ewc_loss": 0.020529480651021004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.052948002528865e-05, "grad_norm": 14.481511116027832, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8601090908050537, "num_tokens": 148665989.0, "step": 3895 }, { "epoch": 0.4956112453886274, "ewc_loss": 0.020591113716363907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0591112843248993e-05, "grad_norm": 14.46625804901123, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8597872257232666, "num_tokens": 148704839.0, "step": 3896 }, { "epoch": 0.4957384556672179, "ewc_loss": 0.020528560504317284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0528559616650455e-05, "grad_norm": 14.44124698638916, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8665271997451782, "num_tokens": 148740523.0, "step": 3897 }, { "epoch": 0.49586566594580844, "ewc_loss": 0.020519046112895012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.051904630206991e-05, "grad_norm": 14.417800903320312, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8721926212310791, "num_tokens": 148779103.0, "step": 3898 }, { "epoch": 0.4959928762243989, "ewc_loss": 0.020565353333950043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.056535413430538e-05, "grad_norm": 14.462625503540039, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8674273490905762, "num_tokens": 148814659.0, "step": 3899 }, { "epoch": 0.49612008650298944, "ewc_loss": 0.020537244156003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0537243472062983e-05, "grad_norm": 14.39004135131836, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8762283325195312, "num_tokens": 148856864.0, "step": 3900 }, { "epoch": 0.49624729678157997, "ewc_loss": 0.020572276785969734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0572277207975276e-05, "grad_norm": 14.40853214263916, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8626528978347778, "num_tokens": 148890309.0, "step": 3901 }, { "epoch": 0.49637450706017044, "ewc_loss": 0.02058042772114277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0580428099492565e-05, "grad_norm": 14.505444526672363, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8417874574661255, "num_tokens": 148926662.0, "step": 3902 }, { "epoch": 0.496501717338761, "ewc_loss": 0.020589102059602737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0589102859958075e-05, "grad_norm": 14.462904930114746, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8444802761077881, "num_tokens": 148963032.0, "step": 3903 }, { "epoch": 0.4966289276173515, "ewc_loss": 0.020553801208734512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.055380173260346e-05, "grad_norm": 14.418957710266113, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8625879883766174, "num_tokens": 148996684.0, "step": 3904 }, { "epoch": 0.496756137895942, "ewc_loss": 0.020567001774907112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0567002138704993e-05, "grad_norm": 14.432022094726562, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8565236330032349, "num_tokens": 149032735.0, "step": 3905 }, { "epoch": 0.4968833481745325, "ewc_loss": 0.020572282373905182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0572282664943486e-05, "grad_norm": 14.45374584197998, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8585993647575378, "num_tokens": 149069415.0, "step": 3906 }, { "epoch": 0.49701055845312303, "ewc_loss": 0.02059907093644142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0599070921889506e-05, "grad_norm": 14.457014083862305, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8425413370132446, "num_tokens": 149115042.0, "step": 3907 }, { "epoch": 0.4971377687317135, "ewc_loss": 0.020562006160616875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0562005374813452e-05, "grad_norm": 14.497808456420898, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.86458420753479, "num_tokens": 149149767.0, "step": 3908 }, { "epoch": 0.49726497901030403, "ewc_loss": 0.020592277869582176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0592276996467263e-05, "grad_norm": 14.38565444946289, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8500515818595886, "num_tokens": 149193000.0, "step": 3909 }, { "epoch": 0.49739218928889456, "ewc_loss": 0.020564192906022072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0564193619065918e-05, "grad_norm": 14.510017395019531, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8538157343864441, "num_tokens": 149226963.0, "step": 3910 }, { "epoch": 0.49751939956748503, "ewc_loss": 0.02061222866177559, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0612229491234757e-05, "grad_norm": 14.422931671142578, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8516350984573364, "num_tokens": 149268139.0, "step": 3911 }, { "epoch": 0.49764660984607556, "ewc_loss": 0.020567158237099648, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0567158571793698e-05, "grad_norm": 14.492679595947266, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8643130660057068, "num_tokens": 149305882.0, "step": 3912 }, { "epoch": 0.4977738201246661, "ewc_loss": 0.020628122612833977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0628122001653537e-05, "grad_norm": 14.385276794433594, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8531805276870728, "num_tokens": 149344119.0, "step": 3913 }, { "epoch": 0.49790103040325656, "ewc_loss": 0.020569590851664543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.056959056062624e-05, "grad_norm": 14.496715545654297, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.847116231918335, "num_tokens": 149384102.0, "step": 3914 }, { "epoch": 0.4980282406818471, "ewc_loss": 0.020630644634366035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0630644939956255e-05, "grad_norm": 14.377732276916504, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8545488119125366, "num_tokens": 149425677.0, "step": 3915 }, { "epoch": 0.4981554509604376, "ewc_loss": 0.02059515193104744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.059515281871427e-05, "grad_norm": 14.527200698852539, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8537421226501465, "num_tokens": 149463275.0, "step": 3916 }, { "epoch": 0.4982826612390281, "ewc_loss": 0.020637983456254005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.063798274321016e-05, "grad_norm": 14.358753204345703, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8713633418083191, "num_tokens": 149503950.0, "step": 3917 }, { "epoch": 0.4984098715176186, "ewc_loss": 0.02056397683918476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0563977159326896e-05, "grad_norm": 14.5382719039917, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8480375409126282, "num_tokens": 149541192.0, "step": 3918 }, { "epoch": 0.49853708179620915, "ewc_loss": 0.020671844482421875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.067184504994657e-05, "grad_norm": 14.40554428100586, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8529519438743591, "num_tokens": 149581784.0, "step": 3919 }, { "epoch": 0.4986642920747996, "ewc_loss": 0.020529333502054214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.052933268714696e-05, "grad_norm": 14.431756973266602, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8621774911880493, "num_tokens": 149619312.0, "step": 3920 }, { "epoch": 0.49879150235339015, "ewc_loss": 0.02066768705844879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0667686840170063e-05, "grad_norm": 14.497824668884277, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8540714979171753, "num_tokens": 149657443.0, "step": 3921 }, { "epoch": 0.4989187126319807, "ewc_loss": 0.020578304305672646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0578303519869223e-05, "grad_norm": 14.448920249938965, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8678796887397766, "num_tokens": 149694992.0, "step": 3922 }, { "epoch": 0.49904592291057115, "ewc_loss": 0.02062711864709854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.062711791950278e-05, "grad_norm": 14.46526050567627, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8530645966529846, "num_tokens": 149729554.0, "step": 3923 }, { "epoch": 0.4991731331891617, "ewc_loss": 0.02059653401374817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.059653343167156e-05, "grad_norm": 14.38564682006836, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.861993134021759, "num_tokens": 149770051.0, "step": 3924 }, { "epoch": 0.4993003434677522, "ewc_loss": 0.020610902458429337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.061090162897017e-05, "grad_norm": 14.499614715576172, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8251354098320007, "num_tokens": 149806724.0, "step": 3925 }, { "epoch": 0.4994275537463427, "ewc_loss": 0.0206470787525177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0647079509217292e-05, "grad_norm": 14.42968463897705, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8483080863952637, "num_tokens": 149849874.0, "step": 3926 }, { "epoch": 0.4995547640249332, "ewc_loss": 0.02057150937616825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.057150959444698e-05, "grad_norm": 14.425935745239258, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8555898666381836, "num_tokens": 149888076.0, "step": 3927 }, { "epoch": 0.49968197430352373, "ewc_loss": 0.02065689116716385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.065689113806002e-05, "grad_norm": 14.50532341003418, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8576615452766418, "num_tokens": 149923750.0, "step": 3928 }, { "epoch": 0.4998091845821142, "ewc_loss": 0.020608320832252502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0608320483006537e-05, "grad_norm": 14.418750762939453, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8440389633178711, "num_tokens": 149960657.0, "step": 3929 }, { "epoch": 0.49993639486070474, "ewc_loss": 0.020588574931025505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0588575353031047e-05, "grad_norm": 14.476303100585938, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.871891975402832, "num_tokens": 149995983.0, "step": 3930 }, { "epoch": 0.5000636051392953, "ewc_loss": 0.020622389391064644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.062238854705356e-05, "grad_norm": 14.399913787841797, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8347047567367554, "num_tokens": 150037508.0, "step": 3931 }, { "epoch": 0.5001908154178858, "ewc_loss": 0.020621681585907936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.062168096017558e-05, "grad_norm": 14.405699729919434, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8523764610290527, "num_tokens": 150077806.0, "step": 3932 }, { "epoch": 0.5003180256964763, "ewc_loss": 0.020668063312768936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0668063370976597e-05, "grad_norm": 14.52166748046875, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8554514646530151, "num_tokens": 150117589.0, "step": 3933 }, { "epoch": 0.5004452359750667, "ewc_loss": 0.02063468098640442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0634681277442724e-05, "grad_norm": 14.412004470825195, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8493312001228333, "num_tokens": 150161579.0, "step": 3934 }, { "epoch": 0.5005724462536573, "ewc_loss": 0.020594146102666855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0594146917574108e-05, "grad_norm": 14.406577110290527, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8438030481338501, "num_tokens": 150208018.0, "step": 3935 }, { "epoch": 0.5006996565322478, "ewc_loss": 0.020697621628642082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0697621948784217e-05, "grad_norm": 14.518244743347168, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8580491542816162, "num_tokens": 150250164.0, "step": 3936 }, { "epoch": 0.5008268668108383, "ewc_loss": 0.02062300406396389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.062300336547196e-05, "grad_norm": 14.417829513549805, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.875094473361969, "num_tokens": 150289972.0, "step": 3937 }, { "epoch": 0.5009540770894289, "ewc_loss": 0.020607784390449524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.060778388113249e-05, "grad_norm": 14.502298355102539, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8546040654182434, "num_tokens": 150333656.0, "step": 3938 }, { "epoch": 0.5010812873680194, "ewc_loss": 0.020627308636903763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0627308913390152e-05, "grad_norm": 14.41080093383789, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8696661591529846, "num_tokens": 150374086.0, "step": 3939 }, { "epoch": 0.5012084976466098, "ewc_loss": 0.020533885806798935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0533885617624037e-05, "grad_norm": 14.376493453979492, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8595580458641052, "num_tokens": 150407472.0, "step": 3940 }, { "epoch": 0.5013357079252003, "ewc_loss": 0.020612936466932297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0612937078112736e-05, "grad_norm": 14.480838775634766, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8618658781051636, "num_tokens": 150444560.0, "step": 3941 }, { "epoch": 0.5014629182037909, "ewc_loss": 0.02058711089193821, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0587111066561192e-05, "grad_norm": 14.425806999206543, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8593564629554749, "num_tokens": 150483621.0, "step": 3942 }, { "epoch": 0.5015901284823814, "ewc_loss": 0.020535852760076523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.053585194516927e-05, "grad_norm": 14.49333381652832, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8567019104957581, "num_tokens": 150520414.0, "step": 3943 }, { "epoch": 0.5017173387609719, "ewc_loss": 0.020607924088835716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0607923943316564e-05, "grad_norm": 14.46374797821045, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8478908538818359, "num_tokens": 150556589.0, "step": 3944 }, { "epoch": 0.5018445490395624, "ewc_loss": 0.02055077627301216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0550776753225364e-05, "grad_norm": 14.522303581237793, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8550025820732117, "num_tokens": 150592393.0, "step": 3945 }, { "epoch": 0.5019717593181529, "ewc_loss": 0.02060793712735176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.060793667624239e-05, "grad_norm": 14.458714485168457, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8426578640937805, "num_tokens": 150630457.0, "step": 3946 }, { "epoch": 0.5020989695967434, "ewc_loss": 0.020519588142633438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0519588360912167e-05, "grad_norm": 14.518381118774414, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8638978004455566, "num_tokens": 150659482.0, "step": 3947 }, { "epoch": 0.5022261798753339, "ewc_loss": 0.020586948841810226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0586949176504277e-05, "grad_norm": 14.410981178283691, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8555036783218384, "num_tokens": 150700920.0, "step": 3948 }, { "epoch": 0.5023533901539244, "ewc_loss": 0.020555846393108368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0555846276693046e-05, "grad_norm": 14.505762100219727, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.867240309715271, "num_tokens": 150736393.0, "step": 3949 }, { "epoch": 0.502480600432515, "ewc_loss": 0.02065902017056942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0659019355662167e-05, "grad_norm": 14.431844711303711, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8531812429428101, "num_tokens": 150783848.0, "step": 3950 }, { "epoch": 0.5026078107111055, "ewc_loss": 0.020613189786672592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.061318991763983e-05, "grad_norm": 14.509110450744629, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8539142608642578, "num_tokens": 150824179.0, "step": 3951 }, { "epoch": 0.5027350209896959, "ewc_loss": 0.020649617537856102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.064961699943524e-05, "grad_norm": 14.439197540283203, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8565452694892883, "num_tokens": 150862775.0, "step": 3952 }, { "epoch": 0.5028622312682864, "ewc_loss": 0.020580744370818138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0580744603648782e-05, "grad_norm": 14.486800193786621, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8559530973434448, "num_tokens": 150900005.0, "step": 3953 }, { "epoch": 0.502989441546877, "ewc_loss": 0.020647022873163223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0647023120545782e-05, "grad_norm": 14.454303741455078, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8522987961769104, "num_tokens": 150936148.0, "step": 3954 }, { "epoch": 0.5031166518254675, "ewc_loss": 0.020646389573812485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0646390112233348e-05, "grad_norm": 14.480502128601074, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.853363037109375, "num_tokens": 150971661.0, "step": 3955 }, { "epoch": 0.503243862104058, "ewc_loss": 0.02065363898873329, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.065363878500648e-05, "grad_norm": 14.469514846801758, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8684238791465759, "num_tokens": 151009521.0, "step": 3956 }, { "epoch": 0.5033710723826486, "ewc_loss": 0.020653774961829185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0653775209211744e-05, "grad_norm": 14.490999221801758, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8524302244186401, "num_tokens": 151053641.0, "step": 3957 }, { "epoch": 0.5034982826612391, "ewc_loss": 0.02066856250166893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.066856177407317e-05, "grad_norm": 14.429800987243652, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8597405552864075, "num_tokens": 151094476.0, "step": 3958 }, { "epoch": 0.5036254929398295, "ewc_loss": 0.020624250173568726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0624249373213388e-05, "grad_norm": 14.516097068786621, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8504468202590942, "num_tokens": 151126381.0, "step": 3959 }, { "epoch": 0.50375270321842, "ewc_loss": 0.020683107897639275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0683108232333325e-05, "grad_norm": 14.459112167358398, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8606772422790527, "num_tokens": 151156875.0, "step": 3960 }, { "epoch": 0.5038799134970106, "ewc_loss": 0.020644312724471092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.06443128263345e-05, "grad_norm": 14.448420524597168, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8527276515960693, "num_tokens": 151194394.0, "step": 3961 }, { "epoch": 0.5040071237756011, "ewc_loss": 0.020672395825386047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0672396203735843e-05, "grad_norm": 14.506674766540527, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8421739935874939, "num_tokens": 151234472.0, "step": 3962 }, { "epoch": 0.5041343340541916, "ewc_loss": 0.020690852776169777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0690853489213623e-05, "grad_norm": 14.43242359161377, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8524988889694214, "num_tokens": 151270426.0, "step": 3963 }, { "epoch": 0.5042615443327821, "ewc_loss": 0.02063978649675846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0639787180698477e-05, "grad_norm": 14.493407249450684, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8545310497283936, "num_tokens": 151305793.0, "step": 3964 }, { "epoch": 0.5043887546113726, "ewc_loss": 0.020725097507238388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0725097783724777e-05, "grad_norm": 14.454766273498535, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8434950709342957, "num_tokens": 151345116.0, "step": 3965 }, { "epoch": 0.5045159648899631, "ewc_loss": 0.020692069083452225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0692068574135192e-05, "grad_norm": 14.477811813354492, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8604180216789246, "num_tokens": 151381901.0, "step": 3966 }, { "epoch": 0.5046431751685536, "ewc_loss": 0.020760057494044304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0760056941071525e-05, "grad_norm": 14.434842109680176, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8580797910690308, "num_tokens": 151426884.0, "step": 3967 }, { "epoch": 0.5047703854471441, "ewc_loss": 0.02072548121213913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0725481590488926e-05, "grad_norm": 14.532108306884766, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8490217924118042, "num_tokens": 151467596.0, "step": 3968 }, { "epoch": 0.5048975957257347, "ewc_loss": 0.020733019337058067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.073301948257722e-05, "grad_norm": 14.448944091796875, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8752357959747314, "num_tokens": 151502131.0, "step": 3969 }, { "epoch": 0.5050248060043252, "ewc_loss": 0.020709317177534103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0709318050649017e-05, "grad_norm": 14.486181259155273, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8528084754943848, "num_tokens": 151538932.0, "step": 3970 }, { "epoch": 0.5051520162829156, "ewc_loss": 0.02075079455971718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.075079464702867e-05, "grad_norm": 14.425821304321289, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.855050802230835, "num_tokens": 151577839.0, "step": 3971 }, { "epoch": 0.5052792265615061, "ewc_loss": 0.020698141306638718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.069814217975363e-05, "grad_norm": 14.4600248336792, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8559437990188599, "num_tokens": 151618984.0, "step": 3972 }, { "epoch": 0.5054064368400967, "ewc_loss": 0.020759398117661476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.075939846690744e-05, "grad_norm": 14.45197582244873, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8563824892044067, "num_tokens": 151658392.0, "step": 3973 }, { "epoch": 0.5055336471186872, "ewc_loss": 0.02072281949222088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0722820408991538e-05, "grad_norm": 14.536588668823242, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8658154606819153, "num_tokens": 151689781.0, "step": 3974 }, { "epoch": 0.5056608573972777, "ewc_loss": 0.020763816311955452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0763816792168655e-05, "grad_norm": 14.52163314819336, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8410879373550415, "num_tokens": 151729612.0, "step": 3975 }, { "epoch": 0.5057880676758683, "ewc_loss": 0.020665563642978668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0665564079536125e-05, "grad_norm": 14.453299522399902, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8433672189712524, "num_tokens": 151769270.0, "step": 3976 }, { "epoch": 0.5059152779544587, "ewc_loss": 0.020717523992061615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0717523511848412e-05, "grad_norm": 14.480698585510254, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8372741937637329, "num_tokens": 151803936.0, "step": 3977 }, { "epoch": 0.5060424882330492, "ewc_loss": 0.020738963037729263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0738963939948007e-05, "grad_norm": 14.545903205871582, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8436394929885864, "num_tokens": 151840429.0, "step": 3978 }, { "epoch": 0.5061696985116397, "ewc_loss": 0.020666787400841713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.066678825940471e-05, "grad_norm": 14.481139183044434, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8575806021690369, "num_tokens": 151883296.0, "step": 3979 }, { "epoch": 0.5062969087902303, "ewc_loss": 0.020682716742157936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0682717149611562e-05, "grad_norm": 14.603227615356445, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8706242442131042, "num_tokens": 151917658.0, "step": 3980 }, { "epoch": 0.5064241190688208, "ewc_loss": 0.020714720711112022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0714720449177548e-05, "grad_norm": 14.444084167480469, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8557829856872559, "num_tokens": 151958822.0, "step": 3981 }, { "epoch": 0.5065513293474113, "ewc_loss": 0.020674346014857292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0674346160376444e-05, "grad_norm": 14.638374328613281, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8508843183517456, "num_tokens": 151994964.0, "step": 3982 }, { "epoch": 0.5066785396260017, "ewc_loss": 0.020730454474687576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.073045470751822e-05, "grad_norm": 14.655854225158691, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8512784242630005, "num_tokens": 152037013.0, "step": 3983 }, { "epoch": 0.5068057499045923, "ewc_loss": 0.02060021087527275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.060021142824553e-05, "grad_norm": 14.377573013305664, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8515482544898987, "num_tokens": 152074821.0, "step": 3984 }, { "epoch": 0.5069329601831828, "ewc_loss": 0.020632758736610413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0632758605643176e-05, "grad_norm": 14.548456192016602, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.848846435546875, "num_tokens": 152116667.0, "step": 3985 }, { "epoch": 0.5070601704617733, "ewc_loss": 0.020701617002487183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0701616449514404e-05, "grad_norm": 14.51993465423584, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8597506284713745, "num_tokens": 152152714.0, "step": 3986 }, { "epoch": 0.5071873807403638, "ewc_loss": 0.020619910210371017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.061991108348593e-05, "grad_norm": 14.365015029907227, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8418079614639282, "num_tokens": 152190230.0, "step": 3987 }, { "epoch": 0.5073145910189544, "ewc_loss": 0.020674504339694977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0674504412454553e-05, "grad_norm": 14.51907730102539, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8549139499664307, "num_tokens": 152228887.0, "step": 3988 }, { "epoch": 0.5074418012975448, "ewc_loss": 0.020718885585665703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0718885934911668e-05, "grad_norm": 14.471884727478027, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8649428486824036, "num_tokens": 152271863.0, "step": 3989 }, { "epoch": 0.5075690115761353, "ewc_loss": 0.020606553182005882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.060655242530629e-05, "grad_norm": 14.403454780578613, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8508560657501221, "num_tokens": 152310765.0, "step": 3990 }, { "epoch": 0.5076962218547258, "ewc_loss": 0.020737072452902794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.073707219096832e-05, "grad_norm": 14.465336799621582, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.8774955868721008, "num_tokens": 152342545.0, "step": 3991 }, { "epoch": 0.5078234321333164, "ewc_loss": 0.020693732425570488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0693732949439436e-05, "grad_norm": 14.452914237976074, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8395745158195496, "num_tokens": 152386223.0, "step": 3992 }, { "epoch": 0.5079506424119069, "ewc_loss": 0.020671222358942032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0671222955570556e-05, "grad_norm": 14.398967742919922, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8629541397094727, "num_tokens": 152421811.0, "step": 3993 }, { "epoch": 0.5080778526904974, "ewc_loss": 0.020714744925498962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0714744096039794e-05, "grad_norm": 14.528172492980957, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8436843156814575, "num_tokens": 152454581.0, "step": 3994 }, { "epoch": 0.5082050629690879, "ewc_loss": 0.020796731114387512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0796731405425817e-05, "grad_norm": 14.468573570251465, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.845424234867096, "num_tokens": 152490591.0, "step": 3995 }, { "epoch": 0.5083322732476784, "ewc_loss": 0.02071399986743927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0714000129373744e-05, "grad_norm": 14.476150512695312, "learning_rate": 1e-06, "loss": 0.384, "mean_token_accuracy": 0.8785628080368042, "num_tokens": 152532290.0, "step": 3996 }, { "epoch": 0.5084594835262689, "ewc_loss": 0.020770978182554245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0770978153450415e-05, "grad_norm": 14.51036548614502, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8608390092849731, "num_tokens": 152571984.0, "step": 3997 }, { "epoch": 0.5085866938048594, "ewc_loss": 0.02071724645793438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0717247025459073e-05, "grad_norm": 14.478271484375, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.856220543384552, "num_tokens": 152611082.0, "step": 3998 }, { "epoch": 0.50871390408345, "ewc_loss": 0.02076355181634426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0763551219715737e-05, "grad_norm": 14.465941429138184, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8639581203460693, "num_tokens": 152655303.0, "step": 3999 }, { "epoch": 0.5088411143620405, "ewc_loss": 0.020732946693897247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0732946723001078e-05, "grad_norm": 14.460580825805664, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8547309637069702, "num_tokens": 152688432.0, "step": 4000 }, { "epoch": 0.5089683246406309, "ewc_loss": 0.02073627896606922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0736279111588374e-05, "grad_norm": 14.432640075683594, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8519871234893799, "num_tokens": 152725637.0, "step": 4001 }, { "epoch": 0.5090955349192214, "ewc_loss": 0.020754873752593994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.075487282127142e-05, "grad_norm": 14.502191543579102, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8539300560951233, "num_tokens": 152763108.0, "step": 4002 }, { "epoch": 0.509222745197812, "ewc_loss": 0.020821494981646538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.082149512716569e-05, "grad_norm": 14.50796127319336, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8624354600906372, "num_tokens": 152800659.0, "step": 4003 }, { "epoch": 0.5093499554764025, "ewc_loss": 0.020770108327269554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.077010867651552e-05, "grad_norm": 14.464503288269043, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8467926383018494, "num_tokens": 152842797.0, "step": 4004 }, { "epoch": 0.509477165754993, "ewc_loss": 0.02079080045223236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0790799680980854e-05, "grad_norm": 14.523626327514648, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8465639352798462, "num_tokens": 152888257.0, "step": 4005 }, { "epoch": 0.5096043760335836, "ewc_loss": 0.020779741927981377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.07797420443967e-05, "grad_norm": 14.439098358154297, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8550523519515991, "num_tokens": 152929089.0, "step": 4006 }, { "epoch": 0.5097315863121741, "ewc_loss": 0.020740432664752007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.074043186439667e-05, "grad_norm": 14.513808250427246, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8495198488235474, "num_tokens": 152972710.0, "step": 4007 }, { "epoch": 0.5098587965907645, "ewc_loss": 0.020806439220905304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.080643935187254e-05, "grad_norm": 14.489847183227539, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8560324907302856, "num_tokens": 153008867.0, "step": 4008 }, { "epoch": 0.509986006869355, "ewc_loss": 0.02072891592979431, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.072891584248282e-05, "grad_norm": 14.518277168273926, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8504108190536499, "num_tokens": 153051497.0, "step": 4009 }, { "epoch": 0.5101132171479456, "ewc_loss": 0.020783916115760803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0783916625077836e-05, "grad_norm": 14.475242614746094, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8716630935668945, "num_tokens": 153089295.0, "step": 4010 }, { "epoch": 0.5102404274265361, "ewc_loss": 0.02071269042789936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.071269045700319e-05, "grad_norm": 14.45253849029541, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8594921827316284, "num_tokens": 153123957.0, "step": 4011 }, { "epoch": 0.5103676377051266, "ewc_loss": 0.020758168771862984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0758168830070645e-05, "grad_norm": 14.435794830322266, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8526947498321533, "num_tokens": 153165248.0, "step": 4012 }, { "epoch": 0.5104948479837171, "ewc_loss": 0.02077457867562771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0774577933480032e-05, "grad_norm": 14.469461441040039, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8704414367675781, "num_tokens": 153206163.0, "step": 4013 }, { "epoch": 0.5106220582623076, "ewc_loss": 0.020788082852959633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0788082110811956e-05, "grad_norm": 14.500832557678223, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8467915654182434, "num_tokens": 153248666.0, "step": 4014 }, { "epoch": 0.5107492685408981, "ewc_loss": 0.020754724740982056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0754725483129732e-05, "grad_norm": 14.455657958984375, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.879179835319519, "num_tokens": 153289009.0, "step": 4015 }, { "epoch": 0.5108764788194886, "ewc_loss": 0.020743180066347122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0743180357385427e-05, "grad_norm": 14.499794006347656, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8614362478256226, "num_tokens": 153321354.0, "step": 4016 }, { "epoch": 0.5110036890980791, "ewc_loss": 0.020771164447069168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0771163690369576e-05, "grad_norm": 14.521986961364746, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.855380654335022, "num_tokens": 153361061.0, "step": 4017 }, { "epoch": 0.5111308993766697, "ewc_loss": 0.02076030522584915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0760304323630407e-05, "grad_norm": 14.447352409362793, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8496685028076172, "num_tokens": 153397002.0, "step": 4018 }, { "epoch": 0.5112581096552602, "ewc_loss": 0.020757298916578293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.075729935313575e-05, "grad_norm": 14.564908981323242, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8424631953239441, "num_tokens": 153435264.0, "step": 4019 }, { "epoch": 0.5113853199338506, "ewc_loss": 0.02075495757162571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0754958313773386e-05, "grad_norm": 14.412578582763672, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.853786826133728, "num_tokens": 153474751.0, "step": 4020 }, { "epoch": 0.5115125302124411, "ewc_loss": 0.02070191502571106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0701914763776585e-05, "grad_norm": 14.535289764404297, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.864703357219696, "num_tokens": 153510084.0, "step": 4021 }, { "epoch": 0.5116397404910317, "ewc_loss": 0.020776251330971718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0776251403731294e-05, "grad_norm": 14.444290161132812, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8427523374557495, "num_tokens": 153550387.0, "step": 4022 }, { "epoch": 0.5117669507696222, "ewc_loss": 0.020756669342517853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0756669982802123e-05, "grad_norm": 14.534503936767578, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8776187300682068, "num_tokens": 153592441.0, "step": 4023 }, { "epoch": 0.5118941610482127, "ewc_loss": 0.020809005945920944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0809005945920944e-05, "grad_norm": 14.495382308959961, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8623213171958923, "num_tokens": 153629202.0, "step": 4024 }, { "epoch": 0.5120213713268033, "ewc_loss": 0.020744234323501587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.074423355225008e-05, "grad_norm": 14.554831504821777, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8513191342353821, "num_tokens": 153669243.0, "step": 4025 }, { "epoch": 0.5121485816053937, "ewc_loss": 0.02075737901031971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0757379388669506e-05, "grad_norm": 14.485616683959961, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8506754040718079, "num_tokens": 153709206.0, "step": 4026 }, { "epoch": 0.5122757918839842, "ewc_loss": 0.02076771855354309, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.076771852443926e-05, "grad_norm": 14.546564102172852, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8628780245780945, "num_tokens": 153741251.0, "step": 4027 }, { "epoch": 0.5124030021625747, "ewc_loss": 0.02078777551651001, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0787774701602757e-05, "grad_norm": 14.553980827331543, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8421761989593506, "num_tokens": 153775559.0, "step": 4028 }, { "epoch": 0.5125302124411653, "ewc_loss": 0.020740101113915443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0740100808325224e-05, "grad_norm": 14.434591293334961, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8598877191543579, "num_tokens": 153819158.0, "step": 4029 }, { "epoch": 0.5126574227197558, "ewc_loss": 0.020756788551807404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0756788217113353e-05, "grad_norm": 14.502263069152832, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8497389554977417, "num_tokens": 153856231.0, "step": 4030 }, { "epoch": 0.5127846329983463, "ewc_loss": 0.020791510120034218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.079151090583764e-05, "grad_norm": 14.496417999267578, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8668265342712402, "num_tokens": 153897125.0, "step": 4031 }, { "epoch": 0.5129118432769367, "ewc_loss": 0.020760703831911087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0760704501299188e-05, "grad_norm": 14.453843116760254, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8621742725372314, "num_tokens": 153933552.0, "step": 4032 }, { "epoch": 0.5130390535555273, "ewc_loss": 0.020775943994522095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0775943994522095e-05, "grad_norm": 14.528229713439941, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.841898500919342, "num_tokens": 153974337.0, "step": 4033 }, { "epoch": 0.5131662638341178, "ewc_loss": 0.02080545574426651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0805455278605223e-05, "grad_norm": 14.465973854064941, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8502786159515381, "num_tokens": 154018911.0, "step": 4034 }, { "epoch": 0.5132934741127083, "ewc_loss": 0.020753297954797745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.075329757644795e-05, "grad_norm": 14.524593353271484, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8668215870857239, "num_tokens": 154056441.0, "step": 4035 }, { "epoch": 0.5134206843912988, "ewc_loss": 0.020797928795218468, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.079792830045335e-05, "grad_norm": 14.467917442321777, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8465200662612915, "num_tokens": 154096326.0, "step": 4036 }, { "epoch": 0.5135478946698894, "ewc_loss": 0.020771851763129234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0771851268364117e-05, "grad_norm": 14.560791015625, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8601487874984741, "num_tokens": 154137223.0, "step": 4037 }, { "epoch": 0.5136751049484798, "ewc_loss": 0.020815027877688408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0815028619836085e-05, "grad_norm": 14.502023696899414, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8536885976791382, "num_tokens": 154168312.0, "step": 4038 }, { "epoch": 0.5138023152270703, "ewc_loss": 0.02074170671403408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0741706975968555e-05, "grad_norm": 14.477827072143555, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8695321083068848, "num_tokens": 154203125.0, "step": 4039 }, { "epoch": 0.5139295255056608, "ewc_loss": 0.020817816257476807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.081781713059172e-05, "grad_norm": 14.533353805541992, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8504937291145325, "num_tokens": 154248437.0, "step": 4040 }, { "epoch": 0.5140567357842514, "ewc_loss": 0.02080513909459114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0805138774449006e-05, "grad_norm": 14.509480476379395, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8597538471221924, "num_tokens": 154286303.0, "step": 4041 }, { "epoch": 0.5141839460628419, "ewc_loss": 0.020798543468117714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.079854311887175e-05, "grad_norm": 14.520654678344727, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8465156555175781, "num_tokens": 154328638.0, "step": 4042 }, { "epoch": 0.5143111563414324, "ewc_loss": 0.020821476355195045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0821476937271655e-05, "grad_norm": 14.56871509552002, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8549689650535583, "num_tokens": 154362953.0, "step": 4043 }, { "epoch": 0.5144383666200228, "ewc_loss": 0.02081301435828209, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.081301499856636e-05, "grad_norm": 14.537904739379883, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8437177538871765, "num_tokens": 154401289.0, "step": 4044 }, { "epoch": 0.5145655768986134, "ewc_loss": 0.02081763744354248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.081763705064077e-05, "grad_norm": 14.593631744384766, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.860408365726471, "num_tokens": 154438779.0, "step": 4045 }, { "epoch": 0.5146927871772039, "ewc_loss": 0.020814182236790657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0814182789763436e-05, "grad_norm": 14.555351257324219, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8500325679779053, "num_tokens": 154467032.0, "step": 4046 }, { "epoch": 0.5148199974557944, "ewc_loss": 0.020825281739234924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0825282263103873e-05, "grad_norm": 14.556766510009766, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8658660054206848, "num_tokens": 154505222.0, "step": 4047 }, { "epoch": 0.514947207734385, "ewc_loss": 0.020828239619731903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0828239939874038e-05, "grad_norm": 14.551204681396484, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8434957265853882, "num_tokens": 154544953.0, "step": 4048 }, { "epoch": 0.5150744180129755, "ewc_loss": 0.02084115706384182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0841156583628617e-05, "grad_norm": 14.528141021728516, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8683412671089172, "num_tokens": 154586193.0, "step": 4049 }, { "epoch": 0.5152016282915659, "ewc_loss": 0.020856481045484543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.085648156935349e-05, "grad_norm": 14.506924629211426, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8603277206420898, "num_tokens": 154629531.0, "step": 4050 }, { "epoch": 0.5153288385701564, "ewc_loss": 0.020811865106225014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0811865397263318e-05, "grad_norm": 14.503959655761719, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8707399368286133, "num_tokens": 154659482.0, "step": 4051 }, { "epoch": 0.515456048848747, "ewc_loss": 0.020851001143455505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0851000954280607e-05, "grad_norm": 14.552057266235352, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8228158354759216, "num_tokens": 154702645.0, "step": 4052 }, { "epoch": 0.5155832591273375, "ewc_loss": 0.02086971141397953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.086971107928548e-05, "grad_norm": 14.521347045898438, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.864869236946106, "num_tokens": 154743329.0, "step": 4053 }, { "epoch": 0.515710469405928, "ewc_loss": 0.02088043838739395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.088043765979819e-05, "grad_norm": 14.523772239685059, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8528238534927368, "num_tokens": 154783460.0, "step": 4054 }, { "epoch": 0.5158376796845185, "ewc_loss": 0.02084963768720627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.084963853121735e-05, "grad_norm": 14.495980262756348, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8507484793663025, "num_tokens": 154827209.0, "step": 4055 }, { "epoch": 0.5159648899631091, "ewc_loss": 0.020872173830866814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.087217399093788e-05, "grad_norm": 14.523425102233887, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.83179771900177, "num_tokens": 154866577.0, "step": 4056 }, { "epoch": 0.5160921002416995, "ewc_loss": 0.0208902470767498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0890247469651513e-05, "grad_norm": 14.550985336303711, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.858426570892334, "num_tokens": 154912191.0, "step": 4057 }, { "epoch": 0.51621931052029, "ewc_loss": 0.020842378959059715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.08423789445078e-05, "grad_norm": 14.49630069732666, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8428794145584106, "num_tokens": 154953412.0, "step": 4058 }, { "epoch": 0.5163465207988805, "ewc_loss": 0.020857084542512894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0857083654846065e-05, "grad_norm": 14.455527305603027, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8671815395355225, "num_tokens": 154989913.0, "step": 4059 }, { "epoch": 0.5164737310774711, "ewc_loss": 0.02086290530860424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.086290442093741e-05, "grad_norm": 14.543940544128418, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8435614109039307, "num_tokens": 155038094.0, "step": 4060 }, { "epoch": 0.5166009413560616, "ewc_loss": 0.02085135690867901, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0851357476203702e-05, "grad_norm": 14.508186340332031, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8457889556884766, "num_tokens": 155075900.0, "step": 4061 }, { "epoch": 0.5167281516346521, "ewc_loss": 0.020829029381275177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0829029381275177e-05, "grad_norm": 14.51968002319336, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8541533350944519, "num_tokens": 155117038.0, "step": 4062 }, { "epoch": 0.5168553619132426, "ewc_loss": 0.020837493240833282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0837493138969876e-05, "grad_norm": 14.594535827636719, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.869522213935852, "num_tokens": 155151442.0, "step": 4063 }, { "epoch": 0.5169825721918331, "ewc_loss": 0.020782895386219025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0782896172022447e-05, "grad_norm": 14.42869758605957, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8645275831222534, "num_tokens": 155192288.0, "step": 4064 }, { "epoch": 0.5171097824704236, "ewc_loss": 0.020779862999916077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0779862097697333e-05, "grad_norm": 14.513768196105957, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8586592078208923, "num_tokens": 155238931.0, "step": 4065 }, { "epoch": 0.5172369927490141, "ewc_loss": 0.02079826220870018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0798262994503602e-05, "grad_norm": 14.42973518371582, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.869234561920166, "num_tokens": 155276644.0, "step": 4066 }, { "epoch": 0.5173642030276047, "ewc_loss": 0.020801452919840813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0801453501917422e-05, "grad_norm": 14.581960678100586, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8685277700424194, "num_tokens": 155307638.0, "step": 4067 }, { "epoch": 0.5174914133061952, "ewc_loss": 0.020821740850806236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.082174069073517e-05, "grad_norm": 14.41759204864502, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8490455746650696, "num_tokens": 155343255.0, "step": 4068 }, { "epoch": 0.5176186235847856, "ewc_loss": 0.0208023339509964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0802333892788738e-05, "grad_norm": 14.628836631774902, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8578413724899292, "num_tokens": 155382283.0, "step": 4069 }, { "epoch": 0.5177458338633761, "ewc_loss": 0.020890509709715843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0890509404125623e-05, "grad_norm": 14.497063636779785, "learning_rate": 1e-06, "loss": 0.5201, "mean_token_accuracy": 0.8337175846099854, "num_tokens": 155427096.0, "step": 4070 }, { "epoch": 0.5178730441419667, "ewc_loss": 0.020805057138204575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0805056919925846e-05, "grad_norm": 14.567893981933594, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8429425954818726, "num_tokens": 155467472.0, "step": 4071 }, { "epoch": 0.5180002544205572, "ewc_loss": 0.02085699886083603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0856998162344098e-05, "grad_norm": 14.572449684143066, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8498339056968689, "num_tokens": 155508350.0, "step": 4072 }, { "epoch": 0.5181274646991477, "ewc_loss": 0.020815972238779068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0815972675336525e-05, "grad_norm": 14.535270690917969, "learning_rate": 1e-06, "loss": 0.3763, "mean_token_accuracy": 0.8796772360801697, "num_tokens": 155547215.0, "step": 4073 }, { "epoch": 0.5182546749777382, "ewc_loss": 0.020808260887861252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.080826016026549e-05, "grad_norm": 14.582107543945312, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8610740303993225, "num_tokens": 155586658.0, "step": 4074 }, { "epoch": 0.5183818852563287, "ewc_loss": 0.020787551999092102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0787552784895524e-05, "grad_norm": 14.509642601013184, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8589392900466919, "num_tokens": 155620944.0, "step": 4075 }, { "epoch": 0.5185090955349192, "ewc_loss": 0.020787013694643974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0787014364032075e-05, "grad_norm": 14.59524917602539, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8534195423126221, "num_tokens": 155662699.0, "step": 4076 }, { "epoch": 0.5186363058135097, "ewc_loss": 0.02082168497145176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.082168430206366e-05, "grad_norm": 14.501399993896484, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8440617918968201, "num_tokens": 155707783.0, "step": 4077 }, { "epoch": 0.5187635160921003, "ewc_loss": 0.02078060433268547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.078060424537398e-05, "grad_norm": 14.562811851501465, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8615494966506958, "num_tokens": 155745227.0, "step": 4078 }, { "epoch": 0.5188907263706908, "ewc_loss": 0.02082599140703678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0825991668971255e-05, "grad_norm": 14.456742286682129, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.856974720954895, "num_tokens": 155786466.0, "step": 4079 }, { "epoch": 0.5190179366492813, "ewc_loss": 0.020786726847290993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0786726963706315e-05, "grad_norm": 14.530863761901855, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8569889664649963, "num_tokens": 155833145.0, "step": 4080 }, { "epoch": 0.5191451469278717, "ewc_loss": 0.020872220396995544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.087221946567297e-05, "grad_norm": 14.551018714904785, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8466097116470337, "num_tokens": 155869533.0, "step": 4081 }, { "epoch": 0.5192723572064623, "ewc_loss": 0.020834844559431076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0834844690398313e-05, "grad_norm": 14.563234329223633, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8550465106964111, "num_tokens": 155909329.0, "step": 4082 }, { "epoch": 0.5193995674850528, "ewc_loss": 0.020843833684921265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0843834136030637e-05, "grad_norm": 14.585200309753418, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8651265501976013, "num_tokens": 155947848.0, "step": 4083 }, { "epoch": 0.5195267777636433, "ewc_loss": 0.020808327943086624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.080832746287342e-05, "grad_norm": 14.537383079528809, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8603875637054443, "num_tokens": 155982693.0, "step": 4084 }, { "epoch": 0.5196539880422338, "ewc_loss": 0.02082482911646366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.082482933474239e-05, "grad_norm": 14.565042495727539, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8547381162643433, "num_tokens": 156023414.0, "step": 4085 }, { "epoch": 0.5197811983208244, "ewc_loss": 0.02084350772202015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0843508536927402e-05, "grad_norm": 14.520169258117676, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8558636903762817, "num_tokens": 156063488.0, "step": 4086 }, { "epoch": 0.5199084085994148, "ewc_loss": 0.02083454839885235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0834548195125535e-05, "grad_norm": 14.629035949707031, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8563467860221863, "num_tokens": 156103081.0, "step": 4087 }, { "epoch": 0.5200356188780053, "ewc_loss": 0.020873283967375755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0873283574474044e-05, "grad_norm": 14.511221885681152, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8480932712554932, "num_tokens": 156139453.0, "step": 4088 }, { "epoch": 0.5201628291565958, "ewc_loss": 0.020816097036004066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0816096366615966e-05, "grad_norm": 14.587712287902832, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8511682152748108, "num_tokens": 156176564.0, "step": 4089 }, { "epoch": 0.5202900394351864, "ewc_loss": 0.020855482667684555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0855482944170944e-05, "grad_norm": 14.486377716064453, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8585264086723328, "num_tokens": 156221278.0, "step": 4090 }, { "epoch": 0.5204172497137769, "ewc_loss": 0.020824629813432693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0824629245908e-05, "grad_norm": 14.611021041870117, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8610154986381531, "num_tokens": 156259124.0, "step": 4091 }, { "epoch": 0.5205444599923674, "ewc_loss": 0.02083752490580082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0837524061789736e-05, "grad_norm": 14.497773170471191, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8457644581794739, "num_tokens": 156297832.0, "step": 4092 }, { "epoch": 0.5206716702709578, "ewc_loss": 0.020796069875359535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0796069293282926e-05, "grad_norm": 14.590167045593262, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8621522188186646, "num_tokens": 156329745.0, "step": 4093 }, { "epoch": 0.5207988805495484, "ewc_loss": 0.02088279090821743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.088279143208638e-05, "grad_norm": 14.563950538635254, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8567001223564148, "num_tokens": 156360154.0, "step": 4094 }, { "epoch": 0.5209260908281389, "ewc_loss": 0.02083422802388668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.083422805299051e-05, "grad_norm": 14.498255729675293, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8584367632865906, "num_tokens": 156399360.0, "step": 4095 }, { "epoch": 0.5210533011067294, "ewc_loss": 0.02085869386792183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0858693460468203e-05, "grad_norm": 14.56783676147461, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8550833463668823, "num_tokens": 156438782.0, "step": 4096 }, { "epoch": 0.52118051138532, "ewc_loss": 0.020863106474280357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0863106328761205e-05, "grad_norm": 14.515637397766113, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8611935973167419, "num_tokens": 156470261.0, "step": 4097 }, { "epoch": 0.5213077216639105, "ewc_loss": 0.020865680649876595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0865680198767222e-05, "grad_norm": 14.615999221801758, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.8381120562553406, "num_tokens": 156510493.0, "step": 4098 }, { "epoch": 0.5214349319425009, "ewc_loss": 0.020936617627739906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0936617147526704e-05, "grad_norm": 14.495906829833984, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8582960367202759, "num_tokens": 156545040.0, "step": 4099 }, { "epoch": 0.5215621422210914, "ewc_loss": 0.020880388095974922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0880388547084294e-05, "grad_norm": 14.546658515930176, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8538410663604736, "num_tokens": 156583474.0, "step": 4100 }, { "epoch": 0.521689352499682, "ewc_loss": 0.020937640219926834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0937639419571497e-05, "grad_norm": 14.496758460998535, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8560371398925781, "num_tokens": 156627137.0, "step": 4101 }, { "epoch": 0.5218165627782725, "ewc_loss": 0.020922554656863213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.092255454044789e-05, "grad_norm": 14.58032512664795, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8454996943473816, "num_tokens": 156664726.0, "step": 4102 }, { "epoch": 0.521943773056863, "ewc_loss": 0.020990021526813507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0990020857425407e-05, "grad_norm": 14.559759140014648, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8735616207122803, "num_tokens": 156707014.0, "step": 4103 }, { "epoch": 0.5220709833354535, "ewc_loss": 0.020904863253235817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0904863049509004e-05, "grad_norm": 14.52417278289795, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8520243167877197, "num_tokens": 156747784.0, "step": 4104 }, { "epoch": 0.522198193614044, "ewc_loss": 0.020935937762260437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.093593866447918e-05, "grad_norm": 14.569172859191895, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8545387387275696, "num_tokens": 156784760.0, "step": 4105 }, { "epoch": 0.5223254038926345, "ewc_loss": 0.020939992740750313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0939993191859685e-05, "grad_norm": 14.541739463806152, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8526809215545654, "num_tokens": 156822899.0, "step": 4106 }, { "epoch": 0.522452614171225, "ewc_loss": 0.020926639437675476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0926639990648255e-05, "grad_norm": 14.557823181152344, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.861883819103241, "num_tokens": 156861308.0, "step": 4107 }, { "epoch": 0.5225798244498155, "ewc_loss": 0.02093636989593506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.093636976496782e-05, "grad_norm": 14.552472114562988, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8626500368118286, "num_tokens": 156901981.0, "step": 4108 }, { "epoch": 0.5227070347284061, "ewc_loss": 0.020894721150398254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0894720364594832e-05, "grad_norm": 14.507115364074707, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8620613813400269, "num_tokens": 156942215.0, "step": 4109 }, { "epoch": 0.5228342450069966, "ewc_loss": 0.020904572680592537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0904572011204436e-05, "grad_norm": 14.516610145568848, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8627364635467529, "num_tokens": 156983820.0, "step": 4110 }, { "epoch": 0.5229614552855871, "ewc_loss": 0.020910724997520447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0910725652356632e-05, "grad_norm": 14.554994583129883, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8687638640403748, "num_tokens": 157016215.0, "step": 4111 }, { "epoch": 0.5230886655641775, "ewc_loss": 0.020902100950479507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.090210182359442e-05, "grad_norm": 14.52638053894043, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8563128709793091, "num_tokens": 157052071.0, "step": 4112 }, { "epoch": 0.5232158758427681, "ewc_loss": 0.02091500349342823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0915003915433772e-05, "grad_norm": 14.575906753540039, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8342477083206177, "num_tokens": 157097711.0, "step": 4113 }, { "epoch": 0.5233430861213586, "ewc_loss": 0.020895473659038544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.08954734262079e-05, "grad_norm": 14.572623252868652, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8549384474754333, "num_tokens": 157134801.0, "step": 4114 }, { "epoch": 0.5234702963999491, "ewc_loss": 0.020918890833854675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.091889109578915e-05, "grad_norm": 14.615768432617188, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8565810918807983, "num_tokens": 157171885.0, "step": 4115 }, { "epoch": 0.5235975066785397, "ewc_loss": 0.0208900086581707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0890009182039648e-05, "grad_norm": 14.504101753234863, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.855092465877533, "num_tokens": 157211915.0, "step": 4116 }, { "epoch": 0.5237247169571302, "ewc_loss": 0.020870214328169823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0870214939350262e-05, "grad_norm": 14.55058765411377, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8504369258880615, "num_tokens": 157260564.0, "step": 4117 }, { "epoch": 0.5238519272357206, "ewc_loss": 0.020923681557178497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.092368231387809e-05, "grad_norm": 14.522326469421387, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8575326800346375, "num_tokens": 157294676.0, "step": 4118 }, { "epoch": 0.5239791375143111, "ewc_loss": 0.02092249132692814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0922490875818767e-05, "grad_norm": 14.543930053710938, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8669931888580322, "num_tokens": 157334607.0, "step": 4119 }, { "epoch": 0.5241063477929017, "ewc_loss": 0.02093224599957466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0932246115989983e-05, "grad_norm": 14.58512020111084, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8580493927001953, "num_tokens": 157372021.0, "step": 4120 }, { "epoch": 0.5242335580714922, "ewc_loss": 0.02095036953687668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.095036870741751e-05, "grad_norm": 14.585434913635254, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.861314594745636, "num_tokens": 157410092.0, "step": 4121 }, { "epoch": 0.5243607683500827, "ewc_loss": 0.020916318520903587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0916319044772536e-05, "grad_norm": 14.486113548278809, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8539009690284729, "num_tokens": 157447979.0, "step": 4122 }, { "epoch": 0.5244879786286732, "ewc_loss": 0.020895252004265785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0895251509500667e-05, "grad_norm": 14.5570068359375, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8518159985542297, "num_tokens": 157481376.0, "step": 4123 }, { "epoch": 0.5246151889072637, "ewc_loss": 0.020947366952896118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.094736737490166e-05, "grad_norm": 14.495512008666992, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8617810010910034, "num_tokens": 157519555.0, "step": 4124 }, { "epoch": 0.5247423991858542, "ewc_loss": 0.020938295871019363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0938296074746177e-05, "grad_norm": 14.563190460205078, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8547220230102539, "num_tokens": 157551849.0, "step": 4125 }, { "epoch": 0.5248696094644447, "ewc_loss": 0.021014563739299774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.101456448144745e-05, "grad_norm": 14.52290153503418, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.856730580329895, "num_tokens": 157590823.0, "step": 4126 }, { "epoch": 0.5249968197430352, "ewc_loss": 0.020956529304385185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.095652962452732e-05, "grad_norm": 14.525038719177246, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8665921092033386, "num_tokens": 157633771.0, "step": 4127 }, { "epoch": 0.5251240300216258, "ewc_loss": 0.02099602483212948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.099602534144651e-05, "grad_norm": 14.541780471801758, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8697382211685181, "num_tokens": 157667796.0, "step": 4128 }, { "epoch": 0.5252512403002163, "ewc_loss": 0.020991330966353416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.099133052979596e-05, "grad_norm": 14.53220272064209, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8661983013153076, "num_tokens": 157709452.0, "step": 4129 }, { "epoch": 0.5253784505788067, "ewc_loss": 0.021015698090195656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.101569771184586e-05, "grad_norm": 14.553116798400879, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8483350872993469, "num_tokens": 157741407.0, "step": 4130 }, { "epoch": 0.5255056608573972, "ewc_loss": 0.021005289629101753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.100528945447877e-05, "grad_norm": 14.592606544494629, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8675069212913513, "num_tokens": 157777693.0, "step": 4131 }, { "epoch": 0.5256328711359878, "ewc_loss": 0.02098539099097252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0985391529393382e-05, "grad_norm": 14.550599098205566, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8550404906272888, "num_tokens": 157814729.0, "step": 4132 }, { "epoch": 0.5257600814145783, "ewc_loss": 0.0210256390273571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1025638488936238e-05, "grad_norm": 14.567078590393066, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8666841983795166, "num_tokens": 157853698.0, "step": 4133 }, { "epoch": 0.5258872916931688, "ewc_loss": 0.021031208336353302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1031208234489895e-05, "grad_norm": 14.558820724487305, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8620007038116455, "num_tokens": 157888580.0, "step": 4134 }, { "epoch": 0.5260145019717594, "ewc_loss": 0.021013405174016953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1013405785197392e-05, "grad_norm": 14.618111610412598, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8543316125869751, "num_tokens": 157928434.0, "step": 4135 }, { "epoch": 0.5261417122503498, "ewc_loss": 0.020989324897527695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.098932418448385e-05, "grad_norm": 14.515334129333496, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8520714044570923, "num_tokens": 157968936.0, "step": 4136 }, { "epoch": 0.5262689225289403, "ewc_loss": 0.020994141697883606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0994140868424438e-05, "grad_norm": 14.598883628845215, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8458489179611206, "num_tokens": 158005496.0, "step": 4137 }, { "epoch": 0.5263961328075308, "ewc_loss": 0.021040210500359535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1040210413048044e-05, "grad_norm": 14.611869812011719, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8645565509796143, "num_tokens": 158047569.0, "step": 4138 }, { "epoch": 0.5265233430861214, "ewc_loss": 0.0209793783724308, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.097937795042526e-05, "grad_norm": 14.541748046875, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8560099601745605, "num_tokens": 158085733.0, "step": 4139 }, { "epoch": 0.5266505533647119, "ewc_loss": 0.02098684385418892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0986843082937412e-05, "grad_norm": 14.581642150878906, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8477113842964172, "num_tokens": 158123906.0, "step": 4140 }, { "epoch": 0.5267777636433024, "ewc_loss": 0.02101563662290573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.101563586620614e-05, "grad_norm": 14.57445240020752, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8631721138954163, "num_tokens": 158163663.0, "step": 4141 }, { "epoch": 0.5269049739218928, "ewc_loss": 0.021013563498854637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.10135640372755e-05, "grad_norm": 14.56161117553711, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8449847102165222, "num_tokens": 158205878.0, "step": 4142 }, { "epoch": 0.5270321842004834, "ewc_loss": 0.020994747057557106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.099474659189582e-05, "grad_norm": 14.56467056274414, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.860136866569519, "num_tokens": 158241608.0, "step": 4143 }, { "epoch": 0.5271593944790739, "ewc_loss": 0.0210098959505558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.100989513564855e-05, "grad_norm": 14.56946849822998, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8690261840820312, "num_tokens": 158274259.0, "step": 4144 }, { "epoch": 0.5272866047576644, "ewc_loss": 0.020965231582522392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0965231669833884e-05, "grad_norm": 14.608118057250977, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8629752993583679, "num_tokens": 158304793.0, "step": 4145 }, { "epoch": 0.527413815036255, "ewc_loss": 0.021034404635429382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1034404198871925e-05, "grad_norm": 14.615289688110352, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8708255290985107, "num_tokens": 158349420.0, "step": 4146 }, { "epoch": 0.5275410253148455, "ewc_loss": 0.02100241184234619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.100241181324236e-05, "grad_norm": 14.580644607543945, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8557289838790894, "num_tokens": 158395834.0, "step": 4147 }, { "epoch": 0.5276682355934359, "ewc_loss": 0.02098746970295906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0987470634281635e-05, "grad_norm": 14.633589744567871, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.858169674873352, "num_tokens": 158435568.0, "step": 4148 }, { "epoch": 0.5277954458720264, "ewc_loss": 0.021021341904997826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1021342035965063e-05, "grad_norm": 14.5663480758667, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8666348457336426, "num_tokens": 158473584.0, "step": 4149 }, { "epoch": 0.527922656150617, "ewc_loss": 0.020955706015229225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0955705622327514e-05, "grad_norm": 14.600439071655273, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8580921292304993, "num_tokens": 158513435.0, "step": 4150 }, { "epoch": 0.5280498664292075, "ewc_loss": 0.021021047607064247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1021047359681688e-05, "grad_norm": 14.603230476379395, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8576152324676514, "num_tokens": 158549478.0, "step": 4151 }, { "epoch": 0.528177076707798, "ewc_loss": 0.021002447232604027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.100244637404103e-05, "grad_norm": 14.602463722229004, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8614664077758789, "num_tokens": 158598742.0, "step": 4152 }, { "epoch": 0.5283042869863885, "ewc_loss": 0.020970869809389114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0970870536984876e-05, "grad_norm": 14.562902450561523, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8459672927856445, "num_tokens": 158636806.0, "step": 4153 }, { "epoch": 0.528431497264979, "ewc_loss": 0.020968127995729446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.096812750096433e-05, "grad_norm": 14.637396812438965, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.850631594657898, "num_tokens": 158676043.0, "step": 4154 }, { "epoch": 0.5285587075435695, "ewc_loss": 0.020996635779738426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0996636521886103e-05, "grad_norm": 14.621126174926758, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8412654995918274, "num_tokens": 158711772.0, "step": 4155 }, { "epoch": 0.52868591782216, "ewc_loss": 0.020918285474181175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.091828537231777e-05, "grad_norm": 14.575478553771973, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8566485643386841, "num_tokens": 158749608.0, "step": 4156 }, { "epoch": 0.5288131281007505, "ewc_loss": 0.020977724343538284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0977724489057437e-05, "grad_norm": 14.645953178405762, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8575830459594727, "num_tokens": 158790502.0, "step": 4157 }, { "epoch": 0.5289403383793411, "ewc_loss": 0.02095070667564869, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.095070703944657e-05, "grad_norm": 14.593558311462402, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.8329615592956543, "num_tokens": 158828415.0, "step": 4158 }, { "epoch": 0.5290675486579316, "ewc_loss": 0.020962296053767204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.096229582093656e-05, "grad_norm": 14.622091293334961, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.847331166267395, "num_tokens": 158866972.0, "step": 4159 }, { "epoch": 0.5291947589365221, "ewc_loss": 0.020955946296453476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0955945728928782e-05, "grad_norm": 14.630090713500977, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8541821241378784, "num_tokens": 158902991.0, "step": 4160 }, { "epoch": 0.5293219692151125, "ewc_loss": 0.02097945287823677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0979452528990805e-05, "grad_norm": 14.517294883728027, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8650141954421997, "num_tokens": 158945963.0, "step": 4161 }, { "epoch": 0.5294491794937031, "ewc_loss": 0.02095501311123371, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0955012587364763e-05, "grad_norm": 14.608816146850586, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8556301593780518, "num_tokens": 158985407.0, "step": 4162 }, { "epoch": 0.5295763897722936, "ewc_loss": 0.020983558148145676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0983557988074608e-05, "grad_norm": 14.546239852905273, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8498260974884033, "num_tokens": 159024387.0, "step": 4163 }, { "epoch": 0.5297036000508841, "ewc_loss": 0.02095516212284565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0955161744495854e-05, "grad_norm": 14.589869499206543, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8493814468383789, "num_tokens": 159061430.0, "step": 4164 }, { "epoch": 0.5298308103294747, "ewc_loss": 0.02102844975888729, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.102845064655412e-05, "grad_norm": 14.589005470275879, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8582512736320496, "num_tokens": 159099350.0, "step": 4165 }, { "epoch": 0.5299580206080652, "ewc_loss": 0.021040581166744232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1040581486886367e-05, "grad_norm": 14.627995491027832, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8311758637428284, "num_tokens": 159136276.0, "step": 4166 }, { "epoch": 0.5300852308866556, "ewc_loss": 0.02099938690662384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0999386833864264e-05, "grad_norm": 14.57778549194336, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8780621290206909, "num_tokens": 159177102.0, "step": 4167 }, { "epoch": 0.5302124411652461, "ewc_loss": 0.02103108912706375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1031090000178665e-05, "grad_norm": 14.61526870727539, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8679620027542114, "num_tokens": 159213582.0, "step": 4168 }, { "epoch": 0.5303396514438367, "ewc_loss": 0.02105371467769146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1053714590379968e-05, "grad_norm": 14.632694244384766, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8559348583221436, "num_tokens": 159254283.0, "step": 4169 }, { "epoch": 0.5304668617224272, "ewc_loss": 0.021005718037486076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1005718735978007e-05, "grad_norm": 14.558866500854492, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8588056564331055, "num_tokens": 159288557.0, "step": 4170 }, { "epoch": 0.5305940720010177, "ewc_loss": 0.020999260246753693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.0999259504606016e-05, "grad_norm": 14.596813201904297, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8663136959075928, "num_tokens": 159325047.0, "step": 4171 }, { "epoch": 0.5307212822796082, "ewc_loss": 0.02103595994412899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1035959434811957e-05, "grad_norm": 14.614392280578613, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8609058856964111, "num_tokens": 159366291.0, "step": 4172 }, { "epoch": 0.5308484925581987, "ewc_loss": 0.0209998469799757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.099984703818336e-05, "grad_norm": 14.582475662231445, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8510376214981079, "num_tokens": 159401634.0, "step": 4173 }, { "epoch": 0.5309757028367892, "ewc_loss": 0.021044563502073288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.104456325469073e-05, "grad_norm": 14.630645751953125, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8487842082977295, "num_tokens": 159435316.0, "step": 4174 }, { "epoch": 0.5311029131153797, "ewc_loss": 0.02104991301894188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1049912902526557e-05, "grad_norm": 14.665108680725098, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8499373197555542, "num_tokens": 159472192.0, "step": 4175 }, { "epoch": 0.5312301233939702, "ewc_loss": 0.02106289379298687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.106289321091026e-05, "grad_norm": 14.543408393859863, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8633995652198792, "num_tokens": 159515342.0, "step": 4176 }, { "epoch": 0.5313573336725608, "ewc_loss": 0.02101808600127697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1018086044932716e-05, "grad_norm": 14.640876770019531, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8609342575073242, "num_tokens": 159553932.0, "step": 4177 }, { "epoch": 0.5314845439511513, "ewc_loss": 0.021110570058226585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.11105707421666e-05, "grad_norm": 14.628847122192383, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8584103584289551, "num_tokens": 159584711.0, "step": 4178 }, { "epoch": 0.5316117542297417, "ewc_loss": 0.021010512486100197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.101051177305635e-05, "grad_norm": 14.626737594604492, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8441159725189209, "num_tokens": 159618784.0, "step": 4179 }, { "epoch": 0.5317389645083322, "ewc_loss": 0.021047426387667656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.104742634401191e-05, "grad_norm": 14.555965423583984, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8556746244430542, "num_tokens": 159661288.0, "step": 4180 }, { "epoch": 0.5318661747869228, "ewc_loss": 0.02103404700756073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.103404767694883e-05, "grad_norm": 14.577919960021973, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8681715726852417, "num_tokens": 159701286.0, "step": 4181 }, { "epoch": 0.5319933850655133, "ewc_loss": 0.021047791466116905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1047791960882023e-05, "grad_norm": 14.574610710144043, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8439230918884277, "num_tokens": 159746631.0, "step": 4182 }, { "epoch": 0.5321205953441038, "ewc_loss": 0.021094849333167076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1094849216751754e-05, "grad_norm": 14.621962547302246, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8602324724197388, "num_tokens": 159787753.0, "step": 4183 }, { "epoch": 0.5322478056226944, "ewc_loss": 0.021077271550893784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.107727232214529e-05, "grad_norm": 14.58709716796875, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8490746021270752, "num_tokens": 159823092.0, "step": 4184 }, { "epoch": 0.5323750159012848, "ewc_loss": 0.021048951894044876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1048952476121485e-05, "grad_norm": 14.641788482666016, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8482097387313843, "num_tokens": 159858480.0, "step": 4185 }, { "epoch": 0.5325022261798753, "ewc_loss": 0.02110443077981472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1104431652929634e-05, "grad_norm": 14.574122428894043, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8477205038070679, "num_tokens": 159896220.0, "step": 4186 }, { "epoch": 0.5326294364584658, "ewc_loss": 0.021048305556178093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1048304915893823e-05, "grad_norm": 14.599334716796875, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8535376787185669, "num_tokens": 159930995.0, "step": 4187 }, { "epoch": 0.5327566467370564, "ewc_loss": 0.02112935110926628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1129351807758212e-05, "grad_norm": 14.6564302444458, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8520910739898682, "num_tokens": 159968013.0, "step": 4188 }, { "epoch": 0.5328838570156469, "ewc_loss": 0.021099332720041275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1099333025631495e-05, "grad_norm": 14.563532829284668, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8490318059921265, "num_tokens": 160004085.0, "step": 4189 }, { "epoch": 0.5330110672942374, "ewc_loss": 0.021070420742034912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1070420189062133e-05, "grad_norm": 14.58267879486084, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8667411804199219, "num_tokens": 160040606.0, "step": 4190 }, { "epoch": 0.5331382775728278, "ewc_loss": 0.021104389801621437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1104389816173352e-05, "grad_norm": 14.658013343811035, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8684551119804382, "num_tokens": 160080792.0, "step": 4191 }, { "epoch": 0.5332654878514184, "ewc_loss": 0.02112472802400589, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.11247279366944e-05, "grad_norm": 14.58226203918457, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8450149297714233, "num_tokens": 160117055.0, "step": 4192 }, { "epoch": 0.5333926981300089, "ewc_loss": 0.021085824817419052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1085825210320763e-05, "grad_norm": 14.628480911254883, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8618376851081848, "num_tokens": 160153223.0, "step": 4193 }, { "epoch": 0.5335199084085994, "ewc_loss": 0.02114991657435894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.11499173019547e-05, "grad_norm": 14.584248542785645, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8395845293998718, "num_tokens": 160190920.0, "step": 4194 }, { "epoch": 0.53364711868719, "ewc_loss": 0.021083341911435127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1083342289784923e-05, "grad_norm": 14.595255851745605, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.855302631855011, "num_tokens": 160221898.0, "step": 4195 }, { "epoch": 0.5337743289657805, "ewc_loss": 0.021123597398400307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1123596525285393e-05, "grad_norm": 14.593192100524902, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8593375086784363, "num_tokens": 160267102.0, "step": 4196 }, { "epoch": 0.5339015392443709, "ewc_loss": 0.021142782643437386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1142783225513995e-05, "grad_norm": 14.60684585571289, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8621008396148682, "num_tokens": 160304422.0, "step": 4197 }, { "epoch": 0.5340287495229614, "ewc_loss": 0.021165482699871063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1165482394280843e-05, "grad_norm": 14.687548637390137, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.853893518447876, "num_tokens": 160339006.0, "step": 4198 }, { "epoch": 0.534155959801552, "ewc_loss": 0.021148400381207466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1148400264792144e-05, "grad_norm": 14.60944938659668, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8769710063934326, "num_tokens": 160377504.0, "step": 4199 }, { "epoch": 0.5342831700801425, "ewc_loss": 0.021094700321555138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1094700059620664e-05, "grad_norm": 14.676397323608398, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8363416194915771, "num_tokens": 160413315.0, "step": 4200 }, { "epoch": 0.534410380358733, "ewc_loss": 0.021145667880773544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.114566814270802e-05, "grad_norm": 14.559713363647461, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8706036806106567, "num_tokens": 160455034.0, "step": 4201 }, { "epoch": 0.5345375906373235, "ewc_loss": 0.021082807332277298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.108280750690028e-05, "grad_norm": 14.605493545532227, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8517974019050598, "num_tokens": 160490505.0, "step": 4202 }, { "epoch": 0.534664800915914, "ewc_loss": 0.021152183413505554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.115218376275152e-05, "grad_norm": 14.591227531433105, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8664697408676147, "num_tokens": 160528712.0, "step": 4203 }, { "epoch": 0.5347920111945045, "ewc_loss": 0.021114392206072807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.111439243890345e-05, "grad_norm": 14.615409851074219, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8479307889938354, "num_tokens": 160567945.0, "step": 4204 }, { "epoch": 0.534919221473095, "ewc_loss": 0.021139243617653847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1139243472134694e-05, "grad_norm": 14.574528694152832, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8635118007659912, "num_tokens": 160596767.0, "step": 4205 }, { "epoch": 0.5350464317516855, "ewc_loss": 0.021139798685908318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1139798263902776e-05, "grad_norm": 14.526180267333984, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8528705835342407, "num_tokens": 160636912.0, "step": 4206 }, { "epoch": 0.5351736420302761, "ewc_loss": 0.021155884489417076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1155885406187735e-05, "grad_norm": 14.623815536499023, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8496246337890625, "num_tokens": 160674588.0, "step": 4207 }, { "epoch": 0.5353008523088666, "ewc_loss": 0.021151965484023094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1151965484023094e-05, "grad_norm": 14.540858268737793, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8445324897766113, "num_tokens": 160713441.0, "step": 4208 }, { "epoch": 0.5354280625874571, "ewc_loss": 0.021150924265384674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1150925022084266e-05, "grad_norm": 14.591798782348633, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.866628110408783, "num_tokens": 160752716.0, "step": 4209 }, { "epoch": 0.5355552728660475, "ewc_loss": 0.021216293796896935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1216294044279493e-05, "grad_norm": 14.652735710144043, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8671005964279175, "num_tokens": 160798771.0, "step": 4210 }, { "epoch": 0.5356824831446381, "ewc_loss": 0.021109245717525482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.110924651788082e-05, "grad_norm": 14.511323928833008, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.874305248260498, "num_tokens": 160843085.0, "step": 4211 }, { "epoch": 0.5358096934232286, "ewc_loss": 0.0211419016122818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1141901015653275e-05, "grad_norm": 14.665282249450684, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8500144481658936, "num_tokens": 160883073.0, "step": 4212 }, { "epoch": 0.5359369037018191, "ewc_loss": 0.021149544045329094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1149544409126975e-05, "grad_norm": 14.533327102661133, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8634569644927979, "num_tokens": 160923734.0, "step": 4213 }, { "epoch": 0.5360641139804097, "ewc_loss": 0.021116789430379868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1116789866937324e-05, "grad_norm": 14.6143798828125, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8536481857299805, "num_tokens": 160963460.0, "step": 4214 }, { "epoch": 0.5361913242590002, "ewc_loss": 0.02115609496831894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1156094589969143e-05, "grad_norm": 14.530388832092285, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8545575141906738, "num_tokens": 161003691.0, "step": 4215 }, { "epoch": 0.5363185345375906, "ewc_loss": 0.021102838218212128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1102838218212128e-05, "grad_norm": 14.573253631591797, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8580014705657959, "num_tokens": 161042056.0, "step": 4216 }, { "epoch": 0.5364457448161811, "ewc_loss": 0.021169377490878105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1169376850593835e-05, "grad_norm": 14.633357048034668, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8803965449333191, "num_tokens": 161078670.0, "step": 4217 }, { "epoch": 0.5365729550947717, "ewc_loss": 0.02114405669271946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1144056518096477e-05, "grad_norm": 14.618016242980957, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.869990348815918, "num_tokens": 161113992.0, "step": 4218 }, { "epoch": 0.5367001653733622, "ewc_loss": 0.021158520132303238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.115851930284407e-05, "grad_norm": 14.640263557434082, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8631769418716431, "num_tokens": 161153763.0, "step": 4219 }, { "epoch": 0.5368273756519527, "ewc_loss": 0.021123073995113373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1123074475326575e-05, "grad_norm": 14.605961799621582, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8415287733078003, "num_tokens": 161184186.0, "step": 4220 }, { "epoch": 0.5369545859305432, "ewc_loss": 0.02112112194299698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.112112269969657e-05, "grad_norm": 14.626119613647461, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8550169467926025, "num_tokens": 161219298.0, "step": 4221 }, { "epoch": 0.5370817962091337, "ewc_loss": 0.021127330139279366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1127330910530873e-05, "grad_norm": 14.565469741821289, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8728942275047302, "num_tokens": 161255252.0, "step": 4222 }, { "epoch": 0.5372090064877242, "ewc_loss": 0.021104201674461365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1104202460264787e-05, "grad_norm": 14.591781616210938, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8615984916687012, "num_tokens": 161295890.0, "step": 4223 }, { "epoch": 0.5373362167663147, "ewc_loss": 0.021160583943128586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1160583855817094e-05, "grad_norm": 14.617903709411621, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8543962836265564, "num_tokens": 161330808.0, "step": 4224 }, { "epoch": 0.5374634270449052, "ewc_loss": 0.021115612238645554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.111561298079323e-05, "grad_norm": 14.535826683044434, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8657371401786804, "num_tokens": 161370515.0, "step": 4225 }, { "epoch": 0.5375906373234958, "ewc_loss": 0.021147454157471657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.11474543903023e-05, "grad_norm": 14.62066650390625, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8690537214279175, "num_tokens": 161405257.0, "step": 4226 }, { "epoch": 0.5377178476020863, "ewc_loss": 0.021176906302571297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.117690564773511e-05, "grad_norm": 14.603816986083984, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8576892614364624, "num_tokens": 161443526.0, "step": 4227 }, { "epoch": 0.5378450578806767, "ewc_loss": 0.021154310554265976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1154310161364265e-05, "grad_norm": 14.64344310760498, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8511074781417847, "num_tokens": 161481262.0, "step": 4228 }, { "epoch": 0.5379722681592672, "ewc_loss": 0.021162113174796104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1162113625905477e-05, "grad_norm": 14.661075592041016, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8429044485092163, "num_tokens": 161517756.0, "step": 4229 }, { "epoch": 0.5380994784378578, "ewc_loss": 0.021178066730499268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1178066162974574e-05, "grad_norm": 14.595908164978027, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8654186725616455, "num_tokens": 161552680.0, "step": 4230 }, { "epoch": 0.5382266887164483, "ewc_loss": 0.02115018106997013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1150181055418216e-05, "grad_norm": 14.65844440460205, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8550302386283875, "num_tokens": 161588607.0, "step": 4231 }, { "epoch": 0.5383538989950388, "ewc_loss": 0.021208198741078377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1208199541433714e-05, "grad_norm": 14.627829551696777, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8415204286575317, "num_tokens": 161626427.0, "step": 4232 }, { "epoch": 0.5384811092736294, "ewc_loss": 0.021140428259968758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1140427634236403e-05, "grad_norm": 14.594853401184082, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8634106516838074, "num_tokens": 161665182.0, "step": 4233 }, { "epoch": 0.5386083195522198, "ewc_loss": 0.021222149953246117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1222149371169508e-05, "grad_norm": 14.640254020690918, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8574860692024231, "num_tokens": 161704685.0, "step": 4234 }, { "epoch": 0.5387355298308103, "ewc_loss": 0.021177103742957115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1177103917580098e-05, "grad_norm": 14.602742195129395, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8635381460189819, "num_tokens": 161742960.0, "step": 4235 }, { "epoch": 0.5388627401094008, "ewc_loss": 0.021185100078582764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1185100194998085e-05, "grad_norm": 14.601261138916016, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8567010760307312, "num_tokens": 161778199.0, "step": 4236 }, { "epoch": 0.5389899503879914, "ewc_loss": 0.02123367227613926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.123367266904097e-05, "grad_norm": 14.651665687561035, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8531943559646606, "num_tokens": 161811235.0, "step": 4237 }, { "epoch": 0.5391171606665819, "ewc_loss": 0.02117173932492733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.117173971782904e-05, "grad_norm": 14.550116539001465, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8579482436180115, "num_tokens": 161847669.0, "step": 4238 }, { "epoch": 0.5392443709451724, "ewc_loss": 0.02121337316930294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.12133727472974e-05, "grad_norm": 14.633695602416992, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8436899185180664, "num_tokens": 161888470.0, "step": 4239 }, { "epoch": 0.5393715812237628, "ewc_loss": 0.021214600652456284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1214600565144792e-05, "grad_norm": 14.596854209899902, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8387895822525024, "num_tokens": 161924563.0, "step": 4240 }, { "epoch": 0.5394987915023534, "ewc_loss": 0.021176517009735107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1176516384002753e-05, "grad_norm": 14.637055397033691, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8436676263809204, "num_tokens": 161959062.0, "step": 4241 }, { "epoch": 0.5396260017809439, "ewc_loss": 0.021226583048701286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.122658224834595e-05, "grad_norm": 14.52392292022705, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8616684675216675, "num_tokens": 161994755.0, "step": 4242 }, { "epoch": 0.5397532120595344, "ewc_loss": 0.021195342764258385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1195342924329452e-05, "grad_norm": 14.636022567749023, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8673756122589111, "num_tokens": 162032812.0, "step": 4243 }, { "epoch": 0.5398804223381249, "ewc_loss": 0.021305838599801064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.130583925463725e-05, "grad_norm": 14.63404369354248, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.860704779624939, "num_tokens": 162067130.0, "step": 4244 }, { "epoch": 0.5400076326167155, "ewc_loss": 0.02122127264738083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1221272618277e-05, "grad_norm": 14.607860565185547, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8591205477714539, "num_tokens": 162107452.0, "step": 4245 }, { "epoch": 0.5401348428953059, "ewc_loss": 0.021249474957585335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1249474229989573e-05, "grad_norm": 14.619668960571289, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8572211861610413, "num_tokens": 162144220.0, "step": 4246 }, { "epoch": 0.5402620531738964, "ewc_loss": 0.021216075867414474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1216075765551068e-05, "grad_norm": 14.601639747619629, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8671789169311523, "num_tokens": 162181798.0, "step": 4247 }, { "epoch": 0.540389263452487, "ewc_loss": 0.02125089429318905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1250894860713743e-05, "grad_norm": 14.652909278869629, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8499279618263245, "num_tokens": 162222501.0, "step": 4248 }, { "epoch": 0.5405164737310775, "ewc_loss": 0.02124154008924961, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1241539798211306e-05, "grad_norm": 14.581817626953125, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8541345596313477, "num_tokens": 162265533.0, "step": 4249 }, { "epoch": 0.540643684009668, "ewc_loss": 0.021224653348326683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.122465411957819e-05, "grad_norm": 14.664247512817383, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8730259537696838, "num_tokens": 162304984.0, "step": 4250 }, { "epoch": 0.5407708942882585, "ewc_loss": 0.02124752290546894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.124752245435957e-05, "grad_norm": 14.609541893005371, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8489341735839844, "num_tokens": 162342358.0, "step": 4251 }, { "epoch": 0.540898104566849, "ewc_loss": 0.021213077008724213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.121307625202462e-05, "grad_norm": 14.662102699279785, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8596250414848328, "num_tokens": 162377343.0, "step": 4252 }, { "epoch": 0.5410253148454395, "ewc_loss": 0.021260760724544525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1260761059238575e-05, "grad_norm": 14.588020324707031, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8582100868225098, "num_tokens": 162416416.0, "step": 4253 }, { "epoch": 0.54115252512403, "ewc_loss": 0.021236246451735497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1236246539046988e-05, "grad_norm": 14.622867584228516, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.8837804794311523, "num_tokens": 162447108.0, "step": 4254 }, { "epoch": 0.5412797354026205, "ewc_loss": 0.021230267360806465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1230267520877533e-05, "grad_norm": 14.629656791687012, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8602196574211121, "num_tokens": 162489659.0, "step": 4255 }, { "epoch": 0.5414069456812111, "ewc_loss": 0.021232254803180695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1232255676295608e-05, "grad_norm": 14.647966384887695, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8480803966522217, "num_tokens": 162525816.0, "step": 4256 }, { "epoch": 0.5415341559598016, "ewc_loss": 0.021219447255134583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.121944635291584e-05, "grad_norm": 14.606427192687988, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8537850975990295, "num_tokens": 162564154.0, "step": 4257 }, { "epoch": 0.5416613662383921, "ewc_loss": 0.021204009652137756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1204010408837348e-05, "grad_norm": 14.64082145690918, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8515424132347107, "num_tokens": 162602728.0, "step": 4258 }, { "epoch": 0.5417885765169825, "ewc_loss": 0.021262723952531815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1262723748805e-05, "grad_norm": 14.668917655944824, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8628588914871216, "num_tokens": 162636659.0, "step": 4259 }, { "epoch": 0.5419157867955731, "ewc_loss": 0.021204397082328796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1204397853580303e-05, "grad_norm": 14.582622528076172, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8743759393692017, "num_tokens": 162676639.0, "step": 4260 }, { "epoch": 0.5420429970741636, "ewc_loss": 0.02125198021531105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.125198079738766e-05, "grad_norm": 14.675511360168457, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8396650552749634, "num_tokens": 162716763.0, "step": 4261 }, { "epoch": 0.5421702073527541, "ewc_loss": 0.02124846912920475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1248468328849413e-05, "grad_norm": 14.593170166015625, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8476437330245972, "num_tokens": 162753943.0, "step": 4262 }, { "epoch": 0.5422974176313446, "ewc_loss": 0.0212385393679142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.123854028468486e-05, "grad_norm": 14.68205451965332, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8532297611236572, "num_tokens": 162792938.0, "step": 4263 }, { "epoch": 0.5424246279099352, "ewc_loss": 0.021283768117427826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1283767637214623e-05, "grad_norm": 14.609054565429688, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8557076454162598, "num_tokens": 162838938.0, "step": 4264 }, { "epoch": 0.5425518381885256, "ewc_loss": 0.021219836547970772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.12198374356376e-05, "grad_norm": 14.62051773071289, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.863267183303833, "num_tokens": 162875026.0, "step": 4265 }, { "epoch": 0.5426790484671161, "ewc_loss": 0.02125425450503826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1254254534142092e-05, "grad_norm": 14.634964942932129, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.861289381980896, "num_tokens": 162910724.0, "step": 4266 }, { "epoch": 0.5428062587457066, "ewc_loss": 0.021203435957431793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1203435608185828e-05, "grad_norm": 14.680335998535156, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8607362508773804, "num_tokens": 162941602.0, "step": 4267 }, { "epoch": 0.5429334690242972, "ewc_loss": 0.021249061450362206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.124906131939497e-05, "grad_norm": 14.622005462646484, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8444812297821045, "num_tokens": 162980760.0, "step": 4268 }, { "epoch": 0.5430606793028877, "ewc_loss": 0.021252771839499474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1252772057778202e-05, "grad_norm": 14.725954055786133, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.867775559425354, "num_tokens": 163016155.0, "step": 4269 }, { "epoch": 0.5431878895814782, "ewc_loss": 0.02128256857395172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1282568923197687e-05, "grad_norm": 14.6202974319458, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8495310544967651, "num_tokens": 163052519.0, "step": 4270 }, { "epoch": 0.5433150998600687, "ewc_loss": 0.02120225876569748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1202258722041734e-05, "grad_norm": 14.667439460754395, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8661223649978638, "num_tokens": 163091806.0, "step": 4271 }, { "epoch": 0.5434423101386592, "ewc_loss": 0.02128007635474205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.128007690771483e-05, "grad_norm": 14.624391555786133, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8731217384338379, "num_tokens": 163128830.0, "step": 4272 }, { "epoch": 0.5435695204172497, "ewc_loss": 0.021223999559879303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1223999283392914e-05, "grad_norm": 14.615312576293945, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8533703088760376, "num_tokens": 163169567.0, "step": 4273 }, { "epoch": 0.5436967306958402, "ewc_loss": 0.021274318918585777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1274319806252606e-05, "grad_norm": 14.680055618286133, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.850579023361206, "num_tokens": 163202913.0, "step": 4274 }, { "epoch": 0.5438239409744308, "ewc_loss": 0.02121749520301819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1217494577285834e-05, "grad_norm": 14.645225524902344, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8659217357635498, "num_tokens": 163236741.0, "step": 4275 }, { "epoch": 0.5439511512530213, "ewc_loss": 0.021248899400234222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1248899429338053e-05, "grad_norm": 14.636918067932129, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8519263863563538, "num_tokens": 163281402.0, "step": 4276 }, { "epoch": 0.5440783615316117, "ewc_loss": 0.021273138001561165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.12731374631403e-05, "grad_norm": 14.622830390930176, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8672094941139221, "num_tokens": 163325288.0, "step": 4277 }, { "epoch": 0.5442055718102022, "ewc_loss": 0.021241744980216026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1241745344013907e-05, "grad_norm": 14.637458801269531, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.847172737121582, "num_tokens": 163367471.0, "step": 4278 }, { "epoch": 0.5443327820887928, "ewc_loss": 0.02125769853591919, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1257697881083004e-05, "grad_norm": 14.683399200439453, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8594905138015747, "num_tokens": 163400077.0, "step": 4279 }, { "epoch": 0.5444599923673833, "ewc_loss": 0.021221568807959557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1221569113549776e-05, "grad_norm": 14.61245346069336, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8604549765586853, "num_tokens": 163433739.0, "step": 4280 }, { "epoch": 0.5445872026459738, "ewc_loss": 0.021221468225121498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.122146906913258e-05, "grad_norm": 14.572031021118164, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8583692312240601, "num_tokens": 163467447.0, "step": 4281 }, { "epoch": 0.5447144129245644, "ewc_loss": 0.021266397088766098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.126639628841076e-05, "grad_norm": 14.681830406188965, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8546620011329651, "num_tokens": 163506633.0, "step": 4282 }, { "epoch": 0.5448416232031548, "ewc_loss": 0.021302876994013786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1302876120898873e-05, "grad_norm": 14.637985229492188, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8424450159072876, "num_tokens": 163542773.0, "step": 4283 }, { "epoch": 0.5449688334817453, "ewc_loss": 0.021236203610897064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1236202883301303e-05, "grad_norm": 14.622647285461426, "learning_rate": 1e-06, "loss": 0.5492, "mean_token_accuracy": 0.8245899677276611, "num_tokens": 163585247.0, "step": 4284 }, { "epoch": 0.5450960437603358, "ewc_loss": 0.021257517859339714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1257517801132053e-05, "grad_norm": 14.65669059753418, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8461602926254272, "num_tokens": 163622802.0, "step": 4285 }, { "epoch": 0.5452232540389264, "ewc_loss": 0.021264508366584778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.126450817740988e-05, "grad_norm": 14.617010116577148, "learning_rate": 1e-06, "loss": 0.3621, "mean_token_accuracy": 0.8824002146720886, "num_tokens": 163655980.0, "step": 4286 }, { "epoch": 0.5453504643175169, "ewc_loss": 0.021228188648819923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.122818841598928e-05, "grad_norm": 14.704534530639648, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8630524277687073, "num_tokens": 163691494.0, "step": 4287 }, { "epoch": 0.5454776745961074, "ewc_loss": 0.02127387933433056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1273879610816948e-05, "grad_norm": 14.635833740234375, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8685051798820496, "num_tokens": 163725169.0, "step": 4288 }, { "epoch": 0.5456048848746978, "ewc_loss": 0.02122870273888111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1228703189990483e-05, "grad_norm": 14.661019325256348, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8622968196868896, "num_tokens": 163766050.0, "step": 4289 }, { "epoch": 0.5457320951532884, "ewc_loss": 0.021302644163370132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.130264329025522e-05, "grad_norm": 14.708613395690918, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8518701791763306, "num_tokens": 163799559.0, "step": 4290 }, { "epoch": 0.5458593054318789, "ewc_loss": 0.0212408360093832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1240835849312134e-05, "grad_norm": 14.652796745300293, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8530939221382141, "num_tokens": 163837052.0, "step": 4291 }, { "epoch": 0.5459865157104694, "ewc_loss": 0.021258380264043808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1258380002109334e-05, "grad_norm": 14.64877700805664, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8545910120010376, "num_tokens": 163872494.0, "step": 4292 }, { "epoch": 0.5461137259890599, "ewc_loss": 0.021276455372571945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.127645529981237e-05, "grad_norm": 14.6558256149292, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8506250381469727, "num_tokens": 163913499.0, "step": 4293 }, { "epoch": 0.5462409362676505, "ewc_loss": 0.0212757159024477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.127571679011453e-05, "grad_norm": 14.664344787597656, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8382077217102051, "num_tokens": 163957965.0, "step": 4294 }, { "epoch": 0.5463681465462409, "ewc_loss": 0.021285051479935646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1285051843733527e-05, "grad_norm": 14.69159984588623, "learning_rate": 1e-06, "loss": 0.5763, "mean_token_accuracy": 0.8174850940704346, "num_tokens": 164005661.0, "step": 4295 }, { "epoch": 0.5464953568248314, "ewc_loss": 0.021295582875609398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1295581973390654e-05, "grad_norm": 14.711556434631348, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8735289573669434, "num_tokens": 164044717.0, "step": 4296 }, { "epoch": 0.5466225671034219, "ewc_loss": 0.021226588636636734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.122658770531416e-05, "grad_norm": 14.60195255279541, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8572450876235962, "num_tokens": 164080022.0, "step": 4297 }, { "epoch": 0.5467497773820125, "ewc_loss": 0.02122465893626213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.12246595765464e-05, "grad_norm": 14.670319557189941, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8649356365203857, "num_tokens": 164112749.0, "step": 4298 }, { "epoch": 0.546876987660603, "ewc_loss": 0.021329404786229134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1329404262360185e-05, "grad_norm": 14.680298805236816, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8526260852813721, "num_tokens": 164152517.0, "step": 4299 }, { "epoch": 0.5470041979391935, "ewc_loss": 0.02126842923462391, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1268429918563925e-05, "grad_norm": 14.6189546585083, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8638616800308228, "num_tokens": 164195245.0, "step": 4300 }, { "epoch": 0.5471314082177839, "ewc_loss": 0.021288132295012474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.128813139279373e-05, "grad_norm": 14.65511703491211, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8524671792984009, "num_tokens": 164230643.0, "step": 4301 }, { "epoch": 0.5472586184963745, "ewc_loss": 0.021268371492624283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.126837171090301e-05, "grad_norm": 14.639789581298828, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8540861010551453, "num_tokens": 164271089.0, "step": 4302 }, { "epoch": 0.547385828774965, "ewc_loss": 0.02129332348704338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.129332278855145e-05, "grad_norm": 14.682201385498047, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.856063961982727, "num_tokens": 164310618.0, "step": 4303 }, { "epoch": 0.5475130390535555, "ewc_loss": 0.021304277703166008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1304278561729006e-05, "grad_norm": 14.716690063476562, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.844398558139801, "num_tokens": 164348630.0, "step": 4304 }, { "epoch": 0.5476402493321461, "ewc_loss": 0.02125573717057705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1255737010505982e-05, "grad_norm": 14.627903938293457, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8640673160552979, "num_tokens": 164383735.0, "step": 4305 }, { "epoch": 0.5477674596107366, "ewc_loss": 0.021253950893878937, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.12539507629117e-05, "grad_norm": 14.6502685546875, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8592133522033691, "num_tokens": 164426115.0, "step": 4306 }, { "epoch": 0.5478946698893271, "ewc_loss": 0.021276723593473434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.127672269125469e-05, "grad_norm": 14.628539085388184, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8500970602035522, "num_tokens": 164462119.0, "step": 4307 }, { "epoch": 0.5480218801679175, "ewc_loss": 0.021257618442177773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1257617845549248e-05, "grad_norm": 14.653400421142578, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8551953434944153, "num_tokens": 164503030.0, "step": 4308 }, { "epoch": 0.5481490904465081, "ewc_loss": 0.021286264061927795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.128626329067629e-05, "grad_norm": 14.67648983001709, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8603404760360718, "num_tokens": 164541091.0, "step": 4309 }, { "epoch": 0.5482763007250986, "ewc_loss": 0.021251218393445015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1251218640827574e-05, "grad_norm": 14.66977310180664, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8515456914901733, "num_tokens": 164577678.0, "step": 4310 }, { "epoch": 0.5484035110036891, "ewc_loss": 0.021262861788272858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.126286199199967e-05, "grad_norm": 14.64294719696045, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.854807436466217, "num_tokens": 164612779.0, "step": 4311 }, { "epoch": 0.5485307212822796, "ewc_loss": 0.021262098103761673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.126209801645018e-05, "grad_norm": 14.669328689575195, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8624095916748047, "num_tokens": 164649849.0, "step": 4312 }, { "epoch": 0.5486579315608702, "ewc_loss": 0.02127727121114731, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.127727202605456e-05, "grad_norm": 14.675949096679688, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8541002869606018, "num_tokens": 164690538.0, "step": 4313 }, { "epoch": 0.5487851418394606, "ewc_loss": 0.021267209202051163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1267209376674145e-05, "grad_norm": 14.706878662109375, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8585774898529053, "num_tokens": 164729338.0, "step": 4314 }, { "epoch": 0.5489123521180511, "ewc_loss": 0.02125990204513073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1259902496240102e-05, "grad_norm": 14.600887298583984, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8690556883811951, "num_tokens": 164766648.0, "step": 4315 }, { "epoch": 0.5490395623966416, "ewc_loss": 0.02126999758183956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.126999788742978e-05, "grad_norm": 14.75497055053711, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8568080067634583, "num_tokens": 164804254.0, "step": 4316 }, { "epoch": 0.5491667726752322, "ewc_loss": 0.021332692354917526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1332692995201796e-05, "grad_norm": 14.669404983520508, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8699934482574463, "num_tokens": 164839943.0, "step": 4317 }, { "epoch": 0.5492939829538227, "ewc_loss": 0.021259911358356476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.125991159118712e-05, "grad_norm": 14.748869895935059, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8496497869491577, "num_tokens": 164887057.0, "step": 4318 }, { "epoch": 0.5494211932324132, "ewc_loss": 0.02129277028143406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1292769815772772e-05, "grad_norm": 14.692314147949219, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.857656717300415, "num_tokens": 164928515.0, "step": 4319 }, { "epoch": 0.5495484035110036, "ewc_loss": 0.02125103771686554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1251038560876623e-05, "grad_norm": 14.678985595703125, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8564870357513428, "num_tokens": 164963818.0, "step": 4320 }, { "epoch": 0.5496756137895942, "ewc_loss": 0.02126389555633068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1263895177980885e-05, "grad_norm": 14.66425609588623, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.871488094329834, "num_tokens": 165001091.0, "step": 4321 }, { "epoch": 0.5498028240681847, "ewc_loss": 0.021269088611006737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.126908839272801e-05, "grad_norm": 14.718866348266602, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.840539276599884, "num_tokens": 165046009.0, "step": 4322 }, { "epoch": 0.5499300343467752, "ewc_loss": 0.021263711154460907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1263711460051127e-05, "grad_norm": 14.616109848022461, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8534084558486938, "num_tokens": 165090729.0, "step": 4323 }, { "epoch": 0.5500572446253658, "ewc_loss": 0.02120703086256981, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1207031750236638e-05, "grad_norm": 14.684330940246582, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8739226460456848, "num_tokens": 165129771.0, "step": 4324 }, { "epoch": 0.5501844549039563, "ewc_loss": 0.021263519302010536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.126351864717435e-05, "grad_norm": 14.710580825805664, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8726438283920288, "num_tokens": 165164310.0, "step": 4325 }, { "epoch": 0.5503116651825467, "ewc_loss": 0.021197639405727386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1197638488956727e-05, "grad_norm": 14.634060859680176, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8553515672683716, "num_tokens": 165202842.0, "step": 4326 }, { "epoch": 0.5504388754611372, "ewc_loss": 0.021170999854803085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1170999389141798e-05, "grad_norm": 14.687078475952148, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8618541955947876, "num_tokens": 165235998.0, "step": 4327 }, { "epoch": 0.5505660857397278, "ewc_loss": 0.02123621478676796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1236215616227128e-05, "grad_norm": 14.655067443847656, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8587977886199951, "num_tokens": 165271779.0, "step": 4328 }, { "epoch": 0.5506932960183183, "ewc_loss": 0.021200083196163177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1200083210715093e-05, "grad_norm": 14.679635047912598, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8522051572799683, "num_tokens": 165306136.0, "step": 4329 }, { "epoch": 0.5508205062969088, "ewc_loss": 0.021222509443759918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.122250953107141e-05, "grad_norm": 14.667351722717285, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8717851638793945, "num_tokens": 165342401.0, "step": 4330 }, { "epoch": 0.5509477165754993, "ewc_loss": 0.021229280158877373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1229279809631407e-05, "grad_norm": 14.695389747619629, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8612790107727051, "num_tokens": 165380120.0, "step": 4331 }, { "epoch": 0.5510749268540898, "ewc_loss": 0.02125096507370472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.125096580130048e-05, "grad_norm": 14.731287956237793, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8453562259674072, "num_tokens": 165418210.0, "step": 4332 }, { "epoch": 0.5512021371326803, "ewc_loss": 0.02123105339705944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1231053324299864e-05, "grad_norm": 14.638213157653809, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8372640013694763, "num_tokens": 165460952.0, "step": 4333 }, { "epoch": 0.5513293474112708, "ewc_loss": 0.021226363256573677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1226363969617523e-05, "grad_norm": 14.672767639160156, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8702266812324524, "num_tokens": 165500370.0, "step": 4334 }, { "epoch": 0.5514565576898613, "ewc_loss": 0.02128477208316326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.128477171936538e-05, "grad_norm": 14.696051597595215, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8583824634552002, "num_tokens": 165534089.0, "step": 4335 }, { "epoch": 0.5515837679684519, "ewc_loss": 0.021260347217321396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1260346329654567e-05, "grad_norm": 14.686335563659668, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8503656387329102, "num_tokens": 165573111.0, "step": 4336 }, { "epoch": 0.5517109782470424, "ewc_loss": 0.02126297727227211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1262976588332094e-05, "grad_norm": 14.632204055786133, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8484779596328735, "num_tokens": 165611942.0, "step": 4337 }, { "epoch": 0.5518381885256328, "ewc_loss": 0.021286899223923683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.128689993696753e-05, "grad_norm": 14.690354347229004, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8638890981674194, "num_tokens": 165651703.0, "step": 4338 }, { "epoch": 0.5519653988042234, "ewc_loss": 0.02129374071955681, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1293741156114265e-05, "grad_norm": 14.705759048461914, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8553730249404907, "num_tokens": 165691257.0, "step": 4339 }, { "epoch": 0.5520926090828139, "ewc_loss": 0.02129015326499939, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1290154109010473e-05, "grad_norm": 14.68714714050293, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.874211311340332, "num_tokens": 165727947.0, "step": 4340 }, { "epoch": 0.5522198193614044, "ewc_loss": 0.02126164361834526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1261643269099295e-05, "grad_norm": 14.69177532196045, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8662081956863403, "num_tokens": 165763400.0, "step": 4341 }, { "epoch": 0.5523470296399949, "ewc_loss": 0.021283641457557678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.128364212694578e-05, "grad_norm": 14.656990051269531, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8578689098358154, "num_tokens": 165809586.0, "step": 4342 }, { "epoch": 0.5524742399185855, "ewc_loss": 0.02131272852420807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1312727767508477e-05, "grad_norm": 14.760266304016113, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8600683212280273, "num_tokens": 165839565.0, "step": 4343 }, { "epoch": 0.5526014501971759, "ewc_loss": 0.021305274218320847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1305273548932746e-05, "grad_norm": 14.710701942443848, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.861493706703186, "num_tokens": 165873539.0, "step": 4344 }, { "epoch": 0.5527286604757664, "ewc_loss": 0.021289972588419914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.128997221007012e-05, "grad_norm": 14.734600067138672, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8661749362945557, "num_tokens": 165920987.0, "step": 4345 }, { "epoch": 0.5528558707543569, "ewc_loss": 0.021262047812342644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1262047084746882e-05, "grad_norm": 14.60632038116455, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.84804767370224, "num_tokens": 165965661.0, "step": 4346 }, { "epoch": 0.5529830810329475, "ewc_loss": 0.021255232393741608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.12552331504412e-05, "grad_norm": 14.89148998260498, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8513364195823669, "num_tokens": 165998239.0, "step": 4347 }, { "epoch": 0.553110291311538, "ewc_loss": 0.021369382739067078, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1369382011471316e-05, "grad_norm": 14.64413833618164, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8546637296676636, "num_tokens": 166036489.0, "step": 4348 }, { "epoch": 0.5532375015901285, "ewc_loss": 0.021199669688940048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1199670300120488e-05, "grad_norm": 14.604537010192871, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8540621399879456, "num_tokens": 166078774.0, "step": 4349 }, { "epoch": 0.5533647118687189, "ewc_loss": 0.021292351186275482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1292351448209956e-05, "grad_norm": 14.693191528320312, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8650369644165039, "num_tokens": 166119555.0, "step": 4350 }, { "epoch": 0.5534919221473095, "ewc_loss": 0.021290790289640427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1290790755301714e-05, "grad_norm": 14.637285232543945, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8566782474517822, "num_tokens": 166159788.0, "step": 4351 }, { "epoch": 0.5536191324259, "ewc_loss": 0.02125343307852745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.125343235093169e-05, "grad_norm": 14.675529479980469, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8527216911315918, "num_tokens": 166200230.0, "step": 4352 }, { "epoch": 0.5537463427044905, "ewc_loss": 0.021333899348974228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1333898985176347e-05, "grad_norm": 14.74476432800293, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8613964319229126, "num_tokens": 166237864.0, "step": 4353 }, { "epoch": 0.553873552983081, "ewc_loss": 0.021309901028871536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1309901057975367e-05, "grad_norm": 14.633630752563477, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8606038093566895, "num_tokens": 166278347.0, "step": 4354 }, { "epoch": 0.5540007632616716, "ewc_loss": 0.02129058539867401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1290585209499113e-05, "grad_norm": 14.766409873962402, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8492240309715271, "num_tokens": 166315744.0, "step": 4355 }, { "epoch": 0.554127973540262, "ewc_loss": 0.021348832175135612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.134883288817946e-05, "grad_norm": 14.697876930236816, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8579981327056885, "num_tokens": 166352705.0, "step": 4356 }, { "epoch": 0.5542551838188525, "ewc_loss": 0.021246686577796936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1246685719233938e-05, "grad_norm": 14.674219131469727, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8619794845581055, "num_tokens": 166393956.0, "step": 4357 }, { "epoch": 0.554382394097443, "ewc_loss": 0.021326689049601555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.132668851118069e-05, "grad_norm": 14.76758098602295, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8353572487831116, "num_tokens": 166430172.0, "step": 4358 }, { "epoch": 0.5545096043760336, "ewc_loss": 0.021270718425512314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1270718207233585e-05, "grad_norm": 14.671138763427734, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8580331206321716, "num_tokens": 166468732.0, "step": 4359 }, { "epoch": 0.5546368146546241, "ewc_loss": 0.02131795510649681, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1317955543054268e-05, "grad_norm": 14.781171798706055, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.854170560836792, "num_tokens": 166511383.0, "step": 4360 }, { "epoch": 0.5547640249332146, "ewc_loss": 0.021291198208928108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1291198208928108e-05, "grad_norm": 14.678498268127441, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.861943244934082, "num_tokens": 166547674.0, "step": 4361 }, { "epoch": 0.5548912352118052, "ewc_loss": 0.021263830363750458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1263829694362357e-05, "grad_norm": 14.747540473937988, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8510892391204834, "num_tokens": 166581763.0, "step": 4362 }, { "epoch": 0.5550184454903956, "ewc_loss": 0.021327435970306396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.132743611582555e-05, "grad_norm": 14.788496017456055, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8549104332923889, "num_tokens": 166618604.0, "step": 4363 }, { "epoch": 0.5551456557689861, "ewc_loss": 0.0212359968572855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1235997337498702e-05, "grad_norm": 14.63762378692627, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8671209812164307, "num_tokens": 166654812.0, "step": 4364 }, { "epoch": 0.5552728660475766, "ewc_loss": 0.021272731944918633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.127273182850331e-05, "grad_norm": 14.735822677612305, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8447480797767639, "num_tokens": 166693206.0, "step": 4365 }, { "epoch": 0.5554000763261672, "ewc_loss": 0.02132631652057171, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1326317437342368e-05, "grad_norm": 14.665553092956543, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8559858798980713, "num_tokens": 166734531.0, "step": 4366 }, { "epoch": 0.5555272866047577, "ewc_loss": 0.021297575905919075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.129757558577694e-05, "grad_norm": 14.6466646194458, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8563471436500549, "num_tokens": 166771362.0, "step": 4367 }, { "epoch": 0.5556544968833482, "ewc_loss": 0.02133253961801529, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.13325402000919e-05, "grad_norm": 14.680188179016113, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8490005135536194, "num_tokens": 166802784.0, "step": 4368 }, { "epoch": 0.5557817071619386, "ewc_loss": 0.021389778703451157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1389778339653276e-05, "grad_norm": 14.724608421325684, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8529235124588013, "num_tokens": 166835590.0, "step": 4369 }, { "epoch": 0.5559089174405292, "ewc_loss": 0.021405348554253578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.140534888894763e-05, "grad_norm": 14.691045761108398, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8521786332130432, "num_tokens": 166872145.0, "step": 4370 }, { "epoch": 0.5560361277191197, "ewc_loss": 0.021357085555791855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1357085643103346e-05, "grad_norm": 14.643613815307617, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8540975451469421, "num_tokens": 166912855.0, "step": 4371 }, { "epoch": 0.5561633379977102, "ewc_loss": 0.021416569128632545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.14165684155887e-05, "grad_norm": 14.673905372619629, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8426978588104248, "num_tokens": 166949482.0, "step": 4372 }, { "epoch": 0.5562905482763008, "ewc_loss": 0.021437210962176323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1437210307340138e-05, "grad_norm": 14.714998245239258, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8574317693710327, "num_tokens": 166986727.0, "step": 4373 }, { "epoch": 0.5564177585548913, "ewc_loss": 0.02142215520143509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.142215453204699e-05, "grad_norm": 14.63996410369873, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8497815132141113, "num_tokens": 167025869.0, "step": 4374 }, { "epoch": 0.5565449688334817, "ewc_loss": 0.021419920027256012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1419920813059434e-05, "grad_norm": 14.72774600982666, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.867807924747467, "num_tokens": 167059825.0, "step": 4375 }, { "epoch": 0.5566721791120722, "ewc_loss": 0.021457063034176826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1457062757690437e-05, "grad_norm": 14.673070907592773, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8625431656837463, "num_tokens": 167097375.0, "step": 4376 }, { "epoch": 0.5567993893906628, "ewc_loss": 0.021435655653476715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1435655071400106e-05, "grad_norm": 14.710397720336914, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8479094505310059, "num_tokens": 167136555.0, "step": 4377 }, { "epoch": 0.5569265996692533, "ewc_loss": 0.021453894674777985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.145389407814946e-05, "grad_norm": 14.704924583435059, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8550337553024292, "num_tokens": 167179870.0, "step": 4378 }, { "epoch": 0.5570538099478438, "ewc_loss": 0.021462619304656982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.146261977031827e-05, "grad_norm": 14.728720664978027, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8561052083969116, "num_tokens": 167215242.0, "step": 4379 }, { "epoch": 0.5571810202264343, "ewc_loss": 0.02144717052578926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1447171093313955e-05, "grad_norm": 14.666407585144043, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8644179105758667, "num_tokens": 167250690.0, "step": 4380 }, { "epoch": 0.5573082305050248, "ewc_loss": 0.02141994796693325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1419948097900487e-05, "grad_norm": 14.76474666595459, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8441135883331299, "num_tokens": 167283677.0, "step": 4381 }, { "epoch": 0.5574354407836153, "ewc_loss": 0.021492905914783478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1492905943887308e-05, "grad_norm": 14.751840591430664, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8580121397972107, "num_tokens": 167321355.0, "step": 4382 }, { "epoch": 0.5575626510622058, "ewc_loss": 0.02139534056186676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.139534080924932e-05, "grad_norm": 14.683431625366211, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8489997386932373, "num_tokens": 167363103.0, "step": 4383 }, { "epoch": 0.5576898613407963, "ewc_loss": 0.021485090255737305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1485089746420272e-05, "grad_norm": 14.761125564575195, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8712408542633057, "num_tokens": 167399732.0, "step": 4384 }, { "epoch": 0.5578170716193869, "ewc_loss": 0.021432718262076378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.143271740351338e-05, "grad_norm": 14.739953994750977, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8557010889053345, "num_tokens": 167434938.0, "step": 4385 }, { "epoch": 0.5579442818979774, "ewc_loss": 0.0214410200715065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1441019271151163e-05, "grad_norm": 14.80195426940918, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.860531210899353, "num_tokens": 167472293.0, "step": 4386 }, { "epoch": 0.5580714921765678, "ewc_loss": 0.021404718980193138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1404719518614e-05, "grad_norm": 14.699203491210938, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8615866899490356, "num_tokens": 167511015.0, "step": 4387 }, { "epoch": 0.5581987024551583, "ewc_loss": 0.021367399021983147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.136739931302145e-05, "grad_norm": 14.642976760864258, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.856755793094635, "num_tokens": 167549777.0, "step": 4388 }, { "epoch": 0.5583259127337489, "ewc_loss": 0.021438244730234146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1438245312310755e-05, "grad_norm": 14.757920265197754, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8629790544509888, "num_tokens": 167592342.0, "step": 4389 }, { "epoch": 0.5584531230123394, "ewc_loss": 0.02139035426080227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.13903549592942e-05, "grad_norm": 14.688729286193848, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8615560531616211, "num_tokens": 167630829.0, "step": 4390 }, { "epoch": 0.5585803332909299, "ewc_loss": 0.02137972041964531, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.137972114724107e-05, "grad_norm": 14.706844329833984, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8455103635787964, "num_tokens": 167675938.0, "step": 4391 }, { "epoch": 0.5587075435695205, "ewc_loss": 0.021412769332528114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1412768546724692e-05, "grad_norm": 14.684514045715332, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8606950044631958, "num_tokens": 167716550.0, "step": 4392 }, { "epoch": 0.5588347538481109, "ewc_loss": 0.02140440233051777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1404403014457785e-05, "grad_norm": 14.715154647827148, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8452073931694031, "num_tokens": 167754401.0, "step": 4393 }, { "epoch": 0.5589619641267014, "ewc_loss": 0.021417630836367607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.141763070540037e-05, "grad_norm": 14.728204727172852, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8616116046905518, "num_tokens": 167792124.0, "step": 4394 }, { "epoch": 0.5590891744052919, "ewc_loss": 0.021399565041065216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1399564502644353e-05, "grad_norm": 14.714394569396973, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8754891157150269, "num_tokens": 167829226.0, "step": 4395 }, { "epoch": 0.5592163846838825, "ewc_loss": 0.021402280777692795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1402280253823847e-05, "grad_norm": 14.691333770751953, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8672032356262207, "num_tokens": 167863439.0, "step": 4396 }, { "epoch": 0.559343594962473, "ewc_loss": 0.021378718316555023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1378718884079717e-05, "grad_norm": 14.722871780395508, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8583736419677734, "num_tokens": 167907019.0, "step": 4397 }, { "epoch": 0.5594708052410635, "ewc_loss": 0.021371720358729362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1371721231844276e-05, "grad_norm": 14.721866607666016, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8504437208175659, "num_tokens": 167942539.0, "step": 4398 }, { "epoch": 0.5595980155196539, "ewc_loss": 0.02136269584298134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.136269540642388e-05, "grad_norm": 14.687841415405273, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.857647180557251, "num_tokens": 167982848.0, "step": 4399 }, { "epoch": 0.5597252257982445, "ewc_loss": 0.02138381637632847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1383815692388453e-05, "grad_norm": 14.765752792358398, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8661620020866394, "num_tokens": 168017500.0, "step": 4400 }, { "epoch": 0.559852436076835, "ewc_loss": 0.02136077545583248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1360774553613737e-05, "grad_norm": 14.705266952514648, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8435896635055542, "num_tokens": 168052790.0, "step": 4401 }, { "epoch": 0.5599796463554255, "ewc_loss": 0.02136416919529438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1364168787840754e-05, "grad_norm": 14.768672943115234, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8641048073768616, "num_tokens": 168082541.0, "step": 4402 }, { "epoch": 0.560106856634016, "ewc_loss": 0.021369369700551033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.136936927854549e-05, "grad_norm": 14.63157844543457, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8632165193557739, "num_tokens": 168128218.0, "step": 4403 }, { "epoch": 0.5602340669126066, "ewc_loss": 0.021323323249816895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.132332338078413e-05, "grad_norm": 14.775076866149902, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8604059219360352, "num_tokens": 168165496.0, "step": 4404 }, { "epoch": 0.560361277191197, "ewc_loss": 0.021423963829874992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1423964426503517e-05, "grad_norm": 14.696162223815918, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.8450173139572144, "num_tokens": 168204675.0, "step": 4405 }, { "epoch": 0.5604884874697875, "ewc_loss": 0.021336521953344345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.133652196789626e-05, "grad_norm": 14.70058536529541, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8540365695953369, "num_tokens": 168244832.0, "step": 4406 }, { "epoch": 0.560615697748378, "ewc_loss": 0.021405808627605438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1405809093266726e-05, "grad_norm": 14.73789119720459, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.852925717830658, "num_tokens": 168282440.0, "step": 4407 }, { "epoch": 0.5607429080269686, "ewc_loss": 0.021393347531557083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1393347196863033e-05, "grad_norm": 14.672754287719727, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8636302351951599, "num_tokens": 168325303.0, "step": 4408 }, { "epoch": 0.5608701183055591, "ewc_loss": 0.021386412903666496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1386413209256716e-05, "grad_norm": 14.756280899047852, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8634442090988159, "num_tokens": 168360628.0, "step": 4409 }, { "epoch": 0.5609973285841496, "ewc_loss": 0.02145395241677761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1453952285810374e-05, "grad_norm": 14.714637756347656, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8446547985076904, "num_tokens": 168398601.0, "step": 4410 }, { "epoch": 0.5611245388627402, "ewc_loss": 0.021357230842113495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.135723116225563e-05, "grad_norm": 14.75078010559082, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8435944318771362, "num_tokens": 168436893.0, "step": 4411 }, { "epoch": 0.5612517491413306, "ewc_loss": 0.021451802924275398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1451802240335383e-05, "grad_norm": 14.683953285217285, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8538904190063477, "num_tokens": 168478794.0, "step": 4412 }, { "epoch": 0.5613789594199211, "ewc_loss": 0.021339623257517815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1339623344829306e-05, "grad_norm": 14.701244354248047, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8550441265106201, "num_tokens": 168520184.0, "step": 4413 }, { "epoch": 0.5615061696985116, "ewc_loss": 0.02141151949763298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1411518901004456e-05, "grad_norm": 14.686788558959961, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8541023135185242, "num_tokens": 168554305.0, "step": 4414 }, { "epoch": 0.5616333799771022, "ewc_loss": 0.021370435133576393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.137043520633597e-05, "grad_norm": 14.703304290771484, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8413889408111572, "num_tokens": 168597752.0, "step": 4415 }, { "epoch": 0.5617605902556927, "ewc_loss": 0.021434146910905838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1434147129184566e-05, "grad_norm": 14.75304889678955, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8629286289215088, "num_tokens": 168637764.0, "step": 4416 }, { "epoch": 0.5618878005342832, "ewc_loss": 0.02142789028584957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.142788980563637e-05, "grad_norm": 14.729846000671387, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8379017114639282, "num_tokens": 168672260.0, "step": 4417 }, { "epoch": 0.5620150108128736, "ewc_loss": 0.021407634019851685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1407633539638482e-05, "grad_norm": 14.71711540222168, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8660560846328735, "num_tokens": 168711478.0, "step": 4418 }, { "epoch": 0.5621422210914642, "ewc_loss": 0.021435504779219627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.143550409527961e-05, "grad_norm": 14.670677185058594, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8604902029037476, "num_tokens": 168751773.0, "step": 4419 }, { "epoch": 0.5622694313700547, "ewc_loss": 0.02142653986811638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1426540115498938e-05, "grad_norm": 14.770755767822266, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8688087463378906, "num_tokens": 168786364.0, "step": 4420 }, { "epoch": 0.5623966416486452, "ewc_loss": 0.021425971761345863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.142597259080503e-05, "grad_norm": 14.682652473449707, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8551841378211975, "num_tokens": 168826819.0, "step": 4421 }, { "epoch": 0.5625238519272358, "ewc_loss": 0.021419888362288475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.141988807125017e-05, "grad_norm": 14.714909553527832, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8559238910675049, "num_tokens": 168869405.0, "step": 4422 }, { "epoch": 0.5626510622058263, "ewc_loss": 0.021455824375152588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1455824025906622e-05, "grad_norm": 14.756834030151367, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8638412356376648, "num_tokens": 168904323.0, "step": 4423 }, { "epoch": 0.5627782724844167, "ewc_loss": 0.02137857675552368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.137857700290624e-05, "grad_norm": 14.691323280334473, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8488813638687134, "num_tokens": 168948654.0, "step": 4424 }, { "epoch": 0.5629054827630072, "ewc_loss": 0.02145722135901451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1457221009768546e-05, "grad_norm": 14.761225700378418, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8640409111976624, "num_tokens": 168984333.0, "step": 4425 }, { "epoch": 0.5630326930415978, "ewc_loss": 0.021396083757281303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1396082956925966e-05, "grad_norm": 14.709912300109863, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8583173751831055, "num_tokens": 169019843.0, "step": 4426 }, { "epoch": 0.5631599033201883, "ewc_loss": 0.021404625847935677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.140462675015442e-05, "grad_norm": 14.788458824157715, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8678830862045288, "num_tokens": 169058177.0, "step": 4427 }, { "epoch": 0.5632871135987788, "ewc_loss": 0.021411456167697906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1411455236375332e-05, "grad_norm": 14.644369125366211, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8638478517532349, "num_tokens": 169102268.0, "step": 4428 }, { "epoch": 0.5634143238773693, "ewc_loss": 0.02133946307003498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1339463273761794e-05, "grad_norm": 14.738755226135254, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8524004220962524, "num_tokens": 169141709.0, "step": 4429 }, { "epoch": 0.5635415341559598, "ewc_loss": 0.021435828879475594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1435829694382846e-05, "grad_norm": 14.779765129089355, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8544921875, "num_tokens": 169177107.0, "step": 4430 }, { "epoch": 0.5636687444345503, "ewc_loss": 0.021405674517154694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1405674488050863e-05, "grad_norm": 14.720125198364258, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.853269100189209, "num_tokens": 169211531.0, "step": 4431 }, { "epoch": 0.5637959547131408, "ewc_loss": 0.02138708531856537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1387084416346624e-05, "grad_norm": 14.775882720947266, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8501306176185608, "num_tokens": 169247507.0, "step": 4432 }, { "epoch": 0.5639231649917313, "ewc_loss": 0.021418510004878044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1418509277282283e-05, "grad_norm": 14.7501802444458, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8712079524993896, "num_tokens": 169281007.0, "step": 4433 }, { "epoch": 0.5640503752703219, "ewc_loss": 0.02140853926539421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1408539396361448e-05, "grad_norm": 14.751145362854004, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8591716885566711, "num_tokens": 169322295.0, "step": 4434 }, { "epoch": 0.5641775855489124, "ewc_loss": 0.02142205461859703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1422054487629794e-05, "grad_norm": 14.765796661376953, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8548067212104797, "num_tokens": 169358184.0, "step": 4435 }, { "epoch": 0.5643047958275028, "ewc_loss": 0.02139236219227314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.139236130460631e-05, "grad_norm": 14.77182388305664, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8630778789520264, "num_tokens": 169391518.0, "step": 4436 }, { "epoch": 0.5644320061060933, "ewc_loss": 0.021393172442913055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1393172573880292e-05, "grad_norm": 14.731490135192871, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8573592901229858, "num_tokens": 169426743.0, "step": 4437 }, { "epoch": 0.5645592163846839, "ewc_loss": 0.02138890139758587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1388901586760767e-05, "grad_norm": 14.75871753692627, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.876604437828064, "num_tokens": 169466452.0, "step": 4438 }, { "epoch": 0.5646864266632744, "ewc_loss": 0.021392181515693665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.139218122465536e-05, "grad_norm": 14.742464065551758, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.862899661064148, "num_tokens": 169500295.0, "step": 4439 }, { "epoch": 0.5648136369418649, "ewc_loss": 0.021392526105046272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1392526832642034e-05, "grad_norm": 14.745636940002441, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8574211001396179, "num_tokens": 169541829.0, "step": 4440 }, { "epoch": 0.5649408472204555, "ewc_loss": 0.021399911493062973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.139991192962043e-05, "grad_norm": 14.76349925994873, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8424954414367676, "num_tokens": 169581547.0, "step": 4441 }, { "epoch": 0.5650680574990459, "ewc_loss": 0.02142445184290409, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1424451915663667e-05, "grad_norm": 14.771953582763672, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8617838025093079, "num_tokens": 169622107.0, "step": 4442 }, { "epoch": 0.5651952677776364, "ewc_loss": 0.02144048735499382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1440488126245327e-05, "grad_norm": 14.717418670654297, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8547636270523071, "num_tokens": 169657258.0, "step": 4443 }, { "epoch": 0.5653224780562269, "ewc_loss": 0.021430863067507744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1430863853311166e-05, "grad_norm": 14.782357215881348, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8463540077209473, "num_tokens": 169691568.0, "step": 4444 }, { "epoch": 0.5654496883348175, "ewc_loss": 0.021441804245114326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1441805074573494e-05, "grad_norm": 14.746662139892578, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.864076554775238, "num_tokens": 169732429.0, "step": 4445 }, { "epoch": 0.565576898613408, "ewc_loss": 0.021403104066848755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1403104256023653e-05, "grad_norm": 14.730746269226074, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8500028848648071, "num_tokens": 169769224.0, "step": 4446 }, { "epoch": 0.5657041088919985, "ewc_loss": 0.02146068587899208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.14606861845823e-05, "grad_norm": 14.708734512329102, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8573746681213379, "num_tokens": 169812830.0, "step": 4447 }, { "epoch": 0.5658313191705889, "ewc_loss": 0.021430164575576782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1430165361380205e-05, "grad_norm": 14.750151634216309, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8544908761978149, "num_tokens": 169843823.0, "step": 4448 }, { "epoch": 0.5659585294491795, "ewc_loss": 0.021480204537510872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1480203940882348e-05, "grad_norm": 14.7491455078125, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8591738939285278, "num_tokens": 169882847.0, "step": 4449 }, { "epoch": 0.56608573972777, "ewc_loss": 0.02149905636906624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1499055947060697e-05, "grad_norm": 14.7335786819458, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8387007117271423, "num_tokens": 169921264.0, "step": 4450 }, { "epoch": 0.5662129500063605, "ewc_loss": 0.021474327892065048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1474328605108894e-05, "grad_norm": 14.808268547058105, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8631091117858887, "num_tokens": 169949681.0, "step": 4451 }, { "epoch": 0.566340160284951, "ewc_loss": 0.021532611921429634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.153261266357731e-05, "grad_norm": 14.7692232131958, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8722177743911743, "num_tokens": 169986254.0, "step": 4452 }, { "epoch": 0.5664673705635416, "ewc_loss": 0.021489253267645836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1489253413164988e-05, "grad_norm": 14.752427101135254, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8543140292167664, "num_tokens": 170022745.0, "step": 4453 }, { "epoch": 0.566594580842132, "ewc_loss": 0.021543113514780998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.154311368940398e-05, "grad_norm": 14.7700834274292, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8682470321655273, "num_tokens": 170057038.0, "step": 4454 }, { "epoch": 0.5667217911207225, "ewc_loss": 0.021512649953365326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1512649254873395e-05, "grad_norm": 14.741100311279297, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8761729598045349, "num_tokens": 170098460.0, "step": 4455 }, { "epoch": 0.566849001399313, "ewc_loss": 0.021519361063838005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.151936132577248e-05, "grad_norm": 14.735896110534668, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8454521894454956, "num_tokens": 170137689.0, "step": 4456 }, { "epoch": 0.5669762116779036, "ewc_loss": 0.02151641994714737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1516420019906946e-05, "grad_norm": 14.756913185119629, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.858442485332489, "num_tokens": 170176716.0, "step": 4457 }, { "epoch": 0.5671034219564941, "ewc_loss": 0.0215712059289217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1571206161752343e-05, "grad_norm": 14.765954971313477, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.854030966758728, "num_tokens": 170212621.0, "step": 4458 }, { "epoch": 0.5672306322350846, "ewc_loss": 0.02148555964231491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1485559045686387e-05, "grad_norm": 14.70648193359375, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8479398488998413, "num_tokens": 170254493.0, "step": 4459 }, { "epoch": 0.5673578425136752, "ewc_loss": 0.021546704694628716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.154670437448658e-05, "grad_norm": 14.745983123779297, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8782889246940613, "num_tokens": 170290559.0, "step": 4460 }, { "epoch": 0.5674850527922656, "ewc_loss": 0.02152387425303459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1523874238482676e-05, "grad_norm": 14.68601131439209, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8515084385871887, "num_tokens": 170335079.0, "step": 4461 }, { "epoch": 0.5676122630708561, "ewc_loss": 0.021530235186219215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1530235244426876e-05, "grad_norm": 14.717445373535156, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.861072838306427, "num_tokens": 170370665.0, "step": 4462 }, { "epoch": 0.5677394733494466, "ewc_loss": 0.021510809659957886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.151081025658641e-05, "grad_norm": 14.670662879943848, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8564457297325134, "num_tokens": 170412585.0, "step": 4463 }, { "epoch": 0.5678666836280372, "ewc_loss": 0.02153540402650833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1535404812311754e-05, "grad_norm": 14.731345176696777, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.864352285861969, "num_tokens": 170450806.0, "step": 4464 }, { "epoch": 0.5679938939066277, "ewc_loss": 0.021577013656497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1577014194917865e-05, "grad_norm": 14.77616024017334, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8575989007949829, "num_tokens": 170483265.0, "step": 4465 }, { "epoch": 0.5681211041852182, "ewc_loss": 0.021558422595262527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1558422304224223e-05, "grad_norm": 14.721930503845215, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8742414712905884, "num_tokens": 170520910.0, "step": 4466 }, { "epoch": 0.5682483144638086, "ewc_loss": 0.021504471078515053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1504471078515053e-05, "grad_norm": 14.765100479125977, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8403527736663818, "num_tokens": 170556516.0, "step": 4467 }, { "epoch": 0.5683755247423992, "ewc_loss": 0.021570812910795212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1570813260041177e-05, "grad_norm": 14.744745254516602, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8565561771392822, "num_tokens": 170595571.0, "step": 4468 }, { "epoch": 0.5685027350209897, "ewc_loss": 0.021518558263778687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1518559151445515e-05, "grad_norm": 14.648894309997559, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.87691730260849, "num_tokens": 170634117.0, "step": 4469 }, { "epoch": 0.5686299452995802, "ewc_loss": 0.02158152498304844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.158152528863866e-05, "grad_norm": 14.780662536621094, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8534852862358093, "num_tokens": 170677624.0, "step": 4470 }, { "epoch": 0.5687571555781707, "ewc_loss": 0.021568093448877335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1568093870882876e-05, "grad_norm": 14.688015937805176, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8434542417526245, "num_tokens": 170716732.0, "step": 4471 }, { "epoch": 0.5688843658567613, "ewc_loss": 0.02155640348792076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1556403225986287e-05, "grad_norm": 14.75865364074707, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8520976305007935, "num_tokens": 170758521.0, "step": 4472 }, { "epoch": 0.5690115761353517, "ewc_loss": 0.021553242579102516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1553241822402924e-05, "grad_norm": 14.692277908325195, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8565145134925842, "num_tokens": 170796799.0, "step": 4473 }, { "epoch": 0.5691387864139422, "ewc_loss": 0.021572953090071678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1572952391579747e-05, "grad_norm": 14.786431312561035, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8437903523445129, "num_tokens": 170838569.0, "step": 4474 }, { "epoch": 0.5692659966925327, "ewc_loss": 0.021578136831521988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1578136511379853e-05, "grad_norm": 14.721270561218262, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8597381114959717, "num_tokens": 170878206.0, "step": 4475 }, { "epoch": 0.5693932069711233, "ewc_loss": 0.021540865302085876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1540865418501198e-05, "grad_norm": 14.757912635803223, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.844972550868988, "num_tokens": 170915506.0, "step": 4476 }, { "epoch": 0.5695204172497138, "ewc_loss": 0.021521352231502533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.152135130017996e-05, "grad_norm": 14.729608535766602, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8542606830596924, "num_tokens": 170952303.0, "step": 4477 }, { "epoch": 0.5696476275283043, "ewc_loss": 0.021542366594076157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1542366084759124e-05, "grad_norm": 14.748638153076172, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8507490158081055, "num_tokens": 170993757.0, "step": 4478 }, { "epoch": 0.5697748378068948, "ewc_loss": 0.02154817432165146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1548174117924646e-05, "grad_norm": 14.705177307128906, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.855140209197998, "num_tokens": 171030240.0, "step": 4479 }, { "epoch": 0.5699020480854853, "ewc_loss": 0.0215309988707304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1530999219976366e-05, "grad_norm": 14.750283241271973, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8344759345054626, "num_tokens": 171069654.0, "step": 4480 }, { "epoch": 0.5700292583640758, "ewc_loss": 0.021570200100541115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1570200260612182e-05, "grad_norm": 14.776140213012695, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8412350416183472, "num_tokens": 171107341.0, "step": 4481 }, { "epoch": 0.5701564686426663, "ewc_loss": 0.02154342271387577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1543422917602584e-05, "grad_norm": 14.748458862304688, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8483392000198364, "num_tokens": 171144063.0, "step": 4482 }, { "epoch": 0.5702836789212569, "ewc_loss": 0.02150137536227703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1501375158550218e-05, "grad_norm": 14.712244987487793, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8516206741333008, "num_tokens": 171180448.0, "step": 4483 }, { "epoch": 0.5704108891998474, "ewc_loss": 0.021590223535895348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1590223695966415e-05, "grad_norm": 14.814157485961914, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8623836636543274, "num_tokens": 171214220.0, "step": 4484 }, { "epoch": 0.5705380994784378, "ewc_loss": 0.021560659632086754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1560659661190584e-05, "grad_norm": 14.675116539001465, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8638139963150024, "num_tokens": 171252798.0, "step": 4485 }, { "epoch": 0.5706653097570283, "ewc_loss": 0.021538564935326576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1538564396905713e-05, "grad_norm": 14.759603500366211, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8620895147323608, "num_tokens": 171295434.0, "step": 4486 }, { "epoch": 0.5707925200356189, "ewc_loss": 0.021581009030342102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1581008695648052e-05, "grad_norm": 14.648430824279785, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8593566417694092, "num_tokens": 171334129.0, "step": 4487 }, { "epoch": 0.5709197303142094, "ewc_loss": 0.021580619737505913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1580619431915693e-05, "grad_norm": 14.81993293762207, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8544429540634155, "num_tokens": 171370239.0, "step": 4488 }, { "epoch": 0.5710469405927999, "ewc_loss": 0.021625349298119545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.162535020033829e-05, "grad_norm": 14.722580909729004, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8711166381835938, "num_tokens": 171402411.0, "step": 4489 }, { "epoch": 0.5711741508713905, "ewc_loss": 0.021539976820349693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1539975932682864e-05, "grad_norm": 14.751224517822266, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8677008152008057, "num_tokens": 171438736.0, "step": 4490 }, { "epoch": 0.5713013611499809, "ewc_loss": 0.021598689258098602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1598689272650518e-05, "grad_norm": 14.74763298034668, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8641089797019958, "num_tokens": 171477949.0, "step": 4491 }, { "epoch": 0.5714285714285714, "ewc_loss": 0.02157827652990818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1578276573563926e-05, "grad_norm": 14.812244415283203, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8525644540786743, "num_tokens": 171520953.0, "step": 4492 }, { "epoch": 0.5715557817071619, "ewc_loss": 0.021622253581881523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1622254280373454e-05, "grad_norm": 14.800505638122559, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8633382320404053, "num_tokens": 171559415.0, "step": 4493 }, { "epoch": 0.5716829919857525, "ewc_loss": 0.02152404934167862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1524048861465417e-05, "grad_norm": 14.743613243103027, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8651493787765503, "num_tokens": 171593118.0, "step": 4494 }, { "epoch": 0.571810202264343, "ewc_loss": 0.02158672735095024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.15867275983328e-05, "grad_norm": 14.796538352966309, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8444483280181885, "num_tokens": 171634514.0, "step": 4495 }, { "epoch": 0.5719374125429335, "ewc_loss": 0.02158217504620552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1582174667855725e-05, "grad_norm": 14.74759578704834, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8609555959701538, "num_tokens": 171675239.0, "step": 4496 }, { "epoch": 0.5720646228215239, "ewc_loss": 0.02152070216834545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1520701920962892e-05, "grad_norm": 14.801732063293457, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8656564950942993, "num_tokens": 171713154.0, "step": 4497 }, { "epoch": 0.5721918331001145, "ewc_loss": 0.021567020565271378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1567020667134784e-05, "grad_norm": 14.714693069458008, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8503847122192383, "num_tokens": 171750046.0, "step": 4498 }, { "epoch": 0.572319043378705, "ewc_loss": 0.02154015377163887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1540154193644412e-05, "grad_norm": 14.869281768798828, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8478913307189941, "num_tokens": 171782708.0, "step": 4499 }, { "epoch": 0.5724462536572955, "ewc_loss": 0.021625105291604996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.162510463676881e-05, "grad_norm": 14.78451919555664, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8535858392715454, "num_tokens": 171819421.0, "step": 4500 }, { "epoch": 0.572573463935886, "ewc_loss": 0.021494615823030472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.149461579392664e-05, "grad_norm": 14.773839950561523, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8562063574790955, "num_tokens": 171860993.0, "step": 4501 }, { "epoch": 0.5727006742144766, "ewc_loss": 0.021570071578025818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.157007111236453e-05, "grad_norm": 14.757163047790527, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8561370372772217, "num_tokens": 171896894.0, "step": 4502 }, { "epoch": 0.572827884493067, "ewc_loss": 0.021538466215133667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.153846617147792e-05, "grad_norm": 14.767582893371582, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8547921180725098, "num_tokens": 171938694.0, "step": 4503 }, { "epoch": 0.5729550947716575, "ewc_loss": 0.02157432772219181, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1574327547568828e-05, "grad_norm": 14.762031555175781, "learning_rate": 1e-06, "loss": 0.364, "mean_token_accuracy": 0.8819315433502197, "num_tokens": 171972606.0, "step": 4504 }, { "epoch": 0.573082305050248, "ewc_loss": 0.021592017263174057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.159201721951831e-05, "grad_norm": 14.7645902633667, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8601465225219727, "num_tokens": 172010861.0, "step": 4505 }, { "epoch": 0.5732095153288386, "ewc_loss": 0.021609291434288025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1609292161883786e-05, "grad_norm": 14.85858154296875, "learning_rate": 1e-06, "loss": 0.5022, "mean_token_accuracy": 0.8415097594261169, "num_tokens": 172049867.0, "step": 4506 }, { "epoch": 0.5733367256074291, "ewc_loss": 0.02163206972181797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1632069547194988e-05, "grad_norm": 14.720046043395996, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8583880066871643, "num_tokens": 172092176.0, "step": 4507 }, { "epoch": 0.5734639358860196, "ewc_loss": 0.021555108949542046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1555108105530962e-05, "grad_norm": 14.795567512512207, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8577477335929871, "num_tokens": 172132789.0, "step": 4508 }, { "epoch": 0.5735911461646102, "ewc_loss": 0.0216356348246336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1635634766425937e-05, "grad_norm": 14.81422233581543, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8711384534835815, "num_tokens": 172170536.0, "step": 4509 }, { "epoch": 0.5737183564432006, "ewc_loss": 0.021564127877354622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1564128473983146e-05, "grad_norm": 14.82420825958252, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.856924295425415, "num_tokens": 172206828.0, "step": 4510 }, { "epoch": 0.5738455667217911, "ewc_loss": 0.021603168919682503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.160316944355145e-05, "grad_norm": 14.787125587463379, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8492175340652466, "num_tokens": 172242311.0, "step": 4511 }, { "epoch": 0.5739727770003816, "ewc_loss": 0.021566223353147507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.156622394977603e-05, "grad_norm": 14.756584167480469, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8531316518783569, "num_tokens": 172275135.0, "step": 4512 }, { "epoch": 0.5740999872789722, "ewc_loss": 0.021601082757115364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1601083062705584e-05, "grad_norm": 14.803797721862793, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8664371967315674, "num_tokens": 172308682.0, "step": 4513 }, { "epoch": 0.5742271975575627, "ewc_loss": 0.021624892950057983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1624893633998e-05, "grad_norm": 14.724370002746582, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8525686860084534, "num_tokens": 172345834.0, "step": 4514 }, { "epoch": 0.5743544078361532, "ewc_loss": 0.021601857617497444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1601857952191494e-05, "grad_norm": 14.778496742248535, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8355552554130554, "num_tokens": 172387976.0, "step": 4515 }, { "epoch": 0.5744816181147436, "ewc_loss": 0.021674416959285736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.167441743949894e-05, "grad_norm": 14.737082481384277, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8566209673881531, "num_tokens": 172427447.0, "step": 4516 }, { "epoch": 0.5746088283933342, "ewc_loss": 0.021625693887472153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.162569398933556e-05, "grad_norm": 14.784065246582031, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8487492203712463, "num_tokens": 172468686.0, "step": 4517 }, { "epoch": 0.5747360386719247, "ewc_loss": 0.02165946364402771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1659463527612388e-05, "grad_norm": 14.742915153503418, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8535358309745789, "num_tokens": 172508190.0, "step": 4518 }, { "epoch": 0.5748632489505152, "ewc_loss": 0.021658586338162422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.165858677471988e-05, "grad_norm": 14.73939323425293, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8729897737503052, "num_tokens": 172552295.0, "step": 4519 }, { "epoch": 0.5749904592291057, "ewc_loss": 0.021678989753127098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1678990378859453e-05, "grad_norm": 14.819164276123047, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.862148642539978, "num_tokens": 172591065.0, "step": 4520 }, { "epoch": 0.5751176695076963, "ewc_loss": 0.021651480346918106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1651479983120225e-05, "grad_norm": 14.729043960571289, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8598671555519104, "num_tokens": 172633741.0, "step": 4521 }, { "epoch": 0.5752448797862867, "ewc_loss": 0.021634869277477264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1634868971887045e-05, "grad_norm": 14.801314353942871, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8679282069206238, "num_tokens": 172669081.0, "step": 4522 }, { "epoch": 0.5753720900648772, "ewc_loss": 0.021663248538970947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1663248844561167e-05, "grad_norm": 14.696609497070312, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8353559970855713, "num_tokens": 172704956.0, "step": 4523 }, { "epoch": 0.5754993003434677, "ewc_loss": 0.021627001464366913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.162700184271671e-05, "grad_norm": 14.831184387207031, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8614093661308289, "num_tokens": 172742270.0, "step": 4524 }, { "epoch": 0.5756265106220583, "ewc_loss": 0.021706439554691315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.170643892895896e-05, "grad_norm": 14.808849334716797, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8457275629043579, "num_tokens": 172778065.0, "step": 4525 }, { "epoch": 0.5757537209006488, "ewc_loss": 0.02161630056798458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.161630072805565e-05, "grad_norm": 14.758379936218262, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8655891418457031, "num_tokens": 172815825.0, "step": 4526 }, { "epoch": 0.5758809311792393, "ewc_loss": 0.02166140079498291, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1661400751327164e-05, "grad_norm": 14.766531944274902, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8557800054550171, "num_tokens": 172849560.0, "step": 4527 }, { "epoch": 0.5760081414578297, "ewc_loss": 0.021692894399166107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1692894733860157e-05, "grad_norm": 14.803497314453125, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8587643504142761, "num_tokens": 172885203.0, "step": 4528 }, { "epoch": 0.5761353517364203, "ewc_loss": 0.021672923117876053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1672924049198627e-05, "grad_norm": 14.795644760131836, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8573399782180786, "num_tokens": 172922341.0, "step": 4529 }, { "epoch": 0.5762625620150108, "ewc_loss": 0.021639859303832054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.163985845982097e-05, "grad_norm": 14.78262710571289, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8633745908737183, "num_tokens": 172953363.0, "step": 4530 }, { "epoch": 0.5763897722936013, "ewc_loss": 0.021669207140803337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1669207853847183e-05, "grad_norm": 14.807818412780762, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8624407052993774, "num_tokens": 172986253.0, "step": 4531 }, { "epoch": 0.5765169825721919, "ewc_loss": 0.0216961856931448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1696185285691172e-05, "grad_norm": 14.793085098266602, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8677840828895569, "num_tokens": 173023466.0, "step": 4532 }, { "epoch": 0.5766441928507824, "ewc_loss": 0.021632255986332893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.163225508411415e-05, "grad_norm": 14.775527954101562, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.852685809135437, "num_tokens": 173063329.0, "step": 4533 }, { "epoch": 0.5767714031293728, "ewc_loss": 0.021714333444833755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.171433334297035e-05, "grad_norm": 14.804279327392578, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.863419771194458, "num_tokens": 173100676.0, "step": 4534 }, { "epoch": 0.5768986134079633, "ewc_loss": 0.021707473322749138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1707473933929577e-05, "grad_norm": 14.804730415344238, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8676033616065979, "num_tokens": 173135088.0, "step": 4535 }, { "epoch": 0.5770258236865539, "ewc_loss": 0.02166774496436119, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1667745386366732e-05, "grad_norm": 14.8213472366333, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8495787382125854, "num_tokens": 173175810.0, "step": 4536 }, { "epoch": 0.5771530339651444, "ewc_loss": 0.02169720269739628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1697202100767754e-05, "grad_norm": 14.815071105957031, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8587850332260132, "num_tokens": 173220826.0, "step": 4537 }, { "epoch": 0.5772802442437349, "ewc_loss": 0.021678859367966652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1678859411622398e-05, "grad_norm": 14.848917007446289, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8681735992431641, "num_tokens": 173258411.0, "step": 4538 }, { "epoch": 0.5774074545223254, "ewc_loss": 0.02171250246465206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1712501620640978e-05, "grad_norm": 14.794442176818848, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8410825133323669, "num_tokens": 173297298.0, "step": 4539 }, { "epoch": 0.5775346648009159, "ewc_loss": 0.021624475717544556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1624475266435184e-05, "grad_norm": 14.766420364379883, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8407886028289795, "num_tokens": 173330839.0, "step": 4540 }, { "epoch": 0.5776618750795064, "ewc_loss": 0.02166653610765934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1666535758413374e-05, "grad_norm": 14.794528007507324, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8498038053512573, "num_tokens": 173370219.0, "step": 4541 }, { "epoch": 0.5777890853580969, "ewc_loss": 0.02166888862848282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.166888771171216e-05, "grad_norm": 14.747318267822266, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8581687211990356, "num_tokens": 173410425.0, "step": 4542 }, { "epoch": 0.5779162956366874, "ewc_loss": 0.02167077362537384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1670774003723636e-05, "grad_norm": 14.779458999633789, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8572342395782471, "num_tokens": 173447109.0, "step": 4543 }, { "epoch": 0.578043505915278, "ewc_loss": 0.021681996062397957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.168199534935411e-05, "grad_norm": 14.795077323913574, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8628817796707153, "num_tokens": 173490064.0, "step": 4544 }, { "epoch": 0.5781707161938685, "ewc_loss": 0.021668754518032074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.16687549254857e-05, "grad_norm": 14.779753684997559, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8605598211288452, "num_tokens": 173523791.0, "step": 4545 }, { "epoch": 0.5782979264724589, "ewc_loss": 0.021657660603523254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1657660909113474e-05, "grad_norm": 14.759919166564941, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8658040165901184, "num_tokens": 173564069.0, "step": 4546 }, { "epoch": 0.5784251367510495, "ewc_loss": 0.021668316796422005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1668316549039446e-05, "grad_norm": 14.760401725769043, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8574830889701843, "num_tokens": 173601893.0, "step": 4547 }, { "epoch": 0.57855234702964, "ewc_loss": 0.021695593371987343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.169559411413502e-05, "grad_norm": 14.783012390136719, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8672210574150085, "num_tokens": 173642299.0, "step": 4548 }, { "epoch": 0.5786795573082305, "ewc_loss": 0.021729059517383575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1729059881181456e-05, "grad_norm": 14.896401405334473, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8602522611618042, "num_tokens": 173675824.0, "step": 4549 }, { "epoch": 0.578806767586821, "ewc_loss": 0.021663866937160492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1663867300958373e-05, "grad_norm": 14.726783752441406, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8597071766853333, "num_tokens": 173712192.0, "step": 4550 }, { "epoch": 0.5789339778654116, "ewc_loss": 0.021629415452480316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1629415641655214e-05, "grad_norm": 14.854623794555664, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.86483234167099, "num_tokens": 173746949.0, "step": 4551 }, { "epoch": 0.579061188144002, "ewc_loss": 0.021712567657232285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1712567104259506e-05, "grad_norm": 14.775871276855469, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8606843948364258, "num_tokens": 173784527.0, "step": 4552 }, { "epoch": 0.5791883984225925, "ewc_loss": 0.02168353833258152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1683537852368318e-05, "grad_norm": 14.888031959533691, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8636907935142517, "num_tokens": 173820648.0, "step": 4553 }, { "epoch": 0.579315608701183, "ewc_loss": 0.021727925166487694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1727924831793644e-05, "grad_norm": 14.81804084777832, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8518530130386353, "num_tokens": 173860427.0, "step": 4554 }, { "epoch": 0.5794428189797736, "ewc_loss": 0.021615905687212944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.161590600735508e-05, "grad_norm": 14.78118896484375, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8604875802993774, "num_tokens": 173899876.0, "step": 4555 }, { "epoch": 0.5795700292583641, "ewc_loss": 0.021646656095981598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1646656023222022e-05, "grad_norm": 14.859752655029297, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8669726252555847, "num_tokens": 173937354.0, "step": 4556 }, { "epoch": 0.5796972395369546, "ewc_loss": 0.021656488999724388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.165648947993759e-05, "grad_norm": 14.788630485534668, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8473128080368042, "num_tokens": 173976039.0, "step": 4557 }, { "epoch": 0.5798244498155452, "ewc_loss": 0.021586230024695396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.158622919523623e-05, "grad_norm": 14.752015113830566, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8378230333328247, "num_tokens": 174006359.0, "step": 4558 }, { "epoch": 0.5799516600941356, "ewc_loss": 0.02167375385761261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1673753508366644e-05, "grad_norm": 14.8681058883667, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8498764634132385, "num_tokens": 174044662.0, "step": 4559 }, { "epoch": 0.5800788703727261, "ewc_loss": 0.02165355160832405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1653551812050864e-05, "grad_norm": 14.766861915588379, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8529155850410461, "num_tokens": 174077133.0, "step": 4560 }, { "epoch": 0.5802060806513166, "ewc_loss": 0.021635109558701515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1635109078488313e-05, "grad_norm": 14.793464660644531, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8528374433517456, "num_tokens": 174121873.0, "step": 4561 }, { "epoch": 0.5803332909299072, "ewc_loss": 0.021708618849515915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1708618078264408e-05, "grad_norm": 14.81104564666748, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8598238825798035, "num_tokens": 174158873.0, "step": 4562 }, { "epoch": 0.5804605012084977, "ewc_loss": 0.021692493930459023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1692494556191377e-05, "grad_norm": 14.880910873413086, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8649497032165527, "num_tokens": 174189222.0, "step": 4563 }, { "epoch": 0.5805877114870882, "ewc_loss": 0.021691104397177696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.169110484828707e-05, "grad_norm": 14.828604698181152, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8494829535484314, "num_tokens": 174229018.0, "step": 4564 }, { "epoch": 0.5807149217656786, "ewc_loss": 0.021717313677072525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1717312847613357e-05, "grad_norm": 14.801149368286133, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8584682941436768, "num_tokens": 174269215.0, "step": 4565 }, { "epoch": 0.5808421320442692, "ewc_loss": 0.021731112152338028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1731111701228656e-05, "grad_norm": 14.811776161193848, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8653014898300171, "num_tokens": 174303926.0, "step": 4566 }, { "epoch": 0.5809693423228597, "ewc_loss": 0.021730592474341393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1730593289248645e-05, "grad_norm": 14.80648136138916, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8601450324058533, "num_tokens": 174347491.0, "step": 4567 }, { "epoch": 0.5810965526014502, "ewc_loss": 0.02171444520354271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.171444612031337e-05, "grad_norm": 14.836650848388672, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8594989776611328, "num_tokens": 174384246.0, "step": 4568 }, { "epoch": 0.5812237628800407, "ewc_loss": 0.021743712946772575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.174371365981642e-05, "grad_norm": 14.803228378295898, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8572028875350952, "num_tokens": 174417081.0, "step": 4569 }, { "epoch": 0.5813509731586313, "ewc_loss": 0.021749427542090416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.174942710553296e-05, "grad_norm": 14.880386352539062, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8479165434837341, "num_tokens": 174457364.0, "step": 4570 }, { "epoch": 0.5814781834372217, "ewc_loss": 0.021805213764309883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1805213691550307e-05, "grad_norm": 14.787333488464355, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8590000867843628, "num_tokens": 174494304.0, "step": 4571 }, { "epoch": 0.5816053937158122, "ewc_loss": 0.02174936607480049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.174936525989324e-05, "grad_norm": 14.856660842895508, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8585196733474731, "num_tokens": 174528047.0, "step": 4572 }, { "epoch": 0.5817326039944027, "ewc_loss": 0.021810833364725113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.181083254981786e-05, "grad_norm": 14.87890911102295, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8626667261123657, "num_tokens": 174565990.0, "step": 4573 }, { "epoch": 0.5818598142729933, "ewc_loss": 0.021755412220954895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1755411580670625e-05, "grad_norm": 14.812004089355469, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8432183861732483, "num_tokens": 174599039.0, "step": 4574 }, { "epoch": 0.5819870245515838, "ewc_loss": 0.021762268617749214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1762269170721993e-05, "grad_norm": 14.819062232971191, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8512029647827148, "num_tokens": 174634786.0, "step": 4575 }, { "epoch": 0.5821142348301743, "ewc_loss": 0.021767716854810715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1767717043985613e-05, "grad_norm": 14.80997085571289, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8475804328918457, "num_tokens": 174677875.0, "step": 4576 }, { "epoch": 0.5822414451087647, "ewc_loss": 0.021755848079919815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1755848138127476e-05, "grad_norm": 14.754796028137207, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8508666753768921, "num_tokens": 174715421.0, "step": 4577 }, { "epoch": 0.5823686553873553, "ewc_loss": 0.02172093093395233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.172093081753701e-05, "grad_norm": 14.831507682800293, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8546448945999146, "num_tokens": 174746555.0, "step": 4578 }, { "epoch": 0.5824958656659458, "ewc_loss": 0.021813247352838516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.181324816774577e-05, "grad_norm": 14.785541534423828, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8460538983345032, "num_tokens": 174786470.0, "step": 4579 }, { "epoch": 0.5826230759445363, "ewc_loss": 0.021718023344874382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1718024072470143e-05, "grad_norm": 14.741372108459473, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8545764684677124, "num_tokens": 174827131.0, "step": 4580 }, { "epoch": 0.5827502862231269, "ewc_loss": 0.021774623543024063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1774623746750876e-05, "grad_norm": 14.85036563873291, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.851759135723114, "num_tokens": 174861391.0, "step": 4581 }, { "epoch": 0.5828774965017174, "ewc_loss": 0.021750975400209427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1750975065515377e-05, "grad_norm": 14.72830581665039, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8616135120391846, "num_tokens": 174897437.0, "step": 4582 }, { "epoch": 0.5830047067803078, "ewc_loss": 0.02173522487282753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1735224436270073e-05, "grad_norm": 14.809216499328613, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8482061624526978, "num_tokens": 174932504.0, "step": 4583 }, { "epoch": 0.5831319170588983, "ewc_loss": 0.021850859746336937, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1850859411642887e-05, "grad_norm": 14.778366088867188, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8518695831298828, "num_tokens": 174974746.0, "step": 4584 }, { "epoch": 0.5832591273374889, "ewc_loss": 0.0217583030462265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1758303773822263e-05, "grad_norm": 14.763428688049316, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.852036714553833, "num_tokens": 175016155.0, "step": 4585 }, { "epoch": 0.5833863376160794, "ewc_loss": 0.021820399910211563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.182040043408051e-05, "grad_norm": 14.791946411132812, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8420153856277466, "num_tokens": 175060595.0, "step": 4586 }, { "epoch": 0.5835135478946699, "ewc_loss": 0.02178839035332203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.178838985855691e-05, "grad_norm": 14.745877265930176, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8567299246788025, "num_tokens": 175101278.0, "step": 4587 }, { "epoch": 0.5836407581732604, "ewc_loss": 0.0218143742531538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1814374122186564e-05, "grad_norm": 14.872662544250488, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8506036996841431, "num_tokens": 175139479.0, "step": 4588 }, { "epoch": 0.5837679684518509, "ewc_loss": 0.021794645115733147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1794645363115706e-05, "grad_norm": 14.81273365020752, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8398757576942444, "num_tokens": 175182376.0, "step": 4589 }, { "epoch": 0.5838951787304414, "ewc_loss": 0.021790115162730217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1790116079500876e-05, "grad_norm": 14.788527488708496, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8492128252983093, "num_tokens": 175225449.0, "step": 4590 }, { "epoch": 0.5840223890090319, "ewc_loss": 0.021797575056552887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1797575755044818e-05, "grad_norm": 14.890336036682129, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8549495339393616, "num_tokens": 175255959.0, "step": 4591 }, { "epoch": 0.5841495992876224, "ewc_loss": 0.02179555594921112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1795556676806882e-05, "grad_norm": 14.904874801635742, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8441823720932007, "num_tokens": 175292142.0, "step": 4592 }, { "epoch": 0.584276809566213, "ewc_loss": 0.02178415283560753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1784153432236053e-05, "grad_norm": 14.812957763671875, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8505403995513916, "num_tokens": 175336630.0, "step": 4593 }, { "epoch": 0.5844040198448035, "ewc_loss": 0.021717436611652374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1717436538892798e-05, "grad_norm": 14.863642692565918, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8682037591934204, "num_tokens": 175372697.0, "step": 4594 }, { "epoch": 0.5845312301233939, "ewc_loss": 0.021763566881418228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.176356611016672e-05, "grad_norm": 14.864550590515137, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8446943759918213, "num_tokens": 175412395.0, "step": 4595 }, { "epoch": 0.5846584404019844, "ewc_loss": 0.02172771655023098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1727715648012236e-05, "grad_norm": 14.844733238220215, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8554478883743286, "num_tokens": 175447295.0, "step": 4596 }, { "epoch": 0.584785650680575, "ewc_loss": 0.021735265851020813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1735266273026355e-05, "grad_norm": 14.993046760559082, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8663250803947449, "num_tokens": 175480229.0, "step": 4597 }, { "epoch": 0.5849128609591655, "ewc_loss": 0.02174803428351879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1748033759649843e-05, "grad_norm": 14.848872184753418, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8562570810317993, "num_tokens": 175523081.0, "step": 4598 }, { "epoch": 0.585040071237756, "ewc_loss": 0.021637901663780212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1637901227222756e-05, "grad_norm": 14.849663734436035, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.859934389591217, "num_tokens": 175565036.0, "step": 4599 }, { "epoch": 0.5851672815163466, "ewc_loss": 0.0217320304363966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1732030290877447e-05, "grad_norm": 15.009895324707031, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8571493625640869, "num_tokens": 175606733.0, "step": 4600 }, { "epoch": 0.585294491794937, "ewc_loss": 0.021717343479394913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1717343770433217e-05, "grad_norm": 14.835299491882324, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8723033666610718, "num_tokens": 175637663.0, "step": 4601 }, { "epoch": 0.5854217020735275, "ewc_loss": 0.021624134853482246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.162413511541672e-05, "grad_norm": 14.919571876525879, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8583492636680603, "num_tokens": 175677381.0, "step": 4602 }, { "epoch": 0.585548912352118, "ewc_loss": 0.021725308150053024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.172530730604194e-05, "grad_norm": 14.873713493347168, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8681091070175171, "num_tokens": 175713049.0, "step": 4603 }, { "epoch": 0.5856761226307086, "ewc_loss": 0.021602554246783257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1602554625133052e-05, "grad_norm": 14.77770709991455, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8477264642715454, "num_tokens": 175753681.0, "step": 4604 }, { "epoch": 0.5858033329092991, "ewc_loss": 0.021669818088412285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.166981721529737e-05, "grad_norm": 14.934167861938477, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8450783491134644, "num_tokens": 175789725.0, "step": 4605 }, { "epoch": 0.5859305431878896, "ewc_loss": 0.02170608751475811, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1706087864004076e-05, "grad_norm": 14.803359985351562, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8564390540122986, "num_tokens": 175823203.0, "step": 4606 }, { "epoch": 0.5860577534664801, "ewc_loss": 0.02165561355650425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.165561272704508e-05, "grad_norm": 14.866247177124023, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8510295152664185, "num_tokens": 175860506.0, "step": 4607 }, { "epoch": 0.5861849637450706, "ewc_loss": 0.021697118878364563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.169711842725519e-05, "grad_norm": 14.77770709991455, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8727112412452698, "num_tokens": 175895080.0, "step": 4608 }, { "epoch": 0.5863121740236611, "ewc_loss": 0.021686028689146042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1686028048861772e-05, "grad_norm": 14.794468879699707, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8507354259490967, "num_tokens": 175937113.0, "step": 4609 }, { "epoch": 0.5864393843022516, "ewc_loss": 0.02175922319293022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1759222363471054e-05, "grad_norm": 14.88507080078125, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8695110082626343, "num_tokens": 175966193.0, "step": 4610 }, { "epoch": 0.5865665945808421, "ewc_loss": 0.02174581028521061, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.174581095459871e-05, "grad_norm": 14.788216590881348, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8503740429878235, "num_tokens": 176003771.0, "step": 4611 }, { "epoch": 0.5866938048594327, "ewc_loss": 0.021721601486206055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1721602024626918e-05, "grad_norm": 14.824995994567871, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8627196550369263, "num_tokens": 176050069.0, "step": 4612 }, { "epoch": 0.5868210151380232, "ewc_loss": 0.021770412102341652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1770412786281668e-05, "grad_norm": 14.809042930603027, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8723325133323669, "num_tokens": 176088763.0, "step": 4613 }, { "epoch": 0.5869482254166136, "ewc_loss": 0.021762007847428322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1762007236247882e-05, "grad_norm": 14.840473175048828, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.858582079410553, "num_tokens": 176127853.0, "step": 4614 }, { "epoch": 0.5870754356952042, "ewc_loss": 0.02174982614815235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1749825464212336e-05, "grad_norm": 14.807427406311035, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8429193496704102, "num_tokens": 176173775.0, "step": 4615 }, { "epoch": 0.5872026459737947, "ewc_loss": 0.02174065075814724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.174065048166085e-05, "grad_norm": 14.800311088562012, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8598570823669434, "num_tokens": 176210878.0, "step": 4616 }, { "epoch": 0.5873298562523852, "ewc_loss": 0.0217304490506649, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1730449589085765e-05, "grad_norm": 14.87883472442627, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8514293432235718, "num_tokens": 176255094.0, "step": 4617 }, { "epoch": 0.5874570665309757, "ewc_loss": 0.02174673229455948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1746731363236904e-05, "grad_norm": 14.842851638793945, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8538592457771301, "num_tokens": 176293586.0, "step": 4618 }, { "epoch": 0.5875842768095663, "ewc_loss": 0.02169046737253666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1690468201995827e-05, "grad_norm": 14.831948280334473, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8595900535583496, "num_tokens": 176329203.0, "step": 4619 }, { "epoch": 0.5877114870881567, "ewc_loss": 0.021734880283474922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1734880647272803e-05, "grad_norm": 14.771242141723633, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8732990026473999, "num_tokens": 176367984.0, "step": 4620 }, { "epoch": 0.5878386973667472, "ewc_loss": 0.021764706820249557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1764706616522744e-05, "grad_norm": 14.883935928344727, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8708964586257935, "num_tokens": 176408949.0, "step": 4621 }, { "epoch": 0.5879659076453377, "ewc_loss": 0.02172829769551754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.172829772462137e-05, "grad_norm": 14.7682523727417, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8738361597061157, "num_tokens": 176451073.0, "step": 4622 }, { "epoch": 0.5880931179239283, "ewc_loss": 0.021692922338843346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.169292201870121e-05, "grad_norm": 14.84188175201416, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8642301559448242, "num_tokens": 176490623.0, "step": 4623 }, { "epoch": 0.5882203282025188, "ewc_loss": 0.021769162267446518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.176916314056143e-05, "grad_norm": 14.842144012451172, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8441237211227417, "num_tokens": 176525771.0, "step": 4624 }, { "epoch": 0.5883475384811093, "ewc_loss": 0.021709498018026352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1709498469135724e-05, "grad_norm": 14.820239067077637, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.876177191734314, "num_tokens": 176565417.0, "step": 4625 }, { "epoch": 0.5884747487596997, "ewc_loss": 0.02172795683145523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1727957573602907e-05, "grad_norm": 14.830609321594238, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8547496199607849, "num_tokens": 176599005.0, "step": 4626 }, { "epoch": 0.5886019590382903, "ewc_loss": 0.02171211689710617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1712115994887426e-05, "grad_norm": 14.77170467376709, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8599115014076233, "num_tokens": 176637947.0, "step": 4627 }, { "epoch": 0.5887291693168808, "ewc_loss": 0.02175387367606163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.175387453462463e-05, "grad_norm": 14.851940155029297, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8477808237075806, "num_tokens": 176675241.0, "step": 4628 }, { "epoch": 0.5888563795954713, "ewc_loss": 0.02172987349331379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.172987296944484e-05, "grad_norm": 14.74130916595459, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8517379760742188, "num_tokens": 176715375.0, "step": 4629 }, { "epoch": 0.5889835898740619, "ewc_loss": 0.02168014645576477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1680147256120108e-05, "grad_norm": 14.821823120117188, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8578899502754211, "num_tokens": 176750977.0, "step": 4630 }, { "epoch": 0.5891108001526524, "ewc_loss": 0.021788517013192177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.178851718781516e-05, "grad_norm": 14.782480239868164, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8606345057487488, "num_tokens": 176788968.0, "step": 4631 }, { "epoch": 0.5892380104312428, "ewc_loss": 0.021733839064836502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.173383836634457e-05, "grad_norm": 14.79395580291748, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8534062504768372, "num_tokens": 176824399.0, "step": 4632 }, { "epoch": 0.5893652207098333, "ewc_loss": 0.021790215745568275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.179021612391807e-05, "grad_norm": 14.807123184204102, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8532418012619019, "num_tokens": 176862053.0, "step": 4633 }, { "epoch": 0.5894924309884239, "ewc_loss": 0.02179877646267414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1798776288051158e-05, "grad_norm": 14.849600791931152, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8720710277557373, "num_tokens": 176896943.0, "step": 4634 }, { "epoch": 0.5896196412670144, "ewc_loss": 0.021797750145196915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.179775037802756e-05, "grad_norm": 14.804400444030762, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8578699231147766, "num_tokens": 176936263.0, "step": 4635 }, { "epoch": 0.5897468515456049, "ewc_loss": 0.02181527577340603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1815276340930723e-05, "grad_norm": 14.785259246826172, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.845497727394104, "num_tokens": 176976632.0, "step": 4636 }, { "epoch": 0.5898740618241954, "ewc_loss": 0.02181319147348404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.181319177907426e-05, "grad_norm": 14.86357593536377, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8635311722755432, "num_tokens": 177007036.0, "step": 4637 }, { "epoch": 0.5900012721027859, "ewc_loss": 0.02184838429093361, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.184838376706466e-05, "grad_norm": 14.851157188415527, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8483124375343323, "num_tokens": 177042189.0, "step": 4638 }, { "epoch": 0.5901284823813764, "ewc_loss": 0.02181938849389553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.181938907597214e-05, "grad_norm": 14.839727401733398, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.853084921836853, "num_tokens": 177082428.0, "step": 4639 }, { "epoch": 0.5902556926599669, "ewc_loss": 0.021872224286198616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1872223442187533e-05, "grad_norm": 14.836140632629395, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8607437610626221, "num_tokens": 177124447.0, "step": 4640 }, { "epoch": 0.5903829029385574, "ewc_loss": 0.021821126341819763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1821126210852526e-05, "grad_norm": 14.81229305267334, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8732801675796509, "num_tokens": 177160516.0, "step": 4641 }, { "epoch": 0.590510113217148, "ewc_loss": 0.021835485473275185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1835485313204117e-05, "grad_norm": 14.902562141418457, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8646873831748962, "num_tokens": 177198100.0, "step": 4642 }, { "epoch": 0.5906373234957385, "ewc_loss": 0.02183675207197666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1836751329828985e-05, "grad_norm": 14.800537109375, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8510005474090576, "num_tokens": 177243170.0, "step": 4643 }, { "epoch": 0.5907645337743289, "ewc_loss": 0.02181696519255638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1816964363097213e-05, "grad_norm": 14.859502792358398, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8549410700798035, "num_tokens": 177282976.0, "step": 4644 }, { "epoch": 0.5908917440529194, "ewc_loss": 0.021849576383829117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1849577024113387e-05, "grad_norm": 14.872325897216797, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8396304249763489, "num_tokens": 177323725.0, "step": 4645 }, { "epoch": 0.59101895433151, "ewc_loss": 0.02177532948553562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1775329514639452e-05, "grad_norm": 14.901728630065918, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.857667863368988, "num_tokens": 177352178.0, "step": 4646 }, { "epoch": 0.5911461646101005, "ewc_loss": 0.021848563104867935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1848563847015612e-05, "grad_norm": 14.827127456665039, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.854364812374115, "num_tokens": 177392322.0, "step": 4647 }, { "epoch": 0.591273374888691, "ewc_loss": 0.021769238635897636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.176923953811638e-05, "grad_norm": 14.861050605773926, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8556070923805237, "num_tokens": 177432013.0, "step": 4648 }, { "epoch": 0.5914005851672816, "ewc_loss": 0.021829716861248016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1829717297805473e-05, "grad_norm": 14.895502090454102, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8708301782608032, "num_tokens": 177473542.0, "step": 4649 }, { "epoch": 0.591527795445872, "ewc_loss": 0.02176981419324875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.17698143387679e-05, "grad_norm": 14.84889030456543, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8537482023239136, "num_tokens": 177516590.0, "step": 4650 }, { "epoch": 0.5916550057244625, "ewc_loss": 0.02177075296640396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.177075293730013e-05, "grad_norm": 14.858367919921875, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8703373670578003, "num_tokens": 177553750.0, "step": 4651 }, { "epoch": 0.591782216003053, "ewc_loss": 0.021765630692243576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1765630663139746e-05, "grad_norm": 14.891239166259766, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.850445568561554, "num_tokens": 177586093.0, "step": 4652 }, { "epoch": 0.5919094262816436, "ewc_loss": 0.021788479760289192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1788478989037685e-05, "grad_norm": 14.844712257385254, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8546075820922852, "num_tokens": 177626800.0, "step": 4653 }, { "epoch": 0.5920366365602341, "ewc_loss": 0.021764136850833893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1764137272839434e-05, "grad_norm": 14.893457412719727, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8556753993034363, "num_tokens": 177662336.0, "step": 4654 }, { "epoch": 0.5921638468388246, "ewc_loss": 0.021764252334833145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1764251869171858e-05, "grad_norm": 14.772665023803711, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.8656762838363647, "num_tokens": 177706922.0, "step": 4655 }, { "epoch": 0.592291057117415, "ewc_loss": 0.021727701649069786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1727701096097007e-05, "grad_norm": 14.845392227172852, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8552168011665344, "num_tokens": 177743922.0, "step": 4656 }, { "epoch": 0.5924182673960056, "ewc_loss": 0.021810069680213928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1810070393257774e-05, "grad_norm": 14.884881973266602, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.863640308380127, "num_tokens": 177777069.0, "step": 4657 }, { "epoch": 0.5925454776745961, "ewc_loss": 0.021779784932732582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1779784219688736e-05, "grad_norm": 14.872013092041016, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8656455874443054, "num_tokens": 177810959.0, "step": 4658 }, { "epoch": 0.5926726879531866, "ewc_loss": 0.021814515814185143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.181451600336004e-05, "grad_norm": 14.923943519592285, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.848395824432373, "num_tokens": 177846366.0, "step": 4659 }, { "epoch": 0.5927998982317771, "ewc_loss": 0.0218250323086977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.182503158110194e-05, "grad_norm": 14.855347633361816, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8704606294631958, "num_tokens": 177881450.0, "step": 4660 }, { "epoch": 0.5929271085103677, "ewc_loss": 0.02181720733642578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.181720810767729e-05, "grad_norm": 14.857481956481934, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8587831854820251, "num_tokens": 177922931.0, "step": 4661 }, { "epoch": 0.5930543187889582, "ewc_loss": 0.021849341690540314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.184934237448033e-05, "grad_norm": 14.922539710998535, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8402061462402344, "num_tokens": 177962722.0, "step": 4662 }, { "epoch": 0.5931815290675486, "ewc_loss": 0.02184009924530983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1840100089320913e-05, "grad_norm": 14.83773422241211, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8559959530830383, "num_tokens": 177999956.0, "step": 4663 }, { "epoch": 0.5933087393461391, "ewc_loss": 0.021808339282870293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.18083387153456e-05, "grad_norm": 14.896089553833008, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8464736342430115, "num_tokens": 178032450.0, "step": 4664 }, { "epoch": 0.5934359496247297, "ewc_loss": 0.021861081942915916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1861082132090814e-05, "grad_norm": 14.891249656677246, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8583849668502808, "num_tokens": 178075028.0, "step": 4665 }, { "epoch": 0.5935631599033202, "ewc_loss": 0.021863682195544243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.186368146794848e-05, "grad_norm": 14.980964660644531, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8535910248756409, "num_tokens": 178108267.0, "step": 4666 }, { "epoch": 0.5936903701819107, "ewc_loss": 0.021836401894688606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1836402083863504e-05, "grad_norm": 14.836265563964844, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8582611083984375, "num_tokens": 178148091.0, "step": 4667 }, { "epoch": 0.5938175804605013, "ewc_loss": 0.02179333008825779, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1793330233776942e-05, "grad_norm": 14.929621696472168, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8708264827728271, "num_tokens": 178185626.0, "step": 4668 }, { "epoch": 0.5939447907390917, "ewc_loss": 0.021875064820051193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.187506470363587e-05, "grad_norm": 14.93159008026123, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8649872541427612, "num_tokens": 178224165.0, "step": 4669 }, { "epoch": 0.5940720010176822, "ewc_loss": 0.02180996537208557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.180996489187237e-05, "grad_norm": 14.87564468383789, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8568084239959717, "num_tokens": 178264118.0, "step": 4670 }, { "epoch": 0.5941992112962727, "ewc_loss": 0.02176889404654503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1768893930129707e-05, "grad_norm": 14.765942573547363, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8684488534927368, "num_tokens": 178295634.0, "step": 4671 }, { "epoch": 0.5943264215748633, "ewc_loss": 0.02180720865726471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1807209122925997e-05, "grad_norm": 14.878310203552246, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.863622784614563, "num_tokens": 178327076.0, "step": 4672 }, { "epoch": 0.5944536318534538, "ewc_loss": 0.02185710147023201, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1857102183275856e-05, "grad_norm": 14.90142822265625, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.848163366317749, "num_tokens": 178357469.0, "step": 4673 }, { "epoch": 0.5945808421320443, "ewc_loss": 0.021846354007720947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1846353774890304e-05, "grad_norm": 14.88021183013916, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8642787933349609, "num_tokens": 178393830.0, "step": 4674 }, { "epoch": 0.5947080524106347, "ewc_loss": 0.021863386034965515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1863386791665107e-05, "grad_norm": 14.928617477416992, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8571486473083496, "num_tokens": 178427560.0, "step": 4675 }, { "epoch": 0.5948352626892253, "ewc_loss": 0.021853012964129448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1853013095096685e-05, "grad_norm": 14.883623123168945, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8596867322921753, "num_tokens": 178464666.0, "step": 4676 }, { "epoch": 0.5949624729678158, "ewc_loss": 0.021858341991901398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1858342734049074e-05, "grad_norm": 14.865177154541016, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8477352857589722, "num_tokens": 178508771.0, "step": 4677 }, { "epoch": 0.5950896832464063, "ewc_loss": 0.021867645904421806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1867645045858808e-05, "grad_norm": 14.84664249420166, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8623909950256348, "num_tokens": 178548516.0, "step": 4678 }, { "epoch": 0.5952168935249968, "ewc_loss": 0.02189609780907631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1896097678109072e-05, "grad_norm": 14.908769607543945, "learning_rate": 1e-06, "loss": 0.5637, "mean_token_accuracy": 0.8231034278869629, "num_tokens": 178591276.0, "step": 4679 }, { "epoch": 0.5953441038035874, "ewc_loss": 0.02188337780535221, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1883377485210076e-05, "grad_norm": 14.82456111907959, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8563735485076904, "num_tokens": 178632580.0, "step": 4680 }, { "epoch": 0.5954713140821778, "ewc_loss": 0.021885892376303673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.188589314755518e-05, "grad_norm": 14.891813278198242, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8531155586242676, "num_tokens": 178672858.0, "step": 4681 }, { "epoch": 0.5955985243607683, "ewc_loss": 0.021934516727924347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1934516553301364e-05, "grad_norm": 14.838866233825684, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8592298030853271, "num_tokens": 178706435.0, "step": 4682 }, { "epoch": 0.5957257346393589, "ewc_loss": 0.02189227007329464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.189227052440401e-05, "grad_norm": 14.899710655212402, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8549563884735107, "num_tokens": 178746309.0, "step": 4683 }, { "epoch": 0.5958529449179494, "ewc_loss": 0.021915728226304054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.191572821175214e-05, "grad_norm": 14.806707382202148, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8392873406410217, "num_tokens": 178787639.0, "step": 4684 }, { "epoch": 0.5959801551965399, "ewc_loss": 0.021889854222536087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1889854906476103e-05, "grad_norm": 14.902241706848145, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8619406819343567, "num_tokens": 178828271.0, "step": 4685 }, { "epoch": 0.5961073654751304, "ewc_loss": 0.021947048604488373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1947049390291795e-05, "grad_norm": 14.826126098632812, "learning_rate": 1e-06, "loss": 0.378, "mean_token_accuracy": 0.8782650232315063, "num_tokens": 178862862.0, "step": 4686 }, { "epoch": 0.5962345757537209, "ewc_loss": 0.021872811019420624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1872810975764878e-05, "grad_norm": 14.876941680908203, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8689141869544983, "num_tokens": 178897969.0, "step": 4687 }, { "epoch": 0.5963617860323114, "ewc_loss": 0.021901637315750122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.190163650084287e-05, "grad_norm": 14.737916946411133, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8623626232147217, "num_tokens": 178942002.0, "step": 4688 }, { "epoch": 0.5964889963109019, "ewc_loss": 0.021848345175385475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1848345568287186e-05, "grad_norm": 14.885719299316406, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8668725490570068, "num_tokens": 178983473.0, "step": 4689 }, { "epoch": 0.5966162065894924, "ewc_loss": 0.021934838965535164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1934838514425792e-05, "grad_norm": 14.857918739318848, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8452033996582031, "num_tokens": 179022370.0, "step": 4690 }, { "epoch": 0.596743416868083, "ewc_loss": 0.021872347220778465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1872347133466974e-05, "grad_norm": 14.847455024719238, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8723894953727722, "num_tokens": 179064841.0, "step": 4691 }, { "epoch": 0.5968706271466735, "ewc_loss": 0.021915998309850693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1915999241173267e-05, "grad_norm": 14.88869857788086, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8299005627632141, "num_tokens": 179099399.0, "step": 4692 }, { "epoch": 0.5969978374252639, "ewc_loss": 0.02186853624880314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1868536350666545e-05, "grad_norm": 14.929713249206543, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8486658334732056, "num_tokens": 179133969.0, "step": 4693 }, { "epoch": 0.5971250477038544, "ewc_loss": 0.02187521383166313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1875213860766962e-05, "grad_norm": 14.823423385620117, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8551808595657349, "num_tokens": 179165456.0, "step": 4694 }, { "epoch": 0.597252257982445, "ewc_loss": 0.02186121605336666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1861216737306677e-05, "grad_norm": 14.94659423828125, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8617986440658569, "num_tokens": 179198209.0, "step": 4695 }, { "epoch": 0.5973794682610355, "ewc_loss": 0.02193761244416237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.19376124732662e-05, "grad_norm": 14.863594055175781, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8433878421783447, "num_tokens": 179233872.0, "step": 4696 }, { "epoch": 0.597506678539626, "ewc_loss": 0.021853048354387283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1853047655895352e-05, "grad_norm": 14.799360275268555, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.8785483837127686, "num_tokens": 179272721.0, "step": 4697 }, { "epoch": 0.5976338888182166, "ewc_loss": 0.021905340254306793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.190533996326849e-05, "grad_norm": 14.926766395568848, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.856831431388855, "num_tokens": 179305781.0, "step": 4698 }, { "epoch": 0.597761099096807, "ewc_loss": 0.021941671147942543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.194167063862551e-05, "grad_norm": 14.841322898864746, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8625255227088928, "num_tokens": 179350049.0, "step": 4699 }, { "epoch": 0.5978883093753975, "ewc_loss": 0.021849052980542183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1849053155165166e-05, "grad_norm": 14.842520713806152, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8576416969299316, "num_tokens": 179388063.0, "step": 4700 }, { "epoch": 0.598015519653988, "ewc_loss": 0.021912038326263428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1912037482252344e-05, "grad_norm": 14.865690231323242, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8645257949829102, "num_tokens": 179427607.0, "step": 4701 }, { "epoch": 0.5981427299325786, "ewc_loss": 0.02188105322420597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1881052816752344e-05, "grad_norm": 14.832141876220703, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8410215973854065, "num_tokens": 179471456.0, "step": 4702 }, { "epoch": 0.5982699402111691, "ewc_loss": 0.02192409709095955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1924097381997854e-05, "grad_norm": 14.847590446472168, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8694928288459778, "num_tokens": 179514318.0, "step": 4703 }, { "epoch": 0.5983971504897596, "ewc_loss": 0.021915560588240623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1915560864727013e-05, "grad_norm": 14.862980842590332, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8714578151702881, "num_tokens": 179553307.0, "step": 4704 }, { "epoch": 0.59852436076835, "ewc_loss": 0.02189852111041546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1898520571994595e-05, "grad_norm": 14.848722457885742, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8495113849639893, "num_tokens": 179597288.0, "step": 4705 }, { "epoch": 0.5986515710469406, "ewc_loss": 0.021931778639554977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1931778974249028e-05, "grad_norm": 14.924461364746094, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8570749163627625, "num_tokens": 179629583.0, "step": 4706 }, { "epoch": 0.5987787813255311, "ewc_loss": 0.021883288398385048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1883288354729302e-05, "grad_norm": 14.81359577178955, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8518930673599243, "num_tokens": 179672104.0, "step": 4707 }, { "epoch": 0.5989059916041216, "ewc_loss": 0.021901706233620644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1901705622440204e-05, "grad_norm": 14.9061279296875, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8484662771224976, "num_tokens": 179712951.0, "step": 4708 }, { "epoch": 0.5990332018827121, "ewc_loss": 0.021951163187623024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1951163944322616e-05, "grad_norm": 14.846342086791992, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8572521805763245, "num_tokens": 179754626.0, "step": 4709 }, { "epoch": 0.5991604121613027, "ewc_loss": 0.021864190697669983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1864190784981474e-05, "grad_norm": 14.849431991577148, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8630121946334839, "num_tokens": 179790068.0, "step": 4710 }, { "epoch": 0.5992876224398932, "ewc_loss": 0.021920546889305115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1920546714682132e-05, "grad_norm": 14.884269714355469, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.838160514831543, "num_tokens": 179823936.0, "step": 4711 }, { "epoch": 0.5994148327184836, "ewc_loss": 0.021899506449699402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1899506464251317e-05, "grad_norm": 14.829596519470215, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.858898401260376, "num_tokens": 179860920.0, "step": 4712 }, { "epoch": 0.5995420429970741, "ewc_loss": 0.02192274108529091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.192274041590281e-05, "grad_norm": 14.8720121383667, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8482465744018555, "num_tokens": 179899848.0, "step": 4713 }, { "epoch": 0.5996692532756647, "ewc_loss": 0.021959662437438965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1959662262815982e-05, "grad_norm": 14.87353515625, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8747814893722534, "num_tokens": 179934462.0, "step": 4714 }, { "epoch": 0.5997964635542552, "ewc_loss": 0.021941501647233963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.194150147261098e-05, "grad_norm": 14.830768585205078, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8585097789764404, "num_tokens": 179973955.0, "step": 4715 }, { "epoch": 0.5999236738328457, "ewc_loss": 0.02195565402507782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1955653210170567e-05, "grad_norm": 14.833608627319336, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8596570491790771, "num_tokens": 180015256.0, "step": 4716 }, { "epoch": 0.6000508841114363, "ewc_loss": 0.021967396140098572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.196739660575986e-05, "grad_norm": 14.836755752563477, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8651360273361206, "num_tokens": 180054145.0, "step": 4717 }, { "epoch": 0.6001780943900267, "ewc_loss": 0.021985124796628952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1985124476486817e-05, "grad_norm": 14.869915008544922, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8632162809371948, "num_tokens": 180093411.0, "step": 4718 }, { "epoch": 0.6003053046686172, "ewc_loss": 0.02197425626218319, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.197425601480063e-05, "grad_norm": 14.881478309631348, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8675181865692139, "num_tokens": 180135799.0, "step": 4719 }, { "epoch": 0.6004325149472077, "ewc_loss": 0.021912850439548492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.191285057051573e-05, "grad_norm": 14.806769371032715, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8522456288337708, "num_tokens": 180172623.0, "step": 4720 }, { "epoch": 0.6005597252257983, "ewc_loss": 0.021960938349366188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.196093919337727e-05, "grad_norm": 14.955900192260742, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8545359373092651, "num_tokens": 180213833.0, "step": 4721 }, { "epoch": 0.6006869355043888, "ewc_loss": 0.02201247587800026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2012476620147936e-05, "grad_norm": 14.84009838104248, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8655022382736206, "num_tokens": 180254982.0, "step": 4722 }, { "epoch": 0.6008141457829793, "ewc_loss": 0.021895868703722954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1895868485444225e-05, "grad_norm": 14.894030570983887, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8663443326950073, "num_tokens": 180288544.0, "step": 4723 }, { "epoch": 0.6009413560615697, "ewc_loss": 0.021963931620121002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1963931430946104e-05, "grad_norm": 14.86882495880127, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8541955947875977, "num_tokens": 180330233.0, "step": 4724 }, { "epoch": 0.6010685663401603, "ewc_loss": 0.021918796002864838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.191879684687592e-05, "grad_norm": 14.921412467956543, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8613210320472717, "num_tokens": 180361516.0, "step": 4725 }, { "epoch": 0.6011957766187508, "ewc_loss": 0.021948419511318207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1948419089312665e-05, "grad_norm": 14.869159698486328, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8565341830253601, "num_tokens": 180404380.0, "step": 4726 }, { "epoch": 0.6013229868973413, "ewc_loss": 0.02195126749575138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1951267626718618e-05, "grad_norm": 14.968864440917969, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8511602878570557, "num_tokens": 180436916.0, "step": 4727 }, { "epoch": 0.6014501971759318, "ewc_loss": 0.021924536675214767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1924535758444108e-05, "grad_norm": 14.827888488769531, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8401864171028137, "num_tokens": 180476895.0, "step": 4728 }, { "epoch": 0.6015774074545224, "ewc_loss": 0.021889766678214073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1889767594984733e-05, "grad_norm": 14.935198783874512, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8587703704833984, "num_tokens": 180515248.0, "step": 4729 }, { "epoch": 0.6017046177331128, "ewc_loss": 0.021969672292470932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1969672161503695e-05, "grad_norm": 14.910675048828125, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.864722490310669, "num_tokens": 180554575.0, "step": 4730 }, { "epoch": 0.6018318280117033, "ewc_loss": 0.021901575848460197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1901576474192552e-05, "grad_norm": 14.848600387573242, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8486165404319763, "num_tokens": 180594485.0, "step": 4731 }, { "epoch": 0.6019590382902938, "ewc_loss": 0.021913697943091393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1913698219577782e-05, "grad_norm": 14.92066764831543, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8435908555984497, "num_tokens": 180634746.0, "step": 4732 }, { "epoch": 0.6020862485688844, "ewc_loss": 0.021935630589723587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1935629774816334e-05, "grad_norm": 14.88048267364502, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8477845191955566, "num_tokens": 180674946.0, "step": 4733 }, { "epoch": 0.6022134588474749, "ewc_loss": 0.021896319463849068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1896319594816305e-05, "grad_norm": 14.824992179870605, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8579338788986206, "num_tokens": 180715538.0, "step": 4734 }, { "epoch": 0.6023406691260654, "ewc_loss": 0.021899675950407982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1899675630265847e-05, "grad_norm": 14.980670928955078, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.856351375579834, "num_tokens": 180755298.0, "step": 4735 }, { "epoch": 0.6024678794046558, "ewc_loss": 0.021976837888360023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1976837160764262e-05, "grad_norm": 14.866035461425781, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8515524864196777, "num_tokens": 180797104.0, "step": 4736 }, { "epoch": 0.6025950896832464, "ewc_loss": 0.02189042791724205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.189042788813822e-05, "grad_norm": 15.032761573791504, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8683302402496338, "num_tokens": 180839845.0, "step": 4737 }, { "epoch": 0.6027222999618369, "ewc_loss": 0.02188441902399063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1884419766138308e-05, "grad_norm": 14.7424898147583, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8719728589057922, "num_tokens": 180878849.0, "step": 4738 }, { "epoch": 0.6028495102404274, "ewc_loss": 0.021881666034460068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.188166581618134e-05, "grad_norm": 14.972251892089844, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8566662073135376, "num_tokens": 180918427.0, "step": 4739 }, { "epoch": 0.602976720519018, "ewc_loss": 0.021942555904388428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1942556486465037e-05, "grad_norm": 14.827561378479004, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8504564762115479, "num_tokens": 180956206.0, "step": 4740 }, { "epoch": 0.6031039307976085, "ewc_loss": 0.021848095580935478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.18480963667389e-05, "grad_norm": 14.911502838134766, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8638028502464294, "num_tokens": 180996945.0, "step": 4741 }, { "epoch": 0.6032311410761989, "ewc_loss": 0.02195991761982441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.195991692133248e-05, "grad_norm": 14.915260314941406, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8636502027511597, "num_tokens": 181031763.0, "step": 4742 }, { "epoch": 0.6033583513547894, "ewc_loss": 0.021839572116732597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1839572582393885e-05, "grad_norm": 14.922525405883789, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8660269379615784, "num_tokens": 181070661.0, "step": 4743 }, { "epoch": 0.60348556163338, "ewc_loss": 0.021909652277827263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1909652787144296e-05, "grad_norm": 14.815876960754395, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.844716489315033, "num_tokens": 181112769.0, "step": 4744 }, { "epoch": 0.6036127719119705, "ewc_loss": 0.021837415173649788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.183741526096128e-05, "grad_norm": 14.886685371398926, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8578883409500122, "num_tokens": 181151155.0, "step": 4745 }, { "epoch": 0.603739982190561, "ewc_loss": 0.021918008103966713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1918007405474782e-05, "grad_norm": 14.924798965454102, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8581151366233826, "num_tokens": 181188689.0, "step": 4746 }, { "epoch": 0.6038671924691515, "ewc_loss": 0.021883808076381683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1883808585698716e-05, "grad_norm": 14.945289611816406, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8629537224769592, "num_tokens": 181230031.0, "step": 4747 }, { "epoch": 0.603994402747742, "ewc_loss": 0.021901965141296387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.190196573792491e-05, "grad_norm": 14.935660362243652, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8576214909553528, "num_tokens": 181271464.0, "step": 4748 }, { "epoch": 0.6041216130263325, "ewc_loss": 0.021848464384675026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1848463802598417e-05, "grad_norm": 14.885034561157227, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.853510856628418, "num_tokens": 181312607.0, "step": 4749 }, { "epoch": 0.604248823304923, "ewc_loss": 0.021871918812394142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.187191967095714e-05, "grad_norm": 14.93968677520752, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8665803074836731, "num_tokens": 181348628.0, "step": 4750 }, { "epoch": 0.6043760335835135, "ewc_loss": 0.021874306723475456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1874306185054593e-05, "grad_norm": 14.931694030761719, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8495960235595703, "num_tokens": 181387975.0, "step": 4751 }, { "epoch": 0.6045032438621041, "ewc_loss": 0.021856999024748802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1856998500879854e-05, "grad_norm": 14.902502059936523, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8711220026016235, "num_tokens": 181424163.0, "step": 4752 }, { "epoch": 0.6046304541406946, "ewc_loss": 0.021821647882461548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1821648260811344e-05, "grad_norm": 14.97476863861084, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8485016822814941, "num_tokens": 181457002.0, "step": 4753 }, { "epoch": 0.604757664419285, "ewc_loss": 0.02187703736126423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1877036488149315e-05, "grad_norm": 14.902856826782227, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8540385365486145, "num_tokens": 181494410.0, "step": 4754 }, { "epoch": 0.6048848746978756, "ewc_loss": 0.021820098161697388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1820098481839523e-05, "grad_norm": 14.890338897705078, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.855178952217102, "num_tokens": 181534595.0, "step": 4755 }, { "epoch": 0.6050120849764661, "ewc_loss": 0.021845513954758644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1845513401785865e-05, "grad_norm": 14.926265716552734, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8643437623977661, "num_tokens": 181570654.0, "step": 4756 }, { "epoch": 0.6051392952550566, "ewc_loss": 0.02191394567489624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1913945602136664e-05, "grad_norm": 14.928647994995117, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8524012565612793, "num_tokens": 181608366.0, "step": 4757 }, { "epoch": 0.6052665055336471, "ewc_loss": 0.0218704454600811, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.187044628954027e-05, "grad_norm": 14.899089813232422, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8464714884757996, "num_tokens": 181647168.0, "step": 4758 }, { "epoch": 0.6053937158122377, "ewc_loss": 0.0219154953956604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1915495381108485e-05, "grad_norm": 14.844734191894531, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8613711595535278, "num_tokens": 181689177.0, "step": 4759 }, { "epoch": 0.6055209260908282, "ewc_loss": 0.02190820872783661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.190820850955788e-05, "grad_norm": 14.912858009338379, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8636422753334045, "num_tokens": 181723742.0, "step": 4760 }, { "epoch": 0.6056481363694186, "ewc_loss": 0.021966053172945976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.196605237259064e-05, "grad_norm": 14.943062782287598, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8670145273208618, "num_tokens": 181761539.0, "step": 4761 }, { "epoch": 0.6057753466480091, "ewc_loss": 0.021906785666942596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1906786059844308e-05, "grad_norm": 14.910286903381348, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.856765627861023, "num_tokens": 181799674.0, "step": 4762 }, { "epoch": 0.6059025569265997, "ewc_loss": 0.021927811205387115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1927811758359894e-05, "grad_norm": 14.848526000976562, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.845859944820404, "num_tokens": 181838784.0, "step": 4763 }, { "epoch": 0.6060297672051902, "ewc_loss": 0.021929234266281128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1929234208073467e-05, "grad_norm": 14.934739112854004, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8536502122879028, "num_tokens": 181875669.0, "step": 4764 }, { "epoch": 0.6061569774837807, "ewc_loss": 0.02192326821386814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1923267922829837e-05, "grad_norm": 14.835301399230957, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8410694599151611, "num_tokens": 181912450.0, "step": 4765 }, { "epoch": 0.6062841877623713, "ewc_loss": 0.021918296813964844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1918296624789946e-05, "grad_norm": 14.88354206085205, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8579710721969604, "num_tokens": 181947691.0, "step": 4766 }, { "epoch": 0.6064113980409617, "ewc_loss": 0.021992258727550507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1992258552927524e-05, "grad_norm": 14.89127254486084, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.862756609916687, "num_tokens": 181982364.0, "step": 4767 }, { "epoch": 0.6065386083195522, "ewc_loss": 0.02196361869573593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1963618564768694e-05, "grad_norm": 14.89245891571045, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8576120138168335, "num_tokens": 182017071.0, "step": 4768 }, { "epoch": 0.6066658185981427, "ewc_loss": 0.022019289433956146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2019288735464215e-05, "grad_norm": 14.89734172821045, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8583635091781616, "num_tokens": 182057234.0, "step": 4769 }, { "epoch": 0.6067930288767333, "ewc_loss": 0.02195150963962078, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.195150955230929e-05, "grad_norm": 14.835433006286621, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8689645528793335, "num_tokens": 182092450.0, "step": 4770 }, { "epoch": 0.6069202391553238, "ewc_loss": 0.022031256929039955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2031257685739547e-05, "grad_norm": 14.96730899810791, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8507798910140991, "num_tokens": 182126398.0, "step": 4771 }, { "epoch": 0.6070474494339143, "ewc_loss": 0.022035982459783554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2035981601220556e-05, "grad_norm": 14.886214256286621, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8728986978530884, "num_tokens": 182160848.0, "step": 4772 }, { "epoch": 0.6071746597125047, "ewc_loss": 0.022021565586328506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2021566110197455e-05, "grad_norm": 14.86208438873291, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8660793304443359, "num_tokens": 182202596.0, "step": 4773 }, { "epoch": 0.6073018699910953, "ewc_loss": 0.022050639614462852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2050639017834328e-05, "grad_norm": 14.914593696594238, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8566529750823975, "num_tokens": 182239235.0, "step": 4774 }, { "epoch": 0.6074290802696858, "ewc_loss": 0.022013550624251366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2013549823896028e-05, "grad_norm": 14.836649894714355, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8703034520149231, "num_tokens": 182273662.0, "step": 4775 }, { "epoch": 0.6075562905482763, "ewc_loss": 0.022078247740864754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2078247639001347e-05, "grad_norm": 14.951102256774902, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8431658148765564, "num_tokens": 182311991.0, "step": 4776 }, { "epoch": 0.6076835008268668, "ewc_loss": 0.022057538852095604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2057538444641978e-05, "grad_norm": 14.879143714904785, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8597326874732971, "num_tokens": 182350722.0, "step": 4777 }, { "epoch": 0.6078107111054574, "ewc_loss": 0.022099249064922333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2099249690654688e-05, "grad_norm": 14.89403247833252, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8610684871673584, "num_tokens": 182386755.0, "step": 4778 }, { "epoch": 0.6079379213840478, "ewc_loss": 0.022054750472307205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2054749933886342e-05, "grad_norm": 14.869148254394531, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8749594688415527, "num_tokens": 182423128.0, "step": 4779 }, { "epoch": 0.6080651316626383, "ewc_loss": 0.022057851776480675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2057851310819387e-05, "grad_norm": 14.882332801818848, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8447372913360596, "num_tokens": 182461518.0, "step": 4780 }, { "epoch": 0.6081923419412288, "ewc_loss": 0.022106114774942398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.210611455666367e-05, "grad_norm": 14.851862907409668, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8582822680473328, "num_tokens": 182495707.0, "step": 4781 }, { "epoch": 0.6083195522198194, "ewc_loss": 0.02205946110188961, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2059461116441526e-05, "grad_norm": 14.888815879821777, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8482174277305603, "num_tokens": 182539058.0, "step": 4782 }, { "epoch": 0.6084467624984099, "ewc_loss": 0.02215738222002983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.215738277300261e-05, "grad_norm": 14.944823265075684, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8431445360183716, "num_tokens": 182572482.0, "step": 4783 }, { "epoch": 0.6085739727770004, "ewc_loss": 0.02211233228445053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.211233186244499e-05, "grad_norm": 14.898712158203125, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8627141714096069, "num_tokens": 182607665.0, "step": 4784 }, { "epoch": 0.6087011830555908, "ewc_loss": 0.022113148123025894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.211314858868718e-05, "grad_norm": 14.901163101196289, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8605214357376099, "num_tokens": 182650503.0, "step": 4785 }, { "epoch": 0.6088283933341814, "ewc_loss": 0.022114546969532967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2114547391538508e-05, "grad_norm": 14.941061019897461, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8690012693405151, "num_tokens": 182681662.0, "step": 4786 }, { "epoch": 0.6089556036127719, "ewc_loss": 0.022149451076984406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.214945197920315e-05, "grad_norm": 14.975909233093262, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.845634937286377, "num_tokens": 182714912.0, "step": 4787 }, { "epoch": 0.6090828138913624, "ewc_loss": 0.022132165729999542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2132166122901253e-05, "grad_norm": 14.936545372009277, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8555971384048462, "num_tokens": 182748813.0, "step": 4788 }, { "epoch": 0.609210024169953, "ewc_loss": 0.022123917937278748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2123918824945576e-05, "grad_norm": 14.852197647094727, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.8423819541931152, "num_tokens": 182786123.0, "step": 4789 }, { "epoch": 0.6093372344485435, "ewc_loss": 0.022135788574814796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2135787730803713e-05, "grad_norm": 14.863295555114746, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8716621398925781, "num_tokens": 182820550.0, "step": 4790 }, { "epoch": 0.6094644447271339, "ewc_loss": 0.022168081253767014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2168082068674266e-05, "grad_norm": 14.90999698638916, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8584686517715454, "num_tokens": 182863135.0, "step": 4791 }, { "epoch": 0.6095916550057244, "ewc_loss": 0.022202378138899803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.220237729488872e-05, "grad_norm": 14.91551399230957, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8601042032241821, "num_tokens": 182905193.0, "step": 4792 }, { "epoch": 0.609718865284315, "ewc_loss": 0.022147729992866516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2147729396237992e-05, "grad_norm": 14.923711776733398, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8586372137069702, "num_tokens": 182944934.0, "step": 4793 }, { "epoch": 0.6098460755629055, "ewc_loss": 0.022164292633533478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.216429311374668e-05, "grad_norm": 14.908439636230469, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8454284071922302, "num_tokens": 182984604.0, "step": 4794 }, { "epoch": 0.609973285841496, "ewc_loss": 0.022132769227027893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.213277002738323e-05, "grad_norm": 14.903209686279297, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8571816086769104, "num_tokens": 183020421.0, "step": 4795 }, { "epoch": 0.6101004961200865, "ewc_loss": 0.022149229422211647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2149230062495917e-05, "grad_norm": 14.9313383102417, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8496544361114502, "num_tokens": 183057635.0, "step": 4796 }, { "epoch": 0.610227706398677, "ewc_loss": 0.02212061733007431, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.212061735917814e-05, "grad_norm": 14.929365158081055, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8464709520339966, "num_tokens": 183094689.0, "step": 4797 }, { "epoch": 0.6103549166772675, "ewc_loss": 0.02216019667685032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2160196749609895e-05, "grad_norm": 14.896197319030762, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8639829754829407, "num_tokens": 183129118.0, "step": 4798 }, { "epoch": 0.610482126955858, "ewc_loss": 0.02215546742081642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2155467377160676e-05, "grad_norm": 15.00353717803955, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8557730913162231, "num_tokens": 183163365.0, "step": 4799 }, { "epoch": 0.6106093372344485, "ewc_loss": 0.02209813892841339, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.209813828812912e-05, "grad_norm": 14.896525382995605, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8617866039276123, "num_tokens": 183199271.0, "step": 4800 }, { "epoch": 0.6107365475130391, "ewc_loss": 0.022111408412456512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2111407815827988e-05, "grad_norm": 14.921486854553223, "learning_rate": 1e-06, "loss": 0.3795, "mean_token_accuracy": 0.8776391744613647, "num_tokens": 183234523.0, "step": 4801 }, { "epoch": 0.6108637577916296, "ewc_loss": 0.02215486951172352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.215486892964691e-05, "grad_norm": 14.917915344238281, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8585159778594971, "num_tokens": 183267098.0, "step": 4802 }, { "epoch": 0.61099096807022, "ewc_loss": 0.022120386362075806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.212038634752389e-05, "grad_norm": 14.964956283569336, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8582385182380676, "num_tokens": 183306825.0, "step": 4803 }, { "epoch": 0.6111181783488105, "ewc_loss": 0.022128596901893616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2128597265691496e-05, "grad_norm": 14.864334106445312, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8573663234710693, "num_tokens": 183348712.0, "step": 4804 }, { "epoch": 0.6112453886274011, "ewc_loss": 0.02211875468492508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.211875471402891e-05, "grad_norm": 14.930685043334961, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.870383083820343, "num_tokens": 183383797.0, "step": 4805 }, { "epoch": 0.6113725989059916, "ewc_loss": 0.022176628932356834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.217662949988153e-05, "grad_norm": 14.972943305969238, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8698952198028564, "num_tokens": 183417484.0, "step": 4806 }, { "epoch": 0.6114998091845821, "ewc_loss": 0.022088980302214622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2088979676482268e-05, "grad_norm": 14.913803100585938, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8643576502799988, "num_tokens": 183453808.0, "step": 4807 }, { "epoch": 0.6116270194631727, "ewc_loss": 0.02213999442756176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2139995053294115e-05, "grad_norm": 14.98395824432373, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8653914928436279, "num_tokens": 183496693.0, "step": 4808 }, { "epoch": 0.6117542297417632, "ewc_loss": 0.022164568305015564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2164567781146616e-05, "grad_norm": 14.891112327575684, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8616727590560913, "num_tokens": 183536727.0, "step": 4809 }, { "epoch": 0.6118814400203536, "ewc_loss": 0.022094689309597015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2094689484219998e-05, "grad_norm": 14.990581512451172, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8606107234954834, "num_tokens": 183571809.0, "step": 4810 }, { "epoch": 0.6120086502989441, "ewc_loss": 0.022129444405436516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.212944491475355e-05, "grad_norm": 14.885597229003906, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8578845858573914, "num_tokens": 183610761.0, "step": 4811 }, { "epoch": 0.6121358605775347, "ewc_loss": 0.02208077535033226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2080776034272276e-05, "grad_norm": 14.99171257019043, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8573378920555115, "num_tokens": 183648304.0, "step": 4812 }, { "epoch": 0.6122630708561252, "ewc_loss": 0.022155512124300003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2155512851895764e-05, "grad_norm": 14.90937328338623, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8542234897613525, "num_tokens": 183684201.0, "step": 4813 }, { "epoch": 0.6123902811347157, "ewc_loss": 0.022082213312387466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2082213035901077e-05, "grad_norm": 14.959517478942871, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8518249988555908, "num_tokens": 183726595.0, "step": 4814 }, { "epoch": 0.6125174914133062, "ewc_loss": 0.02215675450861454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2156755221658386e-05, "grad_norm": 14.96794605255127, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8674821853637695, "num_tokens": 183762222.0, "step": 4815 }, { "epoch": 0.6126447016918967, "ewc_loss": 0.022040728479623795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.204072916356381e-05, "grad_norm": 14.891382217407227, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8709245324134827, "num_tokens": 183797167.0, "step": 4816 }, { "epoch": 0.6127719119704872, "ewc_loss": 0.02206416055560112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2064161385060288e-05, "grad_norm": 14.945672988891602, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8575171828269958, "num_tokens": 183839591.0, "step": 4817 }, { "epoch": 0.6128991222490777, "ewc_loss": 0.022122159600257874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2122159862192348e-05, "grad_norm": 14.982499122619629, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8582750558853149, "num_tokens": 183877862.0, "step": 4818 }, { "epoch": 0.6130263325276682, "ewc_loss": 0.022065158933401108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.206515819125343e-05, "grad_norm": 14.958849906921387, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8641852140426636, "num_tokens": 183915964.0, "step": 4819 }, { "epoch": 0.6131535428062588, "ewc_loss": 0.02206672728061676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.206672797910869e-05, "grad_norm": 15.02009105682373, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8555881977081299, "num_tokens": 183952823.0, "step": 4820 }, { "epoch": 0.6132807530848493, "ewc_loss": 0.022057754918932915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2057754904381e-05, "grad_norm": 14.899198532104492, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8495042324066162, "num_tokens": 183991650.0, "step": 4821 }, { "epoch": 0.6134079633634397, "ewc_loss": 0.021995674818754196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.1995674615027383e-05, "grad_norm": 14.873035430908203, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8602785468101501, "num_tokens": 184027768.0, "step": 4822 }, { "epoch": 0.6135351736420303, "ewc_loss": 0.02208072319626808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2080723283579573e-05, "grad_norm": 14.92776107788086, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8707528114318848, "num_tokens": 184063754.0, "step": 4823 }, { "epoch": 0.6136623839206208, "ewc_loss": 0.022049548104405403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.20495476241922e-05, "grad_norm": 14.902140617370605, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8529583215713501, "num_tokens": 184105286.0, "step": 4824 }, { "epoch": 0.6137895941992113, "ewc_loss": 0.022076992318034172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.20769925363129e-05, "grad_norm": 14.997746467590332, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8637189865112305, "num_tokens": 184139312.0, "step": 4825 }, { "epoch": 0.6139168044778018, "ewc_loss": 0.022039465606212616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2039464965928346e-05, "grad_norm": 14.917895317077637, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8581510186195374, "num_tokens": 184181541.0, "step": 4826 }, { "epoch": 0.6140440147563924, "ewc_loss": 0.022023919969797134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2023919882485643e-05, "grad_norm": 14.914444923400879, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8581732511520386, "num_tokens": 184216857.0, "step": 4827 }, { "epoch": 0.6141712250349828, "ewc_loss": 0.02208358235657215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2083582734921947e-05, "grad_norm": 14.971588134765625, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8617192506790161, "num_tokens": 184253318.0, "step": 4828 }, { "epoch": 0.6142984353135733, "ewc_loss": 0.022040989249944687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2040989279048517e-05, "grad_norm": 14.884757041931152, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8697841763496399, "num_tokens": 184298625.0, "step": 4829 }, { "epoch": 0.6144256455921638, "ewc_loss": 0.022054284811019897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2054284272599034e-05, "grad_norm": 14.98820972442627, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8561657071113586, "num_tokens": 184334569.0, "step": 4830 }, { "epoch": 0.6145528558707544, "ewc_loss": 0.022088460624217987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2088461264502257e-05, "grad_norm": 14.983434677124023, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8423718214035034, "num_tokens": 184377477.0, "step": 4831 }, { "epoch": 0.6146800661493449, "ewc_loss": 0.022015497088432312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2015497961547226e-05, "grad_norm": 14.970415115356445, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8459312915802002, "num_tokens": 184416671.0, "step": 4832 }, { "epoch": 0.6148072764279354, "ewc_loss": 0.02206542156636715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2065421944716945e-05, "grad_norm": 14.946309089660645, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8538167476654053, "num_tokens": 184453174.0, "step": 4833 }, { "epoch": 0.6149344867065258, "ewc_loss": 0.022037168964743614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.203716940130107e-05, "grad_norm": 14.947172164916992, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8658319115638733, "num_tokens": 184488313.0, "step": 4834 }, { "epoch": 0.6150616969851164, "ewc_loss": 0.022035054862499237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.203505573561415e-05, "grad_norm": 15.052448272705078, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8390305042266846, "num_tokens": 184523733.0, "step": 4835 }, { "epoch": 0.6151889072637069, "ewc_loss": 0.02207876183092594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.207876241300255e-05, "grad_norm": 15.051079750061035, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.856816291809082, "num_tokens": 184568399.0, "step": 4836 }, { "epoch": 0.6153161175422974, "ewc_loss": 0.02203197032213211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2031970729585737e-05, "grad_norm": 14.983341217041016, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8573668599128723, "num_tokens": 184603302.0, "step": 4837 }, { "epoch": 0.615443327820888, "ewc_loss": 0.0220407173037529, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2040716430637985e-05, "grad_norm": 14.998815536499023, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8778876662254333, "num_tokens": 184637346.0, "step": 4838 }, { "epoch": 0.6155705380994785, "ewc_loss": 0.02202906459569931, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.202906398451887e-05, "grad_norm": 15.013599395751953, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8654179573059082, "num_tokens": 184671687.0, "step": 4839 }, { "epoch": 0.6156977483780689, "ewc_loss": 0.022058311849832535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2058311515138485e-05, "grad_norm": 15.032722473144531, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8676264882087708, "num_tokens": 184706867.0, "step": 4840 }, { "epoch": 0.6158249586566594, "ewc_loss": 0.022049875929951668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.204987504228484e-05, "grad_norm": 15.009222984313965, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.855267345905304, "num_tokens": 184743916.0, "step": 4841 }, { "epoch": 0.61595216893525, "ewc_loss": 0.022051138803362846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2051139239920303e-05, "grad_norm": 14.933961868286133, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8674833178520203, "num_tokens": 184783247.0, "step": 4842 }, { "epoch": 0.6160793792138405, "ewc_loss": 0.022017477080225945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2017477022018284e-05, "grad_norm": 14.96522045135498, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8589737415313721, "num_tokens": 184826501.0, "step": 4843 }, { "epoch": 0.616206589492431, "ewc_loss": 0.022066906094551086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.206690624007024e-05, "grad_norm": 14.955032348632812, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8573013544082642, "num_tokens": 184868743.0, "step": 4844 }, { "epoch": 0.6163337997710215, "ewc_loss": 0.02205774188041687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2057742171455175e-05, "grad_norm": 14.979351043701172, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8562090396881104, "num_tokens": 184908992.0, "step": 4845 }, { "epoch": 0.616461010049612, "ewc_loss": 0.022044559940695763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2044559955247678e-05, "grad_norm": 15.009878158569336, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.847692608833313, "num_tokens": 184942456.0, "step": 4846 }, { "epoch": 0.6165882203282025, "ewc_loss": 0.022050531581044197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.205053169745952e-05, "grad_norm": 15.052213668823242, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8483091592788696, "num_tokens": 184975834.0, "step": 4847 }, { "epoch": 0.616715430606793, "ewc_loss": 0.022054115310311317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2054115106584504e-05, "grad_norm": 14.906204223632812, "learning_rate": 1e-06, "loss": 0.3716, "mean_token_accuracy": 0.8804991841316223, "num_tokens": 185012295.0, "step": 4848 }, { "epoch": 0.6168426408853835, "ewc_loss": 0.022020544856786728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2020545657142065e-05, "grad_norm": 14.925592422485352, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8638355731964111, "num_tokens": 185049272.0, "step": 4849 }, { "epoch": 0.6169698511639741, "ewc_loss": 0.0220857672393322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2085767341195606e-05, "grad_norm": 14.884105682373047, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8712347149848938, "num_tokens": 185081054.0, "step": 4850 }, { "epoch": 0.6170970614425646, "ewc_loss": 0.022069498896598816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.206949829997029e-05, "grad_norm": 15.00549030303955, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8552685976028442, "num_tokens": 185119809.0, "step": 4851 }, { "epoch": 0.617224271721155, "ewc_loss": 0.02213253639638424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2132537196739577e-05, "grad_norm": 14.932231903076172, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8704360723495483, "num_tokens": 185161462.0, "step": 4852 }, { "epoch": 0.6173514819997455, "ewc_loss": 0.022083301097154617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2083300791564398e-05, "grad_norm": 15.028610229492188, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.849984347820282, "num_tokens": 185196238.0, "step": 4853 }, { "epoch": 0.6174786922783361, "ewc_loss": 0.022142861038446426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2142861780594103e-05, "grad_norm": 14.964254379272461, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8579219579696655, "num_tokens": 185239317.0, "step": 4854 }, { "epoch": 0.6176059025569266, "ewc_loss": 0.02206885628402233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.206885619671084e-05, "grad_norm": 14.915596961975098, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8683781623840332, "num_tokens": 185273555.0, "step": 4855 }, { "epoch": 0.6177331128355171, "ewc_loss": 0.022154703736305237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2154703401611187e-05, "grad_norm": 14.99412727355957, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8672971725463867, "num_tokens": 185309721.0, "step": 4856 }, { "epoch": 0.6178603231141077, "ewc_loss": 0.02214721217751503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2147212803247385e-05, "grad_norm": 14.965481758117676, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.851402759552002, "num_tokens": 185350846.0, "step": 4857 }, { "epoch": 0.6179875333926982, "ewc_loss": 0.02210909314453602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2109092242317274e-05, "grad_norm": 14.987935066223145, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8519264459609985, "num_tokens": 185392472.0, "step": 4858 }, { "epoch": 0.6181147436712886, "ewc_loss": 0.02215060591697693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2150605218484998e-05, "grad_norm": 15.009369850158691, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8552299737930298, "num_tokens": 185438250.0, "step": 4859 }, { "epoch": 0.6182419539498791, "ewc_loss": 0.022095294669270515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.209529520769138e-05, "grad_norm": 14.994240760803223, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8563948273658752, "num_tokens": 185479455.0, "step": 4860 }, { "epoch": 0.6183691642284697, "ewc_loss": 0.02214639261364937, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2146392439026386e-05, "grad_norm": 14.980329513549805, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8552625179290771, "num_tokens": 185518719.0, "step": 4861 }, { "epoch": 0.6184963745070602, "ewc_loss": 0.02208014205098152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.208014120697044e-05, "grad_norm": 15.007972717285156, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8441092371940613, "num_tokens": 185556671.0, "step": 4862 }, { "epoch": 0.6186235847856507, "ewc_loss": 0.022121738642454147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2121737856650725e-05, "grad_norm": 14.98814868927002, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8564608097076416, "num_tokens": 185595063.0, "step": 4863 }, { "epoch": 0.6187507950642412, "ewc_loss": 0.022094974294304848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2094975065556355e-05, "grad_norm": 14.979183197021484, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8494755029678345, "num_tokens": 185635626.0, "step": 4864 }, { "epoch": 0.6188780053428317, "ewc_loss": 0.022129839286208153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2129839635454118e-05, "grad_norm": 15.022613525390625, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.847163200378418, "num_tokens": 185671687.0, "step": 4865 }, { "epoch": 0.6190052156214222, "ewc_loss": 0.022093428298830986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.209342892456334e-05, "grad_norm": 14.937302589416504, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8614040613174438, "num_tokens": 185706459.0, "step": 4866 }, { "epoch": 0.6191324259000127, "ewc_loss": 0.022133976221084595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.213397601735778e-05, "grad_norm": 15.057056427001953, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8769030570983887, "num_tokens": 185743162.0, "step": 4867 }, { "epoch": 0.6192596361786032, "ewc_loss": 0.022110525518655777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.211052560596727e-05, "grad_norm": 14.918424606323242, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8427484631538391, "num_tokens": 185784036.0, "step": 4868 }, { "epoch": 0.6193868464571938, "ewc_loss": 0.022100158035755157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2100157366367057e-05, "grad_norm": 15.019137382507324, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8551294207572937, "num_tokens": 185821407.0, "step": 4869 }, { "epoch": 0.6195140567357843, "ewc_loss": 0.022134428843855858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2134428945719264e-05, "grad_norm": 14.951701164245605, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8639461994171143, "num_tokens": 185857789.0, "step": 4870 }, { "epoch": 0.6196412670143747, "ewc_loss": 0.0220970269292593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2097026885603555e-05, "grad_norm": 15.055106163024902, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8695870041847229, "num_tokens": 185896989.0, "step": 4871 }, { "epoch": 0.6197684772929652, "ewc_loss": 0.0221572183072567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.215721906395629e-05, "grad_norm": 14.904236793518066, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8732125759124756, "num_tokens": 185938123.0, "step": 4872 }, { "epoch": 0.6198956875715558, "ewc_loss": 0.02208719775080681, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2087197066866793e-05, "grad_norm": 15.005816459655762, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8591384887695312, "num_tokens": 185973440.0, "step": 4873 }, { "epoch": 0.6200228978501463, "ewc_loss": 0.022184643894433975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.218464396719355e-05, "grad_norm": 14.963231086730957, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8592877984046936, "num_tokens": 186010069.0, "step": 4874 }, { "epoch": 0.6201501081287368, "ewc_loss": 0.022122401744127274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.212240178778302e-05, "grad_norm": 14.920165061950684, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8528885841369629, "num_tokens": 186048365.0, "step": 4875 }, { "epoch": 0.6202773184073274, "ewc_loss": 0.022186411544680595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2186412024893798e-05, "grad_norm": 14.922355651855469, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8637222647666931, "num_tokens": 186085358.0, "step": 4876 }, { "epoch": 0.6204045286859178, "ewc_loss": 0.022152332589030266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2152333258418366e-05, "grad_norm": 15.028816223144531, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8628177046775818, "num_tokens": 186118476.0, "step": 4877 }, { "epoch": 0.6205317389645083, "ewc_loss": 0.022195961326360703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2195961719262414e-05, "grad_norm": 14.962993621826172, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8553606271743774, "num_tokens": 186155487.0, "step": 4878 }, { "epoch": 0.6206589492430988, "ewc_loss": 0.02215801365673542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.215801396232564e-05, "grad_norm": 14.939427375793457, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8639332056045532, "num_tokens": 186192941.0, "step": 4879 }, { "epoch": 0.6207861595216894, "ewc_loss": 0.02217087335884571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.217087421740871e-05, "grad_norm": 14.970576286315918, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8624096512794495, "num_tokens": 186233860.0, "step": 4880 }, { "epoch": 0.6209133698002799, "ewc_loss": 0.022138381376862526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.213838160969317e-05, "grad_norm": 14.864913940429688, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.847597599029541, "num_tokens": 186273081.0, "step": 4881 }, { "epoch": 0.6210405800788704, "ewc_loss": 0.02216440439224243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2164404072100297e-05, "grad_norm": 14.969281196594238, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8641371130943298, "num_tokens": 186310596.0, "step": 4882 }, { "epoch": 0.6211677903574608, "ewc_loss": 0.02221195586025715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2211956093087792e-05, "grad_norm": 14.917919158935547, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8536232709884644, "num_tokens": 186346262.0, "step": 4883 }, { "epoch": 0.6212950006360514, "ewc_loss": 0.022197937592864037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2197937141754664e-05, "grad_norm": 14.976211547851562, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8620830774307251, "num_tokens": 186386508.0, "step": 4884 }, { "epoch": 0.6214222109146419, "ewc_loss": 0.022171858698129654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2171858290676028e-05, "grad_norm": 14.851717948913574, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8537176847457886, "num_tokens": 186425184.0, "step": 4885 }, { "epoch": 0.6215494211932324, "ewc_loss": 0.02218090184032917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2180902305990458e-05, "grad_norm": 14.990981101989746, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8578811883926392, "num_tokens": 186463072.0, "step": 4886 }, { "epoch": 0.621676631471823, "ewc_loss": 0.02223895862698555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.223895899078343e-05, "grad_norm": 14.954463005065918, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8694124817848206, "num_tokens": 186504062.0, "step": 4887 }, { "epoch": 0.6218038417504135, "ewc_loss": 0.022162050008773804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2162050299812108e-05, "grad_norm": 14.9119873046875, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8496689796447754, "num_tokens": 186544564.0, "step": 4888 }, { "epoch": 0.6219310520290039, "ewc_loss": 0.022212563082575798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2212563635548577e-05, "grad_norm": 14.963894844055176, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8454350829124451, "num_tokens": 186587196.0, "step": 4889 }, { "epoch": 0.6220582623075944, "ewc_loss": 0.02216896042227745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.216896064055618e-05, "grad_norm": 14.927811622619629, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8595855236053467, "num_tokens": 186620673.0, "step": 4890 }, { "epoch": 0.622185472586185, "ewc_loss": 0.02217802032828331, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.217802102677524e-05, "grad_norm": 14.909366607666016, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8613477945327759, "num_tokens": 186654028.0, "step": 4891 }, { "epoch": 0.6223126828647755, "ewc_loss": 0.022210869938135147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2210870156413876e-05, "grad_norm": 14.969990730285645, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8549267053604126, "num_tokens": 186693395.0, "step": 4892 }, { "epoch": 0.622439893143366, "ewc_loss": 0.022198202088475227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2198202714207582e-05, "grad_norm": 14.998363494873047, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.841984748840332, "num_tokens": 186734440.0, "step": 4893 }, { "epoch": 0.6225671034219565, "ewc_loss": 0.022166235372424126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2166235794429667e-05, "grad_norm": 14.898322105407715, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8611471652984619, "num_tokens": 186769637.0, "step": 4894 }, { "epoch": 0.622694313700547, "ewc_loss": 0.02216319739818573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2163198082125746e-05, "grad_norm": 14.978350639343262, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.854780375957489, "num_tokens": 186810188.0, "step": 4895 }, { "epoch": 0.6228215239791375, "ewc_loss": 0.02218763530254364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.218763438577298e-05, "grad_norm": 14.904888153076172, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8484901189804077, "num_tokens": 186847512.0, "step": 4896 }, { "epoch": 0.622948734257728, "ewc_loss": 0.02218075841665268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2180758605827577e-05, "grad_norm": 14.931283950805664, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8683691620826721, "num_tokens": 186878721.0, "step": 4897 }, { "epoch": 0.6230759445363185, "ewc_loss": 0.02222277596592903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2222775442060083e-05, "grad_norm": 14.923242568969727, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8639103174209595, "num_tokens": 186920239.0, "step": 4898 }, { "epoch": 0.6232031548149091, "ewc_loss": 0.022213870659470558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2213871488929726e-05, "grad_norm": 14.944067001342773, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8524068593978882, "num_tokens": 186964158.0, "step": 4899 }, { "epoch": 0.6233303650934996, "ewc_loss": 0.022266171872615814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2266171072260477e-05, "grad_norm": 15.02896499633789, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.873489260673523, "num_tokens": 187003176.0, "step": 4900 }, { "epoch": 0.62345757537209, "ewc_loss": 0.02222451940178871, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2224519852898084e-05, "grad_norm": 14.909451484680176, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8572440147399902, "num_tokens": 187046152.0, "step": 4901 }, { "epoch": 0.6235847856506805, "ewc_loss": 0.022210901603102684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2210901079233736e-05, "grad_norm": 14.93718433380127, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8703285455703735, "num_tokens": 187086629.0, "step": 4902 }, { "epoch": 0.6237119959292711, "ewc_loss": 0.022249950096011162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2249949324759655e-05, "grad_norm": 14.972960472106934, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8530319929122925, "num_tokens": 187122856.0, "step": 4903 }, { "epoch": 0.6238392062078616, "ewc_loss": 0.02222992479801178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.222992407041602e-05, "grad_norm": 14.961161613464355, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8728458881378174, "num_tokens": 187161568.0, "step": 4904 }, { "epoch": 0.6239664164864521, "ewc_loss": 0.022274792194366455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2274793082033284e-05, "grad_norm": 15.010266304016113, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8590385913848877, "num_tokens": 187202831.0, "step": 4905 }, { "epoch": 0.6240936267650427, "ewc_loss": 0.022242551669478416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2242551494855434e-05, "grad_norm": 15.031097412109375, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8657584190368652, "num_tokens": 187239059.0, "step": 4906 }, { "epoch": 0.6242208370436332, "ewc_loss": 0.022194374352693558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2194373741513118e-05, "grad_norm": 15.026594161987305, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8531654477119446, "num_tokens": 187272940.0, "step": 4907 }, { "epoch": 0.6243480473222236, "ewc_loss": 0.02220030315220356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2200303646968678e-05, "grad_norm": 14.927343368530273, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8721700310707092, "num_tokens": 187310963.0, "step": 4908 }, { "epoch": 0.6244752576008141, "ewc_loss": 0.022175760939717293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2175761841936037e-05, "grad_norm": 14.995377540588379, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8779761791229248, "num_tokens": 187345698.0, "step": 4909 }, { "epoch": 0.6246024678794047, "ewc_loss": 0.02224850282073021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2248503228183836e-05, "grad_norm": 14.978776931762695, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8574390411376953, "num_tokens": 187384762.0, "step": 4910 }, { "epoch": 0.6247296781579952, "ewc_loss": 0.022149374708533287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.21493755816482e-05, "grad_norm": 14.962154388427734, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8487523794174194, "num_tokens": 187424288.0, "step": 4911 }, { "epoch": 0.6248568884365857, "ewc_loss": 0.02225360833108425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.225360913143959e-05, "grad_norm": 15.004719734191895, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8679800629615784, "num_tokens": 187461481.0, "step": 4912 }, { "epoch": 0.6249840987151762, "ewc_loss": 0.022194428369402885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2194428311195225e-05, "grad_norm": 15.028300285339355, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8735164999961853, "num_tokens": 187497690.0, "step": 4913 }, { "epoch": 0.6251113089937667, "ewc_loss": 0.022208940237760544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2208940208656713e-05, "grad_norm": 14.97708511352539, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8521259427070618, "num_tokens": 187537825.0, "step": 4914 }, { "epoch": 0.6252385192723572, "ewc_loss": 0.022199276834726334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2199277736945078e-05, "grad_norm": 14.996720314025879, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.868897557258606, "num_tokens": 187583998.0, "step": 4915 }, { "epoch": 0.6253657295509477, "ewc_loss": 0.022214602679014206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.221460272266995e-05, "grad_norm": 15.03549861907959, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8580062389373779, "num_tokens": 187621962.0, "step": 4916 }, { "epoch": 0.6254929398295382, "ewc_loss": 0.02217876724898815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2178766812430695e-05, "grad_norm": 15.010942459106445, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8652173280715942, "num_tokens": 187655913.0, "step": 4917 }, { "epoch": 0.6256201501081288, "ewc_loss": 0.022143781185150146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2143780370242894e-05, "grad_norm": 15.01011848449707, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8511340618133545, "num_tokens": 187692433.0, "step": 4918 }, { "epoch": 0.6257473603867193, "ewc_loss": 0.022220762446522713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2220761820790358e-05, "grad_norm": 15.025832176208496, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8714351058006287, "num_tokens": 187724311.0, "step": 4919 }, { "epoch": 0.6258745706653097, "ewc_loss": 0.022176258265972137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2176258426043205e-05, "grad_norm": 14.991477012634277, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.867504894733429, "num_tokens": 187766614.0, "step": 4920 }, { "epoch": 0.6260017809439002, "ewc_loss": 0.0222184918820858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2218491722014733e-05, "grad_norm": 15.024046897888184, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8696184158325195, "num_tokens": 187804847.0, "step": 4921 }, { "epoch": 0.6261289912224908, "ewc_loss": 0.022221550345420837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2221549443202093e-05, "grad_norm": 15.013004302978516, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8608903884887695, "num_tokens": 187842791.0, "step": 4922 }, { "epoch": 0.6262562015010813, "ewc_loss": 0.022155269980430603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.215526910731569e-05, "grad_norm": 15.00351333618164, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8497376441955566, "num_tokens": 187878304.0, "step": 4923 }, { "epoch": 0.6263834117796718, "ewc_loss": 0.022190922871232033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.219092311861459e-05, "grad_norm": 15.033742904663086, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.860683798789978, "num_tokens": 187915022.0, "step": 4924 }, { "epoch": 0.6265106220582624, "ewc_loss": 0.022238027304410934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2238027668208815e-05, "grad_norm": 15.039916038513184, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8603106141090393, "num_tokens": 187950533.0, "step": 4925 }, { "epoch": 0.6266378323368528, "ewc_loss": 0.022242842242121696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.224284253316e-05, "grad_norm": 15.063276290893555, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8546066284179688, "num_tokens": 187989893.0, "step": 4926 }, { "epoch": 0.6267650426154433, "ewc_loss": 0.02222626470029354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2226264263736084e-05, "grad_norm": 15.004642486572266, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8521729111671448, "num_tokens": 188025031.0, "step": 4927 }, { "epoch": 0.6268922528940338, "ewc_loss": 0.022183062508702278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.218306326540187e-05, "grad_norm": 15.039546012878418, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8515684604644775, "num_tokens": 188060797.0, "step": 4928 }, { "epoch": 0.6270194631726244, "ewc_loss": 0.02225274033844471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2252739654504694e-05, "grad_norm": 15.034817695617676, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.852325439453125, "num_tokens": 188096271.0, "step": 4929 }, { "epoch": 0.6271466734512149, "ewc_loss": 0.022211072966456413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.221107206423767e-05, "grad_norm": 15.044088363647461, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8593015670776367, "num_tokens": 188136697.0, "step": 4930 }, { "epoch": 0.6272738837298054, "ewc_loss": 0.022231336683034897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2231337425182573e-05, "grad_norm": 14.973387718200684, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8570690751075745, "num_tokens": 188174715.0, "step": 4931 }, { "epoch": 0.6274010940083958, "ewc_loss": 0.022194499149918556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2194499251781963e-05, "grad_norm": 15.02135944366455, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8374314904212952, "num_tokens": 188216379.0, "step": 4932 }, { "epoch": 0.6275283042869864, "ewc_loss": 0.022247666493058205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2247666493058205e-05, "grad_norm": 14.94923210144043, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8643296360969543, "num_tokens": 188252183.0, "step": 4933 }, { "epoch": 0.6276555145655769, "ewc_loss": 0.0222354494035244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.223545016022399e-05, "grad_norm": 15.023480415344238, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8608288168907166, "num_tokens": 188291128.0, "step": 4934 }, { "epoch": 0.6277827248441674, "ewc_loss": 0.022289201617240906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.228920129709877e-05, "grad_norm": 15.00141716003418, "learning_rate": 1e-06, "loss": 0.5085, "mean_token_accuracy": 0.8421003222465515, "num_tokens": 188334256.0, "step": 4935 }, { "epoch": 0.627909935122758, "ewc_loss": 0.022244015708565712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2244015781325288e-05, "grad_norm": 15.050846099853516, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8522736430168152, "num_tokens": 188373207.0, "step": 4936 }, { "epoch": 0.6280371454013485, "ewc_loss": 0.022238625213503838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2238626115722582e-05, "grad_norm": 15.02725601196289, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8551861047744751, "num_tokens": 188414937.0, "step": 4937 }, { "epoch": 0.6281643556799389, "ewc_loss": 0.022245585918426514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2245585569180548e-05, "grad_norm": 15.109423637390137, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8551648855209351, "num_tokens": 188455398.0, "step": 4938 }, { "epoch": 0.6282915659585294, "ewc_loss": 0.02224131114780903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2241310944082215e-05, "grad_norm": 14.993002891540527, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8582015037536621, "num_tokens": 188496561.0, "step": 4939 }, { "epoch": 0.62841877623712, "ewc_loss": 0.022188089787960052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2188089133123867e-05, "grad_norm": 15.103896141052246, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8615986108779907, "num_tokens": 188531318.0, "step": 4940 }, { "epoch": 0.6285459865157105, "ewc_loss": 0.02222297713160515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2222977349883877e-05, "grad_norm": 14.990756034851074, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.850064754486084, "num_tokens": 188566807.0, "step": 4941 }, { "epoch": 0.628673196794301, "ewc_loss": 0.022162456065416336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.21624559344491e-05, "grad_norm": 15.02098274230957, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8547763824462891, "num_tokens": 188608062.0, "step": 4942 }, { "epoch": 0.6288004070728915, "ewc_loss": 0.02221904695034027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2219046513782814e-05, "grad_norm": 14.983839988708496, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8701531887054443, "num_tokens": 188644953.0, "step": 4943 }, { "epoch": 0.628927617351482, "ewc_loss": 0.022200286388397217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2200285457074642e-05, "grad_norm": 15.027094841003418, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.873753547668457, "num_tokens": 188687720.0, "step": 4944 }, { "epoch": 0.6290548276300725, "ewc_loss": 0.022233735769987106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.223373667220585e-05, "grad_norm": 15.026806831359863, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8519278764724731, "num_tokens": 188730077.0, "step": 4945 }, { "epoch": 0.629182037908663, "ewc_loss": 0.02216317318379879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2163172616274096e-05, "grad_norm": 14.972646713256836, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8645322322845459, "num_tokens": 188768306.0, "step": 4946 }, { "epoch": 0.6293092481872535, "ewc_loss": 0.02219409868121147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2194099074113183e-05, "grad_norm": 15.073925018310547, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.838327944278717, "num_tokens": 188807480.0, "step": 4947 }, { "epoch": 0.6294364584658441, "ewc_loss": 0.022216318175196648, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2216318029677495e-05, "grad_norm": 15.016950607299805, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8494269251823425, "num_tokens": 188836383.0, "step": 4948 }, { "epoch": 0.6295636687444346, "ewc_loss": 0.02216586098074913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2165861082612537e-05, "grad_norm": 14.941646575927734, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8625415563583374, "num_tokens": 188876079.0, "step": 4949 }, { "epoch": 0.629690879023025, "ewc_loss": 0.0222057793289423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.220577880507335e-05, "grad_norm": 14.998930931091309, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8550875782966614, "num_tokens": 188912936.0, "step": 4950 }, { "epoch": 0.6298180893016155, "ewc_loss": 0.022217268124222755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2217267542146146e-05, "grad_norm": 14.948005676269531, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8572564125061035, "num_tokens": 188951475.0, "step": 4951 }, { "epoch": 0.6299452995802061, "ewc_loss": 0.02225123718380928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2251237169257365e-05, "grad_norm": 15.041699409484863, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8440293669700623, "num_tokens": 188996972.0, "step": 4952 }, { "epoch": 0.6300725098587966, "ewc_loss": 0.022265011444687843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2265010557021014e-05, "grad_norm": 14.986978530883789, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.877991795539856, "num_tokens": 189030706.0, "step": 4953 }, { "epoch": 0.6301997201373871, "ewc_loss": 0.022209081798791885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.220908208983019e-05, "grad_norm": 15.026875495910645, "learning_rate": 1e-06, "loss": 0.3614, "mean_token_accuracy": 0.879874050617218, "num_tokens": 189065638.0, "step": 4954 }, { "epoch": 0.6303269304159776, "ewc_loss": 0.02227049693465233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.227049662906211e-05, "grad_norm": 15.026019096374512, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8714407682418823, "num_tokens": 189101747.0, "step": 4955 }, { "epoch": 0.6304541406945681, "ewc_loss": 0.022242216393351555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.224221680080518e-05, "grad_norm": 14.978967666625977, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8551977872848511, "num_tokens": 189142965.0, "step": 4956 }, { "epoch": 0.6305813509731586, "ewc_loss": 0.02226397953927517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2263979190029204e-05, "grad_norm": 15.10495662689209, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8639577031135559, "num_tokens": 189177822.0, "step": 4957 }, { "epoch": 0.6307085612517491, "ewc_loss": 0.02225678041577339, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.225677962996997e-05, "grad_norm": 15.010313034057617, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8535773754119873, "num_tokens": 189215710.0, "step": 4958 }, { "epoch": 0.6308357715303397, "ewc_loss": 0.022239599376916885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.223959927505348e-05, "grad_norm": 15.065018653869629, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8327248096466064, "num_tokens": 189250788.0, "step": 4959 }, { "epoch": 0.6309629818089302, "ewc_loss": 0.022271595895290375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.227159529866185e-05, "grad_norm": 15.124900817871094, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8561946153640747, "num_tokens": 189278451.0, "step": 4960 }, { "epoch": 0.6310901920875207, "ewc_loss": 0.022261792793869972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2261792764766142e-05, "grad_norm": 14.989654541015625, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8632215261459351, "num_tokens": 189316857.0, "step": 4961 }, { "epoch": 0.6312174023661112, "ewc_loss": 0.022255219519138336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2255218937061727e-05, "grad_norm": 15.063928604125977, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8701828718185425, "num_tokens": 189356600.0, "step": 4962 }, { "epoch": 0.6313446126447017, "ewc_loss": 0.02228035405278206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2280353732639924e-05, "grad_norm": 14.989524841308594, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8541525602340698, "num_tokens": 189393134.0, "step": 4963 }, { "epoch": 0.6314718229232922, "ewc_loss": 0.022287173196673393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2287173123913817e-05, "grad_norm": 15.071969032287598, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8431755304336548, "num_tokens": 189433431.0, "step": 4964 }, { "epoch": 0.6315990332018827, "ewc_loss": 0.022292619571089745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2292619178188033e-05, "grad_norm": 14.999478340148926, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8457478880882263, "num_tokens": 189473738.0, "step": 4965 }, { "epoch": 0.6317262434804732, "ewc_loss": 0.022239960730075836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2239961253944784e-05, "grad_norm": 15.079472541809082, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8730341196060181, "num_tokens": 189509930.0, "step": 4966 }, { "epoch": 0.6318534537590638, "ewc_loss": 0.0223329346626997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2332935259328224e-05, "grad_norm": 15.004826545715332, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8639118075370789, "num_tokens": 189547801.0, "step": 4967 }, { "epoch": 0.6319806640376543, "ewc_loss": 0.022247787564992905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2247788365348242e-05, "grad_norm": 14.999407768249512, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8713622093200684, "num_tokens": 189582226.0, "step": 4968 }, { "epoch": 0.6321078743162447, "ewc_loss": 0.022379791364073753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.237979060737416e-05, "grad_norm": 15.10966682434082, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.858811616897583, "num_tokens": 189622915.0, "step": 4969 }, { "epoch": 0.6322350845948352, "ewc_loss": 0.022279059514403343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2279060431174003e-05, "grad_norm": 14.992293357849121, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.866862952709198, "num_tokens": 189658182.0, "step": 4970 }, { "epoch": 0.6323622948734258, "ewc_loss": 0.022279512137174606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2279511540546082e-05, "grad_norm": 15.036401748657227, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8611147999763489, "num_tokens": 189696627.0, "step": 4971 }, { "epoch": 0.6324895051520163, "ewc_loss": 0.022310182452201843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.231018152087927e-05, "grad_norm": 15.003094673156738, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8569478392601013, "num_tokens": 189729826.0, "step": 4972 }, { "epoch": 0.6326167154306068, "ewc_loss": 0.022312626242637634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2312626242637634e-05, "grad_norm": 15.058354377746582, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.864722728729248, "num_tokens": 189771504.0, "step": 4973 }, { "epoch": 0.6327439257091974, "ewc_loss": 0.022318633273243904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2318632545648143e-05, "grad_norm": 15.07189655303955, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8757008910179138, "num_tokens": 189809220.0, "step": 4974 }, { "epoch": 0.6328711359877878, "ewc_loss": 0.022288676351308823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2288675609161146e-05, "grad_norm": 15.017799377441406, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8674135804176331, "num_tokens": 189849586.0, "step": 4975 }, { "epoch": 0.6329983462663783, "ewc_loss": 0.022256402298808098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2256403099163435e-05, "grad_norm": 14.992130279541016, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8605438470840454, "num_tokens": 189885054.0, "step": 4976 }, { "epoch": 0.6331255565449688, "ewc_loss": 0.02226456068456173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.226456126663834e-05, "grad_norm": 15.058381080627441, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8524233102798462, "num_tokens": 189918286.0, "step": 4977 }, { "epoch": 0.6332527668235594, "ewc_loss": 0.02230851538479328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.230851532658562e-05, "grad_norm": 15.008562088012695, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8409969210624695, "num_tokens": 189956845.0, "step": 4978 }, { "epoch": 0.6333799771021499, "ewc_loss": 0.02226351387798786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2263513528741896e-05, "grad_norm": 15.024561882019043, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8758503198623657, "num_tokens": 189994827.0, "step": 4979 }, { "epoch": 0.6335071873807404, "ewc_loss": 0.022302182391285896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2302181605482474e-05, "grad_norm": 15.033171653747559, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8544999361038208, "num_tokens": 190036066.0, "step": 4980 }, { "epoch": 0.6336343976593308, "ewc_loss": 0.022266831248998642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2266831365413964e-05, "grad_norm": 15.034477233886719, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8702127933502197, "num_tokens": 190073326.0, "step": 4981 }, { "epoch": 0.6337616079379214, "ewc_loss": 0.022255826741456985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.225582647952251e-05, "grad_norm": 14.959976196289062, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8572291135787964, "num_tokens": 190109125.0, "step": 4982 }, { "epoch": 0.6338888182165119, "ewc_loss": 0.022263219580054283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2263218852458522e-05, "grad_norm": 15.020249366760254, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8649698495864868, "num_tokens": 190148520.0, "step": 4983 }, { "epoch": 0.6340160284951024, "ewc_loss": 0.022295428439974785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2295427697827108e-05, "grad_norm": 15.017608642578125, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8613340854644775, "num_tokens": 190184272.0, "step": 4984 }, { "epoch": 0.6341432387736929, "ewc_loss": 0.022250430658459663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2250431356951594e-05, "grad_norm": 14.937751770019531, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.858207106590271, "num_tokens": 190221483.0, "step": 4985 }, { "epoch": 0.6342704490522835, "ewc_loss": 0.022277778014540672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2277778043644503e-05, "grad_norm": 15.006577491760254, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8710193037986755, "num_tokens": 190266430.0, "step": 4986 }, { "epoch": 0.6343976593308739, "ewc_loss": 0.022290555760264397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2290556444204412e-05, "grad_norm": 14.945555686950684, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8711141347885132, "num_tokens": 190308575.0, "step": 4987 }, { "epoch": 0.6345248696094644, "ewc_loss": 0.022302208468317986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2302208890323527e-05, "grad_norm": 15.055496215820312, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8700594305992126, "num_tokens": 190344584.0, "step": 4988 }, { "epoch": 0.6346520798880549, "ewc_loss": 0.02230282686650753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2302827346720733e-05, "grad_norm": 14.963176727294922, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8665804862976074, "num_tokens": 190376497.0, "step": 4989 }, { "epoch": 0.6347792901666455, "ewc_loss": 0.02230975404381752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2309754058369435e-05, "grad_norm": 15.146403312683105, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8480756282806396, "num_tokens": 190416879.0, "step": 4990 }, { "epoch": 0.634906500445236, "ewc_loss": 0.022312048822641373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.231204962299671e-05, "grad_norm": 14.970270156860352, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8793740272521973, "num_tokens": 190454684.0, "step": 4991 }, { "epoch": 0.6350337107238265, "ewc_loss": 0.022216029465198517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.221602881036233e-05, "grad_norm": 15.01996898651123, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8522352576255798, "num_tokens": 190490524.0, "step": 4992 }, { "epoch": 0.635160921002417, "ewc_loss": 0.022323979064822197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2323978555505164e-05, "grad_norm": 15.033024787902832, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8667172789573669, "num_tokens": 190526558.0, "step": 4993 }, { "epoch": 0.6352881312810075, "ewc_loss": 0.022240297868847847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.224029776698444e-05, "grad_norm": 14.964883804321289, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8604750633239746, "num_tokens": 190571206.0, "step": 4994 }, { "epoch": 0.635415341559598, "ewc_loss": 0.022293638437986374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.229363781225402e-05, "grad_norm": 15.032254219055176, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8482833504676819, "num_tokens": 190611480.0, "step": 4995 }, { "epoch": 0.6355425518381885, "ewc_loss": 0.02228691801428795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.228691846539732e-05, "grad_norm": 15.055261611938477, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8325852155685425, "num_tokens": 190647699.0, "step": 4996 }, { "epoch": 0.6356697621167791, "ewc_loss": 0.022303486242890358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2303485820884816e-05, "grad_norm": 15.060096740722656, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8602577447891235, "num_tokens": 190684863.0, "step": 4997 }, { "epoch": 0.6357969723953696, "ewc_loss": 0.022272169589996338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.227217009931337e-05, "grad_norm": 14.974778175354004, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8451595306396484, "num_tokens": 190725006.0, "step": 4998 }, { "epoch": 0.63592418267396, "ewc_loss": 0.022280098870396614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2280099074123427e-05, "grad_norm": 15.041901588439941, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8574427366256714, "num_tokens": 190757431.0, "step": 4999 }, { "epoch": 0.6360513929525505, "ewc_loss": 0.02228604443371296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.228604535048362e-05, "grad_norm": 15.03127384185791, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.865157425403595, "num_tokens": 190793118.0, "step": 5000 }, { "epoch": 0.6361786032311411, "ewc_loss": 0.02229221910238266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2292219000519253e-05, "grad_norm": 15.024622917175293, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.869449257850647, "num_tokens": 190837105.0, "step": 5001 }, { "epoch": 0.6363058135097316, "ewc_loss": 0.02227386273443699, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2273863578448072e-05, "grad_norm": 15.023192405700684, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8536888957023621, "num_tokens": 190875372.0, "step": 5002 }, { "epoch": 0.6364330237883221, "ewc_loss": 0.022303935140371323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2303935111267492e-05, "grad_norm": 15.044014930725098, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.864433765411377, "num_tokens": 190921307.0, "step": 5003 }, { "epoch": 0.6365602340669126, "ewc_loss": 0.022277358919382095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2277359676081687e-05, "grad_norm": 14.972591400146484, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8600535988807678, "num_tokens": 190967919.0, "step": 5004 }, { "epoch": 0.6366874443455031, "ewc_loss": 0.022286025807261467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.228602534160018e-05, "grad_norm": 15.078644752502441, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8580772876739502, "num_tokens": 191005668.0, "step": 5005 }, { "epoch": 0.6368146546240936, "ewc_loss": 0.0223124697804451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.231246980954893e-05, "grad_norm": 15.056321144104004, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8487387895584106, "num_tokens": 191044455.0, "step": 5006 }, { "epoch": 0.6369418649026841, "ewc_loss": 0.022259971126914024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2259971956373192e-05, "grad_norm": 15.013699531555176, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8650643825531006, "num_tokens": 191086369.0, "step": 5007 }, { "epoch": 0.6370690751812746, "ewc_loss": 0.022298313677310944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2298314434010535e-05, "grad_norm": 15.026344299316406, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8545631766319275, "num_tokens": 191127070.0, "step": 5008 }, { "epoch": 0.6371962854598652, "ewc_loss": 0.02227274887263775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.22727485379437e-05, "grad_norm": 15.086132049560547, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8735564947128296, "num_tokens": 191163623.0, "step": 5009 }, { "epoch": 0.6373234957384557, "ewc_loss": 0.02227768301963806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.227768345619552e-05, "grad_norm": 15.06863784790039, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8548229932785034, "num_tokens": 191196790.0, "step": 5010 }, { "epoch": 0.6374507060170462, "ewc_loss": 0.022243816405534744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2243815692490898e-05, "grad_norm": 15.097070693969727, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8561416864395142, "num_tokens": 191230265.0, "step": 5011 }, { "epoch": 0.6375779162956366, "ewc_loss": 0.022285105660557747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2285104932961985e-05, "grad_norm": 15.08056354522705, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8503197431564331, "num_tokens": 191266781.0, "step": 5012 }, { "epoch": 0.6377051265742272, "ewc_loss": 0.0222540944814682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2254094801610336e-05, "grad_norm": 15.04261589050293, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8498014211654663, "num_tokens": 191305869.0, "step": 5013 }, { "epoch": 0.6378323368528177, "ewc_loss": 0.02224665693938732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2246656953939237e-05, "grad_norm": 15.033632278442383, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8483989238739014, "num_tokens": 191343498.0, "step": 5014 }, { "epoch": 0.6379595471314082, "ewc_loss": 0.022270390763878822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2270391127676703e-05, "grad_norm": 15.05386734008789, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8447014689445496, "num_tokens": 191383260.0, "step": 5015 }, { "epoch": 0.6380867574099988, "ewc_loss": 0.022280253469944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.228025368822273e-05, "grad_norm": 15.15728759765625, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.850740909576416, "num_tokens": 191417350.0, "step": 5016 }, { "epoch": 0.6382139676885893, "ewc_loss": 0.022278496995568275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2278496544458903e-05, "grad_norm": 15.042439460754395, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8598032593727112, "num_tokens": 191454336.0, "step": 5017 }, { "epoch": 0.6383411779671797, "ewc_loss": 0.02226482331752777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.226482320111245e-05, "grad_norm": 15.059597969055176, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8604111671447754, "num_tokens": 191489977.0, "step": 5018 }, { "epoch": 0.6384683882457702, "ewc_loss": 0.022294869646430016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.229486926808022e-05, "grad_norm": 15.057304382324219, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8495094180107117, "num_tokens": 191524622.0, "step": 5019 }, { "epoch": 0.6385955985243608, "ewc_loss": 0.022303210571408272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.230321115348488e-05, "grad_norm": 15.064484596252441, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8486203551292419, "num_tokens": 191565389.0, "step": 5020 }, { "epoch": 0.6387228088029513, "ewc_loss": 0.022361528128385544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.236152795376256e-05, "grad_norm": 15.096373558044434, "learning_rate": 1e-06, "loss": 0.3718, "mean_token_accuracy": 0.8787466287612915, "num_tokens": 191604640.0, "step": 5021 }, { "epoch": 0.6388500190815418, "ewc_loss": 0.022319642826914787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.231964208476711e-05, "grad_norm": 14.976564407348633, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8589191436767578, "num_tokens": 191651029.0, "step": 5022 }, { "epoch": 0.6389772293601323, "ewc_loss": 0.02231530472636223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2315303795039654e-05, "grad_norm": 15.144567489624023, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8601112365722656, "num_tokens": 191690524.0, "step": 5023 }, { "epoch": 0.6391044396387228, "ewc_loss": 0.022359106689691544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.235910687886644e-05, "grad_norm": 15.029390335083008, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8544245958328247, "num_tokens": 191733992.0, "step": 5024 }, { "epoch": 0.6392316499173133, "ewc_loss": 0.022281521931290627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2281521523837e-05, "grad_norm": 14.991720199584961, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8699432015419006, "num_tokens": 191769717.0, "step": 5025 }, { "epoch": 0.6393588601959038, "ewc_loss": 0.02232365868985653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.232365841337014e-05, "grad_norm": 15.057132720947266, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8448092937469482, "num_tokens": 191813334.0, "step": 5026 }, { "epoch": 0.6394860704744944, "ewc_loss": 0.022317729890346527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2317730326903984e-05, "grad_norm": 15.004884719848633, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8608244061470032, "num_tokens": 191848295.0, "step": 5027 }, { "epoch": 0.6396132807530849, "ewc_loss": 0.02234179526567459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2341795556712896e-05, "grad_norm": 15.004531860351562, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8522219061851501, "num_tokens": 191885329.0, "step": 5028 }, { "epoch": 0.6397404910316754, "ewc_loss": 0.02235855720937252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.235855754406657e-05, "grad_norm": 15.034195899963379, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8723039627075195, "num_tokens": 191918070.0, "step": 5029 }, { "epoch": 0.6398677013102658, "ewc_loss": 0.022345315665006638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2345315301208757e-05, "grad_norm": 15.089849472045898, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.862755537033081, "num_tokens": 191966084.0, "step": 5030 }, { "epoch": 0.6399949115888564, "ewc_loss": 0.022320499643683434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.232049882877618e-05, "grad_norm": 14.950486183166504, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8592425584793091, "num_tokens": 192004907.0, "step": 5031 }, { "epoch": 0.6401221218674469, "ewc_loss": 0.02234065532684326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2340655050356872e-05, "grad_norm": 15.107399940490723, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8415412306785583, "num_tokens": 192042942.0, "step": 5032 }, { "epoch": 0.6402493321460374, "ewc_loss": 0.02237870916724205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2378708308679052e-05, "grad_norm": 15.025906562805176, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8597240447998047, "num_tokens": 192081133.0, "step": 5033 }, { "epoch": 0.6403765424246279, "ewc_loss": 0.022278903052210808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2278902179095894e-05, "grad_norm": 15.01648998260498, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8598068952560425, "num_tokens": 192120171.0, "step": 5034 }, { "epoch": 0.6405037527032185, "ewc_loss": 0.02236388437449932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2363883545040153e-05, "grad_norm": 15.068802833557129, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8606909513473511, "num_tokens": 192159583.0, "step": 5035 }, { "epoch": 0.6406309629818089, "ewc_loss": 0.02238316833972931, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2383168470696546e-05, "grad_norm": 15.085474967956543, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.856861412525177, "num_tokens": 192199513.0, "step": 5036 }, { "epoch": 0.6407581732603994, "ewc_loss": 0.022347930818796158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2347931007971056e-05, "grad_norm": 15.050536155700684, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8624082207679749, "num_tokens": 192228352.0, "step": 5037 }, { "epoch": 0.6408853835389899, "ewc_loss": 0.022361941635608673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2361940864357166e-05, "grad_norm": 15.050477981567383, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8545817136764526, "num_tokens": 192264480.0, "step": 5038 }, { "epoch": 0.6410125938175805, "ewc_loss": 0.02239374816417694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.239374771306757e-05, "grad_norm": 15.084113121032715, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.857062578201294, "num_tokens": 192303307.0, "step": 5039 }, { "epoch": 0.641139804096171, "ewc_loss": 0.022390004247426987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.239000423287507e-05, "grad_norm": 15.082771301269531, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8618662357330322, "num_tokens": 192342064.0, "step": 5040 }, { "epoch": 0.6412670143747615, "ewc_loss": 0.022365687415003777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.236568798252847e-05, "grad_norm": 15.05776596069336, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8671622276306152, "num_tokens": 192379297.0, "step": 5041 }, { "epoch": 0.6413942246533519, "ewc_loss": 0.022336198017001152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2336198526318185e-05, "grad_norm": 14.973653793334961, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8626867532730103, "num_tokens": 192416087.0, "step": 5042 }, { "epoch": 0.6415214349319425, "ewc_loss": 0.0223635733127594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2363572497852147e-05, "grad_norm": 14.990129470825195, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8648535013198853, "num_tokens": 192454421.0, "step": 5043 }, { "epoch": 0.641648645210533, "ewc_loss": 0.022366859018802643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2366859411704354e-05, "grad_norm": 15.009117126464844, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8560888767242432, "num_tokens": 192493860.0, "step": 5044 }, { "epoch": 0.6417758554891235, "ewc_loss": 0.022436965256929398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2436965082306415e-05, "grad_norm": 15.069364547729492, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8505770564079285, "num_tokens": 192533672.0, "step": 5045 }, { "epoch": 0.641903065767714, "ewc_loss": 0.02243923954665661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2439238819060847e-05, "grad_norm": 15.085108757019043, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8467636108398438, "num_tokens": 192569569.0, "step": 5046 }, { "epoch": 0.6420302760463046, "ewc_loss": 0.022441506385803223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.244150709884707e-05, "grad_norm": 15.020315170288086, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8655178546905518, "num_tokens": 192610716.0, "step": 5047 }, { "epoch": 0.642157486324895, "ewc_loss": 0.02243306301534176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.243306335003581e-05, "grad_norm": 15.07784652709961, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8530838489532471, "num_tokens": 192648040.0, "step": 5048 }, { "epoch": 0.6422846966034855, "ewc_loss": 0.022443365305662155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2443366106017493e-05, "grad_norm": 15.061005592346191, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8577837944030762, "num_tokens": 192690063.0, "step": 5049 }, { "epoch": 0.6424119068820761, "ewc_loss": 0.022452980279922485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2452979465015233e-05, "grad_norm": 15.073009490966797, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8540544509887695, "num_tokens": 192732067.0, "step": 5050 }, { "epoch": 0.6425391171606666, "ewc_loss": 0.02244207076728344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2442070985562168e-05, "grad_norm": 14.975137710571289, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8696427345275879, "num_tokens": 192768598.0, "step": 5051 }, { "epoch": 0.6426663274392571, "ewc_loss": 0.02242649905383587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2426498617278412e-05, "grad_norm": 15.115283012390137, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8318175673484802, "num_tokens": 192805420.0, "step": 5052 }, { "epoch": 0.6427935377178476, "ewc_loss": 0.022473108023405075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.247310840175487e-05, "grad_norm": 15.01054859161377, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.854672908782959, "num_tokens": 192852082.0, "step": 5053 }, { "epoch": 0.6429207479964381, "ewc_loss": 0.022406775504350662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2406775315175764e-05, "grad_norm": 15.039328575134277, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8603972792625427, "num_tokens": 192892846.0, "step": 5054 }, { "epoch": 0.6430479582750286, "ewc_loss": 0.022459622472524643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2459622414316982e-05, "grad_norm": 15.074176788330078, "learning_rate": 1e-06, "loss": 0.3743, "mean_token_accuracy": 0.8772338628768921, "num_tokens": 192932091.0, "step": 5055 }, { "epoch": 0.6431751685536191, "ewc_loss": 0.022398579865694046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2398578948923387e-05, "grad_norm": 15.038657188415527, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8613156080245972, "num_tokens": 192960628.0, "step": 5056 }, { "epoch": 0.6433023788322096, "ewc_loss": 0.02243390865623951, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2433909180108458e-05, "grad_norm": 15.039342880249023, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8523315191268921, "num_tokens": 193002075.0, "step": 5057 }, { "epoch": 0.6434295891108002, "ewc_loss": 0.02242710068821907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2427100702770986e-05, "grad_norm": 15.076562881469727, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8486818075180054, "num_tokens": 193039720.0, "step": 5058 }, { "epoch": 0.6435567993893907, "ewc_loss": 0.02242635376751423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.242635309812613e-05, "grad_norm": 15.062854766845703, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8590353727340698, "num_tokens": 193072806.0, "step": 5059 }, { "epoch": 0.6436840096679812, "ewc_loss": 0.02244560606777668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2445605281973258e-05, "grad_norm": 15.017455101013184, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8681020736694336, "num_tokens": 193108478.0, "step": 5060 }, { "epoch": 0.6438112199465716, "ewc_loss": 0.022446563467383385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2446563889388926e-05, "grad_norm": 15.069379806518555, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8631288409233093, "num_tokens": 193150663.0, "step": 5061 }, { "epoch": 0.6439384302251622, "ewc_loss": 0.02250765636563301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2507656467496417e-05, "grad_norm": 15.117951393127441, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8626625537872314, "num_tokens": 193191499.0, "step": 5062 }, { "epoch": 0.6440656405037527, "ewc_loss": 0.022423559799790382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2423559130402282e-05, "grad_norm": 15.015680313110352, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8498386740684509, "num_tokens": 193236103.0, "step": 5063 }, { "epoch": 0.6441928507823432, "ewc_loss": 0.022423934191465378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2423933842219412e-05, "grad_norm": 15.008960723876953, "learning_rate": 1e-06, "loss": 0.5102, "mean_token_accuracy": 0.8376818895339966, "num_tokens": 193274095.0, "step": 5064 }, { "epoch": 0.6443200610609338, "ewc_loss": 0.02249160222709179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2491602067020722e-05, "grad_norm": 15.114252090454102, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8654325008392334, "num_tokens": 193316988.0, "step": 5065 }, { "epoch": 0.6444472713395243, "ewc_loss": 0.022414082661271095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2414082195609808e-05, "grad_norm": 15.01237964630127, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8579999208450317, "num_tokens": 193357430.0, "step": 5066 }, { "epoch": 0.6445744816181147, "ewc_loss": 0.022425111383199692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2425110728363506e-05, "grad_norm": 15.083226203918457, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8668158054351807, "num_tokens": 193390392.0, "step": 5067 }, { "epoch": 0.6447016918967052, "ewc_loss": 0.022417476400732994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2417476429836825e-05, "grad_norm": 15.034581184387207, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8561102747917175, "num_tokens": 193430961.0, "step": 5068 }, { "epoch": 0.6448289021752958, "ewc_loss": 0.022366011515259743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.23660117626423e-05, "grad_norm": 15.018794059753418, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8567463159561157, "num_tokens": 193473767.0, "step": 5069 }, { "epoch": 0.6449561124538863, "ewc_loss": 0.022424405440688133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.242440496047493e-05, "grad_norm": 15.087576866149902, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8475546836853027, "num_tokens": 193511375.0, "step": 5070 }, { "epoch": 0.6450833227324768, "ewc_loss": 0.022401124238967896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2401123715098947e-05, "grad_norm": 15.004850387573242, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8696457147598267, "num_tokens": 193551296.0, "step": 5071 }, { "epoch": 0.6452105330110673, "ewc_loss": 0.022411653771996498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2411653844756074e-05, "grad_norm": 15.0606689453125, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8632981777191162, "num_tokens": 193596484.0, "step": 5072 }, { "epoch": 0.6453377432896578, "ewc_loss": 0.02242966555058956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2429665477829985e-05, "grad_norm": 15.073638916015625, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8647762537002563, "num_tokens": 193637513.0, "step": 5073 }, { "epoch": 0.6454649535682483, "ewc_loss": 0.022409871220588684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.24098712351406e-05, "grad_norm": 15.158354759216309, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8534020185470581, "num_tokens": 193672808.0, "step": 5074 }, { "epoch": 0.6455921638468388, "ewc_loss": 0.022405484691262245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2405483832699247e-05, "grad_norm": 15.105583190917969, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8740661144256592, "num_tokens": 193713018.0, "step": 5075 }, { "epoch": 0.6457193741254293, "ewc_loss": 0.02234426513314247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.234426574432291e-05, "grad_norm": 15.079955101013184, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8615055084228516, "num_tokens": 193757649.0, "step": 5076 }, { "epoch": 0.6458465844040199, "ewc_loss": 0.022396206855773926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2396206986741163e-05, "grad_norm": 15.146139144897461, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8558600544929504, "num_tokens": 193792390.0, "step": 5077 }, { "epoch": 0.6459737946826104, "ewc_loss": 0.022342374548316002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2342373995343223e-05, "grad_norm": 15.121062278747559, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8669076561927795, "num_tokens": 193827721.0, "step": 5078 }, { "epoch": 0.6461010049612008, "ewc_loss": 0.022373737767338753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.237373701063916e-05, "grad_norm": 15.131359100341797, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.850602924823761, "num_tokens": 193867217.0, "step": 5079 }, { "epoch": 0.6462282152397913, "ewc_loss": 0.02235964685678482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2359647118719295e-05, "grad_norm": 15.158696174621582, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8664963245391846, "num_tokens": 193900128.0, "step": 5080 }, { "epoch": 0.6463554255183819, "ewc_loss": 0.02236795425415039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.236795444332529e-05, "grad_norm": 15.129769325256348, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8691248893737793, "num_tokens": 193935744.0, "step": 5081 }, { "epoch": 0.6464826357969724, "ewc_loss": 0.022315382957458496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.231538383057341e-05, "grad_norm": 15.01772689819336, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8563136458396912, "num_tokens": 193977344.0, "step": 5082 }, { "epoch": 0.6466098460755629, "ewc_loss": 0.022318053990602493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2318054107017815e-05, "grad_norm": 15.077983856201172, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.833130955696106, "num_tokens": 194014295.0, "step": 5083 }, { "epoch": 0.6467370563541535, "ewc_loss": 0.022424571216106415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2424570488510653e-05, "grad_norm": 15.182856559753418, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8436986207962036, "num_tokens": 194055245.0, "step": 5084 }, { "epoch": 0.6468642666327439, "ewc_loss": 0.022381514310836792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2381515009328723e-05, "grad_norm": 15.131126403808594, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8515561819076538, "num_tokens": 194089313.0, "step": 5085 }, { "epoch": 0.6469914769113344, "ewc_loss": 0.022341392934322357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2341393560054712e-05, "grad_norm": 15.09649658203125, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8653385043144226, "num_tokens": 194128293.0, "step": 5086 }, { "epoch": 0.6471186871899249, "ewc_loss": 0.02238171361386776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.238171327917371e-05, "grad_norm": 15.143516540527344, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8518332242965698, "num_tokens": 194166632.0, "step": 5087 }, { "epoch": 0.6472458974685155, "ewc_loss": 0.022410957142710686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2410957171814516e-05, "grad_norm": 15.105208396911621, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.845260739326477, "num_tokens": 194209176.0, "step": 5088 }, { "epoch": 0.647373107747106, "ewc_loss": 0.02234760858118534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2347609046846628e-05, "grad_norm": 15.01611042022705, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8493032455444336, "num_tokens": 194245617.0, "step": 5089 }, { "epoch": 0.6475003180256965, "ewc_loss": 0.022351613268256187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2351612642523833e-05, "grad_norm": 15.094915390014648, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8649807572364807, "num_tokens": 194285983.0, "step": 5090 }, { "epoch": 0.6476275283042869, "ewc_loss": 0.022394657135009766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2394657207769342e-05, "grad_norm": 15.103327751159668, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8553153276443481, "num_tokens": 194317088.0, "step": 5091 }, { "epoch": 0.6477547385828775, "ewc_loss": 0.02237646095454693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2376461856765673e-05, "grad_norm": 15.04694938659668, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8641608953475952, "num_tokens": 194354025.0, "step": 5092 }, { "epoch": 0.647881948861468, "ewc_loss": 0.02242213487625122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2422134861699305e-05, "grad_norm": 15.098121643066406, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.863206684589386, "num_tokens": 194388539.0, "step": 5093 }, { "epoch": 0.6480091591400585, "ewc_loss": 0.02242603711783886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.242603659396991e-05, "grad_norm": 15.102667808532715, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8612195253372192, "num_tokens": 194424518.0, "step": 5094 }, { "epoch": 0.648136369418649, "ewc_loss": 0.02242419496178627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2424195776693523e-05, "grad_norm": 15.066621780395508, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8553265333175659, "num_tokens": 194463310.0, "step": 5095 }, { "epoch": 0.6482635796972396, "ewc_loss": 0.022419074550271034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.241907532152254e-05, "grad_norm": 15.083075523376465, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.858401358127594, "num_tokens": 194499911.0, "step": 5096 }, { "epoch": 0.64839078997583, "ewc_loss": 0.022439051419496536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2439051463152282e-05, "grad_norm": 15.184965133666992, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8491693735122681, "num_tokens": 194536731.0, "step": 5097 }, { "epoch": 0.6485180002544205, "ewc_loss": 0.02242627553641796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2426274881581776e-05, "grad_norm": 15.01110553741455, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8481189608573914, "num_tokens": 194574014.0, "step": 5098 }, { "epoch": 0.648645210533011, "ewc_loss": 0.022409558296203613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.240955836896319e-05, "grad_norm": 15.119063377380371, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8506956100463867, "num_tokens": 194611944.0, "step": 5099 }, { "epoch": 0.6487724208116016, "ewc_loss": 0.022496221587061882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2496222300105728e-05, "grad_norm": 15.109928131103516, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.868171215057373, "num_tokens": 194649734.0, "step": 5100 }, { "epoch": 0.6488996310901921, "ewc_loss": 0.02247784473001957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2477845050161704e-05, "grad_norm": 15.07218074798584, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.866621196269989, "num_tokens": 194688344.0, "step": 5101 }, { "epoch": 0.6490268413687826, "ewc_loss": 0.022447660565376282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2447660739999264e-05, "grad_norm": 15.088691711425781, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8461836576461792, "num_tokens": 194728582.0, "step": 5102 }, { "epoch": 0.649154051647373, "ewc_loss": 0.022477874532341957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.247787415399216e-05, "grad_norm": 15.05727767944336, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.859196662902832, "num_tokens": 194762753.0, "step": 5103 }, { "epoch": 0.6492812619259636, "ewc_loss": 0.022470105439424515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2470105250249617e-05, "grad_norm": 15.018632888793945, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.872507631778717, "num_tokens": 194802325.0, "step": 5104 }, { "epoch": 0.6494084722045541, "ewc_loss": 0.022475602105259895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2475602236227132e-05, "grad_norm": 15.061394691467285, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8580447435379028, "num_tokens": 194846349.0, "step": 5105 }, { "epoch": 0.6495356824831446, "ewc_loss": 0.022501518949866295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2501519197248854e-05, "grad_norm": 15.086812019348145, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8591947555541992, "num_tokens": 194880334.0, "step": 5106 }, { "epoch": 0.6496628927617352, "ewc_loss": 0.022499825805425644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2499825718114153e-05, "grad_norm": 15.078879356384277, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8496466875076294, "num_tokens": 194919343.0, "step": 5107 }, { "epoch": 0.6497901030403257, "ewc_loss": 0.02252514660358429, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.252514605061151e-05, "grad_norm": 15.130581855773926, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8602441549301147, "num_tokens": 194958800.0, "step": 5108 }, { "epoch": 0.6499173133189162, "ewc_loss": 0.022498149424791336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2498148609884083e-05, "grad_norm": 15.103569984436035, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8318943977355957, "num_tokens": 194997249.0, "step": 5109 }, { "epoch": 0.6500445235975066, "ewc_loss": 0.02249419502913952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2494195945910178e-05, "grad_norm": 15.138105392456055, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8610931634902954, "num_tokens": 195035879.0, "step": 5110 }, { "epoch": 0.6501717338760972, "ewc_loss": 0.022463489323854446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.246348958578892e-05, "grad_norm": 15.1248140335083, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.853373646736145, "num_tokens": 195075925.0, "step": 5111 }, { "epoch": 0.6502989441546877, "ewc_loss": 0.022498155012726784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2498155885841697e-05, "grad_norm": 15.119691848754883, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8606700897216797, "num_tokens": 195110185.0, "step": 5112 }, { "epoch": 0.6504261544332782, "ewc_loss": 0.0224643275141716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2464328139903955e-05, "grad_norm": 15.03567886352539, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8505649566650391, "num_tokens": 195154666.0, "step": 5113 }, { "epoch": 0.6505533647118688, "ewc_loss": 0.022454576566815376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2454576537711546e-05, "grad_norm": 15.06912899017334, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8712397813796997, "num_tokens": 195188998.0, "step": 5114 }, { "epoch": 0.6506805749904593, "ewc_loss": 0.022488275542855263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2488275135401636e-05, "grad_norm": 15.072747230529785, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8666919469833374, "num_tokens": 195224124.0, "step": 5115 }, { "epoch": 0.6508077852690497, "ewc_loss": 0.022481882944703102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2481883206637576e-05, "grad_norm": 15.117443084716797, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8598329424858093, "num_tokens": 195253601.0, "step": 5116 }, { "epoch": 0.6509349955476402, "ewc_loss": 0.02250567637383938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2505675588035956e-05, "grad_norm": 15.13700008392334, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8509703874588013, "num_tokens": 195289045.0, "step": 5117 }, { "epoch": 0.6510622058262308, "ewc_loss": 0.022466374561190605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2466374502982944e-05, "grad_norm": 15.078450202941895, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8608230352401733, "num_tokens": 195326983.0, "step": 5118 }, { "epoch": 0.6511894161048213, "ewc_loss": 0.022538641467690468, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2538641132996418e-05, "grad_norm": 15.164143562316895, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8718255758285522, "num_tokens": 195360425.0, "step": 5119 }, { "epoch": 0.6513166263834118, "ewc_loss": 0.02251475304365158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.251475234515965e-05, "grad_norm": 15.018107414245605, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8482310175895691, "num_tokens": 195402648.0, "step": 5120 }, { "epoch": 0.6514438366620023, "ewc_loss": 0.02251732163131237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2517320758197457e-05, "grad_norm": 15.107010841369629, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8765897750854492, "num_tokens": 195440945.0, "step": 5121 }, { "epoch": 0.6515710469405928, "ewc_loss": 0.022527920082211494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2527920009451918e-05, "grad_norm": 15.067633628845215, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.86167973279953, "num_tokens": 195481872.0, "step": 5122 }, { "epoch": 0.6516982572191833, "ewc_loss": 0.022519461810588837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.251946170872543e-05, "grad_norm": 15.026187896728516, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8636019229888916, "num_tokens": 195525460.0, "step": 5123 }, { "epoch": 0.6518254674977738, "ewc_loss": 0.02251744456589222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2517444449476898e-05, "grad_norm": 15.133655548095703, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8613962531089783, "num_tokens": 195563522.0, "step": 5124 }, { "epoch": 0.6519526777763643, "ewc_loss": 0.022548003122210503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2548003471456468e-05, "grad_norm": 15.042440414428711, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8687212467193604, "num_tokens": 195598114.0, "step": 5125 }, { "epoch": 0.6520798880549549, "ewc_loss": 0.0224923063069582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2492306015919894e-05, "grad_norm": 15.1181640625, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.854331910610199, "num_tokens": 195639826.0, "step": 5126 }, { "epoch": 0.6522070983335454, "ewc_loss": 0.022604839876294136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2604839614359662e-05, "grad_norm": 15.051681518554688, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8661567568778992, "num_tokens": 195682047.0, "step": 5127 }, { "epoch": 0.6523343086121358, "ewc_loss": 0.02251388691365719, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2513886506203562e-05, "grad_norm": 15.095394134521484, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8633104562759399, "num_tokens": 195722453.0, "step": 5128 }, { "epoch": 0.6524615188907263, "ewc_loss": 0.02253490686416626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2534906747750938e-05, "grad_norm": 15.094889640808105, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8549855351448059, "num_tokens": 195759182.0, "step": 5129 }, { "epoch": 0.6525887291693169, "ewc_loss": 0.02247796580195427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.247796510346234e-05, "grad_norm": 15.094667434692383, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8632082343101501, "num_tokens": 195795641.0, "step": 5130 }, { "epoch": 0.6527159394479074, "ewc_loss": 0.02251918986439705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2519190679304302e-05, "grad_norm": 15.098243713378906, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8693267107009888, "num_tokens": 195831200.0, "step": 5131 }, { "epoch": 0.6528431497264979, "ewc_loss": 0.022491687908768654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.249168755952269e-05, "grad_norm": 15.10572338104248, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8617833852767944, "num_tokens": 195875110.0, "step": 5132 }, { "epoch": 0.6529703600050885, "ewc_loss": 0.02251262590289116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2512625946546905e-05, "grad_norm": 15.082862854003906, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8612549901008606, "num_tokens": 195916724.0, "step": 5133 }, { "epoch": 0.6530975702836789, "ewc_loss": 0.022434087470173836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2434087441070005e-05, "grad_norm": 15.056303024291992, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8530406355857849, "num_tokens": 195956219.0, "step": 5134 }, { "epoch": 0.6532247805622694, "ewc_loss": 0.022458916530013084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2458916646428406e-05, "grad_norm": 15.045565605163574, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8631554841995239, "num_tokens": 195991563.0, "step": 5135 }, { "epoch": 0.6533519908408599, "ewc_loss": 0.02245541289448738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2455413272837177e-05, "grad_norm": 15.054354667663574, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8663939237594604, "num_tokens": 196031864.0, "step": 5136 }, { "epoch": 0.6534792011194505, "ewc_loss": 0.02248397283256054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.248397322546225e-05, "grad_norm": 15.026721954345703, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8517816066741943, "num_tokens": 196067208.0, "step": 5137 }, { "epoch": 0.653606411398041, "ewc_loss": 0.02249581180512905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2495811208500527e-05, "grad_norm": 15.141962051391602, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.863065242767334, "num_tokens": 196103934.0, "step": 5138 }, { "epoch": 0.6537336216766315, "ewc_loss": 0.022514406591653824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2514406737172976e-05, "grad_norm": 15.118316650390625, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8549491167068481, "num_tokens": 196140886.0, "step": 5139 }, { "epoch": 0.6538608319552219, "ewc_loss": 0.022474683821201324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2474683646578342e-05, "grad_norm": 15.073803901672363, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8684685230255127, "num_tokens": 196178940.0, "step": 5140 }, { "epoch": 0.6539880422338125, "ewc_loss": 0.022484539076685905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2484538931166753e-05, "grad_norm": 15.038311004638672, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8610538244247437, "num_tokens": 196229461.0, "step": 5141 }, { "epoch": 0.654115252512403, "ewc_loss": 0.022466396912932396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2466396330855787e-05, "grad_norm": 15.082220077514648, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8732101917266846, "num_tokens": 196265793.0, "step": 5142 }, { "epoch": 0.6542424627909935, "ewc_loss": 0.022510234266519547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2510233975481242e-05, "grad_norm": 15.053889274597168, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8668621182441711, "num_tokens": 196304919.0, "step": 5143 }, { "epoch": 0.654369673069584, "ewc_loss": 0.02247067540884018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2470674593932927e-05, "grad_norm": 15.072896003723145, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8746965527534485, "num_tokens": 196344761.0, "step": 5144 }, { "epoch": 0.6544968833481746, "ewc_loss": 0.0224965400993824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.249654062325135e-05, "grad_norm": 15.122260093688965, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8746540546417236, "num_tokens": 196382625.0, "step": 5145 }, { "epoch": 0.654624093626765, "ewc_loss": 0.022482773289084435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.248277269245591e-05, "grad_norm": 15.060720443725586, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8710510730743408, "num_tokens": 196422279.0, "step": 5146 }, { "epoch": 0.6547513039053555, "ewc_loss": 0.022451413795351982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.245141331513878e-05, "grad_norm": 15.101318359375, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8553214073181152, "num_tokens": 196463684.0, "step": 5147 }, { "epoch": 0.654878514183946, "ewc_loss": 0.02249191515147686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.249191493319813e-05, "grad_norm": 15.03540325164795, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.863197922706604, "num_tokens": 196506319.0, "step": 5148 }, { "epoch": 0.6550057244625366, "ewc_loss": 0.02242148108780384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2421481844503433e-05, "grad_norm": 15.108514785766602, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8542537689208984, "num_tokens": 196546367.0, "step": 5149 }, { "epoch": 0.6551329347411271, "ewc_loss": 0.022485550493001938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2485550289275125e-05, "grad_norm": 15.151469230651855, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8447126150131226, "num_tokens": 196576598.0, "step": 5150 }, { "epoch": 0.6552601450197176, "ewc_loss": 0.02248460240662098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2484602595795877e-05, "grad_norm": 15.091939926147461, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8537511825561523, "num_tokens": 196610633.0, "step": 5151 }, { "epoch": 0.655387355298308, "ewc_loss": 0.02242802083492279, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.242802111140918e-05, "grad_norm": 15.030444145202637, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8514661192893982, "num_tokens": 196646183.0, "step": 5152 }, { "epoch": 0.6555145655768986, "ewc_loss": 0.02248552069067955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2485521185444668e-05, "grad_norm": 15.049298286437988, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8686014413833618, "num_tokens": 196689454.0, "step": 5153 }, { "epoch": 0.6556417758554891, "ewc_loss": 0.022539369761943817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.253937054774724e-05, "grad_norm": 15.079080581665039, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8447738885879517, "num_tokens": 196731820.0, "step": 5154 }, { "epoch": 0.6557689861340796, "ewc_loss": 0.02250552549958229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2505526430904865e-05, "grad_norm": 15.07987117767334, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8495831489562988, "num_tokens": 196768542.0, "step": 5155 }, { "epoch": 0.6558961964126702, "ewc_loss": 0.02250378578901291, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2503785658045672e-05, "grad_norm": 15.055560111999512, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8444250822067261, "num_tokens": 196810300.0, "step": 5156 }, { "epoch": 0.6560234066912607, "ewc_loss": 0.02254759520292282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2547596017830074e-05, "grad_norm": 15.164433479309082, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8435108661651611, "num_tokens": 196847271.0, "step": 5157 }, { "epoch": 0.6561506169698512, "ewc_loss": 0.022542627528309822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.254262835776899e-05, "grad_norm": 15.06235122680664, "learning_rate": 1e-06, "loss": 0.3795, "mean_token_accuracy": 0.8750667572021484, "num_tokens": 196880439.0, "step": 5158 }, { "epoch": 0.6562778272484416, "ewc_loss": 0.022496230900287628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2496231395052746e-05, "grad_norm": 15.15339469909668, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8527085185050964, "num_tokens": 196914911.0, "step": 5159 }, { "epoch": 0.6564050375270322, "ewc_loss": 0.02257637307047844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.257637243019417e-05, "grad_norm": 15.09259033203125, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8606980443000793, "num_tokens": 196949419.0, "step": 5160 }, { "epoch": 0.6565322478056227, "ewc_loss": 0.022541189566254616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2541189537150785e-05, "grad_norm": 15.089611053466797, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.851889967918396, "num_tokens": 196989401.0, "step": 5161 }, { "epoch": 0.6566594580842132, "ewc_loss": 0.022550374269485474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.255037361464929e-05, "grad_norm": 15.075915336608887, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8530473709106445, "num_tokens": 197028377.0, "step": 5162 }, { "epoch": 0.6567866683628037, "ewc_loss": 0.02254716120660305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2547161279362626e-05, "grad_norm": 15.130436897277832, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8389797210693359, "num_tokens": 197068334.0, "step": 5163 }, { "epoch": 0.6569138786413943, "ewc_loss": 0.022575346753001213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.257534652017057e-05, "grad_norm": 15.094454765319824, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.86203533411026, "num_tokens": 197105978.0, "step": 5164 }, { "epoch": 0.6570410889199847, "ewc_loss": 0.022543298080563545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2543297745869495e-05, "grad_norm": 15.112207412719727, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8435736894607544, "num_tokens": 197148641.0, "step": 5165 }, { "epoch": 0.6571682991985752, "ewc_loss": 0.022543810307979584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2543810700881295e-05, "grad_norm": 15.04930591583252, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.840317964553833, "num_tokens": 197188820.0, "step": 5166 }, { "epoch": 0.6572955094771658, "ewc_loss": 0.02258153073489666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2581531084142625e-05, "grad_norm": 15.092935562133789, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8462021350860596, "num_tokens": 197227109.0, "step": 5167 }, { "epoch": 0.6574227197557563, "ewc_loss": 0.022558405995368958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2558406271855347e-05, "grad_norm": 15.07458209991455, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8650743961334229, "num_tokens": 197264083.0, "step": 5168 }, { "epoch": 0.6575499300343468, "ewc_loss": 0.02256280928850174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.256281004520133e-05, "grad_norm": 15.105058670043945, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8553714156150818, "num_tokens": 197301349.0, "step": 5169 }, { "epoch": 0.6576771403129373, "ewc_loss": 0.022602306678891182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2602307581109926e-05, "grad_norm": 15.117560386657715, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.84842848777771, "num_tokens": 197348176.0, "step": 5170 }, { "epoch": 0.6578043505915278, "ewc_loss": 0.02255617268383503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2556172552867793e-05, "grad_norm": 15.049786567687988, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8517827987670898, "num_tokens": 197380490.0, "step": 5171 }, { "epoch": 0.6579315608701183, "ewc_loss": 0.022564884275197983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2564883693121374e-05, "grad_norm": 15.118770599365234, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8464823365211487, "num_tokens": 197423075.0, "step": 5172 }, { "epoch": 0.6580587711487088, "ewc_loss": 0.02255677990615368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2556780095328577e-05, "grad_norm": 15.056503295898438, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8642841577529907, "num_tokens": 197461730.0, "step": 5173 }, { "epoch": 0.6581859814272993, "ewc_loss": 0.02255617454648018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2556174371857196e-05, "grad_norm": 15.085906982421875, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8478631973266602, "num_tokens": 197505801.0, "step": 5174 }, { "epoch": 0.6583131917058899, "ewc_loss": 0.022544512525200844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2544512830791064e-05, "grad_norm": 15.021871566772461, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8767597675323486, "num_tokens": 197534643.0, "step": 5175 }, { "epoch": 0.6584404019844804, "ewc_loss": 0.022582560777664185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2582560632145032e-05, "grad_norm": 15.121055603027344, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8653906583786011, "num_tokens": 197576490.0, "step": 5176 }, { "epoch": 0.6585676122630708, "ewc_loss": 0.022612204775214195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.261220470245462e-05, "grad_norm": 15.022669792175293, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8718621730804443, "num_tokens": 197612745.0, "step": 5177 }, { "epoch": 0.6586948225416613, "ewc_loss": 0.02257048524916172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.257048436149489e-05, "grad_norm": 15.18341064453125, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8545042276382446, "num_tokens": 197653756.0, "step": 5178 }, { "epoch": 0.6588220328202519, "ewc_loss": 0.022606974467635155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2606975107919425e-05, "grad_norm": 15.007107734680176, "learning_rate": 1e-06, "loss": 0.3588, "mean_token_accuracy": 0.883311927318573, "num_tokens": 197694417.0, "step": 5179 }, { "epoch": 0.6589492430988424, "ewc_loss": 0.02253975346684456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2539754354511388e-05, "grad_norm": 15.145090103149414, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.855467677116394, "num_tokens": 197732738.0, "step": 5180 }, { "epoch": 0.6590764533774329, "ewc_loss": 0.02263674885034561, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2636748326476663e-05, "grad_norm": 15.084685325622559, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8503057956695557, "num_tokens": 197759408.0, "step": 5181 }, { "epoch": 0.6592036636560235, "ewc_loss": 0.022607604041695595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.260760447825305e-05, "grad_norm": 15.140984535217285, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8616234660148621, "num_tokens": 197794334.0, "step": 5182 }, { "epoch": 0.6593308739346139, "ewc_loss": 0.022689633071422577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2689633624395356e-05, "grad_norm": 15.035659790039062, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8688323497772217, "num_tokens": 197832497.0, "step": 5183 }, { "epoch": 0.6594580842132044, "ewc_loss": 0.022617382928729057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2617383365286514e-05, "grad_norm": 15.13625431060791, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8511526584625244, "num_tokens": 197869021.0, "step": 5184 }, { "epoch": 0.6595852944917949, "ewc_loss": 0.02273714542388916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2737145627615973e-05, "grad_norm": 15.112299919128418, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8458170890808105, "num_tokens": 197906365.0, "step": 5185 }, { "epoch": 0.6597125047703855, "ewc_loss": 0.022639287635684013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2639287635684013e-05, "grad_norm": 15.06484317779541, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8621271252632141, "num_tokens": 197950725.0, "step": 5186 }, { "epoch": 0.659839715048976, "ewc_loss": 0.022711703553795815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2711703422828577e-05, "grad_norm": 15.077993392944336, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8672105073928833, "num_tokens": 197991178.0, "step": 5187 }, { "epoch": 0.6599669253275665, "ewc_loss": 0.022670572623610497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.26705724344356e-05, "grad_norm": 15.06711483001709, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8501991629600525, "num_tokens": 198030176.0, "step": 5188 }, { "epoch": 0.6600941356061569, "ewc_loss": 0.02270474284887314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2704742150381207e-05, "grad_norm": 15.155470848083496, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8534387946128845, "num_tokens": 198067169.0, "step": 5189 }, { "epoch": 0.6602213458847475, "ewc_loss": 0.02273547649383545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2735475795343518e-05, "grad_norm": 15.116923332214355, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8487813472747803, "num_tokens": 198104994.0, "step": 5190 }, { "epoch": 0.660348556163338, "ewc_loss": 0.022688621655106544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2688622266286984e-05, "grad_norm": 15.129775047302246, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8633725643157959, "num_tokens": 198142791.0, "step": 5191 }, { "epoch": 0.6604757664419285, "ewc_loss": 0.022672727704048157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.26727279368788e-05, "grad_norm": 15.055442810058594, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8559374213218689, "num_tokens": 198179300.0, "step": 5192 }, { "epoch": 0.660602976720519, "ewc_loss": 0.022701658308506012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2701658963342197e-05, "grad_norm": 15.166147232055664, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8733686804771423, "num_tokens": 198220230.0, "step": 5193 }, { "epoch": 0.6607301869991096, "ewc_loss": 0.022709140554070473, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.270914046675898e-05, "grad_norm": 15.164862632751465, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8478178977966309, "num_tokens": 198251067.0, "step": 5194 }, { "epoch": 0.6608573972777, "ewc_loss": 0.02271352894604206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2713529688189737e-05, "grad_norm": 15.212305068969727, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8679467439651489, "num_tokens": 198282130.0, "step": 5195 }, { "epoch": 0.6609846075562905, "ewc_loss": 0.02268938533961773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2689386241836473e-05, "grad_norm": 15.089044570922852, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8699440956115723, "num_tokens": 198325609.0, "step": 5196 }, { "epoch": 0.661111817834881, "ewc_loss": 0.022646449506282806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2646448996965773e-05, "grad_norm": 15.132481575012207, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8613322973251343, "num_tokens": 198365116.0, "step": 5197 }, { "epoch": 0.6612390281134716, "ewc_loss": 0.022712407633662224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.271240737172775e-05, "grad_norm": 15.171561241149902, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8596838116645813, "num_tokens": 198403467.0, "step": 5198 }, { "epoch": 0.6613662383920621, "ewc_loss": 0.022642141208052635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2642141630058177e-05, "grad_norm": 15.078693389892578, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8656465411186218, "num_tokens": 198440676.0, "step": 5199 }, { "epoch": 0.6614934486706526, "ewc_loss": 0.022632434964179993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2632435502600856e-05, "grad_norm": 15.155494689941406, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8614157438278198, "num_tokens": 198479082.0, "step": 5200 }, { "epoch": 0.661620658949243, "ewc_loss": 0.02274372987449169, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.274373036925681e-05, "grad_norm": 15.097630500793457, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8560279011726379, "num_tokens": 198521741.0, "step": 5201 }, { "epoch": 0.6617478692278336, "ewc_loss": 0.022586332634091377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2586333216167986e-05, "grad_norm": 15.191975593566895, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8526001572608948, "num_tokens": 198562177.0, "step": 5202 }, { "epoch": 0.6618750795064241, "ewc_loss": 0.022659827023744583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2659827664028853e-05, "grad_norm": 15.117429733276367, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8606557250022888, "num_tokens": 198600726.0, "step": 5203 }, { "epoch": 0.6620022897850146, "ewc_loss": 0.022634917870163918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2634918423136696e-05, "grad_norm": 15.1770601272583, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.831290066242218, "num_tokens": 198638156.0, "step": 5204 }, { "epoch": 0.6621295000636052, "ewc_loss": 0.022666094824671745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.266609408252407e-05, "grad_norm": 15.101840019226074, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.8408839702606201, "num_tokens": 198680312.0, "step": 5205 }, { "epoch": 0.6622567103421957, "ewc_loss": 0.022625382989645004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.262538328068331e-05, "grad_norm": 15.161534309387207, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8774216175079346, "num_tokens": 198723886.0, "step": 5206 }, { "epoch": 0.6623839206207861, "ewc_loss": 0.022627562284469604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2627562429988757e-05, "grad_norm": 15.058614730834961, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.859108030796051, "num_tokens": 198765331.0, "step": 5207 }, { "epoch": 0.6625111308993766, "ewc_loss": 0.022594114765524864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2594114852836356e-05, "grad_norm": 15.15571117401123, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.862578272819519, "num_tokens": 198802421.0, "step": 5208 }, { "epoch": 0.6626383411779672, "ewc_loss": 0.02265096828341484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2650969185633585e-05, "grad_norm": 15.098876953125, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8703119158744812, "num_tokens": 198843335.0, "step": 5209 }, { "epoch": 0.6627655514565577, "ewc_loss": 0.022571740671992302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.257174128317274e-05, "grad_norm": 15.110901832580566, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8660860657691956, "num_tokens": 198880925.0, "step": 5210 }, { "epoch": 0.6628927617351482, "ewc_loss": 0.02263951301574707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2639513190370053e-05, "grad_norm": 15.180726051330566, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8584595918655396, "num_tokens": 198915448.0, "step": 5211 }, { "epoch": 0.6630199720137387, "ewc_loss": 0.022616904228925705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.261690497107338e-05, "grad_norm": 15.09123706817627, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8700916767120361, "num_tokens": 198952254.0, "step": 5212 }, { "epoch": 0.6631471822923293, "ewc_loss": 0.022603627294301987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.26036281674169e-05, "grad_norm": 15.127808570861816, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8646610975265503, "num_tokens": 198990841.0, "step": 5213 }, { "epoch": 0.6632743925709197, "ewc_loss": 0.022617937996983528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2617938157054596e-05, "grad_norm": 15.13139820098877, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8625476360321045, "num_tokens": 199029383.0, "step": 5214 }, { "epoch": 0.6634016028495102, "ewc_loss": 0.02255449816584587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.255449908261653e-05, "grad_norm": 15.127140998840332, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8590489625930786, "num_tokens": 199066731.0, "step": 5215 }, { "epoch": 0.6635288131281007, "ewc_loss": 0.022604186087846756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.260418659716379e-05, "grad_norm": 15.075581550598145, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8721277713775635, "num_tokens": 199108752.0, "step": 5216 }, { "epoch": 0.6636560234066913, "ewc_loss": 0.02263030596077442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2630305466009304e-05, "grad_norm": 15.190781593322754, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.864021897315979, "num_tokens": 199146438.0, "step": 5217 }, { "epoch": 0.6637832336852818, "ewc_loss": 0.02263977937400341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.263977876282297e-05, "grad_norm": 15.128425598144531, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8626095056533813, "num_tokens": 199186367.0, "step": 5218 }, { "epoch": 0.6639104439638723, "ewc_loss": 0.022619416937232018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.261941699543968e-05, "grad_norm": 15.177824974060059, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.873470664024353, "num_tokens": 199225785.0, "step": 5219 }, { "epoch": 0.6640376542424627, "ewc_loss": 0.02260475978255272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2604759578825906e-05, "grad_norm": 15.071269989013672, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8632614612579346, "num_tokens": 199262574.0, "step": 5220 }, { "epoch": 0.6641648645210533, "ewc_loss": 0.02259918861091137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2599188014282845e-05, "grad_norm": 15.149418830871582, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8656455278396606, "num_tokens": 199299093.0, "step": 5221 }, { "epoch": 0.6642920747996438, "ewc_loss": 0.02260730415582657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2607304345001467e-05, "grad_norm": 15.025487899780273, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8659366965293884, "num_tokens": 199337446.0, "step": 5222 }, { "epoch": 0.6644192850782343, "ewc_loss": 0.022591272369027138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2591271772398613e-05, "grad_norm": 15.223756790161133, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8619414567947388, "num_tokens": 199377321.0, "step": 5223 }, { "epoch": 0.6645464953568249, "ewc_loss": 0.022665997967123985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.266599767608568e-05, "grad_norm": 15.13209342956543, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8701839447021484, "num_tokens": 199413057.0, "step": 5224 }, { "epoch": 0.6646737056354154, "ewc_loss": 0.02252284809947014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2522848666994832e-05, "grad_norm": 15.067597389221191, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8675655126571655, "num_tokens": 199452730.0, "step": 5225 }, { "epoch": 0.6648009159140058, "ewc_loss": 0.02260875329375267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2608754079556093e-05, "grad_norm": 15.16023063659668, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8701215386390686, "num_tokens": 199487860.0, "step": 5226 }, { "epoch": 0.6649281261925963, "ewc_loss": 0.0226100143045187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.261001463921275e-05, "grad_norm": 15.057802200317383, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8674345016479492, "num_tokens": 199532790.0, "step": 5227 }, { "epoch": 0.6650553364711869, "ewc_loss": 0.022553451359272003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2553451344720088e-05, "grad_norm": 15.13178539276123, "learning_rate": 1e-06, "loss": 0.3631, "mean_token_accuracy": 0.8803105354309082, "num_tokens": 199561072.0, "step": 5228 }, { "epoch": 0.6651825467497774, "ewc_loss": 0.02260442078113556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2604421246796846e-05, "grad_norm": 15.144271850585938, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.866370439529419, "num_tokens": 199595820.0, "step": 5229 }, { "epoch": 0.6653097570283679, "ewc_loss": 0.022600345313549042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.26003448915435e-05, "grad_norm": 15.10030746459961, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8478845357894897, "num_tokens": 199640101.0, "step": 5230 }, { "epoch": 0.6654369673069584, "ewc_loss": 0.022601252421736717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.260125256725587e-05, "grad_norm": 15.213042259216309, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8567334413528442, "num_tokens": 199680287.0, "step": 5231 }, { "epoch": 0.6655641775855489, "ewc_loss": 0.022604672238230705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2604672267334536e-05, "grad_norm": 15.084736824035645, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8588788509368896, "num_tokens": 199721958.0, "step": 5232 }, { "epoch": 0.6656913878641394, "ewc_loss": 0.022590365260839462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2590365915675648e-05, "grad_norm": 15.166427612304688, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8665547370910645, "num_tokens": 199756021.0, "step": 5233 }, { "epoch": 0.6658185981427299, "ewc_loss": 0.022618938237428665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2618938601226546e-05, "grad_norm": 15.177351951599121, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8647596836090088, "num_tokens": 199793586.0, "step": 5234 }, { "epoch": 0.6659458084213205, "ewc_loss": 0.022593585774302483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2593585526919924e-05, "grad_norm": 15.157744407653809, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8478062748908997, "num_tokens": 199835728.0, "step": 5235 }, { "epoch": 0.666073018699911, "ewc_loss": 0.022613486275076866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.261348708998412e-05, "grad_norm": 15.14716911315918, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8690851926803589, "num_tokens": 199873348.0, "step": 5236 }, { "epoch": 0.6662002289785015, "ewc_loss": 0.022584741935133934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2584741600439884e-05, "grad_norm": 15.135075569152832, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.860074520111084, "num_tokens": 199908223.0, "step": 5237 }, { "epoch": 0.6663274392570919, "ewc_loss": 0.022608453407883644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2608453946304508e-05, "grad_norm": 15.118460655212402, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8568555116653442, "num_tokens": 199947017.0, "step": 5238 }, { "epoch": 0.6664546495356825, "ewc_loss": 0.022603929042816162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2603928300668485e-05, "grad_norm": 15.141790390014648, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8687278032302856, "num_tokens": 199984806.0, "step": 5239 }, { "epoch": 0.666581859814273, "ewc_loss": 0.022600820288062096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2600819647777826e-05, "grad_norm": 15.161826133728027, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.854586660861969, "num_tokens": 200022600.0, "step": 5240 }, { "epoch": 0.6667090700928635, "ewc_loss": 0.022650327533483505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2650327082374133e-05, "grad_norm": 15.152092933654785, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8704505562782288, "num_tokens": 200058407.0, "step": 5241 }, { "epoch": 0.666836280371454, "ewc_loss": 0.022613318637013435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.261331792396959e-05, "grad_norm": 15.101943016052246, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.833514928817749, "num_tokens": 200101871.0, "step": 5242 }, { "epoch": 0.6669634906500446, "ewc_loss": 0.02259705774486065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2597057977691293e-05, "grad_norm": 15.104227066040039, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8610063195228577, "num_tokens": 200138501.0, "step": 5243 }, { "epoch": 0.667090700928635, "ewc_loss": 0.022668272256851196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2668271412840113e-05, "grad_norm": 15.166923522949219, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.859045147895813, "num_tokens": 200176639.0, "step": 5244 }, { "epoch": 0.6672179112072255, "ewc_loss": 0.022679312154650688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2679312678519636e-05, "grad_norm": 15.098691940307617, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8563142418861389, "num_tokens": 200211873.0, "step": 5245 }, { "epoch": 0.667345121485816, "ewc_loss": 0.022626522928476334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2626523787039332e-05, "grad_norm": 15.144676208496094, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.856385350227356, "num_tokens": 200246596.0, "step": 5246 }, { "epoch": 0.6674723317644066, "ewc_loss": 0.022707877680659294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.270787808811292e-05, "grad_norm": 15.152225494384766, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.8761301040649414, "num_tokens": 200282761.0, "step": 5247 }, { "epoch": 0.6675995420429971, "ewc_loss": 0.022649269551038742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2649270249530673e-05, "grad_norm": 15.120105743408203, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8452461361885071, "num_tokens": 200321821.0, "step": 5248 }, { "epoch": 0.6677267523215876, "ewc_loss": 0.022680019959807396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2680020265397616e-05, "grad_norm": 15.10219955444336, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8442457914352417, "num_tokens": 200369066.0, "step": 5249 }, { "epoch": 0.667853962600178, "ewc_loss": 0.022689640522003174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.268964090035297e-05, "grad_norm": 15.175530433654785, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.852414608001709, "num_tokens": 200408067.0, "step": 5250 }, { "epoch": 0.6679811728787686, "ewc_loss": 0.02271624468266964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.271624543936923e-05, "grad_norm": 15.131175994873047, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8771231770515442, "num_tokens": 200446693.0, "step": 5251 }, { "epoch": 0.6681083831573591, "ewc_loss": 0.022717656567692757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2717656975146383e-05, "grad_norm": 15.187813758850098, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8420639038085938, "num_tokens": 200485340.0, "step": 5252 }, { "epoch": 0.6682355934359496, "ewc_loss": 0.022715216502547264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2715215891366825e-05, "grad_norm": 15.108830451965332, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8577964305877686, "num_tokens": 200521978.0, "step": 5253 }, { "epoch": 0.6683628037145402, "ewc_loss": 0.022664109244942665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.26641095650848e-05, "grad_norm": 15.15483283996582, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8623014092445374, "num_tokens": 200560604.0, "step": 5254 }, { "epoch": 0.6684900139931307, "ewc_loss": 0.022749971598386765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2749971321900375e-05, "grad_norm": 15.136381149291992, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8561677932739258, "num_tokens": 200601718.0, "step": 5255 }, { "epoch": 0.6686172242717211, "ewc_loss": 0.022684846073389053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2684846044285223e-05, "grad_norm": 15.118691444396973, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.8357230424880981, "num_tokens": 200646186.0, "step": 5256 }, { "epoch": 0.6687444345503116, "ewc_loss": 0.022739244624972343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2739244741387665e-05, "grad_norm": 15.199771881103516, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8627927303314209, "num_tokens": 200684849.0, "step": 5257 }, { "epoch": 0.6688716448289022, "ewc_loss": 0.022711195051670074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2711195924784988e-05, "grad_norm": 15.058023452758789, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8646599054336548, "num_tokens": 200719542.0, "step": 5258 }, { "epoch": 0.6689988551074927, "ewc_loss": 0.022725196555256844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.272519668622408e-05, "grad_norm": 15.19921875, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8618206977844238, "num_tokens": 200756212.0, "step": 5259 }, { "epoch": 0.6691260653860832, "ewc_loss": 0.022807899862527847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.28079006774351e-05, "grad_norm": 15.141414642333984, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8633288741111755, "num_tokens": 200791184.0, "step": 5260 }, { "epoch": 0.6692532756646737, "ewc_loss": 0.02269480563700199, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2694805011269636e-05, "grad_norm": 15.153968811035156, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8440170288085938, "num_tokens": 200823966.0, "step": 5261 }, { "epoch": 0.6693804859432643, "ewc_loss": 0.022764183580875397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.276418308611028e-05, "grad_norm": 15.147664070129395, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8651809692382812, "num_tokens": 200862718.0, "step": 5262 }, { "epoch": 0.6695076962218547, "ewc_loss": 0.02271338552236557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2713385988026857e-05, "grad_norm": 15.118745803833008, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8557347059249878, "num_tokens": 200901755.0, "step": 5263 }, { "epoch": 0.6696349065004452, "ewc_loss": 0.022729596123099327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2729596821591258e-05, "grad_norm": 15.178483009338379, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.8713966608047485, "num_tokens": 200935788.0, "step": 5264 }, { "epoch": 0.6697621167790357, "ewc_loss": 0.022725438699126244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2725438611814752e-05, "grad_norm": 15.083893775939941, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8738899230957031, "num_tokens": 200975159.0, "step": 5265 }, { "epoch": 0.6698893270576263, "ewc_loss": 0.022737091407179832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2737091057933867e-05, "grad_norm": 15.125541687011719, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8729535341262817, "num_tokens": 201010813.0, "step": 5266 }, { "epoch": 0.6700165373362168, "ewc_loss": 0.022718988358974457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.271898847538978e-05, "grad_norm": 15.092717170715332, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8585753440856934, "num_tokens": 201051937.0, "step": 5267 }, { "epoch": 0.6701437476148073, "ewc_loss": 0.02277701161801815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2777012418373488e-05, "grad_norm": 15.168354988098145, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8665820956230164, "num_tokens": 201090926.0, "step": 5268 }, { "epoch": 0.6702709578933977, "ewc_loss": 0.022738628089427948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2738628103979863e-05, "grad_norm": 15.210993766784668, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8407135605812073, "num_tokens": 201126086.0, "step": 5269 }, { "epoch": 0.6703981681719883, "ewc_loss": 0.022771555930376053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.277155545016285e-05, "grad_norm": 15.125467300415039, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8495854139328003, "num_tokens": 201165933.0, "step": 5270 }, { "epoch": 0.6705253784505788, "ewc_loss": 0.022699452936649323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2699452529195696e-05, "grad_norm": 15.159171104431152, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8635770082473755, "num_tokens": 201198794.0, "step": 5271 }, { "epoch": 0.6706525887291693, "ewc_loss": 0.022725673392415047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.272567326144781e-05, "grad_norm": 15.14013385772705, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8519203662872314, "num_tokens": 201234081.0, "step": 5272 }, { "epoch": 0.6707797990077599, "ewc_loss": 0.022772792726755142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.277279236295726e-05, "grad_norm": 15.164885520935059, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8570679426193237, "num_tokens": 201271796.0, "step": 5273 }, { "epoch": 0.6709070092863504, "ewc_loss": 0.02272838167846203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.272838173666969e-05, "grad_norm": 15.135377883911133, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8422242403030396, "num_tokens": 201309526.0, "step": 5274 }, { "epoch": 0.6710342195649408, "ewc_loss": 0.02275994047522545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.275994120282121e-05, "grad_norm": 15.142080307006836, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8677152395248413, "num_tokens": 201344015.0, "step": 5275 }, { "epoch": 0.6711614298435313, "ewc_loss": 0.02275579236447811, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2755792087991722e-05, "grad_norm": 15.106741905212402, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8612423539161682, "num_tokens": 201385201.0, "step": 5276 }, { "epoch": 0.6712886401221219, "ewc_loss": 0.022749852389097214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2749853087589145e-05, "grad_norm": 15.18857479095459, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8654295206069946, "num_tokens": 201422370.0, "step": 5277 }, { "epoch": 0.6714158504007124, "ewc_loss": 0.022786099463701248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2786100089433603e-05, "grad_norm": 15.096059799194336, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8452281951904297, "num_tokens": 201467085.0, "step": 5278 }, { "epoch": 0.6715430606793029, "ewc_loss": 0.02272934652864933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.272934580105357e-05, "grad_norm": 15.117073059082031, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8744224905967712, "num_tokens": 201503432.0, "step": 5279 }, { "epoch": 0.6716702709578934, "ewc_loss": 0.022801559418439865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.280155968037434e-05, "grad_norm": 15.178909301757812, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.866850733757019, "num_tokens": 201541358.0, "step": 5280 }, { "epoch": 0.6717974812364839, "ewc_loss": 0.022771330550312996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.277132989547681e-05, "grad_norm": 15.152087211608887, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8577205538749695, "num_tokens": 201580235.0, "step": 5281 }, { "epoch": 0.6719246915150744, "ewc_loss": 0.02280310168862343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2803102183388546e-05, "grad_norm": 15.173859596252441, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8605964183807373, "num_tokens": 201627703.0, "step": 5282 }, { "epoch": 0.6720519017936649, "ewc_loss": 0.022710708901286125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2710708435624838e-05, "grad_norm": 15.097269058227539, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8627736568450928, "num_tokens": 201667182.0, "step": 5283 }, { "epoch": 0.6721791120722554, "ewc_loss": 0.02270357683300972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2703576178173535e-05, "grad_norm": 15.185150146484375, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8615190982818604, "num_tokens": 201699876.0, "step": 5284 }, { "epoch": 0.672306322350846, "ewc_loss": 0.02275959961116314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2759599232813343e-05, "grad_norm": 15.17576789855957, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8508185148239136, "num_tokens": 201732770.0, "step": 5285 }, { "epoch": 0.6724335326294365, "ewc_loss": 0.02271205745637417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.271205812576227e-05, "grad_norm": 15.144498825073242, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8615721464157104, "num_tokens": 201771388.0, "step": 5286 }, { "epoch": 0.6725607429080269, "ewc_loss": 0.02273888699710369, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2738886400475167e-05, "grad_norm": 15.161587715148926, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8546226024627686, "num_tokens": 201809283.0, "step": 5287 }, { "epoch": 0.6726879531866174, "ewc_loss": 0.02270488440990448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2704884031554684e-05, "grad_norm": 15.10423755645752, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8663820624351501, "num_tokens": 201854950.0, "step": 5288 }, { "epoch": 0.672815163465208, "ewc_loss": 0.02275894023478031, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.275894075864926e-05, "grad_norm": 15.24331283569336, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8638063669204712, "num_tokens": 201895638.0, "step": 5289 }, { "epoch": 0.6729423737437985, "ewc_loss": 0.022732693701982498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2732694560545497e-05, "grad_norm": 15.091686248779297, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8469957113265991, "num_tokens": 201928243.0, "step": 5290 }, { "epoch": 0.673069584022389, "ewc_loss": 0.022677769884467125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.267777017550543e-05, "grad_norm": 15.20217227935791, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.859588086605072, "num_tokens": 201964687.0, "step": 5291 }, { "epoch": 0.6731967943009796, "ewc_loss": 0.02275867946445942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.275867882417515e-05, "grad_norm": 15.177162170410156, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.8728434443473816, "num_tokens": 201997166.0, "step": 5292 }, { "epoch": 0.67332400457957, "ewc_loss": 0.02272188849747181, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.272188794449903e-05, "grad_norm": 15.12237548828125, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8624935150146484, "num_tokens": 202030771.0, "step": 5293 }, { "epoch": 0.6734512148581605, "ewc_loss": 0.02276449091732502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2764490495319478e-05, "grad_norm": 15.139927864074707, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8526837825775146, "num_tokens": 202077280.0, "step": 5294 }, { "epoch": 0.673578425136751, "ewc_loss": 0.022775735706090927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.27757354878122e-05, "grad_norm": 15.217498779296875, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.852789044380188, "num_tokens": 202115273.0, "step": 5295 }, { "epoch": 0.6737056354153416, "ewc_loss": 0.022761236876249313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2761236323276535e-05, "grad_norm": 15.142502784729004, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8705105781555176, "num_tokens": 202145347.0, "step": 5296 }, { "epoch": 0.6738328456939321, "ewc_loss": 0.02275054156780243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.275054248457309e-05, "grad_norm": 15.198156356811523, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8654249310493469, "num_tokens": 202184621.0, "step": 5297 }, { "epoch": 0.6739600559725226, "ewc_loss": 0.022816283628344536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2816284399596043e-05, "grad_norm": 15.170607566833496, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8630660772323608, "num_tokens": 202230099.0, "step": 5298 }, { "epoch": 0.674087266251113, "ewc_loss": 0.02269490621984005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2694906874676235e-05, "grad_norm": 15.070481300354004, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8548544049263, "num_tokens": 202267872.0, "step": 5299 }, { "epoch": 0.6742144765297036, "ewc_loss": 0.022771699354052544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.277169915032573e-05, "grad_norm": 15.206560134887695, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8588640689849854, "num_tokens": 202305517.0, "step": 5300 }, { "epoch": 0.6743416868082941, "ewc_loss": 0.0228162482380867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2816248019807972e-05, "grad_norm": 15.198731422424316, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8473423719406128, "num_tokens": 202340671.0, "step": 5301 }, { "epoch": 0.6744688970868846, "ewc_loss": 0.02273465506732464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.273465543112252e-05, "grad_norm": 15.099113464355469, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8682999014854431, "num_tokens": 202379552.0, "step": 5302 }, { "epoch": 0.6745961073654752, "ewc_loss": 0.022785300388932228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2785299734096043e-05, "grad_norm": 15.242077827453613, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8550145626068115, "num_tokens": 202421404.0, "step": 5303 }, { "epoch": 0.6747233176440657, "ewc_loss": 0.022829938679933548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.282993955304846e-05, "grad_norm": 15.159948348999023, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8458547592163086, "num_tokens": 202463765.0, "step": 5304 }, { "epoch": 0.6748505279226561, "ewc_loss": 0.022725533694028854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2725533199263737e-05, "grad_norm": 15.10119915008545, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8560699224472046, "num_tokens": 202505872.0, "step": 5305 }, { "epoch": 0.6749777382012466, "ewc_loss": 0.022763147950172424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.276314808113966e-05, "grad_norm": 15.303278923034668, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8579025864601135, "num_tokens": 202544085.0, "step": 5306 }, { "epoch": 0.6751049484798372, "ewc_loss": 0.022803040221333504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2803040337748826e-05, "grad_norm": 15.189385414123535, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8565922975540161, "num_tokens": 202588237.0, "step": 5307 }, { "epoch": 0.6752321587584277, "ewc_loss": 0.022688763216137886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2688762328471057e-05, "grad_norm": 15.251893997192383, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8531566262245178, "num_tokens": 202628462.0, "step": 5308 }, { "epoch": 0.6753593690370182, "ewc_loss": 0.022724786773324013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2724787413608283e-05, "grad_norm": 15.230101585388184, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8619579076766968, "num_tokens": 202668149.0, "step": 5309 }, { "epoch": 0.6754865793156087, "ewc_loss": 0.022701792418956757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2701791749568656e-05, "grad_norm": 15.230932235717773, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8752341866493225, "num_tokens": 202708651.0, "step": 5310 }, { "epoch": 0.6756137895941993, "ewc_loss": 0.022672079503536224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2672080376651138e-05, "grad_norm": 15.19507122039795, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.864616870880127, "num_tokens": 202738613.0, "step": 5311 }, { "epoch": 0.6757409998727897, "ewc_loss": 0.02266872301697731, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2668722522212192e-05, "grad_norm": 15.224260330200195, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8563848733901978, "num_tokens": 202779279.0, "step": 5312 }, { "epoch": 0.6758682101513802, "ewc_loss": 0.022716442123055458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2716441890224814e-05, "grad_norm": 15.178145408630371, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8605624437332153, "num_tokens": 202812095.0, "step": 5313 }, { "epoch": 0.6759954204299707, "ewc_loss": 0.02269251085817814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2692511265631765e-05, "grad_norm": 15.12060832977295, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8557010889053345, "num_tokens": 202855705.0, "step": 5314 }, { "epoch": 0.6761226307085613, "ewc_loss": 0.0226726233959198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2672624254482798e-05, "grad_norm": 15.196955680847168, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8431418538093567, "num_tokens": 202894180.0, "step": 5315 }, { "epoch": 0.6762498409871518, "ewc_loss": 0.022756434977054596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2756434191251174e-05, "grad_norm": 15.196950912475586, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8668919205665588, "num_tokens": 202925842.0, "step": 5316 }, { "epoch": 0.6763770512657423, "ewc_loss": 0.022700849920511246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.270084951305762e-05, "grad_norm": 15.17111873626709, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8660539984703064, "num_tokens": 202959103.0, "step": 5317 }, { "epoch": 0.6765042615443327, "ewc_loss": 0.0227240901440382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2724090740666725e-05, "grad_norm": 15.12748908996582, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.860514760017395, "num_tokens": 203000393.0, "step": 5318 }, { "epoch": 0.6766314718229233, "ewc_loss": 0.02273632027208805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2736319806426764e-05, "grad_norm": 15.177512168884277, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8604716062545776, "num_tokens": 203041613.0, "step": 5319 }, { "epoch": 0.6767586821015138, "ewc_loss": 0.02279032953083515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2790329239796847e-05, "grad_norm": 15.170270919799805, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8744988441467285, "num_tokens": 203073744.0, "step": 5320 }, { "epoch": 0.6768858923801043, "ewc_loss": 0.022768894210457802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2768894268665463e-05, "grad_norm": 15.148223876953125, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8448882699012756, "num_tokens": 203108350.0, "step": 5321 }, { "epoch": 0.6770131026586949, "ewc_loss": 0.022794507443904877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2794507458456792e-05, "grad_norm": 15.229109764099121, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8633145093917847, "num_tokens": 203139149.0, "step": 5322 }, { "epoch": 0.6771403129372854, "ewc_loss": 0.022813137620687485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.281313754792791e-05, "grad_norm": 15.183571815490723, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8429359197616577, "num_tokens": 203173953.0, "step": 5323 }, { "epoch": 0.6772675232158758, "ewc_loss": 0.022784553468227386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.278455394844059e-05, "grad_norm": 15.15575885772705, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8706369400024414, "num_tokens": 203208915.0, "step": 5324 }, { "epoch": 0.6773947334944663, "ewc_loss": 0.02280925028026104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.280925036757253e-05, "grad_norm": 15.163532257080078, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8482043147087097, "num_tokens": 203251906.0, "step": 5325 }, { "epoch": 0.6775219437730569, "ewc_loss": 0.022848106920719147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2848107619211078e-05, "grad_norm": 15.232848167419434, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8605291247367859, "num_tokens": 203289460.0, "step": 5326 }, { "epoch": 0.6776491540516474, "ewc_loss": 0.022898830473423004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2898830138728954e-05, "grad_norm": 15.208754539489746, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8682796955108643, "num_tokens": 203325647.0, "step": 5327 }, { "epoch": 0.6777763643302379, "ewc_loss": 0.02281228080391884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.281228080391884e-05, "grad_norm": 15.162571907043457, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8555322885513306, "num_tokens": 203364972.0, "step": 5328 }, { "epoch": 0.6779035746088284, "ewc_loss": 0.022867154330015182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2867154257255606e-05, "grad_norm": 15.163704872131348, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8602898120880127, "num_tokens": 203407778.0, "step": 5329 }, { "epoch": 0.6780307848874189, "ewc_loss": 0.022829202935099602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2829202862340026e-05, "grad_norm": 15.161783218383789, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8604904413223267, "num_tokens": 203443820.0, "step": 5330 }, { "epoch": 0.6781579951660094, "ewc_loss": 0.022838791832327843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.283879257447552e-05, "grad_norm": 15.188949584960938, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8656165599822998, "num_tokens": 203475742.0, "step": 5331 }, { "epoch": 0.6782852054445999, "ewc_loss": 0.022900979965925217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2900980184203945e-05, "grad_norm": 15.22545337677002, "learning_rate": 1e-06, "loss": 0.5185, "mean_token_accuracy": 0.8342580199241638, "num_tokens": 203512617.0, "step": 5332 }, { "epoch": 0.6784124157231904, "ewc_loss": 0.022835716605186462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2835716663394123e-05, "grad_norm": 15.155179977416992, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8441928625106812, "num_tokens": 203546778.0, "step": 5333 }, { "epoch": 0.678539626001781, "ewc_loss": 0.022855980321764946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2855980205349624e-05, "grad_norm": 15.199283599853516, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8617241382598877, "num_tokens": 203585222.0, "step": 5334 }, { "epoch": 0.6786668362803715, "ewc_loss": 0.02284502238035202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2845022613182664e-05, "grad_norm": 15.134860038757324, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8539956212043762, "num_tokens": 203627014.0, "step": 5335 }, { "epoch": 0.6787940465589619, "ewc_loss": 0.022831514477729797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2831514797871932e-05, "grad_norm": 15.187033653259277, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.866550862789154, "num_tokens": 203659189.0, "step": 5336 }, { "epoch": 0.6789212568375524, "ewc_loss": 0.022866809740662575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2866810468258336e-05, "grad_norm": 15.211564064025879, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.871442437171936, "num_tokens": 203695355.0, "step": 5337 }, { "epoch": 0.679048467116143, "ewc_loss": 0.022853294387459755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2853293558000587e-05, "grad_norm": 15.181539535522461, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8681325912475586, "num_tokens": 203733446.0, "step": 5338 }, { "epoch": 0.6791756773947335, "ewc_loss": 0.0228559747338295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2855974748381414e-05, "grad_norm": 15.206263542175293, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8443096876144409, "num_tokens": 203775387.0, "step": 5339 }, { "epoch": 0.679302887673324, "ewc_loss": 0.02284979447722435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2849793822388165e-05, "grad_norm": 15.208831787109375, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.855266809463501, "num_tokens": 203810713.0, "step": 5340 }, { "epoch": 0.6794300979519146, "ewc_loss": 0.022849734872579575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.284973561472725e-05, "grad_norm": 15.165545463562012, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.872482419013977, "num_tokens": 203847144.0, "step": 5341 }, { "epoch": 0.679557308230505, "ewc_loss": 0.02283903770148754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2839038138044998e-05, "grad_norm": 15.142395973205566, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8654788136482239, "num_tokens": 203889716.0, "step": 5342 }, { "epoch": 0.6796845185090955, "ewc_loss": 0.022848928347229958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2848927983432077e-05, "grad_norm": 15.168293952941895, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8459354043006897, "num_tokens": 203927692.0, "step": 5343 }, { "epoch": 0.679811728787686, "ewc_loss": 0.022864442318677902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.286444214405492e-05, "grad_norm": 15.168096542358398, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8460778594017029, "num_tokens": 203960180.0, "step": 5344 }, { "epoch": 0.6799389390662766, "ewc_loss": 0.022847352549433708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2847352738608606e-05, "grad_norm": 15.203577995300293, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8712739944458008, "num_tokens": 203997375.0, "step": 5345 }, { "epoch": 0.6800661493448671, "ewc_loss": 0.022886255756020546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.288625546498224e-05, "grad_norm": 15.184514999389648, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8563353419303894, "num_tokens": 204037588.0, "step": 5346 }, { "epoch": 0.6801933596234576, "ewc_loss": 0.022862369194626808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2862368496134877e-05, "grad_norm": 15.13666820526123, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8525142669677734, "num_tokens": 204080993.0, "step": 5347 }, { "epoch": 0.680320569902048, "ewc_loss": 0.022867638617753983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.286763810843695e-05, "grad_norm": 15.219436645507812, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8621915578842163, "num_tokens": 204116863.0, "step": 5348 }, { "epoch": 0.6804477801806386, "ewc_loss": 0.02282620780169964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2826206986792386e-05, "grad_norm": 15.179363250732422, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8537521362304688, "num_tokens": 204156551.0, "step": 5349 }, { "epoch": 0.6805749904592291, "ewc_loss": 0.022853955626487732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2853955670143478e-05, "grad_norm": 15.18629264831543, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8529465794563293, "num_tokens": 204196210.0, "step": 5350 }, { "epoch": 0.6807022007378196, "ewc_loss": 0.02280682697892189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2806827473687008e-05, "grad_norm": 15.197715759277344, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8583220839500427, "num_tokens": 204234791.0, "step": 5351 }, { "epoch": 0.6808294110164101, "ewc_loss": 0.022853288799524307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2853288101032376e-05, "grad_norm": 15.153285026550293, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8628362417221069, "num_tokens": 204271631.0, "step": 5352 }, { "epoch": 0.6809566212950007, "ewc_loss": 0.02283870242536068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2838701625005342e-05, "grad_norm": 15.230449676513672, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8709549903869629, "num_tokens": 204311879.0, "step": 5353 }, { "epoch": 0.6810838315735911, "ewc_loss": 0.022837797179818153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.283779758727178e-05, "grad_norm": 15.169931411743164, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8632553815841675, "num_tokens": 204357123.0, "step": 5354 }, { "epoch": 0.6812110418521816, "ewc_loss": 0.022786136716604233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2786136469221674e-05, "grad_norm": 15.19284439086914, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8572060465812683, "num_tokens": 204397196.0, "step": 5355 }, { "epoch": 0.6813382521307721, "ewc_loss": 0.022814355790615082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2814356270828284e-05, "grad_norm": 15.205727577209473, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8498948812484741, "num_tokens": 204434654.0, "step": 5356 }, { "epoch": 0.6814654624093627, "ewc_loss": 0.022740675136446953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2740674467058852e-05, "grad_norm": 15.22380256652832, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8822457194328308, "num_tokens": 204466581.0, "step": 5357 }, { "epoch": 0.6815926726879532, "ewc_loss": 0.02277454547584057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.277454586874228e-05, "grad_norm": 15.18977165222168, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8722277879714966, "num_tokens": 204503890.0, "step": 5358 }, { "epoch": 0.6817198829665437, "ewc_loss": 0.022764580324292183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2764579625800252e-05, "grad_norm": 15.205673217773438, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.855686366558075, "num_tokens": 204537160.0, "step": 5359 }, { "epoch": 0.6818470932451343, "ewc_loss": 0.022835148498415947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2835149138700217e-05, "grad_norm": 15.225927352905273, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8710933327674866, "num_tokens": 204570077.0, "step": 5360 }, { "epoch": 0.6819743035237247, "ewc_loss": 0.022747281938791275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.274728103657253e-05, "grad_norm": 15.142058372497559, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8759960532188416, "num_tokens": 204603638.0, "step": 5361 }, { "epoch": 0.6821015138023152, "ewc_loss": 0.022837597876787186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.283759749843739e-05, "grad_norm": 15.257652282714844, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8553968667984009, "num_tokens": 204644584.0, "step": 5362 }, { "epoch": 0.6822287240809057, "ewc_loss": 0.0228312686085701, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2831269234302454e-05, "grad_norm": 15.113499641418457, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8455526828765869, "num_tokens": 204678459.0, "step": 5363 }, { "epoch": 0.6823559343594963, "ewc_loss": 0.022817697376012802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2817697754362598e-05, "grad_norm": 15.268904685974121, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8570712804794312, "num_tokens": 204712923.0, "step": 5364 }, { "epoch": 0.6824831446380868, "ewc_loss": 0.022934334352612495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2934334992896765e-05, "grad_norm": 15.16399097442627, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8423359990119934, "num_tokens": 204751303.0, "step": 5365 }, { "epoch": 0.6826103549166773, "ewc_loss": 0.02279805764555931, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2798058125772513e-05, "grad_norm": 15.207857131958008, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.860280454158783, "num_tokens": 204791685.0, "step": 5366 }, { "epoch": 0.6827375651952677, "ewc_loss": 0.022904224693775177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.290422526129987e-05, "grad_norm": 15.198463439941406, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8610905408859253, "num_tokens": 204827748.0, "step": 5367 }, { "epoch": 0.6828647754738583, "ewc_loss": 0.022835226729512215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.283522735524457e-05, "grad_norm": 15.120593070983887, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8756614923477173, "num_tokens": 204866568.0, "step": 5368 }, { "epoch": 0.6829919857524488, "ewc_loss": 0.02290390059351921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2903899662196636e-05, "grad_norm": 15.269522666931152, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8595114350318909, "num_tokens": 204908334.0, "step": 5369 }, { "epoch": 0.6831191960310393, "ewc_loss": 0.022959286347031593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2959286070545204e-05, "grad_norm": 15.195393562316895, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8630640506744385, "num_tokens": 204949929.0, "step": 5370 }, { "epoch": 0.6832464063096299, "ewc_loss": 0.02286490797996521, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2864907805342227e-05, "grad_norm": 15.200380325317383, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8503007292747498, "num_tokens": 204991435.0, "step": 5371 }, { "epoch": 0.6833736165882204, "ewc_loss": 0.02290540561079979, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2905405785422772e-05, "grad_norm": 15.195556640625, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8663475513458252, "num_tokens": 205028434.0, "step": 5372 }, { "epoch": 0.6835008268668108, "ewc_loss": 0.022886456921696663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2886457372806035e-05, "grad_norm": 15.244058609008789, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8547723293304443, "num_tokens": 205064138.0, "step": 5373 }, { "epoch": 0.6836280371454013, "ewc_loss": 0.0228781346231699, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2878135496284813e-05, "grad_norm": 15.151418685913086, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8539215922355652, "num_tokens": 205104058.0, "step": 5374 }, { "epoch": 0.6837552474239919, "ewc_loss": 0.022874312475323677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.287431198055856e-05, "grad_norm": 15.277047157287598, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8584960699081421, "num_tokens": 205142609.0, "step": 5375 }, { "epoch": 0.6838824577025824, "ewc_loss": 0.02288120426237583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2881204131408595e-05, "grad_norm": 15.17968463897705, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8707077503204346, "num_tokens": 205180407.0, "step": 5376 }, { "epoch": 0.6840096679811729, "ewc_loss": 0.022796612232923508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2796612029196694e-05, "grad_norm": 15.172955513000488, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8807438611984253, "num_tokens": 205217340.0, "step": 5377 }, { "epoch": 0.6841368782597634, "ewc_loss": 0.022870153188705444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2870153770782053e-05, "grad_norm": 15.1969633102417, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8480013608932495, "num_tokens": 205263671.0, "step": 5378 }, { "epoch": 0.6842640885383539, "ewc_loss": 0.02281329408288002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2813293981016614e-05, "grad_norm": 15.165302276611328, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8571406006813049, "num_tokens": 205302970.0, "step": 5379 }, { "epoch": 0.6843912988169444, "ewc_loss": 0.02285907417535782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2859074306325056e-05, "grad_norm": 15.232465744018555, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.864163875579834, "num_tokens": 205342247.0, "step": 5380 }, { "epoch": 0.6845185090955349, "ewc_loss": 0.022851238027215004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.285123809997458e-05, "grad_norm": 15.264020919799805, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8443588614463806, "num_tokens": 205374897.0, "step": 5381 }, { "epoch": 0.6846457193741254, "ewc_loss": 0.022869853302836418, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.286985363753047e-05, "grad_norm": 15.230456352233887, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8567339181900024, "num_tokens": 205411645.0, "step": 5382 }, { "epoch": 0.684772929652716, "ewc_loss": 0.022828850895166397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.282885179738514e-05, "grad_norm": 15.236349105834961, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8594177961349487, "num_tokens": 205450972.0, "step": 5383 }, { "epoch": 0.6849001399313065, "ewc_loss": 0.022859878838062286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2859878299641423e-05, "grad_norm": 15.191282272338867, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8612771034240723, "num_tokens": 205494314.0, "step": 5384 }, { "epoch": 0.6850273502098969, "ewc_loss": 0.022832190617918968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.283219146193005e-05, "grad_norm": 15.227330207824707, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8470918536186218, "num_tokens": 205534696.0, "step": 5385 }, { "epoch": 0.6851545604884874, "ewc_loss": 0.022877296432852745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2877296942169778e-05, "grad_norm": 15.192037582397461, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8724666833877563, "num_tokens": 205575579.0, "step": 5386 }, { "epoch": 0.685281770767078, "ewc_loss": 0.02281680703163147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.281680644955486e-05, "grad_norm": 15.202129364013672, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8635454773902893, "num_tokens": 205609249.0, "step": 5387 }, { "epoch": 0.6854089810456685, "ewc_loss": 0.022825315594673157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2825315681984648e-05, "grad_norm": 15.216480255126953, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8572245240211487, "num_tokens": 205644135.0, "step": 5388 }, { "epoch": 0.685536191324259, "ewc_loss": 0.02286163717508316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2861637262394652e-05, "grad_norm": 15.27730655670166, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8532457947731018, "num_tokens": 205685312.0, "step": 5389 }, { "epoch": 0.6856634016028496, "ewc_loss": 0.022854480892419815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2854481358081102e-05, "grad_norm": 15.207103729248047, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8547472953796387, "num_tokens": 205720525.0, "step": 5390 }, { "epoch": 0.68579061188144, "ewc_loss": 0.022828135639429092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2828135115560144e-05, "grad_norm": 15.203713417053223, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8409723043441772, "num_tokens": 205763358.0, "step": 5391 }, { "epoch": 0.6859178221600305, "ewc_loss": 0.022870954126119614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2870954126119614e-05, "grad_norm": 15.186933517456055, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8636305928230286, "num_tokens": 205801413.0, "step": 5392 }, { "epoch": 0.686045032438621, "ewc_loss": 0.02286382205784321, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.286382186866831e-05, "grad_norm": 15.240347862243652, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8613775968551636, "num_tokens": 205840442.0, "step": 5393 }, { "epoch": 0.6861722427172116, "ewc_loss": 0.022935325279831886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2935324523132294e-05, "grad_norm": 15.19604778289795, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8739184141159058, "num_tokens": 205877319.0, "step": 5394 }, { "epoch": 0.6862994529958021, "ewc_loss": 0.02288062684237957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.288062751176767e-05, "grad_norm": 15.308724403381348, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8584963083267212, "num_tokens": 205916186.0, "step": 5395 }, { "epoch": 0.6864266632743926, "ewc_loss": 0.02289287932217121, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2892878405400552e-05, "grad_norm": 15.155925750732422, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8498435020446777, "num_tokens": 205956336.0, "step": 5396 }, { "epoch": 0.686553873552983, "ewc_loss": 0.022820882499217987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2820882804808207e-05, "grad_norm": 15.232012748718262, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8580691814422607, "num_tokens": 205989570.0, "step": 5397 }, { "epoch": 0.6866810838315736, "ewc_loss": 0.02299494668841362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.299494735780172e-05, "grad_norm": 15.229985237121582, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.851337730884552, "num_tokens": 206026139.0, "step": 5398 }, { "epoch": 0.6868082941101641, "ewc_loss": 0.02286033146083355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2860331228002906e-05, "grad_norm": 15.180326461791992, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.853908896446228, "num_tokens": 206070059.0, "step": 5399 }, { "epoch": 0.6869355043887546, "ewc_loss": 0.022897163406014442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2897163944435306e-05, "grad_norm": 15.264957427978516, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8633749485015869, "num_tokens": 206104645.0, "step": 5400 }, { "epoch": 0.6870627146673451, "ewc_loss": 0.022938795387744904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.293879515491426e-05, "grad_norm": 15.164104461669922, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8574276566505432, "num_tokens": 206141220.0, "step": 5401 }, { "epoch": 0.6871899249459357, "ewc_loss": 0.02285543829202652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2855438146507367e-05, "grad_norm": 15.257665634155273, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8571626543998718, "num_tokens": 206177917.0, "step": 5402 }, { "epoch": 0.6873171352245261, "ewc_loss": 0.02294653281569481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2946533135836944e-05, "grad_norm": 15.230879783630371, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8594076633453369, "num_tokens": 206213419.0, "step": 5403 }, { "epoch": 0.6874443455031166, "ewc_loss": 0.022841330617666245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2841330064693466e-05, "grad_norm": 15.181315422058105, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8435947895050049, "num_tokens": 206252421.0, "step": 5404 }, { "epoch": 0.6875715557817071, "ewc_loss": 0.022906819358468056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2906819140189327e-05, "grad_norm": 15.218307495117188, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8643354773521423, "num_tokens": 206287195.0, "step": 5405 }, { "epoch": 0.6876987660602977, "ewc_loss": 0.022913383319973946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2913383872946724e-05, "grad_norm": 15.191773414611816, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8403929471969604, "num_tokens": 206328339.0, "step": 5406 }, { "epoch": 0.6878259763388882, "ewc_loss": 0.022881092503666878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2881093173054978e-05, "grad_norm": 15.200427055358887, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8443126678466797, "num_tokens": 206369523.0, "step": 5407 }, { "epoch": 0.6879531866174787, "ewc_loss": 0.022897042334079742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.289704207214527e-05, "grad_norm": 15.111213684082031, "learning_rate": 1e-06, "loss": 0.369, "mean_token_accuracy": 0.8788815140724182, "num_tokens": 206402864.0, "step": 5408 }, { "epoch": 0.6880803968960693, "ewc_loss": 0.022911852225661278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.291185228386894e-05, "grad_norm": 15.213205337524414, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8485010862350464, "num_tokens": 206439004.0, "step": 5409 }, { "epoch": 0.6882076071746597, "ewc_loss": 0.022995933890342712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2995933250058442e-05, "grad_norm": 15.206746101379395, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8617821931838989, "num_tokens": 206472320.0, "step": 5410 }, { "epoch": 0.6883348174532502, "ewc_loss": 0.02295743115246296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2957430701353587e-05, "grad_norm": 15.260942459106445, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8524062037467957, "num_tokens": 206518649.0, "step": 5411 }, { "epoch": 0.6884620277318407, "ewc_loss": 0.022941239178180695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2941239876672626e-05, "grad_norm": 15.156351089477539, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8736133575439453, "num_tokens": 206551318.0, "step": 5412 }, { "epoch": 0.6885892380104313, "ewc_loss": 0.022909941151738167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2909940526005812e-05, "grad_norm": 15.25178337097168, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8479464054107666, "num_tokens": 206588173.0, "step": 5413 }, { "epoch": 0.6887164482890218, "ewc_loss": 0.022993052378296852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2993051970843226e-05, "grad_norm": 15.204930305480957, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8592358231544495, "num_tokens": 206624013.0, "step": 5414 }, { "epoch": 0.6888436585676123, "ewc_loss": 0.02296227216720581, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2962272851145826e-05, "grad_norm": 15.221245765686035, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8670845031738281, "num_tokens": 206668180.0, "step": 5415 }, { "epoch": 0.6889708688462027, "ewc_loss": 0.023018335923552513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3018335923552513e-05, "grad_norm": 15.212509155273438, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8547941446304321, "num_tokens": 206710619.0, "step": 5416 }, { "epoch": 0.6890980791247933, "ewc_loss": 0.022915463894605637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.291546479682438e-05, "grad_norm": 15.182134628295898, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8646094799041748, "num_tokens": 206750121.0, "step": 5417 }, { "epoch": 0.6892252894033838, "ewc_loss": 0.022954830899834633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.295483136549592e-05, "grad_norm": 15.21828556060791, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8708711862564087, "num_tokens": 206789538.0, "step": 5418 }, { "epoch": 0.6893524996819743, "ewc_loss": 0.022934522479772568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.293452234880533e-05, "grad_norm": 15.135419845581055, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8591805696487427, "num_tokens": 206827513.0, "step": 5419 }, { "epoch": 0.6894797099605648, "ewc_loss": 0.022951576858758926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2951577193452977e-05, "grad_norm": 15.200035095214844, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8666313886642456, "num_tokens": 206866563.0, "step": 5420 }, { "epoch": 0.6896069202391554, "ewc_loss": 0.022984398528933525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2984399038250558e-05, "grad_norm": 15.193424224853516, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8695213198661804, "num_tokens": 206906281.0, "step": 5421 }, { "epoch": 0.6897341305177458, "ewc_loss": 0.022956043481826782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2956042812438682e-05, "grad_norm": 15.20602798461914, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8409671783447266, "num_tokens": 206944055.0, "step": 5422 }, { "epoch": 0.6898613407963363, "ewc_loss": 0.02292180247604847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2921802155906335e-05, "grad_norm": 15.186028480529785, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.859978199005127, "num_tokens": 206984421.0, "step": 5423 }, { "epoch": 0.6899885510749268, "ewc_loss": 0.02296769991517067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2967700715526007e-05, "grad_norm": 15.241430282592773, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8489434719085693, "num_tokens": 207020319.0, "step": 5424 }, { "epoch": 0.6901157613535174, "ewc_loss": 0.022933823987841606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.293382385687437e-05, "grad_norm": 15.181646347045898, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8623156547546387, "num_tokens": 207061487.0, "step": 5425 }, { "epoch": 0.6902429716321079, "ewc_loss": 0.02293279767036438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.293279794685077e-05, "grad_norm": 15.267929077148438, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8612635135650635, "num_tokens": 207098343.0, "step": 5426 }, { "epoch": 0.6903701819106984, "ewc_loss": 0.022950196638703346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2950196580495685e-05, "grad_norm": 15.196474075317383, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8561493754386902, "num_tokens": 207140381.0, "step": 5427 }, { "epoch": 0.6904973921892888, "ewc_loss": 0.022856850177049637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.285684968228452e-05, "grad_norm": 15.177165985107422, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.873002290725708, "num_tokens": 207173923.0, "step": 5428 }, { "epoch": 0.6906246024678794, "ewc_loss": 0.022957874462008476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2957874534768052e-05, "grad_norm": 15.242417335510254, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8666417598724365, "num_tokens": 207214840.0, "step": 5429 }, { "epoch": 0.6907518127464699, "ewc_loss": 0.022890200838446617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2890200852998532e-05, "grad_norm": 15.199642181396484, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8584204912185669, "num_tokens": 207247850.0, "step": 5430 }, { "epoch": 0.6908790230250604, "ewc_loss": 0.022887375205755234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2887375962454826e-05, "grad_norm": 15.164759635925293, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.860284686088562, "num_tokens": 207286698.0, "step": 5431 }, { "epoch": 0.691006233303651, "ewc_loss": 0.022869672626256943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2869673557579517e-05, "grad_norm": 15.123085975646973, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8541531562805176, "num_tokens": 207322470.0, "step": 5432 }, { "epoch": 0.6911334435822415, "ewc_loss": 0.022932235151529312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2932235879125074e-05, "grad_norm": 15.208819389343262, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8458054661750793, "num_tokens": 207361517.0, "step": 5433 }, { "epoch": 0.6912606538608319, "ewc_loss": 0.022922713309526443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.292271346959751e-05, "grad_norm": 15.16822338104248, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.868714451789856, "num_tokens": 207394504.0, "step": 5434 }, { "epoch": 0.6913878641394224, "ewc_loss": 0.022927047684788704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.292704812134616e-05, "grad_norm": 15.255589485168457, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8407494425773621, "num_tokens": 207430148.0, "step": 5435 }, { "epoch": 0.691515074418013, "ewc_loss": 0.022970348596572876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.297034916409757e-05, "grad_norm": 15.272531509399414, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8565414547920227, "num_tokens": 207461094.0, "step": 5436 }, { "epoch": 0.6916422846966035, "ewc_loss": 0.02297106571495533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2971065845922567e-05, "grad_norm": 15.251543998718262, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8664011359214783, "num_tokens": 207500572.0, "step": 5437 }, { "epoch": 0.691769494975194, "ewc_loss": 0.022955583408474922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2955582608119585e-05, "grad_norm": 15.243609428405762, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.852473258972168, "num_tokens": 207534977.0, "step": 5438 }, { "epoch": 0.6918967052537845, "ewc_loss": 0.022955575957894325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.295557533216197e-05, "grad_norm": 15.17402458190918, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8652913570404053, "num_tokens": 207569809.0, "step": 5439 }, { "epoch": 0.692023915532375, "ewc_loss": 0.02294854074716568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.294854130013846e-05, "grad_norm": 15.26994800567627, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8605589866638184, "num_tokens": 207606785.0, "step": 5440 }, { "epoch": 0.6921511258109655, "ewc_loss": 0.02301369234919548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.301369204360526e-05, "grad_norm": 15.179924964904785, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8576343655586243, "num_tokens": 207648344.0, "step": 5441 }, { "epoch": 0.692278336089556, "ewc_loss": 0.022964386269450188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2964386516832747e-05, "grad_norm": 15.26245403289795, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.867576003074646, "num_tokens": 207680962.0, "step": 5442 }, { "epoch": 0.6924055463681466, "ewc_loss": 0.023046663030982018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3046663045533933e-05, "grad_norm": 15.163583755493164, "learning_rate": 1e-06, "loss": 0.3313, "mean_token_accuracy": 0.8920010924339294, "num_tokens": 207709162.0, "step": 5443 }, { "epoch": 0.6925327566467371, "ewc_loss": 0.023006539791822433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3006539777270518e-05, "grad_norm": 15.244841575622559, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8556035757064819, "num_tokens": 207746089.0, "step": 5444 }, { "epoch": 0.6926599669253276, "ewc_loss": 0.02302880398929119, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.302880420756992e-05, "grad_norm": 15.15882682800293, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8705915212631226, "num_tokens": 207791235.0, "step": 5445 }, { "epoch": 0.692787177203918, "ewc_loss": 0.023009298369288445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3009299184195697e-05, "grad_norm": 15.335661888122559, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8508880138397217, "num_tokens": 207824527.0, "step": 5446 }, { "epoch": 0.6929143874825086, "ewc_loss": 0.02311220020055771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3112199414754286e-05, "grad_norm": 15.226806640625, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8701416254043579, "num_tokens": 207863535.0, "step": 5447 }, { "epoch": 0.6930415977610991, "ewc_loss": 0.02297370694577694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2973707018536516e-05, "grad_norm": 15.27768611907959, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8542341589927673, "num_tokens": 207899694.0, "step": 5448 }, { "epoch": 0.6931688080396896, "ewc_loss": 0.023069873452186584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3069873350323178e-05, "grad_norm": 15.271177291870117, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.851860761642456, "num_tokens": 207931440.0, "step": 5449 }, { "epoch": 0.6932960183182801, "ewc_loss": 0.023023296147584915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3023296307655983e-05, "grad_norm": 15.214569091796875, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8585571646690369, "num_tokens": 207963758.0, "step": 5450 }, { "epoch": 0.6934232285968707, "ewc_loss": 0.02306137979030609, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3061380488798022e-05, "grad_norm": 15.358036994934082, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8650600910186768, "num_tokens": 208002383.0, "step": 5451 }, { "epoch": 0.6935504388754611, "ewc_loss": 0.02303004264831543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3030042939353734e-05, "grad_norm": 15.237257957458496, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8636261224746704, "num_tokens": 208038723.0, "step": 5452 }, { "epoch": 0.6936776491540516, "ewc_loss": 0.023034337908029556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3034337573335506e-05, "grad_norm": 15.36028003692627, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8649507164955139, "num_tokens": 208071851.0, "step": 5453 }, { "epoch": 0.6938048594326421, "ewc_loss": 0.02303035743534565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3030357624520548e-05, "grad_norm": 15.210389137268066, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.8428694009780884, "num_tokens": 208106916.0, "step": 5454 }, { "epoch": 0.6939320697112327, "ewc_loss": 0.023017359897494316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.301735912624281e-05, "grad_norm": 15.458768844604492, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8702768087387085, "num_tokens": 208146883.0, "step": 5455 }, { "epoch": 0.6940592799898232, "ewc_loss": 0.023058757185935974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.305875750607811e-05, "grad_norm": 15.179019927978516, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8632922172546387, "num_tokens": 208187364.0, "step": 5456 }, { "epoch": 0.6941864902684137, "ewc_loss": 0.022915828973054886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.291582859470509e-05, "grad_norm": 15.233242988586426, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8678637742996216, "num_tokens": 208226087.0, "step": 5457 }, { "epoch": 0.6943137005470043, "ewc_loss": 0.02305571548640728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.305571615579538e-05, "grad_norm": 15.330923080444336, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8576960563659668, "num_tokens": 208265193.0, "step": 5458 }, { "epoch": 0.6944409108255947, "ewc_loss": 0.023000001907348633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3000002329354174e-05, "grad_norm": 15.150131225585938, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8737730979919434, "num_tokens": 208305163.0, "step": 5459 }, { "epoch": 0.6945681211041852, "ewc_loss": 0.022963400930166245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2963400624576025e-05, "grad_norm": 15.273958206176758, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8493849039077759, "num_tokens": 208342229.0, "step": 5460 }, { "epoch": 0.6946953313827757, "ewc_loss": 0.023038484156131744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.303848486917559e-05, "grad_norm": 15.276253700256348, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8510104417800903, "num_tokens": 208384606.0, "step": 5461 }, { "epoch": 0.6948225416613663, "ewc_loss": 0.022962911054491997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.296291131642647e-05, "grad_norm": 15.291540145874023, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8798705339431763, "num_tokens": 208416777.0, "step": 5462 }, { "epoch": 0.6949497519399568, "ewc_loss": 0.023014593869447708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.301459426234942e-05, "grad_norm": 15.188114166259766, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8452819585800171, "num_tokens": 208456101.0, "step": 5463 }, { "epoch": 0.6950769622185473, "ewc_loss": 0.0229561198502779, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.295611920999363e-05, "grad_norm": 15.277629852294922, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8706316947937012, "num_tokens": 208489201.0, "step": 5464 }, { "epoch": 0.6952041724971377, "ewc_loss": 0.023034673184156418, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3034674086375162e-05, "grad_norm": 15.306535720825195, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8746720552444458, "num_tokens": 208525006.0, "step": 5465 }, { "epoch": 0.6953313827757283, "ewc_loss": 0.0229655634611845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.296556340297684e-05, "grad_norm": 15.243288040161133, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8581427335739136, "num_tokens": 208568997.0, "step": 5466 }, { "epoch": 0.6954585930543188, "ewc_loss": 0.02294696681201458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2946966055314988e-05, "grad_norm": 15.215381622314453, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8522900342941284, "num_tokens": 208605925.0, "step": 5467 }, { "epoch": 0.6955858033329093, "ewc_loss": 0.02293935790657997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2939357222639956e-05, "grad_norm": 15.230900764465332, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8352108001708984, "num_tokens": 208646290.0, "step": 5468 }, { "epoch": 0.6957130136114998, "ewc_loss": 0.02299453876912594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2994538085185923e-05, "grad_norm": 15.246773719787598, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8612638711929321, "num_tokens": 208686424.0, "step": 5469 }, { "epoch": 0.6958402238900904, "ewc_loss": 0.022955354303121567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.295535523444414e-05, "grad_norm": 15.273615837097168, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8627327084541321, "num_tokens": 208719643.0, "step": 5470 }, { "epoch": 0.6959674341686808, "ewc_loss": 0.022967204451560974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.296720413141884e-05, "grad_norm": 15.217811584472656, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8440381288528442, "num_tokens": 208760804.0, "step": 5471 }, { "epoch": 0.6960946444472713, "ewc_loss": 0.022962095215916634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.296209459018428e-05, "grad_norm": 15.241864204406738, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8549527525901794, "num_tokens": 208803022.0, "step": 5472 }, { "epoch": 0.6962218547258618, "ewc_loss": 0.022955285385251045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2955286112846807e-05, "grad_norm": 15.25576114654541, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8611904382705688, "num_tokens": 208844205.0, "step": 5473 }, { "epoch": 0.6963490650044524, "ewc_loss": 0.022898724302649498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2898724637343548e-05, "grad_norm": 15.21513843536377, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8707926273345947, "num_tokens": 208886882.0, "step": 5474 }, { "epoch": 0.6964762752830429, "ewc_loss": 0.022893913090229034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.289391341037117e-05, "grad_norm": 15.232858657836914, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8575482368469238, "num_tokens": 208926649.0, "step": 5475 }, { "epoch": 0.6966034855616334, "ewc_loss": 0.022929999977350235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2930000341148116e-05, "grad_norm": 15.210603713989258, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8543900847434998, "num_tokens": 208963403.0, "step": 5476 }, { "epoch": 0.6967306958402238, "ewc_loss": 0.022926000878214836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.292600038344972e-05, "grad_norm": 15.2804536819458, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8523785471916199, "num_tokens": 209002475.0, "step": 5477 }, { "epoch": 0.6968579061188144, "ewc_loss": 0.022928647696971893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2928647013031878e-05, "grad_norm": 15.210906028747559, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8599562644958496, "num_tokens": 209044216.0, "step": 5478 }, { "epoch": 0.6969851163974049, "ewc_loss": 0.022908464074134827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2908463506610133e-05, "grad_norm": 15.24544906616211, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8499437570571899, "num_tokens": 209080774.0, "step": 5479 }, { "epoch": 0.6971123266759954, "ewc_loss": 0.022929998114705086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2929998522158712e-05, "grad_norm": 15.24029541015625, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.866135835647583, "num_tokens": 209117167.0, "step": 5480 }, { "epoch": 0.697239536954586, "ewc_loss": 0.0229484885931015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2948488549445756e-05, "grad_norm": 15.284860610961914, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8539340496063232, "num_tokens": 209148872.0, "step": 5481 }, { "epoch": 0.6973667472331765, "ewc_loss": 0.02293494902551174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2934949811315164e-05, "grad_norm": 15.164337158203125, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8624910116195679, "num_tokens": 209187912.0, "step": 5482 }, { "epoch": 0.6974939575117669, "ewc_loss": 0.022932976484298706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2932976207812317e-05, "grad_norm": 15.21658706665039, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8674625158309937, "num_tokens": 209227055.0, "step": 5483 }, { "epoch": 0.6976211677903574, "ewc_loss": 0.02303173579275608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3031736418488435e-05, "grad_norm": 15.254331588745117, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.8739359378814697, "num_tokens": 209255343.0, "step": 5484 }, { "epoch": 0.697748378068948, "ewc_loss": 0.022991986945271492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.299198786204215e-05, "grad_norm": 15.188844680786133, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8535974621772766, "num_tokens": 209294621.0, "step": 5485 }, { "epoch": 0.6978755883475385, "ewc_loss": 0.023011470213532448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.301147105754353e-05, "grad_norm": 15.206795692443848, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8755288124084473, "num_tokens": 209336519.0, "step": 5486 }, { "epoch": 0.698002798626129, "ewc_loss": 0.02305489592254162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.305489579157438e-05, "grad_norm": 15.229704856872559, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8571926355361938, "num_tokens": 209372913.0, "step": 5487 }, { "epoch": 0.6981300089047195, "ewc_loss": 0.023055696859955788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.305569614691194e-05, "grad_norm": 15.226655960083008, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8510107398033142, "num_tokens": 209417768.0, "step": 5488 }, { "epoch": 0.69825721918331, "ewc_loss": 0.023046474903821945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3046475689625368e-05, "grad_norm": 15.267996788024902, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8624880313873291, "num_tokens": 209451221.0, "step": 5489 }, { "epoch": 0.6983844294619005, "ewc_loss": 0.023072320967912674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.307232171006035e-05, "grad_norm": 15.258544921875, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8591258525848389, "num_tokens": 209488710.0, "step": 5490 }, { "epoch": 0.698511639740491, "ewc_loss": 0.02308892272412777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.308892180735711e-05, "grad_norm": 15.274431228637695, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8575922250747681, "num_tokens": 209525166.0, "step": 5491 }, { "epoch": 0.6986388500190815, "ewc_loss": 0.02305220067501068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3052200049278326e-05, "grad_norm": 15.256216049194336, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8626448512077332, "num_tokens": 209564804.0, "step": 5492 }, { "epoch": 0.6987660602976721, "ewc_loss": 0.02303369529545307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3033695470076054e-05, "grad_norm": 15.269808769226074, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8640706539154053, "num_tokens": 209607155.0, "step": 5493 }, { "epoch": 0.6988932705762626, "ewc_loss": 0.023064009845256805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3064008928486146e-05, "grad_norm": 15.363324165344238, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8491771221160889, "num_tokens": 209641226.0, "step": 5494 }, { "epoch": 0.699020480854853, "ewc_loss": 0.023006238043308258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.300623782502953e-05, "grad_norm": 15.255878448486328, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8571270108222961, "num_tokens": 209679180.0, "step": 5495 }, { "epoch": 0.6991476911334435, "ewc_loss": 0.023016750812530518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3016751583782025e-05, "grad_norm": 15.218780517578125, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8515833616256714, "num_tokens": 209716412.0, "step": 5496 }, { "epoch": 0.6992749014120341, "ewc_loss": 0.022996552288532257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2996551706455648e-05, "grad_norm": 15.254971504211426, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8513728976249695, "num_tokens": 209755697.0, "step": 5497 }, { "epoch": 0.6994021116906246, "ewc_loss": 0.02302475832402706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3024758775136434e-05, "grad_norm": 15.254566192626953, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8670006394386292, "num_tokens": 209793654.0, "step": 5498 }, { "epoch": 0.6995293219692151, "ewc_loss": 0.023026905953884125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3026905182632618e-05, "grad_norm": 15.264102935791016, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8583956956863403, "num_tokens": 209830703.0, "step": 5499 }, { "epoch": 0.6996565322478057, "ewc_loss": 0.02301018312573433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.301018321304582e-05, "grad_norm": 15.27295970916748, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.860326886177063, "num_tokens": 209868994.0, "step": 5500 }, { "epoch": 0.6997837425263961, "ewc_loss": 0.023011157289147377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.301115819136612e-05, "grad_norm": 15.23594856262207, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8694987297058105, "num_tokens": 209909605.0, "step": 5501 }, { "epoch": 0.6999109528049866, "ewc_loss": 0.022997353225946426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.299735388078261e-05, "grad_norm": 15.241047859191895, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8666180372238159, "num_tokens": 209945196.0, "step": 5502 }, { "epoch": 0.7000381630835771, "ewc_loss": 0.023025913164019585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3025913833407685e-05, "grad_norm": 15.306018829345703, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8608121871948242, "num_tokens": 209988585.0, "step": 5503 }, { "epoch": 0.7001653733621677, "ewc_loss": 0.02303970232605934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3039701773086563e-05, "grad_norm": 15.28764820098877, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8592105507850647, "num_tokens": 210032084.0, "step": 5504 }, { "epoch": 0.7002925836407582, "ewc_loss": 0.022966621443629265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2966622054809704e-05, "grad_norm": 15.263436317443848, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8493676781654358, "num_tokens": 210070639.0, "step": 5505 }, { "epoch": 0.7004197939193487, "ewc_loss": 0.023005004972219467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3005004550213926e-05, "grad_norm": 15.327986717224121, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8676265478134155, "num_tokens": 210113793.0, "step": 5506 }, { "epoch": 0.7005470041979391, "ewc_loss": 0.02298625186085701, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.298625258845277e-05, "grad_norm": 15.268196105957031, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8519599437713623, "num_tokens": 210153517.0, "step": 5507 }, { "epoch": 0.7006742144765297, "ewc_loss": 0.02294471301138401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2944712327443995e-05, "grad_norm": 15.365269660949707, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8546892404556274, "num_tokens": 210186818.0, "step": 5508 }, { "epoch": 0.7008014247551202, "ewc_loss": 0.022948503494262695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2948503101360984e-05, "grad_norm": 15.272729873657227, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8523447513580322, "num_tokens": 210224283.0, "step": 5509 }, { "epoch": 0.7009286350337107, "ewc_loss": 0.022905811667442322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2905811420059763e-05, "grad_norm": 15.286884307861328, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8545480966567993, "num_tokens": 210263439.0, "step": 5510 }, { "epoch": 0.7010558453123013, "ewc_loss": 0.022975414991378784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2975415049586445e-05, "grad_norm": 15.248074531555176, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8625257611274719, "num_tokens": 210305887.0, "step": 5511 }, { "epoch": 0.7011830555908918, "ewc_loss": 0.022885307669639587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2885307771502994e-05, "grad_norm": 15.218424797058105, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8588402271270752, "num_tokens": 210340118.0, "step": 5512 }, { "epoch": 0.7013102658694823, "ewc_loss": 0.023010727018117905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.301072709087748e-05, "grad_norm": 15.391047477722168, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8615898489952087, "num_tokens": 210376098.0, "step": 5513 }, { "epoch": 0.7014374761480727, "ewc_loss": 0.02296261675655842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2962616640143096e-05, "grad_norm": 15.24085807800293, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8635648488998413, "num_tokens": 210418801.0, "step": 5514 }, { "epoch": 0.7015646864266633, "ewc_loss": 0.02294612117111683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2946122044231743e-05, "grad_norm": 15.290767669677734, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8498218655586243, "num_tokens": 210452392.0, "step": 5515 }, { "epoch": 0.7016918967052538, "ewc_loss": 0.023000577464699745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3000577130005695e-05, "grad_norm": 15.27843189239502, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8672515749931335, "num_tokens": 210489697.0, "step": 5516 }, { "epoch": 0.7018191069838443, "ewc_loss": 0.022966625168919563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.296662569278851e-05, "grad_norm": 15.323834419250488, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8448456525802612, "num_tokens": 210530212.0, "step": 5517 }, { "epoch": 0.7019463172624348, "ewc_loss": 0.02292732335627079, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2927322788746096e-05, "grad_norm": 15.26024341583252, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8657258749008179, "num_tokens": 210564974.0, "step": 5518 }, { "epoch": 0.7020735275410254, "ewc_loss": 0.02296474762260914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2964748495724052e-05, "grad_norm": 15.295708656311035, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8609288930892944, "num_tokens": 210601272.0, "step": 5519 }, { "epoch": 0.7022007378196158, "ewc_loss": 0.02299429289996624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2994292521616444e-05, "grad_norm": 15.249269485473633, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8622673749923706, "num_tokens": 210636908.0, "step": 5520 }, { "epoch": 0.7023279480982063, "ewc_loss": 0.022985350340604782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2985350369708613e-05, "grad_norm": 15.277393341064453, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8568679094314575, "num_tokens": 210672629.0, "step": 5521 }, { "epoch": 0.7024551583767968, "ewc_loss": 0.02303566411137581, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.303566361661069e-05, "grad_norm": 15.270133018493652, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8745225667953491, "num_tokens": 210703531.0, "step": 5522 }, { "epoch": 0.7025823686553874, "ewc_loss": 0.02302849106490612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.302849134139251e-05, "grad_norm": 15.255952835083008, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8736757040023804, "num_tokens": 210742946.0, "step": 5523 }, { "epoch": 0.7027095789339779, "ewc_loss": 0.02301318570971489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3013186364551075e-05, "grad_norm": 15.246554374694824, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8684957027435303, "num_tokens": 210778345.0, "step": 5524 }, { "epoch": 0.7028367892125684, "ewc_loss": 0.023014463484287262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3014463295112364e-05, "grad_norm": 15.301520347595215, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8661226630210876, "num_tokens": 210821544.0, "step": 5525 }, { "epoch": 0.7029639994911588, "ewc_loss": 0.02303311973810196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.303311885043513e-05, "grad_norm": 15.255158424377441, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8636398911476135, "num_tokens": 210858060.0, "step": 5526 }, { "epoch": 0.7030912097697494, "ewc_loss": 0.023027779534459114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3027780116535723e-05, "grad_norm": 15.275300979614258, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8550438284873962, "num_tokens": 210900365.0, "step": 5527 }, { "epoch": 0.7032184200483399, "ewc_loss": 0.02304522693157196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.304522604390513e-05, "grad_norm": 15.213207244873047, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.855797529220581, "num_tokens": 210941118.0, "step": 5528 }, { "epoch": 0.7033456303269304, "ewc_loss": 0.023038484156131744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.303848486917559e-05, "grad_norm": 15.271857261657715, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8676875829696655, "num_tokens": 210979865.0, "step": 5529 }, { "epoch": 0.703472840605521, "ewc_loss": 0.02304387278854847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3043872715788893e-05, "grad_norm": 15.266993522644043, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8599328994750977, "num_tokens": 211016385.0, "step": 5530 }, { "epoch": 0.7036000508841115, "ewc_loss": 0.023037051782011986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3037051505525596e-05, "grad_norm": 15.277726173400879, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8483012318611145, "num_tokens": 211059071.0, "step": 5531 }, { "epoch": 0.7037272611627019, "ewc_loss": 0.023031506687402725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.303150722582359e-05, "grad_norm": 15.291820526123047, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8625303506851196, "num_tokens": 211094189.0, "step": 5532 }, { "epoch": 0.7038544714412924, "ewc_loss": 0.023053938522934914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3053939003148116e-05, "grad_norm": 15.26990795135498, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8738864660263062, "num_tokens": 211130524.0, "step": 5533 }, { "epoch": 0.703981681719883, "ewc_loss": 0.023026376962661743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.302637767570559e-05, "grad_norm": 15.335500717163086, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8538787961006165, "num_tokens": 211172335.0, "step": 5534 }, { "epoch": 0.7041088919984735, "ewc_loss": 0.023061072453856468, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3061073079588823e-05, "grad_norm": 15.315189361572266, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8619449138641357, "num_tokens": 211205406.0, "step": 5535 }, { "epoch": 0.704236102277064, "ewc_loss": 0.022999925538897514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2999925931799226e-05, "grad_norm": 15.27214527130127, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8561549782752991, "num_tokens": 211246443.0, "step": 5536 }, { "epoch": 0.7043633125556545, "ewc_loss": 0.02304827980697155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3048280127113685e-05, "grad_norm": 15.293411254882812, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8745449781417847, "num_tokens": 211286707.0, "step": 5537 }, { "epoch": 0.704490522834245, "ewc_loss": 0.022976502776145935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2976502805249766e-05, "grad_norm": 15.287216186523438, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8715099096298218, "num_tokens": 211319448.0, "step": 5538 }, { "epoch": 0.7046177331128355, "ewc_loss": 0.023032497614622116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3032496756059118e-05, "grad_norm": 15.348854064941406, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.854936420917511, "num_tokens": 211363747.0, "step": 5539 }, { "epoch": 0.704744943391426, "ewc_loss": 0.02297965995967388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.297966057085432e-05, "grad_norm": 15.320676803588867, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8671540021896362, "num_tokens": 211408080.0, "step": 5540 }, { "epoch": 0.7048721536700165, "ewc_loss": 0.022952359169721603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.29523593588965e-05, "grad_norm": 15.305624008178711, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.850218653678894, "num_tokens": 211445656.0, "step": 5541 }, { "epoch": 0.7049993639486071, "ewc_loss": 0.02302475832402706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3024758775136434e-05, "grad_norm": 15.317764282226562, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8700000643730164, "num_tokens": 211484342.0, "step": 5542 }, { "epoch": 0.7051265742271976, "ewc_loss": 0.022989382967352867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2989383069216274e-05, "grad_norm": 15.32396125793457, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8559086322784424, "num_tokens": 211518535.0, "step": 5543 }, { "epoch": 0.705253784505788, "ewc_loss": 0.02297915890812874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2979158529778942e-05, "grad_norm": 15.34359073638916, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8537996411323547, "num_tokens": 211557236.0, "step": 5544 }, { "epoch": 0.7053809947843785, "ewc_loss": 0.022937441244721413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2937441826798022e-05, "grad_norm": 15.250852584838867, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8708688020706177, "num_tokens": 211599216.0, "step": 5545 }, { "epoch": 0.7055082050629691, "ewc_loss": 0.02295856550335884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.29585657507414e-05, "grad_norm": 15.272265434265137, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8426579833030701, "num_tokens": 211638930.0, "step": 5546 }, { "epoch": 0.7056354153415596, "ewc_loss": 0.0229178499430418, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.291784949193243e-05, "grad_norm": 15.20539665222168, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.852889358997345, "num_tokens": 211680765.0, "step": 5547 }, { "epoch": 0.7057626256201501, "ewc_loss": 0.02297527715563774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2975276806391776e-05, "grad_norm": 15.370316505432129, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8737685680389404, "num_tokens": 211719037.0, "step": 5548 }, { "epoch": 0.7058898358987407, "ewc_loss": 0.023004166781902313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.300416599609889e-05, "grad_norm": 15.354403495788574, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8549549579620361, "num_tokens": 211759655.0, "step": 5549 }, { "epoch": 0.7060170461773311, "ewc_loss": 0.02295948565006256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2959486159379594e-05, "grad_norm": 15.287105560302734, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.843051552772522, "num_tokens": 211800405.0, "step": 5550 }, { "epoch": 0.7061442564559216, "ewc_loss": 0.022931978106498718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.293197758262977e-05, "grad_norm": 15.327325820922852, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8421468138694763, "num_tokens": 211846057.0, "step": 5551 }, { "epoch": 0.7062714667345121, "ewc_loss": 0.022932643070816994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2932643332751468e-05, "grad_norm": 15.230566024780273, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.857647180557251, "num_tokens": 211885944.0, "step": 5552 }, { "epoch": 0.7063986770131027, "ewc_loss": 0.022938668727874756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2938669644645415e-05, "grad_norm": 15.286845207214355, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8636131286621094, "num_tokens": 211928727.0, "step": 5553 }, { "epoch": 0.7065258872916932, "ewc_loss": 0.022952018305659294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2952019207878038e-05, "grad_norm": 15.217179298400879, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8498489260673523, "num_tokens": 211967593.0, "step": 5554 }, { "epoch": 0.7066530975702837, "ewc_loss": 0.022968187928199768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2968188204686157e-05, "grad_norm": 15.297019004821777, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8550081253051758, "num_tokens": 212001023.0, "step": 5555 }, { "epoch": 0.7067803078488741, "ewc_loss": 0.02302289381623268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.30228943109978e-05, "grad_norm": 15.272809982299805, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8522999882698059, "num_tokens": 212045611.0, "step": 5556 }, { "epoch": 0.7069075181274647, "ewc_loss": 0.0229628998786211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.296290040249005e-05, "grad_norm": 15.254984855651855, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8553100824356079, "num_tokens": 212087167.0, "step": 5557 }, { "epoch": 0.7070347284060552, "ewc_loss": 0.023009363561868668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.300936284882482e-05, "grad_norm": 15.305852890014648, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8615428805351257, "num_tokens": 212124254.0, "step": 5558 }, { "epoch": 0.7071619386846457, "ewc_loss": 0.023024121299386024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3024122128845192e-05, "grad_norm": 15.304854393005371, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8542142510414124, "num_tokens": 212162706.0, "step": 5559 }, { "epoch": 0.7072891489632362, "ewc_loss": 0.02297661267220974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2976611944613978e-05, "grad_norm": 15.330684661865234, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8592425584793091, "num_tokens": 212197359.0, "step": 5560 }, { "epoch": 0.7074163592418268, "ewc_loss": 0.022992726415395737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.299272637173999e-05, "grad_norm": 15.277874946594238, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8664032220840454, "num_tokens": 212237300.0, "step": 5561 }, { "epoch": 0.7075435695204173, "ewc_loss": 0.022951392456889153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2951391656533815e-05, "grad_norm": 15.261563301086426, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.840535044670105, "num_tokens": 212283472.0, "step": 5562 }, { "epoch": 0.7076707797990077, "ewc_loss": 0.023027310147881508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.302731081726961e-05, "grad_norm": 15.35273265838623, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8541126251220703, "num_tokens": 212321433.0, "step": 5563 }, { "epoch": 0.7077979900775982, "ewc_loss": 0.023026447743177414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3026448616292328e-05, "grad_norm": 15.255578994750977, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8574002385139465, "num_tokens": 212362234.0, "step": 5564 }, { "epoch": 0.7079252003561888, "ewc_loss": 0.022995924577116966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2995924155111425e-05, "grad_norm": 15.375492095947266, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8503890037536621, "num_tokens": 212399259.0, "step": 5565 }, { "epoch": 0.7080524106347793, "ewc_loss": 0.023038312792778015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3038312065182254e-05, "grad_norm": 15.288969993591309, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.859905481338501, "num_tokens": 212438489.0, "step": 5566 }, { "epoch": 0.7081796209133698, "ewc_loss": 0.022955825552344322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.295582635269966e-05, "grad_norm": 15.26917552947998, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8506968021392822, "num_tokens": 212482283.0, "step": 5567 }, { "epoch": 0.7083068311919604, "ewc_loss": 0.02303943783044815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.303943801962305e-05, "grad_norm": 15.311448097229004, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8542693853378296, "num_tokens": 212522749.0, "step": 5568 }, { "epoch": 0.7084340414705508, "ewc_loss": 0.023000292479991913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3000291548669338e-05, "grad_norm": 15.244542121887207, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8512033224105835, "num_tokens": 212559937.0, "step": 5569 }, { "epoch": 0.7085612517491413, "ewc_loss": 0.023013601079583168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3013601094135083e-05, "grad_norm": 15.345318794250488, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8651372194290161, "num_tokens": 212595578.0, "step": 5570 }, { "epoch": 0.7086884620277318, "ewc_loss": 0.023055549710989, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3055548808770254e-05, "grad_norm": 15.322808265686035, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8708440065383911, "num_tokens": 212634085.0, "step": 5571 }, { "epoch": 0.7088156723063224, "ewc_loss": 0.022974370047450066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.297437094966881e-05, "grad_norm": 15.335244178771973, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8486337661743164, "num_tokens": 212673557.0, "step": 5572 }, { "epoch": 0.7089428825849129, "ewc_loss": 0.022999653592705727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2999653083388694e-05, "grad_norm": 15.257850646972656, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8693780303001404, "num_tokens": 212711142.0, "step": 5573 }, { "epoch": 0.7090700928635034, "ewc_loss": 0.022970661520957947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.297066203027498e-05, "grad_norm": 15.31204891204834, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8590812683105469, "num_tokens": 212745570.0, "step": 5574 }, { "epoch": 0.7091973031420938, "ewc_loss": 0.022997254505753517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2997253836365417e-05, "grad_norm": 15.253029823303223, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8631986379623413, "num_tokens": 212783048.0, "step": 5575 }, { "epoch": 0.7093245134206844, "ewc_loss": 0.022976871579885483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2976872060098685e-05, "grad_norm": 15.23511791229248, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8768887519836426, "num_tokens": 212821231.0, "step": 5576 }, { "epoch": 0.7094517236992749, "ewc_loss": 0.02302313782274723, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3023138055577874e-05, "grad_norm": 15.307511329650879, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8569953441619873, "num_tokens": 212861715.0, "step": 5577 }, { "epoch": 0.7095789339778654, "ewc_loss": 0.023034557700157166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3034557671053335e-05, "grad_norm": 15.279959678649902, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.849651575088501, "num_tokens": 212900655.0, "step": 5578 }, { "epoch": 0.709706144256456, "ewc_loss": 0.023004405200481415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.300440610270016e-05, "grad_norm": 15.26379108428955, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8506183624267578, "num_tokens": 212939308.0, "step": 5579 }, { "epoch": 0.7098333545350465, "ewc_loss": 0.023001641035079956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.300164123880677e-05, "grad_norm": 15.185097694396973, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.862809956073761, "num_tokens": 212981174.0, "step": 5580 }, { "epoch": 0.7099605648136369, "ewc_loss": 0.023032357916235924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3032358512864448e-05, "grad_norm": 15.349405288696289, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8694258332252502, "num_tokens": 213016401.0, "step": 5581 }, { "epoch": 0.7100877750922274, "ewc_loss": 0.023074723780155182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3074724595062435e-05, "grad_norm": 15.27197265625, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8478065729141235, "num_tokens": 213058480.0, "step": 5582 }, { "epoch": 0.710214985370818, "ewc_loss": 0.023030297830700874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.303029759787023e-05, "grad_norm": 15.353401184082031, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8722540140151978, "num_tokens": 213093527.0, "step": 5583 }, { "epoch": 0.7103421956494085, "ewc_loss": 0.023075150325894356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3075150238582864e-05, "grad_norm": 15.286325454711914, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8566973805427551, "num_tokens": 213126778.0, "step": 5584 }, { "epoch": 0.710469405927999, "ewc_loss": 0.022995878010988235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.2995878680376336e-05, "grad_norm": 15.275087356567383, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8622775077819824, "num_tokens": 213170863.0, "step": 5585 }, { "epoch": 0.7105966162065895, "ewc_loss": 0.023090781643986702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3090780814527534e-05, "grad_norm": 15.277978897094727, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8354675769805908, "num_tokens": 213211336.0, "step": 5586 }, { "epoch": 0.71072382648518, "ewc_loss": 0.02306896634399891, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.306896567461081e-05, "grad_norm": 15.326598167419434, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8669332265853882, "num_tokens": 213245262.0, "step": 5587 }, { "epoch": 0.7108510367637705, "ewc_loss": 0.023070702329277992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3070702809491195e-05, "grad_norm": 15.275819778442383, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8509719371795654, "num_tokens": 213282150.0, "step": 5588 }, { "epoch": 0.710978247042361, "ewc_loss": 0.023052621632814407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.305262205481995e-05, "grad_norm": 15.313919067382812, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8642197847366333, "num_tokens": 213325573.0, "step": 5589 }, { "epoch": 0.7111054573209515, "ewc_loss": 0.023089852184057236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3089851310942322e-05, "grad_norm": 15.300597190856934, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8680535554885864, "num_tokens": 213363738.0, "step": 5590 }, { "epoch": 0.7112326675995421, "ewc_loss": 0.023085232824087143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.308523289684672e-05, "grad_norm": 15.30024242401123, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8656473159790039, "num_tokens": 213404796.0, "step": 5591 }, { "epoch": 0.7113598778781326, "ewc_loss": 0.023072434589266777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.307243448740337e-05, "grad_norm": 15.337215423583984, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8524311780929565, "num_tokens": 213436711.0, "step": 5592 }, { "epoch": 0.711487088156723, "ewc_loss": 0.02309144102036953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.309144110768102e-05, "grad_norm": 15.336170196533203, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8620981574058533, "num_tokens": 213473458.0, "step": 5593 }, { "epoch": 0.7116142984353135, "ewc_loss": 0.023108502849936485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.310850322828628e-05, "grad_norm": 15.320517539978027, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8704597353935242, "num_tokens": 213507355.0, "step": 5594 }, { "epoch": 0.7117415087139041, "ewc_loss": 0.023106073960661888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3106074877432548e-05, "grad_norm": 15.329959869384766, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8423123359680176, "num_tokens": 213543771.0, "step": 5595 }, { "epoch": 0.7118687189924946, "ewc_loss": 0.023099413141608238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3099413738236763e-05, "grad_norm": 15.366571426391602, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8573570847511292, "num_tokens": 213578317.0, "step": 5596 }, { "epoch": 0.7119959292710851, "ewc_loss": 0.023109979927539825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.310998024768196e-05, "grad_norm": 15.320172309875488, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8549842238426208, "num_tokens": 213615795.0, "step": 5597 }, { "epoch": 0.7121231395496757, "ewc_loss": 0.02308032475411892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3080325263435952e-05, "grad_norm": 15.305838584899902, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.868379533290863, "num_tokens": 213655452.0, "step": 5598 }, { "epoch": 0.7122503498282661, "ewc_loss": 0.0231171902269125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3117190721677616e-05, "grad_norm": 15.3362455368042, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8505975008010864, "num_tokens": 213688079.0, "step": 5599 }, { "epoch": 0.7123775601068566, "ewc_loss": 0.02311651036143303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.311651041964069e-05, "grad_norm": 15.338851928710938, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.875612735748291, "num_tokens": 213725310.0, "step": 5600 }, { "epoch": 0.7125047703854471, "ewc_loss": 0.023127315565943718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3127315216697752e-05, "grad_norm": 15.257526397705078, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8532140254974365, "num_tokens": 213757950.0, "step": 5601 }, { "epoch": 0.7126319806640377, "ewc_loss": 0.023127155378460884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.312715514563024e-05, "grad_norm": 15.32930850982666, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8467646241188049, "num_tokens": 213794146.0, "step": 5602 }, { "epoch": 0.7127591909426282, "ewc_loss": 0.023169750347733498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3169750420493074e-05, "grad_norm": 15.272889137268066, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8634414672851562, "num_tokens": 213834922.0, "step": 5603 }, { "epoch": 0.7128864012212187, "ewc_loss": 0.023121047765016556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3121046979213133e-05, "grad_norm": 15.332209587097168, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.87752366065979, "num_tokens": 213879651.0, "step": 5604 }, { "epoch": 0.7130136114998091, "ewc_loss": 0.02320944145321846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3209440769278444e-05, "grad_norm": 15.283966064453125, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8584681749343872, "num_tokens": 213921833.0, "step": 5605 }, { "epoch": 0.7131408217783997, "ewc_loss": 0.02314135991036892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.314135963388253e-05, "grad_norm": 15.331711769104004, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8649711608886719, "num_tokens": 213962909.0, "step": 5606 }, { "epoch": 0.7132680320569902, "ewc_loss": 0.023189770057797432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.31897702178685e-05, "grad_norm": 15.285907745361328, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8615918159484863, "num_tokens": 214002030.0, "step": 5607 }, { "epoch": 0.7133952423355807, "ewc_loss": 0.023126667365431786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.312666765647009e-05, "grad_norm": 15.282144546508789, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8521692156791687, "num_tokens": 214037094.0, "step": 5608 }, { "epoch": 0.7135224526141712, "ewc_loss": 0.023165171965956688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.316517202416435e-05, "grad_norm": 15.317593574523926, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8637668490409851, "num_tokens": 214076649.0, "step": 5609 }, { "epoch": 0.7136496628927618, "ewc_loss": 0.023143667727708817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.314366793143563e-05, "grad_norm": 15.353439331054688, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8558333516120911, "num_tokens": 214114194.0, "step": 5610 }, { "epoch": 0.7137768731713523, "ewc_loss": 0.02312707155942917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3127071472117677e-05, "grad_norm": 15.308175086975098, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8612221479415894, "num_tokens": 214149652.0, "step": 5611 }, { "epoch": 0.7139040834499427, "ewc_loss": 0.023122591897845268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3122591301216744e-05, "grad_norm": 15.307538032531738, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8697187900543213, "num_tokens": 214189068.0, "step": 5612 }, { "epoch": 0.7140312937285332, "ewc_loss": 0.02315996214747429, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3159962438512594e-05, "grad_norm": 15.327241897583008, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8476370573043823, "num_tokens": 214220251.0, "step": 5613 }, { "epoch": 0.7141585040071238, "ewc_loss": 0.023179998621344566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.317999860679265e-05, "grad_norm": 15.320021629333496, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8572261333465576, "num_tokens": 214256705.0, "step": 5614 }, { "epoch": 0.7142857142857143, "ewc_loss": 0.023124422878026962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3124423023546115e-05, "grad_norm": 15.233131408691406, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.873703122138977, "num_tokens": 214292563.0, "step": 5615 }, { "epoch": 0.7144129245643048, "ewc_loss": 0.02311498485505581, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3114984287531115e-05, "grad_norm": 15.221460342407227, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8540594577789307, "num_tokens": 214333843.0, "step": 5616 }, { "epoch": 0.7145401348428954, "ewc_loss": 0.023199250921607018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.319925079063978e-05, "grad_norm": 15.27241039276123, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8711435794830322, "num_tokens": 214372272.0, "step": 5617 }, { "epoch": 0.7146673451214858, "ewc_loss": 0.02318957820534706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3189577404991724e-05, "grad_norm": 15.268609046936035, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8663352727890015, "num_tokens": 214416364.0, "step": 5618 }, { "epoch": 0.7147945554000763, "ewc_loss": 0.023138675838708878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.31386766245123e-05, "grad_norm": 15.22244930267334, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8571428656578064, "num_tokens": 214453102.0, "step": 5619 }, { "epoch": 0.7149217656786668, "ewc_loss": 0.02318689599633217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3186896214610897e-05, "grad_norm": 15.308902740478516, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8640391826629639, "num_tokens": 214492374.0, "step": 5620 }, { "epoch": 0.7150489759572574, "ewc_loss": 0.02319621481001377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3196214897325262e-05, "grad_norm": 15.221040725708008, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.858975350856781, "num_tokens": 214530918.0, "step": 5621 }, { "epoch": 0.7151761862358479, "ewc_loss": 0.023168133571743965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.316813333891332e-05, "grad_norm": 15.295493125915527, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8763293027877808, "num_tokens": 214570227.0, "step": 5622 }, { "epoch": 0.7153033965144384, "ewc_loss": 0.023242056369781494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3242057068273425e-05, "grad_norm": 15.269989013671875, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8629487752914429, "num_tokens": 214609519.0, "step": 5623 }, { "epoch": 0.7154306067930288, "ewc_loss": 0.023204518482089043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.320451858395245e-05, "grad_norm": 15.381086349487305, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8572096228599548, "num_tokens": 214643564.0, "step": 5624 }, { "epoch": 0.7155578170716194, "ewc_loss": 0.02319658361375332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3196584152174182e-05, "grad_norm": 15.266849517822266, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8684927225112915, "num_tokens": 214677632.0, "step": 5625 }, { "epoch": 0.7156850273502099, "ewc_loss": 0.023192714899778366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.319271516171284e-05, "grad_norm": 15.280549049377441, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8622733354568481, "num_tokens": 214710363.0, "step": 5626 }, { "epoch": 0.7158122376288004, "ewc_loss": 0.023213351145386696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3213351596496068e-05, "grad_norm": 15.318381309509277, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8678028583526611, "num_tokens": 214746982.0, "step": 5627 }, { "epoch": 0.715939447907391, "ewc_loss": 0.02318531461060047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3185313693829812e-05, "grad_norm": 15.253275871276855, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.850723147392273, "num_tokens": 214786447.0, "step": 5628 }, { "epoch": 0.7160666581859815, "ewc_loss": 0.023190373554825783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3190374122350477e-05, "grad_norm": 15.31668472290039, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8684623837471008, "num_tokens": 214824627.0, "step": 5629 }, { "epoch": 0.7161938684645719, "ewc_loss": 0.02320338785648346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3203387172543444e-05, "grad_norm": 15.307705879211426, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.841758131980896, "num_tokens": 214863051.0, "step": 5630 }, { "epoch": 0.7163210787431624, "ewc_loss": 0.02322005107998848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3220050934469327e-05, "grad_norm": 15.302631378173828, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8470532298088074, "num_tokens": 214900524.0, "step": 5631 }, { "epoch": 0.716448289021753, "ewc_loss": 0.023233095183968544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3233094907482155e-05, "grad_norm": 15.309127807617188, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8654890060424805, "num_tokens": 214943949.0, "step": 5632 }, { "epoch": 0.7165754993003435, "ewc_loss": 0.02320977672934532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.32097772823181e-05, "grad_norm": 15.22616958618164, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8653359413146973, "num_tokens": 214979126.0, "step": 5633 }, { "epoch": 0.716702709578934, "ewc_loss": 0.023200448602437973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3200447685667314e-05, "grad_norm": 15.333149909973145, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8605800867080688, "num_tokens": 215017740.0, "step": 5634 }, { "epoch": 0.7168299198575245, "ewc_loss": 0.023272253572940826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.327225411136169e-05, "grad_norm": 15.29001235961914, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8687736988067627, "num_tokens": 215054472.0, "step": 5635 }, { "epoch": 0.716957130136115, "ewc_loss": 0.023171698674559593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.317169855814427e-05, "grad_norm": 15.372145652770996, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8635520339012146, "num_tokens": 215086894.0, "step": 5636 }, { "epoch": 0.7170843404147055, "ewc_loss": 0.02325104922056198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3251050151884556e-05, "grad_norm": 15.323963165283203, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8610458970069885, "num_tokens": 215128157.0, "step": 5637 }, { "epoch": 0.717211550693296, "ewc_loss": 0.02320588007569313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.32058791880263e-05, "grad_norm": 15.349583625793457, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8673985004425049, "num_tokens": 215161008.0, "step": 5638 }, { "epoch": 0.7173387609718865, "ewc_loss": 0.023280473425984383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3280474124476314e-05, "grad_norm": 15.361686706542969, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8439502716064453, "num_tokens": 215197157.0, "step": 5639 }, { "epoch": 0.7174659712504771, "ewc_loss": 0.023236166685819626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3236167180584744e-05, "grad_norm": 15.346794128417969, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8681821823120117, "num_tokens": 215240270.0, "step": 5640 }, { "epoch": 0.7175931815290676, "ewc_loss": 0.023219842463731766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.321984175068792e-05, "grad_norm": 15.284310340881348, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8678725957870483, "num_tokens": 215279828.0, "step": 5641 }, { "epoch": 0.717720391807658, "ewc_loss": 0.023247864097356796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3247863282449543e-05, "grad_norm": 15.394587516784668, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8550182580947876, "num_tokens": 215315271.0, "step": 5642 }, { "epoch": 0.7178476020862485, "ewc_loss": 0.02323330193758011, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.323330227227416e-05, "grad_norm": 15.324904441833496, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8626319169998169, "num_tokens": 215352621.0, "step": 5643 }, { "epoch": 0.7179748123648391, "ewc_loss": 0.02321510948240757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3215108740259893e-05, "grad_norm": 15.322917938232422, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8626261949539185, "num_tokens": 215390231.0, "step": 5644 }, { "epoch": 0.7181020226434296, "ewc_loss": 0.02322402037680149, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3224019969347864e-05, "grad_norm": 15.30639934539795, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8741655945777893, "num_tokens": 215426812.0, "step": 5645 }, { "epoch": 0.7182292329220201, "ewc_loss": 0.02319619618356228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3196196707431227e-05, "grad_norm": 15.33609676361084, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8500645160675049, "num_tokens": 215467551.0, "step": 5646 }, { "epoch": 0.7183564432006107, "ewc_loss": 0.02320229262113571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.320229214092251e-05, "grad_norm": 15.312725067138672, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8691885471343994, "num_tokens": 215508559.0, "step": 5647 }, { "epoch": 0.7184836534792011, "ewc_loss": 0.023245345801115036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3245345801115036e-05, "grad_norm": 15.38425350189209, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8536007404327393, "num_tokens": 215548709.0, "step": 5648 }, { "epoch": 0.7186108637577916, "ewc_loss": 0.023239582777023315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3239583242684603e-05, "grad_norm": 15.257390975952148, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8610171675682068, "num_tokens": 215593844.0, "step": 5649 }, { "epoch": 0.7187380740363821, "ewc_loss": 0.023159941658377647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3159942429629155e-05, "grad_norm": 15.354068756103516, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8662047386169434, "num_tokens": 215633853.0, "step": 5650 }, { "epoch": 0.7188652843149727, "ewc_loss": 0.023259826004505157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3259826775756665e-05, "grad_norm": 15.384209632873535, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8594260811805725, "num_tokens": 215667272.0, "step": 5651 }, { "epoch": 0.7189924945935632, "ewc_loss": 0.02321069873869419, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3210699509945698e-05, "grad_norm": 15.380528450012207, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8437868356704712, "num_tokens": 215705889.0, "step": 5652 }, { "epoch": 0.7191197048721537, "ewc_loss": 0.02316407486796379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.316407517355401e-05, "grad_norm": 15.323054313659668, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8692294359207153, "num_tokens": 215749143.0, "step": 5653 }, { "epoch": 0.7192469151507441, "ewc_loss": 0.023211177438497543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.321117790415883e-05, "grad_norm": 15.339436531066895, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8557762503623962, "num_tokens": 215787583.0, "step": 5654 }, { "epoch": 0.7193741254293347, "ewc_loss": 0.02317858301103115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3178583433036692e-05, "grad_norm": 15.299933433532715, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8614998459815979, "num_tokens": 215825552.0, "step": 5655 }, { "epoch": 0.7195013357079252, "ewc_loss": 0.023223625496029854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3223625248647295e-05, "grad_norm": 15.419952392578125, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8702095746994019, "num_tokens": 215858310.0, "step": 5656 }, { "epoch": 0.7196285459865157, "ewc_loss": 0.023181600496172905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3181601136457175e-05, "grad_norm": 15.366663932800293, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8499982357025146, "num_tokens": 215898403.0, "step": 5657 }, { "epoch": 0.7197557562651062, "ewc_loss": 0.023194771260023117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3194770619738847e-05, "grad_norm": 15.374151229858398, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8379601240158081, "num_tokens": 215935930.0, "step": 5658 }, { "epoch": 0.7198829665436968, "ewc_loss": 0.02316688746213913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3166887331171893e-05, "grad_norm": 15.34671401977539, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8578634262084961, "num_tokens": 215971509.0, "step": 5659 }, { "epoch": 0.7200101768222873, "ewc_loss": 0.02319915033876896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3199150746222585e-05, "grad_norm": 15.323164939880371, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8665072321891785, "num_tokens": 216008696.0, "step": 5660 }, { "epoch": 0.7201373871008777, "ewc_loss": 0.02316882647573948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3168826373876072e-05, "grad_norm": 15.329129219055176, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8606641888618469, "num_tokens": 216045861.0, "step": 5661 }, { "epoch": 0.7202645973794682, "ewc_loss": 0.023175479844212532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3175480237114243e-05, "grad_norm": 15.27487850189209, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.870093822479248, "num_tokens": 216081307.0, "step": 5662 }, { "epoch": 0.7203918076580588, "ewc_loss": 0.023203695192933083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3203694581752643e-05, "grad_norm": 15.378798484802246, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8700069189071655, "num_tokens": 216118506.0, "step": 5663 }, { "epoch": 0.7205190179366493, "ewc_loss": 0.023205669596791267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3205670004244894e-05, "grad_norm": 15.251814842224121, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8536407947540283, "num_tokens": 216159000.0, "step": 5664 }, { "epoch": 0.7206462282152398, "ewc_loss": 0.023207630962133408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3207630874821916e-05, "grad_norm": 15.381075859069824, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8537060022354126, "num_tokens": 216204712.0, "step": 5665 }, { "epoch": 0.7207734384938304, "ewc_loss": 0.023182423785328865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3182423319667578e-05, "grad_norm": 15.264411926269531, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.852950930595398, "num_tokens": 216235001.0, "step": 5666 }, { "epoch": 0.7209006487724208, "ewc_loss": 0.023202283307909966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.320228304597549e-05, "grad_norm": 15.353408813476562, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8544010519981384, "num_tokens": 216271383.0, "step": 5667 }, { "epoch": 0.7210278590510113, "ewc_loss": 0.023266535252332687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.326653520867694e-05, "grad_norm": 15.318796157836914, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.861440896987915, "num_tokens": 216307923.0, "step": 5668 }, { "epoch": 0.7211550693296018, "ewc_loss": 0.023227611556649208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3227610654430464e-05, "grad_norm": 15.250044822692871, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.8799616098403931, "num_tokens": 216345147.0, "step": 5669 }, { "epoch": 0.7212822796081924, "ewc_loss": 0.023223010823130608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3223010430228896e-05, "grad_norm": 15.293463706970215, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.852703869342804, "num_tokens": 216386991.0, "step": 5670 }, { "epoch": 0.7214094898867829, "ewc_loss": 0.023243572562932968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.324357228644658e-05, "grad_norm": 15.265647888183594, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8632756471633911, "num_tokens": 216425534.0, "step": 5671 }, { "epoch": 0.7215367001653734, "ewc_loss": 0.02325090579688549, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3250906451721676e-05, "grad_norm": 15.333643913269043, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8611850738525391, "num_tokens": 216466686.0, "step": 5672 }, { "epoch": 0.7216639104439638, "ewc_loss": 0.023250756785273552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3250757294590585e-05, "grad_norm": 15.246545791625977, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8723870515823364, "num_tokens": 216505245.0, "step": 5673 }, { "epoch": 0.7217911207225544, "ewc_loss": 0.02326924540102482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3269245502888225e-05, "grad_norm": 15.419051170349121, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8573826551437378, "num_tokens": 216538024.0, "step": 5674 }, { "epoch": 0.7219183310011449, "ewc_loss": 0.023316798731684685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3316799342865124e-05, "grad_norm": 15.305975914001465, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8597894310951233, "num_tokens": 216573722.0, "step": 5675 }, { "epoch": 0.7220455412797354, "ewc_loss": 0.023221541196107864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.322154068679083e-05, "grad_norm": 15.361066818237305, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8666229248046875, "num_tokens": 216610362.0, "step": 5676 }, { "epoch": 0.7221727515583259, "ewc_loss": 0.023286940529942513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.328694063180592e-05, "grad_norm": 15.30692195892334, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8526818156242371, "num_tokens": 216648732.0, "step": 5677 }, { "epoch": 0.7222999618369165, "ewc_loss": 0.023234140127897263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3234140826389194e-05, "grad_norm": 15.283421516418457, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8695594072341919, "num_tokens": 216680324.0, "step": 5678 }, { "epoch": 0.7224271721155069, "ewc_loss": 0.0232810340821743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3281034373212606e-05, "grad_norm": 15.328356742858887, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8653448224067688, "num_tokens": 216712773.0, "step": 5679 }, { "epoch": 0.7225543823940974, "ewc_loss": 0.02326880395412445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3268803488463163e-05, "grad_norm": 15.204913139343262, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8526435494422913, "num_tokens": 216752538.0, "step": 5680 }, { "epoch": 0.722681592672688, "ewc_loss": 0.02327546663582325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3275466446648352e-05, "grad_norm": 15.296879768371582, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8635232448577881, "num_tokens": 216791795.0, "step": 5681 }, { "epoch": 0.7228088029512785, "ewc_loss": 0.02336287312209606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3362872525467537e-05, "grad_norm": 15.25225830078125, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8583950996398926, "num_tokens": 216831729.0, "step": 5682 }, { "epoch": 0.722936013229869, "ewc_loss": 0.023342305794358253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3342305212281644e-05, "grad_norm": 15.305048942565918, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8611563444137573, "num_tokens": 216869494.0, "step": 5683 }, { "epoch": 0.7230632235084595, "ewc_loss": 0.02338247187435627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3382472136290744e-05, "grad_norm": 15.32631778717041, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.843023419380188, "num_tokens": 216906672.0, "step": 5684 }, { "epoch": 0.72319043378705, "ewc_loss": 0.02339904010295868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.339903949177824e-05, "grad_norm": 15.398024559020996, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8677694797515869, "num_tokens": 216941587.0, "step": 5685 }, { "epoch": 0.7233176440656405, "ewc_loss": 0.023381197825074196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3381197024718858e-05, "grad_norm": 15.271053314208984, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8535187840461731, "num_tokens": 216981812.0, "step": 5686 }, { "epoch": 0.723444854344231, "ewc_loss": 0.023342980071902275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.334298005735036e-05, "grad_norm": 15.353109359741211, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.86945641040802, "num_tokens": 217024137.0, "step": 5687 }, { "epoch": 0.7235720646228215, "ewc_loss": 0.023382725194096565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3382724975817837e-05, "grad_norm": 15.30262565612793, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8727366328239441, "num_tokens": 217063753.0, "step": 5688 }, { "epoch": 0.7236992749014121, "ewc_loss": 0.023322628811001778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.332262920390349e-05, "grad_norm": 15.345815658569336, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8595522046089172, "num_tokens": 217105611.0, "step": 5689 }, { "epoch": 0.7238264851800026, "ewc_loss": 0.023348677903413773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3348677132162265e-05, "grad_norm": 15.301164627075195, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8410452604293823, "num_tokens": 217145989.0, "step": 5690 }, { "epoch": 0.723953695458593, "ewc_loss": 0.02329091727733612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.329091694264207e-05, "grad_norm": 15.337972640991211, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.879326343536377, "num_tokens": 217183726.0, "step": 5691 }, { "epoch": 0.7240809057371835, "ewc_loss": 0.023319296538829803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3319296815316193e-05, "grad_norm": 15.329245567321777, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8577604293823242, "num_tokens": 217226644.0, "step": 5692 }, { "epoch": 0.7242081160157741, "ewc_loss": 0.02330121025443077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3301210603676736e-05, "grad_norm": 15.335674285888672, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8460408449172974, "num_tokens": 217263327.0, "step": 5693 }, { "epoch": 0.7243353262943646, "ewc_loss": 0.023283816874027252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.328381742700003e-05, "grad_norm": 15.318926811218262, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8654955625534058, "num_tokens": 217300595.0, "step": 5694 }, { "epoch": 0.7244625365729551, "ewc_loss": 0.023276474326848984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3276474166777916e-05, "grad_norm": 15.348973274230957, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.84725022315979, "num_tokens": 217336385.0, "step": 5695 }, { "epoch": 0.7245897468515456, "ewc_loss": 0.0233279038220644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3327904273173772e-05, "grad_norm": 15.41751480102539, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8676312565803528, "num_tokens": 217374309.0, "step": 5696 }, { "epoch": 0.7247169571301361, "ewc_loss": 0.023286232724785805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.328623304492794e-05, "grad_norm": 15.301955223083496, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8515475392341614, "num_tokens": 217408386.0, "step": 5697 }, { "epoch": 0.7248441674087266, "ewc_loss": 0.0232253335416317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3225333279697224e-05, "grad_norm": 15.396716117858887, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8472825288772583, "num_tokens": 217442855.0, "step": 5698 }, { "epoch": 0.7249713776873171, "ewc_loss": 0.023362379521131516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3362379579339176e-05, "grad_norm": 15.326807975769043, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8651429414749146, "num_tokens": 217476892.0, "step": 5699 }, { "epoch": 0.7250985879659076, "ewc_loss": 0.023237336426973343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3237336790771224e-05, "grad_norm": 15.391549110412598, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8586163520812988, "num_tokens": 217513569.0, "step": 5700 }, { "epoch": 0.7252257982444982, "ewc_loss": 0.023311275988817215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3311275072046556e-05, "grad_norm": 15.363465309143066, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8544628620147705, "num_tokens": 217545585.0, "step": 5701 }, { "epoch": 0.7253530085230887, "ewc_loss": 0.023244183510541916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.324418346688617e-05, "grad_norm": 15.320564270019531, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.863052487373352, "num_tokens": 217583156.0, "step": 5702 }, { "epoch": 0.7254802188016791, "ewc_loss": 0.023301076143980026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3301075998460874e-05, "grad_norm": 15.349502563476562, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8622655868530273, "num_tokens": 217619630.0, "step": 5703 }, { "epoch": 0.7256074290802697, "ewc_loss": 0.023304445669054985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3304446585825644e-05, "grad_norm": 15.344552040100098, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8721063137054443, "num_tokens": 217653616.0, "step": 5704 }, { "epoch": 0.7257346393588602, "ewc_loss": 0.023302927613258362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3302927729673684e-05, "grad_norm": 15.351688385009766, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8561110496520996, "num_tokens": 217692758.0, "step": 5705 }, { "epoch": 0.7258618496374507, "ewc_loss": 0.023333296179771423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3333295757765882e-05, "grad_norm": 15.346683502197266, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8454585075378418, "num_tokens": 217734175.0, "step": 5706 }, { "epoch": 0.7259890599160412, "ewc_loss": 0.02331671305000782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3316713850363158e-05, "grad_norm": 15.358091354370117, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8379241824150085, "num_tokens": 217773372.0, "step": 5707 }, { "epoch": 0.7261162701946318, "ewc_loss": 0.02335490472614765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3354905351880006e-05, "grad_norm": 15.374390602111816, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8703653812408447, "num_tokens": 217809174.0, "step": 5708 }, { "epoch": 0.7262434804732223, "ewc_loss": 0.023361140862107277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.336114084755536e-05, "grad_norm": 15.389864921569824, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8541302680969238, "num_tokens": 217845882.0, "step": 5709 }, { "epoch": 0.7263706907518127, "ewc_loss": 0.023321164771914482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3321164917433634e-05, "grad_norm": 15.307628631591797, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8564130067825317, "num_tokens": 217885798.0, "step": 5710 }, { "epoch": 0.7264979010304032, "ewc_loss": 0.023315075784921646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3315074940910563e-05, "grad_norm": 15.36407470703125, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8569165468215942, "num_tokens": 217924967.0, "step": 5711 }, { "epoch": 0.7266251113089938, "ewc_loss": 0.02331853099167347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.33185310207773e-05, "grad_norm": 15.320127487182617, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8711116313934326, "num_tokens": 217967381.0, "step": 5712 }, { "epoch": 0.7267523215875843, "ewc_loss": 0.02329406328499317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3294063794310205e-05, "grad_norm": 15.36613655090332, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8672193288803101, "num_tokens": 218001097.0, "step": 5713 }, { "epoch": 0.7268795318661748, "ewc_loss": 0.02333099953830242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3331000193138607e-05, "grad_norm": 15.363932609558105, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8630309104919434, "num_tokens": 218040705.0, "step": 5714 }, { "epoch": 0.7270067421447653, "ewc_loss": 0.023314425721764565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3314425561693497e-05, "grad_norm": 15.36396312713623, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.859976589679718, "num_tokens": 218077549.0, "step": 5715 }, { "epoch": 0.7271339524233558, "ewc_loss": 0.023321159183979034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3321159460465424e-05, "grad_norm": 15.307297706604004, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8591121435165405, "num_tokens": 218116251.0, "step": 5716 }, { "epoch": 0.7272611627019463, "ewc_loss": 0.023310843855142593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3310843971557915e-05, "grad_norm": 15.356538772583008, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8340604305267334, "num_tokens": 218155798.0, "step": 5717 }, { "epoch": 0.7273883729805368, "ewc_loss": 0.02330908179283142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.330908137082588e-05, "grad_norm": 15.309350967407227, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8658138513565063, "num_tokens": 218191277.0, "step": 5718 }, { "epoch": 0.7275155832591274, "ewc_loss": 0.023284360766410828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.328436130483169e-05, "grad_norm": 15.314677238464355, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8670700788497925, "num_tokens": 218233165.0, "step": 5719 }, { "epoch": 0.7276427935377179, "ewc_loss": 0.023339929059147835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3339929612120613e-05, "grad_norm": 15.379706382751465, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8399906158447266, "num_tokens": 218271263.0, "step": 5720 }, { "epoch": 0.7277700038163084, "ewc_loss": 0.02331915870308876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3319158572121523e-05, "grad_norm": 15.302262306213379, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8616020083427429, "num_tokens": 218309081.0, "step": 5721 }, { "epoch": 0.7278972140948988, "ewc_loss": 0.023309504613280296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3309505195356905e-05, "grad_norm": 15.34241008758545, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8562381267547607, "num_tokens": 218348314.0, "step": 5722 }, { "epoch": 0.7280244243734894, "ewc_loss": 0.02332589216530323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.332589247089345e-05, "grad_norm": 15.35561752319336, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8639389276504517, "num_tokens": 218387556.0, "step": 5723 }, { "epoch": 0.7281516346520799, "ewc_loss": 0.023304544389247894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3304544811253436e-05, "grad_norm": 15.346342086791992, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.85276198387146, "num_tokens": 218423130.0, "step": 5724 }, { "epoch": 0.7282788449306704, "ewc_loss": 0.02333608642220497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.333608608751092e-05, "grad_norm": 15.344755172729492, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8420773148536682, "num_tokens": 218464079.0, "step": 5725 }, { "epoch": 0.7284060552092609, "ewc_loss": 0.02332179807126522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.332179792574607e-05, "grad_norm": 15.346236228942871, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8680067658424377, "num_tokens": 218501307.0, "step": 5726 }, { "epoch": 0.7285332654878515, "ewc_loss": 0.023313719779253006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.331371979380492e-05, "grad_norm": 15.333620071411133, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8691165447235107, "num_tokens": 218537710.0, "step": 5727 }, { "epoch": 0.7286604757664419, "ewc_loss": 0.023391714319586754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.339171442145016e-05, "grad_norm": 15.378870010375977, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8557136058807373, "num_tokens": 218576026.0, "step": 5728 }, { "epoch": 0.7287876860450324, "ewc_loss": 0.023341644555330276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3341644919128157e-05, "grad_norm": 15.380369186401367, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.857193112373352, "num_tokens": 218612491.0, "step": 5729 }, { "epoch": 0.7289148963236229, "ewc_loss": 0.023335373029112816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.333537304366473e-05, "grad_norm": 15.343990325927734, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8683174252510071, "num_tokens": 218642541.0, "step": 5730 }, { "epoch": 0.7290421066022135, "ewc_loss": 0.023293614387512207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.329361450392753e-05, "grad_norm": 15.366243362426758, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8464264273643494, "num_tokens": 218678514.0, "step": 5731 }, { "epoch": 0.729169316880804, "ewc_loss": 0.02335202321410179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.335202407266479e-05, "grad_norm": 15.356379508972168, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8648647665977478, "num_tokens": 218715415.0, "step": 5732 }, { "epoch": 0.7292965271593945, "ewc_loss": 0.023359838873147964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3359838451142423e-05, "grad_norm": 15.373224258422852, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8684926629066467, "num_tokens": 218754422.0, "step": 5733 }, { "epoch": 0.7294237374379849, "ewc_loss": 0.023365670815110207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.336567013117019e-05, "grad_norm": 15.344444274902344, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.852024495601654, "num_tokens": 218787141.0, "step": 5734 }, { "epoch": 0.7295509477165755, "ewc_loss": 0.023373277857899666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.337327714485582e-05, "grad_norm": 15.344367027282715, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8749585151672363, "num_tokens": 218824764.0, "step": 5735 }, { "epoch": 0.729678157995166, "ewc_loss": 0.023383434861898422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.338343438168522e-05, "grad_norm": 15.357166290283203, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8587660789489746, "num_tokens": 218866142.0, "step": 5736 }, { "epoch": 0.7298053682737565, "ewc_loss": 0.023373473435640335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3373473595711403e-05, "grad_norm": 15.276055335998535, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8556017875671387, "num_tokens": 218909208.0, "step": 5737 }, { "epoch": 0.7299325785523471, "ewc_loss": 0.023395072668790817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3395072275889106e-05, "grad_norm": 15.41543197631836, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8679381608963013, "num_tokens": 218942469.0, "step": 5738 }, { "epoch": 0.7300597888309376, "ewc_loss": 0.02337586134672165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3375861928798258e-05, "grad_norm": 15.364447593688965, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.853480875492096, "num_tokens": 218982934.0, "step": 5739 }, { "epoch": 0.730186999109528, "ewc_loss": 0.02335825376212597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3358254111371934e-05, "grad_norm": 15.316847801208496, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8579246401786804, "num_tokens": 219027055.0, "step": 5740 }, { "epoch": 0.7303142093881185, "ewc_loss": 0.023374052718281746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.337405203434173e-05, "grad_norm": 15.335952758789062, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8610327243804932, "num_tokens": 219069639.0, "step": 5741 }, { "epoch": 0.7304414196667091, "ewc_loss": 0.023389045149087906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.338904596399516e-05, "grad_norm": 15.39415168762207, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8547403812408447, "num_tokens": 219111774.0, "step": 5742 }, { "epoch": 0.7305686299452996, "ewc_loss": 0.023376241326332092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.33762420975836e-05, "grad_norm": 15.323607444763184, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8468344211578369, "num_tokens": 219156962.0, "step": 5743 }, { "epoch": 0.7306958402238901, "ewc_loss": 0.02333890087902546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3338900064118207e-05, "grad_norm": 15.41139030456543, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8699080944061279, "num_tokens": 219193352.0, "step": 5744 }, { "epoch": 0.7308230505024806, "ewc_loss": 0.023359302431344986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3359301849268377e-05, "grad_norm": 15.334439277648926, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8461143374443054, "num_tokens": 219230041.0, "step": 5745 }, { "epoch": 0.7309502607810711, "ewc_loss": 0.023332471027970314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3332471755566075e-05, "grad_norm": 15.475687026977539, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8381549715995789, "num_tokens": 219262748.0, "step": 5746 }, { "epoch": 0.7310774710596616, "ewc_loss": 0.023399995639920235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3399996280204505e-05, "grad_norm": 15.351856231689453, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8710379600524902, "num_tokens": 219296556.0, "step": 5747 }, { "epoch": 0.7312046813382521, "ewc_loss": 0.023277640342712402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.327764013898559e-05, "grad_norm": 15.330475807189941, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8472517728805542, "num_tokens": 219330801.0, "step": 5748 }, { "epoch": 0.7313318916168426, "ewc_loss": 0.02339138276875019, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3391383365378715e-05, "grad_norm": 15.360211372375488, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8600810766220093, "num_tokens": 219367470.0, "step": 5749 }, { "epoch": 0.7314591018954332, "ewc_loss": 0.023327359929680824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.332736039534211e-05, "grad_norm": 15.313509941101074, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8467777371406555, "num_tokens": 219405291.0, "step": 5750 }, { "epoch": 0.7315863121740237, "ewc_loss": 0.023430999368429184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.343099913559854e-05, "grad_norm": 15.31928539276123, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8613365888595581, "num_tokens": 219443365.0, "step": 5751 }, { "epoch": 0.7317135224526141, "ewc_loss": 0.023381158709526062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3381158825941384e-05, "grad_norm": 15.372621536254883, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8614486455917358, "num_tokens": 219477524.0, "step": 5752 }, { "epoch": 0.7318407327312046, "ewc_loss": 0.023456480354070663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.345647953916341e-05, "grad_norm": 15.358297348022461, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8515435457229614, "num_tokens": 219514576.0, "step": 5753 }, { "epoch": 0.7319679430097952, "ewc_loss": 0.023460110649466515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3460110242012888e-05, "grad_norm": 15.336307525634766, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8608753085136414, "num_tokens": 219558253.0, "step": 5754 }, { "epoch": 0.7320951532883857, "ewc_loss": 0.023459598422050476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3459599105990492e-05, "grad_norm": 15.356626510620117, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8555678725242615, "num_tokens": 219601156.0, "step": 5755 }, { "epoch": 0.7322223635669762, "ewc_loss": 0.02349458634853363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3494585548178293e-05, "grad_norm": 15.436625480651855, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8497822880744934, "num_tokens": 219638241.0, "step": 5756 }, { "epoch": 0.7323495738455668, "ewc_loss": 0.02349705435335636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3497053916798905e-05, "grad_norm": 15.403080940246582, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8602814674377441, "num_tokens": 219678168.0, "step": 5757 }, { "epoch": 0.7324767841241572, "ewc_loss": 0.02340194396674633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3401944417855702e-05, "grad_norm": 15.335402488708496, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8569432497024536, "num_tokens": 219718115.0, "step": 5758 }, { "epoch": 0.7326039944027477, "ewc_loss": 0.023461313918232918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3461314412998036e-05, "grad_norm": 15.422441482543945, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8592736721038818, "num_tokens": 219754221.0, "step": 5759 }, { "epoch": 0.7327312046813382, "ewc_loss": 0.0234229639172554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3422964659403078e-05, "grad_norm": 15.348867416381836, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.866016149520874, "num_tokens": 219786880.0, "step": 5760 }, { "epoch": 0.7328584149599288, "ewc_loss": 0.023401563987135887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.340156424907036e-05, "grad_norm": 15.341986656188965, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8636230826377869, "num_tokens": 219830585.0, "step": 5761 }, { "epoch": 0.7329856252385193, "ewc_loss": 0.0234258733689785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3425873223459348e-05, "grad_norm": 15.411370277404785, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.863576352596283, "num_tokens": 219867249.0, "step": 5762 }, { "epoch": 0.7331128355171098, "ewc_loss": 0.023421796038746834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3421796868206002e-05, "grad_norm": 15.375560760498047, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8524429202079773, "num_tokens": 219906046.0, "step": 5763 }, { "epoch": 0.7332400457957003, "ewc_loss": 0.023373521864414215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.33735227084253e-05, "grad_norm": 15.384163856506348, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8516103029251099, "num_tokens": 219947466.0, "step": 5764 }, { "epoch": 0.7333672560742908, "ewc_loss": 0.023387473076581955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.338747253816109e-05, "grad_norm": 15.354202270507812, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8620253801345825, "num_tokens": 219985698.0, "step": 5765 }, { "epoch": 0.7334944663528813, "ewc_loss": 0.023367328569293022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3367329049506225e-05, "grad_norm": 15.341571807861328, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.861061692237854, "num_tokens": 220026158.0, "step": 5766 }, { "epoch": 0.7336216766314718, "ewc_loss": 0.023376792669296265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3376793251372874e-05, "grad_norm": 15.400851249694824, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8578866720199585, "num_tokens": 220071849.0, "step": 5767 }, { "epoch": 0.7337488869100623, "ewc_loss": 0.023390447720885277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3390448404825293e-05, "grad_norm": 15.381046295166016, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8667614459991455, "num_tokens": 220110255.0, "step": 5768 }, { "epoch": 0.7338760971886529, "ewc_loss": 0.02332097664475441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.332097756152507e-05, "grad_norm": 15.38148307800293, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8453227281570435, "num_tokens": 220144962.0, "step": 5769 }, { "epoch": 0.7340033074672434, "ewc_loss": 0.023352453485131264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3352453354164027e-05, "grad_norm": 15.395834922790527, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8514853715896606, "num_tokens": 220182118.0, "step": 5770 }, { "epoch": 0.7341305177458338, "ewc_loss": 0.023373322561383247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3373322619590908e-05, "grad_norm": 15.379780769348145, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8510457277297974, "num_tokens": 220218954.0, "step": 5771 }, { "epoch": 0.7342577280244243, "ewc_loss": 0.023373018950223923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3373018848360516e-05, "grad_norm": 15.43620777130127, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8588765859603882, "num_tokens": 220259116.0, "step": 5772 }, { "epoch": 0.7343849383030149, "ewc_loss": 0.023359887301921844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.335988756385632e-05, "grad_norm": 15.344642639160156, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8561926484107971, "num_tokens": 220301742.0, "step": 5773 }, { "epoch": 0.7345121485816054, "ewc_loss": 0.023308221250772476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3308220988838002e-05, "grad_norm": 15.399083137512207, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8636917471885681, "num_tokens": 220342572.0, "step": 5774 }, { "epoch": 0.7346393588601959, "ewc_loss": 0.02337123267352581, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3371232600766234e-05, "grad_norm": 15.353110313415527, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8659753799438477, "num_tokens": 220376373.0, "step": 5775 }, { "epoch": 0.7347665691387865, "ewc_loss": 0.023334888741374016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3334889192483388e-05, "grad_norm": 15.395167350769043, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8757649064064026, "num_tokens": 220413290.0, "step": 5776 }, { "epoch": 0.7348937794173769, "ewc_loss": 0.023380160331726074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3380160200758837e-05, "grad_norm": 15.346776962280273, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.855530858039856, "num_tokens": 220453219.0, "step": 5777 }, { "epoch": 0.7350209896959674, "ewc_loss": 0.023315828293561935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.331582800252363e-05, "grad_norm": 15.4271879196167, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8601822257041931, "num_tokens": 220488369.0, "step": 5778 }, { "epoch": 0.7351481999745579, "ewc_loss": 0.023394787684082985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3394788513542153e-05, "grad_norm": 15.362565994262695, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8634340763092041, "num_tokens": 220527798.0, "step": 5779 }, { "epoch": 0.7352754102531485, "ewc_loss": 0.023342479020357132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3342479835264385e-05, "grad_norm": 15.420517921447754, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8777621984481812, "num_tokens": 220565398.0, "step": 5780 }, { "epoch": 0.735402620531739, "ewc_loss": 0.023364301770925522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3364302251138724e-05, "grad_norm": 15.407123565673828, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8695226907730103, "num_tokens": 220603903.0, "step": 5781 }, { "epoch": 0.7355298308103295, "ewc_loss": 0.02337057888507843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.337057958357036e-05, "grad_norm": 15.391575813293457, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8421059846878052, "num_tokens": 220644168.0, "step": 5782 }, { "epoch": 0.7356570410889199, "ewc_loss": 0.023310543969273567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.331054383830633e-05, "grad_norm": 15.417489051818848, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8718074560165405, "num_tokens": 220679709.0, "step": 5783 }, { "epoch": 0.7357842513675105, "ewc_loss": 0.0233604796230793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3360480554401875e-05, "grad_norm": 15.418551445007324, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8540264368057251, "num_tokens": 220721184.0, "step": 5784 }, { "epoch": 0.735911461646101, "ewc_loss": 0.02330874465405941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3308744857786223e-05, "grad_norm": 15.386463165283203, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8646143674850464, "num_tokens": 220755027.0, "step": 5785 }, { "epoch": 0.7360386719246915, "ewc_loss": 0.02331695705652237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3316957594943233e-05, "grad_norm": 15.36378002166748, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8369499444961548, "num_tokens": 220793597.0, "step": 5786 }, { "epoch": 0.736165882203282, "ewc_loss": 0.023350924253463745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3350923584075645e-05, "grad_norm": 15.419539451599121, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8588916063308716, "num_tokens": 220833549.0, "step": 5787 }, { "epoch": 0.7362930924818726, "ewc_loss": 0.023343846201896667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3343845896306448e-05, "grad_norm": 15.346835136413574, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8658531904220581, "num_tokens": 220865551.0, "step": 5788 }, { "epoch": 0.736420302760463, "ewc_loss": 0.02337384596467018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.337384648853913e-05, "grad_norm": 15.380387306213379, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8722385168075562, "num_tokens": 220911626.0, "step": 5789 }, { "epoch": 0.7365475130390535, "ewc_loss": 0.02338135614991188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3381355276796967e-05, "grad_norm": 15.438090324401855, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8611414432525635, "num_tokens": 220949839.0, "step": 5790 }, { "epoch": 0.736674723317644, "ewc_loss": 0.02335837483406067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.335837416467257e-05, "grad_norm": 15.377364158630371, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8430936336517334, "num_tokens": 220984069.0, "step": 5791 }, { "epoch": 0.7368019335962346, "ewc_loss": 0.023357871919870377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.335787212359719e-05, "grad_norm": 15.449065208435059, "learning_rate": 1e-06, "loss": 0.3781, "mean_token_accuracy": 0.8796244859695435, "num_tokens": 221020999.0, "step": 5792 }, { "epoch": 0.7369291438748251, "ewc_loss": 0.023405158892273903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3405158572131768e-05, "grad_norm": 15.418859481811523, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8510025143623352, "num_tokens": 221051082.0, "step": 5793 }, { "epoch": 0.7370563541534156, "ewc_loss": 0.02330438606441021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3304386559175327e-05, "grad_norm": 15.337279319763184, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8423872590065002, "num_tokens": 221095444.0, "step": 5794 }, { "epoch": 0.737183564432006, "ewc_loss": 0.023384690284729004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3384689484373666e-05, "grad_norm": 15.420426368713379, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8624841570854187, "num_tokens": 221136433.0, "step": 5795 }, { "epoch": 0.7373107747105966, "ewc_loss": 0.023415133357048035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3415133910020813e-05, "grad_norm": 15.48916244506836, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.852685272693634, "num_tokens": 221171278.0, "step": 5796 }, { "epoch": 0.7374379849891871, "ewc_loss": 0.023357311263680458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3357311874860898e-05, "grad_norm": 15.40908145904541, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8579109907150269, "num_tokens": 221209380.0, "step": 5797 }, { "epoch": 0.7375651952677776, "ewc_loss": 0.023329244926571846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3329244868364185e-05, "grad_norm": 15.280752182006836, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8586472272872925, "num_tokens": 221253076.0, "step": 5798 }, { "epoch": 0.7376924055463682, "ewc_loss": 0.023397265002131462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.339726415812038e-05, "grad_norm": 15.423462867736816, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8557819128036499, "num_tokens": 221290348.0, "step": 5799 }, { "epoch": 0.7378196158249587, "ewc_loss": 0.023408832028508186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3408831111737527e-05, "grad_norm": 15.271605491638184, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8658401370048523, "num_tokens": 221329635.0, "step": 5800 }, { "epoch": 0.7379468261035491, "ewc_loss": 0.023392681032419205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3392680304823443e-05, "grad_norm": 15.441021919250488, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8668464422225952, "num_tokens": 221368521.0, "step": 5801 }, { "epoch": 0.7380740363821396, "ewc_loss": 0.02345358021557331, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.345358007005416e-05, "grad_norm": 15.356956481933594, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8717208504676819, "num_tokens": 221407090.0, "step": 5802 }, { "epoch": 0.7382012466607302, "ewc_loss": 0.023337293416261673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3337293896474876e-05, "grad_norm": 15.39745044708252, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8694630861282349, "num_tokens": 221450340.0, "step": 5803 }, { "epoch": 0.7383284569393207, "ewc_loss": 0.02344362810254097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.344362837902736e-05, "grad_norm": 15.406278610229492, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8604776859283447, "num_tokens": 221491996.0, "step": 5804 }, { "epoch": 0.7384556672179112, "ewc_loss": 0.023349599912762642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3349599359789863e-05, "grad_norm": 15.367803573608398, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8687875270843506, "num_tokens": 221531466.0, "step": 5805 }, { "epoch": 0.7385828774965018, "ewc_loss": 0.02341054193675518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3410542780766264e-05, "grad_norm": 15.397961616516113, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8409973978996277, "num_tokens": 221567003.0, "step": 5806 }, { "epoch": 0.7387100877750922, "ewc_loss": 0.023410623893141747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3410624635289423e-05, "grad_norm": 15.417807579040527, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8732812404632568, "num_tokens": 221607396.0, "step": 5807 }, { "epoch": 0.7388372980536827, "ewc_loss": 0.023386768996715546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.338676858926192e-05, "grad_norm": 15.4187593460083, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8529361486434937, "num_tokens": 221644776.0, "step": 5808 }, { "epoch": 0.7389645083322732, "ewc_loss": 0.023380715399980545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.338071499252692e-05, "grad_norm": 15.445717811584473, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8720035552978516, "num_tokens": 221685643.0, "step": 5809 }, { "epoch": 0.7390917186108638, "ewc_loss": 0.023376712575554848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3376713215839118e-05, "grad_norm": 15.38918685913086, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8539303541183472, "num_tokens": 221724673.0, "step": 5810 }, { "epoch": 0.7392189288894543, "ewc_loss": 0.023380503058433533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3380503989756107e-05, "grad_norm": 15.438841819763184, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8353564739227295, "num_tokens": 221766287.0, "step": 5811 }, { "epoch": 0.7393461391680448, "ewc_loss": 0.023387707769870758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.338770718779415e-05, "grad_norm": 15.428812026977539, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8593488335609436, "num_tokens": 221805357.0, "step": 5812 }, { "epoch": 0.7394733494466353, "ewc_loss": 0.02332782931625843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3327829694608226e-05, "grad_norm": 15.383532524108887, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8571338653564453, "num_tokens": 221838513.0, "step": 5813 }, { "epoch": 0.7396005597252258, "ewc_loss": 0.02338544651865959, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.338544618396554e-05, "grad_norm": 15.430795669555664, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8486895561218262, "num_tokens": 221882808.0, "step": 5814 }, { "epoch": 0.7397277700038163, "ewc_loss": 0.023352257907390594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3352258722297847e-05, "grad_norm": 15.386680603027344, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8730267882347107, "num_tokens": 221918505.0, "step": 5815 }, { "epoch": 0.7398549802824068, "ewc_loss": 0.023399949073791504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3399948986480013e-05, "grad_norm": 15.421303749084473, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8605769872665405, "num_tokens": 221955983.0, "step": 5816 }, { "epoch": 0.7399821905609973, "ewc_loss": 0.023393992334604263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.33939917961834e-05, "grad_norm": 15.381596565246582, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8592543005943298, "num_tokens": 221997363.0, "step": 5817 }, { "epoch": 0.7401094008395879, "ewc_loss": 0.02335638739168644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3356387828243896e-05, "grad_norm": 15.36126708984375, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8461521863937378, "num_tokens": 222039302.0, "step": 5818 }, { "epoch": 0.7402366111181784, "ewc_loss": 0.023381389677524567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3381389837595634e-05, "grad_norm": 15.421305656433105, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8589023351669312, "num_tokens": 222075779.0, "step": 5819 }, { "epoch": 0.7403638213967688, "ewc_loss": 0.023356139659881592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3356140445685014e-05, "grad_norm": 15.307570457458496, "learning_rate": 1e-06, "loss": 0.3798, "mean_token_accuracy": 0.8780419826507568, "num_tokens": 222115120.0, "step": 5820 }, { "epoch": 0.7404910316753593, "ewc_loss": 0.02337126061320305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3371259885607287e-05, "grad_norm": 15.416032791137695, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8522948622703552, "num_tokens": 222156570.0, "step": 5821 }, { "epoch": 0.7406182419539499, "ewc_loss": 0.023426292464137077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3426291591022164e-05, "grad_norm": 15.441511154174805, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8597285151481628, "num_tokens": 222189487.0, "step": 5822 }, { "epoch": 0.7407454522325404, "ewc_loss": 0.023391589522361755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3391588911181316e-05, "grad_norm": 15.400712966918945, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8581314086914062, "num_tokens": 222223163.0, "step": 5823 }, { "epoch": 0.7408726625111309, "ewc_loss": 0.023382369428873062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3382370272884145e-05, "grad_norm": 15.372617721557617, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8728968501091003, "num_tokens": 222259042.0, "step": 5824 }, { "epoch": 0.7409998727897215, "ewc_loss": 0.02341623231768608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3416232579620555e-05, "grad_norm": 15.479117393493652, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8406658172607422, "num_tokens": 222300497.0, "step": 5825 }, { "epoch": 0.7411270830683119, "ewc_loss": 0.023433607071638107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3433607566403225e-05, "grad_norm": 15.395228385925293, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8558379411697388, "num_tokens": 222333630.0, "step": 5826 }, { "epoch": 0.7412542933469024, "ewc_loss": 0.023408185690641403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3408185370499268e-05, "grad_norm": 15.417842864990234, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8550181984901428, "num_tokens": 222371715.0, "step": 5827 }, { "epoch": 0.7413815036254929, "ewc_loss": 0.023393699899315834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3393700757878833e-05, "grad_norm": 15.41036319732666, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8605473637580872, "num_tokens": 222412044.0, "step": 5828 }, { "epoch": 0.7415087139040835, "ewc_loss": 0.023429306223988533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.342930565646384e-05, "grad_norm": 15.372416496276855, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8624771237373352, "num_tokens": 222452668.0, "step": 5829 }, { "epoch": 0.741635924182674, "ewc_loss": 0.023362882435321808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3362881620414555e-05, "grad_norm": 15.398908615112305, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8613271713256836, "num_tokens": 222485631.0, "step": 5830 }, { "epoch": 0.7417631344612645, "ewc_loss": 0.02345249243080616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3452492314390838e-05, "grad_norm": 15.41674518585205, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8750548958778381, "num_tokens": 222522967.0, "step": 5831 }, { "epoch": 0.7418903447398549, "ewc_loss": 0.02343769557774067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3437694835592993e-05, "grad_norm": 15.355536460876465, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8497406244277954, "num_tokens": 222563258.0, "step": 5832 }, { "epoch": 0.7420175550184455, "ewc_loss": 0.02343151345849037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3431513909599744e-05, "grad_norm": 15.397989273071289, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8545827269554138, "num_tokens": 222599248.0, "step": 5833 }, { "epoch": 0.742144765297036, "ewc_loss": 0.02351291850209236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.351291914237663e-05, "grad_norm": 15.402551651000977, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8674834966659546, "num_tokens": 222642575.0, "step": 5834 }, { "epoch": 0.7422719755756265, "ewc_loss": 0.023454606533050537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.345460598007776e-05, "grad_norm": 15.48104476928711, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8363497257232666, "num_tokens": 222687330.0, "step": 5835 }, { "epoch": 0.742399185854217, "ewc_loss": 0.023408524692058563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.340852552151773e-05, "grad_norm": 15.331421852111816, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8336254358291626, "num_tokens": 222720020.0, "step": 5836 }, { "epoch": 0.7425263961328076, "ewc_loss": 0.023415081202983856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.341508115932811e-05, "grad_norm": 15.372973442077637, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8731510639190674, "num_tokens": 222758634.0, "step": 5837 }, { "epoch": 0.742653606411398, "ewc_loss": 0.023509226739406586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3509226593887433e-05, "grad_norm": 15.392618179321289, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8676515817642212, "num_tokens": 222794823.0, "step": 5838 }, { "epoch": 0.7427808166899885, "ewc_loss": 0.023435572162270546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3435572074959055e-05, "grad_norm": 15.342965126037598, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8594679832458496, "num_tokens": 222833836.0, "step": 5839 }, { "epoch": 0.742908026968579, "ewc_loss": 0.023479288443922997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3479287847294472e-05, "grad_norm": 15.41662311553955, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8613224029541016, "num_tokens": 222875712.0, "step": 5840 }, { "epoch": 0.7430352372471696, "ewc_loss": 0.023497208952903748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3497208530898206e-05, "grad_norm": 15.403600692749023, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8486407995223999, "num_tokens": 222915494.0, "step": 5841 }, { "epoch": 0.7431624475257601, "ewc_loss": 0.023444879800081253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3444879843737e-05, "grad_norm": 15.367463111877441, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8759697675704956, "num_tokens": 222951675.0, "step": 5842 }, { "epoch": 0.7432896578043506, "ewc_loss": 0.023497672751545906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.349767237319611e-05, "grad_norm": 15.433517456054688, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8549237251281738, "num_tokens": 222993245.0, "step": 5843 }, { "epoch": 0.743416868082941, "ewc_loss": 0.023461809381842613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.34618091781158e-05, "grad_norm": 15.38885498046875, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8668657541275024, "num_tokens": 223029876.0, "step": 5844 }, { "epoch": 0.7435440783615316, "ewc_loss": 0.023471670225262642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3471669919672422e-05, "grad_norm": 15.395211219787598, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8652541637420654, "num_tokens": 223072550.0, "step": 5845 }, { "epoch": 0.7436712886401221, "ewc_loss": 0.02344912849366665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3449129002983682e-05, "grad_norm": 15.367673873901367, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8579859733581543, "num_tokens": 223112892.0, "step": 5846 }, { "epoch": 0.7437984989187126, "ewc_loss": 0.023458391427993774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3458391297026537e-05, "grad_norm": 15.410260200500488, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8645156621932983, "num_tokens": 223157093.0, "step": 5847 }, { "epoch": 0.7439257091973032, "ewc_loss": 0.023464113473892212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3464113837690093e-05, "grad_norm": 15.442167282104492, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8514876961708069, "num_tokens": 223190756.0, "step": 5848 }, { "epoch": 0.7440529194758937, "ewc_loss": 0.023446515202522278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3446515115210786e-05, "grad_norm": 15.450654983520508, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8506895899772644, "num_tokens": 223226013.0, "step": 5849 }, { "epoch": 0.7441801297544841, "ewc_loss": 0.023423150181770325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.342315019632224e-05, "grad_norm": 15.404918670654297, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8728243112564087, "num_tokens": 223262858.0, "step": 5850 }, { "epoch": 0.7443073400330746, "ewc_loss": 0.023458682000637054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3458682335331105e-05, "grad_norm": 15.4846773147583, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8630924224853516, "num_tokens": 223310528.0, "step": 5851 }, { "epoch": 0.7444345503116652, "ewc_loss": 0.02339283563196659, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3392834918922745e-05, "grad_norm": 15.339818954467773, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8624173402786255, "num_tokens": 223345021.0, "step": 5852 }, { "epoch": 0.7445617605902557, "ewc_loss": 0.02342391386628151, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.342391417187173e-05, "grad_norm": 15.375641822814941, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8704768419265747, "num_tokens": 223382055.0, "step": 5853 }, { "epoch": 0.7446889708688462, "ewc_loss": 0.023424286395311356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3424287064699456e-05, "grad_norm": 15.374645233154297, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8497622013092041, "num_tokens": 223418441.0, "step": 5854 }, { "epoch": 0.7448161811474368, "ewc_loss": 0.023422617465257645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3422617232427e-05, "grad_norm": 15.417622566223145, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8583976030349731, "num_tokens": 223458150.0, "step": 5855 }, { "epoch": 0.7449433914260272, "ewc_loss": 0.023465804755687714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.346580549783539e-05, "grad_norm": 15.39925765991211, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8560030460357666, "num_tokens": 223496916.0, "step": 5856 }, { "epoch": 0.7450706017046177, "ewc_loss": 0.023418130353093147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3418129785568453e-05, "grad_norm": 15.437183380126953, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8659318685531616, "num_tokens": 223534808.0, "step": 5857 }, { "epoch": 0.7451978119832082, "ewc_loss": 0.023439234122633934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3439233700628392e-05, "grad_norm": 15.394001960754395, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8685758709907532, "num_tokens": 223571255.0, "step": 5858 }, { "epoch": 0.7453250222617988, "ewc_loss": 0.023391973227262497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3391972717945464e-05, "grad_norm": 15.411907196044922, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8629739880561829, "num_tokens": 223616608.0, "step": 5859 }, { "epoch": 0.7454522325403893, "ewc_loss": 0.023420313373208046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.342031257285271e-05, "grad_norm": 15.458935737609863, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8601057529449463, "num_tokens": 223656188.0, "step": 5860 }, { "epoch": 0.7455794428189798, "ewc_loss": 0.023378770798444748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3378770492854528e-05, "grad_norm": 15.398056983947754, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8555259704589844, "num_tokens": 223693787.0, "step": 5861 }, { "epoch": 0.7457066530975703, "ewc_loss": 0.023400668054819107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3400667487294413e-05, "grad_norm": 15.503957748413086, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8711536526679993, "num_tokens": 223729350.0, "step": 5862 }, { "epoch": 0.7458338633761608, "ewc_loss": 0.023427367210388184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.342736661375966e-05, "grad_norm": 15.349466323852539, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8482469320297241, "num_tokens": 223770846.0, "step": 5863 }, { "epoch": 0.7459610736547513, "ewc_loss": 0.023386649787425995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.338665035495069e-05, "grad_norm": 15.442399978637695, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8597191572189331, "num_tokens": 223814194.0, "step": 5864 }, { "epoch": 0.7460882839333418, "ewc_loss": 0.023422660306096077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3422660888172686e-05, "grad_norm": 15.407282829284668, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8526461124420166, "num_tokens": 223849141.0, "step": 5865 }, { "epoch": 0.7462154942119323, "ewc_loss": 0.02336967922747135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3369679183815606e-05, "grad_norm": 15.376984596252441, "learning_rate": 1e-06, "loss": 0.3781, "mean_token_accuracy": 0.8808003067970276, "num_tokens": 223888654.0, "step": 5866 }, { "epoch": 0.7463427044905229, "ewc_loss": 0.02339114062488079, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3391141439788043e-05, "grad_norm": 15.45304012298584, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8628665804862976, "num_tokens": 223924509.0, "step": 5867 }, { "epoch": 0.7464699147691134, "ewc_loss": 0.023418406024575233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3418406271957792e-05, "grad_norm": 15.38284969329834, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8448360562324524, "num_tokens": 223966191.0, "step": 5868 }, { "epoch": 0.7465971250477038, "ewc_loss": 0.023410841822624207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3410841095028445e-05, "grad_norm": 15.39284610748291, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8619617223739624, "num_tokens": 224001415.0, "step": 5869 }, { "epoch": 0.7467243353262943, "ewc_loss": 0.023462969809770584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3462969693355262e-05, "grad_norm": 15.423094749450684, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8660336136817932, "num_tokens": 224042820.0, "step": 5870 }, { "epoch": 0.7468515456048849, "ewc_loss": 0.0234265998005867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3426599000231363e-05, "grad_norm": 15.420512199401855, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8658497333526611, "num_tokens": 224078411.0, "step": 5871 }, { "epoch": 0.7469787558834754, "ewc_loss": 0.023432694375514984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3432694433722645e-05, "grad_norm": 15.379582405090332, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8666177988052368, "num_tokens": 224117136.0, "step": 5872 }, { "epoch": 0.7471059661620659, "ewc_loss": 0.02342981845140457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.342981861147564e-05, "grad_norm": 15.464058876037598, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8540434241294861, "num_tokens": 224155879.0, "step": 5873 }, { "epoch": 0.7472331764406565, "ewc_loss": 0.023468991741538048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3468992367270403e-05, "grad_norm": 15.416075706481934, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8714205026626587, "num_tokens": 224195186.0, "step": 5874 }, { "epoch": 0.7473603867192469, "ewc_loss": 0.023413866758346558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3413866074406542e-05, "grad_norm": 15.45240592956543, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8570975661277771, "num_tokens": 224235722.0, "step": 5875 }, { "epoch": 0.7474875969978374, "ewc_loss": 0.023412832990288734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3412832888425328e-05, "grad_norm": 15.443490982055664, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8674418926239014, "num_tokens": 224268461.0, "step": 5876 }, { "epoch": 0.7476148072764279, "ewc_loss": 0.023377854377031326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.337785372219514e-05, "grad_norm": 15.42965316772461, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8728581070899963, "num_tokens": 224302921.0, "step": 5877 }, { "epoch": 0.7477420175550185, "ewc_loss": 0.023388801142573357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.338880040042568e-05, "grad_norm": 15.441964149475098, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8613404035568237, "num_tokens": 224344081.0, "step": 5878 }, { "epoch": 0.747869227833609, "ewc_loss": 0.023408956825733185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.340895662200637e-05, "grad_norm": 15.455313682556152, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8479963541030884, "num_tokens": 224384464.0, "step": 5879 }, { "epoch": 0.7479964381121995, "ewc_loss": 0.02341092750430107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3410928406519815e-05, "grad_norm": 15.446919441223145, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8690828084945679, "num_tokens": 224425542.0, "step": 5880 }, { "epoch": 0.7481236483907899, "ewc_loss": 0.023352395743131638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3352395146503113e-05, "grad_norm": 15.42089557647705, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8776945471763611, "num_tokens": 224465342.0, "step": 5881 }, { "epoch": 0.7482508586693805, "ewc_loss": 0.023404138162732124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.340413811907638e-05, "grad_norm": 15.39652156829834, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8657497763633728, "num_tokens": 224504787.0, "step": 5882 }, { "epoch": 0.748378068947971, "ewc_loss": 0.02338641695678234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3386417524307035e-05, "grad_norm": 15.49908447265625, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8607262969017029, "num_tokens": 224548534.0, "step": 5883 }, { "epoch": 0.7485052792265615, "ewc_loss": 0.023422786965966225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.342278639844153e-05, "grad_norm": 15.432435035705566, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8524171710014343, "num_tokens": 224590251.0, "step": 5884 }, { "epoch": 0.748632489505152, "ewc_loss": 0.023343244567513466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3343243810813874e-05, "grad_norm": 15.396208763122559, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8704553842544556, "num_tokens": 224633745.0, "step": 5885 }, { "epoch": 0.7487596997837426, "ewc_loss": 0.023403016850352287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3403017621603794e-05, "grad_norm": 15.474030494689941, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8449816107749939, "num_tokens": 224671319.0, "step": 5886 }, { "epoch": 0.748886910062333, "ewc_loss": 0.023403925821185112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3403925297316164e-05, "grad_norm": 15.445070266723633, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8608099818229675, "num_tokens": 224712670.0, "step": 5887 }, { "epoch": 0.7490141203409235, "ewc_loss": 0.023394957184791565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3394957679556683e-05, "grad_norm": 15.409157752990723, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.858548641204834, "num_tokens": 224747840.0, "step": 5888 }, { "epoch": 0.749141330619514, "ewc_loss": 0.02341149002313614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.341149047424551e-05, "grad_norm": 15.513361930847168, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8704155683517456, "num_tokens": 224783274.0, "step": 5889 }, { "epoch": 0.7492685408981046, "ewc_loss": 0.023437416180968285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.343741653021425e-05, "grad_norm": 15.438761711120605, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8555080890655518, "num_tokens": 224820456.0, "step": 5890 }, { "epoch": 0.7493957511766951, "ewc_loss": 0.023364771157503128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.336477155040484e-05, "grad_norm": 15.405823707580566, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8649221658706665, "num_tokens": 224858569.0, "step": 5891 }, { "epoch": 0.7495229614552856, "ewc_loss": 0.023417361080646515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3417360353050753e-05, "grad_norm": 15.417774200439453, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8595185279846191, "num_tokens": 224901033.0, "step": 5892 }, { "epoch": 0.749650171733876, "ewc_loss": 0.023411603644490242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.341160325158853e-05, "grad_norm": 15.385574340820312, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8754213452339172, "num_tokens": 224934345.0, "step": 5893 }, { "epoch": 0.7497773820124666, "ewc_loss": 0.02345067262649536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3450673324987292e-05, "grad_norm": 15.40106201171875, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8533962965011597, "num_tokens": 224972479.0, "step": 5894 }, { "epoch": 0.7499045922910571, "ewc_loss": 0.023500749841332436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.350075010326691e-05, "grad_norm": 15.477653503417969, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8773690462112427, "num_tokens": 225014213.0, "step": 5895 }, { "epoch": 0.7500318025696476, "ewc_loss": 0.023472661152482033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3472661268897355e-05, "grad_norm": 15.37846565246582, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8667583465576172, "num_tokens": 225049929.0, "step": 5896 }, { "epoch": 0.7501590128482382, "ewc_loss": 0.023467442020773888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.346744258829858e-05, "grad_norm": 15.479947090148926, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8453788757324219, "num_tokens": 225086409.0, "step": 5897 }, { "epoch": 0.7502862231268287, "ewc_loss": 0.023498203605413437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3498203518101946e-05, "grad_norm": 15.417701721191406, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8425021171569824, "num_tokens": 225124443.0, "step": 5898 }, { "epoch": 0.7504134334054191, "ewc_loss": 0.023476526141166687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.347652662137989e-05, "grad_norm": 15.391539573669434, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8503758311271667, "num_tokens": 225166665.0, "step": 5899 }, { "epoch": 0.7505406436840096, "ewc_loss": 0.02349521964788437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.349522037548013e-05, "grad_norm": 15.427260398864746, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.8369301557540894, "num_tokens": 225203841.0, "step": 5900 }, { "epoch": 0.7506678539626002, "ewc_loss": 0.023502741008996964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3502741896663792e-05, "grad_norm": 15.437384605407715, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8442208170890808, "num_tokens": 225234844.0, "step": 5901 }, { "epoch": 0.7507950642411907, "ewc_loss": 0.023505134508013725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3505133867729455e-05, "grad_norm": 15.455536842346191, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8423197865486145, "num_tokens": 225269828.0, "step": 5902 }, { "epoch": 0.7509222745197812, "ewc_loss": 0.023502357304096222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3502358089899644e-05, "grad_norm": 15.3805513381958, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8654580116271973, "num_tokens": 225312361.0, "step": 5903 }, { "epoch": 0.7510494847983717, "ewc_loss": 0.023554012179374695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3554011931992136e-05, "grad_norm": 15.507492065429688, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8498132228851318, "num_tokens": 225353708.0, "step": 5904 }, { "epoch": 0.7511766950769622, "ewc_loss": 0.023516690358519554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3516689907410182e-05, "grad_norm": 15.329047203063965, "learning_rate": 1e-06, "loss": 0.3694, "mean_token_accuracy": 0.878866970539093, "num_tokens": 225399730.0, "step": 5905 }, { "epoch": 0.7513039053555527, "ewc_loss": 0.023474013432860374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.347401277802419e-05, "grad_norm": 15.467158317565918, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8534090518951416, "num_tokens": 225441836.0, "step": 5906 }, { "epoch": 0.7514311156341432, "ewc_loss": 0.023563120514154434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.356311961193569e-05, "grad_norm": 15.383359909057617, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8671216368675232, "num_tokens": 225471004.0, "step": 5907 }, { "epoch": 0.7515583259127337, "ewc_loss": 0.023500557988882065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3500557290390134e-05, "grad_norm": 15.51574420928955, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8423266410827637, "num_tokens": 225505789.0, "step": 5908 }, { "epoch": 0.7516855361913243, "ewc_loss": 0.023542972281575203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3542972485302016e-05, "grad_norm": 15.38305377960205, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.86297208070755, "num_tokens": 225541138.0, "step": 5909 }, { "epoch": 0.7518127464699148, "ewc_loss": 0.023538723587989807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3538723326055333e-05, "grad_norm": 15.427563667297363, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8633595108985901, "num_tokens": 225581695.0, "step": 5910 }, { "epoch": 0.7519399567485053, "ewc_loss": 0.023573841899633408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3573842554469593e-05, "grad_norm": 15.39082145690918, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8660498261451721, "num_tokens": 225619532.0, "step": 5911 }, { "epoch": 0.7520671670270958, "ewc_loss": 0.02354884333908558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3548844183096662e-05, "grad_norm": 15.525809288024902, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8526310920715332, "num_tokens": 225657403.0, "step": 5912 }, { "epoch": 0.7521943773056863, "ewc_loss": 0.02356213703751564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3562137357657775e-05, "grad_norm": 15.397132873535156, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8564025163650513, "num_tokens": 225693402.0, "step": 5913 }, { "epoch": 0.7523215875842768, "ewc_loss": 0.02349402755498886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3494027118431404e-05, "grad_norm": 15.4926118850708, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8589054942131042, "num_tokens": 225727208.0, "step": 5914 }, { "epoch": 0.7524487978628673, "ewc_loss": 0.023598521947860718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3598522602696903e-05, "grad_norm": 15.43919849395752, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8643251657485962, "num_tokens": 225762117.0, "step": 5915 }, { "epoch": 0.7525760081414579, "ewc_loss": 0.023588387295603752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3588387193740346e-05, "grad_norm": 15.412156105041504, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8450714349746704, "num_tokens": 225794737.0, "step": 5916 }, { "epoch": 0.7527032184200484, "ewc_loss": 0.023595167323946953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.359516656724736e-05, "grad_norm": 15.440101623535156, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8587783575057983, "num_tokens": 225829074.0, "step": 5917 }, { "epoch": 0.7528304286986388, "ewc_loss": 0.02356189303100109, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.35618936130777e-05, "grad_norm": 15.329907417297363, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.873467743396759, "num_tokens": 225871344.0, "step": 5918 }, { "epoch": 0.7529576389772293, "ewc_loss": 0.023623405024409294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3623404558748007e-05, "grad_norm": 15.472860336303711, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8549690246582031, "num_tokens": 225911321.0, "step": 5919 }, { "epoch": 0.7530848492558199, "ewc_loss": 0.023606611415743828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.360661164857447e-05, "grad_norm": 15.410537719726562, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8685033321380615, "num_tokens": 225944613.0, "step": 5920 }, { "epoch": 0.7532120595344104, "ewc_loss": 0.023592466488480568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.35924671869725e-05, "grad_norm": 15.457548141479492, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8655418157577515, "num_tokens": 225989501.0, "step": 5921 }, { "epoch": 0.7533392698130009, "ewc_loss": 0.023644618690013885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3644619432161562e-05, "grad_norm": 15.483421325683594, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8613085746765137, "num_tokens": 226024984.0, "step": 5922 }, { "epoch": 0.7534664800915915, "ewc_loss": 0.023616855964064598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3616856196895242e-05, "grad_norm": 15.45263385772705, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8728127479553223, "num_tokens": 226063255.0, "step": 5923 }, { "epoch": 0.7535936903701819, "ewc_loss": 0.023564090952277184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3564090952277184e-05, "grad_norm": 15.466194152832031, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.848421037197113, "num_tokens": 226105538.0, "step": 5924 }, { "epoch": 0.7537209006487724, "ewc_loss": 0.02360854670405388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3608547053299844e-05, "grad_norm": 15.506386756896973, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8634568452835083, "num_tokens": 226137449.0, "step": 5925 }, { "epoch": 0.7538481109273629, "ewc_loss": 0.023575803264975548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3575803425046615e-05, "grad_norm": 15.447005271911621, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8545379638671875, "num_tokens": 226171078.0, "step": 5926 }, { "epoch": 0.7539753212059535, "ewc_loss": 0.023562708869576454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.356270852033049e-05, "grad_norm": 15.43281364440918, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8622769117355347, "num_tokens": 226208060.0, "step": 5927 }, { "epoch": 0.754102531484544, "ewc_loss": 0.023590214550495148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.359021527809091e-05, "grad_norm": 15.379454612731934, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8698300719261169, "num_tokens": 226245330.0, "step": 5928 }, { "epoch": 0.7542297417631345, "ewc_loss": 0.023567862808704376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3567863536300138e-05, "grad_norm": 15.470298767089844, "learning_rate": 1e-06, "loss": 0.3799, "mean_token_accuracy": 0.8812664747238159, "num_tokens": 226281974.0, "step": 5929 }, { "epoch": 0.7543569520417249, "ewc_loss": 0.02357299067080021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3572991267428733e-05, "grad_norm": 15.324307441711426, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8505271077156067, "num_tokens": 226323302.0, "step": 5930 }, { "epoch": 0.7544841623203155, "ewc_loss": 0.023594282567501068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3594282538397238e-05, "grad_norm": 15.478496551513672, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8474685549736023, "num_tokens": 226361346.0, "step": 5931 }, { "epoch": 0.754611372598906, "ewc_loss": 0.02370816096663475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3708160370006226e-05, "grad_norm": 15.498265266418457, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8561657667160034, "num_tokens": 226397744.0, "step": 5932 }, { "epoch": 0.7547385828774965, "ewc_loss": 0.023583874106407166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3583874281030148e-05, "grad_norm": 15.405780792236328, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8680967092514038, "num_tokens": 226436745.0, "step": 5933 }, { "epoch": 0.754865793156087, "ewc_loss": 0.023592613637447357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3592612706124783e-05, "grad_norm": 15.461343765258789, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8685222864151001, "num_tokens": 226470302.0, "step": 5934 }, { "epoch": 0.7549930034346776, "ewc_loss": 0.023638363927602768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3638363927602768e-05, "grad_norm": 15.479235649108887, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8620879054069519, "num_tokens": 226500090.0, "step": 5935 }, { "epoch": 0.755120213713268, "ewc_loss": 0.02362140640616417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3621407308382913e-05, "grad_norm": 15.507955551147461, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8615131378173828, "num_tokens": 226533359.0, "step": 5936 }, { "epoch": 0.7552474239918585, "ewc_loss": 0.023668579757213593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3668579160585068e-05, "grad_norm": 15.51920223236084, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8509313464164734, "num_tokens": 226567060.0, "step": 5937 }, { "epoch": 0.755374634270449, "ewc_loss": 0.023628242313861847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.362824307056144e-05, "grad_norm": 15.536609649658203, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8587941527366638, "num_tokens": 226604270.0, "step": 5938 }, { "epoch": 0.7555018445490396, "ewc_loss": 0.023604420945048332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3604421585332602e-05, "grad_norm": 15.395844459533691, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8583585619926453, "num_tokens": 226641310.0, "step": 5939 }, { "epoch": 0.7556290548276301, "ewc_loss": 0.023614073172211647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3614073143107817e-05, "grad_norm": 15.479406356811523, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8493210077285767, "num_tokens": 226678305.0, "step": 5940 }, { "epoch": 0.7557562651062206, "ewc_loss": 0.02369280718266964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.36928080994403e-05, "grad_norm": 15.47885513305664, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8671437501907349, "num_tokens": 226715628.0, "step": 5941 }, { "epoch": 0.755883475384811, "ewc_loss": 0.023622499778866768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3622500521014445e-05, "grad_norm": 15.394782066345215, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8722352981567383, "num_tokens": 226755346.0, "step": 5942 }, { "epoch": 0.7560106856634016, "ewc_loss": 0.023623263463377953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.362326267757453e-05, "grad_norm": 15.450063705444336, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8495688438415527, "num_tokens": 226792243.0, "step": 5943 }, { "epoch": 0.7561378959419921, "ewc_loss": 0.023708000779151917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3708000298938714e-05, "grad_norm": 15.528322219848633, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8742989897727966, "num_tokens": 226829033.0, "step": 5944 }, { "epoch": 0.7562651062205826, "ewc_loss": 0.02365723066031933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3657230485696346e-05, "grad_norm": 15.427496910095215, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8745908737182617, "num_tokens": 226870645.0, "step": 5945 }, { "epoch": 0.7563923164991732, "ewc_loss": 0.023602159693837166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3602160581503995e-05, "grad_norm": 15.512474060058594, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8657348155975342, "num_tokens": 226902395.0, "step": 5946 }, { "epoch": 0.7565195267777637, "ewc_loss": 0.02365337871015072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3653377866139635e-05, "grad_norm": 15.424599647521973, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8668402433395386, "num_tokens": 226942620.0, "step": 5947 }, { "epoch": 0.7566467370563541, "ewc_loss": 0.0235932394862175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3593240257469006e-05, "grad_norm": 15.464564323425293, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8683197498321533, "num_tokens": 226976472.0, "step": 5948 }, { "epoch": 0.7567739473349446, "ewc_loss": 0.023694558069109917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.369455796724651e-05, "grad_norm": 15.556827545166016, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8507394790649414, "num_tokens": 227013626.0, "step": 5949 }, { "epoch": 0.7569011576135352, "ewc_loss": 0.023606695234775543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3606695322087035e-05, "grad_norm": 15.42362117767334, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8483333587646484, "num_tokens": 227049336.0, "step": 5950 }, { "epoch": 0.7570283678921257, "ewc_loss": 0.023636696860194206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.363669773330912e-05, "grad_norm": 15.483270645141602, "learning_rate": 1e-06, "loss": 0.3665, "mean_token_accuracy": 0.8837968707084656, "num_tokens": 227087371.0, "step": 5951 }, { "epoch": 0.7571555781707162, "ewc_loss": 0.023652682080864906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3652681193198077e-05, "grad_norm": 15.41736125946045, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8625568151473999, "num_tokens": 227129188.0, "step": 5952 }, { "epoch": 0.7572827884493067, "ewc_loss": 0.023603789508342743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3603788577020168e-05, "grad_norm": 15.506675720214844, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8532516956329346, "num_tokens": 227161807.0, "step": 5953 }, { "epoch": 0.7574099987278972, "ewc_loss": 0.023671401664614677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.367140223213937e-05, "grad_norm": 15.475089073181152, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8519554138183594, "num_tokens": 227198431.0, "step": 5954 }, { "epoch": 0.7575372090064877, "ewc_loss": 0.023587258532643318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3587259420310147e-05, "grad_norm": 15.448282241821289, "learning_rate": 1e-06, "loss": 0.3682, "mean_token_accuracy": 0.8800200819969177, "num_tokens": 227236046.0, "step": 5955 }, { "epoch": 0.7576644192850782, "ewc_loss": 0.023642854765057564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3642855012440123e-05, "grad_norm": 15.444186210632324, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8767541646957397, "num_tokens": 227272925.0, "step": 5956 }, { "epoch": 0.7577916295636687, "ewc_loss": 0.02361268736422062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3612687073182315e-05, "grad_norm": 15.487411499023438, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8649090528488159, "num_tokens": 227312311.0, "step": 5957 }, { "epoch": 0.7579188398422593, "ewc_loss": 0.023647887632250786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3647888156119734e-05, "grad_norm": 15.447164535522461, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8812047243118286, "num_tokens": 227349632.0, "step": 5958 }, { "epoch": 0.7580460501208498, "ewc_loss": 0.02363145723938942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3631457224837504e-05, "grad_norm": 15.42185115814209, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8596842288970947, "num_tokens": 227392982.0, "step": 5959 }, { "epoch": 0.7581732603994403, "ewc_loss": 0.023604340851306915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3604341549798846e-05, "grad_norm": 15.438948631286621, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8577567338943481, "num_tokens": 227431878.0, "step": 5960 }, { "epoch": 0.7583004706780307, "ewc_loss": 0.023610340431332588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.361034057685174e-05, "grad_norm": 15.429609298706055, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8509094715118408, "num_tokens": 227470401.0, "step": 5961 }, { "epoch": 0.7584276809566213, "ewc_loss": 0.023640261963009834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3640261133550666e-05, "grad_norm": 15.476362228393555, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8660390377044678, "num_tokens": 227507189.0, "step": 5962 }, { "epoch": 0.7585548912352118, "ewc_loss": 0.023639211431145668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.363921157666482e-05, "grad_norm": 15.474852561950684, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.865254819393158, "num_tokens": 227548070.0, "step": 5963 }, { "epoch": 0.7586821015138023, "ewc_loss": 0.02365947887301445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3659478756599128e-05, "grad_norm": 15.446645736694336, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8660522103309631, "num_tokens": 227584360.0, "step": 5964 }, { "epoch": 0.7588093117923929, "ewc_loss": 0.023612262681126595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.361226324865129e-05, "grad_norm": 15.492169380187988, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8680282235145569, "num_tokens": 227618961.0, "step": 5965 }, { "epoch": 0.7589365220709834, "ewc_loss": 0.023635443300008774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3635442630620673e-05, "grad_norm": 15.443116188049316, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8623252511024475, "num_tokens": 227658304.0, "step": 5966 }, { "epoch": 0.7590637323495738, "ewc_loss": 0.02357461303472519, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3574613805976696e-05, "grad_norm": 15.483840942382812, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8588519096374512, "num_tokens": 227701526.0, "step": 5967 }, { "epoch": 0.7591909426281643, "ewc_loss": 0.023618051782250404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.361805127293337e-05, "grad_norm": 15.427825927734375, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8668306469917297, "num_tokens": 227744930.0, "step": 5968 }, { "epoch": 0.7593181529067549, "ewc_loss": 0.02359137311577797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3591373974340968e-05, "grad_norm": 15.531070709228516, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8724038004875183, "num_tokens": 227780607.0, "step": 5969 }, { "epoch": 0.7594453631853454, "ewc_loss": 0.023645613342523575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3645612600375898e-05, "grad_norm": 15.423365592956543, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.857986569404602, "num_tokens": 227825742.0, "step": 5970 }, { "epoch": 0.7595725734639359, "ewc_loss": 0.023536207154393196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.353620766371023e-05, "grad_norm": 15.437999725341797, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8426097631454468, "num_tokens": 227865278.0, "step": 5971 }, { "epoch": 0.7596997837425264, "ewc_loss": 0.023638298735022545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.363829844398424e-05, "grad_norm": 15.545209884643555, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8581342697143555, "num_tokens": 227906912.0, "step": 5972 }, { "epoch": 0.7598269940211169, "ewc_loss": 0.023594262078404427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.35942625295138e-05, "grad_norm": 15.434614181518555, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8595707416534424, "num_tokens": 227947067.0, "step": 5973 }, { "epoch": 0.7599542042997074, "ewc_loss": 0.023562544956803322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.356254481128417e-05, "grad_norm": 15.517415046691895, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8434054851531982, "num_tokens": 227988563.0, "step": 5974 }, { "epoch": 0.7600814145782979, "ewc_loss": 0.023618480190634727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3618480554432608e-05, "grad_norm": 15.47111701965332, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8538895845413208, "num_tokens": 228028556.0, "step": 5975 }, { "epoch": 0.7602086248568884, "ewc_loss": 0.02353687584400177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.353687523282133e-05, "grad_norm": 15.458563804626465, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8758131265640259, "num_tokens": 228066265.0, "step": 5976 }, { "epoch": 0.760335835135479, "ewc_loss": 0.023582924157381058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3582924768561497e-05, "grad_norm": 15.543514251708984, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8586657047271729, "num_tokens": 228103732.0, "step": 5977 }, { "epoch": 0.7604630454140695, "ewc_loss": 0.023580189794301987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3580189008498564e-05, "grad_norm": 15.406911849975586, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.859093189239502, "num_tokens": 228149609.0, "step": 5978 }, { "epoch": 0.7605902556926599, "ewc_loss": 0.023539908230304718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.353990748815704e-05, "grad_norm": 15.551280975341797, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8637461066246033, "num_tokens": 228188462.0, "step": 5979 }, { "epoch": 0.7607174659712505, "ewc_loss": 0.0235727708786726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3572771169710904e-05, "grad_norm": 15.470641136169434, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8746228814125061, "num_tokens": 228229252.0, "step": 5980 }, { "epoch": 0.760844676249841, "ewc_loss": 0.02350880205631256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3508802769356407e-05, "grad_norm": 15.507817268371582, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8588484525680542, "num_tokens": 228267397.0, "step": 5981 }, { "epoch": 0.7609718865284315, "ewc_loss": 0.02354677952826023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3546779630123638e-05, "grad_norm": 15.430607795715332, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8691561222076416, "num_tokens": 228307561.0, "step": 5982 }, { "epoch": 0.761099096807022, "ewc_loss": 0.02346770092844963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3467700884793885e-05, "grad_norm": 15.436476707458496, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8636274337768555, "num_tokens": 228347667.0, "step": 5983 }, { "epoch": 0.7612263070856126, "ewc_loss": 0.023545777425169945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3545777366962284e-05, "grad_norm": 15.408575057983398, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8597358465194702, "num_tokens": 228386820.0, "step": 5984 }, { "epoch": 0.761353517364203, "ewc_loss": 0.023600557819008827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.360055805183947e-05, "grad_norm": 15.471829414367676, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8712102174758911, "num_tokens": 228427304.0, "step": 5985 }, { "epoch": 0.7614807276427935, "ewc_loss": 0.023565998300909996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3565999072161503e-05, "grad_norm": 15.390876770019531, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8520410060882568, "num_tokens": 228470779.0, "step": 5986 }, { "epoch": 0.761607937921384, "ewc_loss": 0.023547179996967316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3547179807792418e-05, "grad_norm": 15.509870529174805, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8780786991119385, "num_tokens": 228500454.0, "step": 5987 }, { "epoch": 0.7617351481999746, "ewc_loss": 0.023618381470441818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3618382329004817e-05, "grad_norm": 15.49068832397461, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8697025179862976, "num_tokens": 228537374.0, "step": 5988 }, { "epoch": 0.7618623584785651, "ewc_loss": 0.023571370169520378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3571370547870174e-05, "grad_norm": 15.499987602233887, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8622999787330627, "num_tokens": 228577215.0, "step": 5989 }, { "epoch": 0.7619895687571556, "ewc_loss": 0.023594029247760773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3594029698870145e-05, "grad_norm": 15.469475746154785, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8656555414199829, "num_tokens": 228611693.0, "step": 5990 }, { "epoch": 0.762116779035746, "ewc_loss": 0.02355428971350193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.355429023737088e-05, "grad_norm": 15.449851036071777, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8604874610900879, "num_tokens": 228644231.0, "step": 5991 }, { "epoch": 0.7622439893143366, "ewc_loss": 0.02359931543469429, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.359931568207685e-05, "grad_norm": 15.487701416015625, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8719196319580078, "num_tokens": 228684776.0, "step": 5992 }, { "epoch": 0.7623711995929271, "ewc_loss": 0.023615755140781403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3615755708306096e-05, "grad_norm": 15.485771179199219, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8681938052177429, "num_tokens": 228724214.0, "step": 5993 }, { "epoch": 0.7624984098715176, "ewc_loss": 0.023600134998559952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3600134227308445e-05, "grad_norm": 15.436220169067383, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8601331114768982, "num_tokens": 228765098.0, "step": 5994 }, { "epoch": 0.7626256201501082, "ewc_loss": 0.023575913161039352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3575912564410828e-05, "grad_norm": 15.485125541687012, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8627938032150269, "num_tokens": 228802423.0, "step": 5995 }, { "epoch": 0.7627528304286987, "ewc_loss": 0.02364354208111763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3643542590434663e-05, "grad_norm": 15.47897720336914, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.851514458656311, "num_tokens": 228840218.0, "step": 5996 }, { "epoch": 0.7628800407072891, "ewc_loss": 0.023550502955913544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3550503101432696e-05, "grad_norm": 15.432629585266113, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8666284084320068, "num_tokens": 228882810.0, "step": 5997 }, { "epoch": 0.7630072509858796, "ewc_loss": 0.023644261062145233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3644261091249064e-05, "grad_norm": 15.5200834274292, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8682818412780762, "num_tokens": 228917664.0, "step": 5998 }, { "epoch": 0.7631344612644702, "ewc_loss": 0.02360454946756363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.360454891459085e-05, "grad_norm": 15.412504196166992, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.868632435798645, "num_tokens": 228951017.0, "step": 5999 }, { "epoch": 0.7632616715430607, "ewc_loss": 0.023561270907521248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3561271518701687e-05, "grad_norm": 15.488454818725586, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8636492490768433, "num_tokens": 228990949.0, "step": 6000 }, { "epoch": 0.7633888818216512, "ewc_loss": 0.023684699088335037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.368469904467929e-05, "grad_norm": 15.458864212036133, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8665125370025635, "num_tokens": 229026604.0, "step": 6001 }, { "epoch": 0.7635160921002417, "ewc_loss": 0.023575225844979286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3575224986416288e-05, "grad_norm": 15.388284683227539, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8560915589332581, "num_tokens": 229061810.0, "step": 6002 }, { "epoch": 0.7636433023788322, "ewc_loss": 0.02364337630569935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.364337706239894e-05, "grad_norm": 15.496553421020508, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8388183116912842, "num_tokens": 229107765.0, "step": 6003 }, { "epoch": 0.7637705126574227, "ewc_loss": 0.023619171231985092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3619171770405956e-05, "grad_norm": 15.354381561279297, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8496295213699341, "num_tokens": 229147822.0, "step": 6004 }, { "epoch": 0.7638977229360132, "ewc_loss": 0.02363193966448307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3631939257029444e-05, "grad_norm": 15.586036682128906, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8715798854827881, "num_tokens": 229184421.0, "step": 6005 }, { "epoch": 0.7640249332146037, "ewc_loss": 0.023687388747930527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3687389330007136e-05, "grad_norm": 15.429241180419922, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8622785210609436, "num_tokens": 229221737.0, "step": 6006 }, { "epoch": 0.7641521434931943, "ewc_loss": 0.023592988029122353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3592987417941913e-05, "grad_norm": 15.535444259643555, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8557587265968323, "num_tokens": 229254727.0, "step": 6007 }, { "epoch": 0.7642793537717848, "ewc_loss": 0.023706113919615746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3706114006927237e-05, "grad_norm": 15.468315124511719, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8730095624923706, "num_tokens": 229292648.0, "step": 6008 }, { "epoch": 0.7644065640503753, "ewc_loss": 0.02360880933701992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3608808987773955e-05, "grad_norm": 15.4896240234375, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.866549551486969, "num_tokens": 229335708.0, "step": 6009 }, { "epoch": 0.7645337743289657, "ewc_loss": 0.023667875677347183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3667875211685896e-05, "grad_norm": 15.473593711853027, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8705818057060242, "num_tokens": 229373791.0, "step": 6010 }, { "epoch": 0.7646609846075563, "ewc_loss": 0.023661822080612183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3661821614950895e-05, "grad_norm": 15.476058959960938, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8720159530639648, "num_tokens": 229409891.0, "step": 6011 }, { "epoch": 0.7647881948861468, "ewc_loss": 0.02365729585289955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3657295969314873e-05, "grad_norm": 15.465397834777832, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8634772300720215, "num_tokens": 229449859.0, "step": 6012 }, { "epoch": 0.7649154051647373, "ewc_loss": 0.023656414821743965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3656415578443557e-05, "grad_norm": 15.44534969329834, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8692924976348877, "num_tokens": 229484224.0, "step": 6013 }, { "epoch": 0.7650426154433279, "ewc_loss": 0.023656222969293594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.365622276556678e-05, "grad_norm": 15.472293853759766, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8580302596092224, "num_tokens": 229521104.0, "step": 6014 }, { "epoch": 0.7651698257219184, "ewc_loss": 0.023671235889196396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.367123670410365e-05, "grad_norm": 15.470324516296387, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8464345932006836, "num_tokens": 229558918.0, "step": 6015 }, { "epoch": 0.7652970360005088, "ewc_loss": 0.023688625544309616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3688626242801547e-05, "grad_norm": 15.498741149902344, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.848853588104248, "num_tokens": 229596931.0, "step": 6016 }, { "epoch": 0.7654242462790993, "ewc_loss": 0.02367456816136837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.367456727370154e-05, "grad_norm": 15.493264198303223, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8480690717697144, "num_tokens": 229633903.0, "step": 6017 }, { "epoch": 0.7655514565576899, "ewc_loss": 0.02364959381520748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.364959436818026e-05, "grad_norm": 15.515827178955078, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.859137237071991, "num_tokens": 229674224.0, "step": 6018 }, { "epoch": 0.7656786668362804, "ewc_loss": 0.023687373846769333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3687372959102504e-05, "grad_norm": 15.504179000854492, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8604276180267334, "num_tokens": 229710547.0, "step": 6019 }, { "epoch": 0.7658058771148709, "ewc_loss": 0.02359659969806671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3596599930897355e-05, "grad_norm": 15.435830116271973, "learning_rate": 1e-06, "loss": 0.3765, "mean_token_accuracy": 0.8779850602149963, "num_tokens": 229752020.0, "step": 6020 }, { "epoch": 0.7659330873934614, "ewc_loss": 0.023687152191996574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3687152861384675e-05, "grad_norm": 15.532437324523926, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.859259843826294, "num_tokens": 229785710.0, "step": 6021 }, { "epoch": 0.7660602976720519, "ewc_loss": 0.023667367175221443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3667367713642307e-05, "grad_norm": 15.431050300598145, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8690456748008728, "num_tokens": 229824118.0, "step": 6022 }, { "epoch": 0.7661875079506424, "ewc_loss": 0.023679867386817932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3679867808823474e-05, "grad_norm": 15.590121269226074, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.873496949672699, "num_tokens": 229854231.0, "step": 6023 }, { "epoch": 0.7663147182292329, "ewc_loss": 0.023720204830169678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3720203898847103e-05, "grad_norm": 15.468057632446289, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8516631126403809, "num_tokens": 229888102.0, "step": 6024 }, { "epoch": 0.7664419285078234, "ewc_loss": 0.023683737963438034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.368373861827422e-05, "grad_norm": 15.554210662841797, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8666869401931763, "num_tokens": 229927634.0, "step": 6025 }, { "epoch": 0.766569138786414, "ewc_loss": 0.023709049448370934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.370904985582456e-05, "grad_norm": 15.472624778747559, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8706976771354675, "num_tokens": 229966526.0, "step": 6026 }, { "epoch": 0.7666963490650045, "ewc_loss": 0.023666230961680412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.366623084526509e-05, "grad_norm": 15.468140602111816, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8710452318191528, "num_tokens": 230002761.0, "step": 6027 }, { "epoch": 0.7668235593435949, "ewc_loss": 0.023701351135969162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3701351892668754e-05, "grad_norm": 15.431342124938965, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8682186603546143, "num_tokens": 230041303.0, "step": 6028 }, { "epoch": 0.7669507696221854, "ewc_loss": 0.02369742840528488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3697428332525305e-05, "grad_norm": 15.458430290222168, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8661651015281677, "num_tokens": 230081117.0, "step": 6029 }, { "epoch": 0.767077979900776, "ewc_loss": 0.023691872134804726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3691871319897473e-05, "grad_norm": 15.423943519592285, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8573570251464844, "num_tokens": 230119411.0, "step": 6030 }, { "epoch": 0.7672051901793665, "ewc_loss": 0.023717915639281273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3717915610177442e-05, "grad_norm": 15.468487739562988, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8699599504470825, "num_tokens": 230161111.0, "step": 6031 }, { "epoch": 0.767332400457957, "ewc_loss": 0.02368120290338993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3681202947045676e-05, "grad_norm": 15.441400527954102, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8574314117431641, "num_tokens": 230196999.0, "step": 6032 }, { "epoch": 0.7674596107365476, "ewc_loss": 0.023746879771351814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3746879378450103e-05, "grad_norm": 15.601283073425293, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8644518852233887, "num_tokens": 230231495.0, "step": 6033 }, { "epoch": 0.767586821015138, "ewc_loss": 0.023714764043688774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3714763301541097e-05, "grad_norm": 15.523903846740723, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.855154275894165, "num_tokens": 230267732.0, "step": 6034 }, { "epoch": 0.7677140312937285, "ewc_loss": 0.02367316745221615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.367316665186081e-05, "grad_norm": 15.505148887634277, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8618623614311218, "num_tokens": 230297559.0, "step": 6035 }, { "epoch": 0.767841241572319, "ewc_loss": 0.023711584508419037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.37115837080637e-05, "grad_norm": 15.50737476348877, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8504369258880615, "num_tokens": 230336396.0, "step": 6036 }, { "epoch": 0.7679684518509096, "ewc_loss": 0.023676257580518723, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3676257114857435e-05, "grad_norm": 15.490412712097168, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8623141050338745, "num_tokens": 230368152.0, "step": 6037 }, { "epoch": 0.7680956621295001, "ewc_loss": 0.02373136579990387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.373136521782726e-05, "grad_norm": 15.55562973022461, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8676651120185852, "num_tokens": 230408081.0, "step": 6038 }, { "epoch": 0.7682228724080906, "ewc_loss": 0.023729611188173294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3729611712042242e-05, "grad_norm": 15.448894500732422, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8544270396232605, "num_tokens": 230441691.0, "step": 6039 }, { "epoch": 0.768350082686681, "ewc_loss": 0.023709891363978386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.37098920479184e-05, "grad_norm": 15.498835563659668, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8603850603103638, "num_tokens": 230480089.0, "step": 6040 }, { "epoch": 0.7684772929652716, "ewc_loss": 0.02375091053545475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.375091025896836e-05, "grad_norm": 15.451237678527832, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8561605215072632, "num_tokens": 230518655.0, "step": 6041 }, { "epoch": 0.7686045032438621, "ewc_loss": 0.023742256686091423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3742257326375693e-05, "grad_norm": 15.529128074645996, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8648453950881958, "num_tokens": 230555717.0, "step": 6042 }, { "epoch": 0.7687317135224526, "ewc_loss": 0.023768611252307892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3768610844854265e-05, "grad_norm": 15.427030563354492, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8706783056259155, "num_tokens": 230596699.0, "step": 6043 }, { "epoch": 0.7688589238010431, "ewc_loss": 0.023710209876298904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3710210371064022e-05, "grad_norm": 15.527167320251465, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8710089921951294, "num_tokens": 230633284.0, "step": 6044 }, { "epoch": 0.7689861340796337, "ewc_loss": 0.02382095530629158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3820955902920105e-05, "grad_norm": 15.482218742370605, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8662322163581848, "num_tokens": 230673133.0, "step": 6045 }, { "epoch": 0.7691133443582241, "ewc_loss": 0.023720165714621544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.372016570006963e-05, "grad_norm": 15.460851669311523, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8632756471633911, "num_tokens": 230719003.0, "step": 6046 }, { "epoch": 0.7692405546368146, "ewc_loss": 0.023732097819447517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.373209827055689e-05, "grad_norm": 15.483380317687988, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8717702627182007, "num_tokens": 230754805.0, "step": 6047 }, { "epoch": 0.7693677649154052, "ewc_loss": 0.02374420315027237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3744203645037487e-05, "grad_norm": 15.525055885314941, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.849990725517273, "num_tokens": 230793249.0, "step": 6048 }, { "epoch": 0.7694949751939957, "ewc_loss": 0.023779643699526787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.377964301558677e-05, "grad_norm": 15.531357765197754, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8792091608047485, "num_tokens": 230826025.0, "step": 6049 }, { "epoch": 0.7696221854725862, "ewc_loss": 0.023668358102440834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3668357243877836e-05, "grad_norm": 15.467496871948242, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8741627931594849, "num_tokens": 230866615.0, "step": 6050 }, { "epoch": 0.7697493957511767, "ewc_loss": 0.023706544190645218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3706543288426474e-05, "grad_norm": 15.519706726074219, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8465067148208618, "num_tokens": 230901587.0, "step": 6051 }, { "epoch": 0.7698766060297672, "ewc_loss": 0.023753389716148376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3753389541525394e-05, "grad_norm": 15.480111122131348, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8679327964782715, "num_tokens": 230944267.0, "step": 6052 }, { "epoch": 0.7700038163083577, "ewc_loss": 0.023641832172870636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.364183274039533e-05, "grad_norm": 15.451034545898438, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8530661463737488, "num_tokens": 230980022.0, "step": 6053 }, { "epoch": 0.7701310265869482, "ewc_loss": 0.023730220273137093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.373022107349243e-05, "grad_norm": 15.584667205810547, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8685004115104675, "num_tokens": 231018110.0, "step": 6054 }, { "epoch": 0.7702582368655387, "ewc_loss": 0.023682324215769768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.368232344451826e-05, "grad_norm": 15.442766189575195, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8664748668670654, "num_tokens": 231056531.0, "step": 6055 }, { "epoch": 0.7703854471441293, "ewc_loss": 0.023648688569664955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3648688511457294e-05, "grad_norm": 15.449323654174805, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8678626418113708, "num_tokens": 231096270.0, "step": 6056 }, { "epoch": 0.7705126574227198, "ewc_loss": 0.023708593100309372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.370859328948427e-05, "grad_norm": 15.512500762939453, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.855557918548584, "num_tokens": 231136300.0, "step": 6057 }, { "epoch": 0.7706398677013102, "ewc_loss": 0.023702984675765038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3702985345153138e-05, "grad_norm": 15.544483184814453, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8493842482566833, "num_tokens": 231169005.0, "step": 6058 }, { "epoch": 0.7707670779799007, "ewc_loss": 0.02369847148656845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3698470613453537e-05, "grad_norm": 15.502423286437988, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8696349859237671, "num_tokens": 231205274.0, "step": 6059 }, { "epoch": 0.7708942882584913, "ewc_loss": 0.023685071617364883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.368507193750702e-05, "grad_norm": 15.53881549835205, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8678627014160156, "num_tokens": 231239022.0, "step": 6060 }, { "epoch": 0.7710214985370818, "ewc_loss": 0.023716913536190987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.371691334701609e-05, "grad_norm": 15.490371704101562, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8450403213500977, "num_tokens": 231282952.0, "step": 6061 }, { "epoch": 0.7711487088156723, "ewc_loss": 0.023677706718444824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.367770684941206e-05, "grad_norm": 15.59965705871582, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8598663210868835, "num_tokens": 231322980.0, "step": 6062 }, { "epoch": 0.7712759190942629, "ewc_loss": 0.023701218888163567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3701219106442295e-05, "grad_norm": 15.549147605895996, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8544113636016846, "num_tokens": 231358382.0, "step": 6063 }, { "epoch": 0.7714031293728534, "ewc_loss": 0.02362099662423134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3620996216777712e-05, "grad_norm": 15.38426399230957, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8709505200386047, "num_tokens": 231401915.0, "step": 6064 }, { "epoch": 0.7715303396514438, "ewc_loss": 0.02371044084429741, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3710441382718273e-05, "grad_norm": 15.586101531982422, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.839986264705658, "num_tokens": 231433241.0, "step": 6065 }, { "epoch": 0.7716575499300343, "ewc_loss": 0.02371145971119404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3711460016784258e-05, "grad_norm": 15.435961723327637, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8571836948394775, "num_tokens": 231478318.0, "step": 6066 }, { "epoch": 0.7717847602086249, "ewc_loss": 0.02367805875837803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3678057914366946e-05, "grad_norm": 15.52290153503418, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8549869060516357, "num_tokens": 231518584.0, "step": 6067 }, { "epoch": 0.7719119704872154, "ewc_loss": 0.023726388812065125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.372638846281916e-05, "grad_norm": 15.465879440307617, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8624639511108398, "num_tokens": 231556689.0, "step": 6068 }, { "epoch": 0.7720391807658059, "ewc_loss": 0.023713838309049606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3713837435934693e-05, "grad_norm": 15.581637382507324, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8612725734710693, "num_tokens": 231593936.0, "step": 6069 }, { "epoch": 0.7721663910443964, "ewc_loss": 0.02372780814766884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3727807274553925e-05, "grad_norm": 15.491349220275879, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8812451362609863, "num_tokens": 231634785.0, "step": 6070 }, { "epoch": 0.7722936013229869, "ewc_loss": 0.023683220148086548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.368322020629421e-05, "grad_norm": 15.570013046264648, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.858069896697998, "num_tokens": 231675798.0, "step": 6071 }, { "epoch": 0.7724208116015774, "ewc_loss": 0.023739784955978394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3739785319776274e-05, "grad_norm": 15.500631332397461, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8600258231163025, "num_tokens": 231711611.0, "step": 6072 }, { "epoch": 0.7725480218801679, "ewc_loss": 0.02366003394126892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.366003354836721e-05, "grad_norm": 15.506707191467285, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8522495627403259, "num_tokens": 231746997.0, "step": 6073 }, { "epoch": 0.7726752321587584, "ewc_loss": 0.023701351135969162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3701351892668754e-05, "grad_norm": 15.485136985778809, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8424091339111328, "num_tokens": 231787696.0, "step": 6074 }, { "epoch": 0.772802442437349, "ewc_loss": 0.023690933361649513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3690932721365243e-05, "grad_norm": 15.570215225219727, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8537229299545288, "num_tokens": 231826127.0, "step": 6075 }, { "epoch": 0.7729296527159395, "ewc_loss": 0.02369762398302555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.369762478338089e-05, "grad_norm": 15.54636001586914, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8566890954971313, "num_tokens": 231857538.0, "step": 6076 }, { "epoch": 0.7730568629945299, "ewc_loss": 0.02370516024529934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.370516085647978e-05, "grad_norm": 15.513631820678711, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8603436946868896, "num_tokens": 231893913.0, "step": 6077 }, { "epoch": 0.7731840732731204, "ewc_loss": 0.02375851944088936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3758519091643393e-05, "grad_norm": 15.471136093139648, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8645737171173096, "num_tokens": 231934274.0, "step": 6078 }, { "epoch": 0.773311283551711, "ewc_loss": 0.02377455122768879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3774551664246246e-05, "grad_norm": 15.525657653808594, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8558139801025391, "num_tokens": 231983197.0, "step": 6079 }, { "epoch": 0.7734384938303015, "ewc_loss": 0.023867497220635414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.386749656579923e-05, "grad_norm": 15.522493362426758, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8586846590042114, "num_tokens": 232021265.0, "step": 6080 }, { "epoch": 0.773565704108892, "ewc_loss": 0.023728778585791588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3728778614895418e-05, "grad_norm": 15.430977821350098, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8596547842025757, "num_tokens": 232060992.0, "step": 6081 }, { "epoch": 0.7736929143874826, "ewc_loss": 0.023802369832992554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3802369469194673e-05, "grad_norm": 15.541976928710938, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.859923243522644, "num_tokens": 232096758.0, "step": 6082 }, { "epoch": 0.773820124666073, "ewc_loss": 0.023830654099583626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.383065475441981e-05, "grad_norm": 15.524657249450684, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8582544326782227, "num_tokens": 232133773.0, "step": 6083 }, { "epoch": 0.7739473349446635, "ewc_loss": 0.02375873178243637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3758731913403608e-05, "grad_norm": 15.428939819335938, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8599115014076233, "num_tokens": 232173182.0, "step": 6084 }, { "epoch": 0.774074545223254, "ewc_loss": 0.023807717487215996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3807717298041098e-05, "grad_norm": 15.52326488494873, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8699870109558105, "num_tokens": 232206478.0, "step": 6085 }, { "epoch": 0.7742017555018446, "ewc_loss": 0.023786716163158417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3786715246387757e-05, "grad_norm": 15.514100074768066, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8562601208686829, "num_tokens": 232242231.0, "step": 6086 }, { "epoch": 0.7743289657804351, "ewc_loss": 0.023769602179527283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3769602194079198e-05, "grad_norm": 15.539763450622559, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8651408553123474, "num_tokens": 232270636.0, "step": 6087 }, { "epoch": 0.7744561760590256, "ewc_loss": 0.023796355351805687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.379635589022655e-05, "grad_norm": 15.460079193115234, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8674023747444153, "num_tokens": 232309974.0, "step": 6088 }, { "epoch": 0.774583386337616, "ewc_loss": 0.02377624809741974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3776248781359755e-05, "grad_norm": 15.514876365661621, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8706674575805664, "num_tokens": 232344118.0, "step": 6089 }, { "epoch": 0.7747105966162066, "ewc_loss": 0.023813337087631226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.381333615630865e-05, "grad_norm": 15.508160591125488, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8516120910644531, "num_tokens": 232385183.0, "step": 6090 }, { "epoch": 0.7748378068947971, "ewc_loss": 0.023753976449370384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.375397707510274e-05, "grad_norm": 15.425326347351074, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8624383807182312, "num_tokens": 232421670.0, "step": 6091 }, { "epoch": 0.7749650171733876, "ewc_loss": 0.023792728781700134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.379272882535588e-05, "grad_norm": 15.532029151916504, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8493722677230835, "num_tokens": 232459563.0, "step": 6092 }, { "epoch": 0.7750922274519781, "ewc_loss": 0.023817075416445732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.381707599852234e-05, "grad_norm": 15.426802635192871, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8618795275688171, "num_tokens": 232496357.0, "step": 6093 }, { "epoch": 0.7752194377305687, "ewc_loss": 0.023775191977620125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3775191948516294e-05, "grad_norm": 15.480607032775879, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8625248670578003, "num_tokens": 232530436.0, "step": 6094 }, { "epoch": 0.7753466480091591, "ewc_loss": 0.023840000852942467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.384000072197523e-05, "grad_norm": 15.458456039428711, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.855765163898468, "num_tokens": 232568585.0, "step": 6095 }, { "epoch": 0.7754738582877496, "ewc_loss": 0.023777775466442108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.377777491346933e-05, "grad_norm": 15.474576950073242, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.867713212966919, "num_tokens": 232602506.0, "step": 6096 }, { "epoch": 0.7756010685663401, "ewc_loss": 0.023849567398428917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3849566787248477e-05, "grad_norm": 15.42991828918457, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8694242835044861, "num_tokens": 232642819.0, "step": 6097 }, { "epoch": 0.7757282788449307, "ewc_loss": 0.02381698600947857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3816986868041568e-05, "grad_norm": 15.458834648132324, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.850078821182251, "num_tokens": 232679505.0, "step": 6098 }, { "epoch": 0.7758554891235212, "ewc_loss": 0.02387864887714386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.387864878983237e-05, "grad_norm": 15.519824981689453, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.86614990234375, "num_tokens": 232715701.0, "step": 6099 }, { "epoch": 0.7759826994021117, "ewc_loss": 0.02382897213101387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.382897218922153e-05, "grad_norm": 15.48294734954834, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.862816333770752, "num_tokens": 232750786.0, "step": 6100 }, { "epoch": 0.7761099096807021, "ewc_loss": 0.023865072056651115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3865071852924302e-05, "grad_norm": 15.524678230285645, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8501803874969482, "num_tokens": 232791370.0, "step": 6101 }, { "epoch": 0.7762371199592927, "ewc_loss": 0.02384371869266033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3843718736316077e-05, "grad_norm": 15.529853820800781, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8631439208984375, "num_tokens": 232830895.0, "step": 6102 }, { "epoch": 0.7763643302378832, "ewc_loss": 0.023839062079787254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3839062123443e-05, "grad_norm": 15.468070030212402, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8609648942947388, "num_tokens": 232869940.0, "step": 6103 }, { "epoch": 0.7764915405164737, "ewc_loss": 0.023815132677555084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3815133317839354e-05, "grad_norm": 15.564553260803223, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8573002815246582, "num_tokens": 232910743.0, "step": 6104 }, { "epoch": 0.7766187507950643, "ewc_loss": 0.0238371379673481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.383713763265405e-05, "grad_norm": 15.49505615234375, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8609594106674194, "num_tokens": 232948428.0, "step": 6105 }, { "epoch": 0.7767459610736548, "ewc_loss": 0.023786015808582306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3786014935467392e-05, "grad_norm": 15.520378112792969, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8672294616699219, "num_tokens": 232985142.0, "step": 6106 }, { "epoch": 0.7768731713522452, "ewc_loss": 0.02381638064980507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3816381144570187e-05, "grad_norm": 15.52802848815918, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8714824318885803, "num_tokens": 233020810.0, "step": 6107 }, { "epoch": 0.7770003816308357, "ewc_loss": 0.023775365203619003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.377536475250963e-05, "grad_norm": 15.642606735229492, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8540396094322205, "num_tokens": 233055674.0, "step": 6108 }, { "epoch": 0.7771275919094263, "ewc_loss": 0.02384430356323719, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.384430445090402e-05, "grad_norm": 15.562257766723633, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8567969799041748, "num_tokens": 233084506.0, "step": 6109 }, { "epoch": 0.7772548021880168, "ewc_loss": 0.023732664063572884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3732663976261392e-05, "grad_norm": 15.499852180480957, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.8785061836242676, "num_tokens": 233120172.0, "step": 6110 }, { "epoch": 0.7773820124666073, "ewc_loss": 0.02378896065056324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3788959879311733e-05, "grad_norm": 15.552835464477539, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8510030508041382, "num_tokens": 233160075.0, "step": 6111 }, { "epoch": 0.7775092227451978, "ewc_loss": 0.023763375356793404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.376337579335086e-05, "grad_norm": 15.508816719055176, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8540241122245789, "num_tokens": 233201714.0, "step": 6112 }, { "epoch": 0.7776364330237884, "ewc_loss": 0.02376406081020832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3764061552355997e-05, "grad_norm": 15.59058666229248, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8681739568710327, "num_tokens": 233237642.0, "step": 6113 }, { "epoch": 0.7777636433023788, "ewc_loss": 0.02381541021168232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3815409804228693e-05, "grad_norm": 15.456659317016602, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8672431707382202, "num_tokens": 233279712.0, "step": 6114 }, { "epoch": 0.7778908535809693, "ewc_loss": 0.023756500333547592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3756500013405457e-05, "grad_norm": 15.5982084274292, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.866626501083374, "num_tokens": 233322441.0, "step": 6115 }, { "epoch": 0.7780180638595598, "ewc_loss": 0.023863617330789566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3863616661401466e-05, "grad_norm": 15.539164543151855, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8558164834976196, "num_tokens": 233371053.0, "step": 6116 }, { "epoch": 0.7781452741381504, "ewc_loss": 0.023736821487545967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3736822186037898e-05, "grad_norm": 15.545766830444336, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8640635013580322, "num_tokens": 233405372.0, "step": 6117 }, { "epoch": 0.7782724844167409, "ewc_loss": 0.023803753778338432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.380375372013077e-05, "grad_norm": 15.480165481567383, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.868301510810852, "num_tokens": 233447209.0, "step": 6118 }, { "epoch": 0.7783996946953314, "ewc_loss": 0.023741574957966805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3741575205349363e-05, "grad_norm": 15.537323951721191, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8642317056655884, "num_tokens": 233484621.0, "step": 6119 }, { "epoch": 0.7785269049739219, "ewc_loss": 0.02384061925113201, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3840619178372435e-05, "grad_norm": 15.597700119018555, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8445508480072021, "num_tokens": 233520756.0, "step": 6120 }, { "epoch": 0.7786541152525124, "ewc_loss": 0.023779813200235367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3779814000590704e-05, "grad_norm": 15.534100532531738, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8536041975021362, "num_tokens": 233559120.0, "step": 6121 }, { "epoch": 0.7787813255311029, "ewc_loss": 0.023762866854667664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3762866476317868e-05, "grad_norm": 15.550787925720215, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8722801208496094, "num_tokens": 233590513.0, "step": 6122 }, { "epoch": 0.7789085358096934, "ewc_loss": 0.02374017797410488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.374017822148744e-05, "grad_norm": 15.504738807678223, "learning_rate": 1e-06, "loss": 0.3822, "mean_token_accuracy": 0.8779974579811096, "num_tokens": 233625986.0, "step": 6123 }, { "epoch": 0.779035746088284, "ewc_loss": 0.023739010095596313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3739010430290364e-05, "grad_norm": 15.574753761291504, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8705475926399231, "num_tokens": 233655097.0, "step": 6124 }, { "epoch": 0.7791629563668745, "ewc_loss": 0.023742971941828728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3742972189211287e-05, "grad_norm": 15.529314994812012, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8517411351203918, "num_tokens": 233687815.0, "step": 6125 }, { "epoch": 0.7792901666454649, "ewc_loss": 0.02375601790845394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3756017981213517e-05, "grad_norm": 15.577447891235352, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8747284412384033, "num_tokens": 233722611.0, "step": 6126 }, { "epoch": 0.7794173769240554, "ewc_loss": 0.023788969963788986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3788970793248154e-05, "grad_norm": 15.51839542388916, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8487297296524048, "num_tokens": 233765345.0, "step": 6127 }, { "epoch": 0.779544587202646, "ewc_loss": 0.023747457191348076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.374745781708043e-05, "grad_norm": 15.549859046936035, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8630602359771729, "num_tokens": 233804358.0, "step": 6128 }, { "epoch": 0.7796717974812365, "ewc_loss": 0.023790936917066574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3790937120793387e-05, "grad_norm": 15.498151779174805, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8509508967399597, "num_tokens": 233845117.0, "step": 6129 }, { "epoch": 0.779799007759827, "ewc_loss": 0.02378414012491703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3784139557392336e-05, "grad_norm": 15.530301094055176, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8543241620063782, "num_tokens": 233884091.0, "step": 6130 }, { "epoch": 0.7799262180384176, "ewc_loss": 0.02380056492984295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3800565031706356e-05, "grad_norm": 15.50896167755127, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8586057424545288, "num_tokens": 233922590.0, "step": 6131 }, { "epoch": 0.780053428317008, "ewc_loss": 0.023774299770593643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3774300643708557e-05, "grad_norm": 15.554727554321289, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8648800849914551, "num_tokens": 233954138.0, "step": 6132 }, { "epoch": 0.7801806385955985, "ewc_loss": 0.023837970569729805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3837970729800873e-05, "grad_norm": 15.505590438842773, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8706873655319214, "num_tokens": 233988035.0, "step": 6133 }, { "epoch": 0.780307848874189, "ewc_loss": 0.02379501238465309, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.379501165705733e-05, "grad_norm": 15.530220985412598, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8519415259361267, "num_tokens": 234023747.0, "step": 6134 }, { "epoch": 0.7804350591527796, "ewc_loss": 0.023827778175473213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3827778932172805e-05, "grad_norm": 15.482664108276367, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.869364857673645, "num_tokens": 234064182.0, "step": 6135 }, { "epoch": 0.7805622694313701, "ewc_loss": 0.02378467284142971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3784672521287575e-05, "grad_norm": 15.47864055633545, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8458285331726074, "num_tokens": 234106533.0, "step": 6136 }, { "epoch": 0.7806894797099606, "ewc_loss": 0.023848751559853554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.384875187999569e-05, "grad_norm": 15.500251770019531, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8747442960739136, "num_tokens": 234145492.0, "step": 6137 }, { "epoch": 0.780816689988551, "ewc_loss": 0.02383301593363285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3833015802665614e-05, "grad_norm": 15.53614616394043, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.864271879196167, "num_tokens": 234181709.0, "step": 6138 }, { "epoch": 0.7809439002671416, "ewc_loss": 0.023825999349355698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3825999960536137e-05, "grad_norm": 15.488544464111328, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8625938892364502, "num_tokens": 234217854.0, "step": 6139 }, { "epoch": 0.7810711105457321, "ewc_loss": 0.023847214877605438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3847214833949693e-05, "grad_norm": 15.5149507522583, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8605848550796509, "num_tokens": 234255769.0, "step": 6140 }, { "epoch": 0.7811983208243226, "ewc_loss": 0.023837579414248466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.383757964707911e-05, "grad_norm": 15.535948753356934, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8484665155410767, "num_tokens": 234290828.0, "step": 6141 }, { "epoch": 0.7813255311029131, "ewc_loss": 0.02386440895497799, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.386440974078141e-05, "grad_norm": 15.516913414001465, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8607462644577026, "num_tokens": 234326336.0, "step": 6142 }, { "epoch": 0.7814527413815037, "ewc_loss": 0.023871131241321564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3871130906627513e-05, "grad_norm": 15.541327476501465, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8560057878494263, "num_tokens": 234366210.0, "step": 6143 }, { "epoch": 0.7815799516600941, "ewc_loss": 0.023867405951023102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.386740561632905e-05, "grad_norm": 15.485292434692383, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8533835411071777, "num_tokens": 234398975.0, "step": 6144 }, { "epoch": 0.7817071619386846, "ewc_loss": 0.02385897748172283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3858978238422424e-05, "grad_norm": 15.636701583862305, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.856128990650177, "num_tokens": 234437746.0, "step": 6145 }, { "epoch": 0.7818343722172751, "ewc_loss": 0.023920588195323944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3920587409520522e-05, "grad_norm": 15.532254219055176, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8659939169883728, "num_tokens": 234478402.0, "step": 6146 }, { "epoch": 0.7819615824958657, "ewc_loss": 0.023815566673874855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3815566237317398e-05, "grad_norm": 15.561850547790527, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8479483127593994, "num_tokens": 234509699.0, "step": 6147 }, { "epoch": 0.7820887927744562, "ewc_loss": 0.023890336975455284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3890337615739554e-05, "grad_norm": 15.536626815795898, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8559187650680542, "num_tokens": 234551515.0, "step": 6148 }, { "epoch": 0.7822160030530467, "ewc_loss": 0.023851662874221802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3851662263041362e-05, "grad_norm": 15.526863098144531, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.867745041847229, "num_tokens": 234582811.0, "step": 6149 }, { "epoch": 0.7823432133316371, "ewc_loss": 0.023908305913209915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.390830559306778e-05, "grad_norm": 15.582070350646973, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8534448742866516, "num_tokens": 234625299.0, "step": 6150 }, { "epoch": 0.7824704236102277, "ewc_loss": 0.023881930857896805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3881930246716365e-05, "grad_norm": 15.499178886413574, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.86790931224823, "num_tokens": 234665641.0, "step": 6151 }, { "epoch": 0.7825976338888182, "ewc_loss": 0.023834848776459694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3834849343984388e-05, "grad_norm": 15.550224304199219, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8607707023620605, "num_tokens": 234704668.0, "step": 6152 }, { "epoch": 0.7827248441674087, "ewc_loss": 0.023907097056508064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3907097784103826e-05, "grad_norm": 15.6006498336792, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8668515086174011, "num_tokens": 234738080.0, "step": 6153 }, { "epoch": 0.7828520544459993, "ewc_loss": 0.023856161162257195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.385616062383633e-05, "grad_norm": 15.546002388000488, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8590348958969116, "num_tokens": 234781077.0, "step": 6154 }, { "epoch": 0.7829792647245898, "ewc_loss": 0.023800980299711227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3800979761290364e-05, "grad_norm": 15.568824768066406, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8455016613006592, "num_tokens": 234809374.0, "step": 6155 }, { "epoch": 0.7831064750031802, "ewc_loss": 0.023855116218328476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3855116523918696e-05, "grad_norm": 15.557873725891113, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8502148985862732, "num_tokens": 234845585.0, "step": 6156 }, { "epoch": 0.7832336852817707, "ewc_loss": 0.02380666323006153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3806664103176445e-05, "grad_norm": 15.51661205291748, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8603343367576599, "num_tokens": 234883891.0, "step": 6157 }, { "epoch": 0.7833608955603613, "ewc_loss": 0.023854248225688934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3854248865973204e-05, "grad_norm": 15.5393648147583, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8604472875595093, "num_tokens": 234919445.0, "step": 6158 }, { "epoch": 0.7834881058389518, "ewc_loss": 0.023852799087762833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.385279913141858e-05, "grad_norm": 15.498518943786621, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8637685775756836, "num_tokens": 234951465.0, "step": 6159 }, { "epoch": 0.7836153161175423, "ewc_loss": 0.023850781843066216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3850781872170046e-05, "grad_norm": 15.513851165771484, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8526409864425659, "num_tokens": 234989970.0, "step": 6160 }, { "epoch": 0.7837425263961328, "ewc_loss": 0.023902760818600655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3902761313365772e-05, "grad_norm": 15.525065422058105, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8622626066207886, "num_tokens": 235029673.0, "step": 6161 }, { "epoch": 0.7838697366747234, "ewc_loss": 0.023872671648859978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3872671590652317e-05, "grad_norm": 15.470219612121582, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8605250120162964, "num_tokens": 235065389.0, "step": 6162 }, { "epoch": 0.7839969469533138, "ewc_loss": 0.02385038137435913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3850381694501266e-05, "grad_norm": 15.546061515808105, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8735429644584656, "num_tokens": 235099795.0, "step": 6163 }, { "epoch": 0.7841241572319043, "ewc_loss": 0.0239259023219347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3925902496557683e-05, "grad_norm": 15.472298622131348, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.862371563911438, "num_tokens": 235137475.0, "step": 6164 }, { "epoch": 0.7842513675104948, "ewc_loss": 0.023863529786467552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3863529349910095e-05, "grad_norm": 15.555147171020508, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8561711311340332, "num_tokens": 235178454.0, "step": 6165 }, { "epoch": 0.7843785777890854, "ewc_loss": 0.023948803544044495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3948803573148325e-05, "grad_norm": 15.515294075012207, "learning_rate": 1e-06, "loss": 0.3731, "mean_token_accuracy": 0.8796650171279907, "num_tokens": 235216139.0, "step": 6166 }, { "epoch": 0.7845057880676759, "ewc_loss": 0.02388617768883705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3886177586973645e-05, "grad_norm": 15.513408660888672, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8764365315437317, "num_tokens": 235253571.0, "step": 6167 }, { "epoch": 0.7846329983462664, "ewc_loss": 0.023869669064879417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3869668439147063e-05, "grad_norm": 15.525066375732422, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8462215662002563, "num_tokens": 235293950.0, "step": 6168 }, { "epoch": 0.7847602086248568, "ewc_loss": 0.023876093327999115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3876093109720387e-05, "grad_norm": 15.468500137329102, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8657260537147522, "num_tokens": 235336419.0, "step": 6169 }, { "epoch": 0.7848874189034474, "ewc_loss": 0.023908428847789764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.390842928434722e-05, "grad_norm": 15.553845405578613, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8665244579315186, "num_tokens": 235376039.0, "step": 6170 }, { "epoch": 0.7850146291820379, "ewc_loss": 0.023906614631414413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3906613932922482e-05, "grad_norm": 15.565237045288086, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8501490950584412, "num_tokens": 235412736.0, "step": 6171 }, { "epoch": 0.7851418394606284, "ewc_loss": 0.023892655968666077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3892656827229075e-05, "grad_norm": 15.56289291381836, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.861815333366394, "num_tokens": 235457318.0, "step": 6172 }, { "epoch": 0.785269049739219, "ewc_loss": 0.02391202002763748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.391201996942982e-05, "grad_norm": 15.591256141662598, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8515313863754272, "num_tokens": 235491471.0, "step": 6173 }, { "epoch": 0.7853962600178095, "ewc_loss": 0.023862477391958237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3862477974034846e-05, "grad_norm": 15.595382690429688, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8717212677001953, "num_tokens": 235531324.0, "step": 6174 }, { "epoch": 0.7855234702963999, "ewc_loss": 0.02385595627129078, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3855956897023134e-05, "grad_norm": 15.521230697631836, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8493749499320984, "num_tokens": 235573233.0, "step": 6175 }, { "epoch": 0.7856506805749904, "ewc_loss": 0.023827461525797844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3827460609027185e-05, "grad_norm": 15.523760795593262, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8627183437347412, "num_tokens": 235614400.0, "step": 6176 }, { "epoch": 0.785777890853581, "ewc_loss": 0.023796677589416504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3796677851350978e-05, "grad_norm": 15.506123542785645, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8678914308547974, "num_tokens": 235652247.0, "step": 6177 }, { "epoch": 0.7859051011321715, "ewc_loss": 0.023875629529356956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3875629267422482e-05, "grad_norm": 15.611340522766113, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8582767248153687, "num_tokens": 235681810.0, "step": 6178 }, { "epoch": 0.786032311410762, "ewc_loss": 0.023816267028450966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3816266548237763e-05, "grad_norm": 15.432377815246582, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8712239861488342, "num_tokens": 235718422.0, "step": 6179 }, { "epoch": 0.7861595216893525, "ewc_loss": 0.0238143689930439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3814369342289865e-05, "grad_norm": 15.656394958496094, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8569144010543823, "num_tokens": 235758886.0, "step": 6180 }, { "epoch": 0.786286731967943, "ewc_loss": 0.023899702355265617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3899701773189008e-05, "grad_norm": 15.508599281311035, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8647243976593018, "num_tokens": 235792270.0, "step": 6181 }, { "epoch": 0.7864139422465335, "ewc_loss": 0.023753967136144638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.375396798015572e-05, "grad_norm": 15.583192825317383, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.852057933807373, "num_tokens": 235826678.0, "step": 6182 }, { "epoch": 0.786541152525124, "ewc_loss": 0.02387996017932892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3879960281192325e-05, "grad_norm": 15.517549514770508, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8727012872695923, "num_tokens": 235860437.0, "step": 6183 }, { "epoch": 0.7866683628037145, "ewc_loss": 0.02385850064456463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.385849984420929e-05, "grad_norm": 15.599923133850098, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8665167093276978, "num_tokens": 235900865.0, "step": 6184 }, { "epoch": 0.7867955730823051, "ewc_loss": 0.023913651704788208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.39136516029248e-05, "grad_norm": 15.547880172729492, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.847314715385437, "num_tokens": 235937419.0, "step": 6185 }, { "epoch": 0.7869227833608956, "ewc_loss": 0.02382386103272438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3823860828997567e-05, "grad_norm": 15.567907333374023, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8527496457099915, "num_tokens": 235976425.0, "step": 6186 }, { "epoch": 0.787049993639486, "ewc_loss": 0.023855814710259438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3855815015849657e-05, "grad_norm": 15.465983390808105, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8548892140388489, "num_tokens": 236012968.0, "step": 6187 }, { "epoch": 0.7871772039180766, "ewc_loss": 0.02386367879807949, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3863678507041186e-05, "grad_norm": 15.485241889953613, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8634492754936218, "num_tokens": 236052462.0, "step": 6188 }, { "epoch": 0.7873044141966671, "ewc_loss": 0.023909123614430428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3909124138299376e-05, "grad_norm": 15.492454528808594, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8774435520172119, "num_tokens": 236087604.0, "step": 6189 }, { "epoch": 0.7874316244752576, "ewc_loss": 0.02388124167919159, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.388124084973242e-05, "grad_norm": 15.549271583557129, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8613611459732056, "num_tokens": 236126344.0, "step": 6190 }, { "epoch": 0.7875588347538481, "ewc_loss": 0.023928161710500717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3928161681396887e-05, "grad_norm": 15.505671501159668, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8697278499603271, "num_tokens": 236161913.0, "step": 6191 }, { "epoch": 0.7876860450324387, "ewc_loss": 0.02388661913573742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3886619601398706e-05, "grad_norm": 15.600526809692383, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8647365570068359, "num_tokens": 236200612.0, "step": 6192 }, { "epoch": 0.7878132553110291, "ewc_loss": 0.023918334394693375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.391833368164953e-05, "grad_norm": 15.47436237335205, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8674639463424683, "num_tokens": 236238136.0, "step": 6193 }, { "epoch": 0.7879404655896196, "ewc_loss": 0.023851200938224792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.385120023973286e-05, "grad_norm": 15.506061553955078, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8582003116607666, "num_tokens": 236281303.0, "step": 6194 }, { "epoch": 0.7880676758682101, "ewc_loss": 0.0239326823502779, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3932681870064698e-05, "grad_norm": 15.5415678024292, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8591398000717163, "num_tokens": 236322355.0, "step": 6195 }, { "epoch": 0.7881948861468007, "ewc_loss": 0.023878909647464752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3878908905317076e-05, "grad_norm": 15.480392456054688, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8605471849441528, "num_tokens": 236354263.0, "step": 6196 }, { "epoch": 0.7883220964253912, "ewc_loss": 0.02391984686255455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.391984708083328e-05, "grad_norm": 15.526883125305176, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.862794041633606, "num_tokens": 236391581.0, "step": 6197 }, { "epoch": 0.7884493067039817, "ewc_loss": 0.023916590958833694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.391659108980093e-05, "grad_norm": 15.532230377197266, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8711164593696594, "num_tokens": 236426003.0, "step": 6198 }, { "epoch": 0.7885765169825721, "ewc_loss": 0.02391650155186653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3916501959320158e-05, "grad_norm": 15.529154777526855, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8711071610450745, "num_tokens": 236464874.0, "step": 6199 }, { "epoch": 0.7887037272611627, "ewc_loss": 0.02394983172416687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.394983130216133e-05, "grad_norm": 15.497529029846191, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8722453713417053, "num_tokens": 236499826.0, "step": 6200 }, { "epoch": 0.7888309375397532, "ewc_loss": 0.023922381922602654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.392238275206182e-05, "grad_norm": 15.568039894104004, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.87386554479599, "num_tokens": 236538615.0, "step": 6201 }, { "epoch": 0.7889581478183437, "ewc_loss": 0.02396460436284542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3964605134096928e-05, "grad_norm": 15.51514720916748, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.859713077545166, "num_tokens": 236577860.0, "step": 6202 }, { "epoch": 0.7890853580969343, "ewc_loss": 0.02392120659351349, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3921205865917727e-05, "grad_norm": 15.53115177154541, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8767385482788086, "num_tokens": 236619041.0, "step": 6203 }, { "epoch": 0.7892125683755248, "ewc_loss": 0.023995304480195045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.399530421826057e-05, "grad_norm": 15.669771194458008, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.865646243095398, "num_tokens": 236655788.0, "step": 6204 }, { "epoch": 0.7893397786541152, "ewc_loss": 0.023914063349366188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3914062694530003e-05, "grad_norm": 15.570457458496094, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8732331991195679, "num_tokens": 236691532.0, "step": 6205 }, { "epoch": 0.7894669889327057, "ewc_loss": 0.02384003810584545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3840038920752704e-05, "grad_norm": 15.515050888061523, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8686168193817139, "num_tokens": 236733792.0, "step": 6206 }, { "epoch": 0.7895941992112963, "ewc_loss": 0.023896310478448868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.38963111769408e-05, "grad_norm": 15.606775283813477, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8738139867782593, "num_tokens": 236775454.0, "step": 6207 }, { "epoch": 0.7897214094898868, "ewc_loss": 0.023866115137934685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3866115952841938e-05, "grad_norm": 15.509355545043945, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8733590841293335, "num_tokens": 236811572.0, "step": 6208 }, { "epoch": 0.7898486197684773, "ewc_loss": 0.023800428956747055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.380042860750109e-05, "grad_norm": 15.56314754486084, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8463352918624878, "num_tokens": 236851018.0, "step": 6209 }, { "epoch": 0.7899758300470678, "ewc_loss": 0.023913783952593803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.391378438915126e-05, "grad_norm": 15.611140251159668, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8662850856781006, "num_tokens": 236891871.0, "step": 6210 }, { "epoch": 0.7901030403256584, "ewc_loss": 0.023823749274015427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.382374987064395e-05, "grad_norm": 15.517210006713867, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8656379580497742, "num_tokens": 236926771.0, "step": 6211 }, { "epoch": 0.7902302506042488, "ewc_loss": 0.02382044680416584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3820446585887112e-05, "grad_norm": 15.592622756958008, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8495779633522034, "num_tokens": 236964768.0, "step": 6212 }, { "epoch": 0.7903574608828393, "ewc_loss": 0.023885613307356834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3885613700258546e-05, "grad_norm": 15.570442199707031, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8508843779563904, "num_tokens": 237005075.0, "step": 6213 }, { "epoch": 0.7904846711614298, "ewc_loss": 0.023829037323594093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.382903767284006e-05, "grad_norm": 15.561077117919922, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8630133867263794, "num_tokens": 237035517.0, "step": 6214 }, { "epoch": 0.7906118814400204, "ewc_loss": 0.023847997188568115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3847996999393217e-05, "grad_norm": 15.474285125732422, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.879780650138855, "num_tokens": 237076319.0, "step": 6215 }, { "epoch": 0.7907390917186109, "ewc_loss": 0.023844175040721893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3844175302656367e-05, "grad_norm": 15.624329566955566, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8530091047286987, "num_tokens": 237117935.0, "step": 6216 }, { "epoch": 0.7908663019972014, "ewc_loss": 0.023885950446128845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.38859502132982e-05, "grad_norm": 15.500459671020508, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8715238571166992, "num_tokens": 237155789.0, "step": 6217 }, { "epoch": 0.7909935122757918, "ewc_loss": 0.02381919138133526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3819191483198665e-05, "grad_norm": 15.595588684082031, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8680851459503174, "num_tokens": 237198870.0, "step": 6218 }, { "epoch": 0.7911207225543824, "ewc_loss": 0.023918194696307182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.391819543845486e-05, "grad_norm": 15.573719024658203, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.867891252040863, "num_tokens": 237243084.0, "step": 6219 }, { "epoch": 0.7912479328329729, "ewc_loss": 0.023854602128267288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3854601749917492e-05, "grad_norm": 15.62153434753418, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8553266525268555, "num_tokens": 237277245.0, "step": 6220 }, { "epoch": 0.7913751431115634, "ewc_loss": 0.023854179307818413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.385417974437587e-05, "grad_norm": 15.515880584716797, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8676537871360779, "num_tokens": 237317841.0, "step": 6221 }, { "epoch": 0.791502353390154, "ewc_loss": 0.02382957935333252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3829579731682315e-05, "grad_norm": 15.616717338562012, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8515546917915344, "num_tokens": 237349327.0, "step": 6222 }, { "epoch": 0.7916295636687445, "ewc_loss": 0.023872600868344307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.387260065006558e-05, "grad_norm": 15.559305191040039, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8486279845237732, "num_tokens": 237396179.0, "step": 6223 }, { "epoch": 0.7917567739473349, "ewc_loss": 0.023810861632227898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.381086233071983e-05, "grad_norm": 15.507240295410156, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8584244251251221, "num_tokens": 237437766.0, "step": 6224 }, { "epoch": 0.7918839842259254, "ewc_loss": 0.023839371278882027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3839371351641603e-05, "grad_norm": 15.573115348815918, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8671413064002991, "num_tokens": 237479239.0, "step": 6225 }, { "epoch": 0.792011194504516, "ewc_loss": 0.023847978562116623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3847978809499182e-05, "grad_norm": 15.573259353637695, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8763667941093445, "num_tokens": 237514641.0, "step": 6226 }, { "epoch": 0.7921384047831065, "ewc_loss": 0.023880036547780037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3880036678747274e-05, "grad_norm": 15.566214561462402, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8730710744857788, "num_tokens": 237555319.0, "step": 6227 }, { "epoch": 0.792265615061697, "ewc_loss": 0.023838991299271584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3838991182856262e-05, "grad_norm": 15.5166015625, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8648747801780701, "num_tokens": 237597786.0, "step": 6228 }, { "epoch": 0.7923928253402875, "ewc_loss": 0.023860136047005653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3860136934672482e-05, "grad_norm": 15.586204528808594, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8738516569137573, "num_tokens": 237636584.0, "step": 6229 }, { "epoch": 0.792520035618878, "ewc_loss": 0.02385544218122959, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.385544212302193e-05, "grad_norm": 15.558480262756348, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8544019460678101, "num_tokens": 237676000.0, "step": 6230 }, { "epoch": 0.7926472458974685, "ewc_loss": 0.023810965940356255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.381096601311583e-05, "grad_norm": 15.582544326782227, "learning_rate": 1e-06, "loss": 0.3502, "mean_token_accuracy": 0.8886390924453735, "num_tokens": 237713003.0, "step": 6231 }, { "epoch": 0.792774456176059, "ewc_loss": 0.02383560501039028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.383560422458686e-05, "grad_norm": 15.569190979003906, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8648464679718018, "num_tokens": 237742054.0, "step": 6232 }, { "epoch": 0.7929016664546495, "ewc_loss": 0.023858869448304176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.385886909905821e-05, "grad_norm": 15.569868087768555, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.855484127998352, "num_tokens": 237783147.0, "step": 6233 }, { "epoch": 0.7930288767332401, "ewc_loss": 0.02384704351425171, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.384704384894576e-05, "grad_norm": 15.601755142211914, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8567196726799011, "num_tokens": 237814322.0, "step": 6234 }, { "epoch": 0.7931560870118306, "ewc_loss": 0.023869860917329788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.386986125202384e-05, "grad_norm": 15.528253555297852, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8672864437103271, "num_tokens": 237849278.0, "step": 6235 }, { "epoch": 0.793283297290421, "ewc_loss": 0.02383638359606266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3836382752051577e-05, "grad_norm": 15.569012641906738, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8668277859687805, "num_tokens": 237893434.0, "step": 6236 }, { "epoch": 0.7934105075690115, "ewc_loss": 0.02388336881995201, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.388336906733457e-05, "grad_norm": 15.537337303161621, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8702359795570374, "num_tokens": 237932135.0, "step": 6237 }, { "epoch": 0.7935377178476021, "ewc_loss": 0.02390892244875431, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3908922230475582e-05, "grad_norm": 15.617471694946289, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8576706647872925, "num_tokens": 237966320.0, "step": 6238 }, { "epoch": 0.7936649281261926, "ewc_loss": 0.02389875054359436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3898750441730954e-05, "grad_norm": 15.458712577819824, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8670330047607422, "num_tokens": 238006365.0, "step": 6239 }, { "epoch": 0.7937921384047831, "ewc_loss": 0.023894181475043297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3894181140349247e-05, "grad_norm": 15.669175148010254, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8612561225891113, "num_tokens": 238039307.0, "step": 6240 }, { "epoch": 0.7939193486833737, "ewc_loss": 0.02397789992392063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3977900127647445e-05, "grad_norm": 15.508758544921875, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8490077257156372, "num_tokens": 238078015.0, "step": 6241 }, { "epoch": 0.7940465589619641, "ewc_loss": 0.023831753060221672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3831753424019553e-05, "grad_norm": 15.554373741149902, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8599517345428467, "num_tokens": 238116790.0, "step": 6242 }, { "epoch": 0.7941737692405546, "ewc_loss": 0.023968908935785294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3968908863025717e-05, "grad_norm": 15.507098197937012, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8688445687294006, "num_tokens": 238155330.0, "step": 6243 }, { "epoch": 0.7943009795191451, "ewc_loss": 0.023906584829092026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3906584829092026e-05, "grad_norm": 15.622709274291992, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8472900390625, "num_tokens": 238194492.0, "step": 6244 }, { "epoch": 0.7944281897977357, "ewc_loss": 0.023990388959646225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.399038930889219e-05, "grad_norm": 15.553287506103516, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8626198768615723, "num_tokens": 238230451.0, "step": 6245 }, { "epoch": 0.7945554000763262, "ewc_loss": 0.023886462673544884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.388646316831e-05, "grad_norm": 15.55904769897461, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8706672191619873, "num_tokens": 238266331.0, "step": 6246 }, { "epoch": 0.7946826103549167, "ewc_loss": 0.023918943479657173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3918943043099716e-05, "grad_norm": 15.503984451293945, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8514425754547119, "num_tokens": 238307753.0, "step": 6247 }, { "epoch": 0.7948098206335071, "ewc_loss": 0.02392689511179924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.392689566477202e-05, "grad_norm": 15.570782661437988, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8636100888252258, "num_tokens": 238349644.0, "step": 6248 }, { "epoch": 0.7949370309120977, "ewc_loss": 0.02393300272524357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3933002012199722e-05, "grad_norm": 15.642779350280762, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8479536771774292, "num_tokens": 238386308.0, "step": 6249 }, { "epoch": 0.7950642411906882, "ewc_loss": 0.023948270827531815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3948270609253086e-05, "grad_norm": 15.608915328979492, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8471229672431946, "num_tokens": 238427935.0, "step": 6250 }, { "epoch": 0.7951914514692787, "ewc_loss": 0.0238939356058836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3893935576779768e-05, "grad_norm": 15.602100372314453, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8673499226570129, "num_tokens": 238463639.0, "step": 6251 }, { "epoch": 0.7953186617478692, "ewc_loss": 0.02390323393046856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3903234250610694e-05, "grad_norm": 15.648773193359375, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8750999569892883, "num_tokens": 238498295.0, "step": 6252 }, { "epoch": 0.7954458720264598, "ewc_loss": 0.02394791506230831, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3947915906319395e-05, "grad_norm": 15.624030113220215, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8622768521308899, "num_tokens": 238538109.0, "step": 6253 }, { "epoch": 0.7955730823050502, "ewc_loss": 0.02388904057443142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3889040676294826e-05, "grad_norm": 15.621448516845703, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8492392301559448, "num_tokens": 238574415.0, "step": 6254 }, { "epoch": 0.7957002925836407, "ewc_loss": 0.023866120725870132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3866121409810148e-05, "grad_norm": 15.5902681350708, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8641502261161804, "num_tokens": 238616368.0, "step": 6255 }, { "epoch": 0.7958275028622313, "ewc_loss": 0.023887477815151215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.388747816439718e-05, "grad_norm": 15.588403701782227, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8665099143981934, "num_tokens": 238650634.0, "step": 6256 }, { "epoch": 0.7959547131408218, "ewc_loss": 0.023884953930974007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.388495340710506e-05, "grad_norm": 15.652424812316895, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8572215437889099, "num_tokens": 238684383.0, "step": 6257 }, { "epoch": 0.7960819234194123, "ewc_loss": 0.02392001636326313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.392001624684781e-05, "grad_norm": 15.559678077697754, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8594918847084045, "num_tokens": 238722863.0, "step": 6258 }, { "epoch": 0.7962091336980028, "ewc_loss": 0.023834267631173134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3834267267375253e-05, "grad_norm": 15.539933204650879, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8635823726654053, "num_tokens": 238765027.0, "step": 6259 }, { "epoch": 0.7963363439765934, "ewc_loss": 0.023928241804242134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3928241716930643e-05, "grad_norm": 15.597319602966309, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8669887185096741, "num_tokens": 238801950.0, "step": 6260 }, { "epoch": 0.7964635542551838, "ewc_loss": 0.023898759856820107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.389875953667797e-05, "grad_norm": 15.577086448669434, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.854038655757904, "num_tokens": 238840963.0, "step": 6261 }, { "epoch": 0.7965907645337743, "ewc_loss": 0.023933090269565582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3933091142680496e-05, "grad_norm": 15.605910301208496, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8487591743469238, "num_tokens": 238876167.0, "step": 6262 }, { "epoch": 0.7967179748123648, "ewc_loss": 0.02391623891890049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3916238205856644e-05, "grad_norm": 15.626421928405762, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8692355155944824, "num_tokens": 238910293.0, "step": 6263 }, { "epoch": 0.7968451850909554, "ewc_loss": 0.023972420021891594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.397241951257456e-05, "grad_norm": 15.518543243408203, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8592371940612793, "num_tokens": 238947279.0, "step": 6264 }, { "epoch": 0.7969723953695459, "ewc_loss": 0.023901715874671936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3901715394458733e-05, "grad_norm": 15.58191204071045, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8515861630439758, "num_tokens": 238984148.0, "step": 6265 }, { "epoch": 0.7970996056481364, "ewc_loss": 0.02399596944451332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.399596996838227e-05, "grad_norm": 15.532352447509766, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8492010831832886, "num_tokens": 239022370.0, "step": 6266 }, { "epoch": 0.7972268159267268, "ewc_loss": 0.023935262113809586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.393526301602833e-05, "grad_norm": 15.562346458435059, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8590131402015686, "num_tokens": 239064542.0, "step": 6267 }, { "epoch": 0.7973540262053174, "ewc_loss": 0.02401227317750454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.401227357040625e-05, "grad_norm": 15.56584358215332, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8610657453536987, "num_tokens": 239103193.0, "step": 6268 }, { "epoch": 0.7974812364839079, "ewc_loss": 0.023957250639796257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3957250959938392e-05, "grad_norm": 15.629870414733887, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8699649572372437, "num_tokens": 239137952.0, "step": 6269 }, { "epoch": 0.7976084467624984, "ewc_loss": 0.023986104875802994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.398610558884684e-05, "grad_norm": 15.477984428405762, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8521686792373657, "num_tokens": 239174600.0, "step": 6270 }, { "epoch": 0.797735657041089, "ewc_loss": 0.023938573896884918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3938573576742783e-05, "grad_norm": 15.591623306274414, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8673077821731567, "num_tokens": 239206157.0, "step": 6271 }, { "epoch": 0.7978628673196795, "ewc_loss": 0.02408251352608204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4082513846224174e-05, "grad_norm": 15.526169776916504, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.866791844367981, "num_tokens": 239243094.0, "step": 6272 }, { "epoch": 0.7979900775982699, "ewc_loss": 0.023937204852700233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3937205696711317e-05, "grad_norm": 15.56081485748291, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8642915487289429, "num_tokens": 239281208.0, "step": 6273 }, { "epoch": 0.7981172878768604, "ewc_loss": 0.024053065106272697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4053064407780766e-05, "grad_norm": 15.60334300994873, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8581504821777344, "num_tokens": 239320159.0, "step": 6274 }, { "epoch": 0.798244498155451, "ewc_loss": 0.02401483617722988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4014836526475847e-05, "grad_norm": 15.605058670043945, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8510353565216064, "num_tokens": 239360160.0, "step": 6275 }, { "epoch": 0.7983717084340415, "ewc_loss": 0.02402198128402233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4021981516852975e-05, "grad_norm": 15.620189666748047, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8645787239074707, "num_tokens": 239398101.0, "step": 6276 }, { "epoch": 0.798498918712632, "ewc_loss": 0.024018995463848114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4018994736252353e-05, "grad_norm": 15.619099617004395, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8776718378067017, "num_tokens": 239438560.0, "step": 6277 }, { "epoch": 0.7986261289912225, "ewc_loss": 0.02396468259394169, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.396468335064128e-05, "grad_norm": 15.59665584564209, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8569995164871216, "num_tokens": 239475851.0, "step": 6278 }, { "epoch": 0.798753339269813, "ewc_loss": 0.024007583037018776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4007582396734506e-05, "grad_norm": 15.606915473937988, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8668840527534485, "num_tokens": 239519165.0, "step": 6279 }, { "epoch": 0.7988805495484035, "ewc_loss": 0.023964248597621918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3964248612173833e-05, "grad_norm": 15.578332901000977, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8498210906982422, "num_tokens": 239560602.0, "step": 6280 }, { "epoch": 0.799007759826994, "ewc_loss": 0.02392434887588024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3924349079607055e-05, "grad_norm": 15.555668830871582, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8636128902435303, "num_tokens": 239591933.0, "step": 6281 }, { "epoch": 0.7991349701055845, "ewc_loss": 0.0239745881408453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3974587747943588e-05, "grad_norm": 15.63204288482666, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8558608293533325, "num_tokens": 239625495.0, "step": 6282 }, { "epoch": 0.7992621803841751, "ewc_loss": 0.023944398388266563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3944397980812937e-05, "grad_norm": 15.515222549438477, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8774459362030029, "num_tokens": 239658249.0, "step": 6283 }, { "epoch": 0.7993893906627656, "ewc_loss": 0.02397608384490013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3976084776222706e-05, "grad_norm": 15.621635437011719, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8536238670349121, "num_tokens": 239690194.0, "step": 6284 }, { "epoch": 0.799516600941356, "ewc_loss": 0.02398364432156086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3983644496183842e-05, "grad_norm": 15.586709976196289, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8600075840950012, "num_tokens": 239728315.0, "step": 6285 }, { "epoch": 0.7996438112199465, "ewc_loss": 0.024009259417653084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4009259504964575e-05, "grad_norm": 15.559974670410156, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8676475286483765, "num_tokens": 239764505.0, "step": 6286 }, { "epoch": 0.7997710214985371, "ewc_loss": 0.024003969505429268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4003969883779064e-05, "grad_norm": 15.618307113647461, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8470024466514587, "num_tokens": 239809461.0, "step": 6287 }, { "epoch": 0.7998982317771276, "ewc_loss": 0.02399522066116333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3995220544748008e-05, "grad_norm": 15.558424949645996, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8568106889724731, "num_tokens": 239845232.0, "step": 6288 }, { "epoch": 0.8000254420557181, "ewc_loss": 0.0239401962608099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3940196115290746e-05, "grad_norm": 15.562948226928711, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.868083119392395, "num_tokens": 239885489.0, "step": 6289 }, { "epoch": 0.8001526523343087, "ewc_loss": 0.023994335904717445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3994336515897885e-05, "grad_norm": 15.548223495483398, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8491564393043518, "num_tokens": 239921159.0, "step": 6290 }, { "epoch": 0.8002798626128991, "ewc_loss": 0.02403656765818596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.403656799288001e-05, "grad_norm": 15.583925247192383, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8664339184761047, "num_tokens": 239960879.0, "step": 6291 }, { "epoch": 0.8004070728914896, "ewc_loss": 0.02400648035109043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4006480089155957e-05, "grad_norm": 15.630985260009766, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8518136739730835, "num_tokens": 239994302.0, "step": 6292 }, { "epoch": 0.8005342831700801, "ewc_loss": 0.024023309350013733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4023309379117563e-05, "grad_norm": 15.545331001281738, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8680663108825684, "num_tokens": 240027759.0, "step": 6293 }, { "epoch": 0.8006614934486707, "ewc_loss": 0.024010270833969116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4010270863072947e-05, "grad_norm": 15.560172080993652, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8570363521575928, "num_tokens": 240065049.0, "step": 6294 }, { "epoch": 0.8007887037272612, "ewc_loss": 0.02405601739883423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.405601662758272e-05, "grad_norm": 15.563515663146973, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8486323356628418, "num_tokens": 240101453.0, "step": 6295 }, { "epoch": 0.8009159140058517, "ewc_loss": 0.024016106501221657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4016106181079522e-05, "grad_norm": 15.590153694152832, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8413442969322205, "num_tokens": 240136831.0, "step": 6296 }, { "epoch": 0.8010431242844421, "ewc_loss": 0.02409525029361248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4095250410027802e-05, "grad_norm": 15.59134292602539, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8602312803268433, "num_tokens": 240173364.0, "step": 6297 }, { "epoch": 0.8011703345630327, "ewc_loss": 0.024026798084378242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4026798200793564e-05, "grad_norm": 15.61837100982666, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8401105403900146, "num_tokens": 240206718.0, "step": 6298 }, { "epoch": 0.8012975448416232, "ewc_loss": 0.02405240572988987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4052405933616683e-05, "grad_norm": 15.54710865020752, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8489967584609985, "num_tokens": 240245633.0, "step": 6299 }, { "epoch": 0.8014247551202137, "ewc_loss": 0.024020565673708916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4020566343097016e-05, "grad_norm": 15.572626113891602, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.875795304775238, "num_tokens": 240283887.0, "step": 6300 }, { "epoch": 0.8015519653988042, "ewc_loss": 0.024092789739370346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4092789317364804e-05, "grad_norm": 15.600631713867188, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8532657623291016, "num_tokens": 240322208.0, "step": 6301 }, { "epoch": 0.8016791756773948, "ewc_loss": 0.024059245362877846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4059245333774015e-05, "grad_norm": 15.540072441101074, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8653643727302551, "num_tokens": 240356154.0, "step": 6302 }, { "epoch": 0.8018063859559852, "ewc_loss": 0.024083225056529045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.408322507108096e-05, "grad_norm": 15.647928237915039, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.860209584236145, "num_tokens": 240389103.0, "step": 6303 }, { "epoch": 0.8019335962345757, "ewc_loss": 0.024112511426210403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.411251080047805e-05, "grad_norm": 15.550439834594727, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8784846067428589, "num_tokens": 240429545.0, "step": 6304 }, { "epoch": 0.8020608065131662, "ewc_loss": 0.02403528243303299, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4035281967371702e-05, "grad_norm": 15.626521110534668, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8791822195053101, "num_tokens": 240468556.0, "step": 6305 }, { "epoch": 0.8021880167917568, "ewc_loss": 0.024132104590535164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4132104954333045e-05, "grad_norm": 15.570099830627441, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8620579838752747, "num_tokens": 240508534.0, "step": 6306 }, { "epoch": 0.8023152270703473, "ewc_loss": 0.024025434628129005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4025433958740905e-05, "grad_norm": 15.640652656555176, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.861124575138092, "num_tokens": 240540833.0, "step": 6307 }, { "epoch": 0.8024424373489378, "ewc_loss": 0.02410176396369934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.41017642110819e-05, "grad_norm": 15.545323371887207, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8539646863937378, "num_tokens": 240577768.0, "step": 6308 }, { "epoch": 0.8025696476275284, "ewc_loss": 0.024046016857028008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.404601764283143e-05, "grad_norm": 15.605826377868652, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8567304611206055, "num_tokens": 240610607.0, "step": 6309 }, { "epoch": 0.8026968579061188, "ewc_loss": 0.024113263934850693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4113263862091117e-05, "grad_norm": 15.552505493164062, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8582906723022461, "num_tokens": 240646056.0, "step": 6310 }, { "epoch": 0.8028240681847093, "ewc_loss": 0.024063941091299057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.406394196441397e-05, "grad_norm": 15.575297355651855, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8685103058815002, "num_tokens": 240682513.0, "step": 6311 }, { "epoch": 0.8029512784632998, "ewc_loss": 0.024079324677586555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4079325157799758e-05, "grad_norm": 15.497142791748047, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8628274202346802, "num_tokens": 240716311.0, "step": 6312 }, { "epoch": 0.8030784887418904, "ewc_loss": 0.024134859442710876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4134858904290013e-05, "grad_norm": 15.5617094039917, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8534702062606812, "num_tokens": 240758873.0, "step": 6313 }, { "epoch": 0.8032056990204809, "ewc_loss": 0.02416612207889557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4166121875168756e-05, "grad_norm": 15.582950592041016, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8664423227310181, "num_tokens": 240797163.0, "step": 6314 }, { "epoch": 0.8033329092990714, "ewc_loss": 0.024110490456223488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.411048990325071e-05, "grad_norm": 15.631340980529785, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.872859001159668, "num_tokens": 240835719.0, "step": 6315 }, { "epoch": 0.8034601195776618, "ewc_loss": 0.024146828800439835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4146829673554748e-05, "grad_norm": 15.607131004333496, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8557743430137634, "num_tokens": 240874359.0, "step": 6316 }, { "epoch": 0.8035873298562524, "ewc_loss": 0.024098768830299377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.409876833553426e-05, "grad_norm": 15.618946075439453, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.875447154045105, "num_tokens": 240909125.0, "step": 6317 }, { "epoch": 0.8037145401348429, "ewc_loss": 0.024074506014585495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4074506654869765e-05, "grad_norm": 15.561439514160156, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.867030143737793, "num_tokens": 240949391.0, "step": 6318 }, { "epoch": 0.8038417504134334, "ewc_loss": 0.024091724306344986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.409172520856373e-05, "grad_norm": 15.68294620513916, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.851723313331604, "num_tokens": 240987390.0, "step": 6319 }, { "epoch": 0.803968960692024, "ewc_loss": 0.024085136130452156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4085136828944087e-05, "grad_norm": 15.571807861328125, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8547507524490356, "num_tokens": 241025359.0, "step": 6320 }, { "epoch": 0.8040961709706145, "ewc_loss": 0.024031173437833786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4031172870309092e-05, "grad_norm": 15.6244478225708, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.856518030166626, "num_tokens": 241060532.0, "step": 6321 }, { "epoch": 0.8042233812492049, "ewc_loss": 0.024064157158136368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.406415660516359e-05, "grad_norm": 15.623512268066406, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8687806725502014, "num_tokens": 241098547.0, "step": 6322 }, { "epoch": 0.8043505915277954, "ewc_loss": 0.024012306705117226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4012306312215514e-05, "grad_norm": 15.591426849365234, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8609439730644226, "num_tokens": 241131091.0, "step": 6323 }, { "epoch": 0.804477801806386, "ewc_loss": 0.02404646947979927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.404646875220351e-05, "grad_norm": 15.61755084991455, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8511642217636108, "num_tokens": 241164495.0, "step": 6324 }, { "epoch": 0.8046050120849765, "ewc_loss": 0.024077992886304855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4077993657556362e-05, "grad_norm": 15.557172775268555, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8556511998176575, "num_tokens": 241203777.0, "step": 6325 }, { "epoch": 0.804732222363567, "ewc_loss": 0.024042442440986633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4042443328653462e-05, "grad_norm": 15.544930458068848, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8558820486068726, "num_tokens": 241243219.0, "step": 6326 }, { "epoch": 0.8048594326421575, "ewc_loss": 0.024131540209054947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4131541067617945e-05, "grad_norm": 15.620353698730469, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8458422422409058, "num_tokens": 241284547.0, "step": 6327 }, { "epoch": 0.804986642920748, "ewc_loss": 0.0240949559956789, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4094955733744428e-05, "grad_norm": 15.558145523071289, "learning_rate": 1e-06, "loss": 0.5008, "mean_token_accuracy": 0.844027042388916, "num_tokens": 241331110.0, "step": 6328 }, { "epoch": 0.8051138531993385, "ewc_loss": 0.024102596566081047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4102597308228724e-05, "grad_norm": 15.589190483093262, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8611056804656982, "num_tokens": 241372720.0, "step": 6329 }, { "epoch": 0.805241063477929, "ewc_loss": 0.024123577401041985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.412357753200922e-05, "grad_norm": 15.636009216308594, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8630385398864746, "num_tokens": 241408173.0, "step": 6330 }, { "epoch": 0.8053682737565195, "ewc_loss": 0.02407030574977398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4070306608336978e-05, "grad_norm": 15.566888809204102, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8675792217254639, "num_tokens": 241445021.0, "step": 6331 }, { "epoch": 0.8054954840351101, "ewc_loss": 0.0240846648812294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.408466571068857e-05, "grad_norm": 15.649051666259766, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8661344051361084, "num_tokens": 241481874.0, "step": 6332 }, { "epoch": 0.8056226943137006, "ewc_loss": 0.024075888097286224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4075887267827056e-05, "grad_norm": 15.533211708068848, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8465474843978882, "num_tokens": 241523140.0, "step": 6333 }, { "epoch": 0.805749904592291, "ewc_loss": 0.02404054068028927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4040540665737353e-05, "grad_norm": 15.614805221557617, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8442549705505371, "num_tokens": 241563424.0, "step": 6334 }, { "epoch": 0.8058771148708815, "ewc_loss": 0.024128777906298637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.412877802271396e-05, "grad_norm": 15.64177131652832, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8745066523551941, "num_tokens": 241600004.0, "step": 6335 }, { "epoch": 0.8060043251494721, "ewc_loss": 0.02400997094810009, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.400997072982136e-05, "grad_norm": 15.535518646240234, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8602631688117981, "num_tokens": 241642704.0, "step": 6336 }, { "epoch": 0.8061315354280626, "ewc_loss": 0.02407037653028965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4070375729934312e-05, "grad_norm": 15.689095497131348, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8660717010498047, "num_tokens": 241679176.0, "step": 6337 }, { "epoch": 0.8062587457066531, "ewc_loss": 0.02405773103237152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.405773011560086e-05, "grad_norm": 15.504379272460938, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8662832975387573, "num_tokens": 241721168.0, "step": 6338 }, { "epoch": 0.8063859559852437, "ewc_loss": 0.023967517539858818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3967517336132005e-05, "grad_norm": 15.610383987426758, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8749195337295532, "num_tokens": 241759735.0, "step": 6339 }, { "epoch": 0.8065131662638341, "ewc_loss": 0.024077553302049637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4077553462120704e-05, "grad_norm": 15.559345245361328, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.860931396484375, "num_tokens": 241799445.0, "step": 6340 }, { "epoch": 0.8066403765424246, "ewc_loss": 0.023981409147381783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3981408958206885e-05, "grad_norm": 15.550567626953125, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8656589984893799, "num_tokens": 241839333.0, "step": 6341 }, { "epoch": 0.8067675868210151, "ewc_loss": 0.02403893508017063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4038934498094022e-05, "grad_norm": 15.611566543579102, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8498525619506836, "num_tokens": 241880248.0, "step": 6342 }, { "epoch": 0.8068947970996057, "ewc_loss": 0.02400875836610794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4008757463889197e-05, "grad_norm": 15.559246063232422, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8755167722702026, "num_tokens": 241926717.0, "step": 6343 }, { "epoch": 0.8070220073781962, "ewc_loss": 0.02399396523833275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.399396544205956e-05, "grad_norm": 15.578685760498047, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8519805669784546, "num_tokens": 241966080.0, "step": 6344 }, { "epoch": 0.8071492176567867, "ewc_loss": 0.024030402302742004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.403040161880199e-05, "grad_norm": 15.588096618652344, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.858910322189331, "num_tokens": 242011975.0, "step": 6345 }, { "epoch": 0.8072764279353771, "ewc_loss": 0.023967690765857697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3967690140125342e-05, "grad_norm": 15.600041389465332, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8746398687362671, "num_tokens": 242046320.0, "step": 6346 }, { "epoch": 0.8074036382139677, "ewc_loss": 0.024041036143898964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4041035430855118e-05, "grad_norm": 15.577731132507324, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8635298013687134, "num_tokens": 242085157.0, "step": 6347 }, { "epoch": 0.8075308484925582, "ewc_loss": 0.02400565706193447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4005657905945554e-05, "grad_norm": 15.66342830657959, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8578912615776062, "num_tokens": 242116548.0, "step": 6348 }, { "epoch": 0.8076580587711487, "ewc_loss": 0.02401282824575901, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4012828362174332e-05, "grad_norm": 15.573138236999512, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8679636716842651, "num_tokens": 242151883.0, "step": 6349 }, { "epoch": 0.8077852690497392, "ewc_loss": 0.024019209668040276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.401920937700197e-05, "grad_norm": 15.603185653686523, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8586511611938477, "num_tokens": 242190486.0, "step": 6350 }, { "epoch": 0.8079124793283298, "ewc_loss": 0.02399982511997223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3999824406928383e-05, "grad_norm": 15.56535816192627, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8651196360588074, "num_tokens": 242232720.0, "step": 6351 }, { "epoch": 0.8080396896069202, "ewc_loss": 0.024012425914406776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.401242636551615e-05, "grad_norm": 15.637382507324219, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8631837368011475, "num_tokens": 242273926.0, "step": 6352 }, { "epoch": 0.8081668998855107, "ewc_loss": 0.024028491228818893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4028491679928266e-05, "grad_norm": 15.620582580566406, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8509656190872192, "num_tokens": 242313484.0, "step": 6353 }, { "epoch": 0.8082941101641012, "ewc_loss": 0.02399703674018383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3997035896172747e-05, "grad_norm": 15.660155296325684, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.85561203956604, "num_tokens": 242352101.0, "step": 6354 }, { "epoch": 0.8084213204426918, "ewc_loss": 0.024054478853940964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4054479581536725e-05, "grad_norm": 15.605537414550781, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8768354654312134, "num_tokens": 242392958.0, "step": 6355 }, { "epoch": 0.8085485307212823, "ewc_loss": 0.023987410590052605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3987409804249182e-05, "grad_norm": 15.646102905273438, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8622891902923584, "num_tokens": 242432327.0, "step": 6356 }, { "epoch": 0.8086757409998728, "ewc_loss": 0.024002185091376305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4002185455174185e-05, "grad_norm": 15.54433822631836, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8403629064559937, "num_tokens": 242473714.0, "step": 6357 }, { "epoch": 0.8088029512784632, "ewc_loss": 0.02400193363428116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4001934434636496e-05, "grad_norm": 15.632627487182617, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8691067099571228, "num_tokens": 242513179.0, "step": 6358 }, { "epoch": 0.8089301615570538, "ewc_loss": 0.023996392264962196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3996391973923892e-05, "grad_norm": 15.56091594696045, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8621189594268799, "num_tokens": 242549398.0, "step": 6359 }, { "epoch": 0.8090573718356443, "ewc_loss": 0.02399361878633499, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3993618015083484e-05, "grad_norm": 15.685434341430664, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.856492280960083, "num_tokens": 242583705.0, "step": 6360 }, { "epoch": 0.8091845821142348, "ewc_loss": 0.024045927450060844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4045926693361253e-05, "grad_norm": 15.56963062286377, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8658655881881714, "num_tokens": 242625900.0, "step": 6361 }, { "epoch": 0.8093117923928254, "ewc_loss": 0.024026306346058846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4026307073654607e-05, "grad_norm": 15.639403343200684, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8498320579528809, "num_tokens": 242665766.0, "step": 6362 }, { "epoch": 0.8094390026714159, "ewc_loss": 0.024081045761704445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4081045921775512e-05, "grad_norm": 15.708821296691895, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8641834259033203, "num_tokens": 242707634.0, "step": 6363 }, { "epoch": 0.8095662129500064, "ewc_loss": 0.023987412452697754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3987411623238586e-05, "grad_norm": 15.649084091186523, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8569918274879456, "num_tokens": 242746791.0, "step": 6364 }, { "epoch": 0.8096934232285968, "ewc_loss": 0.024008767679333687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4008768377825618e-05, "grad_norm": 15.544092178344727, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8742243647575378, "num_tokens": 242784861.0, "step": 6365 }, { "epoch": 0.8098206335071874, "ewc_loss": 0.023936444893479347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3936445359140635e-05, "grad_norm": 15.625227928161621, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8761535882949829, "num_tokens": 242817612.0, "step": 6366 }, { "epoch": 0.8099478437857779, "ewc_loss": 0.02404407411813736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4044074962148443e-05, "grad_norm": 15.631889343261719, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.869249165058136, "num_tokens": 242854951.0, "step": 6367 }, { "epoch": 0.8100750540643684, "ewc_loss": 0.02394683286547661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3946833607624285e-05, "grad_norm": 15.602838516235352, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.876925528049469, "num_tokens": 242894166.0, "step": 6368 }, { "epoch": 0.8102022643429589, "ewc_loss": 0.02397439442574978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3974394935066812e-05, "grad_norm": 15.571621894836426, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8670105338096619, "num_tokens": 242936230.0, "step": 6369 }, { "epoch": 0.8103294746215495, "ewc_loss": 0.023998429998755455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3998429242055863e-05, "grad_norm": 15.630705833435059, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8730624914169312, "num_tokens": 242973148.0, "step": 6370 }, { "epoch": 0.8104566849001399, "ewc_loss": 0.024008093401789665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4008093532756902e-05, "grad_norm": 15.629379272460938, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8629224300384521, "num_tokens": 243013029.0, "step": 6371 }, { "epoch": 0.8105838951787304, "ewc_loss": 0.023954695090651512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.395469527982641e-05, "grad_norm": 15.631308555603027, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8612928986549377, "num_tokens": 243054847.0, "step": 6372 }, { "epoch": 0.810711105457321, "ewc_loss": 0.023971399292349815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3971399059519172e-05, "grad_norm": 15.602288246154785, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8551762700080872, "num_tokens": 243096524.0, "step": 6373 }, { "epoch": 0.8108383157359115, "ewc_loss": 0.023939579725265503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3939579477882944e-05, "grad_norm": 15.582539558410645, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8545911312103271, "num_tokens": 243132397.0, "step": 6374 }, { "epoch": 0.810965526014502, "ewc_loss": 0.023969391360878944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3969390895217657e-05, "grad_norm": 15.585736274719238, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.848567008972168, "num_tokens": 243171790.0, "step": 6375 }, { "epoch": 0.8110927362930925, "ewc_loss": 0.023978954181075096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3978953322512098e-05, "grad_norm": 15.66219711303711, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.860893964767456, "num_tokens": 243208217.0, "step": 6376 }, { "epoch": 0.811219946571683, "ewc_loss": 0.02402174100279808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4021741410251707e-05, "grad_norm": 15.61219596862793, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8670466542243958, "num_tokens": 243245260.0, "step": 6377 }, { "epoch": 0.8113471568502735, "ewc_loss": 0.023973768576979637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3973769202711992e-05, "grad_norm": 15.750327110290527, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.86977618932724, "num_tokens": 243283924.0, "step": 6378 }, { "epoch": 0.811474367128864, "ewc_loss": 0.024057041853666306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.405704253760632e-05, "grad_norm": 15.671555519104004, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8701595067977905, "num_tokens": 243321977.0, "step": 6379 }, { "epoch": 0.8116015774074545, "ewc_loss": 0.023991812020540237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3991811758605763e-05, "grad_norm": 15.665048599243164, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8591330051422119, "num_tokens": 243359349.0, "step": 6380 }, { "epoch": 0.8117287876860451, "ewc_loss": 0.02405374124646187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4053741071838886e-05, "grad_norm": 15.682585716247559, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8649899959564209, "num_tokens": 243395722.0, "step": 6381 }, { "epoch": 0.8118559979646356, "ewc_loss": 0.024003436788916588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4003436919883825e-05, "grad_norm": 15.642745971679688, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8515620827674866, "num_tokens": 243429022.0, "step": 6382 }, { "epoch": 0.811983208243226, "ewc_loss": 0.023972127586603165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3972128474269994e-05, "grad_norm": 15.664770126342773, "learning_rate": 1e-06, "loss": 0.497, "mean_token_accuracy": 0.8408803939819336, "num_tokens": 243468474.0, "step": 6383 }, { "epoch": 0.8121104185218165, "ewc_loss": 0.024072010070085526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4072009182418697e-05, "grad_norm": 15.610848426818848, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8775602579116821, "num_tokens": 243503045.0, "step": 6384 }, { "epoch": 0.8122376288004071, "ewc_loss": 0.023994529619812965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.399452932877466e-05, "grad_norm": 15.620203971862793, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8509312868118286, "num_tokens": 243543865.0, "step": 6385 }, { "epoch": 0.8123648390789976, "ewc_loss": 0.024071449413895607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4071448933682404e-05, "grad_norm": 15.605388641357422, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8549160957336426, "num_tokens": 243581258.0, "step": 6386 }, { "epoch": 0.8124920493575881, "ewc_loss": 0.02408364787697792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4083647076622583e-05, "grad_norm": 15.63606071472168, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8554903268814087, "num_tokens": 243622777.0, "step": 6387 }, { "epoch": 0.8126192596361786, "ewc_loss": 0.02407384291291237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.407384272373747e-05, "grad_norm": 15.631040573120117, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.849279522895813, "num_tokens": 243664549.0, "step": 6388 }, { "epoch": 0.8127464699147691, "ewc_loss": 0.02407298982143402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4072989617707208e-05, "grad_norm": 15.613069534301758, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8560945987701416, "num_tokens": 243710408.0, "step": 6389 }, { "epoch": 0.8128736801933596, "ewc_loss": 0.02411436289548874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4114362531690858e-05, "grad_norm": 15.670451164245605, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.862533688545227, "num_tokens": 243752598.0, "step": 6390 }, { "epoch": 0.8130008904719501, "ewc_loss": 0.024083439260721207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.408343971183058e-05, "grad_norm": 15.605974197387695, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8584752082824707, "num_tokens": 243789805.0, "step": 6391 }, { "epoch": 0.8131281007505406, "ewc_loss": 0.02408750355243683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.40875033341581e-05, "grad_norm": 15.715195655822754, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8721446394920349, "num_tokens": 243824124.0, "step": 6392 }, { "epoch": 0.8132553110291312, "ewc_loss": 0.024098338559269905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4098339054035023e-05, "grad_norm": 15.71662425994873, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.861352801322937, "num_tokens": 243863448.0, "step": 6393 }, { "epoch": 0.8133825213077217, "ewc_loss": 0.024055983871221542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4055983885773458e-05, "grad_norm": 15.603178977966309, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8648300170898438, "num_tokens": 243902085.0, "step": 6394 }, { "epoch": 0.8135097315863121, "ewc_loss": 0.02406015247106552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4060153009486385e-05, "grad_norm": 15.670679092407227, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8555786609649658, "num_tokens": 243945349.0, "step": 6395 }, { "epoch": 0.8136369418649027, "ewc_loss": 0.024072423577308655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4072423912002705e-05, "grad_norm": 15.659722328186035, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8591269850730896, "num_tokens": 243982221.0, "step": 6396 }, { "epoch": 0.8137641521434932, "ewc_loss": 0.024063726887106895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4063727323664352e-05, "grad_norm": 15.710593223571777, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8487299680709839, "num_tokens": 244012576.0, "step": 6397 }, { "epoch": 0.8138913624220837, "ewc_loss": 0.02408248744904995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.408248656138312e-05, "grad_norm": 15.6898832321167, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8685919046401978, "num_tokens": 244052089.0, "step": 6398 }, { "epoch": 0.8140185727006742, "ewc_loss": 0.024022039026021957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.402203972451389e-05, "grad_norm": 15.66122817993164, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8626219034194946, "num_tokens": 244088712.0, "step": 6399 }, { "epoch": 0.8141457829792648, "ewc_loss": 0.024003533646464348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4003533326322213e-05, "grad_norm": 15.661538124084473, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8630149364471436, "num_tokens": 244127655.0, "step": 6400 }, { "epoch": 0.8142729932578552, "ewc_loss": 0.02400912158191204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4009121261769906e-05, "grad_norm": 15.632219314575195, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8672743439674377, "num_tokens": 244158588.0, "step": 6401 }, { "epoch": 0.8144002035364457, "ewc_loss": 0.024055154994130135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.405515442660544e-05, "grad_norm": 15.688070297241211, "learning_rate": 1e-06, "loss": 0.3643, "mean_token_accuracy": 0.8796430826187134, "num_tokens": 244193872.0, "step": 6402 }, { "epoch": 0.8145274138150362, "ewc_loss": 0.024077756330370903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.40777571889339e-05, "grad_norm": 15.668217658996582, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8465332984924316, "num_tokens": 244234055.0, "step": 6403 }, { "epoch": 0.8146546240936268, "ewc_loss": 0.024020114913582802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4020115233724937e-05, "grad_norm": 15.63217830657959, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8628518581390381, "num_tokens": 244275163.0, "step": 6404 }, { "epoch": 0.8147818343722173, "ewc_loss": 0.024026403203606606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4026403480092995e-05, "grad_norm": 15.632957458496094, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8583725690841675, "num_tokens": 244315241.0, "step": 6405 }, { "epoch": 0.8149090446508078, "ewc_loss": 0.024117862805724144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.411786226730328e-05, "grad_norm": 15.687782287597656, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8762509822845459, "num_tokens": 244351947.0, "step": 6406 }, { "epoch": 0.8150362549293982, "ewc_loss": 0.024066705256700516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4066705009317957e-05, "grad_norm": 15.640016555786133, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8596249222755432, "num_tokens": 244387706.0, "step": 6407 }, { "epoch": 0.8151634652079888, "ewc_loss": 0.02404169738292694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.404169754299801e-05, "grad_norm": 15.633800506591797, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8703908324241638, "num_tokens": 244428447.0, "step": 6408 }, { "epoch": 0.8152906754865793, "ewc_loss": 0.02409040927886963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4090410079224966e-05, "grad_norm": 15.662769317626953, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8465908765792847, "num_tokens": 244471191.0, "step": 6409 }, { "epoch": 0.8154178857651698, "ewc_loss": 0.024051057174801826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4051058062468655e-05, "grad_norm": 15.66301155090332, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8434657454490662, "num_tokens": 244509198.0, "step": 6410 }, { "epoch": 0.8155450960437604, "ewc_loss": 0.024064140394330025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4064140234258957e-05, "grad_norm": 15.626631736755371, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8531530499458313, "num_tokens": 244549677.0, "step": 6411 }, { "epoch": 0.8156723063223509, "ewc_loss": 0.024067014455795288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.406701423751656e-05, "grad_norm": 15.689638137817383, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8487274646759033, "num_tokens": 244583586.0, "step": 6412 }, { "epoch": 0.8157995166009414, "ewc_loss": 0.02407406084239483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4074061002465896e-05, "grad_norm": 15.617080688476562, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8693218231201172, "num_tokens": 244621220.0, "step": 6413 }, { "epoch": 0.8159267268795318, "ewc_loss": 0.024047594517469406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4047594706644304e-05, "grad_norm": 15.675336837768555, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8686701655387878, "num_tokens": 244655941.0, "step": 6414 }, { "epoch": 0.8160539371581224, "ewc_loss": 0.02410123124718666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.410123124718666e-05, "grad_norm": 15.666854858398438, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8785510063171387, "num_tokens": 244685807.0, "step": 6415 }, { "epoch": 0.8161811474367129, "ewc_loss": 0.024049032479524612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4049031708273105e-05, "grad_norm": 15.660826683044434, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8591147661209106, "num_tokens": 244726866.0, "step": 6416 }, { "epoch": 0.8163083577153034, "ewc_loss": 0.02410479076206684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.41047910094494e-05, "grad_norm": 15.656171798706055, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8524259328842163, "num_tokens": 244767273.0, "step": 6417 }, { "epoch": 0.8164355679938939, "ewc_loss": 0.024089422076940536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.408942236797884e-05, "grad_norm": 15.616124153137207, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8568471670150757, "num_tokens": 244806111.0, "step": 6418 }, { "epoch": 0.8165627782724845, "ewc_loss": 0.02411014400422573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4110144295264035e-05, "grad_norm": 15.623924255371094, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8723289370536804, "num_tokens": 244841706.0, "step": 6419 }, { "epoch": 0.8166899885510749, "ewc_loss": 0.024128222838044167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4128223230945878e-05, "grad_norm": 15.650289535522461, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8640081286430359, "num_tokens": 244883908.0, "step": 6420 }, { "epoch": 0.8168171988296654, "ewc_loss": 0.02410024031996727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4100239897961728e-05, "grad_norm": 15.61884593963623, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.853671133518219, "num_tokens": 244919090.0, "step": 6421 }, { "epoch": 0.8169444091082559, "ewc_loss": 0.024153269827365875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.41532688960433e-05, "grad_norm": 15.638605117797852, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8757144212722778, "num_tokens": 244961141.0, "step": 6422 }, { "epoch": 0.8170716193868465, "ewc_loss": 0.024125628173351288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4125627533067018e-05, "grad_norm": 15.608914375305176, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8460013270378113, "num_tokens": 245004503.0, "step": 6423 }, { "epoch": 0.817198829665437, "ewc_loss": 0.02411689981818199, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4116900021908805e-05, "grad_norm": 15.589113235473633, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8605374097824097, "num_tokens": 245045042.0, "step": 6424 }, { "epoch": 0.8173260399440275, "ewc_loss": 0.024141481146216393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4141481844708323e-05, "grad_norm": 15.622934341430664, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8600326180458069, "num_tokens": 245085416.0, "step": 6425 }, { "epoch": 0.8174532502226179, "ewc_loss": 0.024113943800330162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4113944164128043e-05, "grad_norm": 15.58838939666748, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8517867922782898, "num_tokens": 245116137.0, "step": 6426 }, { "epoch": 0.8175804605012085, "ewc_loss": 0.024156494066119194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4156493964255787e-05, "grad_norm": 15.629905700683594, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8740154504776001, "num_tokens": 245159181.0, "step": 6427 }, { "epoch": 0.817707670779799, "ewc_loss": 0.024156270548701286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.415627022855915e-05, "grad_norm": 15.553972244262695, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8531574606895447, "num_tokens": 245205199.0, "step": 6428 }, { "epoch": 0.8178348810583895, "ewc_loss": 0.02416534349322319, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4165343347704038e-05, "grad_norm": 15.7289457321167, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8664717078208923, "num_tokens": 245242455.0, "step": 6429 }, { "epoch": 0.8179620913369801, "ewc_loss": 0.024170907214283943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4170907636289485e-05, "grad_norm": 15.586984634399414, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.8809858560562134, "num_tokens": 245280675.0, "step": 6430 }, { "epoch": 0.8180893016155706, "ewc_loss": 0.024064404889941216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.406440398772247e-05, "grad_norm": 15.642094612121582, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8771365880966187, "num_tokens": 245316947.0, "step": 6431 }, { "epoch": 0.818216511894161, "ewc_loss": 0.02416585385799408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4165854483726434e-05, "grad_norm": 15.631778717041016, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8654970526695251, "num_tokens": 245355145.0, "step": 6432 }, { "epoch": 0.8183437221727515, "ewc_loss": 0.024115532636642456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4115532141877338e-05, "grad_norm": 15.66292953491211, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8728774785995483, "num_tokens": 245390282.0, "step": 6433 }, { "epoch": 0.8184709324513421, "ewc_loss": 0.024139221757650375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.413922265986912e-05, "grad_norm": 15.663227081298828, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8713135719299316, "num_tokens": 245430262.0, "step": 6434 }, { "epoch": 0.8185981427299326, "ewc_loss": 0.02413262613117695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4132627004291862e-05, "grad_norm": 15.701436996459961, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8590152263641357, "num_tokens": 245470874.0, "step": 6435 }, { "epoch": 0.8187253530085231, "ewc_loss": 0.024128686636686325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4128687073243782e-05, "grad_norm": 15.628379821777344, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8682457208633423, "num_tokens": 245506626.0, "step": 6436 }, { "epoch": 0.8188525632871136, "ewc_loss": 0.024125877767801285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4125878553604707e-05, "grad_norm": 15.634345054626465, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8459705114364624, "num_tokens": 245546794.0, "step": 6437 }, { "epoch": 0.8189797735657041, "ewc_loss": 0.024158267304301262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4158267478924245e-05, "grad_norm": 15.68198013305664, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8644031286239624, "num_tokens": 245589544.0, "step": 6438 }, { "epoch": 0.8191069838442946, "ewc_loss": 0.024117620661854744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.411762034171261e-05, "grad_norm": 15.654313087463379, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.878456175327301, "num_tokens": 245628889.0, "step": 6439 }, { "epoch": 0.8192341941228851, "ewc_loss": 0.0240945965051651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4094595573842525e-05, "grad_norm": 15.659581184387207, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8592281341552734, "num_tokens": 245674615.0, "step": 6440 }, { "epoch": 0.8193614044014756, "ewc_loss": 0.02410772070288658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4107721401378512e-05, "grad_norm": 15.591218948364258, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8614762425422668, "num_tokens": 245713966.0, "step": 6441 }, { "epoch": 0.8194886146800662, "ewc_loss": 0.02411123551428318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4111235688906163e-05, "grad_norm": 15.65079116821289, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.866304337978363, "num_tokens": 245750035.0, "step": 6442 }, { "epoch": 0.8196158249586567, "ewc_loss": 0.02413215860724449, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.413215952401515e-05, "grad_norm": 15.6304349899292, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.876959502696991, "num_tokens": 245788506.0, "step": 6443 }, { "epoch": 0.8197430352372471, "ewc_loss": 0.024131853133440018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4131853933795355e-05, "grad_norm": 15.70992660522461, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8647782802581787, "num_tokens": 245826547.0, "step": 6444 }, { "epoch": 0.8198702455158376, "ewc_loss": 0.024115463718771935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4115463020280004e-05, "grad_norm": 15.671693801879883, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.852716863155365, "num_tokens": 245874363.0, "step": 6445 }, { "epoch": 0.8199974557944282, "ewc_loss": 0.02403821051120758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.403821054031141e-05, "grad_norm": 15.632163047790527, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8434363007545471, "num_tokens": 245917813.0, "step": 6446 }, { "epoch": 0.8201246660730187, "ewc_loss": 0.02409779466688633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4097795176203363e-05, "grad_norm": 15.677311897277832, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8579813241958618, "num_tokens": 245952177.0, "step": 6447 }, { "epoch": 0.8202518763516092, "ewc_loss": 0.02410467527806759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4104674594127573e-05, "grad_norm": 15.689436912536621, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8640648126602173, "num_tokens": 245992897.0, "step": 6448 }, { "epoch": 0.8203790866301998, "ewc_loss": 0.02407657541334629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4076574845821597e-05, "grad_norm": 15.640419960021973, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8704859018325806, "num_tokens": 246029657.0, "step": 6449 }, { "epoch": 0.8205062969087902, "ewc_loss": 0.02407471276819706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4074712200672366e-05, "grad_norm": 15.668008804321289, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8520770072937012, "num_tokens": 246072425.0, "step": 6450 }, { "epoch": 0.8206335071873807, "ewc_loss": 0.024066103622317314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4066102923825383e-05, "grad_norm": 15.620827674865723, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8799890279769897, "num_tokens": 246113131.0, "step": 6451 }, { "epoch": 0.8207607174659712, "ewc_loss": 0.02405049279332161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4050492356764153e-05, "grad_norm": 15.676589012145996, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8648616671562195, "num_tokens": 246154548.0, "step": 6452 }, { "epoch": 0.8208879277445618, "ewc_loss": 0.024037975817918777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4037975890678354e-05, "grad_norm": 15.649805068969727, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8582773208618164, "num_tokens": 246189518.0, "step": 6453 }, { "epoch": 0.8210151380231523, "ewc_loss": 0.024096431210637093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4096430934150703e-05, "grad_norm": 15.658495903015137, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8576000928878784, "num_tokens": 246235248.0, "step": 6454 }, { "epoch": 0.8211423483017428, "ewc_loss": 0.02408479154109955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4084791220957413e-05, "grad_norm": 15.65526008605957, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8557257056236267, "num_tokens": 246277016.0, "step": 6455 }, { "epoch": 0.8212695585803332, "ewc_loss": 0.024129575118422508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4129574740072712e-05, "grad_norm": 15.677326202392578, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8590412735939026, "num_tokens": 246312352.0, "step": 6456 }, { "epoch": 0.8213967688589238, "ewc_loss": 0.024075163528323174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4075163310044445e-05, "grad_norm": 15.655632972717285, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8483558893203735, "num_tokens": 246355079.0, "step": 6457 }, { "epoch": 0.8215239791375143, "ewc_loss": 0.024039292708039284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.403929283900652e-05, "grad_norm": 15.677350044250488, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8492177724838257, "num_tokens": 246391073.0, "step": 6458 }, { "epoch": 0.8216511894161048, "ewc_loss": 0.02412610501050949, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4126104108290747e-05, "grad_norm": 15.68831729888916, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8680679202079773, "num_tokens": 246435407.0, "step": 6459 }, { "epoch": 0.8217783996946953, "ewc_loss": 0.024026785045862198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.402678546786774e-05, "grad_norm": 15.688931465148926, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.8489012718200684, "num_tokens": 246475303.0, "step": 6460 }, { "epoch": 0.8219056099732859, "ewc_loss": 0.024090567603707314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4090568331303075e-05, "grad_norm": 15.692346572875977, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8595739603042603, "num_tokens": 246515237.0, "step": 6461 }, { "epoch": 0.8220328202518764, "ewc_loss": 0.024014055728912354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4014056180021726e-05, "grad_norm": 15.676701545715332, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.857193112373352, "num_tokens": 246554958.0, "step": 6462 }, { "epoch": 0.8221600305304668, "ewc_loss": 0.02401007153093815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4010070774238557e-05, "grad_norm": 15.641515731811523, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8624832034111023, "num_tokens": 246592327.0, "step": 6463 }, { "epoch": 0.8222872408090574, "ewc_loss": 0.02404547855257988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.404547922196798e-05, "grad_norm": 15.674126625061035, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8657008409500122, "num_tokens": 246626965.0, "step": 6464 }, { "epoch": 0.8224144510876479, "ewc_loss": 0.024051034823060036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.405103441560641e-05, "grad_norm": 15.70299243927002, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.858585774898529, "num_tokens": 246662049.0, "step": 6465 }, { "epoch": 0.8225416613662384, "ewc_loss": 0.024054814130067825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4054814275586978e-05, "grad_norm": 15.632184982299805, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8630663156509399, "num_tokens": 246696989.0, "step": 6466 }, { "epoch": 0.8226688716448289, "ewc_loss": 0.024013293907046318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.401329402346164e-05, "grad_norm": 15.654891014099121, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8661851286888123, "num_tokens": 246738418.0, "step": 6467 }, { "epoch": 0.8227960819234195, "ewc_loss": 0.024094998836517334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4094999389490113e-05, "grad_norm": 15.707085609436035, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8631420135498047, "num_tokens": 246776779.0, "step": 6468 }, { "epoch": 0.8229232922020099, "ewc_loss": 0.024064917117357254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.406491694273427e-05, "grad_norm": 15.689168930053711, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8678210377693176, "num_tokens": 246814029.0, "step": 6469 }, { "epoch": 0.8230505024806004, "ewc_loss": 0.024068651720881462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.406865132797975e-05, "grad_norm": 15.678971290588379, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8532042503356934, "num_tokens": 246856390.0, "step": 6470 }, { "epoch": 0.8231777127591909, "ewc_loss": 0.02403492107987404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.40349218074698e-05, "grad_norm": 15.58273983001709, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8633682727813721, "num_tokens": 246895563.0, "step": 6471 }, { "epoch": 0.8233049230377815, "ewc_loss": 0.024091452360153198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4091452360153198e-05, "grad_norm": 15.715167045593262, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8579825162887573, "num_tokens": 246935956.0, "step": 6472 }, { "epoch": 0.823432133316372, "ewc_loss": 0.024077901616692543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.407790088909678e-05, "grad_norm": 15.617910385131836, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8579795360565186, "num_tokens": 246984765.0, "step": 6473 }, { "epoch": 0.8235593435949625, "ewc_loss": 0.02408401295542717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4084012693492696e-05, "grad_norm": 15.693340301513672, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.865036129951477, "num_tokens": 247017086.0, "step": 6474 }, { "epoch": 0.8236865538735529, "ewc_loss": 0.02403557300567627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.403557300567627e-05, "grad_norm": 15.659679412841797, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8669856786727905, "num_tokens": 247056195.0, "step": 6475 }, { "epoch": 0.8238137641521435, "ewc_loss": 0.024084782227873802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4084782126010396e-05, "grad_norm": 15.592540740966797, "learning_rate": 1e-06, "loss": 0.3846, "mean_token_accuracy": 0.8761137127876282, "num_tokens": 247094063.0, "step": 6476 }, { "epoch": 0.823940974430734, "ewc_loss": 0.024061014875769615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4061015210463665e-05, "grad_norm": 15.646881103515625, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.860548198223114, "num_tokens": 247131743.0, "step": 6477 }, { "epoch": 0.8240681847093245, "ewc_loss": 0.02412765845656395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.412765934423078e-05, "grad_norm": 15.68394660949707, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8592392802238464, "num_tokens": 247169477.0, "step": 6478 }, { "epoch": 0.824195394987915, "ewc_loss": 0.024088425561785698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4088425561785698e-05, "grad_norm": 15.654975891113281, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8709137439727783, "num_tokens": 247209159.0, "step": 6479 }, { "epoch": 0.8243226052665056, "ewc_loss": 0.0241185761988163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4118577130138874e-05, "grad_norm": 15.679924964904785, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8634454607963562, "num_tokens": 247247603.0, "step": 6480 }, { "epoch": 0.824449815545096, "ewc_loss": 0.02409021183848381, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.409021180937998e-05, "grad_norm": 15.655858993530273, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8715618848800659, "num_tokens": 247290184.0, "step": 6481 }, { "epoch": 0.8245770258236865, "ewc_loss": 0.024052469059824944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4052469598245807e-05, "grad_norm": 15.630617141723633, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8442236185073853, "num_tokens": 247331862.0, "step": 6482 }, { "epoch": 0.824704236102277, "ewc_loss": 0.024075860157608986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4075859982986003e-05, "grad_norm": 15.670823097229004, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8617020845413208, "num_tokens": 247368271.0, "step": 6483 }, { "epoch": 0.8248314463808676, "ewc_loss": 0.024093162268400192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.409316221019253e-05, "grad_norm": 15.659975051879883, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8610708117485046, "num_tokens": 247408077.0, "step": 6484 }, { "epoch": 0.8249586566594581, "ewc_loss": 0.02409822680056095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4098226276692003e-05, "grad_norm": 15.642271041870117, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8697046041488647, "num_tokens": 247447408.0, "step": 6485 }, { "epoch": 0.8250858669380486, "ewc_loss": 0.024097129702568054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4097129426081665e-05, "grad_norm": 15.646761894226074, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8731850385665894, "num_tokens": 247489863.0, "step": 6486 }, { "epoch": 0.8252130772166391, "ewc_loss": 0.024136660620570183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4136659703799523e-05, "grad_norm": 15.620652198791504, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8578273057937622, "num_tokens": 247532233.0, "step": 6487 }, { "epoch": 0.8253402874952296, "ewc_loss": 0.02412816695868969, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4128166842274368e-05, "grad_norm": 15.73654556274414, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8373523354530334, "num_tokens": 247570719.0, "step": 6488 }, { "epoch": 0.8254674977738201, "ewc_loss": 0.024140631780028343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4140632376656868e-05, "grad_norm": 15.718524932861328, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8675317764282227, "num_tokens": 247603302.0, "step": 6489 }, { "epoch": 0.8255947080524106, "ewc_loss": 0.02408183366060257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4081833544187248e-05, "grad_norm": 15.59333610534668, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.864499032497406, "num_tokens": 247642837.0, "step": 6490 }, { "epoch": 0.8257219183310012, "ewc_loss": 0.02408682368695736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4086823032121174e-05, "grad_norm": 15.716141700744629, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8592463731765747, "num_tokens": 247685246.0, "step": 6491 }, { "epoch": 0.8258491286095917, "ewc_loss": 0.02413252554833889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4132525140885264e-05, "grad_norm": 15.6226224899292, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8538349270820618, "num_tokens": 247720871.0, "step": 6492 }, { "epoch": 0.8259763388881821, "ewc_loss": 0.024116987362504005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4116987333400175e-05, "grad_norm": 15.678553581237793, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8635627031326294, "num_tokens": 247759796.0, "step": 6493 }, { "epoch": 0.8261035491667726, "ewc_loss": 0.024157030507922173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4157030566129833e-05, "grad_norm": 15.633270263671875, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8418078422546387, "num_tokens": 247806486.0, "step": 6494 }, { "epoch": 0.8262307594453632, "ewc_loss": 0.024098271504044533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.409827175142709e-05, "grad_norm": 15.640203475952148, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8696470260620117, "num_tokens": 247842031.0, "step": 6495 }, { "epoch": 0.8263579697239537, "ewc_loss": 0.02415633760392666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4156337531167082e-05, "grad_norm": 15.707185745239258, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8655810356140137, "num_tokens": 247883438.0, "step": 6496 }, { "epoch": 0.8264851800025442, "ewc_loss": 0.024120893329381943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4120892703649588e-05, "grad_norm": 15.547394752502441, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8707979917526245, "num_tokens": 247921491.0, "step": 6497 }, { "epoch": 0.8266123902811348, "ewc_loss": 0.024147264659404755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4147264412022196e-05, "grad_norm": 15.720036506652832, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8722440004348755, "num_tokens": 247957454.0, "step": 6498 }, { "epoch": 0.8267396005597252, "ewc_loss": 0.024209817871451378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4209817638620734e-05, "grad_norm": 15.630520820617676, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8469846248626709, "num_tokens": 247997705.0, "step": 6499 }, { "epoch": 0.8268668108383157, "ewc_loss": 0.024097532033920288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.409753142273985e-05, "grad_norm": 15.605847358703613, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8572432994842529, "num_tokens": 248034536.0, "step": 6500 }, { "epoch": 0.8269940211169062, "ewc_loss": 0.024174267426133156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4174267309717834e-05, "grad_norm": 15.597185134887695, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8628608584403992, "num_tokens": 248080916.0, "step": 6501 }, { "epoch": 0.8271212313954968, "ewc_loss": 0.02417081966996193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4170820324798115e-05, "grad_norm": 15.669902801513672, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8611953258514404, "num_tokens": 248119719.0, "step": 6502 }, { "epoch": 0.8272484416740873, "ewc_loss": 0.024190228432416916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.419022894173395e-05, "grad_norm": 15.678098678588867, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8590376377105713, "num_tokens": 248156221.0, "step": 6503 }, { "epoch": 0.8273756519526778, "ewc_loss": 0.02416297234594822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4162973204511218e-05, "grad_norm": 15.65600872039795, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8660876154899597, "num_tokens": 248191780.0, "step": 6504 }, { "epoch": 0.8275028622312682, "ewc_loss": 0.024141013622283936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4141014364431612e-05, "grad_norm": 15.63736343383789, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8568499088287354, "num_tokens": 248228594.0, "step": 6505 }, { "epoch": 0.8276300725098588, "ewc_loss": 0.024191297590732574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4191298507503234e-05, "grad_norm": 15.633438110351562, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8662145137786865, "num_tokens": 248260372.0, "step": 6506 }, { "epoch": 0.8277572827884493, "ewc_loss": 0.02421814389526844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4218143153120764e-05, "grad_norm": 15.679816246032715, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8536636829376221, "num_tokens": 248299361.0, "step": 6507 }, { "epoch": 0.8278844930670398, "ewc_loss": 0.02415010891854763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.415010931144934e-05, "grad_norm": 15.578621864318848, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8543563485145569, "num_tokens": 248337370.0, "step": 6508 }, { "epoch": 0.8280117033456303, "ewc_loss": 0.024221647530794144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4221648345701396e-05, "grad_norm": 15.707993507385254, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8643207550048828, "num_tokens": 248377541.0, "step": 6509 }, { "epoch": 0.8281389136242209, "ewc_loss": 0.024234313517808914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4234313968918286e-05, "grad_norm": 15.649754524230957, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8446009159088135, "num_tokens": 248413928.0, "step": 6510 }, { "epoch": 0.8282661239028114, "ewc_loss": 0.02422577142715454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4225771994679235e-05, "grad_norm": 15.689888000488281, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8618016839027405, "num_tokens": 248453283.0, "step": 6511 }, { "epoch": 0.8283933341814018, "ewc_loss": 0.024274496361613274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4274497263832018e-05, "grad_norm": 15.629384994506836, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8664727210998535, "num_tokens": 248490286.0, "step": 6512 }, { "epoch": 0.8285205444599923, "ewc_loss": 0.024206191301345825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4206190573750064e-05, "grad_norm": 15.655167579650879, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8472393751144409, "num_tokens": 248526349.0, "step": 6513 }, { "epoch": 0.8286477547385829, "ewc_loss": 0.024334274232387543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4334274712600745e-05, "grad_norm": 15.756731033325195, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8543083071708679, "num_tokens": 248561751.0, "step": 6514 }, { "epoch": 0.8287749650171734, "ewc_loss": 0.024241836741566658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4241837309091352e-05, "grad_norm": 15.636283874511719, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8750932216644287, "num_tokens": 248601921.0, "step": 6515 }, { "epoch": 0.8289021752957639, "ewc_loss": 0.0242534801363945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.425348066026345e-05, "grad_norm": 15.66964340209961, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8730677962303162, "num_tokens": 248641218.0, "step": 6516 }, { "epoch": 0.8290293855743545, "ewc_loss": 0.024271389469504356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4271388610941358e-05, "grad_norm": 15.729260444641113, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8620893359184265, "num_tokens": 248672616.0, "step": 6517 }, { "epoch": 0.8291565958529449, "ewc_loss": 0.024246465414762497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4246464818133973e-05, "grad_norm": 15.684210777282715, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.865525484085083, "num_tokens": 248708212.0, "step": 6518 }, { "epoch": 0.8292838061315354, "ewc_loss": 0.024252889677882195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4252889488707297e-05, "grad_norm": 15.704055786132812, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.866115927696228, "num_tokens": 248746575.0, "step": 6519 }, { "epoch": 0.8294110164101259, "ewc_loss": 0.024253955110907555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4253955416497774e-05, "grad_norm": 15.726948738098145, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8580823540687561, "num_tokens": 248789423.0, "step": 6520 }, { "epoch": 0.8295382266887165, "ewc_loss": 0.024212893098592758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.421289354970213e-05, "grad_norm": 15.668082237243652, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8646414279937744, "num_tokens": 248824044.0, "step": 6521 }, { "epoch": 0.829665436967307, "ewc_loss": 0.024213170632719994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.421317003609147e-05, "grad_norm": 15.730058670043945, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8586669564247131, "num_tokens": 248860805.0, "step": 6522 }, { "epoch": 0.8297926472458975, "ewc_loss": 0.02421005256474018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.421005228825379e-05, "grad_norm": 15.607298851013184, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.868389368057251, "num_tokens": 248898958.0, "step": 6523 }, { "epoch": 0.8299198575244879, "ewc_loss": 0.024197140708565712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4197141101467423e-05, "grad_norm": 15.683952331542969, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8609521985054016, "num_tokens": 248938847.0, "step": 6524 }, { "epoch": 0.8300470678030785, "ewc_loss": 0.024282939732074738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4282939193653874e-05, "grad_norm": 15.619519233703613, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8582157492637634, "num_tokens": 248977944.0, "step": 6525 }, { "epoch": 0.830174278081669, "ewc_loss": 0.024258993566036224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4258994017145596e-05, "grad_norm": 15.751374244689941, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8558392524719238, "num_tokens": 249014610.0, "step": 6526 }, { "epoch": 0.8303014883602595, "ewc_loss": 0.024288693442940712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.428869265713729e-05, "grad_norm": 15.661144256591797, "learning_rate": 1e-06, "loss": 0.5303, "mean_token_accuracy": 0.8313367962837219, "num_tokens": 249053869.0, "step": 6527 }, { "epoch": 0.83042869863885, "ewc_loss": 0.024207891896367073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.420789132884238e-05, "grad_norm": 15.683152198791504, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8544237017631531, "num_tokens": 249096874.0, "step": 6528 }, { "epoch": 0.8305559089174406, "ewc_loss": 0.024304717779159546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.430471795378253e-05, "grad_norm": 15.669821739196777, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8612719774246216, "num_tokens": 249133141.0, "step": 6529 }, { "epoch": 0.830683119196031, "ewc_loss": 0.024273714050650597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.427371327939909e-05, "grad_norm": 15.693563461303711, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8669813871383667, "num_tokens": 249163813.0, "step": 6530 }, { "epoch": 0.8308103294746215, "ewc_loss": 0.02431625872850418, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4316259441548027e-05, "grad_norm": 15.734638214111328, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.85404372215271, "num_tokens": 249202935.0, "step": 6531 }, { "epoch": 0.830937539753212, "ewc_loss": 0.02427939884364605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4279399440274574e-05, "grad_norm": 15.687809944152832, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8689646124839783, "num_tokens": 249241061.0, "step": 6532 }, { "epoch": 0.8310647500318026, "ewc_loss": 0.02427513524889946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4275135729112662e-05, "grad_norm": 15.696852684020996, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8500373363494873, "num_tokens": 249277863.0, "step": 6533 }, { "epoch": 0.8311919603103931, "ewc_loss": 0.0243141558021307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4314156689797528e-05, "grad_norm": 15.731995582580566, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8608700037002563, "num_tokens": 249318047.0, "step": 6534 }, { "epoch": 0.8313191705889836, "ewc_loss": 0.024299466982483864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4299466531374492e-05, "grad_norm": 15.722846031188965, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8561503887176514, "num_tokens": 249362095.0, "step": 6535 }, { "epoch": 0.831446380867574, "ewc_loss": 0.024304796010255814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.430479617032688e-05, "grad_norm": 15.7341947555542, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8752633929252625, "num_tokens": 249403362.0, "step": 6536 }, { "epoch": 0.8315735911461646, "ewc_loss": 0.024277877062559128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4277876946143806e-05, "grad_norm": 15.738015174865723, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.868636965751648, "num_tokens": 249438051.0, "step": 6537 }, { "epoch": 0.8317008014247551, "ewc_loss": 0.02429635263979435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.429635242151562e-05, "grad_norm": 15.729377746582031, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8660167455673218, "num_tokens": 249477185.0, "step": 6538 }, { "epoch": 0.8318280117033456, "ewc_loss": 0.024228673428297043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.422867328277789e-05, "grad_norm": 15.701146125793457, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8550875782966614, "num_tokens": 249516880.0, "step": 6539 }, { "epoch": 0.8319552219819362, "ewc_loss": 0.02423654869198799, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4236549506895244e-05, "grad_norm": 15.644925117492676, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8719109892845154, "num_tokens": 249555864.0, "step": 6540 }, { "epoch": 0.8320824322605267, "ewc_loss": 0.02427995391190052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4279954232042655e-05, "grad_norm": 15.747299194335938, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8597381711006165, "num_tokens": 249590566.0, "step": 6541 }, { "epoch": 0.8322096425391171, "ewc_loss": 0.024259712547063828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4259712517959997e-05, "grad_norm": 15.723796844482422, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8740489482879639, "num_tokens": 249631523.0, "step": 6542 }, { "epoch": 0.8323368528177076, "ewc_loss": 0.02421310916543007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4213110009441152e-05, "grad_norm": 15.747687339782715, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8710918426513672, "num_tokens": 249674113.0, "step": 6543 }, { "epoch": 0.8324640630962982, "ewc_loss": 0.024248560890555382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4248560293926857e-05, "grad_norm": 15.748201370239258, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8720353245735168, "num_tokens": 249712934.0, "step": 6544 }, { "epoch": 0.8325912733748887, "ewc_loss": 0.024232741445302963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.423274054308422e-05, "grad_norm": 15.7196683883667, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8669948577880859, "num_tokens": 249750295.0, "step": 6545 }, { "epoch": 0.8327184836534792, "ewc_loss": 0.02420087531208992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4200875486712903e-05, "grad_norm": 15.732538223266602, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8657606840133667, "num_tokens": 249790289.0, "step": 6546 }, { "epoch": 0.8328456939320698, "ewc_loss": 0.024256426841020584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4256427423097193e-05, "grad_norm": 15.766536712646484, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8759194612503052, "num_tokens": 249822873.0, "step": 6547 }, { "epoch": 0.8329729042106602, "ewc_loss": 0.024206433445215225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.420643431833014e-05, "grad_norm": 15.73432731628418, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8633477091789246, "num_tokens": 249860564.0, "step": 6548 }, { "epoch": 0.8331001144892507, "ewc_loss": 0.024146832525730133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4146833311533555e-05, "grad_norm": 15.69127082824707, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.860933780670166, "num_tokens": 249901448.0, "step": 6549 }, { "epoch": 0.8332273247678412, "ewc_loss": 0.024182675406336784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4182674678741023e-05, "grad_norm": 15.747427940368652, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8519202470779419, "num_tokens": 249933524.0, "step": 6550 }, { "epoch": 0.8333545350464318, "ewc_loss": 0.024205723777413368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4205723093473352e-05, "grad_norm": 15.712477684020996, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8575325012207031, "num_tokens": 249971856.0, "step": 6551 }, { "epoch": 0.8334817453250223, "ewc_loss": 0.024154847487807274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.415484777884558e-05, "grad_norm": 15.67950439453125, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8542135953903198, "num_tokens": 250013940.0, "step": 6552 }, { "epoch": 0.8336089556036128, "ewc_loss": 0.024204816669225693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4204817236750387e-05, "grad_norm": 15.760953903198242, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8449164628982544, "num_tokens": 250044826.0, "step": 6553 }, { "epoch": 0.8337361658822032, "ewc_loss": 0.02420845814049244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4208458853536285e-05, "grad_norm": 15.679574012756348, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8465521931648254, "num_tokens": 250087409.0, "step": 6554 }, { "epoch": 0.8338633761607938, "ewc_loss": 0.024217279627919197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.421727913315408e-05, "grad_norm": 15.706730842590332, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.869163990020752, "num_tokens": 250126236.0, "step": 6555 }, { "epoch": 0.8339905864393843, "ewc_loss": 0.024284755811095238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4284756364068016e-05, "grad_norm": 15.700508117675781, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8698303699493408, "num_tokens": 250163559.0, "step": 6556 }, { "epoch": 0.8341177967179748, "ewc_loss": 0.024243587628006935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4243587176897563e-05, "grad_norm": 15.707362174987793, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8623703718185425, "num_tokens": 250203536.0, "step": 6557 }, { "epoch": 0.8342450069965653, "ewc_loss": 0.024274230003356934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4274229872389697e-05, "grad_norm": 15.676207542419434, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8661850690841675, "num_tokens": 250237455.0, "step": 6558 }, { "epoch": 0.8343722172751559, "ewc_loss": 0.024237480014562607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.423748082946986e-05, "grad_norm": 15.695453643798828, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8698071241378784, "num_tokens": 250274559.0, "step": 6559 }, { "epoch": 0.8344994275537464, "ewc_loss": 0.024287432432174683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4287432097480632e-05, "grad_norm": 15.697638511657715, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8639016151428223, "num_tokens": 250309392.0, "step": 6560 }, { "epoch": 0.8346266378323368, "ewc_loss": 0.024278691038489342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4278691853396595e-05, "grad_norm": 15.683953285217285, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8675399422645569, "num_tokens": 250352010.0, "step": 6561 }, { "epoch": 0.8347538481109273, "ewc_loss": 0.02430843934416771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4308439606102183e-05, "grad_norm": 15.714925765991211, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8502256870269775, "num_tokens": 250385745.0, "step": 6562 }, { "epoch": 0.8348810583895179, "ewc_loss": 0.0243134256452322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4313425456057303e-05, "grad_norm": 15.638104438781738, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8597893714904785, "num_tokens": 250426055.0, "step": 6563 }, { "epoch": 0.8350082686681084, "ewc_loss": 0.024323193356394768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4323193429154344e-05, "grad_norm": 15.784157752990723, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8593814373016357, "num_tokens": 250468795.0, "step": 6564 }, { "epoch": 0.8351354789466989, "ewc_loss": 0.024366192519664764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4366192519664764e-05, "grad_norm": 15.704032897949219, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8602767586708069, "num_tokens": 250509115.0, "step": 6565 }, { "epoch": 0.8352626892252895, "ewc_loss": 0.02425880916416645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4258808480226435e-05, "grad_norm": 15.7761869430542, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8628031015396118, "num_tokens": 250547770.0, "step": 6566 }, { "epoch": 0.8353898995038799, "ewc_loss": 0.02434498257935047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.434498310321942e-05, "grad_norm": 15.657544136047363, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.859199047088623, "num_tokens": 250589737.0, "step": 6567 }, { "epoch": 0.8355171097824704, "ewc_loss": 0.024248434230685234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4248434783658013e-05, "grad_norm": 15.76086139678955, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8701587319374084, "num_tokens": 250623058.0, "step": 6568 }, { "epoch": 0.8356443200610609, "ewc_loss": 0.024346690624952316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.434669113426935e-05, "grad_norm": 15.730118751525879, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8623913526535034, "num_tokens": 250660526.0, "step": 6569 }, { "epoch": 0.8357715303396515, "ewc_loss": 0.024248410016298294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4248409317806363e-05, "grad_norm": 15.735940933227539, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8458437919616699, "num_tokens": 250698218.0, "step": 6570 }, { "epoch": 0.835898740618242, "ewc_loss": 0.024313298985362053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4313298126799054e-05, "grad_norm": 15.72396183013916, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8548063039779663, "num_tokens": 250743410.0, "step": 6571 }, { "epoch": 0.8360259508968325, "ewc_loss": 0.024229871109128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4229871996794827e-05, "grad_norm": 15.625271797180176, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8720101118087769, "num_tokens": 250781919.0, "step": 6572 }, { "epoch": 0.8361531611754229, "ewc_loss": 0.02428598515689373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4285986000904813e-05, "grad_norm": 15.757177352905273, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.860388457775116, "num_tokens": 250817051.0, "step": 6573 }, { "epoch": 0.8362803714540135, "ewc_loss": 0.024327784776687622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4327784558408894e-05, "grad_norm": 15.657354354858398, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8590940833091736, "num_tokens": 250857241.0, "step": 6574 }, { "epoch": 0.836407581732604, "ewc_loss": 0.024285802617669106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4285802282975055e-05, "grad_norm": 15.64482307434082, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8634998798370361, "num_tokens": 250903591.0, "step": 6575 }, { "epoch": 0.8365347920111945, "ewc_loss": 0.024309836328029633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4309836589964107e-05, "grad_norm": 15.73367691040039, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8553165793418884, "num_tokens": 250945788.0, "step": 6576 }, { "epoch": 0.836662002289785, "ewc_loss": 0.024366235360503197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.436623617541045e-05, "grad_norm": 15.740764617919922, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8554494976997375, "num_tokens": 250984884.0, "step": 6577 }, { "epoch": 0.8367892125683756, "ewc_loss": 0.024287067353725433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.428706648061052e-05, "grad_norm": 15.6998929977417, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8737266659736633, "num_tokens": 251024347.0, "step": 6578 }, { "epoch": 0.836916422846966, "ewc_loss": 0.024293802678585052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.429380219837185e-05, "grad_norm": 15.716306686401367, "learning_rate": 1e-06, "loss": 0.3649, "mean_token_accuracy": 0.8835813999176025, "num_tokens": 251068527.0, "step": 6579 }, { "epoch": 0.8370436331255565, "ewc_loss": 0.024269958958029747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.426995888527017e-05, "grad_norm": 15.678502082824707, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8669171929359436, "num_tokens": 251109914.0, "step": 6580 }, { "epoch": 0.837170843404147, "ewc_loss": 0.024228084832429886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4228083930211142e-05, "grad_norm": 15.693976402282715, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8540600538253784, "num_tokens": 251144760.0, "step": 6581 }, { "epoch": 0.8372980536827376, "ewc_loss": 0.024252373725175858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.425237289571669e-05, "grad_norm": 15.7490873336792, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8600596785545349, "num_tokens": 251184456.0, "step": 6582 }, { "epoch": 0.8374252639613281, "ewc_loss": 0.0242521520704031, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.425215279799886e-05, "grad_norm": 15.695091247558594, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8549304008483887, "num_tokens": 251229493.0, "step": 6583 }, { "epoch": 0.8375524742399186, "ewc_loss": 0.024212302640080452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4212302378145978e-05, "grad_norm": 15.742313385009766, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8663538694381714, "num_tokens": 251270886.0, "step": 6584 }, { "epoch": 0.837679684518509, "ewc_loss": 0.024270491674542427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.427049184916541e-05, "grad_norm": 15.764467239379883, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8704096674919128, "num_tokens": 251304833.0, "step": 6585 }, { "epoch": 0.8378068947970996, "ewc_loss": 0.024210380390286446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.421037970634643e-05, "grad_norm": 15.722716331481934, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8648310899734497, "num_tokens": 251334196.0, "step": 6586 }, { "epoch": 0.8379341050756901, "ewc_loss": 0.024245452135801315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4245451641036198e-05, "grad_norm": 15.746194839477539, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8518132567405701, "num_tokens": 251380278.0, "step": 6587 }, { "epoch": 0.8380613153542806, "ewc_loss": 0.02423601783812046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.423601836198941e-05, "grad_norm": 15.865255355834961, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8565306663513184, "num_tokens": 251418432.0, "step": 6588 }, { "epoch": 0.8381885256328712, "ewc_loss": 0.02422180213034153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4221802959800698e-05, "grad_norm": 15.729884147644043, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8601809740066528, "num_tokens": 251453747.0, "step": 6589 }, { "epoch": 0.8383157359114617, "ewc_loss": 0.024133184924721718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.413318543403875e-05, "grad_norm": 15.8146333694458, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8624098300933838, "num_tokens": 251492439.0, "step": 6590 }, { "epoch": 0.8384429461900521, "ewc_loss": 0.02422669343650341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4226694222306833e-05, "grad_norm": 15.768814086914062, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8629722595214844, "num_tokens": 251527945.0, "step": 6591 }, { "epoch": 0.8385701564686426, "ewc_loss": 0.024166323244571686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.416632378299255e-05, "grad_norm": 15.702945709228516, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8705444931983948, "num_tokens": 251567161.0, "step": 6592 }, { "epoch": 0.8386973667472332, "ewc_loss": 0.024182429537177086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4182429115171544e-05, "grad_norm": 15.76734447479248, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.861997127532959, "num_tokens": 251601189.0, "step": 6593 }, { "epoch": 0.8388245770258237, "ewc_loss": 0.024239525198936462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4239525373559445e-05, "grad_norm": 15.809027671813965, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8600171804428101, "num_tokens": 251630846.0, "step": 6594 }, { "epoch": 0.8389517873044142, "ewc_loss": 0.024195896461606026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4195896912715398e-05, "grad_norm": 15.618090629577637, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8693651556968689, "num_tokens": 251665701.0, "step": 6595 }, { "epoch": 0.8390789975830047, "ewc_loss": 0.024260109290480614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.426010905764997e-05, "grad_norm": 15.803799629211426, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8547914624214172, "num_tokens": 251699909.0, "step": 6596 }, { "epoch": 0.8392062078615952, "ewc_loss": 0.024309545755386353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.430954555165954e-05, "grad_norm": 15.69851303100586, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8604884147644043, "num_tokens": 251734052.0, "step": 6597 }, { "epoch": 0.8393334181401857, "ewc_loss": 0.024250581860542297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4250581191154197e-05, "grad_norm": 15.695021629333496, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.850233256816864, "num_tokens": 251772518.0, "step": 6598 }, { "epoch": 0.8394606284187762, "ewc_loss": 0.024320533499121666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.432053406664636e-05, "grad_norm": 15.755602836608887, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8601573705673218, "num_tokens": 251816378.0, "step": 6599 }, { "epoch": 0.8395878386973668, "ewc_loss": 0.024319138377904892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.431913890177384e-05, "grad_norm": 15.676286697387695, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8620086908340454, "num_tokens": 251851264.0, "step": 6600 }, { "epoch": 0.8397150489759573, "ewc_loss": 0.024327998980879784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4327999199158512e-05, "grad_norm": 15.775497436523438, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8640233278274536, "num_tokens": 251888906.0, "step": 6601 }, { "epoch": 0.8398422592545478, "ewc_loss": 0.024429447948932648, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.442944787617307e-05, "grad_norm": 15.697399139404297, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8687710762023926, "num_tokens": 251930641.0, "step": 6602 }, { "epoch": 0.8399694695331382, "ewc_loss": 0.024308649823069572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4308650608872995e-05, "grad_norm": 15.723925590515137, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8431814312934875, "num_tokens": 251968929.0, "step": 6603 }, { "epoch": 0.8400966798117288, "ewc_loss": 0.024377329275012016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4377328372793272e-05, "grad_norm": 15.800369262695312, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8542660474777222, "num_tokens": 252006751.0, "step": 6604 }, { "epoch": 0.8402238900903193, "ewc_loss": 0.024380497634410858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.438049705233425e-05, "grad_norm": 15.676637649536133, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8581199049949646, "num_tokens": 252045861.0, "step": 6605 }, { "epoch": 0.8403511003689098, "ewc_loss": 0.024335632100701332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.433563167869579e-05, "grad_norm": 15.743517875671387, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8552197813987732, "num_tokens": 252084412.0, "step": 6606 }, { "epoch": 0.8404783106475003, "ewc_loss": 0.024437660351395607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.443766061333008e-05, "grad_norm": 15.717767715454102, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8660013675689697, "num_tokens": 252120576.0, "step": 6607 }, { "epoch": 0.8406055209260909, "ewc_loss": 0.024380506947636604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4380506147281267e-05, "grad_norm": 15.706975936889648, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8600557446479797, "num_tokens": 252161636.0, "step": 6608 }, { "epoch": 0.8407327312046813, "ewc_loss": 0.024389132857322693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4389133614022285e-05, "grad_norm": 15.714098930358887, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8630250096321106, "num_tokens": 252199669.0, "step": 6609 }, { "epoch": 0.8408599414832718, "ewc_loss": 0.024382516741752625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4382516130572185e-05, "grad_norm": 15.69994831085205, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.869232714176178, "num_tokens": 252236832.0, "step": 6610 }, { "epoch": 0.8409871517618623, "ewc_loss": 0.02436358667910099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4363585907849483e-05, "grad_norm": 15.675336837768555, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8575088381767273, "num_tokens": 252282518.0, "step": 6611 }, { "epoch": 0.8411143620404529, "ewc_loss": 0.02438877522945404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4388775273109786e-05, "grad_norm": 15.702207565307617, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8555270433425903, "num_tokens": 252316479.0, "step": 6612 }, { "epoch": 0.8412415723190434, "ewc_loss": 0.024401094764471054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4401095288340002e-05, "grad_norm": 15.735328674316406, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.855660080909729, "num_tokens": 252353789.0, "step": 6613 }, { "epoch": 0.8413687825976339, "ewc_loss": 0.024402081966400146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4402081180596724e-05, "grad_norm": 15.69508171081543, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8648664951324463, "num_tokens": 252391609.0, "step": 6614 }, { "epoch": 0.8414959928762245, "ewc_loss": 0.02440161630511284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4401615519309416e-05, "grad_norm": 15.728737831115723, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8743428587913513, "num_tokens": 252433514.0, "step": 6615 }, { "epoch": 0.8416232031548149, "ewc_loss": 0.02439524419605732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4395243599428795e-05, "grad_norm": 15.715696334838867, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8653851747512817, "num_tokens": 252469700.0, "step": 6616 }, { "epoch": 0.8417504134334054, "ewc_loss": 0.024377088993787766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4377088266192004e-05, "grad_norm": 15.71324634552002, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8581775426864624, "num_tokens": 252501658.0, "step": 6617 }, { "epoch": 0.8418776237119959, "ewc_loss": 0.02440715953707695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4407159799011424e-05, "grad_norm": 15.826824188232422, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8643540143966675, "num_tokens": 252534546.0, "step": 6618 }, { "epoch": 0.8420048339905865, "ewc_loss": 0.024389687925577164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4389688405790366e-05, "grad_norm": 15.646889686584473, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8693827390670776, "num_tokens": 252573172.0, "step": 6619 }, { "epoch": 0.842132044269177, "ewc_loss": 0.024332687258720398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.433268673485145e-05, "grad_norm": 15.760159492492676, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8706321716308594, "num_tokens": 252607258.0, "step": 6620 }, { "epoch": 0.8422592545477675, "ewc_loss": 0.02440537139773369, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.440537173242774e-05, "grad_norm": 15.701165199279785, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8720622062683105, "num_tokens": 252651451.0, "step": 6621 }, { "epoch": 0.8423864648263579, "ewc_loss": 0.02431202121078968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4312021196237765e-05, "grad_norm": 15.649415969848633, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8741835951805115, "num_tokens": 252686348.0, "step": 6622 }, { "epoch": 0.8425136751049485, "ewc_loss": 0.02438586950302124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4385870347032323e-05, "grad_norm": 15.735442161560059, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8564954996109009, "num_tokens": 252724316.0, "step": 6623 }, { "epoch": 0.842640885383539, "ewc_loss": 0.024390805512666702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4390805265284143e-05, "grad_norm": 15.749085426330566, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8509694933891296, "num_tokens": 252764612.0, "step": 6624 }, { "epoch": 0.8427680956621295, "ewc_loss": 0.024402912706136703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4402912458754145e-05, "grad_norm": 15.729923248291016, "learning_rate": 1e-06, "loss": 0.4942, "mean_token_accuracy": 0.8405114412307739, "num_tokens": 252809510.0, "step": 6625 }, { "epoch": 0.84289530594072, "ewc_loss": 0.024373749271035194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4373748601647094e-05, "grad_norm": 15.677661895751953, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8596934080123901, "num_tokens": 252847598.0, "step": 6626 }, { "epoch": 0.8430225162193106, "ewc_loss": 0.024345384910702705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4345385099877603e-05, "grad_norm": 15.732417106628418, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8472866415977478, "num_tokens": 252884401.0, "step": 6627 }, { "epoch": 0.843149726497901, "ewc_loss": 0.02438851073384285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4388511519646272e-05, "grad_norm": 15.667366027832031, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8466030955314636, "num_tokens": 252928348.0, "step": 6628 }, { "epoch": 0.8432769367764915, "ewc_loss": 0.02438890002667904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.438890078337863e-05, "grad_norm": 15.835864067077637, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8622443675994873, "num_tokens": 252965084.0, "step": 6629 }, { "epoch": 0.843404147055082, "ewc_loss": 0.024384714663028717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4384715288761072e-05, "grad_norm": 15.646358489990234, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8663910627365112, "num_tokens": 253003697.0, "step": 6630 }, { "epoch": 0.8435313573336726, "ewc_loss": 0.024326294660568237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.432629480608739e-05, "grad_norm": 15.749808311462402, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8478603363037109, "num_tokens": 253037239.0, "step": 6631 }, { "epoch": 0.8436585676122631, "ewc_loss": 0.024416398257017136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4416398446192034e-05, "grad_norm": 15.698690414428711, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8533631563186646, "num_tokens": 253078163.0, "step": 6632 }, { "epoch": 0.8437857778908536, "ewc_loss": 0.024365326389670372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4365326680708677e-05, "grad_norm": 15.682594299316406, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8598901033401489, "num_tokens": 253118907.0, "step": 6633 }, { "epoch": 0.843912988169444, "ewc_loss": 0.024421412497758865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.442141339997761e-05, "grad_norm": 15.808523178100586, "learning_rate": 1e-06, "loss": 0.3619, "mean_token_accuracy": 0.8804271221160889, "num_tokens": 253153132.0, "step": 6634 }, { "epoch": 0.8440401984480346, "ewc_loss": 0.024355832487344742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4355831556022167e-05, "grad_norm": 15.686725616455078, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.857820451259613, "num_tokens": 253194446.0, "step": 6635 }, { "epoch": 0.8441674087266251, "ewc_loss": 0.024344924837350845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4344924895558506e-05, "grad_norm": 15.750243186950684, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8640338182449341, "num_tokens": 253230798.0, "step": 6636 }, { "epoch": 0.8442946190052156, "ewc_loss": 0.024404464289546013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.440446405671537e-05, "grad_norm": 15.709566116333008, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8546954393386841, "num_tokens": 253269029.0, "step": 6637 }, { "epoch": 0.8444218292838062, "ewc_loss": 0.02435595914721489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4355958885280415e-05, "grad_norm": 15.72515869140625, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8565621376037598, "num_tokens": 253310656.0, "step": 6638 }, { "epoch": 0.8445490395623967, "ewc_loss": 0.024387599900364876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4387600205955096e-05, "grad_norm": 15.740532875061035, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8795948624610901, "num_tokens": 253346882.0, "step": 6639 }, { "epoch": 0.8446762498409871, "ewc_loss": 0.02437010407447815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4370103346882388e-05, "grad_norm": 15.70128059387207, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8516569137573242, "num_tokens": 253389601.0, "step": 6640 }, { "epoch": 0.8448034601195776, "ewc_loss": 0.024350032210350037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4350032617803663e-05, "grad_norm": 15.724159240722656, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8603909015655518, "num_tokens": 253428725.0, "step": 6641 }, { "epoch": 0.8449306703981682, "ewc_loss": 0.024346251040697098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.434625093883369e-05, "grad_norm": 15.687307357788086, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8761478662490845, "num_tokens": 253461404.0, "step": 6642 }, { "epoch": 0.8450578806767587, "ewc_loss": 0.02434016391634941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4340164600289427e-05, "grad_norm": 15.738953590393066, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8665837049484253, "num_tokens": 253500324.0, "step": 6643 }, { "epoch": 0.8451850909553492, "ewc_loss": 0.024378005415201187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.437800503685139e-05, "grad_norm": 15.723729133605957, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8489599823951721, "num_tokens": 253542500.0, "step": 6644 }, { "epoch": 0.8453123012339397, "ewc_loss": 0.024318398907780647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4318398573086597e-05, "grad_norm": 15.720333099365234, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8437631726264954, "num_tokens": 253581887.0, "step": 6645 }, { "epoch": 0.8454395115125302, "ewc_loss": 0.02433651313185692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.433651388855651e-05, "grad_norm": 15.700435638427734, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8702629804611206, "num_tokens": 253616942.0, "step": 6646 }, { "epoch": 0.8455667217911207, "ewc_loss": 0.024349024519324303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.43490248976741e-05, "grad_norm": 15.778329849243164, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.855278730392456, "num_tokens": 253652610.0, "step": 6647 }, { "epoch": 0.8456939320697112, "ewc_loss": 0.024394964799284935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4394965294050053e-05, "grad_norm": 15.79944133758545, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.858780026435852, "num_tokens": 253686269.0, "step": 6648 }, { "epoch": 0.8458211423483017, "ewc_loss": 0.024322977289557457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4322976969415322e-05, "grad_norm": 15.707696914672852, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8566951751708984, "num_tokens": 253723632.0, "step": 6649 }, { "epoch": 0.8459483526268923, "ewc_loss": 0.02436557598412037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4365575882256962e-05, "grad_norm": 15.799795150756836, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.855148434638977, "num_tokens": 253755990.0, "step": 6650 }, { "epoch": 0.8460755629054828, "ewc_loss": 0.02438805066049099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4388051315327175e-05, "grad_norm": 15.68960952758789, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8564913272857666, "num_tokens": 253794200.0, "step": 6651 }, { "epoch": 0.8462027731840732, "ewc_loss": 0.024314315989613533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.431431676086504e-05, "grad_norm": 15.639962196350098, "learning_rate": 1e-06, "loss": 0.3995, "mean_token_accuracy": 0.8725724816322327, "num_tokens": 253840554.0, "step": 6652 }, { "epoch": 0.8463299834626637, "ewc_loss": 0.0243954099714756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4395409127464518e-05, "grad_norm": 15.705144882202148, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8704376816749573, "num_tokens": 253873642.0, "step": 6653 }, { "epoch": 0.8464571937412543, "ewc_loss": 0.024445876479148865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4445876988465898e-05, "grad_norm": 15.794853210449219, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8668023347854614, "num_tokens": 253911839.0, "step": 6654 }, { "epoch": 0.8465844040198448, "ewc_loss": 0.024358177557587624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.435817805235274e-05, "grad_norm": 15.652823448181152, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.851648211479187, "num_tokens": 253944649.0, "step": 6655 }, { "epoch": 0.8467116142984353, "ewc_loss": 0.02446950413286686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4469503841828555e-05, "grad_norm": 29.905406951904297, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8615713119506836, "num_tokens": 253977739.0, "step": 6656 }, { "epoch": 0.8468388245770259, "ewc_loss": 0.033208537846803665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3208536478923634e-05, "grad_norm": 18.431167602539062, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8468390703201294, "num_tokens": 254021216.0, "step": 6657 }, { "epoch": 0.8469660348556163, "ewc_loss": 0.023500533774495125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.3500533643527888e-05, "grad_norm": 13.890869140625, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8592415452003479, "num_tokens": 254063209.0, "step": 6658 }, { "epoch": 0.8470932451342068, "ewc_loss": 0.0281196441501379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8119644412072375e-05, "grad_norm": 17.10784149169922, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.857682466506958, "num_tokens": 254102325.0, "step": 6659 }, { "epoch": 0.8472204554127973, "ewc_loss": 0.02936766855418682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.936766941274982e-05, "grad_norm": 16.182592391967773, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8662887811660767, "num_tokens": 254136991.0, "step": 6660 }, { "epoch": 0.8473476656913879, "ewc_loss": 0.027119962498545647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.711996239668224e-05, "grad_norm": 15.92013931274414, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.860944390296936, "num_tokens": 254179520.0, "step": 6661 }, { "epoch": 0.8474748759699784, "ewc_loss": 0.028442397713661194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8442398615879938e-05, "grad_norm": 16.04796600341797, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8613249063491821, "num_tokens": 254220661.0, "step": 6662 }, { "epoch": 0.8476020862485689, "ewc_loss": 0.02792644128203392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7926440452574752e-05, "grad_norm": 16.036083221435547, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.868980884552002, "num_tokens": 254261853.0, "step": 6663 }, { "epoch": 0.8477292965271594, "ewc_loss": 0.028203431516885757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8203430701978505e-05, "grad_norm": 16.053739547729492, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8619210720062256, "num_tokens": 254301577.0, "step": 6664 }, { "epoch": 0.8478565068057499, "ewc_loss": 0.028053194284439087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8053194910171442e-05, "grad_norm": 16.008087158203125, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.874521017074585, "num_tokens": 254341792.0, "step": 6665 }, { "epoch": 0.8479837170843404, "ewc_loss": 0.027855033054947853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7855032385559753e-05, "grad_norm": 16.053617477416992, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8499340415000916, "num_tokens": 254376755.0, "step": 6666 }, { "epoch": 0.8481109273629309, "ewc_loss": 0.02765757404267788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7657573809847236e-05, "grad_norm": 15.883607864379883, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8731080889701843, "num_tokens": 254413565.0, "step": 6667 }, { "epoch": 0.8482381376415215, "ewc_loss": 0.027507511898875237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7507512641022913e-05, "grad_norm": 15.99159049987793, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8508117198944092, "num_tokens": 254449889.0, "step": 6668 }, { "epoch": 0.848365347920112, "ewc_loss": 0.027406994253396988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7406993467593566e-05, "grad_norm": 15.973461151123047, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8751872777938843, "num_tokens": 254488129.0, "step": 6669 }, { "epoch": 0.8484925581987025, "ewc_loss": 0.027105655521154404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7105656045023352e-05, "grad_norm": 15.87360668182373, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8725563883781433, "num_tokens": 254522031.0, "step": 6670 }, { "epoch": 0.8486197684772929, "ewc_loss": 0.0269490797072649, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6949079256155528e-05, "grad_norm": 15.97034740447998, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8729955554008484, "num_tokens": 254559249.0, "step": 6671 }, { "epoch": 0.8487469787558835, "ewc_loss": 0.026801131665706635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6801131753018126e-05, "grad_norm": 15.916319847106934, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8551062941551208, "num_tokens": 254603968.0, "step": 6672 }, { "epoch": 0.848874189034474, "ewc_loss": 0.026509743183851242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6509742383495905e-05, "grad_norm": 16.003463745117188, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8714679479598999, "num_tokens": 254642178.0, "step": 6673 }, { "epoch": 0.8490013993130645, "ewc_loss": 0.026393990963697433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6393990992801264e-05, "grad_norm": 15.853569984436035, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8538556694984436, "num_tokens": 254679129.0, "step": 6674 }, { "epoch": 0.849128609591655, "ewc_loss": 0.02616116963326931, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6161169444094412e-05, "grad_norm": 15.972518920898438, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8324389457702637, "num_tokens": 254714952.0, "step": 6675 }, { "epoch": 0.8492558198702456, "ewc_loss": 0.026135018095374107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6135017833439633e-05, "grad_norm": 15.9135103225708, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8570138216018677, "num_tokens": 254743323.0, "step": 6676 }, { "epoch": 0.849383030148836, "ewc_loss": 0.025874929502606392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.587492963357363e-05, "grad_norm": 15.828569412231445, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8679134845733643, "num_tokens": 254784749.0, "step": 6677 }, { "epoch": 0.8495102404274265, "ewc_loss": 0.025795994326472282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5795994588406757e-05, "grad_norm": 15.885662078857422, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.868074893951416, "num_tokens": 254820102.0, "step": 6678 }, { "epoch": 0.849637450706017, "ewc_loss": 0.025698598474264145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5698598619783297e-05, "grad_norm": 15.88100814819336, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8575912117958069, "num_tokens": 254859988.0, "step": 6679 }, { "epoch": 0.8497646609846076, "ewc_loss": 0.025514420121908188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.551442048570607e-05, "grad_norm": 15.758578300476074, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8576231598854065, "num_tokens": 254906792.0, "step": 6680 }, { "epoch": 0.8498918712631981, "ewc_loss": 0.02544001303613186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5440012905164622e-05, "grad_norm": 15.945328712463379, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8638504147529602, "num_tokens": 254941552.0, "step": 6681 }, { "epoch": 0.8500190815417886, "ewc_loss": 0.02539367601275444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5393675969098695e-05, "grad_norm": 15.857179641723633, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8541584014892578, "num_tokens": 254974653.0, "step": 6682 }, { "epoch": 0.850146291820379, "ewc_loss": 0.02525184489786625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5251845727325417e-05, "grad_norm": 15.923171043395996, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.845775842666626, "num_tokens": 255018418.0, "step": 6683 }, { "epoch": 0.8502735020989696, "ewc_loss": 0.025167429819703102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.516743006708566e-05, "grad_norm": 15.893173217773438, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8659571409225464, "num_tokens": 255049626.0, "step": 6684 }, { "epoch": 0.8504007123775601, "ewc_loss": 0.025042863562703133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5042863853741437e-05, "grad_norm": 15.838305473327637, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8544951677322388, "num_tokens": 255086323.0, "step": 6685 }, { "epoch": 0.8505279226561506, "ewc_loss": 0.02500014565885067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5000144887599163e-05, "grad_norm": 15.878080368041992, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8701798915863037, "num_tokens": 255124102.0, "step": 6686 }, { "epoch": 0.8506551329347412, "ewc_loss": 0.0249248705804348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4924871468101628e-05, "grad_norm": 15.80217170715332, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8651385307312012, "num_tokens": 255162176.0, "step": 6687 }, { "epoch": 0.8507823432133317, "ewc_loss": 0.024811329320073128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4811330149532296e-05, "grad_norm": 15.868385314941406, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.86419677734375, "num_tokens": 255196256.0, "step": 6688 }, { "epoch": 0.8509095534919221, "ewc_loss": 0.024843381717801094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4843382561812177e-05, "grad_norm": 15.802502632141113, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8542096018791199, "num_tokens": 255236752.0, "step": 6689 }, { "epoch": 0.8510367637705126, "ewc_loss": 0.024696430191397667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4696430045878515e-05, "grad_norm": 15.7767333984375, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.862851619720459, "num_tokens": 255268694.0, "step": 6690 }, { "epoch": 0.8511639740491032, "ewc_loss": 0.024775853380560875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4775852580205537e-05, "grad_norm": 15.905845642089844, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.867231011390686, "num_tokens": 255309445.0, "step": 6691 }, { "epoch": 0.8512911843276937, "ewc_loss": 0.024686187505722046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4686187316547148e-05, "grad_norm": 15.817475318908691, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8643401861190796, "num_tokens": 255341126.0, "step": 6692 }, { "epoch": 0.8514183946062842, "ewc_loss": 0.024601567536592484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4601567929494195e-05, "grad_norm": 15.758523941040039, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8708899021148682, "num_tokens": 255379553.0, "step": 6693 }, { "epoch": 0.8515456048848747, "ewc_loss": 0.024622876197099686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.462287557136733e-05, "grad_norm": 15.789365768432617, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8622221946716309, "num_tokens": 255416588.0, "step": 6694 }, { "epoch": 0.8516728151634652, "ewc_loss": 0.024645566940307617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4645567464176565e-05, "grad_norm": 15.814197540283203, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8502388000488281, "num_tokens": 255449574.0, "step": 6695 }, { "epoch": 0.8518000254420557, "ewc_loss": 0.024631155654788017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.463115561113227e-05, "grad_norm": 15.856157302856445, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8599421381950378, "num_tokens": 255494088.0, "step": 6696 }, { "epoch": 0.8519272357206462, "ewc_loss": 0.02458423562347889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4584234779467806e-05, "grad_norm": 15.78787612915039, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8704169988632202, "num_tokens": 255529785.0, "step": 6697 }, { "epoch": 0.8520544459992367, "ewc_loss": 0.02455691248178482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4556911739637144e-05, "grad_norm": 15.81596851348877, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.84858238697052, "num_tokens": 255566675.0, "step": 6698 }, { "epoch": 0.8521816562778273, "ewc_loss": 0.024570876732468605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4570876121288165e-05, "grad_norm": 15.826674461364746, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8656365275382996, "num_tokens": 255606502.0, "step": 6699 }, { "epoch": 0.8523088665564178, "ewc_loss": 0.02452917769551277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.452917760820128e-05, "grad_norm": 15.867132186889648, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8530222773551941, "num_tokens": 255649179.0, "step": 6700 }, { "epoch": 0.8524360768350082, "ewc_loss": 0.024532921612262726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4532921088393778e-05, "grad_norm": 15.840963363647461, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8607791662216187, "num_tokens": 255688190.0, "step": 6701 }, { "epoch": 0.8525632871135987, "ewc_loss": 0.024495253339409828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.449525345582515e-05, "grad_norm": 15.870124816894531, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8599971532821655, "num_tokens": 255730401.0, "step": 6702 }, { "epoch": 0.8526904973921893, "ewc_loss": 0.024472694844007492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4472694349242374e-05, "grad_norm": 15.739413261413574, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8567478656768799, "num_tokens": 255767881.0, "step": 6703 }, { "epoch": 0.8528177076707798, "ewc_loss": 0.024444878101348877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.444487836328335e-05, "grad_norm": 15.830394744873047, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.854712963104248, "num_tokens": 255808347.0, "step": 6704 }, { "epoch": 0.8529449179493703, "ewc_loss": 0.024493113160133362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4493112505297177e-05, "grad_norm": 15.79541301727295, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8422238826751709, "num_tokens": 255849538.0, "step": 6705 }, { "epoch": 0.8530721282279609, "ewc_loss": 0.024433646351099014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4433646103716455e-05, "grad_norm": 15.892948150634766, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8719040155410767, "num_tokens": 255890228.0, "step": 6706 }, { "epoch": 0.8531993385065513, "ewc_loss": 0.02446635067462921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.446635153319221e-05, "grad_norm": 15.794699668884277, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8604305982589722, "num_tokens": 255930189.0, "step": 6707 }, { "epoch": 0.8533265487851418, "ewc_loss": 0.02438107505440712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4381075490964577e-05, "grad_norm": 15.781140327453613, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8682558536529541, "num_tokens": 255968310.0, "step": 6708 }, { "epoch": 0.8534537590637323, "ewc_loss": 0.024451930075883865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.44519305852009e-05, "grad_norm": 15.820159912109375, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8723005056381226, "num_tokens": 256005872.0, "step": 6709 }, { "epoch": 0.8535809693423229, "ewc_loss": 0.02443508803844452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4435088562313467e-05, "grad_norm": 15.817984580993652, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8665022850036621, "num_tokens": 256045374.0, "step": 6710 }, { "epoch": 0.8537081796209134, "ewc_loss": 0.02440297044813633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4402970666415058e-05, "grad_norm": 15.816787719726562, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8645634055137634, "num_tokens": 256084874.0, "step": 6711 }, { "epoch": 0.8538353898995039, "ewc_loss": 0.024380872026085854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.438087176415138e-05, "grad_norm": 15.794588088989258, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8548349142074585, "num_tokens": 256122251.0, "step": 6712 }, { "epoch": 0.8539626001780944, "ewc_loss": 0.02442587912082672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4425879018963315e-05, "grad_norm": 15.834763526916504, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8595885038375854, "num_tokens": 256165025.0, "step": 6713 }, { "epoch": 0.8540898104566849, "ewc_loss": 0.02436605468392372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4366054276470095e-05, "grad_norm": 15.808921813964844, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8678047060966492, "num_tokens": 256202220.0, "step": 6714 }, { "epoch": 0.8542170207352754, "ewc_loss": 0.02439088001847267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.439087984384969e-05, "grad_norm": 15.851348876953125, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8644394874572754, "num_tokens": 256234036.0, "step": 6715 }, { "epoch": 0.8543442310138659, "ewc_loss": 0.024333573877811432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.433357440168038e-05, "grad_norm": 15.814583778381348, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8598705530166626, "num_tokens": 256275253.0, "step": 6716 }, { "epoch": 0.8544714412924564, "ewc_loss": 0.024319037795066833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.431903703836724e-05, "grad_norm": 15.762614250183105, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8582244515419006, "num_tokens": 256311296.0, "step": 6717 }, { "epoch": 0.854598651571047, "ewc_loss": 0.024392951279878616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4392951672780327e-05, "grad_norm": 15.870807647705078, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8677594661712646, "num_tokens": 256348149.0, "step": 6718 }, { "epoch": 0.8547258618496375, "ewc_loss": 0.024324441328644753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4324441255885176e-05, "grad_norm": 15.786946296691895, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8758241534233093, "num_tokens": 256387861.0, "step": 6719 }, { "epoch": 0.8548530721282279, "ewc_loss": 0.02432328648865223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4323286197613925e-05, "grad_norm": 15.738133430480957, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8709816932678223, "num_tokens": 256427131.0, "step": 6720 }, { "epoch": 0.8549802824068184, "ewc_loss": 0.024351976811885834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4351977117476054e-05, "grad_norm": 15.849318504333496, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8667984008789062, "num_tokens": 256465041.0, "step": 6721 }, { "epoch": 0.855107492685409, "ewc_loss": 0.024329068139195442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4329068764927797e-05, "grad_norm": 15.79129409790039, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8527373671531677, "num_tokens": 256502576.0, "step": 6722 }, { "epoch": 0.8552347029639995, "ewc_loss": 0.024285251274704933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.428525112918578e-05, "grad_norm": 15.792231559753418, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.864777147769928, "num_tokens": 256540493.0, "step": 6723 }, { "epoch": 0.85536191324259, "ewc_loss": 0.02436201646924019, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4362016119994223e-05, "grad_norm": 15.818024635314941, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8448557257652283, "num_tokens": 256578115.0, "step": 6724 }, { "epoch": 0.8554891235211806, "ewc_loss": 0.024333367124199867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4333367036888376e-05, "grad_norm": 15.793435096740723, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8691514134407043, "num_tokens": 256615957.0, "step": 6725 }, { "epoch": 0.855616333799771, "ewc_loss": 0.02437189593911171, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.437189505144488e-05, "grad_norm": 15.913357734680176, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.865364134311676, "num_tokens": 256649972.0, "step": 6726 }, { "epoch": 0.8557435440783615, "ewc_loss": 0.024344170466065407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4344170014956035e-05, "grad_norm": 15.730330467224121, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8689568042755127, "num_tokens": 256686683.0, "step": 6727 }, { "epoch": 0.855870754356952, "ewc_loss": 0.02428782917559147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4287828637170605e-05, "grad_norm": 15.857542991638184, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8608602285385132, "num_tokens": 256723587.0, "step": 6728 }, { "epoch": 0.8559979646355426, "ewc_loss": 0.024430299177765846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.443029916321393e-05, "grad_norm": 15.863587379455566, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8629047870635986, "num_tokens": 256765245.0, "step": 6729 }, { "epoch": 0.8561251749141331, "ewc_loss": 0.024295546114444733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.429554660920985e-05, "grad_norm": 15.717207908630371, "learning_rate": 1e-06, "loss": 0.3374, "mean_token_accuracy": 0.8894174098968506, "num_tokens": 256806930.0, "step": 6730 }, { "epoch": 0.8562523851927236, "ewc_loss": 0.024356013163924217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4356013454962522e-05, "grad_norm": 15.84527587890625, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8595262765884399, "num_tokens": 256844568.0, "step": 6731 }, { "epoch": 0.856379595471314, "ewc_loss": 0.024346565827727318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4346565624000505e-05, "grad_norm": 15.78368854522705, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8610665798187256, "num_tokens": 256877483.0, "step": 6732 }, { "epoch": 0.8565068057499046, "ewc_loss": 0.024271007627248764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4271008442156017e-05, "grad_norm": 15.717570304870605, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8664446473121643, "num_tokens": 256921139.0, "step": 6733 }, { "epoch": 0.8566340160284951, "ewc_loss": 0.02437947876751423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4379478418268263e-05, "grad_norm": 15.854242324829102, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8625377416610718, "num_tokens": 256957861.0, "step": 6734 }, { "epoch": 0.8567612263070856, "ewc_loss": 0.024369094520807266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.436909380776342e-05, "grad_norm": 15.720239639282227, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8640087842941284, "num_tokens": 256998582.0, "step": 6735 }, { "epoch": 0.8568884365856761, "ewc_loss": 0.02438119798898697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4381197363254614e-05, "grad_norm": 15.902538299560547, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8617309331893921, "num_tokens": 257037638.0, "step": 6736 }, { "epoch": 0.8570156468642667, "ewc_loss": 0.024406198412179947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.440619755361695e-05, "grad_norm": 15.738551139831543, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8639768362045288, "num_tokens": 257074427.0, "step": 6737 }, { "epoch": 0.8571428571428571, "ewc_loss": 0.024332314729690552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4332315661013126e-05, "grad_norm": 15.820884704589844, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8657599687576294, "num_tokens": 257113826.0, "step": 6738 }, { "epoch": 0.8572700674214476, "ewc_loss": 0.024432305246591568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4432305508526042e-05, "grad_norm": 15.887120246887207, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8496644496917725, "num_tokens": 257152387.0, "step": 6739 }, { "epoch": 0.8573972777000382, "ewc_loss": 0.024363083764910698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4363083866774105e-05, "grad_norm": 15.799945831298828, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8637358546257019, "num_tokens": 257192083.0, "step": 6740 }, { "epoch": 0.8575244879786287, "ewc_loss": 0.024323655292391777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4323655452462845e-05, "grad_norm": 15.827079772949219, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.854489266872406, "num_tokens": 257235425.0, "step": 6741 }, { "epoch": 0.8576516982572192, "ewc_loss": 0.024359798058867455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.43597987719113e-05, "grad_norm": 15.741992950439453, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.859190821647644, "num_tokens": 257268621.0, "step": 6742 }, { "epoch": 0.8577789085358097, "ewc_loss": 0.02434656023979187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4346560167032294e-05, "grad_norm": 15.73646068572998, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8661828637123108, "num_tokens": 257306909.0, "step": 6743 }, { "epoch": 0.8579061188144002, "ewc_loss": 0.024330908432602882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4330907763214782e-05, "grad_norm": 15.778017044067383, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8761524558067322, "num_tokens": 257343388.0, "step": 6744 }, { "epoch": 0.8580333290929907, "ewc_loss": 0.024398198351264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4398197638220154e-05, "grad_norm": 15.75442886352539, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8672769069671631, "num_tokens": 257377172.0, "step": 6745 }, { "epoch": 0.8581605393715812, "ewc_loss": 0.024407191202044487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4407190721831284e-05, "grad_norm": 15.844927787780762, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8552006483078003, "num_tokens": 257410312.0, "step": 6746 }, { "epoch": 0.8582877496501717, "ewc_loss": 0.024393461644649506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.439346098981332e-05, "grad_norm": 15.739043235778809, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8653400540351868, "num_tokens": 257449919.0, "step": 6747 }, { "epoch": 0.8584149599287623, "ewc_loss": 0.024394620209932327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.439461968606338e-05, "grad_norm": 15.780802726745605, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8735384941101074, "num_tokens": 257488296.0, "step": 6748 }, { "epoch": 0.8585421702073528, "ewc_loss": 0.024465644732117653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.446564394631423e-05, "grad_norm": 15.832661628723145, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.870753288269043, "num_tokens": 257524678.0, "step": 6749 }, { "epoch": 0.8586693804859432, "ewc_loss": 0.024425826966762543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4425826268270612e-05, "grad_norm": 15.787259101867676, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.867238461971283, "num_tokens": 257559060.0, "step": 6750 }, { "epoch": 0.8587965907645337, "ewc_loss": 0.024431979283690453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4431979909422807e-05, "grad_norm": 15.812403678894043, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.87162184715271, "num_tokens": 257594340.0, "step": 6751 }, { "epoch": 0.8589238010431243, "ewc_loss": 0.024420568719506264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4420569388894364e-05, "grad_norm": 15.7696533203125, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8741637468338013, "num_tokens": 257627936.0, "step": 6752 }, { "epoch": 0.8590510113217148, "ewc_loss": 0.02445697970688343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.445698009978514e-05, "grad_norm": 15.825868606567383, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8603140711784363, "num_tokens": 257660311.0, "step": 6753 }, { "epoch": 0.8591782216003053, "ewc_loss": 0.02448154054582119, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4481540094711818e-05, "grad_norm": 15.71202564239502, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.854303777217865, "num_tokens": 257697496.0, "step": 6754 }, { "epoch": 0.8593054318788959, "ewc_loss": 0.024405740201473236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.440574098727666e-05, "grad_norm": 15.758459091186523, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8454651832580566, "num_tokens": 257737242.0, "step": 6755 }, { "epoch": 0.8594326421574863, "ewc_loss": 0.024531658738851547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4531658709747717e-05, "grad_norm": 15.84179973602295, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8635833263397217, "num_tokens": 257772946.0, "step": 6756 }, { "epoch": 0.8595598524360768, "ewc_loss": 0.02450813353061676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4508133719791658e-05, "grad_norm": 15.852952003479004, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8570075035095215, "num_tokens": 257807198.0, "step": 6757 }, { "epoch": 0.8596870627146673, "ewc_loss": 0.024470254778862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.447025508445222e-05, "grad_norm": 15.772370338439941, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8685393333435059, "num_tokens": 257848618.0, "step": 6758 }, { "epoch": 0.8598142729932579, "ewc_loss": 0.024506311863660812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4506311092409305e-05, "grad_norm": 15.826637268066406, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8746474385261536, "num_tokens": 257887805.0, "step": 6759 }, { "epoch": 0.8599414832718484, "ewc_loss": 0.024463757872581482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4463757654302754e-05, "grad_norm": 15.775903701782227, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8689686059951782, "num_tokens": 257929276.0, "step": 6760 }, { "epoch": 0.8600686935504389, "ewc_loss": 0.02447819896042347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4478198611177504e-05, "grad_norm": 15.872642517089844, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8578020334243774, "num_tokens": 257958629.0, "step": 6761 }, { "epoch": 0.8601959038290294, "ewc_loss": 0.024516357108950615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.451635737088509e-05, "grad_norm": 15.748457908630371, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.859752357006073, "num_tokens": 257990303.0, "step": 6762 }, { "epoch": 0.8603231141076199, "ewc_loss": 0.02443639561533928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4436396415694617e-05, "grad_norm": 15.737451553344727, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8461151123046875, "num_tokens": 258028704.0, "step": 6763 }, { "epoch": 0.8604503243862104, "ewc_loss": 0.024595575407147408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4595576178398915e-05, "grad_norm": 15.81333065032959, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8706868886947632, "num_tokens": 258062757.0, "step": 6764 }, { "epoch": 0.8605775346648009, "ewc_loss": 0.024549541994929314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4549541194573976e-05, "grad_norm": 15.816323280334473, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8623860478401184, "num_tokens": 258098574.0, "step": 6765 }, { "epoch": 0.8607047449433914, "ewc_loss": 0.024575024843215942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4575025236117654e-05, "grad_norm": 15.772598266601562, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8719852566719055, "num_tokens": 258130937.0, "step": 6766 }, { "epoch": 0.860831955221982, "ewc_loss": 0.024506790563464165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.450679130561184e-05, "grad_norm": 15.753922462463379, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8652041554450989, "num_tokens": 258165142.0, "step": 6767 }, { "epoch": 0.8609591655005725, "ewc_loss": 0.024584265425801277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4584265702287666e-05, "grad_norm": 15.808905601501465, "learning_rate": 1e-06, "loss": 0.3724, "mean_token_accuracy": 0.8798651695251465, "num_tokens": 258203986.0, "step": 6768 }, { "epoch": 0.8610863757791629, "ewc_loss": 0.024526966735720634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4526967536075972e-05, "grad_norm": 15.805047988891602, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8632469773292542, "num_tokens": 258238692.0, "step": 6769 }, { "epoch": 0.8612135860577534, "ewc_loss": 0.02455475926399231, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.455475987517275e-05, "grad_norm": 15.831838607788086, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8437318801879883, "num_tokens": 258280942.0, "step": 6770 }, { "epoch": 0.861340796336344, "ewc_loss": 0.02454555407166481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4545553969801404e-05, "grad_norm": 15.843563079833984, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8662434816360474, "num_tokens": 258318835.0, "step": 6771 }, { "epoch": 0.8614680066149345, "ewc_loss": 0.024579809978604317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.457980917824898e-05, "grad_norm": 15.815690040588379, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8542352914810181, "num_tokens": 258359461.0, "step": 6772 }, { "epoch": 0.861595216893525, "ewc_loss": 0.024520300328731537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4520300939911976e-05, "grad_norm": 15.825800895690918, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8584029078483582, "num_tokens": 258394582.0, "step": 6773 }, { "epoch": 0.8617224271721156, "ewc_loss": 0.02453056536614895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4530565497116186e-05, "grad_norm": 15.722725868225098, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8663538098335266, "num_tokens": 258438526.0, "step": 6774 }, { "epoch": 0.861849637450706, "ewc_loss": 0.024512939155101776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4512939489795826e-05, "grad_norm": 15.809971809387207, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8499612808227539, "num_tokens": 258479332.0, "step": 6775 }, { "epoch": 0.8619768477292965, "ewc_loss": 0.024543579667806625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4543580366298556e-05, "grad_norm": 15.863113403320312, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8533136248588562, "num_tokens": 258513202.0, "step": 6776 }, { "epoch": 0.862104058007887, "ewc_loss": 0.024510582908988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.451058207952883e-05, "grad_norm": 15.80993366241455, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8648492097854614, "num_tokens": 258549145.0, "step": 6777 }, { "epoch": 0.8622312682864776, "ewc_loss": 0.02453370951116085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4533708710805513e-05, "grad_norm": 15.821635246276855, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8545976877212524, "num_tokens": 258586421.0, "step": 6778 }, { "epoch": 0.8623584785650681, "ewc_loss": 0.02450462244451046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4504623070242815e-05, "grad_norm": 15.827644348144531, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8631541728973389, "num_tokens": 258620014.0, "step": 6779 }, { "epoch": 0.8624856888436586, "ewc_loss": 0.0245274119079113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4527411369490437e-05, "grad_norm": 15.878863334655762, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8682084679603577, "num_tokens": 258656124.0, "step": 6780 }, { "epoch": 0.862612899122249, "ewc_loss": 0.024507636204361916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4507635316695087e-05, "grad_norm": 15.824418067932129, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8523278832435608, "num_tokens": 258692537.0, "step": 6781 }, { "epoch": 0.8627401094008396, "ewc_loss": 0.024507688358426094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.450768806738779e-05, "grad_norm": 15.838066101074219, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8659418821334839, "num_tokens": 258732737.0, "step": 6782 }, { "epoch": 0.8628673196794301, "ewc_loss": 0.024505160748958588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4505161491106264e-05, "grad_norm": 15.801539421081543, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.849760115146637, "num_tokens": 258770634.0, "step": 6783 }, { "epoch": 0.8629945299580206, "ewc_loss": 0.024482805281877518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4482806111336686e-05, "grad_norm": 15.824832916259766, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.861735999584198, "num_tokens": 258809643.0, "step": 6784 }, { "epoch": 0.8631217402366111, "ewc_loss": 0.024509372189641, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4509372451575473e-05, "grad_norm": 15.807663917541504, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8681331872940063, "num_tokens": 258845336.0, "step": 6785 }, { "epoch": 0.8632489505152017, "ewc_loss": 0.024481797590851784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.448179839120712e-05, "grad_norm": 15.769054412841797, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.864596426486969, "num_tokens": 258882206.0, "step": 6786 }, { "epoch": 0.8633761607937921, "ewc_loss": 0.024519124999642372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4519125872757286e-05, "grad_norm": 15.869257926940918, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8703320026397705, "num_tokens": 258918661.0, "step": 6787 }, { "epoch": 0.8635033710723826, "ewc_loss": 0.02451922930777073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4519229555153288e-05, "grad_norm": 15.723640441894531, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8579130172729492, "num_tokens": 258957452.0, "step": 6788 }, { "epoch": 0.8636305813509731, "ewc_loss": 0.024523533880710602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4523533284082077e-05, "grad_norm": 15.823431968688965, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8540974855422974, "num_tokens": 259003507.0, "step": 6789 }, { "epoch": 0.8637577916295637, "ewc_loss": 0.024579579010605812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.457957816659473e-05, "grad_norm": 15.823324203491211, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8572107553482056, "num_tokens": 259042284.0, "step": 6790 }, { "epoch": 0.8638850019081542, "ewc_loss": 0.024526657536625862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.452665830787737e-05, "grad_norm": 15.84698486328125, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8763778209686279, "num_tokens": 259082929.0, "step": 6791 }, { "epoch": 0.8640122121867447, "ewc_loss": 0.02449110522866249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4491106159985065e-05, "grad_norm": 15.770772933959961, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.8775935173034668, "num_tokens": 259125920.0, "step": 6792 }, { "epoch": 0.8641394224653351, "ewc_loss": 0.024500438943505287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.450043939461466e-05, "grad_norm": 15.805717468261719, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.863398551940918, "num_tokens": 259163140.0, "step": 6793 }, { "epoch": 0.8642666327439257, "ewc_loss": 0.024529097601771355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4529097572667524e-05, "grad_norm": 15.82033920288086, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.86141037940979, "num_tokens": 259199038.0, "step": 6794 }, { "epoch": 0.8643938430225162, "ewc_loss": 0.024463117122650146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4463117370032705e-05, "grad_norm": 15.734448432922363, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8608360290527344, "num_tokens": 259235713.0, "step": 6795 }, { "epoch": 0.8645210533011067, "ewc_loss": 0.024483097717165947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4483097149641253e-05, "grad_norm": 15.883898735046387, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8523833155632019, "num_tokens": 259278314.0, "step": 6796 }, { "epoch": 0.8646482635796973, "ewc_loss": 0.02451602928340435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.451602995279245e-05, "grad_norm": 15.814674377441406, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8709097504615784, "num_tokens": 259313039.0, "step": 6797 }, { "epoch": 0.8647754738582878, "ewc_loss": 0.024437978863716125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.44379789364757e-05, "grad_norm": 15.786295890808105, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.869269073009491, "num_tokens": 259348845.0, "step": 6798 }, { "epoch": 0.8649026841368782, "ewc_loss": 0.024471914395689964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4471914002788253e-05, "grad_norm": 15.81747817993164, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8569945693016052, "num_tokens": 259387345.0, "step": 6799 }, { "epoch": 0.8650298944154687, "ewc_loss": 0.024494925513863564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4494926037732512e-05, "grad_norm": 15.81467342376709, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8617766499519348, "num_tokens": 259427835.0, "step": 6800 }, { "epoch": 0.8651571046940593, "ewc_loss": 0.02443220280110836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4432203645119444e-05, "grad_norm": 15.721216201782227, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8671376705169678, "num_tokens": 259460453.0, "step": 6801 }, { "epoch": 0.8652843149726498, "ewc_loss": 0.024495383724570274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4495384423062205e-05, "grad_norm": 15.874272346496582, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8689461350440979, "num_tokens": 259499296.0, "step": 6802 }, { "epoch": 0.8654115252512403, "ewc_loss": 0.02452697418630123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4526974812033586e-05, "grad_norm": 15.765153884887695, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8597968816757202, "num_tokens": 259537649.0, "step": 6803 }, { "epoch": 0.8655387355298308, "ewc_loss": 0.02442130073904991, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.442130062263459e-05, "grad_norm": 15.770149230957031, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8550727367401123, "num_tokens": 259583136.0, "step": 6804 }, { "epoch": 0.8656659458084213, "ewc_loss": 0.02456848882138729, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4568489607190713e-05, "grad_norm": 15.850225448608398, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8668866753578186, "num_tokens": 259620422.0, "step": 6805 }, { "epoch": 0.8657931560870118, "ewc_loss": 0.02445981279015541, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4459812266286463e-05, "grad_norm": 15.815433502197266, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8506589531898499, "num_tokens": 259663948.0, "step": 6806 }, { "epoch": 0.8659203663656023, "ewc_loss": 0.024494027718901634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.449402745696716e-05, "grad_norm": 15.771955490112305, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8637324571609497, "num_tokens": 259701986.0, "step": 6807 }, { "epoch": 0.8660475766441929, "ewc_loss": 0.02450546808540821, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4505468900315464e-05, "grad_norm": 15.818098068237305, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.8774312734603882, "num_tokens": 259740684.0, "step": 6808 }, { "epoch": 0.8661747869227834, "ewc_loss": 0.02452656254172325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.452656190143898e-05, "grad_norm": 15.861096382141113, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8463583588600159, "num_tokens": 259781659.0, "step": 6809 }, { "epoch": 0.8663019972013739, "ewc_loss": 0.024515146389603615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4515145923942327e-05, "grad_norm": 15.852164268493652, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.856433629989624, "num_tokens": 259818554.0, "step": 6810 }, { "epoch": 0.8664292074799644, "ewc_loss": 0.024474935606122017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4474935344187543e-05, "grad_norm": 15.782073974609375, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.865308403968811, "num_tokens": 259856652.0, "step": 6811 }, { "epoch": 0.8665564177585549, "ewc_loss": 0.024518191814422607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4518192731193267e-05, "grad_norm": 15.838754653930664, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8804655075073242, "num_tokens": 259898255.0, "step": 6812 }, { "epoch": 0.8666836280371454, "ewc_loss": 0.024527257308363914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4527256755391136e-05, "grad_norm": 15.859847068786621, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8670929670333862, "num_tokens": 259935305.0, "step": 6813 }, { "epoch": 0.8668108383157359, "ewc_loss": 0.024457398802042007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4457398467347957e-05, "grad_norm": 15.81531047821045, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8665982484817505, "num_tokens": 259975245.0, "step": 6814 }, { "epoch": 0.8669380485943264, "ewc_loss": 0.024520255625247955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4520255465176888e-05, "grad_norm": 15.865059852600098, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8674421310424805, "num_tokens": 260014150.0, "step": 6815 }, { "epoch": 0.867065258872917, "ewc_loss": 0.024464301764965057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4464301532134414e-05, "grad_norm": 15.8447265625, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8532434105873108, "num_tokens": 260054655.0, "step": 6816 }, { "epoch": 0.8671924691515075, "ewc_loss": 0.02442372776567936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.442372715449892e-05, "grad_norm": 15.769086837768555, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8625800013542175, "num_tokens": 260091528.0, "step": 6817 }, { "epoch": 0.8673196794300979, "ewc_loss": 0.024467604234814644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4467604816891253e-05, "grad_norm": 15.796359062194824, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.8777525424957275, "num_tokens": 260123069.0, "step": 6818 }, { "epoch": 0.8674468897086884, "ewc_loss": 0.024474425241351128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.447442602715455e-05, "grad_norm": 15.793631553649902, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8474531173706055, "num_tokens": 260154482.0, "step": 6819 }, { "epoch": 0.867574099987279, "ewc_loss": 0.024541476741433144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4541477614548057e-05, "grad_norm": 15.847779273986816, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8643431663513184, "num_tokens": 260193983.0, "step": 6820 }, { "epoch": 0.8677013102658695, "ewc_loss": 0.02448049932718277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.448049963277299e-05, "grad_norm": 15.747511863708496, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8646910190582275, "num_tokens": 260228705.0, "step": 6821 }, { "epoch": 0.86782852054446, "ewc_loss": 0.024469619616866112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4469620257150382e-05, "grad_norm": 15.782814979553223, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8617220520973206, "num_tokens": 260262821.0, "step": 6822 }, { "epoch": 0.8679557308230506, "ewc_loss": 0.024620838463306427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.462083830323536e-05, "grad_norm": 15.87572193145752, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8613660931587219, "num_tokens": 260298949.0, "step": 6823 }, { "epoch": 0.868082941101641, "ewc_loss": 0.024510378018021584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4510378352715634e-05, "grad_norm": 15.793323516845703, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8705919981002808, "num_tokens": 260340028.0, "step": 6824 }, { "epoch": 0.8682101513802315, "ewc_loss": 0.02450447715818882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.450447755109053e-05, "grad_norm": 15.719797134399414, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.862031877040863, "num_tokens": 260381328.0, "step": 6825 }, { "epoch": 0.868337361658822, "ewc_loss": 0.024571426212787628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4571425456088036e-05, "grad_norm": 15.826862335205078, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.863066554069519, "num_tokens": 260423421.0, "step": 6826 }, { "epoch": 0.8684645719374126, "ewc_loss": 0.024578887969255447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4578888769610785e-05, "grad_norm": 15.774127960205078, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8609979748725891, "num_tokens": 260461985.0, "step": 6827 }, { "epoch": 0.8685917822160031, "ewc_loss": 0.02452966570854187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.452966509736143e-05, "grad_norm": 15.844345092773438, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8467941284179688, "num_tokens": 260503041.0, "step": 6828 }, { "epoch": 0.8687189924945936, "ewc_loss": 0.024576786905527115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.457678783684969e-05, "grad_norm": 15.778810501098633, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8711545467376709, "num_tokens": 260545553.0, "step": 6829 }, { "epoch": 0.868846202773184, "ewc_loss": 0.02452479861676693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4524799300706945e-05, "grad_norm": 15.828563690185547, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8703439235687256, "num_tokens": 260577160.0, "step": 6830 }, { "epoch": 0.8689734130517746, "ewc_loss": 0.024546725675463676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4546725398977287e-05, "grad_norm": 15.769368171691895, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8737601041793823, "num_tokens": 260614748.0, "step": 6831 }, { "epoch": 0.8691006233303651, "ewc_loss": 0.024556707590818405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4556708012823947e-05, "grad_norm": 15.82222843170166, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.862611711025238, "num_tokens": 260651762.0, "step": 6832 }, { "epoch": 0.8692278336089556, "ewc_loss": 0.02456626482307911, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4566264983150177e-05, "grad_norm": 15.787359237670898, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8511363863945007, "num_tokens": 260694438.0, "step": 6833 }, { "epoch": 0.8693550438875461, "ewc_loss": 0.024546537548303604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4546538043068722e-05, "grad_norm": 15.857354164123535, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8712784051895142, "num_tokens": 260730256.0, "step": 6834 }, { "epoch": 0.8694822541661367, "ewc_loss": 0.024512484669685364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.451248474244494e-05, "grad_norm": 15.78926944732666, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8426393270492554, "num_tokens": 260766190.0, "step": 6835 }, { "epoch": 0.8696094644447271, "ewc_loss": 0.024495171383023262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.449517160130199e-05, "grad_norm": 15.814321517944336, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8774694204330444, "num_tokens": 260801733.0, "step": 6836 }, { "epoch": 0.8697366747233176, "ewc_loss": 0.024549340829253197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4549341105739586e-05, "grad_norm": 15.798099517822266, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8739403486251831, "num_tokens": 260843461.0, "step": 6837 }, { "epoch": 0.8698638850019081, "ewc_loss": 0.02454075776040554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4540757294744253e-05, "grad_norm": 15.871813774108887, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8606604337692261, "num_tokens": 260875006.0, "step": 6838 }, { "epoch": 0.8699910952804987, "ewc_loss": 0.024573441594839096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4573440896347165e-05, "grad_norm": 15.81192398071289, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8563735485076904, "num_tokens": 260917553.0, "step": 6839 }, { "epoch": 0.8701183055590892, "ewc_loss": 0.024488089606165886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4488090275553986e-05, "grad_norm": 15.793420791625977, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8376262187957764, "num_tokens": 260955293.0, "step": 6840 }, { "epoch": 0.8702455158376797, "ewc_loss": 0.0245670173317194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4567018044763245e-05, "grad_norm": 15.804598808288574, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8631271123886108, "num_tokens": 260983966.0, "step": 6841 }, { "epoch": 0.8703727261162701, "ewc_loss": 0.024603016674518585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4603015845059417e-05, "grad_norm": 15.76244831085205, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8759049773216248, "num_tokens": 261019434.0, "step": 6842 }, { "epoch": 0.8704999363948607, "ewc_loss": 0.024593325331807137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.459332608850673e-05, "grad_norm": 15.862791061401367, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8513698577880859, "num_tokens": 261053761.0, "step": 6843 }, { "epoch": 0.8706271466734512, "ewc_loss": 0.024634063243865967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.463406417518854e-05, "grad_norm": 15.783075332641602, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8528214693069458, "num_tokens": 261093490.0, "step": 6844 }, { "epoch": 0.8707543569520417, "ewc_loss": 0.024572322145104408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4572322217863984e-05, "grad_norm": 15.854954719543457, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8643595576286316, "num_tokens": 261129518.0, "step": 6845 }, { "epoch": 0.8708815672306323, "ewc_loss": 0.024621369317173958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4621369448141195e-05, "grad_norm": 15.748286247253418, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8458085060119629, "num_tokens": 261164282.0, "step": 6846 }, { "epoch": 0.8710087775092228, "ewc_loss": 0.02459706738591194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4597067749709822e-05, "grad_norm": 15.854077339172363, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8428856134414673, "num_tokens": 261205055.0, "step": 6847 }, { "epoch": 0.8711359877878132, "ewc_loss": 0.024676060304045677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4676061002537608e-05, "grad_norm": 15.820528030395508, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8569892644882202, "num_tokens": 261242329.0, "step": 6848 }, { "epoch": 0.8712631980664037, "ewc_loss": 0.024574963375926018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4574963390477933e-05, "grad_norm": 15.864495277404785, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8354443311691284, "num_tokens": 261280330.0, "step": 6849 }, { "epoch": 0.8713904083449943, "ewc_loss": 0.024667814373970032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.466781370458193e-05, "grad_norm": 15.778903007507324, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8728272318840027, "num_tokens": 261318232.0, "step": 6850 }, { "epoch": 0.8715176186235848, "ewc_loss": 0.02457432448863983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.457432492519729e-05, "grad_norm": 15.806236267089844, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8795333504676819, "num_tokens": 261351483.0, "step": 6851 }, { "epoch": 0.8716448289021753, "ewc_loss": 0.024656271561980247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4656272216816433e-05, "grad_norm": 15.844649314880371, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8652666211128235, "num_tokens": 261387058.0, "step": 6852 }, { "epoch": 0.8717720391807658, "ewc_loss": 0.02463109977543354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.463109922246076e-05, "grad_norm": 15.782735824584961, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.878135621547699, "num_tokens": 261430110.0, "step": 6853 }, { "epoch": 0.8718992494593563, "ewc_loss": 0.024587849155068398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4587849111412652e-05, "grad_norm": 15.811912536621094, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8693367838859558, "num_tokens": 261470707.0, "step": 6854 }, { "epoch": 0.8720264597379468, "ewc_loss": 0.02466561086475849, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4665610908414237e-05, "grad_norm": 15.765329360961914, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8700583577156067, "num_tokens": 261507146.0, "step": 6855 }, { "epoch": 0.8721536700165373, "ewc_loss": 0.02461576648056507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4615766960778274e-05, "grad_norm": 15.859342575073242, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8483338356018066, "num_tokens": 261549723.0, "step": 6856 }, { "epoch": 0.8722808802951278, "ewc_loss": 0.02462853118777275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4628530809422955e-05, "grad_norm": 15.764062881469727, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8690455555915833, "num_tokens": 261591030.0, "step": 6857 }, { "epoch": 0.8724080905737184, "ewc_loss": 0.02457314357161522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4573144401074387e-05, "grad_norm": 15.794174194335938, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8716589212417603, "num_tokens": 261623919.0, "step": 6858 }, { "epoch": 0.8725353008523089, "ewc_loss": 0.02465861476957798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.46586150751682e-05, "grad_norm": 15.822257995605469, "learning_rate": 1e-06, "loss": 0.3673, "mean_token_accuracy": 0.8795398473739624, "num_tokens": 261655277.0, "step": 6859 }, { "epoch": 0.8726625111308994, "ewc_loss": 0.02463396266102791, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4633962311781943e-05, "grad_norm": 15.785221099853516, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8501574993133545, "num_tokens": 261697462.0, "step": 6860 }, { "epoch": 0.8727897214094898, "ewc_loss": 0.02460946887731552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4609469619463198e-05, "grad_norm": 15.750812530517578, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.860519289970398, "num_tokens": 261739134.0, "step": 6861 }, { "epoch": 0.8729169316880804, "ewc_loss": 0.024647414684295654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4647415557410568e-05, "grad_norm": 15.828825950622559, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8668262958526611, "num_tokens": 261776295.0, "step": 6862 }, { "epoch": 0.8730441419666709, "ewc_loss": 0.02465019002556801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4650189516250975e-05, "grad_norm": 15.875582695007324, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8674687147140503, "num_tokens": 261812388.0, "step": 6863 }, { "epoch": 0.8731713522452614, "ewc_loss": 0.024608980864286423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4608980311313644e-05, "grad_norm": 15.764364242553711, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8640358448028564, "num_tokens": 261855537.0, "step": 6864 }, { "epoch": 0.873298562523852, "ewc_loss": 0.024588583037257195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.458858216414228e-05, "grad_norm": 15.83659839630127, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8710607886314392, "num_tokens": 261899922.0, "step": 6865 }, { "epoch": 0.8734257728024425, "ewc_loss": 0.02459104172885418, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4591041437815875e-05, "grad_norm": 15.743425369262695, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8745310306549072, "num_tokens": 261934475.0, "step": 6866 }, { "epoch": 0.8735529830810329, "ewc_loss": 0.024593591690063477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4593591660959646e-05, "grad_norm": 15.8627290725708, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8635957837104797, "num_tokens": 261973395.0, "step": 6867 }, { "epoch": 0.8736801933596234, "ewc_loss": 0.02459883503615856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.459883580741007e-05, "grad_norm": 15.762639999389648, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.847080647945404, "num_tokens": 262011644.0, "step": 6868 }, { "epoch": 0.873807403638214, "ewc_loss": 0.024578770622611046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4578770535299554e-05, "grad_norm": 15.82852554321289, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8582286834716797, "num_tokens": 262044038.0, "step": 6869 }, { "epoch": 0.8739346139168045, "ewc_loss": 0.024571238085627556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.457123810017947e-05, "grad_norm": 15.749503135681152, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.862774133682251, "num_tokens": 262081023.0, "step": 6870 }, { "epoch": 0.874061824195395, "ewc_loss": 0.024567529559135437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.456752918078564e-05, "grad_norm": 15.748090744018555, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8563541173934937, "num_tokens": 262123981.0, "step": 6871 }, { "epoch": 0.8741890344739855, "ewc_loss": 0.02467132918536663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4671329811098985e-05, "grad_norm": 15.807112693786621, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8534783124923706, "num_tokens": 262161542.0, "step": 6872 }, { "epoch": 0.874316244752576, "ewc_loss": 0.024625597521662712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4625596779515035e-05, "grad_norm": 15.826275825500488, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8534075021743774, "num_tokens": 262202482.0, "step": 6873 }, { "epoch": 0.8744434550311665, "ewc_loss": 0.024614138528704643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.46141389652621e-05, "grad_norm": 15.747303009033203, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8640974760055542, "num_tokens": 262239499.0, "step": 6874 }, { "epoch": 0.874570665309757, "ewc_loss": 0.024627024307847023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.462702468619682e-05, "grad_norm": 15.84617805480957, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8638060092926025, "num_tokens": 262281324.0, "step": 6875 }, { "epoch": 0.8746978755883476, "ewc_loss": 0.024674102663993835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.467410195094999e-05, "grad_norm": 15.823413848876953, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.854283332824707, "num_tokens": 262318772.0, "step": 6876 }, { "epoch": 0.8748250858669381, "ewc_loss": 0.024624060839414597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4624061552458443e-05, "grad_norm": 15.768675804138184, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8759984374046326, "num_tokens": 262353264.0, "step": 6877 }, { "epoch": 0.8749522961455286, "ewc_loss": 0.02461319975554943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.461320036672987e-05, "grad_norm": 15.754202842712402, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8609092235565186, "num_tokens": 262393360.0, "step": 6878 }, { "epoch": 0.875079506424119, "ewc_loss": 0.024646244943141937, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4646244128234684e-05, "grad_norm": 15.855646133422852, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8739678859710693, "num_tokens": 262432010.0, "step": 6879 }, { "epoch": 0.8752067167027096, "ewc_loss": 0.024675754830241203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4675755412317812e-05, "grad_norm": 15.831680297851562, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8628513813018799, "num_tokens": 262468828.0, "step": 6880 }, { "epoch": 0.8753339269813001, "ewc_loss": 0.024613097310066223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.461309668433387e-05, "grad_norm": 15.787290573120117, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8697731494903564, "num_tokens": 262515141.0, "step": 6881 }, { "epoch": 0.8754611372598906, "ewc_loss": 0.02465696632862091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4656967070768587e-05, "grad_norm": 15.890069961547852, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8598330616950989, "num_tokens": 262553137.0, "step": 6882 }, { "epoch": 0.8755883475384811, "ewc_loss": 0.024645689874887466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4645689336466603e-05, "grad_norm": 15.81406021118164, "learning_rate": 1e-06, "loss": 0.3558, "mean_token_accuracy": 0.8852745890617371, "num_tokens": 262593847.0, "step": 6883 }, { "epoch": 0.8757155578170717, "ewc_loss": 0.024610858410596848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4610859327367507e-05, "grad_norm": 15.825108528137207, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8529412746429443, "num_tokens": 262635048.0, "step": 6884 }, { "epoch": 0.8758427680956621, "ewc_loss": 0.024595830589532852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.459583083691541e-05, "grad_norm": 15.878908157348633, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8548353314399719, "num_tokens": 262671200.0, "step": 6885 }, { "epoch": 0.8759699783742526, "ewc_loss": 0.02460538037121296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4605380531284027e-05, "grad_norm": 15.82109260559082, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.864621102809906, "num_tokens": 262708304.0, "step": 6886 }, { "epoch": 0.8760971886528431, "ewc_loss": 0.024595486000180244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4595485228928737e-05, "grad_norm": 15.83195972442627, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8601832389831543, "num_tokens": 262750680.0, "step": 6887 }, { "epoch": 0.8762243989314337, "ewc_loss": 0.024589240550994873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4589240638306364e-05, "grad_norm": 15.88867473602295, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8582249879837036, "num_tokens": 262784845.0, "step": 6888 }, { "epoch": 0.8763516092100242, "ewc_loss": 0.024612246081233025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.461224539729301e-05, "grad_norm": 15.939240455627441, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8635979294776917, "num_tokens": 262822646.0, "step": 6889 }, { "epoch": 0.8764788194886147, "ewc_loss": 0.02458498440682888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4584984203102067e-05, "grad_norm": 15.867972373962402, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8594716787338257, "num_tokens": 262863663.0, "step": 6890 }, { "epoch": 0.8766060297672051, "ewc_loss": 0.024593695998191833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4593695343355648e-05, "grad_norm": 16.024921417236328, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8500041961669922, "num_tokens": 262903726.0, "step": 6891 }, { "epoch": 0.8767332400457957, "ewc_loss": 0.024606190621852875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4606189981568605e-05, "grad_norm": 15.849067687988281, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8522120714187622, "num_tokens": 262942748.0, "step": 6892 }, { "epoch": 0.8768604503243862, "ewc_loss": 0.024483097717165947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4483097149641253e-05, "grad_norm": 15.902474403381348, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8426363468170166, "num_tokens": 262977639.0, "step": 6893 }, { "epoch": 0.8769876606029767, "ewc_loss": 0.024630628526210785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4630628104205243e-05, "grad_norm": 15.848655700683594, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8653953075408936, "num_tokens": 263015970.0, "step": 6894 }, { "epoch": 0.8771148708815673, "ewc_loss": 0.024510623887181282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4510623916285113e-05, "grad_norm": 15.855018615722656, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8721179962158203, "num_tokens": 263058618.0, "step": 6895 }, { "epoch": 0.8772420811601578, "ewc_loss": 0.024556539952754974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.455654066579882e-05, "grad_norm": 15.876087188720703, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8470607995986938, "num_tokens": 263098628.0, "step": 6896 }, { "epoch": 0.8773692914387482, "ewc_loss": 0.024594012647867203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4594011847511865e-05, "grad_norm": 15.881999015808105, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8660296201705933, "num_tokens": 263136793.0, "step": 6897 }, { "epoch": 0.8774965017173387, "ewc_loss": 0.024587342515587807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4587343432358466e-05, "grad_norm": 15.864913940429688, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.865102231502533, "num_tokens": 263170996.0, "step": 6898 }, { "epoch": 0.8776237119959293, "ewc_loss": 0.0245829951018095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4582994228694588e-05, "grad_norm": 15.905592918395996, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8530373573303223, "num_tokens": 263199961.0, "step": 6899 }, { "epoch": 0.8777509222745198, "ewc_loss": 0.024597350507974625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4597349693067372e-05, "grad_norm": 15.815195083618164, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8621107339859009, "num_tokens": 263241541.0, "step": 6900 }, { "epoch": 0.8778781325531103, "ewc_loss": 0.0245528481900692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4552848117309622e-05, "grad_norm": 15.851325035095215, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8748921155929565, "num_tokens": 263278778.0, "step": 6901 }, { "epoch": 0.8780053428317008, "ewc_loss": 0.024590998888015747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4590999601059593e-05, "grad_norm": 15.811959266662598, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8655346632003784, "num_tokens": 263310155.0, "step": 6902 }, { "epoch": 0.8781325531102913, "ewc_loss": 0.024601412937045097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4601413315394893e-05, "grad_norm": 15.85861587524414, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8594417572021484, "num_tokens": 263355793.0, "step": 6903 }, { "epoch": 0.8782597633888818, "ewc_loss": 0.024588242173194885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4588242013123818e-05, "grad_norm": 15.88350772857666, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8731114268302917, "num_tokens": 263394446.0, "step": 6904 }, { "epoch": 0.8783869736674723, "ewc_loss": 0.02455836348235607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4558363293181174e-05, "grad_norm": 15.908787727355957, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8663121461868286, "num_tokens": 263438479.0, "step": 6905 }, { "epoch": 0.8785141839460628, "ewc_loss": 0.02461421675980091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4614217181806453e-05, "grad_norm": 15.856587409973145, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8494168519973755, "num_tokens": 263478514.0, "step": 6906 }, { "epoch": 0.8786413942246534, "ewc_loss": 0.02451065368950367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.451065302011557e-05, "grad_norm": 15.82999324798584, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8432495594024658, "num_tokens": 263516734.0, "step": 6907 }, { "epoch": 0.8787686045032439, "ewc_loss": 0.024594737216830254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.459473762428388e-05, "grad_norm": 15.822239875793457, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8535109162330627, "num_tokens": 263560105.0, "step": 6908 }, { "epoch": 0.8788958147818343, "ewc_loss": 0.02461887151002884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4618871975690126e-05, "grad_norm": 15.863202095031738, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8653289079666138, "num_tokens": 263597033.0, "step": 6909 }, { "epoch": 0.8790230250604248, "ewc_loss": 0.02459673024713993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4596729417680763e-05, "grad_norm": 15.825094223022461, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8704677820205688, "num_tokens": 263631106.0, "step": 6910 }, { "epoch": 0.8791502353390154, "ewc_loss": 0.024578850716352463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.457885057083331e-05, "grad_norm": 15.862337112426758, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8574420213699341, "num_tokens": 263668227.0, "step": 6911 }, { "epoch": 0.8792774456176059, "ewc_loss": 0.024637799710035324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4637800379423425e-05, "grad_norm": 15.911731719970703, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8642903566360474, "num_tokens": 263704335.0, "step": 6912 }, { "epoch": 0.8794046558961964, "ewc_loss": 0.02461858093738556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.461858093738556e-05, "grad_norm": 15.795777320861816, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8697761297225952, "num_tokens": 263742532.0, "step": 6913 }, { "epoch": 0.879531866174787, "ewc_loss": 0.0245262049138546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4526205379515886e-05, "grad_norm": 15.825705528259277, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8642643690109253, "num_tokens": 263782321.0, "step": 6914 }, { "epoch": 0.8796590764533775, "ewc_loss": 0.02457299828529358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4572998881922103e-05, "grad_norm": 15.853543281555176, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8713789582252502, "num_tokens": 263818037.0, "step": 6915 }, { "epoch": 0.8797862867319679, "ewc_loss": 0.02458256483078003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.458256494719535e-05, "grad_norm": 15.849919319152832, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.848641574382782, "num_tokens": 263854901.0, "step": 6916 }, { "epoch": 0.8799134970105584, "ewc_loss": 0.024633869528770447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4633869543322362e-05, "grad_norm": 15.8729829788208, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8674658536911011, "num_tokens": 263896241.0, "step": 6917 }, { "epoch": 0.880040707289149, "ewc_loss": 0.024609055370092392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.460905488987919e-05, "grad_norm": 15.782122611999512, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8645906448364258, "num_tokens": 263934673.0, "step": 6918 }, { "epoch": 0.8801679175677395, "ewc_loss": 0.024600127711892128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4600127289886586e-05, "grad_norm": 15.862703323364258, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8539909720420837, "num_tokens": 263973456.0, "step": 6919 }, { "epoch": 0.88029512784633, "ewc_loss": 0.024601005017757416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.46010058617685e-05, "grad_norm": 15.886711120605469, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8451018333435059, "num_tokens": 264006050.0, "step": 6920 }, { "epoch": 0.8804223381249205, "ewc_loss": 0.02464042790234089, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.464042881911155e-05, "grad_norm": 15.897483825683594, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8731783628463745, "num_tokens": 264042736.0, "step": 6921 }, { "epoch": 0.880549548403511, "ewc_loss": 0.02462080679833889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.46208073804155e-05, "grad_norm": 15.831238746643066, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8541488647460938, "num_tokens": 264084433.0, "step": 6922 }, { "epoch": 0.8806767586821015, "ewc_loss": 0.024613456800580025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.461345684423577e-05, "grad_norm": 15.881314277648926, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8708370923995972, "num_tokens": 264118331.0, "step": 6923 }, { "epoch": 0.880803968960692, "ewc_loss": 0.024621732532978058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4621733246021904e-05, "grad_norm": 15.817665100097656, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.865444540977478, "num_tokens": 264152015.0, "step": 6924 }, { "epoch": 0.8809311792392825, "ewc_loss": 0.02458345703780651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.458345625200309e-05, "grad_norm": 15.804780006408691, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8692548274993896, "num_tokens": 264187085.0, "step": 6925 }, { "epoch": 0.8810583895178731, "ewc_loss": 0.024646850302815437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4646849851706065e-05, "grad_norm": 15.842765808105469, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8589106798171997, "num_tokens": 264225558.0, "step": 6926 }, { "epoch": 0.8811855997964636, "ewc_loss": 0.024655254557728767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.465525540173985e-05, "grad_norm": 15.810664176940918, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8536473512649536, "num_tokens": 264264220.0, "step": 6927 }, { "epoch": 0.881312810075054, "ewc_loss": 0.02465507574379444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.46550753217889e-05, "grad_norm": 15.84897518157959, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8661705255508423, "num_tokens": 264302097.0, "step": 6928 }, { "epoch": 0.8814400203536445, "ewc_loss": 0.02467431128025055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4674311134731397e-05, "grad_norm": 15.829879760742188, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8652282357215881, "num_tokens": 264341966.0, "step": 6929 }, { "epoch": 0.8815672306322351, "ewc_loss": 0.024652568623423576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4652568754390813e-05, "grad_norm": 15.863582611083984, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8403648734092712, "num_tokens": 264384298.0, "step": 6930 }, { "epoch": 0.8816944409108256, "ewc_loss": 0.024694420397281647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4694420062587596e-05, "grad_norm": 15.855374336242676, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.863599419593811, "num_tokens": 264422513.0, "step": 6931 }, { "epoch": 0.8818216511894161, "ewc_loss": 0.024633975699543953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4633975044707768e-05, "grad_norm": 15.8622465133667, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8749688863754272, "num_tokens": 264461819.0, "step": 6932 }, { "epoch": 0.8819488614680067, "ewc_loss": 0.0246950201690197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4695020329090767e-05, "grad_norm": 15.859957695007324, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8526148796081543, "num_tokens": 264502618.0, "step": 6933 }, { "epoch": 0.8820760717465971, "ewc_loss": 0.024636147543787956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.46361469180556e-05, "grad_norm": 15.869494438171387, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8555876612663269, "num_tokens": 264534382.0, "step": 6934 }, { "epoch": 0.8822032820251876, "ewc_loss": 0.02467799372971058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4677994588273577e-05, "grad_norm": 15.829992294311523, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8521451950073242, "num_tokens": 264575297.0, "step": 6935 }, { "epoch": 0.8823304923037781, "ewc_loss": 0.024656880646944046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4656879759277217e-05, "grad_norm": 15.800614356994629, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8687350153923035, "num_tokens": 264611822.0, "step": 6936 }, { "epoch": 0.8824577025823687, "ewc_loss": 0.024674121290445328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4674121959833428e-05, "grad_norm": 15.832736015319824, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8587396740913391, "num_tokens": 264657483.0, "step": 6937 }, { "epoch": 0.8825849128609592, "ewc_loss": 0.024643830955028534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.464383032929618e-05, "grad_norm": 15.837738037109375, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8688640594482422, "num_tokens": 264690633.0, "step": 6938 }, { "epoch": 0.8827121231395497, "ewc_loss": 0.024670148268342018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.467014746798668e-05, "grad_norm": 15.864242553710938, "learning_rate": 1e-06, "loss": 0.3789, "mean_token_accuracy": 0.8774268627166748, "num_tokens": 264733268.0, "step": 6939 }, { "epoch": 0.8828393334181401, "ewc_loss": 0.024688702076673508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4688702978892252e-05, "grad_norm": 15.881656646728516, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8559251427650452, "num_tokens": 264774769.0, "step": 6940 }, { "epoch": 0.8829665436967307, "ewc_loss": 0.024699948728084564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4699947971384972e-05, "grad_norm": 15.9043550491333, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8573570847511292, "num_tokens": 264806658.0, "step": 6941 }, { "epoch": 0.8830937539753212, "ewc_loss": 0.024667980149388313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4667981051607057e-05, "grad_norm": 15.869606971740723, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8507935404777527, "num_tokens": 264841054.0, "step": 6942 }, { "epoch": 0.8832209642539117, "ewc_loss": 0.02466070093214512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4660701456014067e-05, "grad_norm": 15.889973640441895, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8569296598434448, "num_tokens": 264880241.0, "step": 6943 }, { "epoch": 0.8833481745325023, "ewc_loss": 0.024684520438313484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.46845211222535e-05, "grad_norm": 15.908777236938477, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8647168874740601, "num_tokens": 264922449.0, "step": 6944 }, { "epoch": 0.8834753848110928, "ewc_loss": 0.02465706504881382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.465706529619638e-05, "grad_norm": 15.896093368530273, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8621569275856018, "num_tokens": 264961605.0, "step": 6945 }, { "epoch": 0.8836025950896832, "ewc_loss": 0.02462625317275524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4626253434689716e-05, "grad_norm": 15.83068561553955, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8708522319793701, "num_tokens": 264992924.0, "step": 6946 }, { "epoch": 0.8837298053682737, "ewc_loss": 0.024635659530758858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.463565942889545e-05, "grad_norm": 15.882772445678711, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8615297675132751, "num_tokens": 265031849.0, "step": 6947 }, { "epoch": 0.8838570156468643, "ewc_loss": 0.024647627025842667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.464762656018138e-05, "grad_norm": 15.870753288269043, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8752252459526062, "num_tokens": 265069714.0, "step": 6948 }, { "epoch": 0.8839842259254548, "ewc_loss": 0.02468273602426052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.468273669364862e-05, "grad_norm": 15.91477108001709, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8582317233085632, "num_tokens": 265103325.0, "step": 6949 }, { "epoch": 0.8841114362040453, "ewc_loss": 0.024655228480696678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4655228116898797e-05, "grad_norm": 15.86313533782959, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8411099910736084, "num_tokens": 265139133.0, "step": 6950 }, { "epoch": 0.8842386464826358, "ewc_loss": 0.024658484384417534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4658484107931145e-05, "grad_norm": 15.816534996032715, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8384023904800415, "num_tokens": 265183536.0, "step": 6951 }, { "epoch": 0.8843658567612263, "ewc_loss": 0.024648480117321014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4648479666211642e-05, "grad_norm": 15.829100608825684, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8535342216491699, "num_tokens": 265226288.0, "step": 6952 }, { "epoch": 0.8844930670398168, "ewc_loss": 0.024669110774993896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.466911064402666e-05, "grad_norm": 15.785615921020508, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8624126315116882, "num_tokens": 265265846.0, "step": 6953 }, { "epoch": 0.8846202773184073, "ewc_loss": 0.02465357631444931, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4653576474520378e-05, "grad_norm": 15.852137565612793, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8765018582344055, "num_tokens": 265300612.0, "step": 6954 }, { "epoch": 0.8847474875969978, "ewc_loss": 0.02473585493862629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4735854822210968e-05, "grad_norm": 15.80028247833252, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8712067008018494, "num_tokens": 265341440.0, "step": 6955 }, { "epoch": 0.8848746978755884, "ewc_loss": 0.024681376293301582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.468137608957477e-05, "grad_norm": 15.85897159576416, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8532556295394897, "num_tokens": 265384854.0, "step": 6956 }, { "epoch": 0.8850019081541789, "ewc_loss": 0.024693720042705536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.469371975166723e-05, "grad_norm": 15.81230354309082, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8555459380149841, "num_tokens": 265420611.0, "step": 6957 }, { "epoch": 0.8851291184327693, "ewc_loss": 0.024700405076146126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4700404537725262e-05, "grad_norm": 15.851202011108398, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8610366582870483, "num_tokens": 265457720.0, "step": 6958 }, { "epoch": 0.8852563287113598, "ewc_loss": 0.024699389934539795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4699389541638084e-05, "grad_norm": 15.884400367736816, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8705140352249146, "num_tokens": 265493434.0, "step": 6959 }, { "epoch": 0.8853835389899504, "ewc_loss": 0.024673881009221077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.467388185323216e-05, "grad_norm": 15.873441696166992, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8643618822097778, "num_tokens": 265529015.0, "step": 6960 }, { "epoch": 0.8855107492685409, "ewc_loss": 0.02465340681374073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4653407308505848e-05, "grad_norm": 15.79506778717041, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8474986553192139, "num_tokens": 265568698.0, "step": 6961 }, { "epoch": 0.8856379595471314, "ewc_loss": 0.024661390110850334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.466139085299801e-05, "grad_norm": 15.890969276428223, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8632243871688843, "num_tokens": 265605631.0, "step": 6962 }, { "epoch": 0.885765169825722, "ewc_loss": 0.024699967354536057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.469996798026841e-05, "grad_norm": 15.834131240844727, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8637576103210449, "num_tokens": 265647071.0, "step": 6963 }, { "epoch": 0.8858923801043125, "ewc_loss": 0.02462667040526867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4626669983263128e-05, "grad_norm": 15.866304397583008, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8617687225341797, "num_tokens": 265688680.0, "step": 6964 }, { "epoch": 0.8860195903829029, "ewc_loss": 0.024689020588994026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4689021302037872e-05, "grad_norm": 15.887523651123047, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8686521053314209, "num_tokens": 265727707.0, "step": 6965 }, { "epoch": 0.8861468006614934, "ewc_loss": 0.024660952389240265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4660952476551756e-05, "grad_norm": 15.890533447265625, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8602015972137451, "num_tokens": 265765860.0, "step": 6966 }, { "epoch": 0.886274010940084, "ewc_loss": 0.024661840870976448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4661840143380687e-05, "grad_norm": 15.880095481872559, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8659070730209351, "num_tokens": 265802988.0, "step": 6967 }, { "epoch": 0.8864012212186745, "ewc_loss": 0.024591714143753052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4591714463895187e-05, "grad_norm": 15.867591857910156, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8582117557525635, "num_tokens": 265840614.0, "step": 6968 }, { "epoch": 0.886528431497265, "ewc_loss": 0.02468002401292324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4680024580447935e-05, "grad_norm": 15.867805480957031, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8359807729721069, "num_tokens": 265883032.0, "step": 6969 }, { "epoch": 0.8866556417758555, "ewc_loss": 0.024626564234495163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4626564481877722e-05, "grad_norm": 15.807323455810547, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8739560842514038, "num_tokens": 265918309.0, "step": 6970 }, { "epoch": 0.886782852054446, "ewc_loss": 0.024683881551027298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4683880837983452e-05, "grad_norm": 15.92123794555664, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8648796677589417, "num_tokens": 265956406.0, "step": 6971 }, { "epoch": 0.8869100623330365, "ewc_loss": 0.024660177528858185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4660177587065846e-05, "grad_norm": 15.824488639831543, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.870909571647644, "num_tokens": 265988413.0, "step": 6972 }, { "epoch": 0.887037272611627, "ewc_loss": 0.024626396596431732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4626397134852596e-05, "grad_norm": 15.875723838806152, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.861281156539917, "num_tokens": 266022036.0, "step": 6973 }, { "epoch": 0.8871644828902175, "ewc_loss": 0.02466689422726631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4666893295943737e-05, "grad_norm": 15.85855770111084, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8608925342559814, "num_tokens": 266053735.0, "step": 6974 }, { "epoch": 0.8872916931688081, "ewc_loss": 0.0246787890791893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4678789486642927e-05, "grad_norm": 15.849874496459961, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8769454956054688, "num_tokens": 266094198.0, "step": 6975 }, { "epoch": 0.8874189034473986, "ewc_loss": 0.024737169966101646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.473716995154973e-05, "grad_norm": 15.910284042358398, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8715482354164124, "num_tokens": 266132213.0, "step": 6976 }, { "epoch": 0.887546113725989, "ewc_loss": 0.024690821766853333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4690822101547383e-05, "grad_norm": 15.842447280883789, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8630580902099609, "num_tokens": 266168001.0, "step": 6977 }, { "epoch": 0.8876733240045795, "ewc_loss": 0.02469903789460659, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.46990384766832e-05, "grad_norm": 15.893043518066406, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8339073657989502, "num_tokens": 266208771.0, "step": 6978 }, { "epoch": 0.8878005342831701, "ewc_loss": 0.02473253756761551, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.47325369855389e-05, "grad_norm": 15.838112831115723, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8680528402328491, "num_tokens": 266248090.0, "step": 6979 }, { "epoch": 0.8879277445617606, "ewc_loss": 0.024704011157155037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4704011593712494e-05, "grad_norm": 15.845741271972656, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8518568873405457, "num_tokens": 266283748.0, "step": 6980 }, { "epoch": 0.8880549548403511, "ewc_loss": 0.024756770581007004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.475677138136234e-05, "grad_norm": 15.918390274047852, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8695588111877441, "num_tokens": 266316613.0, "step": 6981 }, { "epoch": 0.8881821651189417, "ewc_loss": 0.02471526712179184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4715267500141636e-05, "grad_norm": 15.797597885131836, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8643792867660522, "num_tokens": 266357515.0, "step": 6982 }, { "epoch": 0.8883093753975321, "ewc_loss": 0.02473895624279976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4738956199144013e-05, "grad_norm": 15.903867721557617, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.874603271484375, "num_tokens": 266393593.0, "step": 6983 }, { "epoch": 0.8884365856761226, "ewc_loss": 0.024726949632167816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4726949050091207e-05, "grad_norm": 15.754286766052246, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8741973638534546, "num_tokens": 266436104.0, "step": 6984 }, { "epoch": 0.8885637959547131, "ewc_loss": 0.024695036932826042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.46950366999954e-05, "grad_norm": 15.885734558105469, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.875080943107605, "num_tokens": 266472809.0, "step": 6985 }, { "epoch": 0.8886910062333037, "ewc_loss": 0.02479018084704876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4790180759737268e-05, "grad_norm": 15.830853462219238, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8728002309799194, "num_tokens": 266512726.0, "step": 6986 }, { "epoch": 0.8888182165118942, "ewc_loss": 0.024738986045122147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.473898530297447e-05, "grad_norm": 15.830499649047852, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8517502546310425, "num_tokens": 266546098.0, "step": 6987 }, { "epoch": 0.8889454267904847, "ewc_loss": 0.02480723150074482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4807231966406107e-05, "grad_norm": 15.928400039672852, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.854024350643158, "num_tokens": 266588478.0, "step": 6988 }, { "epoch": 0.8890726370690751, "ewc_loss": 0.024740589782595634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4740589651628397e-05, "grad_norm": 15.817609786987305, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8519041538238525, "num_tokens": 266628975.0, "step": 6989 }, { "epoch": 0.8891998473476657, "ewc_loss": 0.024713803082704544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.471380321367178e-05, "grad_norm": 15.894227027893066, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8715595602989197, "num_tokens": 266667533.0, "step": 6990 }, { "epoch": 0.8893270576262562, "ewc_loss": 0.024784963577985764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.47849638981279e-05, "grad_norm": 15.909156799316406, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8447748422622681, "num_tokens": 266706990.0, "step": 6991 }, { "epoch": 0.8894542679048467, "ewc_loss": 0.024680353701114655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4680353817529976e-05, "grad_norm": 15.808340072631836, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8509450554847717, "num_tokens": 266747314.0, "step": 6992 }, { "epoch": 0.8895814781834372, "ewc_loss": 0.024756697937846184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.47566986217862e-05, "grad_norm": 15.92613410949707, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8534572124481201, "num_tokens": 266791031.0, "step": 6993 }, { "epoch": 0.8897086884620278, "ewc_loss": 0.02475995197892189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4759952793829143e-05, "grad_norm": 15.897561073303223, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8620544672012329, "num_tokens": 266823236.0, "step": 6994 }, { "epoch": 0.8898358987406182, "ewc_loss": 0.0246833898127079, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4683389710844494e-05, "grad_norm": 15.834128379821777, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8602465391159058, "num_tokens": 266867993.0, "step": 6995 }, { "epoch": 0.8899631090192087, "ewc_loss": 0.024711977690458298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.471197694831062e-05, "grad_norm": 15.895796775817871, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8446944952011108, "num_tokens": 266908573.0, "step": 6996 }, { "epoch": 0.8900903192977992, "ewc_loss": 0.024735882878303528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.473588210705202e-05, "grad_norm": 15.83743953704834, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8608136773109436, "num_tokens": 266951167.0, "step": 6997 }, { "epoch": 0.8902175295763898, "ewc_loss": 0.024698087945580482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4698087145225145e-05, "grad_norm": 15.868154525756836, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.860761284828186, "num_tokens": 266988976.0, "step": 6998 }, { "epoch": 0.8903447398549803, "ewc_loss": 0.024719860404729843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.471986044838559e-05, "grad_norm": 15.895282745361328, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8608214855194092, "num_tokens": 267022450.0, "step": 6999 }, { "epoch": 0.8904719501335708, "ewc_loss": 0.024744374677538872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4744374968577176e-05, "grad_norm": 15.866439819335938, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8552899956703186, "num_tokens": 267059933.0, "step": 7000 }, { "epoch": 0.8905991604121613, "ewc_loss": 0.024729758501052856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4729757569730282e-05, "grad_norm": 15.93963623046875, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8532901406288147, "num_tokens": 267092518.0, "step": 7001 }, { "epoch": 0.8907263706907518, "ewc_loss": 0.02477586455643177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4775865313131362e-05, "grad_norm": 15.89512825012207, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8572618961334229, "num_tokens": 267130008.0, "step": 7002 }, { "epoch": 0.8908535809693423, "ewc_loss": 0.024751879274845123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4751880118856207e-05, "grad_norm": 15.869365692138672, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8648519515991211, "num_tokens": 267171039.0, "step": 7003 }, { "epoch": 0.8909807912479328, "ewc_loss": 0.02474888786673546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4748887881287374e-05, "grad_norm": 15.884263038635254, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8643357753753662, "num_tokens": 267211197.0, "step": 7004 }, { "epoch": 0.8911080015265234, "ewc_loss": 0.02474304847419262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.474304892530199e-05, "grad_norm": 15.82099723815918, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8675853610038757, "num_tokens": 267247471.0, "step": 7005 }, { "epoch": 0.8912352118051139, "ewc_loss": 0.024730443954467773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.473044332873542e-05, "grad_norm": 15.939017295837402, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8355531692504883, "num_tokens": 267287276.0, "step": 7006 }, { "epoch": 0.8913624220837043, "ewc_loss": 0.0247475728392601, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.474757275194861e-05, "grad_norm": 15.832046508789062, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8601678013801575, "num_tokens": 267323827.0, "step": 7007 }, { "epoch": 0.8914896323622948, "ewc_loss": 0.02472710981965065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.472710912115872e-05, "grad_norm": 15.838927268981934, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8710533380508423, "num_tokens": 267364053.0, "step": 7008 }, { "epoch": 0.8916168426408854, "ewc_loss": 0.024771673604846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4771674361545593e-05, "grad_norm": 15.85572624206543, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.845279335975647, "num_tokens": 267404393.0, "step": 7009 }, { "epoch": 0.8917440529194759, "ewc_loss": 0.024737725034356117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4737724743317813e-05, "grad_norm": 15.868931770324707, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8629783391952515, "num_tokens": 267444589.0, "step": 7010 }, { "epoch": 0.8918712631980664, "ewc_loss": 0.02478611283004284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.478611349943094e-05, "grad_norm": 15.883298873901367, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8629792332649231, "num_tokens": 267484460.0, "step": 7011 }, { "epoch": 0.891998473476657, "ewc_loss": 0.024729302152991295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4729302822379395e-05, "grad_norm": 15.87358283996582, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8549665808677673, "num_tokens": 267517390.0, "step": 7012 }, { "epoch": 0.8921256837552475, "ewc_loss": 0.024748221039772034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4748220312176272e-05, "grad_norm": 15.847369194030762, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8619479537010193, "num_tokens": 267555813.0, "step": 7013 }, { "epoch": 0.8922528940338379, "ewc_loss": 0.024722065776586533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4722065063542686e-05, "grad_norm": 15.780549049377441, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.859236478805542, "num_tokens": 267595818.0, "step": 7014 }, { "epoch": 0.8923801043124284, "ewc_loss": 0.024767620489001274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.476761983416509e-05, "grad_norm": 15.836250305175781, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8365092277526855, "num_tokens": 267633106.0, "step": 7015 }, { "epoch": 0.892507314591019, "ewc_loss": 0.02479836717247963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4798368031042628e-05, "grad_norm": 15.881709098815918, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.86452317237854, "num_tokens": 267670166.0, "step": 7016 }, { "epoch": 0.8926345248696095, "ewc_loss": 0.024803292006254196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4803292035358027e-05, "grad_norm": 15.883929252624512, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8587998151779175, "num_tokens": 267707284.0, "step": 7017 }, { "epoch": 0.8927617351482, "ewc_loss": 0.024809090420603752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.480909097357653e-05, "grad_norm": 15.861555099487305, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8624197244644165, "num_tokens": 267742349.0, "step": 7018 }, { "epoch": 0.8928889454267905, "ewc_loss": 0.024741563946008682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4741564629948698e-05, "grad_norm": 15.879473686218262, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8672956228256226, "num_tokens": 267776003.0, "step": 7019 }, { "epoch": 0.893016155705381, "ewc_loss": 0.024822605773806572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4822606064844877e-05, "grad_norm": 15.876327514648438, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8566910028457642, "num_tokens": 267815887.0, "step": 7020 }, { "epoch": 0.8931433659839715, "ewc_loss": 0.024788323789834976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4788323571556248e-05, "grad_norm": 15.889047622680664, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8736462593078613, "num_tokens": 267855874.0, "step": 7021 }, { "epoch": 0.893270576262562, "ewc_loss": 0.02477947436273098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4779474188107997e-05, "grad_norm": 15.83905029296875, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8721768856048584, "num_tokens": 267897901.0, "step": 7022 }, { "epoch": 0.8933977865411525, "ewc_loss": 0.02475874498486519, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.475874498486519e-05, "grad_norm": 15.822272300720215, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.8725132942199707, "num_tokens": 267938967.0, "step": 7023 }, { "epoch": 0.8935249968197431, "ewc_loss": 0.024767281487584114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.476728150213603e-05, "grad_norm": 15.875083923339844, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8582740426063538, "num_tokens": 267975196.0, "step": 7024 }, { "epoch": 0.8936522070983336, "ewc_loss": 0.024804048240184784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4804048734949902e-05, "grad_norm": 15.867138862609863, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8757269382476807, "num_tokens": 268014407.0, "step": 7025 }, { "epoch": 0.893779417376924, "ewc_loss": 0.024774516001343727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.477451562299393e-05, "grad_norm": 15.998127937316895, "learning_rate": 1e-06, "loss": 0.37, "mean_token_accuracy": 0.8776200413703918, "num_tokens": 268043892.0, "step": 7026 }, { "epoch": 0.8939066276555145, "ewc_loss": 0.02482948638498783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.482948730175849e-05, "grad_norm": 15.86332893371582, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8615531921386719, "num_tokens": 268079651.0, "step": 7027 }, { "epoch": 0.8940338379341051, "ewc_loss": 0.02470293827354908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4702938389964402e-05, "grad_norm": 15.948222160339355, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8648837804794312, "num_tokens": 268114021.0, "step": 7028 }, { "epoch": 0.8941610482126956, "ewc_loss": 0.024816714227199554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.481671435816679e-05, "grad_norm": 15.914356231689453, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8612440824508667, "num_tokens": 268155315.0, "step": 7029 }, { "epoch": 0.8942882584912861, "ewc_loss": 0.024732844904065132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.47328443947481e-05, "grad_norm": 15.919659614562988, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8587746024131775, "num_tokens": 268199312.0, "step": 7030 }, { "epoch": 0.8944154687698767, "ewc_loss": 0.024740440770983696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4740440494497307e-05, "grad_norm": 15.836437225341797, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8684513568878174, "num_tokens": 268236955.0, "step": 7031 }, { "epoch": 0.8945426790484671, "ewc_loss": 0.024713121354579926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.471312109264545e-05, "grad_norm": 15.919635772705078, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.858848512172699, "num_tokens": 268272183.0, "step": 7032 }, { "epoch": 0.8946698893270576, "ewc_loss": 0.02473163791000843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4731638404773548e-05, "grad_norm": 15.909478187561035, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8528247475624084, "num_tokens": 268305407.0, "step": 7033 }, { "epoch": 0.8947970996056481, "ewc_loss": 0.02473228983581066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4732289602980018e-05, "grad_norm": 15.910343170166016, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8736108541488647, "num_tokens": 268340443.0, "step": 7034 }, { "epoch": 0.8949243098842387, "ewc_loss": 0.024781683459877968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4781684260233305e-05, "grad_norm": 15.919849395751953, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8803882002830505, "num_tokens": 268375597.0, "step": 7035 }, { "epoch": 0.8950515201628292, "ewc_loss": 0.02476400136947632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4764001864241436e-05, "grad_norm": 15.892756462097168, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.855946958065033, "num_tokens": 268412872.0, "step": 7036 }, { "epoch": 0.8951787304414197, "ewc_loss": 0.024790288880467415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4790288080112077e-05, "grad_norm": 15.959092140197754, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8633283972740173, "num_tokens": 268452021.0, "step": 7037 }, { "epoch": 0.8953059407200101, "ewc_loss": 0.024765273556113243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4765273337834515e-05, "grad_norm": 15.851696968078613, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8709299564361572, "num_tokens": 268486354.0, "step": 7038 }, { "epoch": 0.8954331509986007, "ewc_loss": 0.024730876088142395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4730876248213463e-05, "grad_norm": 15.871152877807617, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8555646538734436, "num_tokens": 268519918.0, "step": 7039 }, { "epoch": 0.8955603612771912, "ewc_loss": 0.024780377745628357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.478037822584156e-05, "grad_norm": 15.922835350036621, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8542977571487427, "num_tokens": 268554259.0, "step": 7040 }, { "epoch": 0.8956875715557817, "ewc_loss": 0.02478022128343582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4780221792752855e-05, "grad_norm": 15.897062301635742, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8562248945236206, "num_tokens": 268595785.0, "step": 7041 }, { "epoch": 0.8958147818343722, "ewc_loss": 0.024816306307911873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4816306904540397e-05, "grad_norm": 15.896811485290527, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8475608229637146, "num_tokens": 268640805.0, "step": 7042 }, { "epoch": 0.8959419921129628, "ewc_loss": 0.024781225249171257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4781225874903612e-05, "grad_norm": 15.845905303955078, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8681053519248962, "num_tokens": 268682446.0, "step": 7043 }, { "epoch": 0.8960692023915532, "ewc_loss": 0.02486484684050083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.486484663677402e-05, "grad_norm": 15.9075288772583, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8705527782440186, "num_tokens": 268720262.0, "step": 7044 }, { "epoch": 0.8961964126701437, "ewc_loss": 0.024797243997454643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4797243895591237e-05, "grad_norm": 15.839515686035156, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8487884998321533, "num_tokens": 268760519.0, "step": 7045 }, { "epoch": 0.8963236229487342, "ewc_loss": 0.02483372576534748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4833725547068752e-05, "grad_norm": 15.956395149230957, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.854993999004364, "num_tokens": 268796621.0, "step": 7046 }, { "epoch": 0.8964508332273248, "ewc_loss": 0.02479502372443676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4795022909529507e-05, "grad_norm": 15.867964744567871, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8700535297393799, "num_tokens": 268830527.0, "step": 7047 }, { "epoch": 0.8965780435059153, "ewc_loss": 0.02478722669184208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.478722672094591e-05, "grad_norm": 15.910531997680664, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8659350872039795, "num_tokens": 268862112.0, "step": 7048 }, { "epoch": 0.8967052537845058, "ewc_loss": 0.024765949696302414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4765950001892634e-05, "grad_norm": 15.83687973022461, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8602153062820435, "num_tokens": 268901089.0, "step": 7049 }, { "epoch": 0.8968324640630962, "ewc_loss": 0.024735189974308014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4735190891078673e-05, "grad_norm": 15.866796493530273, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8644418120384216, "num_tokens": 268942766.0, "step": 7050 }, { "epoch": 0.8969596743416868, "ewc_loss": 0.024769125506281853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4769125957391225e-05, "grad_norm": 15.890620231628418, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8685016632080078, "num_tokens": 268978642.0, "step": 7051 }, { "epoch": 0.8970868846202773, "ewc_loss": 0.024804402142763138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.480440161889419e-05, "grad_norm": 15.86684513092041, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8684384822845459, "num_tokens": 269016048.0, "step": 7052 }, { "epoch": 0.8972140948988678, "ewc_loss": 0.024701949208974838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4701948859728873e-05, "grad_norm": 15.86218547821045, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8636172413825989, "num_tokens": 269054381.0, "step": 7053 }, { "epoch": 0.8973413051774584, "ewc_loss": 0.02478628233075142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.478628266544547e-05, "grad_norm": 15.8787841796875, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8661287426948547, "num_tokens": 269090672.0, "step": 7054 }, { "epoch": 0.8974685154560489, "ewc_loss": 0.024760635569691658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4760634914855473e-05, "grad_norm": 15.869889259338379, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8659602403640747, "num_tokens": 269130231.0, "step": 7055 }, { "epoch": 0.8975957257346393, "ewc_loss": 0.02478748746216297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4787486836430617e-05, "grad_norm": 15.914459228515625, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8668690919876099, "num_tokens": 269166282.0, "step": 7056 }, { "epoch": 0.8977229360132298, "ewc_loss": 0.024822697043418884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4822697014315054e-05, "grad_norm": 15.946463584899902, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8661384582519531, "num_tokens": 269205326.0, "step": 7057 }, { "epoch": 0.8978501462918204, "ewc_loss": 0.024797655642032623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4797654987196438e-05, "grad_norm": 15.899556159973145, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8639125823974609, "num_tokens": 269243903.0, "step": 7058 }, { "epoch": 0.8979773565704109, "ewc_loss": 0.024736519902944565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4736520572332665e-05, "grad_norm": 15.877839088439941, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8566117286682129, "num_tokens": 269288330.0, "step": 7059 }, { "epoch": 0.8981045668490014, "ewc_loss": 0.02475859597325325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4758595827734098e-05, "grad_norm": 15.884050369262695, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8513698577880859, "num_tokens": 269328731.0, "step": 7060 }, { "epoch": 0.898231777127592, "ewc_loss": 0.024754146113991737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4754146579653025e-05, "grad_norm": 15.966440200805664, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8630424737930298, "num_tokens": 269369576.0, "step": 7061 }, { "epoch": 0.8983589874061825, "ewc_loss": 0.024754155427217484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4754155674600042e-05, "grad_norm": 15.838068008422852, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8614699840545654, "num_tokens": 269407734.0, "step": 7062 }, { "epoch": 0.8984861976847729, "ewc_loss": 0.02471931278705597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.471931293257512e-05, "grad_norm": 15.908982276916504, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.841206967830658, "num_tokens": 269444042.0, "step": 7063 }, { "epoch": 0.8986134079633634, "ewc_loss": 0.024803200736641884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.480320108588785e-05, "grad_norm": 15.872476577758789, "learning_rate": 1e-06, "loss": 0.3586, "mean_token_accuracy": 0.8848729133605957, "num_tokens": 269480979.0, "step": 7064 }, { "epoch": 0.898740618241954, "ewc_loss": 0.024761775508522987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4761775421211496e-05, "grad_norm": 15.961989402770996, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8641448020935059, "num_tokens": 269515594.0, "step": 7065 }, { "epoch": 0.8988678285205445, "ewc_loss": 0.024804893881082535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.480489456502255e-05, "grad_norm": 15.902246475219727, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8656405210494995, "num_tokens": 269556974.0, "step": 7066 }, { "epoch": 0.898995038799135, "ewc_loss": 0.024739986285567284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.473998574714642e-05, "grad_norm": 15.883340835571289, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8556208610534668, "num_tokens": 269591088.0, "step": 7067 }, { "epoch": 0.8991222490777255, "ewc_loss": 0.024748651310801506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4748651412664913e-05, "grad_norm": 15.944255828857422, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8716623783111572, "num_tokens": 269627921.0, "step": 7068 }, { "epoch": 0.899249459356316, "ewc_loss": 0.02478114143013954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.478114220139105e-05, "grad_norm": 15.919148445129395, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8471516370773315, "num_tokens": 269666586.0, "step": 7069 }, { "epoch": 0.8993766696349065, "ewc_loss": 0.02477864734828472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4778646547929384e-05, "grad_norm": 15.925200462341309, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8647068738937378, "num_tokens": 269701205.0, "step": 7070 }, { "epoch": 0.899503879913497, "ewc_loss": 0.024789536371827126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4789536837488413e-05, "grad_norm": 15.886635780334473, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8750951290130615, "num_tokens": 269739982.0, "step": 7071 }, { "epoch": 0.8996310901920875, "ewc_loss": 0.02478879876434803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4788798327790573e-05, "grad_norm": 15.949910163879395, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8642440438270569, "num_tokens": 269773048.0, "step": 7072 }, { "epoch": 0.8997583004706781, "ewc_loss": 0.02482430450618267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.482430500094779e-05, "grad_norm": 15.86838150024414, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.86197429895401, "num_tokens": 269811343.0, "step": 7073 }, { "epoch": 0.8998855107492686, "ewc_loss": 0.024782268330454826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4782268155831844e-05, "grad_norm": 15.924006462097168, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8678265810012817, "num_tokens": 269851601.0, "step": 7074 }, { "epoch": 0.900012721027859, "ewc_loss": 0.024798128753900528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.479812792444136e-05, "grad_norm": 15.830117225646973, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8712260723114014, "num_tokens": 269891396.0, "step": 7075 }, { "epoch": 0.9001399313064495, "ewc_loss": 0.02477748692035675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4777487851679325e-05, "grad_norm": 15.899591445922852, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8624770045280457, "num_tokens": 269930292.0, "step": 7076 }, { "epoch": 0.9002671415850401, "ewc_loss": 0.02486061304807663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4860613848431967e-05, "grad_norm": 15.898073196411133, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8358663320541382, "num_tokens": 269968464.0, "step": 7077 }, { "epoch": 0.9003943518636306, "ewc_loss": 0.024785567075014114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4785567802609876e-05, "grad_norm": 15.851496696472168, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.849878191947937, "num_tokens": 270011569.0, "step": 7078 }, { "epoch": 0.9005215621422211, "ewc_loss": 0.02483639121055603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4836390366544947e-05, "grad_norm": 15.902578353881836, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8587357401847839, "num_tokens": 270050608.0, "step": 7079 }, { "epoch": 0.9006487724208116, "ewc_loss": 0.024834049865603447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4834049327182584e-05, "grad_norm": 15.876093864440918, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8483335971832275, "num_tokens": 270096100.0, "step": 7080 }, { "epoch": 0.9007759826994021, "ewc_loss": 0.024831622838974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4831622795318253e-05, "grad_norm": 15.922337532043457, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8726334571838379, "num_tokens": 270140915.0, "step": 7081 }, { "epoch": 0.9009031929779926, "ewc_loss": 0.024808567017316818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.480856710462831e-05, "grad_norm": 15.803787231445312, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8625523447990417, "num_tokens": 270184233.0, "step": 7082 }, { "epoch": 0.9010304032565831, "ewc_loss": 0.024790821596980095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4790821044007316e-05, "grad_norm": 15.92176342010498, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.86110919713974, "num_tokens": 270223889.0, "step": 7083 }, { "epoch": 0.9011576135351737, "ewc_loss": 0.02483157254755497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4831571863614954e-05, "grad_norm": 15.83477783203125, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8679133653640747, "num_tokens": 270266357.0, "step": 7084 }, { "epoch": 0.9012848238137642, "ewc_loss": 0.02476467192173004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.476467125234194e-05, "grad_norm": 15.933123588562012, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.870295524597168, "num_tokens": 270298252.0, "step": 7085 }, { "epoch": 0.9014120340923547, "ewc_loss": 0.02478126809000969, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4781267711659893e-05, "grad_norm": 15.858641624450684, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8558754920959473, "num_tokens": 270343187.0, "step": 7086 }, { "epoch": 0.9015392443709451, "ewc_loss": 0.02478129044175148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.478129135852214e-05, "grad_norm": 15.930057525634766, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8513389825820923, "num_tokens": 270385094.0, "step": 7087 }, { "epoch": 0.9016664546495357, "ewc_loss": 0.02481798827648163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4817987650749274e-05, "grad_norm": 15.908981323242188, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8495150804519653, "num_tokens": 270421772.0, "step": 7088 }, { "epoch": 0.9017936649281262, "ewc_loss": 0.02479359693825245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4793596821837127e-05, "grad_norm": 15.878946304321289, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8444203734397888, "num_tokens": 270465033.0, "step": 7089 }, { "epoch": 0.9019208752067167, "ewc_loss": 0.02477344498038292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4773444238235243e-05, "grad_norm": 15.927233695983887, "learning_rate": 1e-06, "loss": 0.3551, "mean_token_accuracy": 0.8852856755256653, "num_tokens": 270501370.0, "step": 7090 }, { "epoch": 0.9020480854853072, "ewc_loss": 0.024740327149629593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4740327717154287e-05, "grad_norm": 15.812664031982422, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8666781187057495, "num_tokens": 270543571.0, "step": 7091 }, { "epoch": 0.9021752957638978, "ewc_loss": 0.024799292907118797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.479929207765963e-05, "grad_norm": 16.02252960205078, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8718568086624146, "num_tokens": 270580635.0, "step": 7092 }, { "epoch": 0.9023025060424882, "ewc_loss": 0.02480671927332878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4806719011394307e-05, "grad_norm": 15.863497734069824, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8756912350654602, "num_tokens": 270616534.0, "step": 7093 }, { "epoch": 0.9024297163210787, "ewc_loss": 0.024764707311987877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4764707632130012e-05, "grad_norm": 15.96483039855957, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8436065912246704, "num_tokens": 270653998.0, "step": 7094 }, { "epoch": 0.9025569265996692, "ewc_loss": 0.024818362668156624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4818362362566404e-05, "grad_norm": 15.909256935119629, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8627145886421204, "num_tokens": 270688023.0, "step": 7095 }, { "epoch": 0.9026841368782598, "ewc_loss": 0.024775004014372826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.477500311215408e-05, "grad_norm": 15.906932830810547, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8448447585105896, "num_tokens": 270721298.0, "step": 7096 }, { "epoch": 0.9028113471568503, "ewc_loss": 0.02482103370130062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4821034458000213e-05, "grad_norm": 15.881599426269531, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8680784702301025, "num_tokens": 270762182.0, "step": 7097 }, { "epoch": 0.9029385574354408, "ewc_loss": 0.024829335510730743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4829336325637996e-05, "grad_norm": 15.9086332321167, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8777533769607544, "num_tokens": 270799577.0, "step": 7098 }, { "epoch": 0.9030657677140312, "ewc_loss": 0.024786675348877907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4786675567156635e-05, "grad_norm": 15.886164665222168, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8630374670028687, "num_tokens": 270840218.0, "step": 7099 }, { "epoch": 0.9031929779926218, "ewc_loss": 0.02478596195578575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4785962523310445e-05, "grad_norm": 15.899557113647461, "learning_rate": 1e-06, "loss": 0.3615, "mean_token_accuracy": 0.8835409283638, "num_tokens": 270872982.0, "step": 7100 }, { "epoch": 0.9033201882712123, "ewc_loss": 0.02483423613011837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.483423668309115e-05, "grad_norm": 15.949968338012695, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.866645097732544, "num_tokens": 270908381.0, "step": 7101 }, { "epoch": 0.9034473985498028, "ewc_loss": 0.024748271331191063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.474827124387957e-05, "grad_norm": 15.884013175964355, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8619154691696167, "num_tokens": 270949217.0, "step": 7102 }, { "epoch": 0.9035746088283934, "ewc_loss": 0.02480793371796608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4807934096315876e-05, "grad_norm": 15.988595962524414, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8467233777046204, "num_tokens": 270989667.0, "step": 7103 }, { "epoch": 0.9037018191069839, "ewc_loss": 0.02479892037808895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4798921003821306e-05, "grad_norm": 15.893080711364746, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8526396751403809, "num_tokens": 271030395.0, "step": 7104 }, { "epoch": 0.9038290293855743, "ewc_loss": 0.024777207523584366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.477720772731118e-05, "grad_norm": 15.94057559967041, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.844982385635376, "num_tokens": 271068239.0, "step": 7105 }, { "epoch": 0.9039562396641648, "ewc_loss": 0.024807630106806755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4807630325085483e-05, "grad_norm": 15.99812126159668, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8565313816070557, "num_tokens": 271110294.0, "step": 7106 }, { "epoch": 0.9040834499427554, "ewc_loss": 0.024784928187727928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4784927518339828e-05, "grad_norm": 15.828577995300293, "learning_rate": 1e-06, "loss": 0.3724, "mean_token_accuracy": 0.8804643154144287, "num_tokens": 271143288.0, "step": 7107 }, { "epoch": 0.9042106602213459, "ewc_loss": 0.02475065551698208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.475065593898762e-05, "grad_norm": 15.893820762634277, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8703051209449768, "num_tokens": 271181183.0, "step": 7108 }, { "epoch": 0.9043378704999364, "ewc_loss": 0.024870898574590683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4870898414519615e-05, "grad_norm": 15.898633003234863, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8616346120834351, "num_tokens": 271216201.0, "step": 7109 }, { "epoch": 0.9044650807785269, "ewc_loss": 0.02475425973534584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4754259356996045e-05, "grad_norm": 15.889108657836914, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8691771030426025, "num_tokens": 271250158.0, "step": 7110 }, { "epoch": 0.9045922910571175, "ewc_loss": 0.024842627346515656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4842627681209706e-05, "grad_norm": 15.898482322692871, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8756762146949768, "num_tokens": 271289994.0, "step": 7111 }, { "epoch": 0.9047195013357079, "ewc_loss": 0.024816462770104408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4816463337629102e-05, "grad_norm": 15.901131629943848, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8689213395118713, "num_tokens": 271325782.0, "step": 7112 }, { "epoch": 0.9048467116142984, "ewc_loss": 0.024815335869789124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4815335564198904e-05, "grad_norm": 15.923847198486328, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8504739999771118, "num_tokens": 271361604.0, "step": 7113 }, { "epoch": 0.9049739218928889, "ewc_loss": 0.024836242198944092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.483624302840326e-05, "grad_norm": 15.905630111694336, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8723903894424438, "num_tokens": 271398520.0, "step": 7114 }, { "epoch": 0.9051011321714795, "ewc_loss": 0.024811098352074623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4811099137878045e-05, "grad_norm": 15.873016357421875, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8571490049362183, "num_tokens": 271445138.0, "step": 7115 }, { "epoch": 0.90522834245007, "ewc_loss": 0.024820124730467796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.482012496329844e-05, "grad_norm": 15.954405784606934, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.868104100227356, "num_tokens": 271479147.0, "step": 7116 }, { "epoch": 0.9053555527286605, "ewc_loss": 0.02478513866662979, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.478513852111064e-05, "grad_norm": 15.835531234741211, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8645636439323425, "num_tokens": 271512477.0, "step": 7117 }, { "epoch": 0.905482763007251, "ewc_loss": 0.024850819259881973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4850818590493873e-05, "grad_norm": 15.972716331481934, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8494513034820557, "num_tokens": 271552355.0, "step": 7118 }, { "epoch": 0.9056099732858415, "ewc_loss": 0.02486952766776085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4869526896509342e-05, "grad_norm": 15.892611503601074, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8591731786727905, "num_tokens": 271595825.0, "step": 7119 }, { "epoch": 0.905737183564432, "ewc_loss": 0.024771451950073242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.477145244483836e-05, "grad_norm": 15.868839263916016, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8502717614173889, "num_tokens": 271627384.0, "step": 7120 }, { "epoch": 0.9058643938430225, "ewc_loss": 0.024928385391831398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.492838575562928e-05, "grad_norm": 15.996515274047852, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8720589876174927, "num_tokens": 271666711.0, "step": 7121 }, { "epoch": 0.9059916041216131, "ewc_loss": 0.024871088564395905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4871089408406988e-05, "grad_norm": 15.894495964050293, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8689550161361694, "num_tokens": 271703145.0, "step": 7122 }, { "epoch": 0.9061188144002036, "ewc_loss": 0.024803316220641136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4803315682220273e-05, "grad_norm": 15.850730895996094, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8511714935302734, "num_tokens": 271743131.0, "step": 7123 }, { "epoch": 0.906246024678794, "ewc_loss": 0.024911988526582718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4911989385145716e-05, "grad_norm": 16.079010009765625, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8667685985565186, "num_tokens": 271778245.0, "step": 7124 }, { "epoch": 0.9063732349573845, "ewc_loss": 0.024900099262595177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4900098651414737e-05, "grad_norm": 15.868064880371094, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8541496396064758, "num_tokens": 271817279.0, "step": 7125 }, { "epoch": 0.9065004452359751, "ewc_loss": 0.024765368551015854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.47653679252835e-05, "grad_norm": 15.871538162231445, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.869485080242157, "num_tokens": 271857136.0, "step": 7126 }, { "epoch": 0.9066276555145656, "ewc_loss": 0.024905497208237648, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.490549741196446e-05, "grad_norm": 15.86109733581543, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8713256120681763, "num_tokens": 271891651.0, "step": 7127 }, { "epoch": 0.9067548657931561, "ewc_loss": 0.02488325722515583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4883256628527306e-05, "grad_norm": 15.960122108459473, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.866838812828064, "num_tokens": 271925805.0, "step": 7128 }, { "epoch": 0.9068820760717466, "ewc_loss": 0.024912316352128983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4912316803238355e-05, "grad_norm": 15.849671363830566, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8673145771026611, "num_tokens": 271964954.0, "step": 7129 }, { "epoch": 0.9070092863503371, "ewc_loss": 0.024915460497140884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4915460016927682e-05, "grad_norm": 16.03403091430664, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8622605800628662, "num_tokens": 272000331.0, "step": 7130 }, { "epoch": 0.9071364966289276, "ewc_loss": 0.024914182722568512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4914183086366393e-05, "grad_norm": 15.74928092956543, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.869742751121521, "num_tokens": 272037853.0, "step": 7131 }, { "epoch": 0.9072637069075181, "ewc_loss": 0.024843258783221245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4843258870532736e-05, "grad_norm": 15.955062866210938, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8727958798408508, "num_tokens": 272077663.0, "step": 7132 }, { "epoch": 0.9073909171861086, "ewc_loss": 0.02500193566083908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5001936592161655e-05, "grad_norm": 15.91383171081543, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8584779500961304, "num_tokens": 272116942.0, "step": 7133 }, { "epoch": 0.9075181274646992, "ewc_loss": 0.024818483740091324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.481848423485644e-05, "grad_norm": 15.89370346069336, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8566666841506958, "num_tokens": 272151319.0, "step": 7134 }, { "epoch": 0.9076453377432897, "ewc_loss": 0.024913592264056206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.491359191481024e-05, "grad_norm": 15.839797019958496, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8498951196670532, "num_tokens": 272190817.0, "step": 7135 }, { "epoch": 0.9077725480218801, "ewc_loss": 0.02491036057472229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4910361389629543e-05, "grad_norm": 15.97157096862793, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8571582436561584, "num_tokens": 272230461.0, "step": 7136 }, { "epoch": 0.9078997583004706, "ewc_loss": 0.02494293451309204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.494293403287884e-05, "grad_norm": 15.91320514678955, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8566970825195312, "num_tokens": 272268510.0, "step": 7137 }, { "epoch": 0.9080269685790612, "ewc_loss": 0.024896321818232536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4896322429412976e-05, "grad_norm": 15.963400840759277, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.86351478099823, "num_tokens": 272312430.0, "step": 7138 }, { "epoch": 0.9081541788576517, "ewc_loss": 0.024921316653490067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.49213171628071e-05, "grad_norm": 15.87014389038086, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8614093661308289, "num_tokens": 272348437.0, "step": 7139 }, { "epoch": 0.9082813891362422, "ewc_loss": 0.024858281016349792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.485828190401662e-05, "grad_norm": 16.000591278076172, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8558584451675415, "num_tokens": 272378902.0, "step": 7140 }, { "epoch": 0.9084085994148328, "ewc_loss": 0.0249261986464262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4926199330366217e-05, "grad_norm": 15.892056465148926, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8771423101425171, "num_tokens": 272421257.0, "step": 7141 }, { "epoch": 0.9085358096934232, "ewc_loss": 0.02484114281833172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4841143385856412e-05, "grad_norm": 15.912772178649902, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8619740605354309, "num_tokens": 272459800.0, "step": 7142 }, { "epoch": 0.9086630199720137, "ewc_loss": 0.024917051196098328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4917051632655784e-05, "grad_norm": 15.869332313537598, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.8821626901626587, "num_tokens": 272501245.0, "step": 7143 }, { "epoch": 0.9087902302506042, "ewc_loss": 0.024847298860549927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.484729884599801e-05, "grad_norm": 15.896074295043945, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8643152713775635, "num_tokens": 272535873.0, "step": 7144 }, { "epoch": 0.9089174405291948, "ewc_loss": 0.024881359189748764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4881359422579408e-05, "grad_norm": 15.88978385925293, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8613872528076172, "num_tokens": 272575213.0, "step": 7145 }, { "epoch": 0.9090446508077853, "ewc_loss": 0.024889221414923668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4889221094781533e-05, "grad_norm": 15.930927276611328, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8651950359344482, "num_tokens": 272611960.0, "step": 7146 }, { "epoch": 0.9091718610863758, "ewc_loss": 0.024871541187167168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4871540517779067e-05, "grad_norm": 15.870614051818848, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8631609678268433, "num_tokens": 272650895.0, "step": 7147 }, { "epoch": 0.9092990713649662, "ewc_loss": 0.02491612732410431, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4916127586038783e-05, "grad_norm": 15.994665145874023, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8634935617446899, "num_tokens": 272690617.0, "step": 7148 }, { "epoch": 0.9094262816435568, "ewc_loss": 0.02485497109591961, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4854971343302168e-05, "grad_norm": 15.797826766967773, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8547192811965942, "num_tokens": 272732526.0, "step": 7149 }, { "epoch": 0.9095534919221473, "ewc_loss": 0.024831676855683327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.483167736500036e-05, "grad_norm": 16.007524490356445, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8610756993293762, "num_tokens": 272769882.0, "step": 7150 }, { "epoch": 0.9096807022007378, "ewc_loss": 0.024945460259914398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4945460609160364e-05, "grad_norm": 15.913625717163086, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8655195236206055, "num_tokens": 272809614.0, "step": 7151 }, { "epoch": 0.9098079124793284, "ewc_loss": 0.024820389226078987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4820388716761954e-05, "grad_norm": 15.908435821533203, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8556872010231018, "num_tokens": 272844819.0, "step": 7152 }, { "epoch": 0.9099351227579189, "ewc_loss": 0.024932347238063812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.49323475145502e-05, "grad_norm": 15.959343910217285, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8684805631637573, "num_tokens": 272882798.0, "step": 7153 }, { "epoch": 0.9100623330365093, "ewc_loss": 0.024858469143509865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4858469259925187e-05, "grad_norm": 15.961597442626953, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8660683631896973, "num_tokens": 272917472.0, "step": 7154 }, { "epoch": 0.9101895433150998, "ewc_loss": 0.024869104847311974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.486910489096772e-05, "grad_norm": 15.888848304748535, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8524836301803589, "num_tokens": 272959947.0, "step": 7155 }, { "epoch": 0.9103167535936904, "ewc_loss": 0.024828553199768066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4828552341205068e-05, "grad_norm": 15.910338401794434, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8460376858711243, "num_tokens": 272998194.0, "step": 7156 }, { "epoch": 0.9104439638722809, "ewc_loss": 0.024899473413825035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4899472919059917e-05, "grad_norm": 15.923385620117188, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8796241283416748, "num_tokens": 273034688.0, "step": 7157 }, { "epoch": 0.9105711741508714, "ewc_loss": 0.024828460067510605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4828459572745487e-05, "grad_norm": 15.914346694946289, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8733972311019897, "num_tokens": 273071575.0, "step": 7158 }, { "epoch": 0.9106983844294619, "ewc_loss": 0.024847524240612984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.484752440068405e-05, "grad_norm": 15.905965805053711, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8376718759536743, "num_tokens": 273106106.0, "step": 7159 }, { "epoch": 0.9108255947080524, "ewc_loss": 0.02489366941154003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4893668523873203e-05, "grad_norm": 15.922178268432617, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8576038479804993, "num_tokens": 273142959.0, "step": 7160 }, { "epoch": 0.9109528049866429, "ewc_loss": 0.02488170564174652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.488170503056608e-05, "grad_norm": 15.987936019897461, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8477123975753784, "num_tokens": 273182781.0, "step": 7161 }, { "epoch": 0.9110800152652334, "ewc_loss": 0.02487126737833023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4871267669368535e-05, "grad_norm": 15.88898754119873, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8696523904800415, "num_tokens": 273225738.0, "step": 7162 }, { "epoch": 0.9112072255438239, "ewc_loss": 0.02482670173048973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4826702428981662e-05, "grad_norm": 15.918980598449707, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8702318668365479, "num_tokens": 273269166.0, "step": 7163 }, { "epoch": 0.9113344358224145, "ewc_loss": 0.02488134428858757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.488134487066418e-05, "grad_norm": 15.936102867126465, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8577836155891418, "num_tokens": 273309093.0, "step": 7164 }, { "epoch": 0.911461646101005, "ewc_loss": 0.02485889010131359, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4858889446477406e-05, "grad_norm": 15.968234062194824, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8683021068572998, "num_tokens": 273346857.0, "step": 7165 }, { "epoch": 0.9115888563795955, "ewc_loss": 0.024851085618138313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4851085981936194e-05, "grad_norm": 15.86829662322998, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8583643436431885, "num_tokens": 273385720.0, "step": 7166 }, { "epoch": 0.9117160666581859, "ewc_loss": 0.02482626587152481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.482626587152481e-05, "grad_norm": 15.97358512878418, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8636130094528198, "num_tokens": 273431079.0, "step": 7167 }, { "epoch": 0.9118432769367765, "ewc_loss": 0.0248581413179636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.485814184183255e-05, "grad_norm": 15.887640953063965, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8645726442337036, "num_tokens": 273477316.0, "step": 7168 }, { "epoch": 0.911970487215367, "ewc_loss": 0.024836096912622452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4836097509250976e-05, "grad_norm": 15.969576835632324, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.866603672504425, "num_tokens": 273512831.0, "step": 7169 }, { "epoch": 0.9120976974939575, "ewc_loss": 0.024836067110300064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4836066586431116e-05, "grad_norm": 15.965107917785645, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8568536639213562, "num_tokens": 273551927.0, "step": 7170 }, { "epoch": 0.912224907772548, "ewc_loss": 0.024798233062028885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4798233425826766e-05, "grad_norm": 15.879826545715332, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8655458688735962, "num_tokens": 273583053.0, "step": 7171 }, { "epoch": 0.9123521180511386, "ewc_loss": 0.024792108684778214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4792108888505027e-05, "grad_norm": 15.939230918884277, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8530060052871704, "num_tokens": 273617953.0, "step": 7172 }, { "epoch": 0.912479328329729, "ewc_loss": 0.024852534756064415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4852533897501417e-05, "grad_norm": 16.004255294799805, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.849109411239624, "num_tokens": 273649959.0, "step": 7173 }, { "epoch": 0.9126065386083195, "ewc_loss": 0.02482769452035427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4827693778206594e-05, "grad_norm": 15.922592163085938, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8529441952705383, "num_tokens": 273682666.0, "step": 7174 }, { "epoch": 0.9127337488869101, "ewc_loss": 0.024826757609844208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4826756998663768e-05, "grad_norm": 15.943196296691895, "learning_rate": 1e-06, "loss": 0.3668, "mean_token_accuracy": 0.8816148042678833, "num_tokens": 273718641.0, "step": 7175 }, { "epoch": 0.9128609591655006, "ewc_loss": 0.02482159249484539, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.48215928877471e-05, "grad_norm": 15.825119972229004, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8732613325119019, "num_tokens": 273760502.0, "step": 7176 }, { "epoch": 0.9129881694440911, "ewc_loss": 0.02483328990638256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4833290808601305e-05, "grad_norm": 15.922042846679688, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8570543527603149, "num_tokens": 273795068.0, "step": 7177 }, { "epoch": 0.9131153797226816, "ewc_loss": 0.0249228086322546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4922808734118007e-05, "grad_norm": 15.965359687805176, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8445099592208862, "num_tokens": 273825099.0, "step": 7178 }, { "epoch": 0.9132425900012721, "ewc_loss": 0.024904586374759674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4904586098273285e-05, "grad_norm": 15.863431930541992, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8694837689399719, "num_tokens": 273862566.0, "step": 7179 }, { "epoch": 0.9133698002798626, "ewc_loss": 0.024945825338363647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4945824407041073e-05, "grad_norm": 15.925050735473633, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8637262582778931, "num_tokens": 273896107.0, "step": 7180 }, { "epoch": 0.9134970105584531, "ewc_loss": 0.025015050545334816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5015049686771818e-05, "grad_norm": 15.951374053955078, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8687317371368408, "num_tokens": 273934066.0, "step": 7181 }, { "epoch": 0.9136242208370436, "ewc_loss": 0.024950552731752396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.495055196050089e-05, "grad_norm": 15.983187675476074, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8614881634712219, "num_tokens": 273971843.0, "step": 7182 }, { "epoch": 0.9137514311156342, "ewc_loss": 0.02494462952017784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4944629331002943e-05, "grad_norm": 15.931127548217773, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8549609184265137, "num_tokens": 274001646.0, "step": 7183 }, { "epoch": 0.9138786413942247, "ewc_loss": 0.025019600987434387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.501960079825949e-05, "grad_norm": 15.980091094970703, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8480833768844604, "num_tokens": 274042656.0, "step": 7184 }, { "epoch": 0.9140058516728151, "ewc_loss": 0.024998169392347336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4998169465106912e-05, "grad_norm": 15.958148956298828, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8492735624313354, "num_tokens": 274084609.0, "step": 7185 }, { "epoch": 0.9141330619514056, "ewc_loss": 0.02498522959649563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4985229174490087e-05, "grad_norm": 15.87830638885498, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8660820722579956, "num_tokens": 274127503.0, "step": 7186 }, { "epoch": 0.9142602722299962, "ewc_loss": 0.02497129701077938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.497129753464833e-05, "grad_norm": 15.982748985290527, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8664993047714233, "num_tokens": 274164126.0, "step": 7187 }, { "epoch": 0.9143874825085867, "ewc_loss": 0.025020865723490715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5020864995894954e-05, "grad_norm": 15.886616706848145, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8619643449783325, "num_tokens": 274198104.0, "step": 7188 }, { "epoch": 0.9145146927871772, "ewc_loss": 0.024963703006505966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4963703253888525e-05, "grad_norm": 15.986968994140625, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8518801331520081, "num_tokens": 274233487.0, "step": 7189 }, { "epoch": 0.9146419030657678, "ewc_loss": 0.025041932240128517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5041932531166822e-05, "grad_norm": 15.936863899230957, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8672780394554138, "num_tokens": 274267317.0, "step": 7190 }, { "epoch": 0.9147691133443582, "ewc_loss": 0.025004589930176735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.500459049770143e-05, "grad_norm": 16.023029327392578, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.856013834476471, "num_tokens": 274304169.0, "step": 7191 }, { "epoch": 0.9148963236229487, "ewc_loss": 0.025000061839818954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.50000612140866e-05, "grad_norm": 15.890146255493164, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8636797666549683, "num_tokens": 274343265.0, "step": 7192 }, { "epoch": 0.9150235339015392, "ewc_loss": 0.024970393627882004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4970393496914767e-05, "grad_norm": 15.889741897583008, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.847334623336792, "num_tokens": 274380756.0, "step": 7193 }, { "epoch": 0.9151507441801298, "ewc_loss": 0.025026297196745872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.502629649825394e-05, "grad_norm": 15.927862167358398, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8726677894592285, "num_tokens": 274422035.0, "step": 7194 }, { "epoch": 0.9152779544587203, "ewc_loss": 0.025029951706528664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5029950847965665e-05, "grad_norm": 15.924809455871582, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8523224592208862, "num_tokens": 274460875.0, "step": 7195 }, { "epoch": 0.9154051647373108, "ewc_loss": 0.02502325363457203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.502325332898181e-05, "grad_norm": 15.963287353515625, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8653771877288818, "num_tokens": 274495807.0, "step": 7196 }, { "epoch": 0.9155323750159012, "ewc_loss": 0.025000743567943573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.500074333511293e-05, "grad_norm": 15.949914932250977, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8702268600463867, "num_tokens": 274533098.0, "step": 7197 }, { "epoch": 0.9156595852944918, "ewc_loss": 0.02499544993042946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.499545007594861e-05, "grad_norm": 15.911162376403809, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8580694198608398, "num_tokens": 274565282.0, "step": 7198 }, { "epoch": 0.9157867955730823, "ewc_loss": 0.025005195289850235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5005194402183406e-05, "grad_norm": 15.995948791503906, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8656395673751831, "num_tokens": 274605401.0, "step": 7199 }, { "epoch": 0.9159140058516728, "ewc_loss": 0.02503342740237713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.503342693671584e-05, "grad_norm": 16.002084732055664, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8568695783615112, "num_tokens": 274645018.0, "step": 7200 }, { "epoch": 0.9160412161302633, "ewc_loss": 0.02498806081712246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4988061340991408e-05, "grad_norm": 15.914987564086914, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8719455003738403, "num_tokens": 274681541.0, "step": 7201 }, { "epoch": 0.9161684264088539, "ewc_loss": 0.02495124563574791, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.495124499546364e-05, "grad_norm": 15.906213760375977, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8668326735496521, "num_tokens": 274716883.0, "step": 7202 }, { "epoch": 0.9162956366874443, "ewc_loss": 0.025036761537194252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.503676114429254e-05, "grad_norm": 15.94714641571045, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8570322394371033, "num_tokens": 274753038.0, "step": 7203 }, { "epoch": 0.9164228469660348, "ewc_loss": 0.025013908743858337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5013909180415794e-05, "grad_norm": 15.899129867553711, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.869455873966217, "num_tokens": 274790760.0, "step": 7204 }, { "epoch": 0.9165500572446253, "ewc_loss": 0.02505098655819893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.505098564142827e-05, "grad_norm": 15.910415649414062, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8549779057502747, "num_tokens": 274830872.0, "step": 7205 }, { "epoch": 0.9166772675232159, "ewc_loss": 0.025076886638998985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.507688623154536e-05, "grad_norm": 15.947013854980469, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8499391078948975, "num_tokens": 274869832.0, "step": 7206 }, { "epoch": 0.9168044778018064, "ewc_loss": 0.025090493261814117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5090494091273285e-05, "grad_norm": 15.991771697998047, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8548222780227661, "num_tokens": 274903853.0, "step": 7207 }, { "epoch": 0.9169316880803969, "ewc_loss": 0.025034509599208832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.503450923541095e-05, "grad_norm": 15.882887840270996, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8617719411849976, "num_tokens": 274942911.0, "step": 7208 }, { "epoch": 0.9170588983589874, "ewc_loss": 0.025051729753613472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.505172960809432e-05, "grad_norm": 15.961745262145996, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8697680234909058, "num_tokens": 274986427.0, "step": 7209 }, { "epoch": 0.9171861086375779, "ewc_loss": 0.02507290430366993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5072904463740997e-05, "grad_norm": 15.958829879760742, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8645734786987305, "num_tokens": 275021557.0, "step": 7210 }, { "epoch": 0.9173133189161684, "ewc_loss": 0.02502363920211792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.502363895473536e-05, "grad_norm": 15.931004524230957, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8514338731765747, "num_tokens": 275061308.0, "step": 7211 }, { "epoch": 0.9174405291947589, "ewc_loss": 0.025070425122976303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5070425181183964e-05, "grad_norm": 15.967535972595215, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8556946516036987, "num_tokens": 275101497.0, "step": 7212 }, { "epoch": 0.9175677394733495, "ewc_loss": 0.024996181949973106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4996181309688836e-05, "grad_norm": 15.944855690002441, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8501687049865723, "num_tokens": 275138317.0, "step": 7213 }, { "epoch": 0.91769494975194, "ewc_loss": 0.02502167411148548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.502167444617953e-05, "grad_norm": 15.918039321899414, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8616113662719727, "num_tokens": 275180119.0, "step": 7214 }, { "epoch": 0.9178221600305305, "ewc_loss": 0.025068864226341248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.506886448827572e-05, "grad_norm": 15.893634796142578, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8602654337882996, "num_tokens": 275212083.0, "step": 7215 }, { "epoch": 0.9179493703091209, "ewc_loss": 0.02502504549920559, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5025045033544302e-05, "grad_norm": 15.928013801574707, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8516768217086792, "num_tokens": 275247872.0, "step": 7216 }, { "epoch": 0.9180765805877115, "ewc_loss": 0.025047672912478447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5047673261724412e-05, "grad_norm": 15.86535358428955, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8667439222335815, "num_tokens": 275287022.0, "step": 7217 }, { "epoch": 0.918203790866302, "ewc_loss": 0.025057189166545868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.505718839529436e-05, "grad_norm": 16.03874397277832, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8649147748947144, "num_tokens": 275322259.0, "step": 7218 }, { "epoch": 0.9183310011448925, "ewc_loss": 0.025077013298869133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5077013560803607e-05, "grad_norm": 15.830259323120117, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8765566945075989, "num_tokens": 275357410.0, "step": 7219 }, { "epoch": 0.918458211423483, "ewc_loss": 0.02502354420721531, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5023544367286377e-05, "grad_norm": 16.01861000061035, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8743720054626465, "num_tokens": 275388720.0, "step": 7220 }, { "epoch": 0.9185854217020736, "ewc_loss": 0.02516026236116886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.516026324883569e-05, "grad_norm": 15.954731941223145, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8686631917953491, "num_tokens": 275427699.0, "step": 7221 }, { "epoch": 0.918712631980664, "ewc_loss": 0.024985074996948242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4985074560390785e-05, "grad_norm": 15.858352661132812, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8651065826416016, "num_tokens": 275472220.0, "step": 7222 }, { "epoch": 0.9188398422592545, "ewc_loss": 0.02502390369772911, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.502390452718828e-05, "grad_norm": 15.886436462402344, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8495700359344482, "num_tokens": 275506693.0, "step": 7223 }, { "epoch": 0.918967052537845, "ewc_loss": 0.025111667811870575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.511166712793056e-05, "grad_norm": 15.916186332702637, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8656986355781555, "num_tokens": 275547295.0, "step": 7224 }, { "epoch": 0.9190942628164356, "ewc_loss": 0.025042330846190453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5042330889846198e-05, "grad_norm": 15.911161422729492, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8587826490402222, "num_tokens": 275591552.0, "step": 7225 }, { "epoch": 0.9192214730950261, "ewc_loss": 0.02509354054927826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5093540898524225e-05, "grad_norm": 15.900379180908203, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.8375004529953003, "num_tokens": 275632197.0, "step": 7226 }, { "epoch": 0.9193486833736166, "ewc_loss": 0.02505434677004814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5054347133846022e-05, "grad_norm": 15.937858581542969, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8554213047027588, "num_tokens": 275670680.0, "step": 7227 }, { "epoch": 0.919475893652207, "ewc_loss": 0.025049813091754913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5049812393262982e-05, "grad_norm": 15.915538787841797, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8601665496826172, "num_tokens": 275709315.0, "step": 7228 }, { "epoch": 0.9196031039307976, "ewc_loss": 0.02507060207426548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5070601623156108e-05, "grad_norm": 15.994717597961426, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.861426591873169, "num_tokens": 275740405.0, "step": 7229 }, { "epoch": 0.9197303142093881, "ewc_loss": 0.02503599226474762, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.503599171177484e-05, "grad_norm": 15.921167373657227, "learning_rate": 1e-06, "loss": 0.3735, "mean_token_accuracy": 0.8803157210350037, "num_tokens": 275780915.0, "step": 7230 }, { "epoch": 0.9198575244879786, "ewc_loss": 0.025007255375385284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5007255317177624e-05, "grad_norm": 15.976179122924805, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8567468523979187, "num_tokens": 275817832.0, "step": 7231 }, { "epoch": 0.9199847347665692, "ewc_loss": 0.02502347342669964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.502347342669964e-05, "grad_norm": 15.876429557800293, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8659127950668335, "num_tokens": 275857621.0, "step": 7232 }, { "epoch": 0.9201119450451597, "ewc_loss": 0.02496548742055893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4965487682493404e-05, "grad_norm": 15.921465873718262, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.858648419380188, "num_tokens": 275898917.0, "step": 7233 }, { "epoch": 0.9202391553237501, "ewc_loss": 0.02503945119678974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5039451429620385e-05, "grad_norm": 15.935110092163086, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8660880923271179, "num_tokens": 275930151.0, "step": 7234 }, { "epoch": 0.9203663656023406, "ewc_loss": 0.024969927966594696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.496992783562746e-05, "grad_norm": 15.893345832824707, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8692989945411682, "num_tokens": 275966901.0, "step": 7235 }, { "epoch": 0.9204935758809312, "ewc_loss": 0.02497989498078823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4979894078569487e-05, "grad_norm": 15.881507873535156, "learning_rate": 1e-06, "loss": 0.3539, "mean_token_accuracy": 0.8845119476318359, "num_tokens": 276002845.0, "step": 7236 }, { "epoch": 0.9206207861595217, "ewc_loss": 0.025022748857736588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5022749468917027e-05, "grad_norm": 15.956929206848145, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8609117865562439, "num_tokens": 276038476.0, "step": 7237 }, { "epoch": 0.9207479964381122, "ewc_loss": 0.02501615881919861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.501615927030798e-05, "grad_norm": 15.863348960876465, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.858572244644165, "num_tokens": 276077867.0, "step": 7238 }, { "epoch": 0.9208752067167028, "ewc_loss": 0.025030076503753662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.503007635823451e-05, "grad_norm": 15.899044036865234, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8517786264419556, "num_tokens": 276123289.0, "step": 7239 }, { "epoch": 0.9210024169952932, "ewc_loss": 0.02499353140592575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.499353104212787e-05, "grad_norm": 15.950591087341309, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.849760890007019, "num_tokens": 276157357.0, "step": 7240 }, { "epoch": 0.9211296272738837, "ewc_loss": 0.025046871975064278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.504687108739745e-05, "grad_norm": 15.918417930603027, "learning_rate": 1e-06, "loss": 0.357, "mean_token_accuracy": 0.8834377527236938, "num_tokens": 276195548.0, "step": 7241 }, { "epoch": 0.9212568375524742, "ewc_loss": 0.024966919794678688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4966919227153994e-05, "grad_norm": 15.926972389221191, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8642666339874268, "num_tokens": 276233533.0, "step": 7242 }, { "epoch": 0.9213840478310648, "ewc_loss": 0.02502291649580002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5022916815942153e-05, "grad_norm": 15.941242218017578, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8660604953765869, "num_tokens": 276274310.0, "step": 7243 }, { "epoch": 0.9215112581096553, "ewc_loss": 0.025022072717547417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5022072804858908e-05, "grad_norm": 16.009841918945312, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8595985174179077, "num_tokens": 276310272.0, "step": 7244 }, { "epoch": 0.9216384683882458, "ewc_loss": 0.025009635835886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5009636374306865e-05, "grad_norm": 15.933295249938965, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8560448884963989, "num_tokens": 276350334.0, "step": 7245 }, { "epoch": 0.9217656786668362, "ewc_loss": 0.02496401034295559, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4964010663097724e-05, "grad_norm": 15.927084922790527, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8573777675628662, "num_tokens": 276393913.0, "step": 7246 }, { "epoch": 0.9218928889454268, "ewc_loss": 0.02501554973423481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5015549908857793e-05, "grad_norm": 15.934947967529297, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8584631681442261, "num_tokens": 276430368.0, "step": 7247 }, { "epoch": 0.9220200992240173, "ewc_loss": 0.02498500794172287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4985007257782854e-05, "grad_norm": 15.965408325195312, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8565850853919983, "num_tokens": 276470930.0, "step": 7248 }, { "epoch": 0.9221473095026078, "ewc_loss": 0.02501649037003517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5016490326379426e-05, "grad_norm": 15.888818740844727, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8668267130851746, "num_tokens": 276509359.0, "step": 7249 }, { "epoch": 0.9222745197811983, "ewc_loss": 0.02499576285481453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.499576294212602e-05, "grad_norm": 15.95828628540039, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8537640571594238, "num_tokens": 276546980.0, "step": 7250 }, { "epoch": 0.9224017300597889, "ewc_loss": 0.02503274753689766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.503274845366832e-05, "grad_norm": 15.985368728637695, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8491194248199463, "num_tokens": 276586212.0, "step": 7251 }, { "epoch": 0.9225289403383793, "ewc_loss": 0.02500992640852928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.500992559362203e-05, "grad_norm": 15.962167739868164, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8627551794052124, "num_tokens": 276623946.0, "step": 7252 }, { "epoch": 0.9226561506169698, "ewc_loss": 0.024989929050207138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.498992944310885e-05, "grad_norm": 15.91635799407959, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8559620380401611, "num_tokens": 276667567.0, "step": 7253 }, { "epoch": 0.9227833608955603, "ewc_loss": 0.02497565746307373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.497565765224863e-05, "grad_norm": 15.94962215423584, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8555660843849182, "num_tokens": 276708479.0, "step": 7254 }, { "epoch": 0.9229105711741509, "ewc_loss": 0.025018298998475075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.501829840184655e-05, "grad_norm": 15.957468032836914, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8565587997436523, "num_tokens": 276744627.0, "step": 7255 }, { "epoch": 0.9230377814527414, "ewc_loss": 0.02494695596396923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.494695581845008e-05, "grad_norm": 15.916169166564941, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8628981709480286, "num_tokens": 276782830.0, "step": 7256 }, { "epoch": 0.9231649917313319, "ewc_loss": 0.025003768503665924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5003768314491026e-05, "grad_norm": 15.998668670654297, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8600867390632629, "num_tokens": 276825745.0, "step": 7257 }, { "epoch": 0.9232922020099223, "ewc_loss": 0.025020236149430275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5020235625561327e-05, "grad_norm": 15.914450645446777, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8674727082252502, "num_tokens": 276866743.0, "step": 7258 }, { "epoch": 0.9234194122885129, "ewc_loss": 0.024952612817287445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4952612875495106e-05, "grad_norm": 15.989729881286621, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.857354462146759, "num_tokens": 276903884.0, "step": 7259 }, { "epoch": 0.9235466225671034, "ewc_loss": 0.02500608004629612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5006080250022933e-05, "grad_norm": 16.004220962524414, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8677349090576172, "num_tokens": 276939142.0, "step": 7260 }, { "epoch": 0.9236738328456939, "ewc_loss": 0.024977724999189377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.497772584320046e-05, "grad_norm": 15.968714714050293, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8627854585647583, "num_tokens": 276979041.0, "step": 7261 }, { "epoch": 0.9238010431242845, "ewc_loss": 0.024937856942415237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4937857233453542e-05, "grad_norm": 15.971795082092285, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8736833333969116, "num_tokens": 277015590.0, "step": 7262 }, { "epoch": 0.923928253402875, "ewc_loss": 0.024978766217827797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.497876630513929e-05, "grad_norm": 16.01881980895996, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8553425669670105, "num_tokens": 277055745.0, "step": 7263 }, { "epoch": 0.9240554636814655, "ewc_loss": 0.024983838200569153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4983837647596374e-05, "grad_norm": 15.974517822265625, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8574821352958679, "num_tokens": 277098856.0, "step": 7264 }, { "epoch": 0.9241826739600559, "ewc_loss": 0.024918440729379654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4918441340560094e-05, "grad_norm": 15.990368843078613, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.873365044593811, "num_tokens": 277139165.0, "step": 7265 }, { "epoch": 0.9243098842386465, "ewc_loss": 0.02500005252659321, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5000052119139582e-05, "grad_norm": 16.064395904541016, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8657981157302856, "num_tokens": 277171879.0, "step": 7266 }, { "epoch": 0.924437094517237, "ewc_loss": 0.024971861392259598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4971861421363428e-05, "grad_norm": 15.973806381225586, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8630049228668213, "num_tokens": 277211719.0, "step": 7267 }, { "epoch": 0.9245643047958275, "ewc_loss": 0.024931253865361214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.493125430191867e-05, "grad_norm": 15.99384593963623, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8537571430206299, "num_tokens": 277252675.0, "step": 7268 }, { "epoch": 0.924691515074418, "ewc_loss": 0.02497808262705803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4978082365123555e-05, "grad_norm": 16.023868560791016, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8495705127716064, "num_tokens": 277287529.0, "step": 7269 }, { "epoch": 0.9248187253530086, "ewc_loss": 0.024899160489439964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4899160052882507e-05, "grad_norm": 15.908520698547363, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8617455959320068, "num_tokens": 277328392.0, "step": 7270 }, { "epoch": 0.924945935631599, "ewc_loss": 0.024923205375671387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.492320527380798e-05, "grad_norm": 15.992887496948242, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8711759448051453, "num_tokens": 277368412.0, "step": 7271 }, { "epoch": 0.9250731459101895, "ewc_loss": 0.02496807649731636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.496807610441465e-05, "grad_norm": 15.95760726928711, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8672482371330261, "num_tokens": 277404373.0, "step": 7272 }, { "epoch": 0.92520035618878, "ewc_loss": 0.024938931688666344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4938932256191038e-05, "grad_norm": 15.96098518371582, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8593509793281555, "num_tokens": 277444988.0, "step": 7273 }, { "epoch": 0.9253275664673706, "ewc_loss": 0.024967912584543228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.496791239536833e-05, "grad_norm": 15.957959175109863, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8706177473068237, "num_tokens": 277487501.0, "step": 7274 }, { "epoch": 0.9254547767459611, "ewc_loss": 0.024972403421998024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4972403480205685e-05, "grad_norm": 15.940956115722656, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8582480549812317, "num_tokens": 277529056.0, "step": 7275 }, { "epoch": 0.9255819870245516, "ewc_loss": 0.024970265105366707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4970264348667115e-05, "grad_norm": 16.023164749145508, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8739403486251831, "num_tokens": 277563022.0, "step": 7276 }, { "epoch": 0.925709197303142, "ewc_loss": 0.02497311495244503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.497311470506247e-05, "grad_norm": 15.91112995147705, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8693089485168457, "num_tokens": 277603879.0, "step": 7277 }, { "epoch": 0.9258364075817326, "ewc_loss": 0.024941788986325264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4941788069554605e-05, "grad_norm": 15.97654914855957, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8580347895622253, "num_tokens": 277643711.0, "step": 7278 }, { "epoch": 0.9259636178603231, "ewc_loss": 0.025023171678185463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.502317147445865e-05, "grad_norm": 16.003257751464844, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8610666990280151, "num_tokens": 277680667.0, "step": 7279 }, { "epoch": 0.9260908281389136, "ewc_loss": 0.02500106580555439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5001065296237357e-05, "grad_norm": 16.020963668823242, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8605575561523438, "num_tokens": 277715548.0, "step": 7280 }, { "epoch": 0.9262180384175042, "ewc_loss": 0.024966295808553696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4966295313788578e-05, "grad_norm": 15.924460411071777, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8615800142288208, "num_tokens": 277745287.0, "step": 7281 }, { "epoch": 0.9263452486960947, "ewc_loss": 0.02499254420399666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4992543330881745e-05, "grad_norm": 15.974081993103027, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8557511568069458, "num_tokens": 277781484.0, "step": 7282 }, { "epoch": 0.9264724589746851, "ewc_loss": 0.025044454261660576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5044453650480136e-05, "grad_norm": 15.989263534545898, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8644790649414062, "num_tokens": 277818429.0, "step": 7283 }, { "epoch": 0.9265996692532756, "ewc_loss": 0.025006935000419617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.50069351750426e-05, "grad_norm": 15.939067840576172, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8685406446456909, "num_tokens": 277853077.0, "step": 7284 }, { "epoch": 0.9267268795318662, "ewc_loss": 0.025032663717865944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5032662961166352e-05, "grad_norm": 15.932028770446777, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.8795278072357178, "num_tokens": 277893997.0, "step": 7285 }, { "epoch": 0.9268540898104567, "ewc_loss": 0.024979740381240845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4979739464470185e-05, "grad_norm": 15.866809844970703, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8511883616447449, "num_tokens": 277934680.0, "step": 7286 }, { "epoch": 0.9269813000890472, "ewc_loss": 0.025034604594111443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5034603822859935e-05, "grad_norm": 15.888468742370605, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8544785976409912, "num_tokens": 277980370.0, "step": 7287 }, { "epoch": 0.9271085103676378, "ewc_loss": 0.02509041130542755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5090410417760722e-05, "grad_norm": 15.933037757873535, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8736006021499634, "num_tokens": 278017257.0, "step": 7288 }, { "epoch": 0.9272357206462282, "ewc_loss": 0.025074295699596405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.507429599063471e-05, "grad_norm": 15.946452140808105, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8617098331451416, "num_tokens": 278054835.0, "step": 7289 }, { "epoch": 0.9273629309248187, "ewc_loss": 0.02512216754257679, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.512216815375723e-05, "grad_norm": 16.024232864379883, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8601495623588562, "num_tokens": 278089510.0, "step": 7290 }, { "epoch": 0.9274901412034092, "ewc_loss": 0.025049349293112755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5049348550965078e-05, "grad_norm": 15.890427589416504, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8640010952949524, "num_tokens": 278126179.0, "step": 7291 }, { "epoch": 0.9276173514819998, "ewc_loss": 0.025035148486495018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5035147700691596e-05, "grad_norm": 16.015613555908203, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8397125005722046, "num_tokens": 278170837.0, "step": 7292 }, { "epoch": 0.9277445617605903, "ewc_loss": 0.025090886279940605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.509088699298445e-05, "grad_norm": 15.974970817565918, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8666893243789673, "num_tokens": 278207410.0, "step": 7293 }, { "epoch": 0.9278717720391808, "ewc_loss": 0.025032352656126022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5032351913978346e-05, "grad_norm": 16.035945892333984, "learning_rate": 1e-06, "loss": 0.3664, "mean_token_accuracy": 0.882142186164856, "num_tokens": 278240495.0, "step": 7294 }, { "epoch": 0.9279989823177712, "ewc_loss": 0.025005977600812912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5005978386616334e-05, "grad_norm": 15.924367904663086, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8528158664703369, "num_tokens": 278278449.0, "step": 7295 }, { "epoch": 0.9281261925963618, "ewc_loss": 0.024990499019622803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.499049878679216e-05, "grad_norm": 16.04030418395996, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8714504837989807, "num_tokens": 278321434.0, "step": 7296 }, { "epoch": 0.9282534028749523, "ewc_loss": 0.0250248946249485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5024894057423808e-05, "grad_norm": 16.000917434692383, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8572659492492676, "num_tokens": 278363730.0, "step": 7297 }, { "epoch": 0.9283806131535428, "ewc_loss": 0.024954918771982193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4954919354058802e-05, "grad_norm": 15.911234855651855, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8636887073516846, "num_tokens": 278395493.0, "step": 7298 }, { "epoch": 0.9285078234321333, "ewc_loss": 0.02502378076314926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5023780835908838e-05, "grad_norm": 16.06144905090332, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8643741011619568, "num_tokens": 278433024.0, "step": 7299 }, { "epoch": 0.9286350337107239, "ewc_loss": 0.025064270943403244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.506427154003177e-05, "grad_norm": 15.970673561096191, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8632620573043823, "num_tokens": 278471311.0, "step": 7300 }, { "epoch": 0.9287622439893143, "ewc_loss": 0.02495071291923523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4950713850557804e-05, "grad_norm": 15.944230079650879, "learning_rate": 1e-06, "loss": 0.3712, "mean_token_accuracy": 0.8826686143875122, "num_tokens": 278509151.0, "step": 7301 }, { "epoch": 0.9288894542679048, "ewc_loss": 0.025020143017172813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5020142857101746e-05, "grad_norm": 15.992679595947266, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.869682252407074, "num_tokens": 278548743.0, "step": 7302 }, { "epoch": 0.9290166645464953, "ewc_loss": 0.025013335049152374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5013334379764274e-05, "grad_norm": 16.01481819152832, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.87403404712677, "num_tokens": 278590722.0, "step": 7303 }, { "epoch": 0.9291438748250859, "ewc_loss": 0.025064213201403618, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5064213332370855e-05, "grad_norm": 16.025354385375977, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8491808176040649, "num_tokens": 278628150.0, "step": 7304 }, { "epoch": 0.9292710851036764, "ewc_loss": 0.02494245581328869, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4942455638665706e-05, "grad_norm": 15.967159271240234, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8460066318511963, "num_tokens": 278672543.0, "step": 7305 }, { "epoch": 0.9293982953822669, "ewc_loss": 0.02496332675218582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.496332672308199e-05, "grad_norm": 15.977981567382812, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8707265257835388, "num_tokens": 278707234.0, "step": 7306 }, { "epoch": 0.9295255056608573, "ewc_loss": 0.02498500980436802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4985009076772258e-05, "grad_norm": 16.036510467529297, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8649848103523254, "num_tokens": 278734678.0, "step": 7307 }, { "epoch": 0.9296527159394479, "ewc_loss": 0.025045247748494148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5045248548849486e-05, "grad_norm": 16.028032302856445, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8729704022407532, "num_tokens": 278778284.0, "step": 7308 }, { "epoch": 0.9297799262180384, "ewc_loss": 0.024967432022094727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4967432182165794e-05, "grad_norm": 15.938946723937988, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8621680736541748, "num_tokens": 278815363.0, "step": 7309 }, { "epoch": 0.9299071364966289, "ewc_loss": 0.024958666414022446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4958666472230107e-05, "grad_norm": 15.953499794006348, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8500668406486511, "num_tokens": 278860777.0, "step": 7310 }, { "epoch": 0.9300343467752195, "ewc_loss": 0.025034107267856598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5034107238752767e-05, "grad_norm": 16.02457046508789, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8766390085220337, "num_tokens": 278890552.0, "step": 7311 }, { "epoch": 0.93016155705381, "ewc_loss": 0.024986643344163895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4986642529256642e-05, "grad_norm": 15.966638565063477, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8514161109924316, "num_tokens": 278924741.0, "step": 7312 }, { "epoch": 0.9302887673324005, "ewc_loss": 0.025040650740265846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5040650143637322e-05, "grad_norm": 16.055307388305664, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8714534044265747, "num_tokens": 278965185.0, "step": 7313 }, { "epoch": 0.9304159776109909, "ewc_loss": 0.02499963901937008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4999639208544977e-05, "grad_norm": 15.94333267211914, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8524560332298279, "num_tokens": 278998961.0, "step": 7314 }, { "epoch": 0.9305431878895815, "ewc_loss": 0.024995967745780945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.499596848792862e-05, "grad_norm": 16.02712631225586, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8501065969467163, "num_tokens": 279032689.0, "step": 7315 }, { "epoch": 0.930670398168172, "ewc_loss": 0.025077544152736664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5077544705709442e-05, "grad_norm": 15.956772804260254, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8490336537361145, "num_tokens": 279080305.0, "step": 7316 }, { "epoch": 0.9307976084467625, "ewc_loss": 0.025006964802742004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5006964278873056e-05, "grad_norm": 16.0295467376709, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8636504411697388, "num_tokens": 279120581.0, "step": 7317 }, { "epoch": 0.930924818725353, "ewc_loss": 0.02508101984858513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5081018975470215e-05, "grad_norm": 15.9135160446167, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8745774030685425, "num_tokens": 279165050.0, "step": 7318 }, { "epoch": 0.9310520290039436, "ewc_loss": 0.024976227432489395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4976226995931938e-05, "grad_norm": 16.000957489013672, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8668964505195618, "num_tokens": 279209393.0, "step": 7319 }, { "epoch": 0.931179239282534, "ewc_loss": 0.02512979693710804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.51297969953157e-05, "grad_norm": 15.980504989624023, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8506357669830322, "num_tokens": 279250569.0, "step": 7320 }, { "epoch": 0.9313064495611245, "ewc_loss": 0.025014963001012802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5014962375280447e-05, "grad_norm": 15.995254516601562, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.868744969367981, "num_tokens": 279291154.0, "step": 7321 }, { "epoch": 0.931433659839715, "ewc_loss": 0.02505442500114441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5054425350390375e-05, "grad_norm": 15.998103141784668, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8616420030593872, "num_tokens": 279326967.0, "step": 7322 }, { "epoch": 0.9315608701183056, "ewc_loss": 0.02505439519882202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5054394427570514e-05, "grad_norm": 16.03661346435547, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8750892877578735, "num_tokens": 279367129.0, "step": 7323 }, { "epoch": 0.9316880803968961, "ewc_loss": 0.025058740749955177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5058739993255585e-05, "grad_norm": 16.05849266052246, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.86815345287323, "num_tokens": 279403651.0, "step": 7324 }, { "epoch": 0.9318152906754866, "ewc_loss": 0.02507045678794384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5070456104003824e-05, "grad_norm": 16.109661102294922, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8553611040115356, "num_tokens": 279439166.0, "step": 7325 }, { "epoch": 0.931942500954077, "ewc_loss": 0.02502647042274475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5026471121236682e-05, "grad_norm": 15.953092575073242, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8614107370376587, "num_tokens": 279473489.0, "step": 7326 }, { "epoch": 0.9320697112326676, "ewc_loss": 0.024982625618577003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.4982626200653613e-05, "grad_norm": 15.961884498596191, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.866166353225708, "num_tokens": 279511709.0, "step": 7327 }, { "epoch": 0.9321969215112581, "ewc_loss": 0.025077538564801216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5077539248741232e-05, "grad_norm": 16.0806827545166, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8509400486946106, "num_tokens": 279551638.0, "step": 7328 }, { "epoch": 0.9323241317898486, "ewc_loss": 0.025051390752196312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.505139127606526e-05, "grad_norm": 16.03780174255371, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8736673593521118, "num_tokens": 279587304.0, "step": 7329 }, { "epoch": 0.9324513420684392, "ewc_loss": 0.025014931336045265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5014931452460587e-05, "grad_norm": 15.984896659851074, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8539623618125916, "num_tokens": 279628138.0, "step": 7330 }, { "epoch": 0.9325785523470297, "ewc_loss": 0.024999523535370827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.499952279322315e-05, "grad_norm": 16.03135871887207, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8672798871994019, "num_tokens": 279663064.0, "step": 7331 }, { "epoch": 0.9327057626256201, "ewc_loss": 0.025044359266757965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5044359063031152e-05, "grad_norm": 16.050596237182617, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8540626764297485, "num_tokens": 279704937.0, "step": 7332 }, { "epoch": 0.9328329729042106, "ewc_loss": 0.02501823753118515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5018238375196233e-05, "grad_norm": 15.975236892700195, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8654953241348267, "num_tokens": 279738215.0, "step": 7333 }, { "epoch": 0.9329601831828012, "ewc_loss": 0.025025254115462303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.502525421732571e-05, "grad_norm": 16.04853630065918, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8465744256973267, "num_tokens": 279771996.0, "step": 7334 }, { "epoch": 0.9330873934613917, "ewc_loss": 0.025031255558133125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5031255063368008e-05, "grad_norm": 15.96894645690918, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.87641441822052, "num_tokens": 279810037.0, "step": 7335 }, { "epoch": 0.9332146037399822, "ewc_loss": 0.02502671256661415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5026713046827354e-05, "grad_norm": 16.038005828857422, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8564243316650391, "num_tokens": 279847394.0, "step": 7336 }, { "epoch": 0.9333418140185727, "ewc_loss": 0.025016775354743004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5016775907715783e-05, "grad_norm": 15.946455955505371, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8496294021606445, "num_tokens": 279879057.0, "step": 7337 }, { "epoch": 0.9334690242971632, "ewc_loss": 0.02503203973174095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5032039047800936e-05, "grad_norm": 15.99460220336914, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8571882247924805, "num_tokens": 279911775.0, "step": 7338 }, { "epoch": 0.9335962345757537, "ewc_loss": 0.025065729394555092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.506572855054401e-05, "grad_norm": 15.96546745300293, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8654642701148987, "num_tokens": 279947661.0, "step": 7339 }, { "epoch": 0.9337234448543442, "ewc_loss": 0.02507099322974682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.507099270587787e-05, "grad_norm": 15.932607650756836, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8644982576370239, "num_tokens": 279977507.0, "step": 7340 }, { "epoch": 0.9338506551329347, "ewc_loss": 0.025105968117713928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.510596823412925e-05, "grad_norm": 15.968070983886719, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8593112826347351, "num_tokens": 280016749.0, "step": 7341 }, { "epoch": 0.9339778654115253, "ewc_loss": 0.025111405178904533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.511140519345645e-05, "grad_norm": 15.935270309448242, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8644454479217529, "num_tokens": 280056092.0, "step": 7342 }, { "epoch": 0.9341050756901158, "ewc_loss": 0.02515808306634426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.515808228054084e-05, "grad_norm": 15.984625816345215, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.874853789806366, "num_tokens": 280089803.0, "step": 7343 }, { "epoch": 0.9342322859687062, "ewc_loss": 0.025148475542664528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5148476197500713e-05, "grad_norm": 15.931873321533203, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8506477475166321, "num_tokens": 280126190.0, "step": 7344 }, { "epoch": 0.9343594962472968, "ewc_loss": 0.025179646909236908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5179646399919875e-05, "grad_norm": 15.968000411987305, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8741801977157593, "num_tokens": 280162743.0, "step": 7345 }, { "epoch": 0.9344867065258873, "ewc_loss": 0.025225715711712837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.522571594454348e-05, "grad_norm": 16.044647216796875, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8623554110527039, "num_tokens": 280203676.0, "step": 7346 }, { "epoch": 0.9346139168044778, "ewc_loss": 0.02519073151051998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5190731321345083e-05, "grad_norm": 15.941227912902832, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8790676593780518, "num_tokens": 280241765.0, "step": 7347 }, { "epoch": 0.9347411270830683, "ewc_loss": 0.02520155906677246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5201559765264392e-05, "grad_norm": 16.090774536132812, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.8416152000427246, "num_tokens": 280280103.0, "step": 7348 }, { "epoch": 0.9348683373616589, "ewc_loss": 0.025244496762752533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5244497010135092e-05, "grad_norm": 15.970515251159668, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8462609052658081, "num_tokens": 280314674.0, "step": 7349 }, { "epoch": 0.9349955476402493, "ewc_loss": 0.025129249319434166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5129249479505233e-05, "grad_norm": 16.04674530029297, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8638455867767334, "num_tokens": 280353890.0, "step": 7350 }, { "epoch": 0.9351227579188398, "ewc_loss": 0.025250859558582306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5250859835068695e-05, "grad_norm": 15.93801498413086, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8549330830574036, "num_tokens": 280396403.0, "step": 7351 }, { "epoch": 0.9352499681974303, "ewc_loss": 0.02512752264738083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.512752325856127e-05, "grad_norm": 16.01900863647461, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8627038598060608, "num_tokens": 280436154.0, "step": 7352 }, { "epoch": 0.9353771784760209, "ewc_loss": 0.02525312826037407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5253128114854917e-05, "grad_norm": 16.020755767822266, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8701342344284058, "num_tokens": 280476480.0, "step": 7353 }, { "epoch": 0.9355043887546114, "ewc_loss": 0.025150716304779053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5150715373456478e-05, "grad_norm": 16.016145706176758, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8669902086257935, "num_tokens": 280505976.0, "step": 7354 }, { "epoch": 0.9356315990332019, "ewc_loss": 0.0251638051122427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5163804821204394e-05, "grad_norm": 15.930323600769043, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8581380844116211, "num_tokens": 280548365.0, "step": 7355 }, { "epoch": 0.9357588093117923, "ewc_loss": 0.025172991678118706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5172992536681704e-05, "grad_norm": 16.05420684814453, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8807370066642761, "num_tokens": 280588074.0, "step": 7356 }, { "epoch": 0.9358860195903829, "ewc_loss": 0.02522137574851513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.522137583582662e-05, "grad_norm": 16.025062561035156, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.872288167476654, "num_tokens": 280626915.0, "step": 7357 }, { "epoch": 0.9360132298689734, "ewc_loss": 0.025100911036133766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5100911443587393e-05, "grad_norm": 16.01410675048828, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.857498824596405, "num_tokens": 280660088.0, "step": 7358 }, { "epoch": 0.9361404401475639, "ewc_loss": 0.02515381947159767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.515382038836833e-05, "grad_norm": 15.981314659118652, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8607259392738342, "num_tokens": 280701326.0, "step": 7359 }, { "epoch": 0.9362676504261545, "ewc_loss": 0.025132708251476288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5132707378361374e-05, "grad_norm": 15.99428653717041, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8726989030838013, "num_tokens": 280740256.0, "step": 7360 }, { "epoch": 0.936394860704745, "ewc_loss": 0.025112595409154892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5112594812526368e-05, "grad_norm": 15.978659629821777, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.87732994556427, "num_tokens": 280773515.0, "step": 7361 }, { "epoch": 0.9365220709833355, "ewc_loss": 0.025132423266768456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.513242361601442e-05, "grad_norm": 16.01820945739746, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8376690745353699, "num_tokens": 280817207.0, "step": 7362 }, { "epoch": 0.9366492812619259, "ewc_loss": 0.025111211463809013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.511121056159027e-05, "grad_norm": 15.92213249206543, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.849260687828064, "num_tokens": 280860253.0, "step": 7363 }, { "epoch": 0.9367764915405165, "ewc_loss": 0.02509124018251896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.509123987692874e-05, "grad_norm": 16.022663116455078, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8600154519081116, "num_tokens": 280892634.0, "step": 7364 }, { "epoch": 0.936903701819107, "ewc_loss": 0.025137295946478844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.513729668862652e-05, "grad_norm": 15.887840270996094, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8713967204093933, "num_tokens": 280930912.0, "step": 7365 }, { "epoch": 0.9370309120976975, "ewc_loss": 0.02508164383471012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5081644707825035e-05, "grad_norm": 16.014991760253906, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8664442300796509, "num_tokens": 280971588.0, "step": 7366 }, { "epoch": 0.937158122376288, "ewc_loss": 0.025179440155625343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5179440854117274e-05, "grad_norm": 15.955464363098145, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8705328702926636, "num_tokens": 281009898.0, "step": 7367 }, { "epoch": 0.9372853326548786, "ewc_loss": 0.025075150653719902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5075150915654376e-05, "grad_norm": 15.989673614501953, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8554375767707825, "num_tokens": 281046726.0, "step": 7368 }, { "epoch": 0.937412542933469, "ewc_loss": 0.025141024962067604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.514102561690379e-05, "grad_norm": 15.957531929016113, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8767763376235962, "num_tokens": 281085163.0, "step": 7369 }, { "epoch": 0.9375397532120595, "ewc_loss": 0.025072135031223297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5072135031223297e-05, "grad_norm": 15.977740287780762, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8679256439208984, "num_tokens": 281128660.0, "step": 7370 }, { "epoch": 0.93766696349065, "ewc_loss": 0.025147736072540283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.514773586881347e-05, "grad_norm": 15.941113471984863, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8450804948806763, "num_tokens": 281163780.0, "step": 7371 }, { "epoch": 0.9377941737692406, "ewc_loss": 0.025089379400014877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5089379050768912e-05, "grad_norm": 16.007314682006836, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8705713748931885, "num_tokens": 281204247.0, "step": 7372 }, { "epoch": 0.9379213840478311, "ewc_loss": 0.02511712908744812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5117129553109407e-05, "grad_norm": 15.933418273925781, "learning_rate": 1e-06, "loss": 0.3551, "mean_token_accuracy": 0.887518584728241, "num_tokens": 281239518.0, "step": 7373 }, { "epoch": 0.9380485943264216, "ewc_loss": 0.025119194760918617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5119194106082432e-05, "grad_norm": 16.07996368408203, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8711791038513184, "num_tokens": 281275916.0, "step": 7374 }, { "epoch": 0.938175804605012, "ewc_loss": 0.025094490498304367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5094490410992876e-05, "grad_norm": 15.94303035736084, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8732011914253235, "num_tokens": 281307615.0, "step": 7375 }, { "epoch": 0.9383030148836026, "ewc_loss": 0.02504800446331501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5048004317795858e-05, "grad_norm": 16.016695022583008, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8675102591514587, "num_tokens": 281342677.0, "step": 7376 }, { "epoch": 0.9384302251621931, "ewc_loss": 0.02513814903795719, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5138149794656783e-05, "grad_norm": 16.00678253173828, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8582956790924072, "num_tokens": 281379545.0, "step": 7377 }, { "epoch": 0.9385574354407836, "ewc_loss": 0.02507043443620205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5070434276130982e-05, "grad_norm": 16.007963180541992, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8446264266967773, "num_tokens": 281424169.0, "step": 7378 }, { "epoch": 0.9386846457193742, "ewc_loss": 0.02511734329164028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5117344193859026e-05, "grad_norm": 16.00914764404297, "learning_rate": 1e-06, "loss": 0.3692, "mean_token_accuracy": 0.8841181397438049, "num_tokens": 281461306.0, "step": 7379 }, { "epoch": 0.9388118559979647, "ewc_loss": 0.025105109438300133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5105109671130776e-05, "grad_norm": 16.005483627319336, "learning_rate": 1e-06, "loss": 0.3782, "mean_token_accuracy": 0.8755050301551819, "num_tokens": 281498753.0, "step": 7380 }, { "epoch": 0.9389390662765551, "ewc_loss": 0.025083906948566437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5083907530643046e-05, "grad_norm": 15.982688903808594, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8720559477806091, "num_tokens": 281537163.0, "step": 7381 }, { "epoch": 0.9390662765551456, "ewc_loss": 0.025092091411352158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.50920911639696e-05, "grad_norm": 15.948328018188477, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.854357898235321, "num_tokens": 281577926.0, "step": 7382 }, { "epoch": 0.9391934868337362, "ewc_loss": 0.025078443810343742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5078443286474794e-05, "grad_norm": 15.99091625213623, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.8823592662811279, "num_tokens": 281621283.0, "step": 7383 }, { "epoch": 0.9393206971123267, "ewc_loss": 0.025127941742539406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5127941626124084e-05, "grad_norm": 15.9453125, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8659543991088867, "num_tokens": 281658336.0, "step": 7384 }, { "epoch": 0.9394479073909172, "ewc_loss": 0.025088321417570114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5088322217925452e-05, "grad_norm": 15.952218055725098, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8668321371078491, "num_tokens": 281695137.0, "step": 7385 }, { "epoch": 0.9395751176695077, "ewc_loss": 0.025146309286355972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.514630978112109e-05, "grad_norm": 16.029685974121094, "learning_rate": 1e-06, "loss": 0.3782, "mean_token_accuracy": 0.8780074715614319, "num_tokens": 281733570.0, "step": 7386 }, { "epoch": 0.9397023279480982, "ewc_loss": 0.02508467808365822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.508467878215015e-05, "grad_norm": 15.942176818847656, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8650954961776733, "num_tokens": 281771163.0, "step": 7387 }, { "epoch": 0.9398295382266887, "ewc_loss": 0.025068167597055435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5068167815334164e-05, "grad_norm": 16.008808135986328, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8652583956718445, "num_tokens": 281813715.0, "step": 7388 }, { "epoch": 0.9399567485052792, "ewc_loss": 0.025112979114055634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5112978619290516e-05, "grad_norm": 16.008432388305664, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8613309860229492, "num_tokens": 281856921.0, "step": 7389 }, { "epoch": 0.9400839587838697, "ewc_loss": 0.0250584464520216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5058447135961615e-05, "grad_norm": 15.983238220214844, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8545905351638794, "num_tokens": 281893273.0, "step": 7390 }, { "epoch": 0.9402111690624603, "ewc_loss": 0.025118041783571243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5118042685789987e-05, "grad_norm": 16.084014892578125, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8682622313499451, "num_tokens": 281927452.0, "step": 7391 }, { "epoch": 0.9403383793410508, "ewc_loss": 0.02505130134522915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5051302145584486e-05, "grad_norm": 15.87977409362793, "learning_rate": 1e-06, "loss": 0.3414, "mean_token_accuracy": 0.8882386684417725, "num_tokens": 281965359.0, "step": 7392 }, { "epoch": 0.9404655896196412, "ewc_loss": 0.025067362934350967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5067363822017796e-05, "grad_norm": 16.007034301757812, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8613834381103516, "num_tokens": 281999209.0, "step": 7393 }, { "epoch": 0.9405927998982317, "ewc_loss": 0.02513895183801651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5138951968983747e-05, "grad_norm": 15.964866638183594, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8530399799346924, "num_tokens": 282042129.0, "step": 7394 }, { "epoch": 0.9407200101768223, "ewc_loss": 0.025080284103751183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5080284103751183e-05, "grad_norm": 16.012577056884766, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8544184565544128, "num_tokens": 282087941.0, "step": 7395 }, { "epoch": 0.9408472204554128, "ewc_loss": 0.02514018304646015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5140183424809948e-05, "grad_norm": 16.04372787475586, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.868766188621521, "num_tokens": 282123130.0, "step": 7396 }, { "epoch": 0.9409744307340033, "ewc_loss": 0.02509567141532898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5095670935115777e-05, "grad_norm": 16.010879516601562, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8598917126655579, "num_tokens": 282161142.0, "step": 7397 }, { "epoch": 0.9411016410125939, "ewc_loss": 0.02510136179625988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5101362552959472e-05, "grad_norm": 16.02074432373047, "learning_rate": 1e-06, "loss": 0.3454, "mean_token_accuracy": 0.8868891000747681, "num_tokens": 282196363.0, "step": 7398 }, { "epoch": 0.9412288512911843, "ewc_loss": 0.02508346550166607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5083465516217984e-05, "grad_norm": 16.032520294189453, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8452283143997192, "num_tokens": 282233202.0, "step": 7399 }, { "epoch": 0.9413560615697748, "ewc_loss": 0.025101346895098686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.510134618205484e-05, "grad_norm": 15.990923881530762, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8645684719085693, "num_tokens": 282270976.0, "step": 7400 }, { "epoch": 0.9414832718483653, "ewc_loss": 0.025101004168391228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5101004212046973e-05, "grad_norm": 15.948360443115234, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8738260269165039, "num_tokens": 282312223.0, "step": 7401 }, { "epoch": 0.9416104821269559, "ewc_loss": 0.025105249136686325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.510524973331485e-05, "grad_norm": 16.00052261352539, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8477859497070312, "num_tokens": 282350418.0, "step": 7402 }, { "epoch": 0.9417376924055464, "ewc_loss": 0.02514529973268509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5145300242002122e-05, "grad_norm": 16.033248901367188, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8653212189674377, "num_tokens": 282380751.0, "step": 7403 }, { "epoch": 0.9418649026841369, "ewc_loss": 0.025184445083141327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.518444489396643e-05, "grad_norm": 16.04044532775879, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8754822015762329, "num_tokens": 282420248.0, "step": 7404 }, { "epoch": 0.9419921129627273, "ewc_loss": 0.025115609169006348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5115608877968043e-05, "grad_norm": 15.928583145141602, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8684633374214172, "num_tokens": 282457951.0, "step": 7405 }, { "epoch": 0.9421193232413179, "ewc_loss": 0.02512326091527939, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.512326136638876e-05, "grad_norm": 15.970738410949707, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8630690574645996, "num_tokens": 282507379.0, "step": 7406 }, { "epoch": 0.9422465335199084, "ewc_loss": 0.0251481793820858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5148179702227935e-05, "grad_norm": 15.959068298339844, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8516366481781006, "num_tokens": 282546905.0, "step": 7407 }, { "epoch": 0.9423737437984989, "ewc_loss": 0.025124212726950645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5124212697846815e-05, "grad_norm": 15.987668991088867, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8351849317550659, "num_tokens": 282591368.0, "step": 7408 }, { "epoch": 0.9425009540770894, "ewc_loss": 0.02515653520822525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.515653432055842e-05, "grad_norm": 16.005632400512695, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8610734939575195, "num_tokens": 282617331.0, "step": 7409 }, { "epoch": 0.94262816435568, "ewc_loss": 0.02518422342836857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5184222977259196e-05, "grad_norm": 15.987846374511719, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.8772901892662048, "num_tokens": 282652346.0, "step": 7410 }, { "epoch": 0.9427553746342705, "ewc_loss": 0.025205958634614944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5205958081642166e-05, "grad_norm": 15.982964515686035, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8674581050872803, "num_tokens": 282691564.0, "step": 7411 }, { "epoch": 0.9428825849128609, "ewc_loss": 0.025246504694223404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5246505174436606e-05, "grad_norm": 16.01253890991211, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8736820816993713, "num_tokens": 282728186.0, "step": 7412 }, { "epoch": 0.9430097951914514, "ewc_loss": 0.025217926129698753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5217925212928094e-05, "grad_norm": 16.03186798095703, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8565342426300049, "num_tokens": 282770491.0, "step": 7413 }, { "epoch": 0.943137005470042, "ewc_loss": 0.025273151695728302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.527315155020915e-05, "grad_norm": 15.998345375061035, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8565682172775269, "num_tokens": 282809824.0, "step": 7414 }, { "epoch": 0.9432642157486325, "ewc_loss": 0.025190213695168495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5190212909365073e-05, "grad_norm": 16.011077880859375, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8682080507278442, "num_tokens": 282846735.0, "step": 7415 }, { "epoch": 0.943391426027223, "ewc_loss": 0.025199690833687782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.519969166314695e-05, "grad_norm": 16.007518768310547, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.882726788520813, "num_tokens": 282882242.0, "step": 7416 }, { "epoch": 0.9435186363058136, "ewc_loss": 0.02520192414522171, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.52019235631451e-05, "grad_norm": 15.983777046203613, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8505557179450989, "num_tokens": 282918774.0, "step": 7417 }, { "epoch": 0.943645846584404, "ewc_loss": 0.025175750255584717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.517575012461748e-05, "grad_norm": 15.978479385375977, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.855952262878418, "num_tokens": 282957766.0, "step": 7418 }, { "epoch": 0.9437730568629945, "ewc_loss": 0.025222713127732277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5222712793038227e-05, "grad_norm": 16.0047550201416, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8517828583717346, "num_tokens": 282995980.0, "step": 7419 }, { "epoch": 0.943900267141585, "ewc_loss": 0.025232631713151932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5232631742255762e-05, "grad_norm": 15.993258476257324, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8644658923149109, "num_tokens": 283033148.0, "step": 7420 }, { "epoch": 0.9440274774201756, "ewc_loss": 0.025258414447307587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.525841409806162e-05, "grad_norm": 15.9672212600708, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.868835985660553, "num_tokens": 283069781.0, "step": 7421 }, { "epoch": 0.9441546876987661, "ewc_loss": 0.025254977867007256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5254978027078323e-05, "grad_norm": 16.046520233154297, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8570031523704529, "num_tokens": 283107270.0, "step": 7422 }, { "epoch": 0.9442818979773566, "ewc_loss": 0.02522113546729088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5221135729225352e-05, "grad_norm": 15.992873191833496, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.8754435777664185, "num_tokens": 283143571.0, "step": 7423 }, { "epoch": 0.944409108255947, "ewc_loss": 0.02518686279654503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.518686233088374e-05, "grad_norm": 15.977396965026855, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.867369532585144, "num_tokens": 283179995.0, "step": 7424 }, { "epoch": 0.9445363185345376, "ewc_loss": 0.025173602625727654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5173601898131892e-05, "grad_norm": 15.981578826904297, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8640218377113342, "num_tokens": 283218370.0, "step": 7425 }, { "epoch": 0.9446635288131281, "ewc_loss": 0.02522844262421131, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5228442609659396e-05, "grad_norm": 16.030073165893555, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8577952980995178, "num_tokens": 283247879.0, "step": 7426 }, { "epoch": 0.9447907390917186, "ewc_loss": 0.02521316148340702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5213161279680207e-05, "grad_norm": 15.962848663330078, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8774693608283997, "num_tokens": 283285027.0, "step": 7427 }, { "epoch": 0.9449179493703092, "ewc_loss": 0.02522745169699192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5227451260434464e-05, "grad_norm": 15.964499473571777, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.86696857213974, "num_tokens": 283324934.0, "step": 7428 }, { "epoch": 0.9450451596488997, "ewc_loss": 0.02523525059223175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5235251086996868e-05, "grad_norm": 16.02967643737793, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8714271783828735, "num_tokens": 283357540.0, "step": 7429 }, { "epoch": 0.9451723699274901, "ewc_loss": 0.02524079754948616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.524079718568828e-05, "grad_norm": 16.06694221496582, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8611764907836914, "num_tokens": 283393470.0, "step": 7430 }, { "epoch": 0.9452995802060806, "ewc_loss": 0.02523396909236908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.523396869946737e-05, "grad_norm": 16.06354522705078, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8639058470726013, "num_tokens": 283429916.0, "step": 7431 }, { "epoch": 0.9454267904846712, "ewc_loss": 0.025190208107233047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5190207452396862e-05, "grad_norm": 16.02147674560547, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8533430695533752, "num_tokens": 283467997.0, "step": 7432 }, { "epoch": 0.9455540007632617, "ewc_loss": 0.025210276246070862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5210276362486184e-05, "grad_norm": 16.038150787353516, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8295620679855347, "num_tokens": 283504741.0, "step": 7433 }, { "epoch": 0.9456812110418522, "ewc_loss": 0.025233447551727295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5233448468497954e-05, "grad_norm": 16.023860931396484, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8735791444778442, "num_tokens": 283534535.0, "step": 7434 }, { "epoch": 0.9458084213204427, "ewc_loss": 0.025230607017874718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5230607207049616e-05, "grad_norm": 16.07050895690918, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8749676942825317, "num_tokens": 283571600.0, "step": 7435 }, { "epoch": 0.9459356315990332, "ewc_loss": 0.025261985138058662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5261984774260782e-05, "grad_norm": 16.012847900390625, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8503392934799194, "num_tokens": 283605662.0, "step": 7436 }, { "epoch": 0.9460628418776237, "ewc_loss": 0.025207458063960075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.520745874790009e-05, "grad_norm": 16.026660919189453, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8639782667160034, "num_tokens": 283642868.0, "step": 7437 }, { "epoch": 0.9461900521562142, "ewc_loss": 0.025273755192756653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5273755454691127e-05, "grad_norm": 16.054567337036133, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8550492525100708, "num_tokens": 283675604.0, "step": 7438 }, { "epoch": 0.9463172624348047, "ewc_loss": 0.02526126056909561, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.526126081647817e-05, "grad_norm": 16.04532241821289, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.8841047286987305, "num_tokens": 283709459.0, "step": 7439 }, { "epoch": 0.9464444727133953, "ewc_loss": 0.025214413180947304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5214412744389847e-05, "grad_norm": 15.986645698547363, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8625805377960205, "num_tokens": 283748020.0, "step": 7440 }, { "epoch": 0.9465716829919858, "ewc_loss": 0.025237254798412323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5237255613319576e-05, "grad_norm": 16.071855545043945, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8583654165267944, "num_tokens": 283778514.0, "step": 7441 }, { "epoch": 0.9466988932705762, "ewc_loss": 0.02527387998998165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5273879145970568e-05, "grad_norm": 15.955984115600586, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8673369884490967, "num_tokens": 283819764.0, "step": 7442 }, { "epoch": 0.9468261035491667, "ewc_loss": 0.025230826810002327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5230827304767445e-05, "grad_norm": 16.06650733947754, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8485535383224487, "num_tokens": 283853897.0, "step": 7443 }, { "epoch": 0.9469533138277573, "ewc_loss": 0.025348803028464317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5348803319502622e-05, "grad_norm": 16.024145126342773, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8764104843139648, "num_tokens": 283891826.0, "step": 7444 }, { "epoch": 0.9470805241063478, "ewc_loss": 0.02521372400224209, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5213723347405903e-05, "grad_norm": 15.991698265075684, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8623495697975159, "num_tokens": 283931690.0, "step": 7445 }, { "epoch": 0.9472077343849383, "ewc_loss": 0.02529250457882881, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5292503778473474e-05, "grad_norm": 15.985861778259277, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8639464378356934, "num_tokens": 283971107.0, "step": 7446 }, { "epoch": 0.9473349446635289, "ewc_loss": 0.025244876742362976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5244877178920433e-05, "grad_norm": 16.00737762451172, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8712602853775024, "num_tokens": 284005286.0, "step": 7447 }, { "epoch": 0.9474621549421193, "ewc_loss": 0.02528042159974575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5280422050855123e-05, "grad_norm": 16.033550262451172, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8782367706298828, "num_tokens": 284044089.0, "step": 7448 }, { "epoch": 0.9475893652207098, "ewc_loss": 0.02531951293349266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5319512133137323e-05, "grad_norm": 16.06536293029785, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.859413206577301, "num_tokens": 284081172.0, "step": 7449 }, { "epoch": 0.9477165754993003, "ewc_loss": 0.025272279977798462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.527228025428485e-05, "grad_norm": 16.010133743286133, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8707758188247681, "num_tokens": 284113362.0, "step": 7450 }, { "epoch": 0.9478437857778909, "ewc_loss": 0.025311045348644257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5311044737463817e-05, "grad_norm": 16.05549430847168, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.87331223487854, "num_tokens": 284155272.0, "step": 7451 }, { "epoch": 0.9479709960564814, "ewc_loss": 0.025295136496424675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5295135856140405e-05, "grad_norm": 16.00458526611328, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8716671466827393, "num_tokens": 284188218.0, "step": 7452 }, { "epoch": 0.9480982063350719, "ewc_loss": 0.025249214842915535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.524921546864789e-05, "grad_norm": 16.06134605407715, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8672335743904114, "num_tokens": 284228838.0, "step": 7453 }, { "epoch": 0.9482254166136623, "ewc_loss": 0.025269484147429466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.52694844675716e-05, "grad_norm": 15.991217613220215, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8616186380386353, "num_tokens": 284265570.0, "step": 7454 }, { "epoch": 0.9483526268922529, "ewc_loss": 0.025276070460677147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.527607102820184e-05, "grad_norm": 16.094383239746094, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8540762662887573, "num_tokens": 284307062.0, "step": 7455 }, { "epoch": 0.9484798371708434, "ewc_loss": 0.02527024783194065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.527024844312109e-05, "grad_norm": 16.012216567993164, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8622475862503052, "num_tokens": 284340488.0, "step": 7456 }, { "epoch": 0.9486070474494339, "ewc_loss": 0.02524527534842491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.524527553759981e-05, "grad_norm": 16.0361270904541, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.862740159034729, "num_tokens": 284382399.0, "step": 7457 }, { "epoch": 0.9487342577280244, "ewc_loss": 0.025247452780604362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5247452867915854e-05, "grad_norm": 15.986987113952637, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8512683510780334, "num_tokens": 284416391.0, "step": 7458 }, { "epoch": 0.948861468006615, "ewc_loss": 0.02528681606054306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5286815798608586e-05, "grad_norm": 16.027551651000977, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.863909125328064, "num_tokens": 284457049.0, "step": 7459 }, { "epoch": 0.9489886782852054, "ewc_loss": 0.02528028003871441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5280280169681646e-05, "grad_norm": 15.987919807434082, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.861631453037262, "num_tokens": 284494426.0, "step": 7460 }, { "epoch": 0.9491158885637959, "ewc_loss": 0.025295821949839592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5295821615145542e-05, "grad_norm": 16.04001235961914, "learning_rate": 1e-06, "loss": 0.3665, "mean_token_accuracy": 0.8816468715667725, "num_tokens": 284533349.0, "step": 7461 }, { "epoch": 0.9492430988423864, "ewc_loss": 0.025263668969273567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5263669158448465e-05, "grad_norm": 16.008298873901367, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8703190088272095, "num_tokens": 284567777.0, "step": 7462 }, { "epoch": 0.949370309120977, "ewc_loss": 0.025258874520659447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5258874302380718e-05, "grad_norm": 15.99750804901123, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8576496243476868, "num_tokens": 284609120.0, "step": 7463 }, { "epoch": 0.9494975193995675, "ewc_loss": 0.02529025822877884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.52902591455495e-05, "grad_norm": 16.048324584960938, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8564895987510681, "num_tokens": 284649221.0, "step": 7464 }, { "epoch": 0.949624729678158, "ewc_loss": 0.025249145925045013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5249146347050555e-05, "grad_norm": 15.982680320739746, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8689379692077637, "num_tokens": 284682595.0, "step": 7465 }, { "epoch": 0.9497519399567486, "ewc_loss": 0.025267675518989563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5267676392104477e-05, "grad_norm": 16.01700210571289, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8601059913635254, "num_tokens": 284722903.0, "step": 7466 }, { "epoch": 0.949879150235339, "ewc_loss": 0.025302235037088394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5302235371782444e-05, "grad_norm": 16.057104110717773, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8694795370101929, "num_tokens": 284759166.0, "step": 7467 }, { "epoch": 0.9500063605139295, "ewc_loss": 0.025278538465499878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5278539396822453e-05, "grad_norm": 15.983955383300781, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8602951765060425, "num_tokens": 284803192.0, "step": 7468 }, { "epoch": 0.95013357079252, "ewc_loss": 0.025237547233700752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5237546651624143e-05, "grad_norm": 16.05996322631836, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8663632273674011, "num_tokens": 284837088.0, "step": 7469 }, { "epoch": 0.9502607810711106, "ewc_loss": 0.025328390300273895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.532839062041603e-05, "grad_norm": 16.003698348999023, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8442791700363159, "num_tokens": 284879641.0, "step": 7470 }, { "epoch": 0.9503879913497011, "ewc_loss": 0.025216562673449516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5216562789864838e-05, "grad_norm": 16.03717041015625, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.852200984954834, "num_tokens": 284917519.0, "step": 7471 }, { "epoch": 0.9505152016282916, "ewc_loss": 0.02525903470814228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.525903437344823e-05, "grad_norm": 15.97909927368164, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8542037010192871, "num_tokens": 284950422.0, "step": 7472 }, { "epoch": 0.950642411906882, "ewc_loss": 0.02521779015660286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.521779060771223e-05, "grad_norm": 15.95608901977539, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8708757162094116, "num_tokens": 284986392.0, "step": 7473 }, { "epoch": 0.9507696221854726, "ewc_loss": 0.025293808430433273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5293807993875816e-05, "grad_norm": 16.035566329956055, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8650004863739014, "num_tokens": 285027684.0, "step": 7474 }, { "epoch": 0.9508968324640631, "ewc_loss": 0.025241276249289513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5241275579901412e-05, "grad_norm": 15.991716384887695, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8553857803344727, "num_tokens": 285059551.0, "step": 7475 }, { "epoch": 0.9510240427426536, "ewc_loss": 0.025231175124645233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5231174731743522e-05, "grad_norm": 15.95670223236084, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8645169138908386, "num_tokens": 285100197.0, "step": 7476 }, { "epoch": 0.9511512530212441, "ewc_loss": 0.025275861844420433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5275861844420433e-05, "grad_norm": 16.006202697753906, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8642323017120361, "num_tokens": 285130115.0, "step": 7477 }, { "epoch": 0.9512784632998347, "ewc_loss": 0.025319894775748253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5319894120912068e-05, "grad_norm": 15.967340469360352, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8646842241287231, "num_tokens": 285169069.0, "step": 7478 }, { "epoch": 0.9514056735784251, "ewc_loss": 0.025285596027970314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5285595256718807e-05, "grad_norm": 15.95788860321045, "learning_rate": 1e-06, "loss": 0.5039, "mean_token_accuracy": 0.8439822196960449, "num_tokens": 285209971.0, "step": 7479 }, { "epoch": 0.9515328838570156, "ewc_loss": 0.025288231670856476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5288230972364545e-05, "grad_norm": 16.030351638793945, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8591001033782959, "num_tokens": 285250024.0, "step": 7480 }, { "epoch": 0.9516600941356061, "ewc_loss": 0.02532592974603176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5325929527753033e-05, "grad_norm": 15.98025894165039, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8602879047393799, "num_tokens": 285284900.0, "step": 7481 }, { "epoch": 0.9517873044141967, "ewc_loss": 0.025336148217320442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5336148610222153e-05, "grad_norm": 16.059555053710938, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8677918910980225, "num_tokens": 285325657.0, "step": 7482 }, { "epoch": 0.9519145146927872, "ewc_loss": 0.02533038519322872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.533038605179172e-05, "grad_norm": 15.954909324645996, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.863885760307312, "num_tokens": 285356362.0, "step": 7483 }, { "epoch": 0.9520417249713777, "ewc_loss": 0.02528255432844162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5282553906436078e-05, "grad_norm": 16.08211326599121, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8637200593948364, "num_tokens": 285394321.0, "step": 7484 }, { "epoch": 0.9521689352499682, "ewc_loss": 0.025385186076164246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5385186745552346e-05, "grad_norm": 16.011138916015625, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8826651573181152, "num_tokens": 285435260.0, "step": 7485 }, { "epoch": 0.9522961455285587, "ewc_loss": 0.025260353460907936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.52603531407658e-05, "grad_norm": 16.024539947509766, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8723436594009399, "num_tokens": 285479449.0, "step": 7486 }, { "epoch": 0.9524233558071492, "ewc_loss": 0.025365179404616356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5365179681102745e-05, "grad_norm": 16.025787353515625, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8436618447303772, "num_tokens": 285520492.0, "step": 7487 }, { "epoch": 0.9525505660857397, "ewc_loss": 0.02527511492371559, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5275114239775576e-05, "grad_norm": 16.04444694519043, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.859485924243927, "num_tokens": 285558410.0, "step": 7488 }, { "epoch": 0.9526777763643303, "ewc_loss": 0.025322994217276573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.532299367885571e-05, "grad_norm": 16.149856567382812, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8439845442771912, "num_tokens": 285598871.0, "step": 7489 }, { "epoch": 0.9528049866429208, "ewc_loss": 0.025300180539488792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5300179913756438e-05, "grad_norm": 16.018070220947266, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8560622334480286, "num_tokens": 285641374.0, "step": 7490 }, { "epoch": 0.9529321969215112, "ewc_loss": 0.025280620902776718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.528062032070011e-05, "grad_norm": 15.958863258361816, "learning_rate": 1e-06, "loss": 0.3893, "mean_token_accuracy": 0.8755491971969604, "num_tokens": 285675416.0, "step": 7491 }, { "epoch": 0.9530594072001017, "ewc_loss": 0.025269458070397377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5269457182730548e-05, "grad_norm": 16.182649612426758, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8701605796813965, "num_tokens": 285707491.0, "step": 7492 }, { "epoch": 0.9531866174786923, "ewc_loss": 0.025348233059048653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5348233975819312e-05, "grad_norm": 16.008222579956055, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8517923355102539, "num_tokens": 285747571.0, "step": 7493 }, { "epoch": 0.9533138277572828, "ewc_loss": 0.025224583223462105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5224582714145072e-05, "grad_norm": 16.080284118652344, "learning_rate": 1e-06, "loss": 0.3548, "mean_token_accuracy": 0.8856387138366699, "num_tokens": 285786362.0, "step": 7494 }, { "epoch": 0.9534410380358733, "ewc_loss": 0.0252667348831892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.526673415559344e-05, "grad_norm": 16.0314998626709, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.870548665523529, "num_tokens": 285829401.0, "step": 7495 }, { "epoch": 0.9535682483144639, "ewc_loss": 0.025238769128918648, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5238769012503326e-05, "grad_norm": 16.045339584350586, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8608250617980957, "num_tokens": 285867853.0, "step": 7496 }, { "epoch": 0.9536954585930543, "ewc_loss": 0.025221291929483414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5221292162314057e-05, "grad_norm": 15.996959686279297, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8736978769302368, "num_tokens": 285908497.0, "step": 7497 }, { "epoch": 0.9538226688716448, "ewc_loss": 0.02518855594098568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5188555810018443e-05, "grad_norm": 16.0909481048584, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8509633541107178, "num_tokens": 285940528.0, "step": 7498 }, { "epoch": 0.9539498791502353, "ewc_loss": 0.025305723771452904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5305724193458445e-05, "grad_norm": 16.06505584716797, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.864693284034729, "num_tokens": 285980206.0, "step": 7499 }, { "epoch": 0.9540770894288259, "ewc_loss": 0.0252380408346653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5238041416741908e-05, "grad_norm": 16.052453994750977, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8583862781524658, "num_tokens": 286019409.0, "step": 7500 }, { "epoch": 0.9542042997074164, "ewc_loss": 0.025240816175937653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5240815375582315e-05, "grad_norm": 16.0600643157959, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8734142184257507, "num_tokens": 286050955.0, "step": 7501 }, { "epoch": 0.9543315099860069, "ewc_loss": 0.025240108370780945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5240107788704336e-05, "grad_norm": 16.0285587310791, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8678783178329468, "num_tokens": 286087567.0, "step": 7502 }, { "epoch": 0.9544587202645973, "ewc_loss": 0.02523190900683403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5231909603462555e-05, "grad_norm": 16.09178924560547, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8431789875030518, "num_tokens": 286126893.0, "step": 7503 }, { "epoch": 0.9545859305431879, "ewc_loss": 0.02523054927587509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5230548999388702e-05, "grad_norm": 15.986974716186523, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8579789996147156, "num_tokens": 286165818.0, "step": 7504 }, { "epoch": 0.9547131408217784, "ewc_loss": 0.025212911888957024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.521291207813192e-05, "grad_norm": 16.03379249572754, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.866208553314209, "num_tokens": 286208026.0, "step": 7505 }, { "epoch": 0.9548403511003689, "ewc_loss": 0.02525179274380207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5251792976632714e-05, "grad_norm": 15.995519638061523, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8642314076423645, "num_tokens": 286245468.0, "step": 7506 }, { "epoch": 0.9549675613789594, "ewc_loss": 0.02524586208164692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.524586125218775e-05, "grad_norm": 16.02092170715332, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8682650327682495, "num_tokens": 286283955.0, "step": 7507 }, { "epoch": 0.95509477165755, "ewc_loss": 0.02524733543395996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5247334633604623e-05, "grad_norm": 16.003442764282227, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8696572780609131, "num_tokens": 286326560.0, "step": 7508 }, { "epoch": 0.9552219819361404, "ewc_loss": 0.025256458669900894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.525645868445281e-05, "grad_norm": 16.03346824645996, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8638240098953247, "num_tokens": 286365791.0, "step": 7509 }, { "epoch": 0.9553491922147309, "ewc_loss": 0.025260047987103462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5260047550546005e-05, "grad_norm": 16.0049991607666, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8753972053527832, "num_tokens": 286404132.0, "step": 7510 }, { "epoch": 0.9554764024933214, "ewc_loss": 0.02526060864329338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5260607799282297e-05, "grad_norm": 16.05167579650879, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.853249192237854, "num_tokens": 286444828.0, "step": 7511 }, { "epoch": 0.955603612771912, "ewc_loss": 0.025233320891857147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5233321139239706e-05, "grad_norm": 15.997468948364258, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8691138029098511, "num_tokens": 286481725.0, "step": 7512 }, { "epoch": 0.9557308230505025, "ewc_loss": 0.025237422436475754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5237422960344702e-05, "grad_norm": 16.052059173583984, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8642395734786987, "num_tokens": 286520354.0, "step": 7513 }, { "epoch": 0.955858033329093, "ewc_loss": 0.025221150368452072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.522115028114058e-05, "grad_norm": 16.025209426879883, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8613854050636292, "num_tokens": 286564314.0, "step": 7514 }, { "epoch": 0.9559852436076836, "ewc_loss": 0.02521214447915554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5212144464603625e-05, "grad_norm": 16.015777587890625, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8626217246055603, "num_tokens": 286606955.0, "step": 7515 }, { "epoch": 0.956112453886274, "ewc_loss": 0.025231506675481796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.523150760680437e-05, "grad_norm": 16.044170379638672, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.870252251625061, "num_tokens": 286640091.0, "step": 7516 }, { "epoch": 0.9562396641648645, "ewc_loss": 0.025182414799928665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5182414901792072e-05, "grad_norm": 15.965033531188965, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8634650707244873, "num_tokens": 286679090.0, "step": 7517 }, { "epoch": 0.956366874443455, "ewc_loss": 0.025269275531172752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5269275283790193e-05, "grad_norm": 16.16510009765625, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8676433563232422, "num_tokens": 286720260.0, "step": 7518 }, { "epoch": 0.9564940847220456, "ewc_loss": 0.025280820205807686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.52808204095345e-05, "grad_norm": 16.019615173339844, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8697702884674072, "num_tokens": 286763992.0, "step": 7519 }, { "epoch": 0.9566212950006361, "ewc_loss": 0.02514924854040146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.514924926799722e-05, "grad_norm": 16.01905059814453, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8452953100204468, "num_tokens": 286807068.0, "step": 7520 }, { "epoch": 0.9567485052792266, "ewc_loss": 0.02519749104976654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.519749068596866e-05, "grad_norm": 16.096052169799805, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8703559041023254, "num_tokens": 286853020.0, "step": 7521 }, { "epoch": 0.956875715557817, "ewc_loss": 0.025136936455965042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5136936528724618e-05, "grad_norm": 15.993284225463867, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8533477187156677, "num_tokens": 286896098.0, "step": 7522 }, { "epoch": 0.9570029258364076, "ewc_loss": 0.02513989619910717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5139896024484187e-05, "grad_norm": 16.10610008239746, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8485827445983887, "num_tokens": 286938873.0, "step": 7523 }, { "epoch": 0.9571301361149981, "ewc_loss": 0.025179989635944366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5179990188917145e-05, "grad_norm": 16.011220932006836, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8681173920631409, "num_tokens": 286972941.0, "step": 7524 }, { "epoch": 0.9572573463935886, "ewc_loss": 0.025092514231801033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5092514988500625e-05, "grad_norm": 16.03695297241211, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8528035283088684, "num_tokens": 287010818.0, "step": 7525 }, { "epoch": 0.9573845566721791, "ewc_loss": 0.025185883045196533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5185883714584634e-05, "grad_norm": 16.062702178955078, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.868342399597168, "num_tokens": 287053841.0, "step": 7526 }, { "epoch": 0.9575117669507697, "ewc_loss": 0.025092778727412224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.509277874196414e-05, "grad_norm": 16.0299015045166, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8693053722381592, "num_tokens": 287090900.0, "step": 7527 }, { "epoch": 0.9576389772293601, "ewc_loss": 0.02512519247829914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5125193133135326e-05, "grad_norm": 16.041881561279297, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8621492385864258, "num_tokens": 287131203.0, "step": 7528 }, { "epoch": 0.9577661875079506, "ewc_loss": 0.025108231231570244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.510823105694726e-05, "grad_norm": 15.99903678894043, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.8742309808731079, "num_tokens": 287165949.0, "step": 7529 }, { "epoch": 0.9578933977865411, "ewc_loss": 0.02510557323694229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.510557351342868e-05, "grad_norm": 16.03217315673828, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8625633716583252, "num_tokens": 287203376.0, "step": 7530 }, { "epoch": 0.9580206080651317, "ewc_loss": 0.025171764194965363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.517176471883431e-05, "grad_norm": 16.03803825378418, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8548574447631836, "num_tokens": 287240391.0, "step": 7531 }, { "epoch": 0.9581478183437222, "ewc_loss": 0.025127144530415535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.512714490876533e-05, "grad_norm": 16.003692626953125, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8612594604492188, "num_tokens": 287278485.0, "step": 7532 }, { "epoch": 0.9582750286223127, "ewc_loss": 0.02517646551132202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5176464987453073e-05, "grad_norm": 16.08306884765625, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.858668863773346, "num_tokens": 287313626.0, "step": 7533 }, { "epoch": 0.9584022389009031, "ewc_loss": 0.0252310112118721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5231011022697203e-05, "grad_norm": 16.093698501586914, "learning_rate": 1e-06, "loss": 0.3682, "mean_token_accuracy": 0.8790168166160583, "num_tokens": 287344532.0, "step": 7534 }, { "epoch": 0.9585294491794937, "ewc_loss": 0.02513830177485943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.513830258976668e-05, "grad_norm": 16.011240005493164, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8455977439880371, "num_tokens": 287383069.0, "step": 7535 }, { "epoch": 0.9586566594580842, "ewc_loss": 0.025133050978183746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5133051167358644e-05, "grad_norm": 15.98578929901123, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8628159761428833, "num_tokens": 287418554.0, "step": 7536 }, { "epoch": 0.9587838697366747, "ewc_loss": 0.025204205885529518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.520420639484655e-05, "grad_norm": 16.054916381835938, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.862295925617218, "num_tokens": 287452449.0, "step": 7537 }, { "epoch": 0.9589110800152653, "ewc_loss": 0.025189708918333054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.518970904930029e-05, "grad_norm": 16.009836196899414, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8500731587409973, "num_tokens": 287492247.0, "step": 7538 }, { "epoch": 0.9590382902938558, "ewc_loss": 0.02526310831308365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5263108909712173e-05, "grad_norm": 16.078472137451172, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8697726130485535, "num_tokens": 287528651.0, "step": 7539 }, { "epoch": 0.9591655005724462, "ewc_loss": 0.025279158726334572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5279157853219658e-05, "grad_norm": 16.080997467041016, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8535680174827576, "num_tokens": 287565837.0, "step": 7540 }, { "epoch": 0.9592927108510367, "ewc_loss": 0.02525387518107891, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5253875719499774e-05, "grad_norm": 16.024770736694336, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8628406524658203, "num_tokens": 287599469.0, "step": 7541 }, { "epoch": 0.9594199211296273, "ewc_loss": 0.02526543289422989, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5265433578169905e-05, "grad_norm": 16.10171890258789, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8619078993797302, "num_tokens": 287637104.0, "step": 7542 }, { "epoch": 0.9595471314082178, "ewc_loss": 0.025291480123996735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5291479687439278e-05, "grad_norm": 16.03142547607422, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8643887042999268, "num_tokens": 287679134.0, "step": 7543 }, { "epoch": 0.9596743416868083, "ewc_loss": 0.02526015229523182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.526015305193141e-05, "grad_norm": 16.049419403076172, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8706607818603516, "num_tokens": 287720340.0, "step": 7544 }, { "epoch": 0.9598015519653988, "ewc_loss": 0.025292038917541504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5292038117186166e-05, "grad_norm": 16.026906967163086, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8680194020271301, "num_tokens": 287760931.0, "step": 7545 }, { "epoch": 0.9599287622439893, "ewc_loss": 0.025273090228438377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.527308970456943e-05, "grad_norm": 16.025371551513672, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8502625823020935, "num_tokens": 287801723.0, "step": 7546 }, { "epoch": 0.9600559725225798, "ewc_loss": 0.025309056043624878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.530905658204574e-05, "grad_norm": 16.0632381439209, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8618988990783691, "num_tokens": 287837010.0, "step": 7547 }, { "epoch": 0.9601831828011703, "ewc_loss": 0.025290803983807564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5290804842370562e-05, "grad_norm": 15.968716621398926, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8651998043060303, "num_tokens": 287874381.0, "step": 7548 }, { "epoch": 0.9603103930797608, "ewc_loss": 0.02529801055788994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5298009859398007e-05, "grad_norm": 16.098129272460938, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8558467030525208, "num_tokens": 287916673.0, "step": 7549 }, { "epoch": 0.9604376033583514, "ewc_loss": 0.025317568331956863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5317567633464932e-05, "grad_norm": 16.0064640045166, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8589191436767578, "num_tokens": 287956764.0, "step": 7550 }, { "epoch": 0.9605648136369419, "ewc_loss": 0.025322971865534782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5322971850982867e-05, "grad_norm": 16.07757568359375, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8713102340698242, "num_tokens": 287996373.0, "step": 7551 }, { "epoch": 0.9606920239155323, "ewc_loss": 0.02537195384502411, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.537195359764155e-05, "grad_norm": 16.11862564086914, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8584526777267456, "num_tokens": 288033671.0, "step": 7552 }, { "epoch": 0.9608192341941229, "ewc_loss": 0.025289656594395638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5289657060056925e-05, "grad_norm": 16.04165267944336, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8725611567497253, "num_tokens": 288068254.0, "step": 7553 }, { "epoch": 0.9609464444727134, "ewc_loss": 0.02532467432320118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5324674425064586e-05, "grad_norm": 16.093978881835938, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8498006463050842, "num_tokens": 288102833.0, "step": 7554 }, { "epoch": 0.9610736547513039, "ewc_loss": 0.025321006774902344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5321007342427038e-05, "grad_norm": 16.003507614135742, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8763047456741333, "num_tokens": 288135927.0, "step": 7555 }, { "epoch": 0.9612008650298944, "ewc_loss": 0.025298824533820152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5298824766650796e-05, "grad_norm": 16.022735595703125, "learning_rate": 1e-06, "loss": 0.3619, "mean_token_accuracy": 0.8824641108512878, "num_tokens": 288173842.0, "step": 7556 }, { "epoch": 0.961328075308485, "ewc_loss": 0.025310590863227844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.531058999011293e-05, "grad_norm": 15.994836807250977, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8693214654922485, "num_tokens": 288215007.0, "step": 7557 }, { "epoch": 0.9614552855870754, "ewc_loss": 0.025307347998023033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5307348550995812e-05, "grad_norm": 16.13988494873047, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8532599806785583, "num_tokens": 288251598.0, "step": 7558 }, { "epoch": 0.9615824958656659, "ewc_loss": 0.02536894753575325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.536894680815749e-05, "grad_norm": 16.079397201538086, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8647286891937256, "num_tokens": 288292811.0, "step": 7559 }, { "epoch": 0.9617097061442564, "ewc_loss": 0.025260068476200104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5260067559429444e-05, "grad_norm": 16.0509090423584, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8687812685966492, "num_tokens": 288337790.0, "step": 7560 }, { "epoch": 0.961836916422847, "ewc_loss": 0.025287600234150887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5287599783041514e-05, "grad_norm": 16.079442977905273, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.861356794834137, "num_tokens": 288373347.0, "step": 7561 }, { "epoch": 0.9619641267014375, "ewc_loss": 0.025290481746196747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.529048106225673e-05, "grad_norm": 16.106470108032227, "learning_rate": 1e-06, "loss": 0.3596, "mean_token_accuracy": 0.8837936520576477, "num_tokens": 288405650.0, "step": 7562 }, { "epoch": 0.962091336980028, "ewc_loss": 0.025285474956035614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5285475203418173e-05, "grad_norm": 16.08550262451172, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8687857389450073, "num_tokens": 288443849.0, "step": 7563 }, { "epoch": 0.9622185472586186, "ewc_loss": 0.02524667978286743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5246679797419347e-05, "grad_norm": 16.082473754882812, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8426946401596069, "num_tokens": 288487557.0, "step": 7564 }, { "epoch": 0.962345757537209, "ewc_loss": 0.025263288989663124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5263288989663124e-05, "grad_norm": 16.04174041748047, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8551220893859863, "num_tokens": 288529932.0, "step": 7565 }, { "epoch": 0.9624729678157995, "ewc_loss": 0.025223854929208755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5223855118383653e-05, "grad_norm": 16.061059951782227, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8592764735221863, "num_tokens": 288568276.0, "step": 7566 }, { "epoch": 0.96260017809439, "ewc_loss": 0.025264007970690727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5264007490477525e-05, "grad_norm": 16.095476150512695, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8663423657417297, "num_tokens": 288602387.0, "step": 7567 }, { "epoch": 0.9627273883729806, "ewc_loss": 0.025301024317741394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5301023924839683e-05, "grad_norm": 16.147212982177734, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8708359003067017, "num_tokens": 288638157.0, "step": 7568 }, { "epoch": 0.9628545986515711, "ewc_loss": 0.02524683065712452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.524683077353984e-05, "grad_norm": 16.038827896118164, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8605387806892395, "num_tokens": 288675054.0, "step": 7569 }, { "epoch": 0.9629818089301616, "ewc_loss": 0.02524801716208458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5248016754630953e-05, "grad_norm": 16.109661102294922, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8698544502258301, "num_tokens": 288714222.0, "step": 7570 }, { "epoch": 0.963109019208752, "ewc_loss": 0.02532276324927807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.532276266720146e-05, "grad_norm": 16.10079574584961, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8535202741622925, "num_tokens": 288752772.0, "step": 7571 }, { "epoch": 0.9632362294873426, "ewc_loss": 0.02525188960134983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5251889383071102e-05, "grad_norm": 16.005887985229492, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8474169373512268, "num_tokens": 288792067.0, "step": 7572 }, { "epoch": 0.9633634397659331, "ewc_loss": 0.025270892307162285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5270892365369946e-05, "grad_norm": 16.049020767211914, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.8755020499229431, "num_tokens": 288826172.0, "step": 7573 }, { "epoch": 0.9634906500445236, "ewc_loss": 0.025284044444561005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5284043658757582e-05, "grad_norm": 16.017152786254883, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8743817806243896, "num_tokens": 288863774.0, "step": 7574 }, { "epoch": 0.9636178603231141, "ewc_loss": 0.025276869535446167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5276869564549997e-05, "grad_norm": 16.074277877807617, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8573671579360962, "num_tokens": 288901892.0, "step": 7575 }, { "epoch": 0.9637450706017047, "ewc_loss": 0.025348050519824028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5348050257889554e-05, "grad_norm": 16.05755615234375, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8525590300559998, "num_tokens": 288941470.0, "step": 7576 }, { "epoch": 0.9638722808802951, "ewc_loss": 0.02524564042687416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5245641154469922e-05, "grad_norm": 16.024999618530273, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.85040682554245, "num_tokens": 288988616.0, "step": 7577 }, { "epoch": 0.9639994911588856, "ewc_loss": 0.02531382255256176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.531382233428303e-05, "grad_norm": 16.052061080932617, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8807663917541504, "num_tokens": 289024589.0, "step": 7578 }, { "epoch": 0.9641267014374761, "ewc_loss": 0.025305116549134254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.530511665099766e-05, "grad_norm": 16.056541442871094, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8659778833389282, "num_tokens": 289064334.0, "step": 7579 }, { "epoch": 0.9642539117160667, "ewc_loss": 0.025309760123491287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5309760530944914e-05, "grad_norm": 16.05977439880371, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8527823090553284, "num_tokens": 289099385.0, "step": 7580 }, { "epoch": 0.9643811219946572, "ewc_loss": 0.02529948763549328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5299486878793687e-05, "grad_norm": 16.030141830444336, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8667246103286743, "num_tokens": 289139143.0, "step": 7581 }, { "epoch": 0.9645083322732477, "ewc_loss": 0.02525385096669197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5253850253648125e-05, "grad_norm": 16.0976619720459, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8582423329353333, "num_tokens": 289179303.0, "step": 7582 }, { "epoch": 0.9646355425518381, "ewc_loss": 0.025320595130324364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5320594431832433e-05, "grad_norm": 16.107608795166016, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.85529625415802, "num_tokens": 289214218.0, "step": 7583 }, { "epoch": 0.9647627528304287, "ewc_loss": 0.02524673007428646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5246730729122646e-05, "grad_norm": 16.096452713012695, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8601788282394409, "num_tokens": 289240991.0, "step": 7584 }, { "epoch": 0.9648899631090192, "ewc_loss": 0.025279037654399872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5279037799919024e-05, "grad_norm": 16.05890464782715, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8576280474662781, "num_tokens": 289273374.0, "step": 7585 }, { "epoch": 0.9650171733876097, "ewc_loss": 0.025303838774561882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.530383790144697e-05, "grad_norm": 16.082712173461914, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.850176990032196, "num_tokens": 289314904.0, "step": 7586 }, { "epoch": 0.9651443836662003, "ewc_loss": 0.02530699223279953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5306992029072717e-05, "grad_norm": 16.02552032470703, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8566881418228149, "num_tokens": 289358110.0, "step": 7587 }, { "epoch": 0.9652715939447908, "ewc_loss": 0.02536638267338276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.536638203309849e-05, "grad_norm": 16.155317306518555, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.853492259979248, "num_tokens": 289391581.0, "step": 7588 }, { "epoch": 0.9653988042233812, "ewc_loss": 0.0253182090818882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5318209736724384e-05, "grad_norm": 16.024991989135742, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8378236889839172, "num_tokens": 289432280.0, "step": 7589 }, { "epoch": 0.9655260145019717, "ewc_loss": 0.02530895732343197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5308956537628546e-05, "grad_norm": 16.0766658782959, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8471341729164124, "num_tokens": 289473222.0, "step": 7590 }, { "epoch": 0.9656532247805623, "ewc_loss": 0.025377076119184494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5377075871801935e-05, "grad_norm": 16.07189178466797, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8591926097869873, "num_tokens": 289512898.0, "step": 7591 }, { "epoch": 0.9657804350591528, "ewc_loss": 0.02531510405242443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.531510472181253e-05, "grad_norm": 16.070743560791016, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8684641122817993, "num_tokens": 289554394.0, "step": 7592 }, { "epoch": 0.9659076453377433, "ewc_loss": 0.025346260517835617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5346260372316465e-05, "grad_norm": 16.036090850830078, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8599681854248047, "num_tokens": 289594871.0, "step": 7593 }, { "epoch": 0.9660348556163338, "ewc_loss": 0.025357048958539963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5357048798468895e-05, "grad_norm": 16.0913028717041, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.851681113243103, "num_tokens": 289629165.0, "step": 7594 }, { "epoch": 0.9661620658949243, "ewc_loss": 0.025374187156558037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5374187316629104e-05, "grad_norm": 16.02390480041504, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8622786998748779, "num_tokens": 289670131.0, "step": 7595 }, { "epoch": 0.9662892761735148, "ewc_loss": 0.025356711819767952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.535671228542924e-05, "grad_norm": 16.143360137939453, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8659514784812927, "num_tokens": 289705238.0, "step": 7596 }, { "epoch": 0.9664164864521053, "ewc_loss": 0.0253977682441473, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5397768695256673e-05, "grad_norm": 16.09633445739746, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8654564023017883, "num_tokens": 289740146.0, "step": 7597 }, { "epoch": 0.9665436967306958, "ewc_loss": 0.025351600721478462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5351600925205275e-05, "grad_norm": 16.078889846801758, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8673355579376221, "num_tokens": 289781145.0, "step": 7598 }, { "epoch": 0.9666709070092864, "ewc_loss": 0.025351079180836678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5351078875246458e-05, "grad_norm": 16.099092483520508, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8456819653511047, "num_tokens": 289816347.0, "step": 7599 }, { "epoch": 0.9667981172878769, "ewc_loss": 0.025359097868204117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5359096980537288e-05, "grad_norm": 16.07316780090332, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8486102819442749, "num_tokens": 289852974.0, "step": 7600 }, { "epoch": 0.9669253275664673, "ewc_loss": 0.025301257148385048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5301256755483337e-05, "grad_norm": 16.032751083374023, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8508864641189575, "num_tokens": 289894553.0, "step": 7601 }, { "epoch": 0.9670525378450578, "ewc_loss": 0.025373822078108788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.537382169975899e-05, "grad_norm": 16.095151901245117, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8450741767883301, "num_tokens": 289930641.0, "step": 7602 }, { "epoch": 0.9671797481236484, "ewc_loss": 0.025387242436408997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5387242203578353e-05, "grad_norm": 16.09280776977539, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8572260737419128, "num_tokens": 289972690.0, "step": 7603 }, { "epoch": 0.9673069584022389, "ewc_loss": 0.025343410670757294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.534341001592111e-05, "grad_norm": 15.997419357299805, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8566941022872925, "num_tokens": 290006518.0, "step": 7604 }, { "epoch": 0.9674341686808294, "ewc_loss": 0.025325287133455276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.532528742449358e-05, "grad_norm": 16.073347091674805, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.849281907081604, "num_tokens": 290045206.0, "step": 7605 }, { "epoch": 0.96756137895942, "ewc_loss": 0.02540569379925728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5405694032087922e-05, "grad_norm": 16.022886276245117, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8673272728919983, "num_tokens": 290086976.0, "step": 7606 }, { "epoch": 0.9676885892380104, "ewc_loss": 0.025358149781823158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.535814928705804e-05, "grad_norm": 16.060522079467773, "learning_rate": 1e-06, "loss": 0.384, "mean_token_accuracy": 0.8752123117446899, "num_tokens": 290126894.0, "step": 7607 }, { "epoch": 0.9678157995166009, "ewc_loss": 0.02541632018983364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5416320568183437e-05, "grad_norm": 16.044944763183594, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8760890960693359, "num_tokens": 290161684.0, "step": 7608 }, { "epoch": 0.9679430097951914, "ewc_loss": 0.025352200493216515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5352201191708446e-05, "grad_norm": 16.06589698791504, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8645095825195312, "num_tokens": 290203407.0, "step": 7609 }, { "epoch": 0.968070220073782, "ewc_loss": 0.025409653782844543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.540965397201944e-05, "grad_norm": 16.129608154296875, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8702554702758789, "num_tokens": 290238645.0, "step": 7610 }, { "epoch": 0.9681974303523725, "ewc_loss": 0.025383668020367622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5383667889400385e-05, "grad_norm": 16.02345848083496, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8624570965766907, "num_tokens": 290280279.0, "step": 7611 }, { "epoch": 0.968324640630963, "ewc_loss": 0.02540423348546028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5404233383596875e-05, "grad_norm": 16.138059616088867, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8616710901260376, "num_tokens": 290317739.0, "step": 7612 }, { "epoch": 0.9684518509095535, "ewc_loss": 0.025399817153811455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5399816877325065e-05, "grad_norm": 16.02591896057129, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8605455160140991, "num_tokens": 290360384.0, "step": 7613 }, { "epoch": 0.968579061188144, "ewc_loss": 0.02538456954061985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5384570108144544e-05, "grad_norm": 16.195955276489258, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8705009818077087, "num_tokens": 290398706.0, "step": 7614 }, { "epoch": 0.9687062714667345, "ewc_loss": 0.025400787591934204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.540078821766656e-05, "grad_norm": 16.031211853027344, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.8761974573135376, "num_tokens": 290439524.0, "step": 7615 }, { "epoch": 0.968833481745325, "ewc_loss": 0.025295810773968697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.529581070120912e-05, "grad_norm": 16.13911247253418, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.85777747631073, "num_tokens": 290472125.0, "step": 7616 }, { "epoch": 0.9689606920239155, "ewc_loss": 0.025445858016610146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5445857318118215e-05, "grad_norm": 16.129270553588867, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8472474813461304, "num_tokens": 290514348.0, "step": 7617 }, { "epoch": 0.9690879023025061, "ewc_loss": 0.02533482015132904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5334820747957565e-05, "grad_norm": 16.105636596679688, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.85748291015625, "num_tokens": 290551699.0, "step": 7618 }, { "epoch": 0.9692151125810966, "ewc_loss": 0.025382841005921364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5382840249221772e-05, "grad_norm": 16.193958282470703, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8644750714302063, "num_tokens": 290588644.0, "step": 7619 }, { "epoch": 0.969342322859687, "ewc_loss": 0.025339338928461075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5339339117635973e-05, "grad_norm": 16.096118927001953, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.864689290523529, "num_tokens": 290629546.0, "step": 7620 }, { "epoch": 0.9694695331382776, "ewc_loss": 0.02527017705142498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5270177502534352e-05, "grad_norm": 16.121000289916992, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8713722229003906, "num_tokens": 290665527.0, "step": 7621 }, { "epoch": 0.9695967434168681, "ewc_loss": 0.02540709637105465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5407096472918056e-05, "grad_norm": 16.147851943969727, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8545236587524414, "num_tokens": 290706667.0, "step": 7622 }, { "epoch": 0.9697239536954586, "ewc_loss": 0.02530849725008011, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5308498152298853e-05, "grad_norm": 16.08551025390625, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8572244048118591, "num_tokens": 290745698.0, "step": 7623 }, { "epoch": 0.9698511639740491, "ewc_loss": 0.025317341089248657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.531734025978949e-05, "grad_norm": 16.124046325683594, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8689142465591431, "num_tokens": 290777983.0, "step": 7624 }, { "epoch": 0.9699783742526397, "ewc_loss": 0.025346219539642334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5346220354549587e-05, "grad_norm": 16.100542068481445, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8564860820770264, "num_tokens": 290814184.0, "step": 7625 }, { "epoch": 0.9701055845312301, "ewc_loss": 0.025339551270008087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5339551939396188e-05, "grad_norm": 16.067636489868164, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8543331623077393, "num_tokens": 290854054.0, "step": 7626 }, { "epoch": 0.9702327948098206, "ewc_loss": 0.025347448885440826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.534744817239698e-05, "grad_norm": 16.0811710357666, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8623833656311035, "num_tokens": 290891325.0, "step": 7627 }, { "epoch": 0.9703600050884111, "ewc_loss": 0.025333529338240623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5333529265481047e-05, "grad_norm": 15.980084419250488, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8608965873718262, "num_tokens": 290930101.0, "step": 7628 }, { "epoch": 0.9704872153670017, "ewc_loss": 0.025342177599668503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5342176741105504e-05, "grad_norm": 16.067434310913086, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.862320601940155, "num_tokens": 290964691.0, "step": 7629 }, { "epoch": 0.9706144256455922, "ewc_loss": 0.025374021381139755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.537402178859338e-05, "grad_norm": 16.071861267089844, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8596124649047852, "num_tokens": 291008140.0, "step": 7630 }, { "epoch": 0.9707416359241827, "ewc_loss": 0.025360655039548874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5360655854456127e-05, "grad_norm": 16.082622528076172, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8568485379219055, "num_tokens": 291041484.0, "step": 7631 }, { "epoch": 0.9708688462027731, "ewc_loss": 0.0254073366522789, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5407336579519324e-05, "grad_norm": 16.121389389038086, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8439403176307678, "num_tokens": 291082337.0, "step": 7632 }, { "epoch": 0.9709960564813637, "ewc_loss": 0.025390369817614555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5390369046363048e-05, "grad_norm": 16.087385177612305, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8619425296783447, "num_tokens": 291115921.0, "step": 7633 }, { "epoch": 0.9711232667599542, "ewc_loss": 0.02540353313088417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.540353307267651e-05, "grad_norm": 16.11005401611328, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8705570697784424, "num_tokens": 291154367.0, "step": 7634 }, { "epoch": 0.9712504770385447, "ewc_loss": 0.025383928790688515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5383928004885092e-05, "grad_norm": 16.01370620727539, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8630927205085754, "num_tokens": 291198919.0, "step": 7635 }, { "epoch": 0.9713776873171353, "ewc_loss": 0.02541002631187439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.541002686484717e-05, "grad_norm": 16.144817352294922, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8510162830352783, "num_tokens": 291237062.0, "step": 7636 }, { "epoch": 0.9715048975957258, "ewc_loss": 0.025390364229679108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5390363589394838e-05, "grad_norm": 16.032867431640625, "learning_rate": 1e-06, "loss": 0.3914, "mean_token_accuracy": 0.874239981174469, "num_tokens": 291275676.0, "step": 7637 }, { "epoch": 0.9716321078743162, "ewc_loss": 0.025383539497852325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5383538741152734e-05, "grad_norm": 16.08572769165039, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.85550856590271, "num_tokens": 291316785.0, "step": 7638 }, { "epoch": 0.9717593181529067, "ewc_loss": 0.025409681722521782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5409681256860495e-05, "grad_norm": 16.0510311126709, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8603500723838806, "num_tokens": 291360448.0, "step": 7639 }, { "epoch": 0.9718865284314973, "ewc_loss": 0.025423478335142136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.542347829148639e-05, "grad_norm": 16.17505645751953, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8593591451644897, "num_tokens": 291401650.0, "step": 7640 }, { "epoch": 0.9720137387100878, "ewc_loss": 0.025382008403539658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.538200897106435e-05, "grad_norm": 16.04719352722168, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8630039691925049, "num_tokens": 291439736.0, "step": 7641 }, { "epoch": 0.9721409489886783, "ewc_loss": 0.0253298282623291, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.532982762204483e-05, "grad_norm": 16.092267990112305, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8685458898544312, "num_tokens": 291479077.0, "step": 7642 }, { "epoch": 0.9722681592672688, "ewc_loss": 0.025364067405462265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.536406827857718e-05, "grad_norm": 16.14932632446289, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8720049858093262, "num_tokens": 291512470.0, "step": 7643 }, { "epoch": 0.9723953695458593, "ewc_loss": 0.02535148896276951, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5351488147862256e-05, "grad_norm": 16.041183471679688, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8731345534324646, "num_tokens": 291552797.0, "step": 7644 }, { "epoch": 0.9725225798244498, "ewc_loss": 0.025275254622101784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.527525430195965e-05, "grad_norm": 16.100265502929688, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.866369366645813, "num_tokens": 291589885.0, "step": 7645 }, { "epoch": 0.9726497901030403, "ewc_loss": 0.025364911183714867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.536491047067102e-05, "grad_norm": 16.191078186035156, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8623740673065186, "num_tokens": 291633005.0, "step": 7646 }, { "epoch": 0.9727770003816308, "ewc_loss": 0.02533978410065174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.533978477003984e-05, "grad_norm": 16.079872131347656, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.859158992767334, "num_tokens": 291666083.0, "step": 7647 }, { "epoch": 0.9729042106602214, "ewc_loss": 0.025244133546948433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5244133212254383e-05, "grad_norm": 16.15518569946289, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8690481781959534, "num_tokens": 291706421.0, "step": 7648 }, { "epoch": 0.9730314209388119, "ewc_loss": 0.02538416162133217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5384160835528746e-05, "grad_norm": 16.05908203125, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8673933744430542, "num_tokens": 291744641.0, "step": 7649 }, { "epoch": 0.9731586312174023, "ewc_loss": 0.02526545152068138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.526545176806394e-05, "grad_norm": 16.112178802490234, "learning_rate": 1e-06, "loss": 0.3471, "mean_token_accuracy": 0.8883938789367676, "num_tokens": 291780586.0, "step": 7650 }, { "epoch": 0.9732858414959928, "ewc_loss": 0.025341957807540894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.534195846237708e-05, "grad_norm": 16.06510353088379, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8586289882659912, "num_tokens": 291820482.0, "step": 7651 }, { "epoch": 0.9734130517745834, "ewc_loss": 0.025276098400354385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5276098313042894e-05, "grad_norm": 16.031335830688477, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8719865083694458, "num_tokens": 291858568.0, "step": 7652 }, { "epoch": 0.9735402620531739, "ewc_loss": 0.02533777989447117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5337780243717134e-05, "grad_norm": 16.140745162963867, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8552895188331604, "num_tokens": 291889383.0, "step": 7653 }, { "epoch": 0.9736674723317644, "ewc_loss": 0.025316337123513222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5316337996628135e-05, "grad_norm": 16.038991928100586, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8717544078826904, "num_tokens": 291927541.0, "step": 7654 }, { "epoch": 0.973794682610355, "ewc_loss": 0.02534477412700653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5344774257973768e-05, "grad_norm": 16.056262969970703, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8468434810638428, "num_tokens": 291969074.0, "step": 7655 }, { "epoch": 0.9739218928889454, "ewc_loss": 0.025353258475661278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5353258024551906e-05, "grad_norm": 16.11458396911621, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8610764145851135, "num_tokens": 292008311.0, "step": 7656 }, { "epoch": 0.9740491031675359, "ewc_loss": 0.02536180429160595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5361803636769764e-05, "grad_norm": 16.050708770751953, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8544093370437622, "num_tokens": 292051810.0, "step": 7657 }, { "epoch": 0.9741763134461264, "ewc_loss": 0.025333065539598465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5333065423183143e-05, "grad_norm": 16.050779342651367, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8699100017547607, "num_tokens": 292090411.0, "step": 7658 }, { "epoch": 0.974303523724717, "ewc_loss": 0.02537073753774166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5370736693730578e-05, "grad_norm": 16.1182918548584, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8557803630828857, "num_tokens": 292124411.0, "step": 7659 }, { "epoch": 0.9744307340033075, "ewc_loss": 0.025387950241565704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5387949790456332e-05, "grad_norm": 16.074195861816406, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8622198700904846, "num_tokens": 292166428.0, "step": 7660 }, { "epoch": 0.974557944281898, "ewc_loss": 0.025352226570248604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5352226657560095e-05, "grad_norm": 16.06232261657715, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8646596074104309, "num_tokens": 292202687.0, "step": 7661 }, { "epoch": 0.9746851545604885, "ewc_loss": 0.025370996445417404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5370996809215285e-05, "grad_norm": 16.030778884887695, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8724361658096313, "num_tokens": 292246364.0, "step": 7662 }, { "epoch": 0.974812364839079, "ewc_loss": 0.02538003772497177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5380037186550908e-05, "grad_norm": 16.100379943847656, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8643184900283813, "num_tokens": 292285789.0, "step": 7663 }, { "epoch": 0.9749395751176695, "ewc_loss": 0.02541692741215229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.541692811064422e-05, "grad_norm": 16.109500885009766, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8586435914039612, "num_tokens": 292320755.0, "step": 7664 }, { "epoch": 0.97506678539626, "ewc_loss": 0.02540287747979164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5402878236491233e-05, "grad_norm": 16.122005462646484, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8722159266471863, "num_tokens": 292362818.0, "step": 7665 }, { "epoch": 0.9751939956748505, "ewc_loss": 0.02536945417523384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5369454306201078e-05, "grad_norm": 16.062131881713867, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8554589748382568, "num_tokens": 292404016.0, "step": 7666 }, { "epoch": 0.9753212059534411, "ewc_loss": 0.02535444311797619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5354444005643018e-05, "grad_norm": 16.05927848815918, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8490461111068726, "num_tokens": 292442117.0, "step": 7667 }, { "epoch": 0.9754484162320316, "ewc_loss": 0.025368748232722282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5368748538312502e-05, "grad_norm": 16.05939483642578, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8711531758308411, "num_tokens": 292472523.0, "step": 7668 }, { "epoch": 0.975575626510622, "ewc_loss": 0.025375060737133026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5375060431542806e-05, "grad_norm": 16.095251083374023, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8675740361213684, "num_tokens": 292508571.0, "step": 7669 }, { "epoch": 0.9757028367892125, "ewc_loss": 0.02541268989443779, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.541268986533396e-05, "grad_norm": 16.139671325683594, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8590773344039917, "num_tokens": 292545358.0, "step": 7670 }, { "epoch": 0.9758300470678031, "ewc_loss": 0.025399886071681976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.53998859989224e-05, "grad_norm": 16.110872268676758, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8475035429000854, "num_tokens": 292583962.0, "step": 7671 }, { "epoch": 0.9759572573463936, "ewc_loss": 0.025347596034407616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5347595510538667e-05, "grad_norm": 16.12460708618164, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8616287708282471, "num_tokens": 292616186.0, "step": 7672 }, { "epoch": 0.9760844676249841, "ewc_loss": 0.02535494975745678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5354949684697203e-05, "grad_norm": 16.107070922851562, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.858974814414978, "num_tokens": 292657378.0, "step": 7673 }, { "epoch": 0.9762116779035747, "ewc_loss": 0.025391997769474983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5391997041879222e-05, "grad_norm": 16.069419860839844, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8765162229537964, "num_tokens": 292692041.0, "step": 7674 }, { "epoch": 0.9763388881821651, "ewc_loss": 0.025398245081305504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5398245270480402e-05, "grad_norm": 16.106016159057617, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8612449169158936, "num_tokens": 292725062.0, "step": 7675 }, { "epoch": 0.9764660984607556, "ewc_loss": 0.025427071377635002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5427070795558393e-05, "grad_norm": 16.056140899658203, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8711014986038208, "num_tokens": 292766522.0, "step": 7676 }, { "epoch": 0.9765933087393461, "ewc_loss": 0.025383012369275093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.538301305321511e-05, "grad_norm": 16.123315811157227, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.862370491027832, "num_tokens": 292811371.0, "step": 7677 }, { "epoch": 0.9767205190179367, "ewc_loss": 0.025418078526854515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5418077711947262e-05, "grad_norm": 16.0740966796875, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8635587692260742, "num_tokens": 292850237.0, "step": 7678 }, { "epoch": 0.9768477292965272, "ewc_loss": 0.025357015430927277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.535701605665963e-05, "grad_norm": 16.044404983520508, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8660122752189636, "num_tokens": 292887291.0, "step": 7679 }, { "epoch": 0.9769749395751177, "ewc_loss": 0.02540404349565506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5404044208698906e-05, "grad_norm": 16.118310928344727, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8754147291183472, "num_tokens": 292928396.0, "step": 7680 }, { "epoch": 0.9771021498537081, "ewc_loss": 0.025402259081602097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5402259780094028e-05, "grad_norm": 16.06280517578125, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8660454750061035, "num_tokens": 292962748.0, "step": 7681 }, { "epoch": 0.9772293601322987, "ewc_loss": 0.025463074445724487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5463074052822776e-05, "grad_norm": 16.18632698059082, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8752614259719849, "num_tokens": 293002330.0, "step": 7682 }, { "epoch": 0.9773565704108892, "ewc_loss": 0.0254057627171278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5405763153685257e-05, "grad_norm": 16.07916831970215, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.86231529712677, "num_tokens": 293038288.0, "step": 7683 }, { "epoch": 0.9774837806894797, "ewc_loss": 0.02537241391837597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5372413801960647e-05, "grad_norm": 16.11865997314453, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8652846813201904, "num_tokens": 293078121.0, "step": 7684 }, { "epoch": 0.9776109909680702, "ewc_loss": 0.025406116619706154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5406116037629545e-05, "grad_norm": 16.113922119140625, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8603755831718445, "num_tokens": 293116846.0, "step": 7685 }, { "epoch": 0.9777382012466608, "ewc_loss": 0.02538444660604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5384446416865103e-05, "grad_norm": 16.185121536254883, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8594255447387695, "num_tokens": 293162237.0, "step": 7686 }, { "epoch": 0.9778654115252512, "ewc_loss": 0.02539236471056938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5392364477738738e-05, "grad_norm": 16.096065521240234, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8608816862106323, "num_tokens": 293196631.0, "step": 7687 }, { "epoch": 0.9779926218038417, "ewc_loss": 0.025307629257440567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5307628675363958e-05, "grad_norm": 16.093589782714844, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8731725215911865, "num_tokens": 293238883.0, "step": 7688 }, { "epoch": 0.9781198320824323, "ewc_loss": 0.025389445945620537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.538944681873545e-05, "grad_norm": 16.131820678710938, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8531886339187622, "num_tokens": 293270802.0, "step": 7689 }, { "epoch": 0.9782470423610228, "ewc_loss": 0.025392211973667145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.539221168262884e-05, "grad_norm": 16.124629974365234, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8491796255111694, "num_tokens": 293314910.0, "step": 7690 }, { "epoch": 0.9783742526396133, "ewc_loss": 0.02533092349767685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5330922653665766e-05, "grad_norm": 16.128835678100586, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8764309883117676, "num_tokens": 293353083.0, "step": 7691 }, { "epoch": 0.9785014629182038, "ewc_loss": 0.025369368493556976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.536936881369911e-05, "grad_norm": 16.145309448242188, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8655480146408081, "num_tokens": 293389473.0, "step": 7692 }, { "epoch": 0.9786286731967943, "ewc_loss": 0.02538951113820076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5389510483364575e-05, "grad_norm": 16.08185577392578, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8763496279716492, "num_tokens": 293426893.0, "step": 7693 }, { "epoch": 0.9787558834753848, "ewc_loss": 0.025305287912487984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5305287636001594e-05, "grad_norm": 16.131183624267578, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8680292367935181, "num_tokens": 293468116.0, "step": 7694 }, { "epoch": 0.9788830937539753, "ewc_loss": 0.025444477796554565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5444478524150327e-05, "grad_norm": 16.097003936767578, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8658074736595154, "num_tokens": 293503050.0, "step": 7695 }, { "epoch": 0.9790103040325658, "ewc_loss": 0.02532845176756382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.532845246605575e-05, "grad_norm": 16.076030731201172, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8522433042526245, "num_tokens": 293541673.0, "step": 7696 }, { "epoch": 0.9791375143111564, "ewc_loss": 0.025407977402210236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5407976863789372e-05, "grad_norm": 16.151670455932617, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.860381007194519, "num_tokens": 293578472.0, "step": 7697 }, { "epoch": 0.9792647245897469, "ewc_loss": 0.02538810484111309, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5388104404555634e-05, "grad_norm": 16.098697662353516, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8649688959121704, "num_tokens": 293611204.0, "step": 7698 }, { "epoch": 0.9793919348683373, "ewc_loss": 0.025401752442121506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5401752282050438e-05, "grad_norm": 16.153453826904297, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8533831834793091, "num_tokens": 293651352.0, "step": 7699 }, { "epoch": 0.9795191451469278, "ewc_loss": 0.025441205129027367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5441204343223944e-05, "grad_norm": 16.16181755065918, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8735313415527344, "num_tokens": 293687396.0, "step": 7700 }, { "epoch": 0.9796463554255184, "ewc_loss": 0.02539113536477089, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.539113484090194e-05, "grad_norm": 16.161712646484375, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8595173358917236, "num_tokens": 293725083.0, "step": 7701 }, { "epoch": 0.9797735657041089, "ewc_loss": 0.02538967691361904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.53896778303897e-05, "grad_norm": 16.09848403930664, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8636435270309448, "num_tokens": 293766136.0, "step": 7702 }, { "epoch": 0.9799007759826994, "ewc_loss": 0.02539307437837124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.539307388360612e-05, "grad_norm": 16.17376136779785, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8465343713760376, "num_tokens": 293805810.0, "step": 7703 }, { "epoch": 0.98002798626129, "ewc_loss": 0.025418302044272423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.54183014476439e-05, "grad_norm": 16.06014633178711, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8547482490539551, "num_tokens": 293839041.0, "step": 7704 }, { "epoch": 0.9801551965398804, "ewc_loss": 0.025367962196469307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.536796273489017e-05, "grad_norm": 16.09341812133789, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.8780477046966553, "num_tokens": 293876541.0, "step": 7705 }, { "epoch": 0.9802824068184709, "ewc_loss": 0.025446800515055656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5446801373618655e-05, "grad_norm": 16.00028419494629, "learning_rate": 1e-06, "loss": 0.38, "mean_token_accuracy": 0.8765774369239807, "num_tokens": 293918141.0, "step": 7706 }, { "epoch": 0.9804096170970614, "ewc_loss": 0.025400901213288307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.540090099500958e-05, "grad_norm": 16.12735939025879, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8519092202186584, "num_tokens": 293958078.0, "step": 7707 }, { "epoch": 0.980536827375652, "ewc_loss": 0.02551368996500969, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5513689251965843e-05, "grad_norm": 16.103137969970703, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8728733062744141, "num_tokens": 293990347.0, "step": 7708 }, { "epoch": 0.9806640376542425, "ewc_loss": 0.025442004203796387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5442004698561504e-05, "grad_norm": 16.105558395385742, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8690367341041565, "num_tokens": 294032136.0, "step": 7709 }, { "epoch": 0.980791247932833, "ewc_loss": 0.025504887104034424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5504887162242085e-05, "grad_norm": 16.149389266967773, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.851432204246521, "num_tokens": 294065136.0, "step": 7710 }, { "epoch": 0.9809184582114235, "ewc_loss": 0.025468217208981514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.54682163358666e-05, "grad_norm": 16.102840423583984, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8764704465866089, "num_tokens": 294104539.0, "step": 7711 }, { "epoch": 0.981045668490014, "ewc_loss": 0.02546527236700058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5465273211011663e-05, "grad_norm": 16.081907272338867, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8617610931396484, "num_tokens": 294148962.0, "step": 7712 }, { "epoch": 0.9811728787686045, "ewc_loss": 0.0254416111856699, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.544161179685034e-05, "grad_norm": 16.081829071044922, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8659300804138184, "num_tokens": 294181866.0, "step": 7713 }, { "epoch": 0.981300089047195, "ewc_loss": 0.02548976242542267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.54897622653516e-05, "grad_norm": 16.12940216064453, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8723570108413696, "num_tokens": 294223558.0, "step": 7714 }, { "epoch": 0.9814272993257855, "ewc_loss": 0.02546396665275097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5463967176619917e-05, "grad_norm": 16.06107521057129, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8676588535308838, "num_tokens": 294260487.0, "step": 7715 }, { "epoch": 0.9815545096043761, "ewc_loss": 0.02550114132463932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5501141863060184e-05, "grad_norm": 16.182222366333008, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8644824028015137, "num_tokens": 294302642.0, "step": 7716 }, { "epoch": 0.9816817198829666, "ewc_loss": 0.025482356548309326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5482357159489766e-05, "grad_norm": 16.112831115722656, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8550131320953369, "num_tokens": 294338376.0, "step": 7717 }, { "epoch": 0.981808930161557, "ewc_loss": 0.025470709428191185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.547071017033886e-05, "grad_norm": 16.13884735107422, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8581669330596924, "num_tokens": 294372089.0, "step": 7718 }, { "epoch": 0.9819361404401475, "ewc_loss": 0.025479430332779884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.547943040553946e-05, "grad_norm": 16.078088760375977, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8685669898986816, "num_tokens": 294403801.0, "step": 7719 }, { "epoch": 0.9820633507187381, "ewc_loss": 0.025472991168498993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5472991183050908e-05, "grad_norm": 16.138370513916016, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8667687773704529, "num_tokens": 294440301.0, "step": 7720 }, { "epoch": 0.9821905609973286, "ewc_loss": 0.025490108877420425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5490109692327678e-05, "grad_norm": 16.115198135375977, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.863636314868927, "num_tokens": 294472967.0, "step": 7721 }, { "epoch": 0.9823177712759191, "ewc_loss": 0.025518812239170074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5518811526126228e-05, "grad_norm": 16.17868423461914, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8544492125511169, "num_tokens": 294518248.0, "step": 7722 }, { "epoch": 0.9824449815545097, "ewc_loss": 0.025499887764453888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.549988857936114e-05, "grad_norm": 16.088533401489258, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8614976406097412, "num_tokens": 294553899.0, "step": 7723 }, { "epoch": 0.9825721918331001, "ewc_loss": 0.025460628792643547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.546062933106441e-05, "grad_norm": 16.15579605102539, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8411003351211548, "num_tokens": 294591382.0, "step": 7724 }, { "epoch": 0.9826994021116906, "ewc_loss": 0.025503011420369148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.550301178416703e-05, "grad_norm": 16.093820571899414, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8644911646842957, "num_tokens": 294629606.0, "step": 7725 }, { "epoch": 0.9828266123902811, "ewc_loss": 0.025491749867796898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5491750420769677e-05, "grad_norm": 16.11569595336914, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8661917448043823, "num_tokens": 294669417.0, "step": 7726 }, { "epoch": 0.9829538226688717, "ewc_loss": 0.02554980292916298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5549803467583843e-05, "grad_norm": 16.090557098388672, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8722639679908752, "num_tokens": 294709876.0, "step": 7727 }, { "epoch": 0.9830810329474622, "ewc_loss": 0.025545168668031693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5545168682583608e-05, "grad_norm": 16.13528060913086, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8540542721748352, "num_tokens": 294750565.0, "step": 7728 }, { "epoch": 0.9832082432260527, "ewc_loss": 0.025554019957780838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5554019885021262e-05, "grad_norm": 16.151809692382812, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8637754917144775, "num_tokens": 294790586.0, "step": 7729 }, { "epoch": 0.9833354535046431, "ewc_loss": 0.025516953319311142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5516952518955804e-05, "grad_norm": 16.091262817382812, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8606432676315308, "num_tokens": 294830176.0, "step": 7730 }, { "epoch": 0.9834626637832337, "ewc_loss": 0.02555982582271099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.555982609919738e-05, "grad_norm": 16.171653747558594, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.851616621017456, "num_tokens": 294874199.0, "step": 7731 }, { "epoch": 0.9835898740618242, "ewc_loss": 0.025490254163742065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.549025339249056e-05, "grad_norm": 16.125137329101562, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8708947896957397, "num_tokens": 294914219.0, "step": 7732 }, { "epoch": 0.9837170843404147, "ewc_loss": 0.025495771318674088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5495772206340916e-05, "grad_norm": 16.091821670532227, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8409246802330017, "num_tokens": 294950726.0, "step": 7733 }, { "epoch": 0.9838442946190052, "ewc_loss": 0.025457462295889854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5457462470512837e-05, "grad_norm": 16.135616302490234, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8625282049179077, "num_tokens": 294993815.0, "step": 7734 }, { "epoch": 0.9839715048975958, "ewc_loss": 0.025533121079206467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5533121515763924e-05, "grad_norm": 16.166364669799805, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8712792992591858, "num_tokens": 295029275.0, "step": 7735 }, { "epoch": 0.9840987151761862, "ewc_loss": 0.0254666805267334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5466681108810008e-05, "grad_norm": 16.119464874267578, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.862453281879425, "num_tokens": 295066166.0, "step": 7736 }, { "epoch": 0.9842259254547767, "ewc_loss": 0.025471143424510956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5471143089816906e-05, "grad_norm": 16.0930118560791, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8414261937141418, "num_tokens": 295100436.0, "step": 7737 }, { "epoch": 0.9843531357333672, "ewc_loss": 0.025523267686367035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5523268050164916e-05, "grad_norm": 16.158781051635742, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8650364875793457, "num_tokens": 295141756.0, "step": 7738 }, { "epoch": 0.9844803460119578, "ewc_loss": 0.025471488013863564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.547148869780358e-05, "grad_norm": 16.019441604614258, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8810715675354004, "num_tokens": 295178894.0, "step": 7739 }, { "epoch": 0.9846075562905483, "ewc_loss": 0.025518784299492836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5518784241285175e-05, "grad_norm": 16.143415451049805, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8529219627380371, "num_tokens": 295219133.0, "step": 7740 }, { "epoch": 0.9847347665691388, "ewc_loss": 0.025556175038218498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5556175387464464e-05, "grad_norm": 16.07501220703125, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8633716702461243, "num_tokens": 295260322.0, "step": 7741 }, { "epoch": 0.9848619768477292, "ewc_loss": 0.025436794385313988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.543679511290975e-05, "grad_norm": 16.08194923400879, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8576530814170837, "num_tokens": 295299675.0, "step": 7742 }, { "epoch": 0.9849891871263198, "ewc_loss": 0.025529727339744568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5529727281536907e-05, "grad_norm": 16.056285858154297, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8693996071815491, "num_tokens": 295336281.0, "step": 7743 }, { "epoch": 0.9851163974049103, "ewc_loss": 0.025503844022750854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5503844881313853e-05, "grad_norm": 16.15370750427246, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8612805604934692, "num_tokens": 295376932.0, "step": 7744 }, { "epoch": 0.9852436076835008, "ewc_loss": 0.0255131796002388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.551317993493285e-05, "grad_norm": 16.12024688720703, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8594666719436646, "num_tokens": 295421904.0, "step": 7745 }, { "epoch": 0.9853708179620914, "ewc_loss": 0.025407828390598297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.540782770665828e-05, "grad_norm": 16.15174674987793, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8659967184066772, "num_tokens": 295456688.0, "step": 7746 }, { "epoch": 0.9854980282406819, "ewc_loss": 0.02549208328127861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5492083295830525e-05, "grad_norm": 16.184856414794922, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8801867961883545, "num_tokens": 295500022.0, "step": 7747 }, { "epoch": 0.9856252385192723, "ewc_loss": 0.025422723963856697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.542272341088392e-05, "grad_norm": 16.151235580444336, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.8833199739456177, "num_tokens": 295538106.0, "step": 7748 }, { "epoch": 0.9857524487978628, "ewc_loss": 0.025401625782251358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.540162495279219e-05, "grad_norm": 16.172149658203125, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8590219020843506, "num_tokens": 295572465.0, "step": 7749 }, { "epoch": 0.9858796590764534, "ewc_loss": 0.025386232882738113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5386232664459385e-05, "grad_norm": 16.08126449584961, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8698685169219971, "num_tokens": 295614577.0, "step": 7750 }, { "epoch": 0.9860068693550439, "ewc_loss": 0.025408681482076645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5408680812688544e-05, "grad_norm": 16.092220306396484, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8675282597541809, "num_tokens": 295654780.0, "step": 7751 }, { "epoch": 0.9861340796336344, "ewc_loss": 0.025409309193491936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5409308364032768e-05, "grad_norm": 16.140789031982422, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8620302677154541, "num_tokens": 295695880.0, "step": 7752 }, { "epoch": 0.986261289912225, "ewc_loss": 0.025460336357355118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.546033647377044e-05, "grad_norm": 16.09548568725586, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.863396406173706, "num_tokens": 295738176.0, "step": 7753 }, { "epoch": 0.9863885001908154, "ewc_loss": 0.02541152574121952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.541152571211569e-05, "grad_norm": 16.168102264404297, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8537552356719971, "num_tokens": 295777562.0, "step": 7754 }, { "epoch": 0.9865157104694059, "ewc_loss": 0.025440093129873276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5440092940698378e-05, "grad_norm": 16.111135482788086, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8795769810676575, "num_tokens": 295815013.0, "step": 7755 }, { "epoch": 0.9866429207479964, "ewc_loss": 0.025391288101673126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.539128763601184e-05, "grad_norm": 16.04901885986328, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8521876335144043, "num_tokens": 295857751.0, "step": 7756 }, { "epoch": 0.986770131026587, "ewc_loss": 0.02543303184211254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5433031623833813e-05, "grad_norm": 16.200618743896484, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8688152432441711, "num_tokens": 295896033.0, "step": 7757 }, { "epoch": 0.9868973413051775, "ewc_loss": 0.025436406955122948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5436407668166794e-05, "grad_norm": 16.097837448120117, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8659489154815674, "num_tokens": 295936838.0, "step": 7758 }, { "epoch": 0.987024551583768, "ewc_loss": 0.02541121281683445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.541121284593828e-05, "grad_norm": 16.194835662841797, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8688627481460571, "num_tokens": 295972307.0, "step": 7759 }, { "epoch": 0.9871517618623584, "ewc_loss": 0.025421159341931343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.542115907999687e-05, "grad_norm": 16.116836547851562, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8608894348144531, "num_tokens": 296009966.0, "step": 7760 }, { "epoch": 0.987278972140949, "ewc_loss": 0.02538795955479145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.538795888540335e-05, "grad_norm": 16.13372230529785, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8603358268737793, "num_tokens": 296047874.0, "step": 7761 }, { "epoch": 0.9874061824195395, "ewc_loss": 0.025407738983631134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5407738576177508e-05, "grad_norm": 16.083036422729492, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8706774711608887, "num_tokens": 296090591.0, "step": 7762 }, { "epoch": 0.98753339269813, "ewc_loss": 0.02539781481027603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.539781416999176e-05, "grad_norm": 16.199960708618164, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8574367165565491, "num_tokens": 296133024.0, "step": 7763 }, { "epoch": 0.9876606029767205, "ewc_loss": 0.02545628882944584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.545628922234755e-05, "grad_norm": 16.17435073852539, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8524134755134583, "num_tokens": 296177347.0, "step": 7764 }, { "epoch": 0.9877878132553111, "ewc_loss": 0.02538442797958851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5384428226971067e-05, "grad_norm": 16.133394241333008, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8543584942817688, "num_tokens": 296215532.0, "step": 7765 }, { "epoch": 0.9879150235339016, "ewc_loss": 0.025407688692212105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5407689463463612e-05, "grad_norm": 16.141876220703125, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8525627851486206, "num_tokens": 296249494.0, "step": 7766 }, { "epoch": 0.988042233812492, "ewc_loss": 0.025350049138069153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.535004932724405e-05, "grad_norm": 16.042144775390625, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.854593813419342, "num_tokens": 296289370.0, "step": 7767 }, { "epoch": 0.9881694440910825, "ewc_loss": 0.025408219546079636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5408218789380044e-05, "grad_norm": 16.118898391723633, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8650578260421753, "num_tokens": 296329376.0, "step": 7768 }, { "epoch": 0.9882966543696731, "ewc_loss": 0.025485068559646606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5485069272690453e-05, "grad_norm": 16.158979415893555, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.870829701423645, "num_tokens": 296368875.0, "step": 7769 }, { "epoch": 0.9884238646482636, "ewc_loss": 0.025435520336031914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5435520001337864e-05, "grad_norm": 16.173377990722656, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8627092242240906, "num_tokens": 296404766.0, "step": 7770 }, { "epoch": 0.9885510749268541, "ewc_loss": 0.025477109476923943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5477109375060536e-05, "grad_norm": 16.197649002075195, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8628227710723877, "num_tokens": 296441705.0, "step": 7771 }, { "epoch": 0.9886782852054447, "ewc_loss": 0.02537226676940918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.537226646381896e-05, "grad_norm": 16.071691513061523, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8711353540420532, "num_tokens": 296481075.0, "step": 7772 }, { "epoch": 0.9888054954840351, "ewc_loss": 0.025451719760894775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5451719920965843e-05, "grad_norm": 16.18262481689453, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8595987558364868, "num_tokens": 296511404.0, "step": 7773 }, { "epoch": 0.9889327057626256, "ewc_loss": 0.025425508618354797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5425508283660747e-05, "grad_norm": 16.13348960876465, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8553218841552734, "num_tokens": 296546080.0, "step": 7774 }, { "epoch": 0.9890599160412161, "ewc_loss": 0.025453150272369385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.545314964663703e-05, "grad_norm": 16.18136978149414, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8652905821800232, "num_tokens": 296586119.0, "step": 7775 }, { "epoch": 0.9891871263198067, "ewc_loss": 0.025440763682127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5440764147788286e-05, "grad_norm": 16.070405960083008, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8613364696502686, "num_tokens": 296629602.0, "step": 7776 }, { "epoch": 0.9893143365983972, "ewc_loss": 0.025434400886297226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5434401322854683e-05, "grad_norm": 16.123443603515625, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.856752336025238, "num_tokens": 296666709.0, "step": 7777 }, { "epoch": 0.9894415468769877, "ewc_loss": 0.025477735325694084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5477735107415356e-05, "grad_norm": 16.110883712768555, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8639653921127319, "num_tokens": 296702611.0, "step": 7778 }, { "epoch": 0.9895687571555781, "ewc_loss": 0.025430947542190552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.543094706197735e-05, "grad_norm": 16.106401443481445, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8688136339187622, "num_tokens": 296737649.0, "step": 7779 }, { "epoch": 0.9896959674341687, "ewc_loss": 0.025552725419402122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5552724764565937e-05, "grad_norm": 16.15580940246582, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8655981421470642, "num_tokens": 296776915.0, "step": 7780 }, { "epoch": 0.9898231777127592, "ewc_loss": 0.02547094225883484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5470943000982516e-05, "grad_norm": 16.125457763671875, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8694889545440674, "num_tokens": 296811491.0, "step": 7781 }, { "epoch": 0.9899503879913497, "ewc_loss": 0.025525035336613655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5525036107865162e-05, "grad_norm": 16.16667366027832, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8672177791595459, "num_tokens": 296847602.0, "step": 7782 }, { "epoch": 0.9900775982699402, "ewc_loss": 0.025539696216583252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5539695343468338e-05, "grad_norm": 16.133010864257812, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8469642400741577, "num_tokens": 296884737.0, "step": 7783 }, { "epoch": 0.9902048085485308, "ewc_loss": 0.02552240714430809, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.552240766817704e-05, "grad_norm": 16.15485191345215, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8502588272094727, "num_tokens": 296927780.0, "step": 7784 }, { "epoch": 0.9903320188271212, "ewc_loss": 0.025529688224196434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5529689082759432e-05, "grad_norm": 16.119415283203125, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8770968317985535, "num_tokens": 296964200.0, "step": 7785 }, { "epoch": 0.9904592291057117, "ewc_loss": 0.025533873587846756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5533872758387588e-05, "grad_norm": 16.178171157836914, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8544906973838806, "num_tokens": 297009927.0, "step": 7786 }, { "epoch": 0.9905864393843022, "ewc_loss": 0.025546323508024216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.554632374085486e-05, "grad_norm": 16.16954231262207, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8508821129798889, "num_tokens": 297050788.0, "step": 7787 }, { "epoch": 0.9907136496628928, "ewc_loss": 0.025471990928053856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5471990738878958e-05, "grad_norm": 16.051408767700195, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8691079616546631, "num_tokens": 297092398.0, "step": 7788 }, { "epoch": 0.9908408599414833, "ewc_loss": 0.025514984503388405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5514984372421168e-05, "grad_norm": 16.168825149536133, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8607207536697388, "num_tokens": 297130728.0, "step": 7789 }, { "epoch": 0.9909680702200738, "ewc_loss": 0.025522099807858467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.552210025896784e-05, "grad_norm": 16.108121871948242, "learning_rate": 1e-06, "loss": 0.3548, "mean_token_accuracy": 0.8862649202346802, "num_tokens": 297169124.0, "step": 7790 }, { "epoch": 0.9910952804986642, "ewc_loss": 0.025488676503300667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5488676328677684e-05, "grad_norm": 16.115196228027344, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8721274733543396, "num_tokens": 297204914.0, "step": 7791 }, { "epoch": 0.9912224907772548, "ewc_loss": 0.02550680935382843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5506809834041633e-05, "grad_norm": 16.103038787841797, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8596417307853699, "num_tokens": 297240551.0, "step": 7792 }, { "epoch": 0.9913497010558453, "ewc_loss": 0.025516020134091377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5516019377391785e-05, "grad_norm": 16.047157287597656, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8545459508895874, "num_tokens": 297287160.0, "step": 7793 }, { "epoch": 0.9914769113344358, "ewc_loss": 0.02555161714553833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5551617000019178e-05, "grad_norm": 16.132524490356445, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8650830984115601, "num_tokens": 297323394.0, "step": 7794 }, { "epoch": 0.9916041216130264, "ewc_loss": 0.025571513921022415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5571513106115162e-05, "grad_norm": 16.12775993347168, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8422818183898926, "num_tokens": 297357808.0, "step": 7795 }, { "epoch": 0.9917313318916169, "ewc_loss": 0.025513816624879837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.551381658122409e-05, "grad_norm": 16.040529251098633, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8582053780555725, "num_tokens": 297397300.0, "step": 7796 }, { "epoch": 0.9918585421702073, "ewc_loss": 0.025564195588231087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5564195311744697e-05, "grad_norm": 16.11980628967285, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8727884292602539, "num_tokens": 297434699.0, "step": 7797 }, { "epoch": 0.9919857524487978, "ewc_loss": 0.025602176785469055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.560217762948014e-05, "grad_norm": 16.184911727905273, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8620293140411377, "num_tokens": 297473939.0, "step": 7798 }, { "epoch": 0.9921129627273884, "ewc_loss": 0.02558986283838749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5589863071218133e-05, "grad_norm": 16.143577575683594, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.854723334312439, "num_tokens": 297511154.0, "step": 7799 }, { "epoch": 0.9922401730059789, "ewc_loss": 0.025538867339491844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5538867703289725e-05, "grad_norm": 16.105928421020508, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8541582822799683, "num_tokens": 297556092.0, "step": 7800 }, { "epoch": 0.9923673832845694, "ewc_loss": 0.025522330775856972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.552233127062209e-05, "grad_norm": 16.080760955810547, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8674716949462891, "num_tokens": 297598160.0, "step": 7801 }, { "epoch": 0.9924945935631599, "ewc_loss": 0.025522494688630104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.552249497966841e-05, "grad_norm": 16.14039421081543, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.879586935043335, "num_tokens": 297638706.0, "step": 7802 }, { "epoch": 0.9926218038417504, "ewc_loss": 0.02552354894578457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.552354817453306e-05, "grad_norm": 16.129924774169922, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8682289123535156, "num_tokens": 297675955.0, "step": 7803 }, { "epoch": 0.9927490141203409, "ewc_loss": 0.02553435042500496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.553435115260072e-05, "grad_norm": 16.126598358154297, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8578514456748962, "num_tokens": 297715362.0, "step": 7804 }, { "epoch": 0.9928762243989314, "ewc_loss": 0.025554992258548737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.555499304435216e-05, "grad_norm": 16.111486434936523, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8489545583724976, "num_tokens": 297750931.0, "step": 7805 }, { "epoch": 0.993003434677522, "ewc_loss": 0.025521425530314445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5521425413899124e-05, "grad_norm": 16.134965896606445, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.855532705783844, "num_tokens": 297786107.0, "step": 7806 }, { "epoch": 0.9931306449561125, "ewc_loss": 0.025564521551132202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5564520910847932e-05, "grad_norm": 16.128007888793945, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8639944791793823, "num_tokens": 297824782.0, "step": 7807 }, { "epoch": 0.993257855234703, "ewc_loss": 0.025554126128554344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5554125386406668e-05, "grad_norm": 16.148466110229492, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8580709099769592, "num_tokens": 297861748.0, "step": 7808 }, { "epoch": 0.9933850655132934, "ewc_loss": 0.025562848895788193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5562849259586073e-05, "grad_norm": 16.13702392578125, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8628559112548828, "num_tokens": 297902722.0, "step": 7809 }, { "epoch": 0.993512275791884, "ewc_loss": 0.02551148645579815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.551148645579815e-05, "grad_norm": 16.009937286376953, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.864138126373291, "num_tokens": 297948709.0, "step": 7810 }, { "epoch": 0.9936394860704745, "ewc_loss": 0.02554141916334629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.55414197454229e-05, "grad_norm": 16.169620513916016, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8659071922302246, "num_tokens": 297986446.0, "step": 7811 }, { "epoch": 0.993766696349065, "ewc_loss": 0.025577615946531296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.557761581556406e-05, "grad_norm": 16.123811721801758, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8553617000579834, "num_tokens": 298024004.0, "step": 7812 }, { "epoch": 0.9938939066276555, "ewc_loss": 0.025525206699967384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5525207092869096e-05, "grad_norm": 16.121097564697266, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8538404107093811, "num_tokens": 298065518.0, "step": 7813 }, { "epoch": 0.9940211169062461, "ewc_loss": 0.02554515190422535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5545152311678976e-05, "grad_norm": 16.10296058654785, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8557419180870056, "num_tokens": 298108138.0, "step": 7814 }, { "epoch": 0.9941483271848366, "ewc_loss": 0.02549905888736248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5499059120193124e-05, "grad_norm": 16.15404510498047, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8627051115036011, "num_tokens": 298149348.0, "step": 7815 }, { "epoch": 0.994275537463427, "ewc_loss": 0.025603987276554108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5603987523936667e-05, "grad_norm": 16.09164047241211, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8607819080352783, "num_tokens": 298186347.0, "step": 7816 }, { "epoch": 0.9944027477420175, "ewc_loss": 0.02547895908355713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5478959287283942e-05, "grad_norm": 16.0747127532959, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8421380519866943, "num_tokens": 298223322.0, "step": 7817 }, { "epoch": 0.9945299580206081, "ewc_loss": 0.025538600981235504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5538600311847404e-05, "grad_norm": 16.093706130981445, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8582191467285156, "num_tokens": 298258794.0, "step": 7818 }, { "epoch": 0.9946571682991986, "ewc_loss": 0.025549866259098053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5549867132212967e-05, "grad_norm": 16.086605072021484, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8565768599510193, "num_tokens": 298297510.0, "step": 7819 }, { "epoch": 0.9947843785777891, "ewc_loss": 0.02555934339761734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.555934406700544e-05, "grad_norm": 16.129701614379883, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.867945671081543, "num_tokens": 298333784.0, "step": 7820 }, { "epoch": 0.9949115888563796, "ewc_loss": 0.025603236630558968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5603236281313002e-05, "grad_norm": 16.11821174621582, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8696316480636597, "num_tokens": 298371241.0, "step": 7821 }, { "epoch": 0.9950387991349701, "ewc_loss": 0.025614121928811073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.561412111390382e-05, "grad_norm": 16.129714965820312, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8487865924835205, "num_tokens": 298407902.0, "step": 7822 }, { "epoch": 0.9951660094135606, "ewc_loss": 0.02567266672849655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5672667106846347e-05, "grad_norm": 16.155277252197266, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8382720947265625, "num_tokens": 298452339.0, "step": 7823 }, { "epoch": 0.9952932196921511, "ewc_loss": 0.025603225454688072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.560322536737658e-05, "grad_norm": 16.101659774780273, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8536165356636047, "num_tokens": 298495854.0, "step": 7824 }, { "epoch": 0.9954204299707416, "ewc_loss": 0.02558310702443123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5583107344573364e-05, "grad_norm": 16.14643096923828, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8565707206726074, "num_tokens": 298529642.0, "step": 7825 }, { "epoch": 0.9955476402493322, "ewc_loss": 0.02566109225153923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5661092877271585e-05, "grad_norm": 16.179508209228516, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8643463253974915, "num_tokens": 298574396.0, "step": 7826 }, { "epoch": 0.9956748505279227, "ewc_loss": 0.025606613606214523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5606614144635387e-05, "grad_norm": 16.115604400634766, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8497714996337891, "num_tokens": 298612457.0, "step": 7827 }, { "epoch": 0.9958020608065131, "ewc_loss": 0.025609290227293968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5609289878048003e-05, "grad_norm": 16.152860641479492, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8626569509506226, "num_tokens": 298645061.0, "step": 7828 }, { "epoch": 0.9959292710851037, "ewc_loss": 0.025617675855755806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.561767541919835e-05, "grad_norm": 16.140562057495117, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8599039316177368, "num_tokens": 298688014.0, "step": 7829 }, { "epoch": 0.9960564813636942, "ewc_loss": 0.025573192164301872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5573192033334635e-05, "grad_norm": 16.16741180419922, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8656312227249146, "num_tokens": 298727815.0, "step": 7830 }, { "epoch": 0.9961836916422847, "ewc_loss": 0.025587519630789757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5587520212866366e-05, "grad_norm": 16.126680374145508, "learning_rate": 1e-06, "loss": 0.3636, "mean_token_accuracy": 0.8829880952835083, "num_tokens": 298765663.0, "step": 7831 }, { "epoch": 0.9963109019208752, "ewc_loss": 0.02560454048216343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5604540496715344e-05, "grad_norm": 16.134138107299805, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8657481670379639, "num_tokens": 298799360.0, "step": 7832 }, { "epoch": 0.9964381121994658, "ewc_loss": 0.02563583292067051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5635832571424544e-05, "grad_norm": 16.19774627685547, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8621373176574707, "num_tokens": 298835669.0, "step": 7833 }, { "epoch": 0.9965653224780562, "ewc_loss": 0.025636261329054832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.563626185292378e-05, "grad_norm": 16.143898010253906, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.860614538192749, "num_tokens": 298866871.0, "step": 7834 }, { "epoch": 0.9966925327566467, "ewc_loss": 0.025640234351158142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5640234525781125e-05, "grad_norm": 16.18288803100586, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8533103466033936, "num_tokens": 298904136.0, "step": 7835 }, { "epoch": 0.9968197430352372, "ewc_loss": 0.025621172040700912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5621171516831964e-05, "grad_norm": 16.152408599853516, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8750271201133728, "num_tokens": 298937069.0, "step": 7836 }, { "epoch": 0.9969469533138278, "ewc_loss": 0.025632040575146675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.563203997851815e-05, "grad_norm": 16.146326065063477, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8548934459686279, "num_tokens": 298976778.0, "step": 7837 }, { "epoch": 0.9970741635924183, "ewc_loss": 0.02558763325214386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5587632990209386e-05, "grad_norm": 16.15712547302246, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8728646039962769, "num_tokens": 299010808.0, "step": 7838 }, { "epoch": 0.9972013738710088, "ewc_loss": 0.02564033307135105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5640332751208916e-05, "grad_norm": 16.19346046447754, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8521944284439087, "num_tokens": 299047611.0, "step": 7839 }, { "epoch": 0.9973285841495992, "ewc_loss": 0.02559615485370159, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5596154955564998e-05, "grad_norm": 16.11063003540039, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8664712905883789, "num_tokens": 299088668.0, "step": 7840 }, { "epoch": 0.9974557944281898, "ewc_loss": 0.025618363171815872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.561836299719289e-05, "grad_norm": 16.146656036376953, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8424673080444336, "num_tokens": 299126572.0, "step": 7841 }, { "epoch": 0.9975830047067803, "ewc_loss": 0.02563771791756153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5637717044446617e-05, "grad_norm": 16.08800506591797, "learning_rate": 1e-06, "loss": 0.3883, "mean_token_accuracy": 0.8738239407539368, "num_tokens": 299164806.0, "step": 7842 }, { "epoch": 0.9977102149853708, "ewc_loss": 0.025579143315553665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5579143766663037e-05, "grad_norm": 16.12016487121582, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8582636713981628, "num_tokens": 299207039.0, "step": 7843 }, { "epoch": 0.9978374252639614, "ewc_loss": 0.025714535266160965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5714534785947762e-05, "grad_norm": 16.180845260620117, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8714070320129395, "num_tokens": 299243242.0, "step": 7844 }, { "epoch": 0.9979646355425519, "ewc_loss": 0.025621693581342697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5621693566790782e-05, "grad_norm": 16.11490249633789, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8687694072723389, "num_tokens": 299282326.0, "step": 7845 }, { "epoch": 0.9980918458211423, "ewc_loss": 0.025670921429991722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5670920877018943e-05, "grad_norm": 16.232694625854492, "learning_rate": 1e-06, "loss": 0.3351, "mean_token_accuracy": 0.8906203508377075, "num_tokens": 299316645.0, "step": 7846 }, { "epoch": 0.9982190560997328, "ewc_loss": 0.025674525648355484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.567452611401677e-05, "grad_norm": 16.156696319580078, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8641988039016724, "num_tokens": 299349932.0, "step": 7847 }, { "epoch": 0.9983462663783234, "ewc_loss": 0.02557569369673729, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.557569314376451e-05, "grad_norm": 16.11330223083496, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8410996198654175, "num_tokens": 299392452.0, "step": 7848 }, { "epoch": 0.9984734766569139, "ewc_loss": 0.0256145428866148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5614543119445443e-05, "grad_norm": 16.097013473510742, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8684147596359253, "num_tokens": 299431479.0, "step": 7849 }, { "epoch": 0.9986006869355044, "ewc_loss": 0.025607474148273468, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5607474526623264e-05, "grad_norm": 16.14764404296875, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8602619171142578, "num_tokens": 299475585.0, "step": 7850 }, { "epoch": 0.9987278972140949, "ewc_loss": 0.02563430741429329, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5634308258304372e-05, "grad_norm": 16.103199005126953, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8783476948738098, "num_tokens": 299514026.0, "step": 7851 }, { "epoch": 0.9988551074926854, "ewc_loss": 0.025584137067198753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.558413689257577e-05, "grad_norm": 16.14448356628418, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8524256348609924, "num_tokens": 299550858.0, "step": 7852 }, { "epoch": 0.9989823177712759, "ewc_loss": 0.025663020089268684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5663021006039344e-05, "grad_norm": 16.188459396362305, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8552771806716919, "num_tokens": 299586831.0, "step": 7853 }, { "epoch": 0.9991095280498664, "ewc_loss": 0.025622131302952766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5622131943237036e-05, "grad_norm": 16.12649917602539, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8718204498291016, "num_tokens": 299623116.0, "step": 7854 }, { "epoch": 0.9992367383284569, "ewc_loss": 0.025606732815504074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5606732378946617e-05, "grad_norm": 16.122495651245117, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8654806017875671, "num_tokens": 299662165.0, "step": 7855 }, { "epoch": 0.9993639486070475, "ewc_loss": 0.025605596601963043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5605597329558805e-05, "grad_norm": 16.133886337280273, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8588924407958984, "num_tokens": 299702581.0, "step": 7856 }, { "epoch": 0.999491158885638, "ewc_loss": 0.025633839890360832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.563384077802766e-05, "grad_norm": 16.146230697631836, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8476806879043579, "num_tokens": 299742559.0, "step": 7857 }, { "epoch": 0.9996183691642284, "ewc_loss": 0.0256345197558403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5634519261075184e-05, "grad_norm": 16.106828689575195, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8789950609207153, "num_tokens": 299778637.0, "step": 7858 }, { "epoch": 0.9997455794428189, "ewc_loss": 0.025643981993198395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.564398164395243e-05, "grad_norm": 16.163124084472656, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.871932327747345, "num_tokens": 299812808.0, "step": 7859 }, { "epoch": 0.9998727897214095, "ewc_loss": 0.025656692683696747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5656692741904408e-05, "grad_norm": 16.07819175720215, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.859116792678833, "num_tokens": 299848987.0, "step": 7860 }, { "epoch": 1.0, "ewc_loss": 0.025591043755412102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5591043595341034e-05, "grad_norm": 16.168500900268555, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8441948890686035, "num_tokens": 299886286.0, "step": 7861 }, { "epoch": 1.0001272102785905, "ewc_loss": 0.025665832683444023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5665833163657226e-05, "grad_norm": 16.11457061767578, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8571668267250061, "num_tokens": 299925456.0, "step": 7862 }, { "epoch": 1.000254420557181, "ewc_loss": 0.025590285658836365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5590285076759756e-05, "grad_norm": 16.115516662597656, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8607676029205322, "num_tokens": 299965936.0, "step": 7863 }, { "epoch": 1.0003816308357716, "ewc_loss": 0.025671586394309998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.567158662714064e-05, "grad_norm": 16.08293342590332, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8767136335372925, "num_tokens": 300003181.0, "step": 7864 }, { "epoch": 1.0005088411143621, "ewc_loss": 0.025631830096244812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5631830794736743e-05, "grad_norm": 16.154926300048828, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8618312478065491, "num_tokens": 300040502.0, "step": 7865 }, { "epoch": 1.0006360513929526, "ewc_loss": 0.025698017328977585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5698016543174163e-05, "grad_norm": 16.123916625976562, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.878746747970581, "num_tokens": 300078795.0, "step": 7866 }, { "epoch": 1.0007632616715432, "ewc_loss": 0.025613319128751755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5613318939576857e-05, "grad_norm": 16.101301193237305, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8729180097579956, "num_tokens": 300115188.0, "step": 7867 }, { "epoch": 1.0008904719501335, "ewc_loss": 0.025686807930469513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5686807930469513e-05, "grad_norm": 16.186033248901367, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8660926818847656, "num_tokens": 300158583.0, "step": 7868 }, { "epoch": 1.001017682228724, "ewc_loss": 0.02559148147702217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.559148197178729e-05, "grad_norm": 16.092103958129883, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8694735765457153, "num_tokens": 300194415.0, "step": 7869 }, { "epoch": 1.0011448925073145, "ewc_loss": 0.025655172765254974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5655172066763043e-05, "grad_norm": 16.11284065246582, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8593490719795227, "num_tokens": 300235066.0, "step": 7870 }, { "epoch": 1.001272102785905, "ewc_loss": 0.025595271959900856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.559527274570428e-05, "grad_norm": 16.130456924438477, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8679010272026062, "num_tokens": 300272167.0, "step": 7871 }, { "epoch": 1.0013993130644956, "ewc_loss": 0.025634238496422768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5634239136707038e-05, "grad_norm": 16.12717628479004, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8564419150352478, "num_tokens": 300306032.0, "step": 7872 }, { "epoch": 1.0015265233430861, "ewc_loss": 0.02562705986201763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5627059585531242e-05, "grad_norm": 16.13117218017578, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8610972166061401, "num_tokens": 300338433.0, "step": 7873 }, { "epoch": 1.0016537336216766, "ewc_loss": 0.025617366656661034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5617366190999746e-05, "grad_norm": 16.137046813964844, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8541790246963501, "num_tokens": 300376527.0, "step": 7874 }, { "epoch": 1.0017809439002672, "ewc_loss": 0.025633929297327995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5633929908508435e-05, "grad_norm": 16.051942825317383, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8808068037033081, "num_tokens": 300417309.0, "step": 7875 }, { "epoch": 1.0019081541788577, "ewc_loss": 0.025658568367362022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5658568119979464e-05, "grad_norm": 16.161148071289062, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8661417961120605, "num_tokens": 300459438.0, "step": 7876 }, { "epoch": 1.0020353644574482, "ewc_loss": 0.025642329826951027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.564233000157401e-05, "grad_norm": 15.998801231384277, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8411399126052856, "num_tokens": 300501443.0, "step": 7877 }, { "epoch": 1.0021625747360388, "ewc_loss": 0.025662105530500412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.566210605436936e-05, "grad_norm": 16.181612014770508, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8723101615905762, "num_tokens": 300531918.0, "step": 7878 }, { "epoch": 1.0022897850146293, "ewc_loss": 0.025728320702910423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5728320906637236e-05, "grad_norm": 16.136924743652344, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8586881160736084, "num_tokens": 300575084.0, "step": 7879 }, { "epoch": 1.0024169952932196, "ewc_loss": 0.02566477842628956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.566477814980317e-05, "grad_norm": 16.101442337036133, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8651787638664246, "num_tokens": 300608737.0, "step": 7880 }, { "epoch": 1.0025442055718101, "ewc_loss": 0.02567187137901783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5671872208476998e-05, "grad_norm": 16.087528228759766, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8535536527633667, "num_tokens": 300647855.0, "step": 7881 }, { "epoch": 1.0026714158504006, "ewc_loss": 0.025714099407196045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5714100047480315e-05, "grad_norm": 16.13874053955078, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8649886846542358, "num_tokens": 300689763.0, "step": 7882 }, { "epoch": 1.0027986261289912, "ewc_loss": 0.025681674480438232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5681674742372707e-05, "grad_norm": 16.134601593017578, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.87491774559021, "num_tokens": 300725838.0, "step": 7883 }, { "epoch": 1.0029258364075817, "ewc_loss": 0.025693286210298538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.569328535173554e-05, "grad_norm": 16.168804168701172, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8705068826675415, "num_tokens": 300760423.0, "step": 7884 }, { "epoch": 1.0030530466861722, "ewc_loss": 0.025661280378699303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.566128023318015e-05, "grad_norm": 16.11370849609375, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8711259365081787, "num_tokens": 300796236.0, "step": 7885 }, { "epoch": 1.0031802569647628, "ewc_loss": 0.02568005584180355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.568005584180355e-05, "grad_norm": 16.24493980407715, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8696569204330444, "num_tokens": 300835213.0, "step": 7886 }, { "epoch": 1.0033074672433533, "ewc_loss": 0.025709444656968117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.570944525359664e-05, "grad_norm": 16.134183883666992, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8712531328201294, "num_tokens": 300873363.0, "step": 7887 }, { "epoch": 1.0034346775219438, "ewc_loss": 0.025609303265810013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5609302610973828e-05, "grad_norm": 16.222055435180664, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8600690960884094, "num_tokens": 300910233.0, "step": 7888 }, { "epoch": 1.0035618878005343, "ewc_loss": 0.025738626718521118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5738627300597727e-05, "grad_norm": 16.18498420715332, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8720759749412537, "num_tokens": 300946986.0, "step": 7889 }, { "epoch": 1.0036890980791249, "ewc_loss": 0.02561855874955654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5618559448048472e-05, "grad_norm": 16.156034469604492, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8664450645446777, "num_tokens": 300985750.0, "step": 7890 }, { "epoch": 1.0038163083577154, "ewc_loss": 0.02567787654697895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5677876692498103e-05, "grad_norm": 16.143468856811523, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8636586666107178, "num_tokens": 301032820.0, "step": 7891 }, { "epoch": 1.0039435186363057, "ewc_loss": 0.025589652359485626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.558965206844732e-05, "grad_norm": 16.086137771606445, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8547525405883789, "num_tokens": 301070825.0, "step": 7892 }, { "epoch": 1.0040707289148962, "ewc_loss": 0.025621959939599037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.56219591392437e-05, "grad_norm": 16.095460891723633, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8609201312065125, "num_tokens": 301110764.0, "step": 7893 }, { "epoch": 1.0041979391934868, "ewc_loss": 0.02566019631922245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5660196115495637e-05, "grad_norm": 16.13957977294922, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8678184151649475, "num_tokens": 301152421.0, "step": 7894 }, { "epoch": 1.0043251494720773, "ewc_loss": 0.025655187666416168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5655188437667675e-05, "grad_norm": 16.106250762939453, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8661534786224365, "num_tokens": 301192780.0, "step": 7895 }, { "epoch": 1.0044523597506678, "ewc_loss": 0.025691645219922066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.569164462329354e-05, "grad_norm": 16.255905151367188, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8613946437835693, "num_tokens": 301228147.0, "step": 7896 }, { "epoch": 1.0045795700292584, "ewc_loss": 0.02567683532834053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5676836230559275e-05, "grad_norm": 16.16614532470703, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8745272159576416, "num_tokens": 301259568.0, "step": 7897 }, { "epoch": 1.0047067803078489, "ewc_loss": 0.02558768168091774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.558768210292328e-05, "grad_norm": 16.144624710083008, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8660957217216492, "num_tokens": 301296483.0, "step": 7898 }, { "epoch": 1.0048339905864394, "ewc_loss": 0.025640511885285378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5640511012170464e-05, "grad_norm": 16.094602584838867, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.867665708065033, "num_tokens": 301337437.0, "step": 7899 }, { "epoch": 1.00496120086503, "ewc_loss": 0.025674117729067802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5674116841400973e-05, "grad_norm": 16.18876838684082, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8639123439788818, "num_tokens": 301368777.0, "step": 7900 }, { "epoch": 1.0050884111436205, "ewc_loss": 0.02573605254292488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.573605343059171e-05, "grad_norm": 16.16428565979004, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8612070083618164, "num_tokens": 301404885.0, "step": 7901 }, { "epoch": 1.005215621422211, "ewc_loss": 0.025649577379226685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5649576855357736e-05, "grad_norm": 16.139511108398438, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8554971218109131, "num_tokens": 301445308.0, "step": 7902 }, { "epoch": 1.0053428317008015, "ewc_loss": 0.025642702355980873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5642702894401737e-05, "grad_norm": 16.146129608154297, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8610974550247192, "num_tokens": 301484845.0, "step": 7903 }, { "epoch": 1.0054700419793918, "ewc_loss": 0.025682482868433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.568248237366788e-05, "grad_norm": 16.192886352539062, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8620854020118713, "num_tokens": 301524498.0, "step": 7904 }, { "epoch": 1.0055972522579824, "ewc_loss": 0.025674037635326385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5674036805867217e-05, "grad_norm": 16.125404357910156, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.869719386100769, "num_tokens": 301565522.0, "step": 7905 }, { "epoch": 1.0057244625365729, "ewc_loss": 0.025710800662636757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5710800400702283e-05, "grad_norm": 16.203195571899414, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8607562780380249, "num_tokens": 301603724.0, "step": 7906 }, { "epoch": 1.0058516728151634, "ewc_loss": 0.02570665068924427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5706651285872795e-05, "grad_norm": 16.17275619506836, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.861708402633667, "num_tokens": 301636650.0, "step": 7907 }, { "epoch": 1.005978883093754, "ewc_loss": 0.02568245120346546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.568245145084802e-05, "grad_norm": 16.114410400390625, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8556615114212036, "num_tokens": 301675328.0, "step": 7908 }, { "epoch": 1.0061060933723445, "ewc_loss": 0.025736354291439056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5736353563843295e-05, "grad_norm": 16.2635498046875, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.858483076095581, "num_tokens": 301710058.0, "step": 7909 }, { "epoch": 1.006233303650935, "ewc_loss": 0.025755496695637703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.575549660832621e-05, "grad_norm": 16.138042449951172, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8589625954627991, "num_tokens": 301753567.0, "step": 7910 }, { "epoch": 1.0063605139295255, "ewc_loss": 0.025639450177550316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5639450541348197e-05, "grad_norm": 16.117401123046875, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8718351125717163, "num_tokens": 301792895.0, "step": 7911 }, { "epoch": 1.006487724208116, "ewc_loss": 0.025830140337347984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.583014065749012e-05, "grad_norm": 16.27773666381836, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8759399652481079, "num_tokens": 301833001.0, "step": 7912 }, { "epoch": 1.0066149344867066, "ewc_loss": 0.0257380623370409, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5738061594893225e-05, "grad_norm": 16.11145782470703, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8678131103515625, "num_tokens": 301871623.0, "step": 7913 }, { "epoch": 1.006742144765297, "ewc_loss": 0.025704899802803993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.570489959907718e-05, "grad_norm": 16.21489143371582, "learning_rate": 1e-06, "loss": 0.3822, "mean_token_accuracy": 0.8757295608520508, "num_tokens": 301903750.0, "step": 7914 }, { "epoch": 1.0068693550438876, "ewc_loss": 0.025794684886932373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5794684916036204e-05, "grad_norm": 16.221996307373047, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8658548593521118, "num_tokens": 301943079.0, "step": 7915 }, { "epoch": 1.0069965653224782, "ewc_loss": 0.025691315531730652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.56913153862115e-05, "grad_norm": 16.129867553710938, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8690601587295532, "num_tokens": 301980031.0, "step": 7916 }, { "epoch": 1.0071237756010685, "ewc_loss": 0.025748800486326218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.574880090833176e-05, "grad_norm": 16.25481414794922, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8645501732826233, "num_tokens": 302020454.0, "step": 7917 }, { "epoch": 1.007250985879659, "ewc_loss": 0.025763679295778275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.576367842266336e-05, "grad_norm": 16.12246322631836, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8595470786094666, "num_tokens": 302064391.0, "step": 7918 }, { "epoch": 1.0073781961582495, "ewc_loss": 0.025696752592921257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5696752345538698e-05, "grad_norm": 16.179889678955078, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8647594451904297, "num_tokens": 302101598.0, "step": 7919 }, { "epoch": 1.00750540643684, "ewc_loss": 0.025792710483074188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5792711312533356e-05, "grad_norm": 16.15987205505371, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8692572116851807, "num_tokens": 302138803.0, "step": 7920 }, { "epoch": 1.0076326167154306, "ewc_loss": 0.02570263110101223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.570263131929096e-05, "grad_norm": 16.250062942504883, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8620731830596924, "num_tokens": 302173682.0, "step": 7921 }, { "epoch": 1.0077598269940211, "ewc_loss": 0.02576863393187523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.576863334979862e-05, "grad_norm": 16.147138595581055, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8613022565841675, "num_tokens": 302207390.0, "step": 7922 }, { "epoch": 1.0078870372726116, "ewc_loss": 0.02566969394683838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.566969305917155e-05, "grad_norm": 16.16617202758789, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8700634241104126, "num_tokens": 302244440.0, "step": 7923 }, { "epoch": 1.0080142475512022, "ewc_loss": 0.025736412033438683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.573641177150421e-05, "grad_norm": 16.14733123779297, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8595492839813232, "num_tokens": 302281058.0, "step": 7924 }, { "epoch": 1.0081414578297927, "ewc_loss": 0.025694632902741432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5694633222883567e-05, "grad_norm": 16.17730140686035, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.8810422420501709, "num_tokens": 302319586.0, "step": 7925 }, { "epoch": 1.0082686681083832, "ewc_loss": 0.025740643963217735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.574064455984626e-05, "grad_norm": 16.205127716064453, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.85906583070755, "num_tokens": 302361356.0, "step": 7926 }, { "epoch": 1.0083958783869738, "ewc_loss": 0.025709200650453568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5709201509016566e-05, "grad_norm": 16.159095764160156, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.8735563158988953, "num_tokens": 302399974.0, "step": 7927 }, { "epoch": 1.0085230886655643, "ewc_loss": 0.025709863752126694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5709863621159457e-05, "grad_norm": 16.157224655151367, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8672625422477722, "num_tokens": 302435709.0, "step": 7928 }, { "epoch": 1.0086502989441546, "ewc_loss": 0.02572333812713623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5723338694660924e-05, "grad_norm": 16.11520767211914, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8614623546600342, "num_tokens": 302474151.0, "step": 7929 }, { "epoch": 1.0087775092227451, "ewc_loss": 0.02576146274805069, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5761462893569842e-05, "grad_norm": 16.201431274414062, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8664637804031372, "num_tokens": 302509877.0, "step": 7930 }, { "epoch": 1.0089047195013356, "ewc_loss": 0.025759823620319366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5759823984117247e-05, "grad_norm": 16.16110610961914, "learning_rate": 1e-06, "loss": 0.3662, "mean_token_accuracy": 0.8784071207046509, "num_tokens": 302547689.0, "step": 7931 }, { "epoch": 1.0090319297799262, "ewc_loss": 0.025710012763738632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5710012778290547e-05, "grad_norm": 16.115991592407227, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8629106283187866, "num_tokens": 302590728.0, "step": 7932 }, { "epoch": 1.0091591400585167, "ewc_loss": 0.02575034275650978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5750343411345966e-05, "grad_norm": 16.112783432006836, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8423402905464172, "num_tokens": 302625709.0, "step": 7933 }, { "epoch": 1.0092863503371072, "ewc_loss": 0.02578980289399624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5789802748477086e-05, "grad_norm": 16.247220993041992, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.873493492603302, "num_tokens": 302662086.0, "step": 7934 }, { "epoch": 1.0094135606156978, "ewc_loss": 0.025843368843197823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5843368348432705e-05, "grad_norm": 16.153331756591797, "learning_rate": 1e-06, "loss": 0.3679, "mean_token_accuracy": 0.8814665079116821, "num_tokens": 302703680.0, "step": 7935 }, { "epoch": 1.0095407708942883, "ewc_loss": 0.025795958936214447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5795958208618686e-05, "grad_norm": 16.25075912475586, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8512643575668335, "num_tokens": 302740426.0, "step": 7936 }, { "epoch": 1.0096679811728788, "ewc_loss": 0.02581264264881611, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5812641979428008e-05, "grad_norm": 16.188631057739258, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.877130389213562, "num_tokens": 302780778.0, "step": 7937 }, { "epoch": 1.0097951914514693, "ewc_loss": 0.025757575407624245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5757575713214464e-05, "grad_norm": 16.182899475097656, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8682371377944946, "num_tokens": 302816012.0, "step": 7938 }, { "epoch": 1.0099224017300599, "ewc_loss": 0.025764206424355507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.576420592959039e-05, "grad_norm": 16.16258430480957, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8429815769195557, "num_tokens": 302858325.0, "step": 7939 }, { "epoch": 1.0100496120086504, "ewc_loss": 0.02575564943253994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.575564940343611e-05, "grad_norm": 16.13903045654297, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.865287184715271, "num_tokens": 302898210.0, "step": 7940 }, { "epoch": 1.0101768222872407, "ewc_loss": 0.025785719975829124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5785719117266126e-05, "grad_norm": 16.180316925048828, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8654553294181824, "num_tokens": 302936072.0, "step": 7941 }, { "epoch": 1.0103040325658312, "ewc_loss": 0.02578776888549328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5787769118323922e-05, "grad_norm": 16.118804931640625, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8564819693565369, "num_tokens": 302978941.0, "step": 7942 }, { "epoch": 1.0104312428444218, "ewc_loss": 0.02574162930250168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5741628633113578e-05, "grad_norm": 16.2126407623291, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8581501245498657, "num_tokens": 303016664.0, "step": 7943 }, { "epoch": 1.0105584531230123, "ewc_loss": 0.02582106739282608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5821067538345233e-05, "grad_norm": 16.161819458007812, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8616795539855957, "num_tokens": 303063313.0, "step": 7944 }, { "epoch": 1.0106856634016028, "ewc_loss": 0.025723189115524292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5723189537529834e-05, "grad_norm": 16.220096588134766, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.854944109916687, "num_tokens": 303103786.0, "step": 7945 }, { "epoch": 1.0108128736801933, "ewc_loss": 0.02575867436826229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5758674382814206e-05, "grad_norm": 16.205629348754883, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.869613766670227, "num_tokens": 303136928.0, "step": 7946 }, { "epoch": 1.0109400839587839, "ewc_loss": 0.025732753798365593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5732753783813678e-05, "grad_norm": 16.136573791503906, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.871575117111206, "num_tokens": 303174854.0, "step": 7947 }, { "epoch": 1.0110672942373744, "ewc_loss": 0.025774894282221794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5774894311325625e-05, "grad_norm": 16.16280174255371, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.862704873085022, "num_tokens": 303214234.0, "step": 7948 }, { "epoch": 1.011194504515965, "ewc_loss": 0.02581252157688141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5812521926127374e-05, "grad_norm": 16.20446014404297, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8778589367866516, "num_tokens": 303251830.0, "step": 7949 }, { "epoch": 1.0113217147945555, "ewc_loss": 0.025745978578925133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5745977836777456e-05, "grad_norm": 16.135229110717773, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8544647693634033, "num_tokens": 303291564.0, "step": 7950 }, { "epoch": 1.011448925073146, "ewc_loss": 0.02575944922864437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5759449272300117e-05, "grad_norm": 16.182714462280273, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8621881008148193, "num_tokens": 303328642.0, "step": 7951 }, { "epoch": 1.0115761353517365, "ewc_loss": 0.02579728700220585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5797287889872678e-05, "grad_norm": 16.153745651245117, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8622969388961792, "num_tokens": 303373693.0, "step": 7952 }, { "epoch": 1.0117033456303268, "ewc_loss": 0.025768928229808807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5768928026081994e-05, "grad_norm": 16.272621154785156, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8690183162689209, "num_tokens": 303407124.0, "step": 7953 }, { "epoch": 1.0118305559089174, "ewc_loss": 0.025841446593403816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5841445676633157e-05, "grad_norm": 16.1899471282959, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8753790855407715, "num_tokens": 303441586.0, "step": 7954 }, { "epoch": 1.0119577661875079, "ewc_loss": 0.025716731324791908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5716732125147246e-05, "grad_norm": 16.227500915527344, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8662916421890259, "num_tokens": 303473593.0, "step": 7955 }, { "epoch": 1.0120849764660984, "ewc_loss": 0.025798944756388664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5798944989219308e-05, "grad_norm": 16.22127914428711, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8613311648368835, "num_tokens": 303510239.0, "step": 7956 }, { "epoch": 1.012212186744689, "ewc_loss": 0.02575269155204296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5752691726665944e-05, "grad_norm": 16.1878719329834, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8624502420425415, "num_tokens": 303544709.0, "step": 7957 }, { "epoch": 1.0123393970232795, "ewc_loss": 0.025784043595194817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.578404382802546e-05, "grad_norm": 16.0893497467041, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8736377954483032, "num_tokens": 303581990.0, "step": 7958 }, { "epoch": 1.01246660730187, "ewc_loss": 0.025784090161323547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.578408930276055e-05, "grad_norm": 16.183805465698242, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8681355118751526, "num_tokens": 303625167.0, "step": 7959 }, { "epoch": 1.0125938175804605, "ewc_loss": 0.0258293766528368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.582937668194063e-05, "grad_norm": 16.134437561035156, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8658504486083984, "num_tokens": 303664751.0, "step": 7960 }, { "epoch": 1.012721027859051, "ewc_loss": 0.025799011811614037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.579901229182724e-05, "grad_norm": 16.205698013305664, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.862790584564209, "num_tokens": 303702190.0, "step": 7961 }, { "epoch": 1.0128482381376416, "ewc_loss": 0.02586614340543747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5866143914754502e-05, "grad_norm": 16.132516860961914, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.879027247428894, "num_tokens": 303742872.0, "step": 7962 }, { "epoch": 1.012975448416232, "ewc_loss": 0.025779010728001595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.577901068434585e-05, "grad_norm": 16.22573471069336, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8578084707260132, "num_tokens": 303781265.0, "step": 7963 }, { "epoch": 1.0131026586948226, "ewc_loss": 0.02585827186703682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5858271328615956e-05, "grad_norm": 16.135156631469727, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.865169882774353, "num_tokens": 303820686.0, "step": 7964 }, { "epoch": 1.0132298689734132, "ewc_loss": 0.025833209976553917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.58332092926139e-05, "grad_norm": 16.197757720947266, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8679484128952026, "num_tokens": 303863428.0, "step": 7965 }, { "epoch": 1.0133570792520035, "ewc_loss": 0.025816889479756355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5816889319685288e-05, "grad_norm": 16.118488311767578, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8603485226631165, "num_tokens": 303903847.0, "step": 7966 }, { "epoch": 1.013484289530594, "ewc_loss": 0.025813696905970573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5813696993282065e-05, "grad_norm": 16.147682189941406, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.857598066329956, "num_tokens": 303943083.0, "step": 7967 }, { "epoch": 1.0136114998091845, "ewc_loss": 0.025844965130090714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5844965421129018e-05, "grad_norm": 16.15383529663086, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8745853900909424, "num_tokens": 303978773.0, "step": 7968 }, { "epoch": 1.013738710087775, "ewc_loss": 0.025800447911024094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5800447474466637e-05, "grad_norm": 16.16896629333496, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8722867965698242, "num_tokens": 304017058.0, "step": 7969 }, { "epoch": 1.0138659203663656, "ewc_loss": 0.025775182992219925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.577518353064079e-05, "grad_norm": 16.146385192871094, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8566190004348755, "num_tokens": 304054104.0, "step": 7970 }, { "epoch": 1.013993130644956, "ewc_loss": 0.025793738663196564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.579373904154636e-05, "grad_norm": 16.101470947265625, "learning_rate": 1e-06, "loss": 0.369, "mean_token_accuracy": 0.8792567253112793, "num_tokens": 304091100.0, "step": 7971 }, { "epoch": 1.0141203409235466, "ewc_loss": 0.02579963393509388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5799634386203252e-05, "grad_norm": 16.196369171142578, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8659806847572327, "num_tokens": 304127747.0, "step": 7972 }, { "epoch": 1.0142475512021372, "ewc_loss": 0.025810658931732178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5810659280978143e-05, "grad_norm": 16.17107582092285, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8653401136398315, "num_tokens": 304171089.0, "step": 7973 }, { "epoch": 1.0143747614807277, "ewc_loss": 0.02577555552124977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5775554604479112e-05, "grad_norm": 16.156993865966797, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8560193181037903, "num_tokens": 304210207.0, "step": 7974 }, { "epoch": 1.0145019717593182, "ewc_loss": 0.025735557079315186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5735556846484542e-05, "grad_norm": 16.17498207092285, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8824375867843628, "num_tokens": 304249423.0, "step": 7975 }, { "epoch": 1.0146291820379088, "ewc_loss": 0.025758618488907814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5758617994142696e-05, "grad_norm": 16.199962615966797, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8699307441711426, "num_tokens": 304286872.0, "step": 7976 }, { "epoch": 1.0147563923164993, "ewc_loss": 0.025754626840353012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5754627131391317e-05, "grad_norm": 16.20629119873047, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.868545413017273, "num_tokens": 304323033.0, "step": 7977 }, { "epoch": 1.0148836025950896, "ewc_loss": 0.025728849694132805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.572885023255367e-05, "grad_norm": 16.215518951416016, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8681203126907349, "num_tokens": 304360096.0, "step": 7978 }, { "epoch": 1.0150108128736801, "ewc_loss": 0.025756126269698143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.575612597865984e-05, "grad_norm": 16.198211669921875, "learning_rate": 1e-06, "loss": 0.349, "mean_token_accuracy": 0.8855676651000977, "num_tokens": 304396364.0, "step": 7979 }, { "epoch": 1.0151380231522706, "ewc_loss": 0.025767778977751732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5767778424778953e-05, "grad_norm": 16.24150276184082, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8664262890815735, "num_tokens": 304431583.0, "step": 7980 }, { "epoch": 1.0152652334308612, "ewc_loss": 0.025746874511241913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5746874598553404e-05, "grad_norm": 16.251577377319336, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8593635559082031, "num_tokens": 304470531.0, "step": 7981 }, { "epoch": 1.0153924437094517, "ewc_loss": 0.025721950456500053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.572195080574602e-05, "grad_norm": 16.14236831665039, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8695584535598755, "num_tokens": 304505187.0, "step": 7982 }, { "epoch": 1.0155196539880422, "ewc_loss": 0.025731241330504417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.573124220361933e-05, "grad_norm": 16.21662139892578, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8434381484985352, "num_tokens": 304542685.0, "step": 7983 }, { "epoch": 1.0156468642666328, "ewc_loss": 0.02573912777006626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5739127522683702e-05, "grad_norm": 16.124120712280273, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8651003837585449, "num_tokens": 304579204.0, "step": 7984 }, { "epoch": 1.0157740745452233, "ewc_loss": 0.025705572217702866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5705572625156492e-05, "grad_norm": 16.106548309326172, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8727870583534241, "num_tokens": 304615224.0, "step": 7985 }, { "epoch": 1.0159012848238138, "ewc_loss": 0.02575930394232273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5759303753147833e-05, "grad_norm": 16.142169952392578, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8603225946426392, "num_tokens": 304653513.0, "step": 7986 }, { "epoch": 1.0160284951024043, "ewc_loss": 0.0258161798119545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5816179913817905e-05, "grad_norm": 16.201223373413086, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8704835176467896, "num_tokens": 304686744.0, "step": 7987 }, { "epoch": 1.0161557053809949, "ewc_loss": 0.025782978162169456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5782977900234982e-05, "grad_norm": 16.1718807220459, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8648126125335693, "num_tokens": 304727262.0, "step": 7988 }, { "epoch": 1.0162829156595854, "ewc_loss": 0.02580428682267666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5804287361097522e-05, "grad_norm": 16.149768829345703, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8627451658248901, "num_tokens": 304763418.0, "step": 7989 }, { "epoch": 1.0164101259381757, "ewc_loss": 0.02579559199512005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5795592591748573e-05, "grad_norm": 16.179933547973633, "learning_rate": 1e-06, "loss": 0.3995, "mean_token_accuracy": 0.8693867921829224, "num_tokens": 304800763.0, "step": 7990 }, { "epoch": 1.0165373362167662, "ewc_loss": 0.025905607268214226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.590560688986443e-05, "grad_norm": 16.229644775390625, "learning_rate": 1e-06, "loss": 0.3537, "mean_token_accuracy": 0.8846265077590942, "num_tokens": 304833136.0, "step": 7991 }, { "epoch": 1.0166645464953568, "ewc_loss": 0.02580871805548668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.580871841928456e-05, "grad_norm": 16.211185455322266, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8747672438621521, "num_tokens": 304873208.0, "step": 7992 }, { "epoch": 1.0167917567739473, "ewc_loss": 0.02581806480884552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.581806438683998e-05, "grad_norm": 16.154672622680664, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8660373687744141, "num_tokens": 304914237.0, "step": 7993 }, { "epoch": 1.0169189670525378, "ewc_loss": 0.025816669687628746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.581666922196746e-05, "grad_norm": 16.242616653442383, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8811753392219543, "num_tokens": 304955453.0, "step": 7994 }, { "epoch": 1.0170461773311283, "ewc_loss": 0.025855910032987595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5855910280370153e-05, "grad_norm": 16.130903244018555, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8652136921882629, "num_tokens": 304990533.0, "step": 7995 }, { "epoch": 1.0171733876097189, "ewc_loss": 0.02580302208662033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5803021344472654e-05, "grad_norm": 16.27635955810547, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8644174933433533, "num_tokens": 305024849.0, "step": 7996 }, { "epoch": 1.0173005978883094, "ewc_loss": 0.025847958400845528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.584795765869785e-05, "grad_norm": 16.14157485961914, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8623155951499939, "num_tokens": 305061615.0, "step": 7997 }, { "epoch": 1.0174278081669, "ewc_loss": 0.025808321312069893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5808321879594587e-05, "grad_norm": 16.245845794677734, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8465508222579956, "num_tokens": 305102251.0, "step": 7998 }, { "epoch": 1.0175550184454905, "ewc_loss": 0.025873737409710884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5873738195514306e-05, "grad_norm": 16.22230339050293, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8744324445724487, "num_tokens": 305135405.0, "step": 7999 }, { "epoch": 1.017682228724081, "ewc_loss": 0.025768492370843887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5768493287614547e-05, "grad_norm": 16.197399139404297, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8645156025886536, "num_tokens": 305169005.0, "step": 8000 }, { "epoch": 1.0178094390026715, "ewc_loss": 0.025830790400505066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5830790036707185e-05, "grad_norm": 16.177234649658203, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8616408109664917, "num_tokens": 305209050.0, "step": 8001 }, { "epoch": 1.0179366492812618, "ewc_loss": 0.02579301968216896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.579302054073196e-05, "grad_norm": 16.218847274780273, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8655850291252136, "num_tokens": 305238250.0, "step": 8002 }, { "epoch": 1.0180638595598523, "ewc_loss": 0.025924114510416985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.592411510704551e-05, "grad_norm": 16.264633178710938, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.869084358215332, "num_tokens": 305283544.0, "step": 8003 }, { "epoch": 1.0181910698384429, "ewc_loss": 0.025823960080742836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.582395973149687e-05, "grad_norm": 16.22494888305664, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8485527634620667, "num_tokens": 305320548.0, "step": 8004 }, { "epoch": 1.0183182801170334, "ewc_loss": 0.02583962306380272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5839623049250804e-05, "grad_norm": 16.162260055541992, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8717935085296631, "num_tokens": 305359560.0, "step": 8005 }, { "epoch": 1.018445490395624, "ewc_loss": 0.025855490937829018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5855490093817934e-05, "grad_norm": 16.212329864501953, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8629195094108582, "num_tokens": 305396680.0, "step": 8006 }, { "epoch": 1.0185727006742145, "ewc_loss": 0.025875231251120567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5875231585814618e-05, "grad_norm": 16.21067237854004, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8750633597373962, "num_tokens": 305432856.0, "step": 8007 }, { "epoch": 1.018699910952805, "ewc_loss": 0.0258277989923954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5827799618127756e-05, "grad_norm": 16.221982955932617, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8675808906555176, "num_tokens": 305469973.0, "step": 8008 }, { "epoch": 1.0188271212313955, "ewc_loss": 0.02587701752781868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.58770178334089e-05, "grad_norm": 16.191017150878906, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8719812035560608, "num_tokens": 305505576.0, "step": 8009 }, { "epoch": 1.018954331509986, "ewc_loss": 0.025839565321803093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.583956484158989e-05, "grad_norm": 16.22749137878418, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8480609059333801, "num_tokens": 305542037.0, "step": 8010 }, { "epoch": 1.0190815417885766, "ewc_loss": 0.025883108377456665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.588310780993197e-05, "grad_norm": 16.271760940551758, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8734608888626099, "num_tokens": 305587784.0, "step": 8011 }, { "epoch": 1.019208752067167, "ewc_loss": 0.025767773389816284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5767772967810743e-05, "grad_norm": 16.16280746459961, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8586808443069458, "num_tokens": 305621888.0, "step": 8012 }, { "epoch": 1.0193359623457576, "ewc_loss": 0.025801997631788254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5801997253438458e-05, "grad_norm": 16.14336585998535, "learning_rate": 1e-06, "loss": 0.3603, "mean_token_accuracy": 0.8854944705963135, "num_tokens": 305663844.0, "step": 8013 }, { "epoch": 1.0194631726243482, "ewc_loss": 0.025841403752565384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5841403839876875e-05, "grad_norm": 16.189254760742188, "learning_rate": 1e-06, "loss": 0.3638, "mean_token_accuracy": 0.8813321590423584, "num_tokens": 305695913.0, "step": 8014 }, { "epoch": 1.0195903829029385, "ewc_loss": 0.025830579921603203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5830579033936374e-05, "grad_norm": 16.173465728759766, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8645738363265991, "num_tokens": 305733632.0, "step": 8015 }, { "epoch": 1.019717593181529, "ewc_loss": 0.025844737887382507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5844738047453575e-05, "grad_norm": 16.230606079101562, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8509944677352905, "num_tokens": 305766917.0, "step": 8016 }, { "epoch": 1.0198448034601195, "ewc_loss": 0.02583308331668377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5833083782345057e-05, "grad_norm": 16.14014434814453, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8601086139678955, "num_tokens": 305805768.0, "step": 8017 }, { "epoch": 1.01997201373871, "ewc_loss": 0.025813423097133636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.581342232588213e-05, "grad_norm": 16.242076873779297, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.86670982837677, "num_tokens": 305843412.0, "step": 8018 }, { "epoch": 1.0200992240173006, "ewc_loss": 0.025865981355309486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5865982024697587e-05, "grad_norm": 16.205625534057617, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8810651898384094, "num_tokens": 305880523.0, "step": 8019 }, { "epoch": 1.020226434295891, "ewc_loss": 0.025818105787038803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.581810622359626e-05, "grad_norm": 16.218122482299805, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8684924840927124, "num_tokens": 305915379.0, "step": 8020 }, { "epoch": 1.0203536445744816, "ewc_loss": 0.02587834745645523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.587834751466289e-05, "grad_norm": 16.21611213684082, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8799659013748169, "num_tokens": 305952471.0, "step": 8021 }, { "epoch": 1.0204808548530722, "ewc_loss": 0.02583574317395687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.583574314485304e-05, "grad_norm": 16.14204216003418, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8656191825866699, "num_tokens": 305994023.0, "step": 8022 }, { "epoch": 1.0206080651316627, "ewc_loss": 0.025821812450885773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5821813324000686e-05, "grad_norm": 16.252073287963867, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8713303804397583, "num_tokens": 306026563.0, "step": 8023 }, { "epoch": 1.0207352754102532, "ewc_loss": 0.025898173451423645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5898172680172138e-05, "grad_norm": 16.185380935668945, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8629274368286133, "num_tokens": 306067913.0, "step": 8024 }, { "epoch": 1.0208624856888437, "ewc_loss": 0.025782467797398567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.578246858320199e-05, "grad_norm": 16.18181037902832, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8628782629966736, "num_tokens": 306108692.0, "step": 8025 }, { "epoch": 1.0209896959674343, "ewc_loss": 0.02584931254386902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5849312805803493e-05, "grad_norm": 16.1664981842041, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8667941689491272, "num_tokens": 306148082.0, "step": 8026 }, { "epoch": 1.0211169062460246, "ewc_loss": 0.0258607417345047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.586074151622597e-05, "grad_norm": 16.201152801513672, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8735907077789307, "num_tokens": 306182800.0, "step": 8027 }, { "epoch": 1.021244116524615, "ewc_loss": 0.025849396362900734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5849396479316056e-05, "grad_norm": 16.183874130249023, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8631982207298279, "num_tokens": 306226741.0, "step": 8028 }, { "epoch": 1.0213713268032056, "ewc_loss": 0.02581893838942051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.581893750175368e-05, "grad_norm": 16.12868309020996, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8640819191932678, "num_tokens": 306265993.0, "step": 8029 }, { "epoch": 1.0214985370817962, "ewc_loss": 0.025846676900982857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5846677090157755e-05, "grad_norm": 16.305078506469727, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8709055185317993, "num_tokens": 306306511.0, "step": 8030 }, { "epoch": 1.0216257473603867, "ewc_loss": 0.025904420763254166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5904420908773318e-05, "grad_norm": 16.258459091186523, "learning_rate": 1e-06, "loss": 0.3639, "mean_token_accuracy": 0.8803716897964478, "num_tokens": 306337687.0, "step": 8031 }, { "epoch": 1.0217529576389772, "ewc_loss": 0.025851136073470116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5851135433185846e-05, "grad_norm": 16.2247371673584, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8314833641052246, "num_tokens": 306379646.0, "step": 8032 }, { "epoch": 1.0218801679175677, "ewc_loss": 0.02583228424191475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5832283427007496e-05, "grad_norm": 16.238231658935547, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8660696148872375, "num_tokens": 306415389.0, "step": 8033 }, { "epoch": 1.0220073781961583, "ewc_loss": 0.025850214064121246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5850213205558248e-05, "grad_norm": 16.213764190673828, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8609026074409485, "num_tokens": 306452035.0, "step": 8034 }, { "epoch": 1.0221345884747488, "ewc_loss": 0.025815993547439575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5815994376898743e-05, "grad_norm": 16.188060760498047, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8655087947845459, "num_tokens": 306496277.0, "step": 8035 }, { "epoch": 1.0222617987533393, "ewc_loss": 0.025797320529818535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.579732063168194e-05, "grad_norm": 16.16229248046875, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8709653615951538, "num_tokens": 306528212.0, "step": 8036 }, { "epoch": 1.0223890090319299, "ewc_loss": 0.02585163153707981, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5851632017293014e-05, "grad_norm": 16.18608283996582, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8700035810470581, "num_tokens": 306562047.0, "step": 8037 }, { "epoch": 1.0225162193105204, "ewc_loss": 0.025825457647442818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.582545675977599e-05, "grad_norm": 16.164888381958008, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8566908836364746, "num_tokens": 306601298.0, "step": 8038 }, { "epoch": 1.0226434295891107, "ewc_loss": 0.025870006531476974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.587000562925823e-05, "grad_norm": 16.187137603759766, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.865963339805603, "num_tokens": 306640961.0, "step": 8039 }, { "epoch": 1.0227706398677012, "ewc_loss": 0.025819385424256325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5819384973146953e-05, "grad_norm": 16.242340087890625, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8846170902252197, "num_tokens": 306679507.0, "step": 8040 }, { "epoch": 1.0228978501462918, "ewc_loss": 0.02589266188442707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5892661142279394e-05, "grad_norm": 16.29328727722168, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8571727275848389, "num_tokens": 306717337.0, "step": 8041 }, { "epoch": 1.0230250604248823, "ewc_loss": 0.025839541107416153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5839541194727644e-05, "grad_norm": 16.196544647216797, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.840070366859436, "num_tokens": 306754152.0, "step": 8042 }, { "epoch": 1.0231522707034728, "ewc_loss": 0.025804441422224045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5804441975196823e-05, "grad_norm": 16.18389320373535, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8589812517166138, "num_tokens": 306794887.0, "step": 8043 }, { "epoch": 1.0232794809820633, "ewc_loss": 0.025843268260359764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.584326830401551e-05, "grad_norm": 16.187955856323242, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8698569536209106, "num_tokens": 306828374.0, "step": 8044 }, { "epoch": 1.0234066912606539, "ewc_loss": 0.025856636464595795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5856636057142168e-05, "grad_norm": 16.261720657348633, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8694930076599121, "num_tokens": 306865285.0, "step": 8045 }, { "epoch": 1.0235339015392444, "ewc_loss": 0.025856591761112213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5856592401396483e-05, "grad_norm": 16.16474723815918, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8720420598983765, "num_tokens": 306897195.0, "step": 8046 }, { "epoch": 1.023661111817835, "ewc_loss": 0.025862207636237144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.586220762168523e-05, "grad_norm": 16.33722496032715, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8644901514053345, "num_tokens": 306935352.0, "step": 8047 }, { "epoch": 1.0237883220964255, "ewc_loss": 0.025909632444381714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5909632313414477e-05, "grad_norm": 16.300474166870117, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8720351457595825, "num_tokens": 306964621.0, "step": 8048 }, { "epoch": 1.023915532375016, "ewc_loss": 0.025820834562182426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.582083470770158e-05, "grad_norm": 16.21979331970215, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8644684553146362, "num_tokens": 307003451.0, "step": 8049 }, { "epoch": 1.0240427426536065, "ewc_loss": 0.025838498026132584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5838498913799413e-05, "grad_norm": 16.233823776245117, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8664363622665405, "num_tokens": 307044695.0, "step": 8050 }, { "epoch": 1.0241699529321968, "ewc_loss": 0.02582845836877823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.582845809229184e-05, "grad_norm": 16.186826705932617, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8725284337997437, "num_tokens": 307080603.0, "step": 8051 }, { "epoch": 1.0242971632107873, "ewc_loss": 0.025819318369030952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.581931767053902e-05, "grad_norm": 16.20805549621582, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8596752882003784, "num_tokens": 307122300.0, "step": 8052 }, { "epoch": 1.0244243734893779, "ewc_loss": 0.02585436776280403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5854367777355947e-05, "grad_norm": 16.158079147338867, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8716065883636475, "num_tokens": 307158685.0, "step": 8053 }, { "epoch": 1.0245515837679684, "ewc_loss": 0.025891929864883423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.589192990853917e-05, "grad_norm": 16.25787353515625, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8718622922897339, "num_tokens": 307199104.0, "step": 8054 }, { "epoch": 1.024678794046559, "ewc_loss": 0.02586890384554863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.586890332167968e-05, "grad_norm": 16.17014503479004, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8579652309417725, "num_tokens": 307235840.0, "step": 8055 }, { "epoch": 1.0248060043251495, "ewc_loss": 0.025829656049609184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5829656806308776e-05, "grad_norm": 16.2226619720459, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8553394675254822, "num_tokens": 307273089.0, "step": 8056 }, { "epoch": 1.02493321460374, "ewc_loss": 0.025909889489412308, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5909888790920377e-05, "grad_norm": 16.194042205810547, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8667848110198975, "num_tokens": 307318825.0, "step": 8057 }, { "epoch": 1.0250604248823305, "ewc_loss": 0.025882432237267494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5882432964863256e-05, "grad_norm": 16.20415496826172, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8726426362991333, "num_tokens": 307357006.0, "step": 8058 }, { "epoch": 1.025187635160921, "ewc_loss": 0.025903871282935143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5903871573973447e-05, "grad_norm": 16.270959854125977, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8644968867301941, "num_tokens": 307390274.0, "step": 8059 }, { "epoch": 1.0253148454395116, "ewc_loss": 0.025906596332788467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.590659642009996e-05, "grad_norm": 16.122888565063477, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8747256398200989, "num_tokens": 307427157.0, "step": 8060 }, { "epoch": 1.025442055718102, "ewc_loss": 0.025862347334623337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5862347683869302e-05, "grad_norm": 16.25840950012207, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.863726019859314, "num_tokens": 307466555.0, "step": 8061 }, { "epoch": 1.0255692659966926, "ewc_loss": 0.025957975536584854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5957975594792515e-05, "grad_norm": 16.194974899291992, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8586255311965942, "num_tokens": 307503375.0, "step": 8062 }, { "epoch": 1.0256964762752832, "ewc_loss": 0.025903644040226936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5903644200298004e-05, "grad_norm": 16.188434600830078, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8755836486816406, "num_tokens": 307543170.0, "step": 8063 }, { "epoch": 1.0258236865538735, "ewc_loss": 0.025950821116566658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.595082150946837e-05, "grad_norm": 16.1927433013916, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8627465963363647, "num_tokens": 307582890.0, "step": 8064 }, { "epoch": 1.025950896832464, "ewc_loss": 0.02591819316148758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5918192477547564e-05, "grad_norm": 16.180063247680664, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8584548234939575, "num_tokens": 307617643.0, "step": 8065 }, { "epoch": 1.0260781071110545, "ewc_loss": 0.025935422629117966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.593542194517795e-05, "grad_norm": 16.23639488220215, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8631339073181152, "num_tokens": 307652258.0, "step": 8066 }, { "epoch": 1.026205317389645, "ewc_loss": 0.025899147614836693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.589914765849244e-05, "grad_norm": 16.181787490844727, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8616838455200195, "num_tokens": 307693245.0, "step": 8067 }, { "epoch": 1.0263325276682356, "ewc_loss": 0.025878936052322388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.587893686722964e-05, "grad_norm": 16.16434097290039, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8689181804656982, "num_tokens": 307728751.0, "step": 8068 }, { "epoch": 1.026459737946826, "ewc_loss": 0.025899402797222137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5899402317008935e-05, "grad_norm": 16.236387252807617, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8550131916999817, "num_tokens": 307767869.0, "step": 8069 }, { "epoch": 1.0265869482254166, "ewc_loss": 0.025937428697943687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.593742829049006e-05, "grad_norm": 16.17108154296875, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.87534099817276, "num_tokens": 307809438.0, "step": 8070 }, { "epoch": 1.0267141585040072, "ewc_loss": 0.025892464444041252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.589246469142381e-05, "grad_norm": 16.255647659301758, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8460922241210938, "num_tokens": 307848752.0, "step": 8071 }, { "epoch": 1.0268413687825977, "ewc_loss": 0.025921396911144257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.592139753687661e-05, "grad_norm": 16.14870834350586, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8608811497688293, "num_tokens": 307886080.0, "step": 8072 }, { "epoch": 1.0269685790611882, "ewc_loss": 0.02585538662970066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5855386411421932e-05, "grad_norm": 16.247331619262695, "learning_rate": 1e-06, "loss": 0.3707, "mean_token_accuracy": 0.8790919780731201, "num_tokens": 307922854.0, "step": 8073 }, { "epoch": 1.0270957893397787, "ewc_loss": 0.025933852419257164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.593385215732269e-05, "grad_norm": 16.161291122436523, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.863400936126709, "num_tokens": 307960032.0, "step": 8074 }, { "epoch": 1.0272229996183693, "ewc_loss": 0.02583700604736805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.58370055234991e-05, "grad_norm": 16.15624237060547, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8654683232307434, "num_tokens": 307996074.0, "step": 8075 }, { "epoch": 1.0273502098969596, "ewc_loss": 0.025936339050531387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5936338715837337e-05, "grad_norm": 16.16502571105957, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.8781247138977051, "num_tokens": 308030833.0, "step": 8076 }, { "epoch": 1.02747742017555, "ewc_loss": 0.025949349626898766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.59493499470409e-05, "grad_norm": 16.201416015625, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8759304881095886, "num_tokens": 308079620.0, "step": 8077 }, { "epoch": 1.0276046304541406, "ewc_loss": 0.02595207467675209, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5952074793167412e-05, "grad_norm": 16.184402465820312, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8601035475730896, "num_tokens": 308118555.0, "step": 8078 }, { "epoch": 1.0277318407327312, "ewc_loss": 0.025953393429517746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5953393560484983e-05, "grad_norm": 16.306299209594727, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8488799333572388, "num_tokens": 308154410.0, "step": 8079 }, { "epoch": 1.0278590510113217, "ewc_loss": 0.02592162787914276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5921628548530862e-05, "grad_norm": 16.157651901245117, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8668431639671326, "num_tokens": 308191990.0, "step": 8080 }, { "epoch": 1.0279862612899122, "ewc_loss": 0.025874044746160507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5874045604723506e-05, "grad_norm": 16.255924224853516, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8458917140960693, "num_tokens": 308231047.0, "step": 8081 }, { "epoch": 1.0281134715685027, "ewc_loss": 0.025961901992559433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.596190279291477e-05, "grad_norm": 16.18628692626953, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8792093396186829, "num_tokens": 308269487.0, "step": 8082 }, { "epoch": 1.0282406818470933, "ewc_loss": 0.025894014164805412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5894014470395632e-05, "grad_norm": 16.20899772644043, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.865982174873352, "num_tokens": 308304974.0, "step": 8083 }, { "epoch": 1.0283678921256838, "ewc_loss": 0.02595851942896843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5958519472624175e-05, "grad_norm": 16.17728614807129, "learning_rate": 1e-06, "loss": 0.3492, "mean_token_accuracy": 0.8872803449630737, "num_tokens": 308342072.0, "step": 8084 }, { "epoch": 1.0284951024042743, "ewc_loss": 0.02591915801167488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5919158360920846e-05, "grad_norm": 16.230323791503906, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8619895577430725, "num_tokens": 308374293.0, "step": 8085 }, { "epoch": 1.0286223126828649, "ewc_loss": 0.02597329579293728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5973295123549178e-05, "grad_norm": 16.139371871948242, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8709477782249451, "num_tokens": 308415872.0, "step": 8086 }, { "epoch": 1.0287495229614554, "ewc_loss": 0.025939183309674263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5939183615264483e-05, "grad_norm": 16.251399993896484, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8593959212303162, "num_tokens": 308451198.0, "step": 8087 }, { "epoch": 1.0288767332400457, "ewc_loss": 0.026020240038633347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.602023960207589e-05, "grad_norm": 16.161800384521484, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8734415173530579, "num_tokens": 308490554.0, "step": 8088 }, { "epoch": 1.0290039435186362, "ewc_loss": 0.025984227657318115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.598422724986449e-05, "grad_norm": 16.245267868041992, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8603554368019104, "num_tokens": 308530221.0, "step": 8089 }, { "epoch": 1.0291311537972267, "ewc_loss": 0.02598925493657589, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.598925493657589e-05, "grad_norm": 16.211994171142578, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8507064580917358, "num_tokens": 308563634.0, "step": 8090 }, { "epoch": 1.0292583640758173, "ewc_loss": 0.02596382424235344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5963823645724915e-05, "grad_norm": 16.23965072631836, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8662809729576111, "num_tokens": 308605033.0, "step": 8091 }, { "epoch": 1.0293855743544078, "ewc_loss": 0.025926562026143074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5926561647793278e-05, "grad_norm": 16.241744995117188, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8657633662223816, "num_tokens": 308642603.0, "step": 8092 }, { "epoch": 1.0295127846329983, "ewc_loss": 0.025971239432692528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.597123966552317e-05, "grad_norm": 16.184091567993164, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8627505302429199, "num_tokens": 308685189.0, "step": 8093 }, { "epoch": 1.0296399949115889, "ewc_loss": 0.025955380871891975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.595538171590306e-05, "grad_norm": 16.226980209350586, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8652194142341614, "num_tokens": 308726052.0, "step": 8094 }, { "epoch": 1.0297672051901794, "ewc_loss": 0.02594439871609211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.594439865788445e-05, "grad_norm": 16.150344848632812, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8543235063552856, "num_tokens": 308764828.0, "step": 8095 }, { "epoch": 1.02989441546877, "ewc_loss": 0.025978658348321915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.597865750431083e-05, "grad_norm": 16.30683135986328, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8763331770896912, "num_tokens": 308799712.0, "step": 8096 }, { "epoch": 1.0300216257473604, "ewc_loss": 0.02598843351006508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5988432753365487e-05, "grad_norm": 16.205595016479492, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8656037449836731, "num_tokens": 308839879.0, "step": 8097 }, { "epoch": 1.030148836025951, "ewc_loss": 0.025917263701558113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.591726297396235e-05, "grad_norm": 16.27839469909668, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8587797284126282, "num_tokens": 308880362.0, "step": 8098 }, { "epoch": 1.0302760463045415, "ewc_loss": 0.026008974760770798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.600897460069973e-05, "grad_norm": 16.187896728515625, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8761063814163208, "num_tokens": 308921395.0, "step": 8099 }, { "epoch": 1.0304032565831318, "ewc_loss": 0.025884727016091347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5884726710501127e-05, "grad_norm": 16.23240089416504, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8746752142906189, "num_tokens": 308959976.0, "step": 8100 }, { "epoch": 1.0305304668617223, "ewc_loss": 0.025972427800297737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5972427465603687e-05, "grad_norm": 16.303882598876953, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8790040612220764, "num_tokens": 309000702.0, "step": 8101 }, { "epoch": 1.0306576771403129, "ewc_loss": 0.025847261771559715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5847260985756293e-05, "grad_norm": 16.203094482421875, "learning_rate": 1e-06, "loss": 0.3574, "mean_token_accuracy": 0.8840155601501465, "num_tokens": 309036540.0, "step": 8102 }, { "epoch": 1.0307848874189034, "ewc_loss": 0.025885362178087234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5885361537802964e-05, "grad_norm": 16.263193130493164, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8648464679718018, "num_tokens": 309072408.0, "step": 8103 }, { "epoch": 1.030912097697494, "ewc_loss": 0.02595577947795391, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5955780074582435e-05, "grad_norm": 16.19668197631836, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.853265106678009, "num_tokens": 309110366.0, "step": 8104 }, { "epoch": 1.0310393079760845, "ewc_loss": 0.025876550003886223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.587655035313219e-05, "grad_norm": 16.200389862060547, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8688628077507019, "num_tokens": 309155194.0, "step": 8105 }, { "epoch": 1.031166518254675, "ewc_loss": 0.025903277099132538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5903276764438488e-05, "grad_norm": 16.348478317260742, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8745423555374146, "num_tokens": 309193747.0, "step": 8106 }, { "epoch": 1.0312937285332655, "ewc_loss": 0.02587444894015789, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5874449420371093e-05, "grad_norm": 16.21291732788086, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8541375398635864, "num_tokens": 309234161.0, "step": 8107 }, { "epoch": 1.031420938811856, "ewc_loss": 0.025821171700954437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5821171220741235e-05, "grad_norm": 16.24173927307129, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8555194139480591, "num_tokens": 309272574.0, "step": 8108 }, { "epoch": 1.0315481490904466, "ewc_loss": 0.025923246517777443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5923245630110614e-05, "grad_norm": 16.328487396240234, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.868023693561554, "num_tokens": 309306567.0, "step": 8109 }, { "epoch": 1.031675359369037, "ewc_loss": 0.025860249996185303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5860250389087014e-05, "grad_norm": 16.195886611938477, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8702616095542908, "num_tokens": 309347218.0, "step": 8110 }, { "epoch": 1.0318025696476276, "ewc_loss": 0.025891022756695747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.58910222328268e-05, "grad_norm": 16.286670684814453, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.8800989389419556, "num_tokens": 309385701.0, "step": 8111 }, { "epoch": 1.0319297799262181, "ewc_loss": 0.025884408503770828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5884408387355506e-05, "grad_norm": 16.24030303955078, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8640592694282532, "num_tokens": 309424600.0, "step": 8112 }, { "epoch": 1.0320569902048085, "ewc_loss": 0.02579818107187748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.579818101366982e-05, "grad_norm": 16.262041091918945, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8446991443634033, "num_tokens": 309459929.0, "step": 8113 }, { "epoch": 1.032184200483399, "ewc_loss": 0.02590997889637947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5909979740390554e-05, "grad_norm": 16.200119018554688, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8587613105773926, "num_tokens": 309505085.0, "step": 8114 }, { "epoch": 1.0323114107619895, "ewc_loss": 0.025848055258393288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5848055884125642e-05, "grad_norm": 16.253923416137695, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8682509660720825, "num_tokens": 309544746.0, "step": 8115 }, { "epoch": 1.03243862104058, "ewc_loss": 0.025950083509087563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.595008299977053e-05, "grad_norm": 16.449121475219727, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8533819317817688, "num_tokens": 309584301.0, "step": 8116 }, { "epoch": 1.0325658313191706, "ewc_loss": 0.02593490108847618, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5934901714208536e-05, "grad_norm": 16.362377166748047, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8694846034049988, "num_tokens": 309622570.0, "step": 8117 }, { "epoch": 1.032693041597761, "ewc_loss": 0.025808267295360565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.580826730991248e-05, "grad_norm": 16.199058532714844, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8501455783843994, "num_tokens": 309666417.0, "step": 8118 }, { "epoch": 1.0328202518763516, "ewc_loss": 0.02579258568584919, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.579258580226451e-05, "grad_norm": 16.319385528564453, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8566856384277344, "num_tokens": 309705706.0, "step": 8119 }, { "epoch": 1.0329474621549422, "ewc_loss": 0.025826679542660713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.582667912065517e-05, "grad_norm": 16.30517578125, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.874945878982544, "num_tokens": 309744288.0, "step": 8120 }, { "epoch": 1.0330746724335327, "ewc_loss": 0.025782929733395576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.578293060651049e-05, "grad_norm": 16.376731872558594, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8525437712669373, "num_tokens": 309780406.0, "step": 8121 }, { "epoch": 1.0332018827121232, "ewc_loss": 0.025847269222140312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.584727008070331e-05, "grad_norm": 16.42083740234375, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8337404131889343, "num_tokens": 309811135.0, "step": 8122 }, { "epoch": 1.0333290929907137, "ewc_loss": 0.025831319391727448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5831319362623617e-05, "grad_norm": 16.314788818359375, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8696853518486023, "num_tokens": 309847037.0, "step": 8123 }, { "epoch": 1.0334563032693043, "ewc_loss": 0.025774534791707993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5774534151423723e-05, "grad_norm": 16.370201110839844, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8659393787384033, "num_tokens": 309884274.0, "step": 8124 }, { "epoch": 1.0335835135478946, "ewc_loss": 0.025881608948111534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.588160896266345e-05, "grad_norm": 16.34162139892578, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8776772022247314, "num_tokens": 309915572.0, "step": 8125 }, { "epoch": 1.033710723826485, "ewc_loss": 0.02579617314040661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5796172849368304e-05, "grad_norm": 16.386465072631836, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8702294826507568, "num_tokens": 309954502.0, "step": 8126 }, { "epoch": 1.0338379341050756, "ewc_loss": 0.02584752067923546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5847521101241e-05, "grad_norm": 16.256093978881836, "learning_rate": 1e-06, "loss": 0.3491, "mean_token_accuracy": 0.8869354128837585, "num_tokens": 309993015.0, "step": 8127 }, { "epoch": 1.0339651443836662, "ewc_loss": 0.02579575777053833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5795758119784296e-05, "grad_norm": 16.353469848632812, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8568425178527832, "num_tokens": 310030840.0, "step": 8128 }, { "epoch": 1.0340923546622567, "ewc_loss": 0.025898052379488945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5898052626871504e-05, "grad_norm": 16.287700653076172, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8660787343978882, "num_tokens": 310067586.0, "step": 8129 }, { "epoch": 1.0342195649408472, "ewc_loss": 0.02578233741223812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5782337615964934e-05, "grad_norm": 16.24382209777832, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8742226362228394, "num_tokens": 310104496.0, "step": 8130 }, { "epoch": 1.0343467752194377, "ewc_loss": 0.025878824293613434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.587882408988662e-05, "grad_norm": 16.34294319152832, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.848834216594696, "num_tokens": 310141162.0, "step": 8131 }, { "epoch": 1.0344739854980283, "ewc_loss": 0.02589491195976734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.589491123217158e-05, "grad_norm": 16.260231018066406, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8637047410011292, "num_tokens": 310182965.0, "step": 8132 }, { "epoch": 1.0346011957766188, "ewc_loss": 0.025840669870376587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5840670787147246e-05, "grad_norm": 16.333942413330078, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8487255573272705, "num_tokens": 310219572.0, "step": 8133 }, { "epoch": 1.0347284060552093, "ewc_loss": 0.025891117751598358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5891118639265187e-05, "grad_norm": 16.265954971313477, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8542218208312988, "num_tokens": 310248711.0, "step": 8134 }, { "epoch": 1.0348556163337999, "ewc_loss": 0.025847887620329857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5847888537100516e-05, "grad_norm": 16.25381851196289, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8490802049636841, "num_tokens": 310287915.0, "step": 8135 }, { "epoch": 1.0349828266123904, "ewc_loss": 0.025936763733625412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5936764359357767e-05, "grad_norm": 16.304473876953125, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8641159534454346, "num_tokens": 310325619.0, "step": 8136 }, { "epoch": 1.0351100368909807, "ewc_loss": 0.02596951089799404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5969511625589803e-05, "grad_norm": 16.28157615661621, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8729199171066284, "num_tokens": 310362456.0, "step": 8137 }, { "epoch": 1.0352372471695712, "ewc_loss": 0.02590201236307621, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5902012566803023e-05, "grad_norm": 16.268951416015625, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8603360056877136, "num_tokens": 310399584.0, "step": 8138 }, { "epoch": 1.0353644574481617, "ewc_loss": 0.025921670719981194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5921670385287143e-05, "grad_norm": 16.28936004638672, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8588047027587891, "num_tokens": 310437435.0, "step": 8139 }, { "epoch": 1.0354916677267523, "ewc_loss": 0.025942090898752213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.594209036033135e-05, "grad_norm": 16.283279418945312, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8710687756538391, "num_tokens": 310476320.0, "step": 8140 }, { "epoch": 1.0356188780053428, "ewc_loss": 0.02596886269748211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5968862246372737e-05, "grad_norm": 16.304773330688477, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8815895915031433, "num_tokens": 310512956.0, "step": 8141 }, { "epoch": 1.0357460882839333, "ewc_loss": 0.025942467153072357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5942466891137883e-05, "grad_norm": 16.269683837890625, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8643255829811096, "num_tokens": 310547605.0, "step": 8142 }, { "epoch": 1.0358732985625239, "ewc_loss": 0.025970831513404846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5970832211896777e-05, "grad_norm": 16.32093048095703, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8634964823722839, "num_tokens": 310586991.0, "step": 8143 }, { "epoch": 1.0360005088411144, "ewc_loss": 0.025981128215789795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5981127691920847e-05, "grad_norm": 16.29822540283203, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8647984266281128, "num_tokens": 310621271.0, "step": 8144 }, { "epoch": 1.036127719119705, "ewc_loss": 0.02598975971341133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5989758796640672e-05, "grad_norm": 16.293062210083008, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8681974411010742, "num_tokens": 310662154.0, "step": 8145 }, { "epoch": 1.0362549293982954, "ewc_loss": 0.0260003674775362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.600036714284215e-05, "grad_norm": 16.28302574157715, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8759201765060425, "num_tokens": 310701924.0, "step": 8146 }, { "epoch": 1.036382139676886, "ewc_loss": 0.025996942073106766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5996941985795274e-05, "grad_norm": 16.30891227722168, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8723400831222534, "num_tokens": 310739688.0, "step": 8147 }, { "epoch": 1.0365093499554765, "ewc_loss": 0.025998029857873917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5998029741458595e-05, "grad_norm": 16.267595291137695, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8594595193862915, "num_tokens": 310773317.0, "step": 8148 }, { "epoch": 1.0366365602340668, "ewc_loss": 0.025978274643421173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5978275516536087e-05, "grad_norm": 16.29845428466797, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8642950057983398, "num_tokens": 310810854.0, "step": 8149 }, { "epoch": 1.0367637705126573, "ewc_loss": 0.025996007025241852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5996007025241852e-05, "grad_norm": 16.28587532043457, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8589549660682678, "num_tokens": 310846080.0, "step": 8150 }, { "epoch": 1.0368909807912479, "ewc_loss": 0.025952404364943504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5952404030249454e-05, "grad_norm": 16.222421646118164, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8424380421638489, "num_tokens": 310885032.0, "step": 8151 }, { "epoch": 1.0370181910698384, "ewc_loss": 0.02597663551568985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.597663478809409e-05, "grad_norm": 16.20708465576172, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.870740532875061, "num_tokens": 310924518.0, "step": 8152 }, { "epoch": 1.037145401348429, "ewc_loss": 0.02603834867477417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6038349460577592e-05, "grad_norm": 16.342166900634766, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8676921725273132, "num_tokens": 310961734.0, "step": 8153 }, { "epoch": 1.0372726116270194, "ewc_loss": 0.02603803016245365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6038029318442568e-05, "grad_norm": 16.232891082763672, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8676262497901917, "num_tokens": 311004541.0, "step": 8154 }, { "epoch": 1.03739982190561, "ewc_loss": 0.025985335931181908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5985335014411248e-05, "grad_norm": 16.344541549682617, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8739681243896484, "num_tokens": 311043947.0, "step": 8155 }, { "epoch": 1.0375270321842005, "ewc_loss": 0.026019658893346786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.601965934445616e-05, "grad_norm": 16.24698829650879, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8609174489974976, "num_tokens": 311083154.0, "step": 8156 }, { "epoch": 1.037654242462791, "ewc_loss": 0.025975340977311134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5975341486628167e-05, "grad_norm": 16.336130142211914, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8757002949714661, "num_tokens": 311121429.0, "step": 8157 }, { "epoch": 1.0377814527413816, "ewc_loss": 0.02601815015077591, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6018149583251216e-05, "grad_norm": 16.244800567626953, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8502844572067261, "num_tokens": 311166578.0, "step": 8158 }, { "epoch": 1.037908663019972, "ewc_loss": 0.025939861312508583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5939862098312005e-05, "grad_norm": 16.20047378540039, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.867284893989563, "num_tokens": 311209645.0, "step": 8159 }, { "epoch": 1.0380358732985626, "ewc_loss": 0.026029109954833984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6029110813396983e-05, "grad_norm": 16.278789520263672, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.869425356388092, "num_tokens": 311248428.0, "step": 8160 }, { "epoch": 1.0381630835771531, "ewc_loss": 0.025967998430132866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5967998226406053e-05, "grad_norm": 16.231782913208008, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8664110898971558, "num_tokens": 311282040.0, "step": 8161 }, { "epoch": 1.0382902938557435, "ewc_loss": 0.025983665138483047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5983665182138793e-05, "grad_norm": 16.315866470336914, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8723288774490356, "num_tokens": 311320233.0, "step": 8162 }, { "epoch": 1.038417504134334, "ewc_loss": 0.02596631646156311, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5966315661207773e-05, "grad_norm": 16.231891632080078, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8503900766372681, "num_tokens": 311357586.0, "step": 8163 }, { "epoch": 1.0385447144129245, "ewc_loss": 0.02591482549905777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.59148255281616e-05, "grad_norm": 16.241668701171875, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8649529814720154, "num_tokens": 311395545.0, "step": 8164 }, { "epoch": 1.038671924691515, "ewc_loss": 0.02597457356750965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.597457387309987e-05, "grad_norm": 16.372976303100586, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8649778962135315, "num_tokens": 311428785.0, "step": 8165 }, { "epoch": 1.0387991349701056, "ewc_loss": 0.026011506095528603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6011506633949466e-05, "grad_norm": 16.27151870727539, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8538047075271606, "num_tokens": 311469546.0, "step": 8166 }, { "epoch": 1.038926345248696, "ewc_loss": 0.02591913379728794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5919132895069197e-05, "grad_norm": 16.348318099975586, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8577533960342407, "num_tokens": 311504064.0, "step": 8167 }, { "epoch": 1.0390535555272866, "ewc_loss": 0.025960765779018402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5960765924537554e-05, "grad_norm": 16.278993606567383, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8485430479049683, "num_tokens": 311536600.0, "step": 8168 }, { "epoch": 1.0391807658058771, "ewc_loss": 0.025933807715773582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5933808501577005e-05, "grad_norm": 16.34787368774414, "learning_rate": 1e-06, "loss": 0.3556, "mean_token_accuracy": 0.8814257979393005, "num_tokens": 311573638.0, "step": 8169 }, { "epoch": 1.0393079760844677, "ewc_loss": 0.02593870274722576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5938703402061947e-05, "grad_norm": 16.262462615966797, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8573574423789978, "num_tokens": 311611085.0, "step": 8170 }, { "epoch": 1.0394351863630582, "ewc_loss": 0.025908757001161575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.590875737951137e-05, "grad_norm": 16.279340744018555, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8590292930603027, "num_tokens": 311655497.0, "step": 8171 }, { "epoch": 1.0395623966416487, "ewc_loss": 0.025975728407502174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5975728931371123e-05, "grad_norm": 16.393308639526367, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8502293825149536, "num_tokens": 311688167.0, "step": 8172 }, { "epoch": 1.0396896069202393, "ewc_loss": 0.025910066440701485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5910067051881924e-05, "grad_norm": 16.23727798461914, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8653707504272461, "num_tokens": 311724926.0, "step": 8173 }, { "epoch": 1.0398168171988296, "ewc_loss": 0.025966985151171684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5966985049308278e-05, "grad_norm": 16.410112380981445, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8463176488876343, "num_tokens": 311760729.0, "step": 8174 }, { "epoch": 1.03994402747742, "ewc_loss": 0.025991294533014297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5991294023697264e-05, "grad_norm": 16.173032760620117, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8655396699905396, "num_tokens": 311802138.0, "step": 8175 }, { "epoch": 1.0400712377560106, "ewc_loss": 0.02593357488512993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.593357567093335e-05, "grad_norm": 16.28314208984375, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8723963499069214, "num_tokens": 311838510.0, "step": 8176 }, { "epoch": 1.0401984480346012, "ewc_loss": 0.026049556210637093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6049556254292838e-05, "grad_norm": 16.261024475097656, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8579063415527344, "num_tokens": 311870875.0, "step": 8177 }, { "epoch": 1.0403256583131917, "ewc_loss": 0.025950489565730095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5950490453396924e-05, "grad_norm": 16.22911834716797, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.8768188953399658, "num_tokens": 311905854.0, "step": 8178 }, { "epoch": 1.0404528685917822, "ewc_loss": 0.02601173333823681, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.601173400762491e-05, "grad_norm": 16.22833251953125, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8619749546051025, "num_tokens": 311936381.0, "step": 8179 }, { "epoch": 1.0405800788703727, "ewc_loss": 0.026049509644508362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6049508960568346e-05, "grad_norm": 16.297821044921875, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8586577773094177, "num_tokens": 311972529.0, "step": 8180 }, { "epoch": 1.0407072891489633, "ewc_loss": 0.026100685819983482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.610068622743711e-05, "grad_norm": 16.285751342773438, "learning_rate": 1e-06, "loss": 0.3588, "mean_token_accuracy": 0.8831517696380615, "num_tokens": 312007883.0, "step": 8181 }, { "epoch": 1.0408344994275538, "ewc_loss": 0.026081927120685577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.608192698971834e-05, "grad_norm": 16.280824661254883, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8603286743164062, "num_tokens": 312041000.0, "step": 8182 }, { "epoch": 1.0409617097061443, "ewc_loss": 0.026066288352012634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6066289137816057e-05, "grad_norm": 16.28160858154297, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8687160611152649, "num_tokens": 312076011.0, "step": 8183 }, { "epoch": 1.0410889199847349, "ewc_loss": 0.026056790724396706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.605679037515074e-05, "grad_norm": 16.174942016601562, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8589218258857727, "num_tokens": 312115614.0, "step": 8184 }, { "epoch": 1.0412161302633254, "ewc_loss": 0.02611120417714119, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.611120362416841e-05, "grad_norm": 16.295988082885742, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8583135008811951, "num_tokens": 312154111.0, "step": 8185 }, { "epoch": 1.0413433405419157, "ewc_loss": 0.026108529418706894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.61085297097452e-05, "grad_norm": 16.19413948059082, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8595178127288818, "num_tokens": 312191571.0, "step": 8186 }, { "epoch": 1.0414705508205062, "ewc_loss": 0.026154829189181328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.615482844703365e-05, "grad_norm": 16.345829010009766, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8569821715354919, "num_tokens": 312229205.0, "step": 8187 }, { "epoch": 1.0415977610990967, "ewc_loss": 0.026144111528992653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.614411096146796e-05, "grad_norm": 16.219303131103516, "learning_rate": 1e-06, "loss": 0.3743, "mean_token_accuracy": 0.8794045448303223, "num_tokens": 312266604.0, "step": 8188 }, { "epoch": 1.0417249713776873, "ewc_loss": 0.02609250508248806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.609250441309996e-05, "grad_norm": 16.2579402923584, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8552652597427368, "num_tokens": 312305222.0, "step": 8189 }, { "epoch": 1.0418521816562778, "ewc_loss": 0.026127131655812263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6127132514375262e-05, "grad_norm": 16.265962600708008, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8433769941329956, "num_tokens": 312345823.0, "step": 8190 }, { "epoch": 1.0419793919348683, "ewc_loss": 0.02610708586871624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6107085432158783e-05, "grad_norm": 16.28525161743164, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8559356331825256, "num_tokens": 312383569.0, "step": 8191 }, { "epoch": 1.0421066022134589, "ewc_loss": 0.02613419108092785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.613419019326102e-05, "grad_norm": 16.228904724121094, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8680281639099121, "num_tokens": 312423068.0, "step": 8192 }, { "epoch": 1.0422338124920494, "ewc_loss": 0.02610783651471138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6107836674782448e-05, "grad_norm": 16.297887802124023, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8593549728393555, "num_tokens": 312460569.0, "step": 8193 }, { "epoch": 1.04236102277064, "ewc_loss": 0.026119021698832512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.611902164062485e-05, "grad_norm": 16.295595169067383, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8434798717498779, "num_tokens": 312501654.0, "step": 8194 }, { "epoch": 1.0424882330492304, "ewc_loss": 0.0260801762342453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.608017712191213e-05, "grad_norm": 16.192909240722656, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8560055494308472, "num_tokens": 312543630.0, "step": 8195 }, { "epoch": 1.042615443327821, "ewc_loss": 0.02609895169734955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6098950911546126e-05, "grad_norm": 16.294246673583984, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8625210523605347, "num_tokens": 312579428.0, "step": 8196 }, { "epoch": 1.0427426536064115, "ewc_loss": 0.02615041844546795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6150419216719456e-05, "grad_norm": 16.295379638671875, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8792203664779663, "num_tokens": 312621388.0, "step": 8197 }, { "epoch": 1.0428698638850018, "ewc_loss": 0.026082491502165794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.608249087643344e-05, "grad_norm": 16.312179565429688, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8647141456604004, "num_tokens": 312665303.0, "step": 8198 }, { "epoch": 1.0429970741635923, "ewc_loss": 0.026079364120960236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6079364033648744e-05, "grad_norm": 16.300857543945312, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8538086414337158, "num_tokens": 312702354.0, "step": 8199 }, { "epoch": 1.0431242844421829, "ewc_loss": 0.026025615632534027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6025614715763368e-05, "grad_norm": 16.310218811035156, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8695272207260132, "num_tokens": 312737223.0, "step": 8200 }, { "epoch": 1.0432514947207734, "ewc_loss": 0.02604737877845764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6047378923976794e-05, "grad_norm": 16.306209564208984, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8651241660118103, "num_tokens": 312773353.0, "step": 8201 }, { "epoch": 1.043378704999364, "ewc_loss": 0.026009151712059975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6009151042671874e-05, "grad_norm": 16.27476692199707, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8464211225509644, "num_tokens": 312809230.0, "step": 8202 }, { "epoch": 1.0435059152779544, "ewc_loss": 0.0260060653090477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.600606603664346e-05, "grad_norm": 16.33234405517578, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.882082462310791, "num_tokens": 312841278.0, "step": 8203 }, { "epoch": 1.043633125556545, "ewc_loss": 0.026008589193224907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.600858897494618e-05, "grad_norm": 16.214988708496094, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8689091205596924, "num_tokens": 312888469.0, "step": 8204 }, { "epoch": 1.0437603358351355, "ewc_loss": 0.02596992999315262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.596992999315262e-05, "grad_norm": 16.251352310180664, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.871286153793335, "num_tokens": 312925446.0, "step": 8205 }, { "epoch": 1.043887546113726, "ewc_loss": 0.026071181520819664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6071182219311595e-05, "grad_norm": 16.318784713745117, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8636564016342163, "num_tokens": 312965425.0, "step": 8206 }, { "epoch": 1.0440147563923166, "ewc_loss": 0.026016950607299805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.601695086923428e-05, "grad_norm": 16.196653366088867, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8589901924133301, "num_tokens": 313000637.0, "step": 8207 }, { "epoch": 1.044141966670907, "ewc_loss": 0.02600080519914627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6000805519288406e-05, "grad_norm": 16.339374542236328, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8654012680053711, "num_tokens": 313035986.0, "step": 8208 }, { "epoch": 1.0442691769494976, "ewc_loss": 0.026057660579681396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6057659852085635e-05, "grad_norm": 16.263439178466797, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8539553880691528, "num_tokens": 313077027.0, "step": 8209 }, { "epoch": 1.0443963872280881, "ewc_loss": 0.025977326557040215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5977326004067436e-05, "grad_norm": 16.21546745300293, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8750830292701721, "num_tokens": 313121733.0, "step": 8210 }, { "epoch": 1.0445235975066784, "ewc_loss": 0.026075826957821846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6075826099258848e-05, "grad_norm": 16.288848876953125, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8600106239318848, "num_tokens": 313165542.0, "step": 8211 }, { "epoch": 1.044650807785269, "ewc_loss": 0.02602866291999817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.602866334200371e-05, "grad_norm": 16.352514266967773, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8537376523017883, "num_tokens": 313201082.0, "step": 8212 }, { "epoch": 1.0447780180638595, "ewc_loss": 0.026028553023934364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6028552383650094e-05, "grad_norm": 16.291515350341797, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8757914304733276, "num_tokens": 313241765.0, "step": 8213 }, { "epoch": 1.04490522834245, "ewc_loss": 0.026042575016617775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.604257497296203e-05, "grad_norm": 16.356369018554688, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8629686236381531, "num_tokens": 313282471.0, "step": 8214 }, { "epoch": 1.0450324386210406, "ewc_loss": 0.02600269205868244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6002691811299883e-05, "grad_norm": 16.234094619750977, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8597097396850586, "num_tokens": 313331505.0, "step": 8215 }, { "epoch": 1.045159648899631, "ewc_loss": 0.025960126891732216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.596012745925691e-05, "grad_norm": 16.323562622070312, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8648322820663452, "num_tokens": 313369823.0, "step": 8216 }, { "epoch": 1.0452868591782216, "ewc_loss": 0.026059461757540703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.605946247058455e-05, "grad_norm": 16.346363067626953, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.865540623664856, "num_tokens": 313404255.0, "step": 8217 }, { "epoch": 1.0454140694568121, "ewc_loss": 0.025979725643992424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5979725251090713e-05, "grad_norm": 16.333147048950195, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8749420642852783, "num_tokens": 313439368.0, "step": 8218 }, { "epoch": 1.0455412797354027, "ewc_loss": 0.026042595505714417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6042594981845468e-05, "grad_norm": 16.361135482788086, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8564675450325012, "num_tokens": 313479475.0, "step": 8219 }, { "epoch": 1.0456684900139932, "ewc_loss": 0.025963710620999336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5963710868381895e-05, "grad_norm": 16.34929084777832, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8552273511886597, "num_tokens": 313514604.0, "step": 8220 }, { "epoch": 1.0457957002925837, "ewc_loss": 0.02596084587275982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.596084596007131e-05, "grad_norm": 16.355609893798828, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8598591089248657, "num_tokens": 313548907.0, "step": 8221 }, { "epoch": 1.0459229105711743, "ewc_loss": 0.025954866781830788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5954866941901855e-05, "grad_norm": 16.366174697875977, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8635269403457642, "num_tokens": 313586615.0, "step": 8222 }, { "epoch": 1.0460501208497646, "ewc_loss": 0.025950796902179718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.595079604361672e-05, "grad_norm": 16.27174949645996, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8553228378295898, "num_tokens": 313625202.0, "step": 8223 }, { "epoch": 1.046177331128355, "ewc_loss": 0.025957148522138596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.59571479546139e-05, "grad_norm": 16.353437423706055, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8571760654449463, "num_tokens": 313663257.0, "step": 8224 }, { "epoch": 1.0463045414069456, "ewc_loss": 0.02601989544928074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.601989581307862e-05, "grad_norm": 16.287220001220703, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8685082197189331, "num_tokens": 313701430.0, "step": 8225 }, { "epoch": 1.0464317516855361, "ewc_loss": 0.025946078822016716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5946079404093325e-05, "grad_norm": 16.34918212890625, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8749334216117859, "num_tokens": 313741286.0, "step": 8226 }, { "epoch": 1.0465589619641267, "ewc_loss": 0.025970343500375748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5970342903747223e-05, "grad_norm": 16.298551559448242, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8492934703826904, "num_tokens": 313779115.0, "step": 8227 }, { "epoch": 1.0466861722427172, "ewc_loss": 0.02593425288796425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.593425233499147e-05, "grad_norm": 16.328535079956055, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8689848184585571, "num_tokens": 313822331.0, "step": 8228 }, { "epoch": 1.0468133825213077, "ewc_loss": 0.025984255596995354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5984256353694946e-05, "grad_norm": 16.321514129638672, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8585285544395447, "num_tokens": 313863545.0, "step": 8229 }, { "epoch": 1.0469405927998983, "ewc_loss": 0.02597074769437313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5970748538384214e-05, "grad_norm": 16.41281509399414, "learning_rate": 1e-06, "loss": 0.3587, "mean_token_accuracy": 0.8849086761474609, "num_tokens": 313894956.0, "step": 8230 }, { "epoch": 1.0470678030784888, "ewc_loss": 0.02600404992699623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.600405059638433e-05, "grad_norm": 16.312652587890625, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8726596236228943, "num_tokens": 313927920.0, "step": 8231 }, { "epoch": 1.0471950133570793, "ewc_loss": 0.025917043909430504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5917044695233926e-05, "grad_norm": 16.3055419921875, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8726862668991089, "num_tokens": 313963230.0, "step": 8232 }, { "epoch": 1.0473222236356698, "ewc_loss": 0.02600092999637127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6000929210567847e-05, "grad_norm": 16.38295555114746, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8567513227462769, "num_tokens": 313995951.0, "step": 8233 }, { "epoch": 1.0474494339142604, "ewc_loss": 0.025977980345487595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5977980840252712e-05, "grad_norm": 16.30294418334961, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.865288496017456, "num_tokens": 314040928.0, "step": 8234 }, { "epoch": 1.0475766441928507, "ewc_loss": 0.025982897728681564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5982897568610497e-05, "grad_norm": 16.343185424804688, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8560754656791687, "num_tokens": 314080329.0, "step": 8235 }, { "epoch": 1.0477038544714412, "ewc_loss": 0.025994133204221725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.59941334661562e-05, "grad_norm": 16.35860252380371, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8653603792190552, "num_tokens": 314117107.0, "step": 8236 }, { "epoch": 1.0478310647500317, "ewc_loss": 0.02596508339047432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5965084205381572e-05, "grad_norm": 16.33303451538086, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8623818159103394, "num_tokens": 314148077.0, "step": 8237 }, { "epoch": 1.0479582750286223, "ewc_loss": 0.025978242978453636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5978242774726823e-05, "grad_norm": 16.268003463745117, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8560555577278137, "num_tokens": 314182064.0, "step": 8238 }, { "epoch": 1.0480854853072128, "ewc_loss": 0.026001399382948875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6001398509833962e-05, "grad_norm": 16.32550811767578, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8561223745346069, "num_tokens": 314221667.0, "step": 8239 }, { "epoch": 1.0482126955858033, "ewc_loss": 0.026076752692461014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6076751964865252e-05, "grad_norm": 16.332162857055664, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8610354661941528, "num_tokens": 314255225.0, "step": 8240 }, { "epoch": 1.0483399058643939, "ewc_loss": 0.026088787242770195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6088786398759112e-05, "grad_norm": 16.371442794799805, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8641042709350586, "num_tokens": 314288011.0, "step": 8241 }, { "epoch": 1.0484671161429844, "ewc_loss": 0.02605433203279972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.605433292046655e-05, "grad_norm": 16.284530639648438, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8616666793823242, "num_tokens": 314332461.0, "step": 8242 }, { "epoch": 1.048594326421575, "ewc_loss": 0.026076454669237137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6076455469592474e-05, "grad_norm": 16.319446563720703, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8599358797073364, "num_tokens": 314367967.0, "step": 8243 }, { "epoch": 1.0487215367001654, "ewc_loss": 0.026106376200914383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.61063760262914e-05, "grad_norm": 16.36192512512207, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8735802173614502, "num_tokens": 314404487.0, "step": 8244 }, { "epoch": 1.048848746978756, "ewc_loss": 0.026116913184523582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.611691343190614e-05, "grad_norm": 16.324647903442383, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8614230155944824, "num_tokens": 314444731.0, "step": 8245 }, { "epoch": 1.0489759572573465, "ewc_loss": 0.026102902367711067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6102901756530628e-05, "grad_norm": 16.341176986694336, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8647076487541199, "num_tokens": 314479720.0, "step": 8246 }, { "epoch": 1.0491031675359368, "ewc_loss": 0.026068031787872314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6068031729664654e-05, "grad_norm": 16.324382781982422, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8814987540245056, "num_tokens": 314515613.0, "step": 8247 }, { "epoch": 1.0492303778145273, "ewc_loss": 0.02611655369400978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.611655327200424e-05, "grad_norm": 16.265335083007812, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8637034893035889, "num_tokens": 314556519.0, "step": 8248 }, { "epoch": 1.0493575880931179, "ewc_loss": 0.026158450171351433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.615845005493611e-05, "grad_norm": 16.439128875732422, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8744074106216431, "num_tokens": 314590803.0, "step": 8249 }, { "epoch": 1.0494847983717084, "ewc_loss": 0.026177851483225822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.617785139591433e-05, "grad_norm": 16.404129028320312, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8655691146850586, "num_tokens": 314631347.0, "step": 8250 }, { "epoch": 1.049612008650299, "ewc_loss": 0.026056382805109024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6056382921524346e-05, "grad_norm": 16.315702438354492, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8646275401115417, "num_tokens": 314673739.0, "step": 8251 }, { "epoch": 1.0497392189288894, "ewc_loss": 0.026087600737810135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6087600417668e-05, "grad_norm": 16.323463439941406, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8643028736114502, "num_tokens": 314718498.0, "step": 8252 }, { "epoch": 1.04986642920748, "ewc_loss": 0.026050081476569176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6050081942230463e-05, "grad_norm": 16.368976593017578, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8690568804740906, "num_tokens": 314758755.0, "step": 8253 }, { "epoch": 1.0499936394860705, "ewc_loss": 0.02606336586177349, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.606336602184456e-05, "grad_norm": 16.240671157836914, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8684341907501221, "num_tokens": 314793644.0, "step": 8254 }, { "epoch": 1.050120849764661, "ewc_loss": 0.026015950366854668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.601595042506233e-05, "grad_norm": 16.426908493041992, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8607337474822998, "num_tokens": 314828045.0, "step": 8255 }, { "epoch": 1.0502480600432516, "ewc_loss": 0.026102738454937935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.610273804748431e-05, "grad_norm": 16.310945510864258, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8662950992584229, "num_tokens": 314866088.0, "step": 8256 }, { "epoch": 1.050375270321842, "ewc_loss": 0.025981606915593147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.598160608613398e-05, "grad_norm": 16.321327209472656, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8556631803512573, "num_tokens": 314904757.0, "step": 8257 }, { "epoch": 1.0505024806004326, "ewc_loss": 0.026114659383893013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6114659704035148e-05, "grad_norm": 16.427413940429688, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8569808006286621, "num_tokens": 314941410.0, "step": 8258 }, { "epoch": 1.0506296908790231, "ewc_loss": 0.026034647598862648, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6034647817141376e-05, "grad_norm": 16.2324161529541, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8727525472640991, "num_tokens": 314982060.0, "step": 8259 }, { "epoch": 1.0507569011576134, "ewc_loss": 0.02603583037853241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.603583016025368e-05, "grad_norm": 16.376846313476562, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8549975156784058, "num_tokens": 315021812.0, "step": 8260 }, { "epoch": 1.050884111436204, "ewc_loss": 0.02611718885600567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6117188099306077e-05, "grad_norm": 16.407695770263672, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8604574203491211, "num_tokens": 315059520.0, "step": 8261 }, { "epoch": 1.0510113217147945, "ewc_loss": 0.026000408455729485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6000408979598433e-05, "grad_norm": 16.264354705810547, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8596518039703369, "num_tokens": 315098392.0, "step": 8262 }, { "epoch": 1.051138531993385, "ewc_loss": 0.02603589929640293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6035899281851016e-05, "grad_norm": 16.389060974121094, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8527471423149109, "num_tokens": 315137375.0, "step": 8263 }, { "epoch": 1.0512657422719756, "ewc_loss": 0.02608044259250164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6080442694365047e-05, "grad_norm": 16.31318473815918, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8689903020858765, "num_tokens": 315171308.0, "step": 8264 }, { "epoch": 1.051392952550566, "ewc_loss": 0.02600093185901642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.600093102955725e-05, "grad_norm": 16.33165168762207, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8644388318061829, "num_tokens": 315212176.0, "step": 8265 }, { "epoch": 1.0515201628291566, "ewc_loss": 0.02606154978275299, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6061548851430416e-05, "grad_norm": 16.37395477294922, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8527188301086426, "num_tokens": 315251321.0, "step": 8266 }, { "epoch": 1.0516473731077471, "ewc_loss": 0.02603200264275074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.603200300654862e-05, "grad_norm": 16.28655242919922, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8494621515274048, "num_tokens": 315288780.0, "step": 8267 }, { "epoch": 1.0517745833863377, "ewc_loss": 0.02601478062570095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.601478081487585e-05, "grad_norm": 16.350093841552734, "learning_rate": 1e-06, "loss": 0.3446, "mean_token_accuracy": 0.8861201405525208, "num_tokens": 315325630.0, "step": 8268 }, { "epoch": 1.0519017936649282, "ewc_loss": 0.02608959563076496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.608959584904369e-05, "grad_norm": 16.313766479492188, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.858197808265686, "num_tokens": 315362887.0, "step": 8269 }, { "epoch": 1.0520290039435187, "ewc_loss": 0.026046786457300186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6046785933431238e-05, "grad_norm": 16.377933502197266, "learning_rate": 1e-06, "loss": 0.3799, "mean_token_accuracy": 0.8768362998962402, "num_tokens": 315397552.0, "step": 8270 }, { "epoch": 1.0521562142221093, "ewc_loss": 0.026112481951713562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6112482373719104e-05, "grad_norm": 16.288345336914062, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8748446702957153, "num_tokens": 315436785.0, "step": 8271 }, { "epoch": 1.0522834245006996, "ewc_loss": 0.026045169681310654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6045168851851486e-05, "grad_norm": 16.365232467651367, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8748266696929932, "num_tokens": 315467849.0, "step": 8272 }, { "epoch": 1.05241063477929, "ewc_loss": 0.026153625920414925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6153626095037907e-05, "grad_norm": 16.334678649902344, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8721156120300293, "num_tokens": 315501496.0, "step": 8273 }, { "epoch": 1.0525378450578806, "ewc_loss": 0.026047399267554283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6047398932860233e-05, "grad_norm": 16.288734436035156, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8736220598220825, "num_tokens": 315540878.0, "step": 8274 }, { "epoch": 1.0526650553364711, "ewc_loss": 0.02614489011466503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6144889488932677e-05, "grad_norm": 16.362001419067383, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8793440461158752, "num_tokens": 315578139.0, "step": 8275 }, { "epoch": 1.0527922656150617, "ewc_loss": 0.02609286643564701, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6092866391991265e-05, "grad_norm": 16.307880401611328, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8655291199684143, "num_tokens": 315615845.0, "step": 8276 }, { "epoch": 1.0529194758936522, "ewc_loss": 0.026115430518984795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.611543095554225e-05, "grad_norm": 16.327852249145508, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8572906255722046, "num_tokens": 315645407.0, "step": 8277 }, { "epoch": 1.0530466861722427, "ewc_loss": 0.0261126309633255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6112631530850194e-05, "grad_norm": 16.36739158630371, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8714920282363892, "num_tokens": 315687922.0, "step": 8278 }, { "epoch": 1.0531738964508333, "ewc_loss": 0.026136739179491997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6136738597415388e-05, "grad_norm": 16.307540893554688, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8658424615859985, "num_tokens": 315723730.0, "step": 8279 }, { "epoch": 1.0533011067294238, "ewc_loss": 0.02607964724302292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6079647795995697e-05, "grad_norm": 16.355907440185547, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8768200874328613, "num_tokens": 315765662.0, "step": 8280 }, { "epoch": 1.0534283170080143, "ewc_loss": 0.026136118918657303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.613611832202878e-05, "grad_norm": 16.274049758911133, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.839818000793457, "num_tokens": 315808668.0, "step": 8281 }, { "epoch": 1.0535555272866048, "ewc_loss": 0.026033323258161545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6033323592855595e-05, "grad_norm": 16.341100692749023, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8602220416069031, "num_tokens": 315844878.0, "step": 8282 }, { "epoch": 1.0536827375651954, "ewc_loss": 0.02616998739540577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6169987904722802e-05, "grad_norm": 16.33466339111328, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8783588409423828, "num_tokens": 315879472.0, "step": 8283 }, { "epoch": 1.0538099478437857, "ewc_loss": 0.02606913447380066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6069134037243202e-05, "grad_norm": 16.302993774414062, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.859288215637207, "num_tokens": 315915496.0, "step": 8284 }, { "epoch": 1.0539371581223762, "ewc_loss": 0.026077890768647194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6077890652231872e-05, "grad_norm": 16.239498138427734, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8671388626098633, "num_tokens": 315954317.0, "step": 8285 }, { "epoch": 1.0540643684009667, "ewc_loss": 0.02607392519712448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6073925255332142e-05, "grad_norm": 16.295026779174805, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8602681756019592, "num_tokens": 315995518.0, "step": 8286 }, { "epoch": 1.0541915786795573, "ewc_loss": 0.026080427691340446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.608042814244982e-05, "grad_norm": 16.19706153869629, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.8785531520843506, "num_tokens": 316040475.0, "step": 8287 }, { "epoch": 1.0543187889581478, "ewc_loss": 0.026055337861180305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6055337002617307e-05, "grad_norm": 16.304096221923828, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8635315895080566, "num_tokens": 316083003.0, "step": 8288 }, { "epoch": 1.0544459992367383, "ewc_loss": 0.026173748075962067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.617374775581993e-05, "grad_norm": 16.273109436035156, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8656882047653198, "num_tokens": 316124687.0, "step": 8289 }, { "epoch": 1.0545732095153288, "ewc_loss": 0.026026403531432152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6026404157164507e-05, "grad_norm": 16.296463012695312, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8525100946426392, "num_tokens": 316170052.0, "step": 8290 }, { "epoch": 1.0547004197939194, "ewc_loss": 0.026110736653208733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.61107361438917e-05, "grad_norm": 16.274736404418945, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8511970043182373, "num_tokens": 316211941.0, "step": 8291 }, { "epoch": 1.05482763007251, "ewc_loss": 0.026088349521160126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.608834984130226e-05, "grad_norm": 16.276369094848633, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8678438663482666, "num_tokens": 316252592.0, "step": 8292 }, { "epoch": 1.0549548403511004, "ewc_loss": 0.026069728657603264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6069728846778162e-05, "grad_norm": 16.288665771484375, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.873348593711853, "num_tokens": 316287250.0, "step": 8293 }, { "epoch": 1.055082050629691, "ewc_loss": 0.0260750912129879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6075091227539815e-05, "grad_norm": 16.27643394470215, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8681183457374573, "num_tokens": 316326308.0, "step": 8294 }, { "epoch": 1.0552092609082815, "ewc_loss": 0.026089204475283623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6089204766321927e-05, "grad_norm": 16.278005599975586, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8698360323905945, "num_tokens": 316364519.0, "step": 8295 }, { "epoch": 1.0553364711868718, "ewc_loss": 0.026092763990163803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6092764528584667e-05, "grad_norm": 16.276744842529297, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8697839975357056, "num_tokens": 316401165.0, "step": 8296 }, { "epoch": 1.0554636814654623, "ewc_loss": 0.02609640546143055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6096406145370565e-05, "grad_norm": 16.32390594482422, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8603086471557617, "num_tokens": 316440245.0, "step": 8297 }, { "epoch": 1.0555908917440529, "ewc_loss": 0.02600805088877678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.600805055408273e-05, "grad_norm": 16.209917068481445, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8726329803466797, "num_tokens": 316484969.0, "step": 8298 }, { "epoch": 1.0557181020226434, "ewc_loss": 0.026053814217448235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.605381450848654e-05, "grad_norm": 16.32303237915039, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.882931649684906, "num_tokens": 316520510.0, "step": 8299 }, { "epoch": 1.055845312301234, "ewc_loss": 0.026065900921821594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.60659016930731e-05, "grad_norm": 16.19371223449707, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8492778539657593, "num_tokens": 316561946.0, "step": 8300 }, { "epoch": 1.0559725225798244, "ewc_loss": 0.02606848068535328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.606848102004733e-05, "grad_norm": 16.269874572753906, "learning_rate": 1e-06, "loss": 0.37, "mean_token_accuracy": 0.8827135562896729, "num_tokens": 316605489.0, "step": 8301 }, { "epoch": 1.056099732858415, "ewc_loss": 0.026124930009245872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6124929718207568e-05, "grad_norm": 16.23796844482422, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8483723402023315, "num_tokens": 316648590.0, "step": 8302 }, { "epoch": 1.0562269431370055, "ewc_loss": 0.02612200379371643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6122002964257263e-05, "grad_norm": 16.287879943847656, "learning_rate": 1e-06, "loss": 0.3694, "mean_token_accuracy": 0.8773115873336792, "num_tokens": 316688169.0, "step": 8303 }, { "epoch": 1.056354153415596, "ewc_loss": 0.02612283080816269, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6122830604435876e-05, "grad_norm": 16.31377601623535, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8446694612503052, "num_tokens": 316724031.0, "step": 8304 }, { "epoch": 1.0564813636941865, "ewc_loss": 0.02614210918545723, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6142108254134655e-05, "grad_norm": 16.291841506958008, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8655040264129639, "num_tokens": 316769762.0, "step": 8305 }, { "epoch": 1.056608573972777, "ewc_loss": 0.02608846127986908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6088460799655877e-05, "grad_norm": 16.28985023498535, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8726313710212708, "num_tokens": 316814156.0, "step": 8306 }, { "epoch": 1.0567357842513676, "ewc_loss": 0.02607870101928711, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.607870010251645e-05, "grad_norm": 16.306808471679688, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8745938539505005, "num_tokens": 316852784.0, "step": 8307 }, { "epoch": 1.0568629945299581, "ewc_loss": 0.026127692312002182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6127692763111554e-05, "grad_norm": 16.263538360595703, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8580824732780457, "num_tokens": 316891992.0, "step": 8308 }, { "epoch": 1.0569902048085484, "ewc_loss": 0.026102373376488686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.61023742496036e-05, "grad_norm": 16.363447189331055, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.879959225654602, "num_tokens": 316932246.0, "step": 8309 }, { "epoch": 1.057117415087139, "ewc_loss": 0.02606675587594509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6066756618092768e-05, "grad_norm": 16.355876922607422, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8566601276397705, "num_tokens": 316972017.0, "step": 8310 }, { "epoch": 1.0572446253657295, "ewc_loss": 0.02603360265493393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6033601898234338e-05, "grad_norm": 16.33488655090332, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8603473901748657, "num_tokens": 317008631.0, "step": 8311 }, { "epoch": 1.05737183564432, "ewc_loss": 0.026062821969389915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6062822144012898e-05, "grad_norm": 16.32361602783203, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8705623149871826, "num_tokens": 317048616.0, "step": 8312 }, { "epoch": 1.0574990459229106, "ewc_loss": 0.025998570024967194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.5998569981311448e-05, "grad_norm": 16.350284576416016, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8582116365432739, "num_tokens": 317088675.0, "step": 8313 }, { "epoch": 1.057626256201501, "ewc_loss": 0.02600550837814808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6005507606896572e-05, "grad_norm": 16.331056594848633, "learning_rate": 1e-06, "loss": 0.3518, "mean_token_accuracy": 0.8857376575469971, "num_tokens": 317120351.0, "step": 8314 }, { "epoch": 1.0577534664800916, "ewc_loss": 0.026015007868409157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6015008188551292e-05, "grad_norm": 16.329044342041016, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8591198325157166, "num_tokens": 317159027.0, "step": 8315 }, { "epoch": 1.0578806767586821, "ewc_loss": 0.026008080691099167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.600808147690259e-05, "grad_norm": 16.350337982177734, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8658137917518616, "num_tokens": 317188892.0, "step": 8316 }, { "epoch": 1.0580078870372727, "ewc_loss": 0.02606077305972576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6060772142955102e-05, "grad_norm": 16.37139892578125, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8706479072570801, "num_tokens": 317229026.0, "step": 8317 }, { "epoch": 1.0581350973158632, "ewc_loss": 0.02602420188486576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6024201360996813e-05, "grad_norm": 16.31087875366211, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8637948632240295, "num_tokens": 317272946.0, "step": 8318 }, { "epoch": 1.0582623075944537, "ewc_loss": 0.026043828576803207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6043828256661072e-05, "grad_norm": 16.335031509399414, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.865704357624054, "num_tokens": 317314625.0, "step": 8319 }, { "epoch": 1.058389517873044, "ewc_loss": 0.02607477642595768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6074776542373e-05, "grad_norm": 16.33383560180664, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.8757892847061157, "num_tokens": 317351450.0, "step": 8320 }, { "epoch": 1.0585167281516346, "ewc_loss": 0.026037411764264107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6037412681034766e-05, "grad_norm": 16.407751083374023, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8499687910079956, "num_tokens": 317391519.0, "step": 8321 }, { "epoch": 1.058643938430225, "ewc_loss": 0.026060868054628372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.606086854939349e-05, "grad_norm": 16.367473602294922, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8581191301345825, "num_tokens": 317430672.0, "step": 8322 }, { "epoch": 1.0587711487088156, "ewc_loss": 0.026019522920250893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6019522920250893e-05, "grad_norm": 16.335323333740234, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.866858959197998, "num_tokens": 317470203.0, "step": 8323 }, { "epoch": 1.0588983589874061, "ewc_loss": 0.026047533378005028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6047533538076095e-05, "grad_norm": 16.359086990356445, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8678882718086243, "num_tokens": 317509594.0, "step": 8324 }, { "epoch": 1.0590255692659967, "ewc_loss": 0.02603876404464245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.60387641901616e-05, "grad_norm": 16.305950164794922, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.863865852355957, "num_tokens": 317549735.0, "step": 8325 }, { "epoch": 1.0591527795445872, "ewc_loss": 0.026040051132440567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.604005203465931e-05, "grad_norm": 16.345874786376953, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8630318641662598, "num_tokens": 317585064.0, "step": 8326 }, { "epoch": 1.0592799898231777, "ewc_loss": 0.02607695572078228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.607695569167845e-05, "grad_norm": 16.306400299072266, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.880398154258728, "num_tokens": 317622289.0, "step": 8327 }, { "epoch": 1.0594072001017683, "ewc_loss": 0.026068614795804024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6068615625263192e-05, "grad_norm": 16.296085357666016, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8535460233688354, "num_tokens": 317662872.0, "step": 8328 }, { "epoch": 1.0595344103803588, "ewc_loss": 0.02605949528515339, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6059495212393813e-05, "grad_norm": 16.350976943969727, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8661705255508423, "num_tokens": 317701418.0, "step": 8329 }, { "epoch": 1.0596616206589493, "ewc_loss": 0.02608601376414299, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6086014258908108e-05, "grad_norm": 16.281007766723633, "learning_rate": 1e-06, "loss": 0.3765, "mean_token_accuracy": 0.8784897923469543, "num_tokens": 317741416.0, "step": 8330 }, { "epoch": 1.0597888309375398, "ewc_loss": 0.026054739952087402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6054740374092944e-05, "grad_norm": 16.36868667602539, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8697928190231323, "num_tokens": 317778623.0, "step": 8331 }, { "epoch": 1.0599160412161304, "ewc_loss": 0.02613929472863674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.613929427752737e-05, "grad_norm": 16.291284561157227, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8667606711387634, "num_tokens": 317816577.0, "step": 8332 }, { "epoch": 1.0600432514947207, "ewc_loss": 0.02602255344390869, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.60225533565972e-05, "grad_norm": 16.34380340576172, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.863071084022522, "num_tokens": 317851532.0, "step": 8333 }, { "epoch": 1.0601704617733112, "ewc_loss": 0.026107899844646454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.610790033941157e-05, "grad_norm": 16.283527374267578, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8663206100463867, "num_tokens": 317886789.0, "step": 8334 }, { "epoch": 1.0602976720519017, "ewc_loss": 0.02608199417591095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.608199429232627e-05, "grad_norm": 16.3361759185791, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8679239749908447, "num_tokens": 317929511.0, "step": 8335 }, { "epoch": 1.0604248823304923, "ewc_loss": 0.026110010221600533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6110010367119685e-05, "grad_norm": 16.31707191467285, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8639553189277649, "num_tokens": 317963767.0, "step": 8336 }, { "epoch": 1.0605520926090828, "ewc_loss": 0.02612348087131977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6123479983652942e-05, "grad_norm": 16.3219051361084, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8824502229690552, "num_tokens": 318000458.0, "step": 8337 }, { "epoch": 1.0606793028876733, "ewc_loss": 0.026123568415641785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6123569114133716e-05, "grad_norm": 16.290771484375, "learning_rate": 1e-06, "loss": 0.3602, "mean_token_accuracy": 0.8830828666687012, "num_tokens": 318040394.0, "step": 8338 }, { "epoch": 1.0608065131662638, "ewc_loss": 0.026125462725758553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6125462682102807e-05, "grad_norm": 16.30839729309082, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8619063496589661, "num_tokens": 318080859.0, "step": 8339 }, { "epoch": 1.0609337234448544, "ewc_loss": 0.02618236094713211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.618236067064572e-05, "grad_norm": 16.338401794433594, "learning_rate": 1e-06, "loss": 0.3637, "mean_token_accuracy": 0.8822905421257019, "num_tokens": 318115378.0, "step": 8340 }, { "epoch": 1.061060933723445, "ewc_loss": 0.02619071863591671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6190718926955014e-05, "grad_norm": 16.302061080932617, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8576104640960693, "num_tokens": 318150351.0, "step": 8341 }, { "epoch": 1.0611881440020354, "ewc_loss": 0.02614477090537548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6144771254621446e-05, "grad_norm": 16.33925437927246, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8643012046813965, "num_tokens": 318192427.0, "step": 8342 }, { "epoch": 1.061315354280626, "ewc_loss": 0.02617325261235237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6173252990702167e-05, "grad_norm": 16.29599952697754, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8524726629257202, "num_tokens": 318232150.0, "step": 8343 }, { "epoch": 1.0614425645592165, "ewc_loss": 0.026160966604948044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6160965717281215e-05, "grad_norm": 16.346378326416016, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8617730140686035, "num_tokens": 318267742.0, "step": 8344 }, { "epoch": 1.0615697748378068, "ewc_loss": 0.02616885118186474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6168851036345586e-05, "grad_norm": 16.304588317871094, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8704169392585754, "num_tokens": 318307974.0, "step": 8345 }, { "epoch": 1.0616969851163973, "ewc_loss": 0.026152973994612694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6152974896831438e-05, "grad_norm": 16.35485076904297, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8464530110359192, "num_tokens": 318349880.0, "step": 8346 }, { "epoch": 1.0618241953949878, "ewc_loss": 0.0262192040681839, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6219204301014543e-05, "grad_norm": 16.343578338623047, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8635439276695251, "num_tokens": 318383869.0, "step": 8347 }, { "epoch": 1.0619514056735784, "ewc_loss": 0.026165341958403587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6165342205786146e-05, "grad_norm": 16.308862686157227, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.8810204863548279, "num_tokens": 318419659.0, "step": 8348 }, { "epoch": 1.062078615952169, "ewc_loss": 0.026172392070293427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.617239260871429e-05, "grad_norm": 16.280473709106445, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8747692108154297, "num_tokens": 318460795.0, "step": 8349 }, { "epoch": 1.0622058262307594, "ewc_loss": 0.026186762377619743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6186762625002302e-05, "grad_norm": 16.316431045532227, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8562965393066406, "num_tokens": 318499451.0, "step": 8350 }, { "epoch": 1.06233303650935, "ewc_loss": 0.026200082153081894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.620008308440447e-05, "grad_norm": 16.2983455657959, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8697203397750854, "num_tokens": 318533413.0, "step": 8351 }, { "epoch": 1.0624602467879405, "ewc_loss": 0.0262070894241333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6207089831586927e-05, "grad_norm": 16.388931274414062, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8546786308288574, "num_tokens": 318572533.0, "step": 8352 }, { "epoch": 1.062587457066531, "ewc_loss": 0.026202768087387085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6202767912764102e-05, "grad_norm": 16.281648635864258, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8677862882614136, "num_tokens": 318609442.0, "step": 8353 }, { "epoch": 1.0627146673451215, "ewc_loss": 0.026137597858905792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.613759716041386e-05, "grad_norm": 16.39551544189453, "learning_rate": 1e-06, "loss": 0.3664, "mean_token_accuracy": 0.8804380297660828, "num_tokens": 318643475.0, "step": 8354 }, { "epoch": 1.062841877623712, "ewc_loss": 0.026245078071951866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.624507760629058e-05, "grad_norm": 16.376258850097656, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8699934482574463, "num_tokens": 318684223.0, "step": 8355 }, { "epoch": 1.0629690879023026, "ewc_loss": 0.02614104188978672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6141042326344177e-05, "grad_norm": 16.33455467224121, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.866151750087738, "num_tokens": 318717148.0, "step": 8356 }, { "epoch": 1.0630962981808931, "ewc_loss": 0.026239152997732162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.623915315780323e-05, "grad_norm": 16.425785064697266, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8767104148864746, "num_tokens": 318758929.0, "step": 8357 }, { "epoch": 1.0632235084594834, "ewc_loss": 0.026140952482819557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6140953195863403e-05, "grad_norm": 16.35328483581543, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8635101914405823, "num_tokens": 318796438.0, "step": 8358 }, { "epoch": 1.063350718738074, "ewc_loss": 0.02614331990480423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6143319701077417e-05, "grad_norm": 16.426610946655273, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8530207872390747, "num_tokens": 318835732.0, "step": 8359 }, { "epoch": 1.0634779290166645, "ewc_loss": 0.026148628443479538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6148627512156963e-05, "grad_norm": 16.271663665771484, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8636326789855957, "num_tokens": 318872151.0, "step": 8360 }, { "epoch": 1.063605139295255, "ewc_loss": 0.026109565049409866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6109564714715816e-05, "grad_norm": 16.359838485717773, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.854813814163208, "num_tokens": 318911956.0, "step": 8361 }, { "epoch": 1.0637323495738455, "ewc_loss": 0.026219425722956657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6219426217721775e-05, "grad_norm": 16.351524353027344, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8666661977767944, "num_tokens": 318945593.0, "step": 8362 }, { "epoch": 1.063859559852436, "ewc_loss": 0.026137975975871086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.61379755102098e-05, "grad_norm": 16.41135025024414, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.875315248966217, "num_tokens": 318976670.0, "step": 8363 }, { "epoch": 1.0639867701310266, "ewc_loss": 0.026197414845228195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6197414626949467e-05, "grad_norm": 16.307281494140625, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8675216436386108, "num_tokens": 319007798.0, "step": 8364 }, { "epoch": 1.0641139804096171, "ewc_loss": 0.02612566575407982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6125666408916004e-05, "grad_norm": 16.336952209472656, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8545864820480347, "num_tokens": 319044808.0, "step": 8365 }, { "epoch": 1.0642411906882077, "ewc_loss": 0.026220163330435753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.622016290843021e-05, "grad_norm": 16.306350708007812, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.8853768706321716, "num_tokens": 319083387.0, "step": 8366 }, { "epoch": 1.0643684009667982, "ewc_loss": 0.026155948638916016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6155948944506235e-05, "grad_norm": 16.331159591674805, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.880833625793457, "num_tokens": 319121897.0, "step": 8367 }, { "epoch": 1.0644956112453887, "ewc_loss": 0.026230910792946815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6230911316815764e-05, "grad_norm": 16.294160842895508, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8536319732666016, "num_tokens": 319157190.0, "step": 8368 }, { "epoch": 1.064622821523979, "ewc_loss": 0.02616436593234539, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6164365408476442e-05, "grad_norm": 16.246957778930664, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8753281235694885, "num_tokens": 319188353.0, "step": 8369 }, { "epoch": 1.0647500318025696, "ewc_loss": 0.02624676004052162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.624676017148886e-05, "grad_norm": 16.421321868896484, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8672630786895752, "num_tokens": 319225536.0, "step": 8370 }, { "epoch": 1.06487724208116, "ewc_loss": 0.02623785473406315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6237854399369098e-05, "grad_norm": 16.313566207885742, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8730076551437378, "num_tokens": 319265114.0, "step": 8371 }, { "epoch": 1.0650044523597506, "ewc_loss": 0.02620428428053856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.620428494992666e-05, "grad_norm": 16.385251998901367, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8764603137969971, "num_tokens": 319309280.0, "step": 8372 }, { "epoch": 1.0651316626383411, "ewc_loss": 0.026270179077982903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.627017966005951e-05, "grad_norm": 16.355031967163086, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.8769255876541138, "num_tokens": 319346114.0, "step": 8373 }, { "epoch": 1.0652588729169317, "ewc_loss": 0.026187917217612267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6187917683273554e-05, "grad_norm": 16.345685958862305, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8735509514808655, "num_tokens": 319389001.0, "step": 8374 }, { "epoch": 1.0653860831955222, "ewc_loss": 0.02619972638785839, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6199726562481374e-05, "grad_norm": 16.326519012451172, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8736672401428223, "num_tokens": 319431464.0, "step": 8375 }, { "epoch": 1.0655132934741127, "ewc_loss": 0.026157747954130173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6157747925026342e-05, "grad_norm": 16.323131561279297, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8422552347183228, "num_tokens": 319475638.0, "step": 8376 }, { "epoch": 1.0656405037527032, "ewc_loss": 0.026187526062130928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.618752660055179e-05, "grad_norm": 16.36351776123047, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8757139444351196, "num_tokens": 319515983.0, "step": 8377 }, { "epoch": 1.0657677140312938, "ewc_loss": 0.026170136407017708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6170137061853893e-05, "grad_norm": 16.410972595214844, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8625239729881287, "num_tokens": 319552152.0, "step": 8378 }, { "epoch": 1.0658949243098843, "ewc_loss": 0.026167893782258034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.616789424791932e-05, "grad_norm": 16.38311767578125, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8580296039581299, "num_tokens": 319592963.0, "step": 8379 }, { "epoch": 1.0660221345884748, "ewc_loss": 0.026079969480633736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6079969757120125e-05, "grad_norm": 16.325031280517578, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.852206826210022, "num_tokens": 319635136.0, "step": 8380 }, { "epoch": 1.0661493448670654, "ewc_loss": 0.026114365085959435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6114365027751774e-05, "grad_norm": 16.377704620361328, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8686494827270508, "num_tokens": 319670118.0, "step": 8381 }, { "epoch": 1.0662765551456557, "ewc_loss": 0.026107560843229294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.610756018839311e-05, "grad_norm": 16.352218627929688, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8724840879440308, "num_tokens": 319711057.0, "step": 8382 }, { "epoch": 1.0664037654242462, "ewc_loss": 0.026091426610946655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6091425752383657e-05, "grad_norm": 16.394495010375977, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8647341728210449, "num_tokens": 319746451.0, "step": 8383 }, { "epoch": 1.0665309757028367, "ewc_loss": 0.026089241728186607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6089241146109998e-05, "grad_norm": 16.282257080078125, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8594239950180054, "num_tokens": 319792149.0, "step": 8384 }, { "epoch": 1.0666581859814273, "ewc_loss": 0.026066044345498085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6066043574246578e-05, "grad_norm": 16.3912410736084, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8573924899101257, "num_tokens": 319826508.0, "step": 8385 }, { "epoch": 1.0667853962600178, "ewc_loss": 0.02615009807050228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.615009725559503e-05, "grad_norm": 16.362110137939453, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8679928779602051, "num_tokens": 319867000.0, "step": 8386 }, { "epoch": 1.0669126065386083, "ewc_loss": 0.02603057026863098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6030569642898627e-05, "grad_norm": 16.291423797607422, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.847591757774353, "num_tokens": 319904476.0, "step": 8387 }, { "epoch": 1.0670398168171988, "ewc_loss": 0.026124650612473488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6124651412828825e-05, "grad_norm": 16.352642059326172, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8420560359954834, "num_tokens": 319943279.0, "step": 8388 }, { "epoch": 1.0671670270957894, "ewc_loss": 0.02617293782532215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6172938305535354e-05, "grad_norm": 16.30340003967285, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8625383973121643, "num_tokens": 319986610.0, "step": 8389 }, { "epoch": 1.06729423737438, "ewc_loss": 0.02614814229309559, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6148141841986217e-05, "grad_norm": 16.401607513427734, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8590766191482544, "num_tokens": 320022702.0, "step": 8390 }, { "epoch": 1.0674214476529704, "ewc_loss": 0.026221375912427902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6221376174362376e-05, "grad_norm": 16.298267364501953, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.868032693862915, "num_tokens": 320067918.0, "step": 8391 }, { "epoch": 1.067548657931561, "ewc_loss": 0.02610701322555542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.610701267258264e-05, "grad_norm": 16.325199127197266, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8571457862854004, "num_tokens": 320110455.0, "step": 8392 }, { "epoch": 1.0676758682101515, "ewc_loss": 0.026181144639849663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.618114376673475e-05, "grad_norm": 16.329410552978516, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8827261328697205, "num_tokens": 320147675.0, "step": 8393 }, { "epoch": 1.0678030784887418, "ewc_loss": 0.026181546971201897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6181547582382336e-05, "grad_norm": 16.33100700378418, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8521146774291992, "num_tokens": 320183882.0, "step": 8394 }, { "epoch": 1.0679302887673323, "ewc_loss": 0.026192601770162582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6192601580987684e-05, "grad_norm": 16.332612991333008, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.854241669178009, "num_tokens": 320222215.0, "step": 8395 }, { "epoch": 1.0680574990459228, "ewc_loss": 0.026234520599246025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.62345201917924e-05, "grad_norm": 16.40646743774414, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8579904437065125, "num_tokens": 320262418.0, "step": 8396 }, { "epoch": 1.0681847093245134, "ewc_loss": 0.026189593598246574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.618959297251422e-05, "grad_norm": 16.303377151489258, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8591777086257935, "num_tokens": 320297304.0, "step": 8397 }, { "epoch": 1.068311919603104, "ewc_loss": 0.026176368817687035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.617636891955044e-05, "grad_norm": 16.312023162841797, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8680166602134705, "num_tokens": 320330919.0, "step": 8398 }, { "epoch": 1.0684391298816944, "ewc_loss": 0.02627473510801792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.627473440952599e-05, "grad_norm": 16.368684768676758, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8658413290977478, "num_tokens": 320367602.0, "step": 8399 }, { "epoch": 1.068566340160285, "ewc_loss": 0.02624155394732952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.624155422381591e-05, "grad_norm": 16.346725463867188, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8660562038421631, "num_tokens": 320403875.0, "step": 8400 }, { "epoch": 1.0686935504388755, "ewc_loss": 0.026240568608045578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.624056833155919e-05, "grad_norm": 16.33945083618164, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8688822984695435, "num_tokens": 320439149.0, "step": 8401 }, { "epoch": 1.068820760717466, "ewc_loss": 0.02620222233235836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.620222221594304e-05, "grad_norm": 16.3289852142334, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8769067525863647, "num_tokens": 320476786.0, "step": 8402 }, { "epoch": 1.0689479709960565, "ewc_loss": 0.026246190071105957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.624619082780555e-05, "grad_norm": 16.29485321044922, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8498256206512451, "num_tokens": 320516945.0, "step": 8403 }, { "epoch": 1.069075181274647, "ewc_loss": 0.026258492842316628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.625849265314173e-05, "grad_norm": 16.343387603759766, "learning_rate": 1e-06, "loss": 0.3653, "mean_token_accuracy": 0.8802151679992676, "num_tokens": 320552801.0, "step": 8404 }, { "epoch": 1.0692023915532376, "ewc_loss": 0.026276517659425735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6276517019141465e-05, "grad_norm": 16.30898094177246, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8686560392379761, "num_tokens": 320593605.0, "step": 8405 }, { "epoch": 1.0693296018318281, "ewc_loss": 0.02623406983911991, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.623406908242032e-05, "grad_norm": 16.33401870727539, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8726606369018555, "num_tokens": 320624476.0, "step": 8406 }, { "epoch": 1.0694568121104184, "ewc_loss": 0.0262416023761034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6241601517540403e-05, "grad_norm": 16.30974578857422, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8727163076400757, "num_tokens": 320667000.0, "step": 8407 }, { "epoch": 1.069584022389009, "ewc_loss": 0.026219448074698448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6219448045594618e-05, "grad_norm": 16.328916549682617, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8702468872070312, "num_tokens": 320699286.0, "step": 8408 }, { "epoch": 1.0697112326675995, "ewc_loss": 0.026268547400832176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.626854802656453e-05, "grad_norm": 16.36009407043457, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8717851638793945, "num_tokens": 320732520.0, "step": 8409 }, { "epoch": 1.06983844294619, "ewc_loss": 0.026272209361195564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.627220965223387e-05, "grad_norm": 16.32770347595215, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8543383479118347, "num_tokens": 320773425.0, "step": 8410 }, { "epoch": 1.0699656532247805, "ewc_loss": 0.02619590237736702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.619590304675512e-05, "grad_norm": 16.251007080078125, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8442407250404358, "num_tokens": 320810859.0, "step": 8411 }, { "epoch": 1.070092863503371, "ewc_loss": 0.026313163340091705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6313164198654704e-05, "grad_norm": 16.386232376098633, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8689842224121094, "num_tokens": 320850266.0, "step": 8412 }, { "epoch": 1.0702200737819616, "ewc_loss": 0.026282262057065964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6282261387677863e-05, "grad_norm": 16.247900009155273, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8645183444023132, "num_tokens": 320892812.0, "step": 8413 }, { "epoch": 1.0703472840605521, "ewc_loss": 0.026276880875229836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6276880817022175e-05, "grad_norm": 16.394975662231445, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8777244091033936, "num_tokens": 320929563.0, "step": 8414 }, { "epoch": 1.0704744943391427, "ewc_loss": 0.026306262239813805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.630626295285765e-05, "grad_norm": 16.326730728149414, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8735084533691406, "num_tokens": 320964010.0, "step": 8415 }, { "epoch": 1.0706017046177332, "ewc_loss": 0.026245810091495514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6245810659020208e-05, "grad_norm": 16.35079574584961, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8626123666763306, "num_tokens": 321006405.0, "step": 8416 }, { "epoch": 1.0707289148963237, "ewc_loss": 0.026300542056560516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.63005422311835e-05, "grad_norm": 16.327116012573242, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8604503870010376, "num_tokens": 321050146.0, "step": 8417 }, { "epoch": 1.070856125174914, "ewc_loss": 0.026257002726197243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6257002900820225e-05, "grad_norm": 16.43486213684082, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8656070232391357, "num_tokens": 321086857.0, "step": 8418 }, { "epoch": 1.0709833354535045, "ewc_loss": 0.02631387487053871, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.631387542351149e-05, "grad_norm": 16.380802154541016, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8709406852722168, "num_tokens": 321127310.0, "step": 8419 }, { "epoch": 1.071110545732095, "ewc_loss": 0.026170505210757256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.617050449771341e-05, "grad_norm": 16.395092010498047, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8558690547943115, "num_tokens": 321165881.0, "step": 8420 }, { "epoch": 1.0712377560106856, "ewc_loss": 0.026277339085936546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6277339202351868e-05, "grad_norm": 16.435138702392578, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8594574928283691, "num_tokens": 321204494.0, "step": 8421 }, { "epoch": 1.0713649662892761, "ewc_loss": 0.026175139471888542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6175139282713644e-05, "grad_norm": 16.292396545410156, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.855124831199646, "num_tokens": 321243639.0, "step": 8422 }, { "epoch": 1.0714921765678667, "ewc_loss": 0.026218444108963013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.621844396344386e-05, "grad_norm": 16.407873153686523, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8583090305328369, "num_tokens": 321281399.0, "step": 8423 }, { "epoch": 1.0716193868464572, "ewc_loss": 0.026258256286382675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.625825618451927e-05, "grad_norm": 16.368152618408203, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8515740036964417, "num_tokens": 321316539.0, "step": 8424 }, { "epoch": 1.0717465971250477, "ewc_loss": 0.02617698349058628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.617698373796884e-05, "grad_norm": 16.35655975341797, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8606219291687012, "num_tokens": 321348618.0, "step": 8425 }, { "epoch": 1.0718738074036382, "ewc_loss": 0.02624456398189068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.624456465127878e-05, "grad_norm": 16.41463279724121, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8711587190628052, "num_tokens": 321381674.0, "step": 8426 }, { "epoch": 1.0720010176822288, "ewc_loss": 0.026248332113027573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6248331778333522e-05, "grad_norm": 16.381879806518555, "learning_rate": 1e-06, "loss": 0.3647, "mean_token_accuracy": 0.8824368715286255, "num_tokens": 321420342.0, "step": 8427 }, { "epoch": 1.0721282279608193, "ewc_loss": 0.0262234415858984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.62234407273354e-05, "grad_norm": 16.396818161010742, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.862168550491333, "num_tokens": 321450770.0, "step": 8428 }, { "epoch": 1.0722554382394098, "ewc_loss": 0.02621360495686531, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.621360545163043e-05, "grad_norm": 16.40803337097168, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8684421181678772, "num_tokens": 321491450.0, "step": 8429 }, { "epoch": 1.0723826485180004, "ewc_loss": 0.026225002482533455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6225003239233047e-05, "grad_norm": 16.37999153137207, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8598625659942627, "num_tokens": 321531774.0, "step": 8430 }, { "epoch": 1.0725098587965907, "ewc_loss": 0.026237908750772476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6237908969051205e-05, "grad_norm": 16.3864688873291, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8656038045883179, "num_tokens": 321567692.0, "step": 8431 }, { "epoch": 1.0726370690751812, "ewc_loss": 0.026222163811326027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.622216379677411e-05, "grad_norm": 16.362407684326172, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8606464266777039, "num_tokens": 321601168.0, "step": 8432 }, { "epoch": 1.0727642793537717, "ewc_loss": 0.026221953332424164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.62219527940033e-05, "grad_norm": 16.375572204589844, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8479843139648438, "num_tokens": 321636100.0, "step": 8433 }, { "epoch": 1.0728914896323622, "ewc_loss": 0.02628571353852749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6285713829565793e-05, "grad_norm": 16.464759826660156, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8665066957473755, "num_tokens": 321674365.0, "step": 8434 }, { "epoch": 1.0730186999109528, "ewc_loss": 0.026191018521785736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.61910190602066e-05, "grad_norm": 16.204126358032227, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8760012984275818, "num_tokens": 321712267.0, "step": 8435 }, { "epoch": 1.0731459101895433, "ewc_loss": 0.026270927861332893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.627092726470437e-05, "grad_norm": 16.434476852416992, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8632607460021973, "num_tokens": 321749955.0, "step": 8436 }, { "epoch": 1.0732731204681338, "ewc_loss": 0.02631278522312641, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6312785848858766e-05, "grad_norm": 16.31089973449707, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8733786344528198, "num_tokens": 321788522.0, "step": 8437 }, { "epoch": 1.0734003307467244, "ewc_loss": 0.02619919553399086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6199195417575538e-05, "grad_norm": 16.34912872314453, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8633309602737427, "num_tokens": 321826314.0, "step": 8438 }, { "epoch": 1.073527541025315, "ewc_loss": 0.026310155168175697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.631015559018124e-05, "grad_norm": 16.369319915771484, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8619101047515869, "num_tokens": 321864851.0, "step": 8439 }, { "epoch": 1.0736547513039054, "ewc_loss": 0.026253599673509598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.625359957164619e-05, "grad_norm": 16.402921676635742, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8747958540916443, "num_tokens": 321901001.0, "step": 8440 }, { "epoch": 1.073781961582496, "ewc_loss": 0.026327041909098625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6327041268814355e-05, "grad_norm": 16.478742599487305, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8644107580184937, "num_tokens": 321941368.0, "step": 8441 }, { "epoch": 1.0739091718610865, "ewc_loss": 0.026229921728372574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6229921786580235e-05, "grad_norm": 16.375747680664062, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8688281774520874, "num_tokens": 321978268.0, "step": 8442 }, { "epoch": 1.0740363821396768, "ewc_loss": 0.026226164773106575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6226165573461913e-05, "grad_norm": 16.400434494018555, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8725454807281494, "num_tokens": 322016969.0, "step": 8443 }, { "epoch": 1.0741635924182673, "ewc_loss": 0.026249542832374573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6249543225276284e-05, "grad_norm": 16.339441299438477, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8586136698722839, "num_tokens": 322059362.0, "step": 8444 }, { "epoch": 1.0742908026968578, "ewc_loss": 0.02620501071214676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6205010726698674e-05, "grad_norm": 16.370473861694336, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8460390567779541, "num_tokens": 322097334.0, "step": 8445 }, { "epoch": 1.0744180129754484, "ewc_loss": 0.026209602132439613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6209601855953224e-05, "grad_norm": 16.351900100708008, "learning_rate": 1e-06, "loss": 0.3437, "mean_token_accuracy": 0.8846927881240845, "num_tokens": 322121988.0, "step": 8446 }, { "epoch": 1.074545223254039, "ewc_loss": 0.026243045926094055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6243045795126818e-05, "grad_norm": 16.437631607055664, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.861163854598999, "num_tokens": 322156602.0, "step": 8447 }, { "epoch": 1.0746724335326294, "ewc_loss": 0.026263222098350525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.626322202559095e-05, "grad_norm": 16.34049415588379, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8626383543014526, "num_tokens": 322194475.0, "step": 8448 }, { "epoch": 1.07479964381122, "ewc_loss": 0.02619665488600731, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6196654289378785e-05, "grad_norm": 16.367774963378906, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8725929856300354, "num_tokens": 322232158.0, "step": 8449 }, { "epoch": 1.0749268540898105, "ewc_loss": 0.026263711974024773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6263711333740503e-05, "grad_norm": 16.341796875, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8733562231063843, "num_tokens": 322272170.0, "step": 8450 }, { "epoch": 1.075054064368401, "ewc_loss": 0.02623474970459938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6234749384457245e-05, "grad_norm": 16.453895568847656, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8698933124542236, "num_tokens": 322305496.0, "step": 8451 }, { "epoch": 1.0751812746469915, "ewc_loss": 0.026315322145819664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.631532152008731e-05, "grad_norm": 16.325170516967773, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8700466156005859, "num_tokens": 322347552.0, "step": 8452 }, { "epoch": 1.075308484925582, "ewc_loss": 0.026211483404040337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.621148269099649e-05, "grad_norm": 16.316936492919922, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8675665855407715, "num_tokens": 322386471.0, "step": 8453 }, { "epoch": 1.0754356952041726, "ewc_loss": 0.026314014568924904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6314015485695563e-05, "grad_norm": 16.38829231262207, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8306384086608887, "num_tokens": 322431019.0, "step": 8454 }, { "epoch": 1.0755629054827631, "ewc_loss": 0.02628941275179386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6289411835023202e-05, "grad_norm": 16.354211807250977, "learning_rate": 1e-06, "loss": 0.3577, "mean_token_accuracy": 0.8841956853866577, "num_tokens": 322471525.0, "step": 8455 }, { "epoch": 1.0756901157613534, "ewc_loss": 0.0262618288397789, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6261828679707833e-05, "grad_norm": 16.322973251342773, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8676037788391113, "num_tokens": 322516594.0, "step": 8456 }, { "epoch": 1.075817326039944, "ewc_loss": 0.02625511959195137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6255120246787556e-05, "grad_norm": 16.35905647277832, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8711890578269958, "num_tokens": 322557837.0, "step": 8457 }, { "epoch": 1.0759445363185345, "ewc_loss": 0.026284923776984215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6284924388164654e-05, "grad_norm": 16.38019561767578, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8610179424285889, "num_tokens": 322594165.0, "step": 8458 }, { "epoch": 1.076071746597125, "ewc_loss": 0.026253370568156242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6253370378981344e-05, "grad_norm": 16.371566772460938, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8486422300338745, "num_tokens": 322637083.0, "step": 8459 }, { "epoch": 1.0761989568757155, "ewc_loss": 0.026259925216436386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6259926016791724e-05, "grad_norm": 16.43221092224121, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8641579151153564, "num_tokens": 322673216.0, "step": 8460 }, { "epoch": 1.076326167154306, "ewc_loss": 0.02620125748217106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.620125815155916e-05, "grad_norm": 16.372652053833008, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.867737889289856, "num_tokens": 322709150.0, "step": 8461 }, { "epoch": 1.0764533774328966, "ewc_loss": 0.0261854100972414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6185409296886064e-05, "grad_norm": 16.37433433532715, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8712875247001648, "num_tokens": 322747893.0, "step": 8462 }, { "epoch": 1.0765805877114871, "ewc_loss": 0.026188580319285393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6188579795416445e-05, "grad_norm": 16.40033531188965, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8478531837463379, "num_tokens": 322781667.0, "step": 8463 }, { "epoch": 1.0767077979900777, "ewc_loss": 0.02620711550116539, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6207115297438577e-05, "grad_norm": 16.377843856811523, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8647446632385254, "num_tokens": 322818008.0, "step": 8464 }, { "epoch": 1.0768350082686682, "ewc_loss": 0.02621544897556305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.621544808789622e-05, "grad_norm": 16.351633071899414, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8456452488899231, "num_tokens": 322856528.0, "step": 8465 }, { "epoch": 1.0769622185472587, "ewc_loss": 0.02622513845562935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.622513784444891e-05, "grad_norm": 16.309675216674805, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8691027164459229, "num_tokens": 322888798.0, "step": 8466 }, { "epoch": 1.077089428825849, "ewc_loss": 0.026238130405545235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6238130885758437e-05, "grad_norm": 16.402366638183594, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8675455451011658, "num_tokens": 322923301.0, "step": 8467 }, { "epoch": 1.0772166391044395, "ewc_loss": 0.02625434100627899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6254341719322838e-05, "grad_norm": 16.387540817260742, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8777192831039429, "num_tokens": 322964253.0, "step": 8468 }, { "epoch": 1.07734384938303, "ewc_loss": 0.02624824084341526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6248240828863345e-05, "grad_norm": 16.399351119995117, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8633578419685364, "num_tokens": 323006926.0, "step": 8469 }, { "epoch": 1.0774710596616206, "ewc_loss": 0.0262292567640543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6229256036458537e-05, "grad_norm": 16.43553924560547, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.8785907030105591, "num_tokens": 323047541.0, "step": 8470 }, { "epoch": 1.0775982699402111, "ewc_loss": 0.02621486783027649, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.621486783027649e-05, "grad_norm": 16.3872127532959, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8580543994903564, "num_tokens": 323084272.0, "step": 8471 }, { "epoch": 1.0777254802188017, "ewc_loss": 0.026222942396998405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.622294232423883e-05, "grad_norm": 16.448631286621094, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8767387866973877, "num_tokens": 323116698.0, "step": 8472 }, { "epoch": 1.0778526904973922, "ewc_loss": 0.026259997859597206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6259998776367866e-05, "grad_norm": 16.32243537902832, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8730719089508057, "num_tokens": 323158836.0, "step": 8473 }, { "epoch": 1.0779799007759827, "ewc_loss": 0.026223162189126015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.622316242195666e-05, "grad_norm": 16.42566680908203, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8612279295921326, "num_tokens": 323193777.0, "step": 8474 }, { "epoch": 1.0781071110545732, "ewc_loss": 0.026272177696228027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6272176910424605e-05, "grad_norm": 16.36890411376953, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8667358160018921, "num_tokens": 323232546.0, "step": 8475 }, { "epoch": 1.0782343213331638, "ewc_loss": 0.026203671470284462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6203671950497665e-05, "grad_norm": 16.387914657592773, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8658351302146912, "num_tokens": 323269403.0, "step": 8476 }, { "epoch": 1.0783615316117543, "ewc_loss": 0.02623734064400196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.62373414443573e-05, "grad_norm": 16.306901931762695, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8785492181777954, "num_tokens": 323305982.0, "step": 8477 }, { "epoch": 1.0784887418903448, "ewc_loss": 0.02625531144440174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6255311240674928e-05, "grad_norm": 16.514162063598633, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8617172241210938, "num_tokens": 323341032.0, "step": 8478 }, { "epoch": 1.0786159521689354, "ewc_loss": 0.026275938376784325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6275938580511138e-05, "grad_norm": 16.349149703979492, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8596850037574768, "num_tokens": 323381647.0, "step": 8479 }, { "epoch": 1.0787431624475257, "ewc_loss": 0.026231084018945694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.62310841208091e-05, "grad_norm": 16.33379364013672, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8692518472671509, "num_tokens": 323417185.0, "step": 8480 }, { "epoch": 1.0788703727261162, "ewc_loss": 0.02628435380756855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.628435322549194e-05, "grad_norm": 16.34502601623535, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8649576902389526, "num_tokens": 323454624.0, "step": 8481 }, { "epoch": 1.0789975830047067, "ewc_loss": 0.0262766070663929, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6276607968611643e-05, "grad_norm": 16.3239803314209, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8654953241348267, "num_tokens": 323492862.0, "step": 8482 }, { "epoch": 1.0791247932832972, "ewc_loss": 0.026322202757000923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6322202757000923e-05, "grad_norm": 16.37881851196289, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8620942831039429, "num_tokens": 323536012.0, "step": 8483 }, { "epoch": 1.0792520035618878, "ewc_loss": 0.02632293477654457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.632293399074115e-05, "grad_norm": 16.381534576416016, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8697817325592041, "num_tokens": 323573495.0, "step": 8484 }, { "epoch": 1.0793792138404783, "ewc_loss": 0.026313917711377144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6313917260267772e-05, "grad_norm": 16.3554630279541, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8755648136138916, "num_tokens": 323610900.0, "step": 8485 }, { "epoch": 1.0795064241190688, "ewc_loss": 0.026292193681001663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6292193069821224e-05, "grad_norm": 16.36920166015625, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8655034303665161, "num_tokens": 323650983.0, "step": 8486 }, { "epoch": 1.0796336343976594, "ewc_loss": 0.0263436920940876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6343692297814414e-05, "grad_norm": 16.457399368286133, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8576879501342773, "num_tokens": 323690155.0, "step": 8487 }, { "epoch": 1.0797608446762499, "ewc_loss": 0.02635275200009346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6352752684033476e-05, "grad_norm": 16.336326599121094, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.8788212537765503, "num_tokens": 323732212.0, "step": 8488 }, { "epoch": 1.0798880549548404, "ewc_loss": 0.026291826739907265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.629182745295111e-05, "grad_norm": 16.427501678466797, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8737430572509766, "num_tokens": 323773400.0, "step": 8489 }, { "epoch": 1.080015265233431, "ewc_loss": 0.026311881840229034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6311881811125204e-05, "grad_norm": 16.29359245300293, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8734821677207947, "num_tokens": 323812753.0, "step": 8490 }, { "epoch": 1.0801424755120215, "ewc_loss": 0.02627786435186863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6277864890289493e-05, "grad_norm": 16.469331741333008, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8681650161743164, "num_tokens": 323855835.0, "step": 8491 }, { "epoch": 1.0802696857906118, "ewc_loss": 0.026322368532419205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6322368285036646e-05, "grad_norm": 16.385454177856445, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.878572940826416, "num_tokens": 323896065.0, "step": 8492 }, { "epoch": 1.0803968960692023, "ewc_loss": 0.026227856054902077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6227855414617807e-05, "grad_norm": 16.368200302124023, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8689022660255432, "num_tokens": 323934351.0, "step": 8493 }, { "epoch": 1.0805241063477928, "ewc_loss": 0.026297926902770996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.62979265244212e-05, "grad_norm": 16.432706832885742, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8554924726486206, "num_tokens": 323968878.0, "step": 8494 }, { "epoch": 1.0806513166263834, "ewc_loss": 0.02626677416265011, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6266774511896074e-05, "grad_norm": 16.41145896911621, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.865245521068573, "num_tokens": 324004838.0, "step": 8495 }, { "epoch": 1.080778526904974, "ewc_loss": 0.026193996891379356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6193996745860204e-05, "grad_norm": 16.37112808227539, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.8814048767089844, "num_tokens": 324040851.0, "step": 8496 }, { "epoch": 1.0809057371835644, "ewc_loss": 0.026229768991470337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6229768991470337e-05, "grad_norm": 16.405792236328125, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8538461923599243, "num_tokens": 324084675.0, "step": 8497 }, { "epoch": 1.081032947462155, "ewc_loss": 0.026248687878251076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6248688300256617e-05, "grad_norm": 16.4328670501709, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8717441558837891, "num_tokens": 324122136.0, "step": 8498 }, { "epoch": 1.0811601577407455, "ewc_loss": 0.026213817298412323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.621381645440124e-05, "grad_norm": 16.410425186157227, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8645700812339783, "num_tokens": 324153142.0, "step": 8499 }, { "epoch": 1.081287368019336, "ewc_loss": 0.026207009330391884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.620700979605317e-05, "grad_norm": 16.374984741210938, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8462459444999695, "num_tokens": 324187244.0, "step": 8500 }, { "epoch": 1.0814145782979265, "ewc_loss": 0.026186956092715263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6186955437879078e-05, "grad_norm": 16.36106300354004, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8688327074050903, "num_tokens": 324225644.0, "step": 8501 }, { "epoch": 1.081541788576517, "ewc_loss": 0.026300374418497086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6300374884158373e-05, "grad_norm": 16.41537857055664, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8736560940742493, "num_tokens": 324270394.0, "step": 8502 }, { "epoch": 1.0816689988551076, "ewc_loss": 0.02621794492006302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.621794556034729e-05, "grad_norm": 16.380123138427734, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8765658140182495, "num_tokens": 324311244.0, "step": 8503 }, { "epoch": 1.0817962091336981, "ewc_loss": 0.02623003162443638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6230030925944448e-05, "grad_norm": 16.370431900024414, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8484442234039307, "num_tokens": 324350821.0, "step": 8504 }, { "epoch": 1.0819234194122884, "ewc_loss": 0.02624872326850891, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6248722861055285e-05, "grad_norm": 16.402584075927734, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8652640581130981, "num_tokens": 324388255.0, "step": 8505 }, { "epoch": 1.082050629690879, "ewc_loss": 0.026283441111445427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6283441911800765e-05, "grad_norm": 16.407567977905273, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8413525223731995, "num_tokens": 324430068.0, "step": 8506 }, { "epoch": 1.0821778399694695, "ewc_loss": 0.026257799938321114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.625779961817898e-05, "grad_norm": 16.377872467041016, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.8730584383010864, "num_tokens": 324465341.0, "step": 8507 }, { "epoch": 1.08230505024806, "ewc_loss": 0.02626759000122547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6267589419148862e-05, "grad_norm": 16.33638572692871, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8742720484733582, "num_tokens": 324502199.0, "step": 8508 }, { "epoch": 1.0824322605266505, "ewc_loss": 0.026262842118740082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6262841856805608e-05, "grad_norm": 16.437021255493164, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8677008152008057, "num_tokens": 324535898.0, "step": 8509 }, { "epoch": 1.082559470805241, "ewc_loss": 0.026265539228916168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6265539418091066e-05, "grad_norm": 16.32940673828125, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8557730317115784, "num_tokens": 324574404.0, "step": 8510 }, { "epoch": 1.0826866810838316, "ewc_loss": 0.026289645582437515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6289644665666856e-05, "grad_norm": 16.42683982849121, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8656907081604004, "num_tokens": 324611743.0, "step": 8511 }, { "epoch": 1.0828138913624221, "ewc_loss": 0.026353362947702408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6353362045483664e-05, "grad_norm": 16.423973083496094, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8568673133850098, "num_tokens": 324653126.0, "step": 8512 }, { "epoch": 1.0829411016410126, "ewc_loss": 0.026230668649077415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6230669391225092e-05, "grad_norm": 16.303131103515625, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8733964562416077, "num_tokens": 324692957.0, "step": 8513 }, { "epoch": 1.0830683119196032, "ewc_loss": 0.026274079456925392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6274079573340714e-05, "grad_norm": 16.453144073486328, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.861253023147583, "num_tokens": 324737214.0, "step": 8514 }, { "epoch": 1.0831955221981937, "ewc_loss": 0.026334160938858986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6334160793339834e-05, "grad_norm": 16.36337661743164, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8762549161911011, "num_tokens": 324773098.0, "step": 8515 }, { "epoch": 1.083322732476784, "ewc_loss": 0.02627694047987461, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.627694084367249e-05, "grad_norm": 16.36425018310547, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8559510707855225, "num_tokens": 324815846.0, "step": 8516 }, { "epoch": 1.0834499427553745, "ewc_loss": 0.02634083293378353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.634083284647204e-05, "grad_norm": 16.41109275817871, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8622040748596191, "num_tokens": 324855912.0, "step": 8517 }, { "epoch": 1.083577153033965, "ewc_loss": 0.026310449466109276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6310450266464613e-05, "grad_norm": 16.376283645629883, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8602651357650757, "num_tokens": 324893090.0, "step": 8518 }, { "epoch": 1.0837043633125556, "ewc_loss": 0.02628065086901188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6280651582055725e-05, "grad_norm": 16.381345748901367, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8656573295593262, "num_tokens": 324929779.0, "step": 8519 }, { "epoch": 1.0838315735911461, "ewc_loss": 0.026341382414102554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.634138218127191e-05, "grad_norm": 16.400081634521484, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8711615204811096, "num_tokens": 324961349.0, "step": 8520 }, { "epoch": 1.0839587838697367, "ewc_loss": 0.026303060352802277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6303059712518007e-05, "grad_norm": 16.42385482788086, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8653693199157715, "num_tokens": 324998688.0, "step": 8521 }, { "epoch": 1.0840859941483272, "ewc_loss": 0.02632146142423153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6321460609324276e-05, "grad_norm": 16.358095169067383, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8583765029907227, "num_tokens": 325040601.0, "step": 8522 }, { "epoch": 1.0842132044269177, "ewc_loss": 0.02626706473529339, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.626706555020064e-05, "grad_norm": 16.36203956604004, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8710483312606812, "num_tokens": 325081729.0, "step": 8523 }, { "epoch": 1.0843404147055082, "ewc_loss": 0.026331719011068344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6331719709560275e-05, "grad_norm": 16.396696090698242, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8551158905029297, "num_tokens": 325118886.0, "step": 8524 }, { "epoch": 1.0844676249840988, "ewc_loss": 0.026278307661414146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6278306904714555e-05, "grad_norm": 16.342439651489258, "learning_rate": 1e-06, "loss": 0.3558, "mean_token_accuracy": 0.8837524652481079, "num_tokens": 325156417.0, "step": 8525 }, { "epoch": 1.0845948352626893, "ewc_loss": 0.026328755542635918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6328754756832495e-05, "grad_norm": 16.452892303466797, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8786376118659973, "num_tokens": 325200902.0, "step": 8526 }, { "epoch": 1.0847220455412798, "ewc_loss": 0.026362044736742973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6362044081906788e-05, "grad_norm": 16.399110794067383, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8667030334472656, "num_tokens": 325233362.0, "step": 8527 }, { "epoch": 1.0848492558198704, "ewc_loss": 0.02630404196679592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.630404196679592e-05, "grad_norm": 16.39689826965332, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8648262619972229, "num_tokens": 325274646.0, "step": 8528 }, { "epoch": 1.0849764660984607, "ewc_loss": 0.026358097791671753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6358096874901094e-05, "grad_norm": 16.446672439575195, "learning_rate": 1e-06, "loss": 0.3648, "mean_token_accuracy": 0.8818472027778625, "num_tokens": 325304735.0, "step": 8529 }, { "epoch": 1.0851036763770512, "ewc_loss": 0.0263564083725214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6356408852734603e-05, "grad_norm": 16.443260192871094, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8697386980056763, "num_tokens": 325339833.0, "step": 8530 }, { "epoch": 1.0852308866556417, "ewc_loss": 0.026317648589611053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6317648007534444e-05, "grad_norm": 16.370336532592773, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8532710075378418, "num_tokens": 325373838.0, "step": 8531 }, { "epoch": 1.0853580969342322, "ewc_loss": 0.026371685788035393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6371686544734985e-05, "grad_norm": 16.46026039123535, "learning_rate": 1e-06, "loss": 0.3485, "mean_token_accuracy": 0.8868913650512695, "num_tokens": 325412913.0, "step": 8532 }, { "epoch": 1.0854853072128228, "ewc_loss": 0.026357606053352356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6357605747762136e-05, "grad_norm": 16.384828567504883, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8708721399307251, "num_tokens": 325451830.0, "step": 8533 }, { "epoch": 1.0856125174914133, "ewc_loss": 0.026378145441412926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6378145776106976e-05, "grad_norm": 16.479982376098633, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.84791100025177, "num_tokens": 325487162.0, "step": 8534 }, { "epoch": 1.0857397277700038, "ewc_loss": 0.02630278654396534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6302786864107475e-05, "grad_norm": 16.395601272583008, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8756738901138306, "num_tokens": 325525906.0, "step": 8535 }, { "epoch": 1.0858669380485944, "ewc_loss": 0.0262812077999115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.628120819281321e-05, "grad_norm": 16.454631805419922, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.8769111037254333, "num_tokens": 325559925.0, "step": 8536 }, { "epoch": 1.0859941483271849, "ewc_loss": 0.026348544284701347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.634854354255367e-05, "grad_norm": 16.40980339050293, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8658527135848999, "num_tokens": 325599980.0, "step": 8537 }, { "epoch": 1.0861213586057754, "ewc_loss": 0.026303231716156006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6303232516511343e-05, "grad_norm": 16.458446502685547, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8511253595352173, "num_tokens": 325631745.0, "step": 8538 }, { "epoch": 1.086248568884366, "ewc_loss": 0.026406092569231987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6406092729303055e-05, "grad_norm": 16.474031448364258, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8626619577407837, "num_tokens": 325673583.0, "step": 8539 }, { "epoch": 1.0863757791629565, "ewc_loss": 0.026289276778697968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.628927722980734e-05, "grad_norm": 16.37813949584961, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8585602641105652, "num_tokens": 325712995.0, "step": 8540 }, { "epoch": 1.0865029894415468, "ewc_loss": 0.026341453194618225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.634145312185865e-05, "grad_norm": 16.454084396362305, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8725493550300598, "num_tokens": 325748259.0, "step": 8541 }, { "epoch": 1.0866301997201373, "ewc_loss": 0.0263202041387558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6320203687646426e-05, "grad_norm": 16.359901428222656, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8715362548828125, "num_tokens": 325787676.0, "step": 8542 }, { "epoch": 1.0867574099987278, "ewc_loss": 0.026324963197112083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6324963982915506e-05, "grad_norm": 16.352672576904297, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8414891958236694, "num_tokens": 325827819.0, "step": 8543 }, { "epoch": 1.0868846202773184, "ewc_loss": 0.02632618509232998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6326184524805285e-05, "grad_norm": 16.406211853027344, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8746642470359802, "num_tokens": 325866328.0, "step": 8544 }, { "epoch": 1.0870118305559089, "ewc_loss": 0.026329386979341507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.632938776514493e-05, "grad_norm": 16.358863830566406, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8612043261528015, "num_tokens": 325906752.0, "step": 8545 }, { "epoch": 1.0871390408344994, "ewc_loss": 0.026363583281636238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6363582946942188e-05, "grad_norm": 16.441150665283203, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8605772256851196, "num_tokens": 325943765.0, "step": 8546 }, { "epoch": 1.08726625111309, "ewc_loss": 0.02637304551899433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6373045329819433e-05, "grad_norm": 16.454282760620117, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8734694719314575, "num_tokens": 325981471.0, "step": 8547 }, { "epoch": 1.0873934613916805, "ewc_loss": 0.026318255811929703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.631825554999523e-05, "grad_norm": 16.42588996887207, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8714816570281982, "num_tokens": 326017253.0, "step": 8548 }, { "epoch": 1.087520671670271, "ewc_loss": 0.026364538818597794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.636453791637905e-05, "grad_norm": 16.43020248413086, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8676304221153259, "num_tokens": 326053484.0, "step": 8549 }, { "epoch": 1.0876478819488615, "ewc_loss": 0.026313651353120804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6313651687814854e-05, "grad_norm": 16.38465690612793, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8726238012313843, "num_tokens": 326084779.0, "step": 8550 }, { "epoch": 1.087775092227452, "ewc_loss": 0.026362905278801918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6362904463894665e-05, "grad_norm": 16.484500885009766, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8643801212310791, "num_tokens": 326120795.0, "step": 8551 }, { "epoch": 1.0879023025060426, "ewc_loss": 0.02635134756565094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.635134842421394e-05, "grad_norm": 16.369312286376953, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8648341298103333, "num_tokens": 326155991.0, "step": 8552 }, { "epoch": 1.0880295127846331, "ewc_loss": 0.026329996064305305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6329995307605714e-05, "grad_norm": 16.485782623291016, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8622989058494568, "num_tokens": 326195738.0, "step": 8553 }, { "epoch": 1.0881567230632234, "ewc_loss": 0.02637048251926899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6370482373749837e-05, "grad_norm": 16.307119369506836, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8658995628356934, "num_tokens": 326237773.0, "step": 8554 }, { "epoch": 1.088283933341814, "ewc_loss": 0.02629079483449459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6290794266969897e-05, "grad_norm": 16.42247772216797, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.8802105188369751, "num_tokens": 326278589.0, "step": 8555 }, { "epoch": 1.0884111436204045, "ewc_loss": 0.026359083130955696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6359082767157815e-05, "grad_norm": 16.367265701293945, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8593543767929077, "num_tokens": 326316862.0, "step": 8556 }, { "epoch": 1.088538353898995, "ewc_loss": 0.026331456378102303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.633145595609676e-05, "grad_norm": 16.437650680541992, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8630390167236328, "num_tokens": 326353892.0, "step": 8557 }, { "epoch": 1.0886655641775855, "ewc_loss": 0.026389356702566147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.638935620780103e-05, "grad_norm": 16.381929397583008, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8700915575027466, "num_tokens": 326389478.0, "step": 8558 }, { "epoch": 1.088792774456176, "ewc_loss": 0.026284320279955864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6284320483682677e-05, "grad_norm": 16.377643585205078, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8624420762062073, "num_tokens": 326428072.0, "step": 8559 }, { "epoch": 1.0889199847347666, "ewc_loss": 0.026402849704027176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6402849471196532e-05, "grad_norm": 16.45696258544922, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8681210875511169, "num_tokens": 326466374.0, "step": 8560 }, { "epoch": 1.0890471950133571, "ewc_loss": 0.02632955275475979, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6329553293180652e-05, "grad_norm": 16.35003089904785, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8627617359161377, "num_tokens": 326501081.0, "step": 8561 }, { "epoch": 1.0891744052919476, "ewc_loss": 0.026313643902540207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.631364441185724e-05, "grad_norm": 16.49268913269043, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8749699592590332, "num_tokens": 326532539.0, "step": 8562 }, { "epoch": 1.0893016155705382, "ewc_loss": 0.026414789259433746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6414789317641407e-05, "grad_norm": 16.37097930908203, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8717989921569824, "num_tokens": 326569481.0, "step": 8563 }, { "epoch": 1.0894288258491287, "ewc_loss": 0.026286300271749496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6286299544153735e-05, "grad_norm": 16.417993545532227, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8646326065063477, "num_tokens": 326609746.0, "step": 8564 }, { "epoch": 1.089556036127719, "ewc_loss": 0.026412909850478172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6412910301587544e-05, "grad_norm": 16.448532104492188, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8547341823577881, "num_tokens": 326652379.0, "step": 8565 }, { "epoch": 1.0896832464063095, "ewc_loss": 0.026342330500483513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6342329874751158e-05, "grad_norm": 16.455677032470703, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8618950247764587, "num_tokens": 326687617.0, "step": 8566 }, { "epoch": 1.0898104566849, "ewc_loss": 0.026351258158683777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.635125747474376e-05, "grad_norm": 16.44921112060547, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8581482172012329, "num_tokens": 326724155.0, "step": 8567 }, { "epoch": 1.0899376669634906, "ewc_loss": 0.026328599080443382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.632859832374379e-05, "grad_norm": 16.396780014038086, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8647211790084839, "num_tokens": 326768234.0, "step": 8568 }, { "epoch": 1.0900648772420811, "ewc_loss": 0.026367755606770515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6367755708633922e-05, "grad_norm": 16.394197463989258, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8700728416442871, "num_tokens": 326810423.0, "step": 8569 }, { "epoch": 1.0901920875206716, "ewc_loss": 0.02633679285645485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6336792871006764e-05, "grad_norm": 16.430267333984375, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8641862273216248, "num_tokens": 326851834.0, "step": 8570 }, { "epoch": 1.0903192977992622, "ewc_loss": 0.026362793520092964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.636279350554105e-05, "grad_norm": 16.391014099121094, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8538843393325806, "num_tokens": 326889334.0, "step": 8571 }, { "epoch": 1.0904465080778527, "ewc_loss": 0.026329167187213898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.63291676674271e-05, "grad_norm": 16.359458923339844, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.854759931564331, "num_tokens": 326931884.0, "step": 8572 }, { "epoch": 1.0905737183564432, "ewc_loss": 0.026359451934695244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6359452022006735e-05, "grad_norm": 16.429000854492188, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8738454580307007, "num_tokens": 326969043.0, "step": 8573 }, { "epoch": 1.0907009286350338, "ewc_loss": 0.026375150308012962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6375149900559336e-05, "grad_norm": 16.40736961364746, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8659346103668213, "num_tokens": 326998369.0, "step": 8574 }, { "epoch": 1.0908281389136243, "ewc_loss": 0.026409577578306198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.640957791300025e-05, "grad_norm": 16.528976440429688, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8650376796722412, "num_tokens": 327036909.0, "step": 8575 }, { "epoch": 1.0909553491922148, "ewc_loss": 0.02640387788414955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6403877200209536e-05, "grad_norm": 16.37680435180664, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8586130738258362, "num_tokens": 327076053.0, "step": 8576 }, { "epoch": 1.0910825594708053, "ewc_loss": 0.02628575637936592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6285755666322075e-05, "grad_norm": 16.451501846313477, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8728846311569214, "num_tokens": 327112285.0, "step": 8577 }, { "epoch": 1.0912097697493957, "ewc_loss": 0.02644028514623642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6440284273121506e-05, "grad_norm": 16.39295196533203, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8717728853225708, "num_tokens": 327151591.0, "step": 8578 }, { "epoch": 1.0913369800279862, "ewc_loss": 0.026338407769799232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6338408133597113e-05, "grad_norm": 16.434402465820312, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8510786294937134, "num_tokens": 327193600.0, "step": 8579 }, { "epoch": 1.0914641903065767, "ewc_loss": 0.026407534256577492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6407533368910663e-05, "grad_norm": 16.388160705566406, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8564049601554871, "num_tokens": 327230249.0, "step": 8580 }, { "epoch": 1.0915914005851672, "ewc_loss": 0.026320919394493103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.632091855048202e-05, "grad_norm": 16.4287109375, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.878558874130249, "num_tokens": 327270043.0, "step": 8581 }, { "epoch": 1.0917186108637578, "ewc_loss": 0.02638627588748932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6386276658740826e-05, "grad_norm": 16.412364959716797, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8681256175041199, "num_tokens": 327309396.0, "step": 8582 }, { "epoch": 1.0918458211423483, "ewc_loss": 0.026384757831692696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6384757802588865e-05, "grad_norm": 16.45452308654785, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8735710978507996, "num_tokens": 327349497.0, "step": 8583 }, { "epoch": 1.0919730314209388, "ewc_loss": 0.0263846255838871, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6384625016362406e-05, "grad_norm": 16.36168670654297, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8572437763214111, "num_tokens": 327385648.0, "step": 8584 }, { "epoch": 1.0921002416995294, "ewc_loss": 0.02634686604142189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.63468664343236e-05, "grad_norm": 16.431907653808594, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8666747808456421, "num_tokens": 327426379.0, "step": 8585 }, { "epoch": 1.0922274519781199, "ewc_loss": 0.026396797969937325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6396797693450935e-05, "grad_norm": 16.354188919067383, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.870957612991333, "num_tokens": 327467555.0, "step": 8586 }, { "epoch": 1.0923546622567104, "ewc_loss": 0.026339175179600716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.633917574712541e-05, "grad_norm": 16.399316787719727, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8616296052932739, "num_tokens": 327505303.0, "step": 8587 }, { "epoch": 1.092481872535301, "ewc_loss": 0.026402737945318222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6402738512842916e-05, "grad_norm": 16.39542579650879, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8703629970550537, "num_tokens": 327547854.0, "step": 8588 }, { "epoch": 1.0926090828138915, "ewc_loss": 0.026310089975595474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.631009010656271e-05, "grad_norm": 16.388023376464844, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8671831488609314, "num_tokens": 327584132.0, "step": 8589 }, { "epoch": 1.0927362930924818, "ewc_loss": 0.026341529563069344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6341529519413598e-05, "grad_norm": 16.337980270385742, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8549992442131042, "num_tokens": 327629250.0, "step": 8590 }, { "epoch": 1.0928635033710723, "ewc_loss": 0.026417499408125877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.641749961185269e-05, "grad_norm": 16.499452590942383, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8579390048980713, "num_tokens": 327666988.0, "step": 8591 }, { "epoch": 1.0929907136496628, "ewc_loss": 0.026404336094856262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.640433558553923e-05, "grad_norm": 16.335086822509766, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.867978572845459, "num_tokens": 327705955.0, "step": 8592 }, { "epoch": 1.0931179239282534, "ewc_loss": 0.026342470198869705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.634246993693523e-05, "grad_norm": 16.454410552978516, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8578736782073975, "num_tokens": 327746362.0, "step": 8593 }, { "epoch": 1.0932451342068439, "ewc_loss": 0.026403335854411125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.640333514136728e-05, "grad_norm": 16.41179847717285, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8650476932525635, "num_tokens": 327789034.0, "step": 8594 }, { "epoch": 1.0933723444854344, "ewc_loss": 0.026317065581679344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.631706593092531e-05, "grad_norm": 16.41434669494629, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8549473285675049, "num_tokens": 327824203.0, "step": 8595 }, { "epoch": 1.093499554764025, "ewc_loss": 0.02636345475912094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.636345561768394e-05, "grad_norm": 16.38618278503418, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8672925233840942, "num_tokens": 327860340.0, "step": 8596 }, { "epoch": 1.0936267650426155, "ewc_loss": 0.026335058733820915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6335059374105185e-05, "grad_norm": 16.386381149291992, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.858625054359436, "num_tokens": 327901709.0, "step": 8597 }, { "epoch": 1.093753975321206, "ewc_loss": 0.02636530064046383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6365300072939135e-05, "grad_norm": 16.45786476135254, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8671116828918457, "num_tokens": 327940327.0, "step": 8598 }, { "epoch": 1.0938811855997965, "ewc_loss": 0.026401309296488762, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.640130878717173e-05, "grad_norm": 16.44153594970703, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8709896206855774, "num_tokens": 327977966.0, "step": 8599 }, { "epoch": 1.094008395878387, "ewc_loss": 0.02633235789835453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.633235817484092e-05, "grad_norm": 16.47372817993164, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8618254661560059, "num_tokens": 328021143.0, "step": 8600 }, { "epoch": 1.0941356061569776, "ewc_loss": 0.026370251551270485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6370251362095587e-05, "grad_norm": 16.444133758544922, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8712506890296936, "num_tokens": 328060792.0, "step": 8601 }, { "epoch": 1.094262816435568, "ewc_loss": 0.026331519708037376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6331519620725885e-05, "grad_norm": 16.462886810302734, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8669781684875488, "num_tokens": 328099698.0, "step": 8602 }, { "epoch": 1.0943900267141584, "ewc_loss": 0.026293877512216568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6293877454008907e-05, "grad_norm": 16.334930419921875, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8687480688095093, "num_tokens": 328141075.0, "step": 8603 }, { "epoch": 1.094517236992749, "ewc_loss": 0.02628685161471367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6286852516932413e-05, "grad_norm": 16.52873420715332, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8586373925209045, "num_tokens": 328178945.0, "step": 8604 }, { "epoch": 1.0946444472713395, "ewc_loss": 0.02634759433567524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.634759403008502e-05, "grad_norm": 16.32938575744629, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8391585946083069, "num_tokens": 328214423.0, "step": 8605 }, { "epoch": 1.09477165754993, "ewc_loss": 0.026284363120794296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.628436232043896e-05, "grad_norm": 16.486162185668945, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8657243251800537, "num_tokens": 328248640.0, "step": 8606 }, { "epoch": 1.0948988678285205, "ewc_loss": 0.02633048966526985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6330490072723478e-05, "grad_norm": 16.404817581176758, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8448672294616699, "num_tokens": 328283016.0, "step": 8607 }, { "epoch": 1.095026078107111, "ewc_loss": 0.02626149170100689, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6261492166668177e-05, "grad_norm": 16.3918514251709, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8687171339988708, "num_tokens": 328322377.0, "step": 8608 }, { "epoch": 1.0951532883857016, "ewc_loss": 0.026334770023822784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.633477015479002e-05, "grad_norm": 16.442657470703125, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8669338822364807, "num_tokens": 328359409.0, "step": 8609 }, { "epoch": 1.0952804986642921, "ewc_loss": 0.02632325328886509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6323254132876173e-05, "grad_norm": 16.32077980041504, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8700334429740906, "num_tokens": 328401257.0, "step": 8610 }, { "epoch": 1.0954077089428826, "ewc_loss": 0.02636081911623478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.63608199020382e-05, "grad_norm": 16.425445556640625, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8691520690917969, "num_tokens": 328435886.0, "step": 8611 }, { "epoch": 1.0955349192214732, "ewc_loss": 0.026410497725009918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6410498321638443e-05, "grad_norm": 16.350576400756836, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8546872735023499, "num_tokens": 328472091.0, "step": 8612 }, { "epoch": 1.0956621295000637, "ewc_loss": 0.02635260857641697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6352608983870596e-05, "grad_norm": 16.434772491455078, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8582079410552979, "num_tokens": 328509453.0, "step": 8613 }, { "epoch": 1.095789339778654, "ewc_loss": 0.02643679268658161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6436791813466698e-05, "grad_norm": 16.36583709716797, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8653960227966309, "num_tokens": 328545590.0, "step": 8614 }, { "epoch": 1.0959165500572445, "ewc_loss": 0.026355039328336716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6355039153713733e-05, "grad_norm": 16.428726196289062, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.854845404624939, "num_tokens": 328583093.0, "step": 8615 }, { "epoch": 1.096043760335835, "ewc_loss": 0.02646702527999878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6467025236343034e-05, "grad_norm": 16.447940826416016, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8481159210205078, "num_tokens": 328625312.0, "step": 8616 }, { "epoch": 1.0961709706144256, "ewc_loss": 0.02641266956925392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6412670194986276e-05, "grad_norm": 16.507158279418945, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8581138849258423, "num_tokens": 328655392.0, "step": 8617 }, { "epoch": 1.0962981808930161, "ewc_loss": 0.026411723345518112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.641172250150703e-05, "grad_norm": 16.36461639404297, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8503032922744751, "num_tokens": 328693686.0, "step": 8618 }, { "epoch": 1.0964253911716066, "ewc_loss": 0.026383399963378906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.638340083649382e-05, "grad_norm": 16.412504196166992, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8770657181739807, "num_tokens": 328728051.0, "step": 8619 }, { "epoch": 1.0965526014501972, "ewc_loss": 0.026433216407895088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6433215680299327e-05, "grad_norm": 16.381591796875, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8549478054046631, "num_tokens": 328768091.0, "step": 8620 }, { "epoch": 1.0966798117287877, "ewc_loss": 0.026427969336509705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.64279697148595e-05, "grad_norm": 16.41090202331543, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8625391125679016, "num_tokens": 328803851.0, "step": 8621 }, { "epoch": 1.0968070220073782, "ewc_loss": 0.026456208899617195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.645620952534955e-05, "grad_norm": 16.49463653564453, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8762428760528564, "num_tokens": 328837429.0, "step": 8622 }, { "epoch": 1.0969342322859688, "ewc_loss": 0.02646058425307274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6460584194865078e-05, "grad_norm": 16.424163818359375, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8650942444801331, "num_tokens": 328874264.0, "step": 8623 }, { "epoch": 1.0970614425645593, "ewc_loss": 0.026459893211722374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.645989297889173e-05, "grad_norm": 16.425765991210938, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8675673007965088, "num_tokens": 328911627.0, "step": 8624 }, { "epoch": 1.0971886528431498, "ewc_loss": 0.026449212804436684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6449213692103513e-05, "grad_norm": 16.41758918762207, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.858319878578186, "num_tokens": 328944471.0, "step": 8625 }, { "epoch": 1.0973158631217403, "ewc_loss": 0.026482723653316498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6482723114895634e-05, "grad_norm": 16.458372116088867, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8724066019058228, "num_tokens": 328983826.0, "step": 8626 }, { "epoch": 1.0974430734003306, "ewc_loss": 0.026491234079003334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6491234166314825e-05, "grad_norm": 16.38521385192871, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8656046986579895, "num_tokens": 329019385.0, "step": 8627 }, { "epoch": 1.0975702836789212, "ewc_loss": 0.026481466367840767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6481466193217784e-05, "grad_norm": 16.43436050415039, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8526440858840942, "num_tokens": 329065179.0, "step": 8628 }, { "epoch": 1.0976974939575117, "ewc_loss": 0.02651824615895748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.651824615895748e-05, "grad_norm": 16.421537399291992, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8715236783027649, "num_tokens": 329101077.0, "step": 8629 }, { "epoch": 1.0978247042361022, "ewc_loss": 0.026474343612790108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.64743430307135e-05, "grad_norm": 16.461259841918945, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8705446124076843, "num_tokens": 329137538.0, "step": 8630 }, { "epoch": 1.0979519145146928, "ewc_loss": 0.026490941643714905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6490941309020855e-05, "grad_norm": 16.441869735717773, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8789868950843811, "num_tokens": 329171952.0, "step": 8631 }, { "epoch": 1.0980791247932833, "ewc_loss": 0.026473158970475197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.647315886861179e-05, "grad_norm": 16.458850860595703, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8732876181602478, "num_tokens": 329214956.0, "step": 8632 }, { "epoch": 1.0982063350718738, "ewc_loss": 0.026470085605978966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6470084776519798e-05, "grad_norm": 16.358110427856445, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8549671173095703, "num_tokens": 329254228.0, "step": 8633 }, { "epoch": 1.0983335453504643, "ewc_loss": 0.02648015506565571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6480154701857828e-05, "grad_norm": 16.42427635192871, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8668652772903442, "num_tokens": 329288337.0, "step": 8634 }, { "epoch": 1.0984607556290549, "ewc_loss": 0.02646586298942566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6465862902114168e-05, "grad_norm": 16.36578941345215, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8582013845443726, "num_tokens": 329329118.0, "step": 8635 }, { "epoch": 1.0985879659076454, "ewc_loss": 0.026416771113872528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.641677019710187e-05, "grad_norm": 16.46249008178711, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8519899249076843, "num_tokens": 329366301.0, "step": 8636 }, { "epoch": 1.098715176186236, "ewc_loss": 0.02650735154747963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6507352231419645e-05, "grad_norm": 16.44515609741211, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8539842963218689, "num_tokens": 329401612.0, "step": 8637 }, { "epoch": 1.0988423864648265, "ewc_loss": 0.026475613936781883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6475614504306577e-05, "grad_norm": 16.423189163208008, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8671267032623291, "num_tokens": 329440403.0, "step": 8638 }, { "epoch": 1.0989695967434168, "ewc_loss": 0.026469798758625984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.646979919518344e-05, "grad_norm": 16.511072158813477, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8750280141830444, "num_tokens": 329478699.0, "step": 8639 }, { "epoch": 1.0990968070220073, "ewc_loss": 0.026466090232133865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.646609027578961e-05, "grad_norm": 16.457712173461914, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8640612363815308, "num_tokens": 329517860.0, "step": 8640 }, { "epoch": 1.0992240173005978, "ewc_loss": 0.0264485664665699, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.644856613187585e-05, "grad_norm": 16.484708786010742, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8701391220092773, "num_tokens": 329555385.0, "step": 8641 }, { "epoch": 1.0993512275791884, "ewc_loss": 0.026443317532539368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.644331834744662e-05, "grad_norm": 16.47677230834961, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8623369932174683, "num_tokens": 329594775.0, "step": 8642 }, { "epoch": 1.0994784378577789, "ewc_loss": 0.02639094367623329, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6390944185550325e-05, "grad_norm": 16.480274200439453, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8690677285194397, "num_tokens": 329629872.0, "step": 8643 }, { "epoch": 1.0996056481363694, "ewc_loss": 0.02642679400742054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.642679464770481e-05, "grad_norm": 16.496095657348633, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8593708276748657, "num_tokens": 329665263.0, "step": 8644 }, { "epoch": 1.09973285841496, "ewc_loss": 0.02640778385102749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6407784389448352e-05, "grad_norm": 16.49028778076172, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8552902340888977, "num_tokens": 329705853.0, "step": 8645 }, { "epoch": 1.0998600686935505, "ewc_loss": 0.026371918618679047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.637191937537864e-05, "grad_norm": 16.3507137298584, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8715157508850098, "num_tokens": 329741690.0, "step": 8646 }, { "epoch": 1.099987278972141, "ewc_loss": 0.02638075314462185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6380752387922257e-05, "grad_norm": 16.520442962646484, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8570436239242554, "num_tokens": 329781548.0, "step": 8647 }, { "epoch": 1.1001144892507315, "ewc_loss": 0.02646256983280182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.646257053129375e-05, "grad_norm": 16.35852813720703, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8730285167694092, "num_tokens": 329822304.0, "step": 8648 }, { "epoch": 1.100241699529322, "ewc_loss": 0.02634810097515583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.634810152812861e-05, "grad_norm": 16.49501609802246, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8668720722198486, "num_tokens": 329856065.0, "step": 8649 }, { "epoch": 1.1003689098079126, "ewc_loss": 0.026482950896024704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6482950488571078e-05, "grad_norm": 16.4185848236084, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8494645357131958, "num_tokens": 329895002.0, "step": 8650 }, { "epoch": 1.100496120086503, "ewc_loss": 0.02642740122973919, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.642740037117619e-05, "grad_norm": 16.509313583374023, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8594722747802734, "num_tokens": 329938810.0, "step": 8651 }, { "epoch": 1.1006233303650934, "ewc_loss": 0.02650504745543003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6505047571845353e-05, "grad_norm": 16.414756774902344, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.8778660893440247, "num_tokens": 329976513.0, "step": 8652 }, { "epoch": 1.100750540643684, "ewc_loss": 0.026402181014418602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.640218190208543e-05, "grad_norm": 16.477798461914062, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8731042146682739, "num_tokens": 330014013.0, "step": 8653 }, { "epoch": 1.1008777509222745, "ewc_loss": 0.026510637253522873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.651063732628245e-05, "grad_norm": 16.43462562561035, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.861189067363739, "num_tokens": 330052577.0, "step": 8654 }, { "epoch": 1.101004961200865, "ewc_loss": 0.02641335502266884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.641335413500201e-05, "grad_norm": 16.4400634765625, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8629889488220215, "num_tokens": 330091385.0, "step": 8655 }, { "epoch": 1.1011321714794555, "ewc_loss": 0.026412563398480415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6412562874611467e-05, "grad_norm": 16.435033798217773, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8741360902786255, "num_tokens": 330130285.0, "step": 8656 }, { "epoch": 1.101259381758046, "ewc_loss": 0.026463890448212624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6463891117600724e-05, "grad_norm": 16.42976951599121, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8519822359085083, "num_tokens": 330166823.0, "step": 8657 }, { "epoch": 1.1013865920366366, "ewc_loss": 0.026442082598805428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6442083253641613e-05, "grad_norm": 16.441843032836914, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8669832944869995, "num_tokens": 330200655.0, "step": 8658 }, { "epoch": 1.101513802315227, "ewc_loss": 0.026510490104556084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6510489988140762e-05, "grad_norm": 16.489444732666016, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8639218211174011, "num_tokens": 330243656.0, "step": 8659 }, { "epoch": 1.1016410125938176, "ewc_loss": 0.02643628790974617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6436287953401916e-05, "grad_norm": 16.430625915527344, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8571697473526001, "num_tokens": 330287200.0, "step": 8660 }, { "epoch": 1.1017682228724082, "ewc_loss": 0.026442566886544228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6442567104822956e-05, "grad_norm": 16.395421981811523, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8676175475120544, "num_tokens": 330324794.0, "step": 8661 }, { "epoch": 1.1018954331509987, "ewc_loss": 0.026416271924972534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6416271794005297e-05, "grad_norm": 16.412179946899414, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8673312664031982, "num_tokens": 330365297.0, "step": 8662 }, { "epoch": 1.102022643429589, "ewc_loss": 0.026431063190102577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6431063815834932e-05, "grad_norm": 16.405105590820312, "learning_rate": 1e-06, "loss": 0.3649, "mean_token_accuracy": 0.8823707103729248, "num_tokens": 330408046.0, "step": 8663 }, { "epoch": 1.1021498537081795, "ewc_loss": 0.026451321318745613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6451321900822222e-05, "grad_norm": 16.4646053314209, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8614251613616943, "num_tokens": 330447815.0, "step": 8664 }, { "epoch": 1.10227706398677, "ewc_loss": 0.026438390836119652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6438390705152415e-05, "grad_norm": 16.553279876708984, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8549481630325317, "num_tokens": 330487796.0, "step": 8665 }, { "epoch": 1.1024042742653606, "ewc_loss": 0.02641558088362217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.641558057803195e-05, "grad_norm": 16.36955451965332, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8766223192214966, "num_tokens": 330531155.0, "step": 8666 }, { "epoch": 1.1025314845439511, "ewc_loss": 0.026453761383891106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6453761165612377e-05, "grad_norm": 16.50909996032715, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8749459981918335, "num_tokens": 330568611.0, "step": 8667 }, { "epoch": 1.1026586948225416, "ewc_loss": 0.026497505605220795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.649750604177825e-05, "grad_norm": 16.39907455444336, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8600450754165649, "num_tokens": 330608085.0, "step": 8668 }, { "epoch": 1.1027859051011322, "ewc_loss": 0.026433022692799568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.643302286742255e-05, "grad_norm": 16.446739196777344, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8478844165802002, "num_tokens": 330644776.0, "step": 8669 }, { "epoch": 1.1029131153797227, "ewc_loss": 0.026554308831691742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.655430944287218e-05, "grad_norm": 16.46143341064453, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8552425503730774, "num_tokens": 330686401.0, "step": 8670 }, { "epoch": 1.1030403256583132, "ewc_loss": 0.026455674320459366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6455674742464907e-05, "grad_norm": 16.470983505249023, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8704659342765808, "num_tokens": 330723115.0, "step": 8671 }, { "epoch": 1.1031675359369038, "ewc_loss": 0.02646145224571228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.646145185281057e-05, "grad_norm": 16.440425872802734, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8807808756828308, "num_tokens": 330761170.0, "step": 8672 }, { "epoch": 1.1032947462154943, "ewc_loss": 0.026427939534187317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.642793879203964e-05, "grad_norm": 16.501493453979492, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8655070662498474, "num_tokens": 330790230.0, "step": 8673 }, { "epoch": 1.1034219564940848, "ewc_loss": 0.0264783576130867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6478357540327124e-05, "grad_norm": 16.450708389282227, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8758487105369568, "num_tokens": 330826178.0, "step": 8674 }, { "epoch": 1.1035491667726753, "ewc_loss": 0.0264595840126276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6459583750693128e-05, "grad_norm": 16.468168258666992, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8721767067909241, "num_tokens": 330866341.0, "step": 8675 }, { "epoch": 1.1036763770512656, "ewc_loss": 0.026460712775588036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.646071334311273e-05, "grad_norm": 16.421222686767578, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8514726758003235, "num_tokens": 330903955.0, "step": 8676 }, { "epoch": 1.1038035873298562, "ewc_loss": 0.026463640853762627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6463640097063035e-05, "grad_norm": 16.49517250061035, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8615388870239258, "num_tokens": 330943024.0, "step": 8677 }, { "epoch": 1.1039307976084467, "ewc_loss": 0.02645382285118103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6453823011252098e-05, "grad_norm": 16.41707992553711, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8599714040756226, "num_tokens": 330981004.0, "step": 8678 }, { "epoch": 1.1040580078870372, "ewc_loss": 0.026447687298059464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6447687559993938e-05, "grad_norm": 16.455894470214844, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8638909459114075, "num_tokens": 331021532.0, "step": 8679 }, { "epoch": 1.1041852181656278, "ewc_loss": 0.026453616097569466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6453615646460094e-05, "grad_norm": 16.468740463256836, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.873431921005249, "num_tokens": 331058010.0, "step": 8680 }, { "epoch": 1.1043124284442183, "ewc_loss": 0.026459190994501114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.645919084898196e-05, "grad_norm": 16.50166893005371, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.874205470085144, "num_tokens": 331090713.0, "step": 8681 }, { "epoch": 1.1044396387228088, "ewc_loss": 0.026479439809918404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6479439839022234e-05, "grad_norm": 16.48104476928711, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8694318532943726, "num_tokens": 331127532.0, "step": 8682 }, { "epoch": 1.1045668490013993, "ewc_loss": 0.026448411867022514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.644841151777655e-05, "grad_norm": 16.427288055419922, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8457478284835815, "num_tokens": 331172088.0, "step": 8683 }, { "epoch": 1.1046940592799899, "ewc_loss": 0.02651604823768139, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6516048819757998e-05, "grad_norm": 16.53387451171875, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8609814047813416, "num_tokens": 331213419.0, "step": 8684 }, { "epoch": 1.1048212695585804, "ewc_loss": 0.026457104831933975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6457104468136095e-05, "grad_norm": 16.424644470214844, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8654241561889648, "num_tokens": 331250102.0, "step": 8685 }, { "epoch": 1.104948479837171, "ewc_loss": 0.026410626247525215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.641062565089669e-05, "grad_norm": 16.472553253173828, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8664542436599731, "num_tokens": 331284597.0, "step": 8686 }, { "epoch": 1.1050756901157615, "ewc_loss": 0.026503730565309525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6503730623517185e-05, "grad_norm": 16.489656448364258, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8577258586883545, "num_tokens": 331330688.0, "step": 8687 }, { "epoch": 1.1052029003943518, "ewc_loss": 0.026408175006508827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6408175472170115e-05, "grad_norm": 16.412626266479492, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8678210377693176, "num_tokens": 331368777.0, "step": 8688 }, { "epoch": 1.1053301106729423, "ewc_loss": 0.026490265503525734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6490264644962735e-05, "grad_norm": 16.604158401489258, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8698620796203613, "num_tokens": 331414369.0, "step": 8689 }, { "epoch": 1.1054573209515328, "ewc_loss": 0.026464004069566727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6464003894943744e-05, "grad_norm": 16.478181838989258, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8715174198150635, "num_tokens": 331452788.0, "step": 8690 }, { "epoch": 1.1055845312301233, "ewc_loss": 0.026369385421276093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.63693855231395e-05, "grad_norm": 16.461185455322266, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8540923595428467, "num_tokens": 331489575.0, "step": 8691 }, { "epoch": 1.1057117415087139, "ewc_loss": 0.02649032138288021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6490321033634245e-05, "grad_norm": 16.492475509643555, "learning_rate": 1e-06, "loss": 0.3659, "mean_token_accuracy": 0.8818971514701843, "num_tokens": 331529129.0, "step": 8692 }, { "epoch": 1.1058389517873044, "ewc_loss": 0.02643289789557457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6432897357153706e-05, "grad_norm": 16.474519729614258, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8606417179107666, "num_tokens": 331564989.0, "step": 8693 }, { "epoch": 1.105966162065895, "ewc_loss": 0.026522843167185783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6522842745180242e-05, "grad_norm": 16.528919219970703, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8661311268806458, "num_tokens": 331602561.0, "step": 8694 }, { "epoch": 1.1060933723444855, "ewc_loss": 0.026433082297444344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6433082894072868e-05, "grad_norm": 16.39656639099121, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8667266368865967, "num_tokens": 331646902.0, "step": 8695 }, { "epoch": 1.106220582623076, "ewc_loss": 0.02642141841351986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.642141771502793e-05, "grad_norm": 16.451337814331055, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8660460710525513, "num_tokens": 331682347.0, "step": 8696 }, { "epoch": 1.1063477929016665, "ewc_loss": 0.026480061933398247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6480061933398247e-05, "grad_norm": 16.472143173217773, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8583237528800964, "num_tokens": 331725813.0, "step": 8697 }, { "epoch": 1.106475003180257, "ewc_loss": 0.02647171914577484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6471718229004182e-05, "grad_norm": 16.44695281982422, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8450256586074829, "num_tokens": 331767856.0, "step": 8698 }, { "epoch": 1.1066022134588476, "ewc_loss": 0.026446346193552017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6446346964803524e-05, "grad_norm": 16.4111385345459, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8739274740219116, "num_tokens": 331804238.0, "step": 8699 }, { "epoch": 1.106729423737438, "ewc_loss": 0.026477927342057228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6477928258827887e-05, "grad_norm": 16.493755340576172, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8660398721694946, "num_tokens": 331839963.0, "step": 8700 }, { "epoch": 1.1068566340160284, "ewc_loss": 0.026476044207811356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6476043785805814e-05, "grad_norm": 16.449926376342773, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.869528591632843, "num_tokens": 331881563.0, "step": 8701 }, { "epoch": 1.106983844294619, "ewc_loss": 0.026434490457177162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6434490791871212e-05, "grad_norm": 16.5350399017334, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8620480895042419, "num_tokens": 331925540.0, "step": 8702 }, { "epoch": 1.1071110545732095, "ewc_loss": 0.0264644306153059, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6464431357453577e-05, "grad_norm": 16.41436767578125, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8729455471038818, "num_tokens": 331963171.0, "step": 8703 }, { "epoch": 1.1072382648518, "ewc_loss": 0.026391560211777687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6391560822958127e-05, "grad_norm": 16.466798782348633, "learning_rate": 1e-06, "loss": 0.3596, "mean_token_accuracy": 0.8853448629379272, "num_tokens": 331998310.0, "step": 8704 }, { "epoch": 1.1073654751303905, "ewc_loss": 0.026538318023085594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.653831870702561e-05, "grad_norm": 16.497661590576172, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8715159296989441, "num_tokens": 332034181.0, "step": 8705 }, { "epoch": 1.107492685408981, "ewc_loss": 0.026434114202857018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.643411426106468e-05, "grad_norm": 16.473085403442383, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8748714923858643, "num_tokens": 332075368.0, "step": 8706 }, { "epoch": 1.1076198956875716, "ewc_loss": 0.026439571753144264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6439571229275316e-05, "grad_norm": 16.424406051635742, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8755384087562561, "num_tokens": 332113281.0, "step": 8707 }, { "epoch": 1.107747105966162, "ewc_loss": 0.02642795443534851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6427955162944272e-05, "grad_norm": 16.48296356201172, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8576445579528809, "num_tokens": 332154206.0, "step": 8708 }, { "epoch": 1.1078743162447526, "ewc_loss": 0.026460854336619377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6460855224286206e-05, "grad_norm": 16.463577270507812, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.850452721118927, "num_tokens": 332195409.0, "step": 8709 }, { "epoch": 1.1080015265233432, "ewc_loss": 0.026448175311088562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6448175049154088e-05, "grad_norm": 16.446441650390625, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8739796876907349, "num_tokens": 332233836.0, "step": 8710 }, { "epoch": 1.1081287368019337, "ewc_loss": 0.026435917243361473, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6435916879563592e-05, "grad_norm": 16.49299430847168, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8700657486915588, "num_tokens": 332274167.0, "step": 8711 }, { "epoch": 1.108255947080524, "ewc_loss": 0.026457861065864563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.645786116772797e-05, "grad_norm": 16.476137161254883, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8623314499855042, "num_tokens": 332315856.0, "step": 8712 }, { "epoch": 1.1083831573591145, "ewc_loss": 0.026408562436699867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.640856291691307e-05, "grad_norm": 16.48518943786621, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8650857210159302, "num_tokens": 332366365.0, "step": 8713 }, { "epoch": 1.108510367637705, "ewc_loss": 0.026459697633981705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.645969834702555e-05, "grad_norm": 16.473440170288086, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8637018799781799, "num_tokens": 332400411.0, "step": 8714 }, { "epoch": 1.1086375779162956, "ewc_loss": 0.026380721479654312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6380721465102397e-05, "grad_norm": 16.48348617553711, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.868199348449707, "num_tokens": 332441959.0, "step": 8715 }, { "epoch": 1.108764788194886, "ewc_loss": 0.02641051448881626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6410514692543074e-05, "grad_norm": 16.488035202026367, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8523657917976379, "num_tokens": 332479545.0, "step": 8716 }, { "epoch": 1.1088919984734766, "ewc_loss": 0.026400944218039513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.640094498929102e-05, "grad_norm": 16.438323974609375, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8767116069793701, "num_tokens": 332516240.0, "step": 8717 }, { "epoch": 1.1090192087520672, "ewc_loss": 0.02640017308294773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6400173737783916e-05, "grad_norm": 16.52798843383789, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8702662587165833, "num_tokens": 332553488.0, "step": 8718 }, { "epoch": 1.1091464190306577, "ewc_loss": 0.026427261531352997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.642726212798152e-05, "grad_norm": 16.427003860473633, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8567644953727722, "num_tokens": 332588646.0, "step": 8719 }, { "epoch": 1.1092736293092482, "ewc_loss": 0.026344720274209976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6344720026827417e-05, "grad_norm": 16.464799880981445, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8560514450073242, "num_tokens": 332629103.0, "step": 8720 }, { "epoch": 1.1094008395878387, "ewc_loss": 0.026426680386066437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6426680051372387e-05, "grad_norm": 16.6235408782959, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8709558844566345, "num_tokens": 332664820.0, "step": 8721 }, { "epoch": 1.1095280498664293, "ewc_loss": 0.026496071368455887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6496070859138854e-05, "grad_norm": 16.462221145629883, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8671659231185913, "num_tokens": 332699772.0, "step": 8722 }, { "epoch": 1.1096552601450198, "ewc_loss": 0.026390116661787033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.639011654537171e-05, "grad_norm": 16.460384368896484, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8736830949783325, "num_tokens": 332733478.0, "step": 8723 }, { "epoch": 1.1097824704236103, "ewc_loss": 0.02649693936109543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6496938517084345e-05, "grad_norm": 16.449247360229492, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.861733078956604, "num_tokens": 332774733.0, "step": 8724 }, { "epoch": 1.1099096807022006, "ewc_loss": 0.02646097168326378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6460971639608033e-05, "grad_norm": 16.437265396118164, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8422013521194458, "num_tokens": 332812970.0, "step": 8725 }, { "epoch": 1.1100368909807912, "ewc_loss": 0.026527153328061104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6527153750066645e-05, "grad_norm": 16.429244995117188, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8612265586853027, "num_tokens": 332854974.0, "step": 8726 }, { "epoch": 1.1101641012593817, "ewc_loss": 0.026616785675287247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.661678627191577e-05, "grad_norm": 16.553417205810547, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8518608808517456, "num_tokens": 332886462.0, "step": 8727 }, { "epoch": 1.1102913115379722, "ewc_loss": 0.02659713663160801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.659713754837867e-05, "grad_norm": 16.41326904296875, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8613820672035217, "num_tokens": 332928614.0, "step": 8728 }, { "epoch": 1.1104185218165628, "ewc_loss": 0.02653292380273342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6532923584454693e-05, "grad_norm": 16.470579147338867, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8661891222000122, "num_tokens": 332962447.0, "step": 8729 }, { "epoch": 1.1105457320951533, "ewc_loss": 0.026654468849301338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6654468456399627e-05, "grad_norm": 16.455642700195312, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8571741580963135, "num_tokens": 332991395.0, "step": 8730 }, { "epoch": 1.1106729423737438, "ewc_loss": 0.026650678366422653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6650677682482637e-05, "grad_norm": 16.477365493774414, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8576461672782898, "num_tokens": 333027852.0, "step": 8731 }, { "epoch": 1.1108001526523343, "ewc_loss": 0.026589613407850266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6589614208205603e-05, "grad_norm": 16.44426155090332, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8679841160774231, "num_tokens": 333063337.0, "step": 8732 }, { "epoch": 1.1109273629309249, "ewc_loss": 0.026627471670508385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.66274710156722e-05, "grad_norm": 16.41868782043457, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8559423685073853, "num_tokens": 333094632.0, "step": 8733 }, { "epoch": 1.1110545732095154, "ewc_loss": 0.026695845648646355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6695845008362085e-05, "grad_norm": 16.482873916625977, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.8789898157119751, "num_tokens": 333134071.0, "step": 8734 }, { "epoch": 1.111181783488106, "ewc_loss": 0.026627538725733757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.662753831828013e-05, "grad_norm": 16.412641525268555, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8715896606445312, "num_tokens": 333171288.0, "step": 8735 }, { "epoch": 1.1113089937666965, "ewc_loss": 0.026667261496186256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6667261408874765e-05, "grad_norm": 16.47893524169922, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8732789754867554, "num_tokens": 333212111.0, "step": 8736 }, { "epoch": 1.1114362040452868, "ewc_loss": 0.026660703122615814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6660703952074982e-05, "grad_norm": 16.417558670043945, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8710422515869141, "num_tokens": 333247557.0, "step": 8737 }, { "epoch": 1.1115634143238773, "ewc_loss": 0.026589984074234962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6589983463054523e-05, "grad_norm": 16.518539428710938, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8644009828567505, "num_tokens": 333278705.0, "step": 8738 }, { "epoch": 1.1116906246024678, "ewc_loss": 0.026706939563155174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.670693902473431e-05, "grad_norm": 16.469619750976562, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8679170608520508, "num_tokens": 333316493.0, "step": 8739 }, { "epoch": 1.1118178348810583, "ewc_loss": 0.02661331743001938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.661331745912321e-05, "grad_norm": 16.41360092163086, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8683130741119385, "num_tokens": 333360384.0, "step": 8740 }, { "epoch": 1.1119450451596489, "ewc_loss": 0.026664983481168747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6664984034141526e-05, "grad_norm": 16.503440856933594, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8666208982467651, "num_tokens": 333395568.0, "step": 8741 }, { "epoch": 1.1120722554382394, "ewc_loss": 0.026656027883291245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6656027330318466e-05, "grad_norm": 16.514175415039062, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8692207336425781, "num_tokens": 333429267.0, "step": 8742 }, { "epoch": 1.11219946571683, "ewc_loss": 0.026624126359820366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.662412589415908e-05, "grad_norm": 16.479747772216797, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8719910383224487, "num_tokens": 333469908.0, "step": 8743 }, { "epoch": 1.1123266759954205, "ewc_loss": 0.026601668447256088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.66016686509829e-05, "grad_norm": 16.49991798400879, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8696976900100708, "num_tokens": 333507865.0, "step": 8744 }, { "epoch": 1.112453886274011, "ewc_loss": 0.026623563840985298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6623563826433383e-05, "grad_norm": 16.507129669189453, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8768295645713806, "num_tokens": 333544011.0, "step": 8745 }, { "epoch": 1.1125810965526015, "ewc_loss": 0.0266034547239542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6603454898577183e-05, "grad_norm": 16.525407791137695, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.867275595664978, "num_tokens": 333581949.0, "step": 8746 }, { "epoch": 1.112708306831192, "ewc_loss": 0.02658797800540924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6587977117742412e-05, "grad_norm": 16.50537109375, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8738929629325867, "num_tokens": 333615994.0, "step": 8747 }, { "epoch": 1.1128355171097826, "ewc_loss": 0.026608919724822044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6608919142745435e-05, "grad_norm": 16.51265525817871, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8810962438583374, "num_tokens": 333654454.0, "step": 8748 }, { "epoch": 1.112962727388373, "ewc_loss": 0.02651311457157135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.651311478985008e-05, "grad_norm": 16.50288963317871, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8583155274391174, "num_tokens": 333700206.0, "step": 8749 }, { "epoch": 1.1130899376669634, "ewc_loss": 0.02656693197786808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6566931410343386e-05, "grad_norm": 16.509841918945312, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8696221709251404, "num_tokens": 333737538.0, "step": 8750 }, { "epoch": 1.113217147945554, "ewc_loss": 0.026579441502690315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6579442419460975e-05, "grad_norm": 16.52187728881836, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.867796778678894, "num_tokens": 333771237.0, "step": 8751 }, { "epoch": 1.1133443582241445, "ewc_loss": 0.026582367718219757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6582367354421876e-05, "grad_norm": 16.51175880432129, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8627769947052002, "num_tokens": 333806522.0, "step": 8752 }, { "epoch": 1.113471568502735, "ewc_loss": 0.026577705517411232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.657770528458059e-05, "grad_norm": 16.557701110839844, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8656853437423706, "num_tokens": 333846656.0, "step": 8753 }, { "epoch": 1.1135987787813255, "ewc_loss": 0.026543542742729187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6543542844592594e-05, "grad_norm": 16.389673233032227, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8573384881019592, "num_tokens": 333886294.0, "step": 8754 }, { "epoch": 1.113725989059916, "ewc_loss": 0.026476819068193436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6476818675291725e-05, "grad_norm": 16.525558471679688, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8628150224685669, "num_tokens": 333923868.0, "step": 8755 }, { "epoch": 1.1138531993385066, "ewc_loss": 0.026583222672343254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6583222279441543e-05, "grad_norm": 16.512117385864258, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.869888424873352, "num_tokens": 333957187.0, "step": 8756 }, { "epoch": 1.113980409617097, "ewc_loss": 0.026508184149861336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6508183509577066e-05, "grad_norm": 16.506093978881836, "learning_rate": 1e-06, "loss": 0.3673, "mean_token_accuracy": 0.8797920346260071, "num_tokens": 333999061.0, "step": 8757 }, { "epoch": 1.1141076198956876, "ewc_loss": 0.026562925428152084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6562925995676778e-05, "grad_norm": 16.46436882019043, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8606956005096436, "num_tokens": 334043595.0, "step": 8758 }, { "epoch": 1.1142348301742782, "ewc_loss": 0.026592977344989777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.659297751961276e-05, "grad_norm": 16.610261917114258, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8707128167152405, "num_tokens": 334078487.0, "step": 8759 }, { "epoch": 1.1143620404528687, "ewc_loss": 0.02653609961271286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6536099539953284e-05, "grad_norm": 16.419418334960938, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8743066191673279, "num_tokens": 334119610.0, "step": 8760 }, { "epoch": 1.114489250731459, "ewc_loss": 0.02651171013712883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.651171053003054e-05, "grad_norm": 16.55411148071289, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8622145056724548, "num_tokens": 334160149.0, "step": 8761 }, { "epoch": 1.1146164610100495, "ewc_loss": 0.026555757969617844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6555757358437404e-05, "grad_norm": 16.439512252807617, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8669196367263794, "num_tokens": 334198667.0, "step": 8762 }, { "epoch": 1.11474367128864, "ewc_loss": 0.026520535349845886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6520536266616546e-05, "grad_norm": 16.53403091430664, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8680071830749512, "num_tokens": 334233838.0, "step": 8763 }, { "epoch": 1.1148708815672306, "ewc_loss": 0.026508040726184845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6508039809414186e-05, "grad_norm": 16.437580108642578, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8580328822135925, "num_tokens": 334275703.0, "step": 8764 }, { "epoch": 1.114998091845821, "ewc_loss": 0.026541687548160553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6541687475400977e-05, "grad_norm": 16.517438888549805, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8769434094429016, "num_tokens": 334311119.0, "step": 8765 }, { "epoch": 1.1151253021244116, "ewc_loss": 0.026525460183620453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6525460270931944e-05, "grad_norm": 16.466230392456055, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8591561913490295, "num_tokens": 334348298.0, "step": 8766 }, { "epoch": 1.1152525124030022, "ewc_loss": 0.02652081288397312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6520812753005885e-05, "grad_norm": 16.50494956970215, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8525584936141968, "num_tokens": 334386986.0, "step": 8767 }, { "epoch": 1.1153797226815927, "ewc_loss": 0.026542019098997116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6542018531472422e-05, "grad_norm": 16.468456268310547, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8698961734771729, "num_tokens": 334423263.0, "step": 8768 }, { "epoch": 1.1155069329601832, "ewc_loss": 0.02651895396411419, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.651895374583546e-05, "grad_norm": 16.486312866210938, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8824291229248047, "num_tokens": 334460363.0, "step": 8769 }, { "epoch": 1.1156341432387737, "ewc_loss": 0.026557404547929764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6557405362837017e-05, "grad_norm": 16.565500259399414, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8694101572036743, "num_tokens": 334501883.0, "step": 8770 }, { "epoch": 1.1157613535173643, "ewc_loss": 0.02655182033777237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.655182106536813e-05, "grad_norm": 16.564132690429688, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8650903105735779, "num_tokens": 334536242.0, "step": 8771 }, { "epoch": 1.1158885637959548, "ewc_loss": 0.02648092806339264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6480927772354335e-05, "grad_norm": 16.476295471191406, "learning_rate": 1e-06, "loss": 0.3735, "mean_token_accuracy": 0.8782070279121399, "num_tokens": 334569685.0, "step": 8772 }, { "epoch": 1.1160157740745453, "ewc_loss": 0.026477308943867683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6477309802430682e-05, "grad_norm": 16.568037033081055, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8687856793403625, "num_tokens": 334604946.0, "step": 8773 }, { "epoch": 1.1161429843531356, "ewc_loss": 0.026563405990600586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6563406208879314e-05, "grad_norm": 16.605745315551758, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8723568916320801, "num_tokens": 334645495.0, "step": 8774 }, { "epoch": 1.1162701946317262, "ewc_loss": 0.02648691087961197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6486910428502597e-05, "grad_norm": 16.45459747314453, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8616012930870056, "num_tokens": 334684239.0, "step": 8775 }, { "epoch": 1.1163974049103167, "ewc_loss": 0.026487311348319054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6487310606171377e-05, "grad_norm": 16.63269805908203, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.872265100479126, "num_tokens": 334713799.0, "step": 8776 }, { "epoch": 1.1165246151889072, "ewc_loss": 0.026524705812335014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6524705390329473e-05, "grad_norm": 16.45537757873535, "learning_rate": 1e-06, "loss": 0.3642, "mean_token_accuracy": 0.8865812420845032, "num_tokens": 334753374.0, "step": 8777 }, { "epoch": 1.1166518254674977, "ewc_loss": 0.02643764019012451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.643763946252875e-05, "grad_norm": 16.4740047454834, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8674687147140503, "num_tokens": 334792867.0, "step": 8778 }, { "epoch": 1.1167790357460883, "ewc_loss": 0.026568979024887085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.656897959241178e-05, "grad_norm": 16.55657196044922, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8636255264282227, "num_tokens": 334832664.0, "step": 8779 }, { "epoch": 1.1169062460246788, "ewc_loss": 0.026500016450881958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6500016247155145e-05, "grad_norm": 16.450681686401367, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8596678972244263, "num_tokens": 334876763.0, "step": 8780 }, { "epoch": 1.1170334563032693, "ewc_loss": 0.02649025432765484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6490253731026314e-05, "grad_norm": 16.513004302978516, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8668200373649597, "num_tokens": 334909431.0, "step": 8781 }, { "epoch": 1.1171606665818599, "ewc_loss": 0.02657095156610012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6570951376925223e-05, "grad_norm": 16.571821212768555, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.864537239074707, "num_tokens": 334941939.0, "step": 8782 }, { "epoch": 1.1172878768604504, "ewc_loss": 0.02651616372168064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.651616341609042e-05, "grad_norm": 16.45863914489746, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.875198483467102, "num_tokens": 334981338.0, "step": 8783 }, { "epoch": 1.117415087139041, "ewc_loss": 0.02652190625667572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6521905965637416e-05, "grad_norm": 16.47731590270996, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8662173748016357, "num_tokens": 335020660.0, "step": 8784 }, { "epoch": 1.1175422974176314, "ewc_loss": 0.026536673307418823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.65366725216154e-05, "grad_norm": 16.50124168395996, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8692182302474976, "num_tokens": 335057377.0, "step": 8785 }, { "epoch": 1.1176695076962218, "ewc_loss": 0.026559680700302124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6559680918580852e-05, "grad_norm": 16.44854164123535, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8566495180130005, "num_tokens": 335092912.0, "step": 8786 }, { "epoch": 1.1177967179748123, "ewc_loss": 0.026488007977604866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6488007279112935e-05, "grad_norm": 16.494672775268555, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.881219744682312, "num_tokens": 335129157.0, "step": 8787 }, { "epoch": 1.1179239282534028, "ewc_loss": 0.026604915037751198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.660491554706823e-05, "grad_norm": 16.501752853393555, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8681231141090393, "num_tokens": 335168065.0, "step": 8788 }, { "epoch": 1.1180511385319933, "ewc_loss": 0.026614634320139885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6614634407451376e-05, "grad_norm": 16.500457763671875, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8646479249000549, "num_tokens": 335218876.0, "step": 8789 }, { "epoch": 1.1181783488105839, "ewc_loss": 0.026567889377474785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6567890017759055e-05, "grad_norm": 16.458541870117188, "learning_rate": 1e-06, "loss": 0.38, "mean_token_accuracy": 0.8760637044906616, "num_tokens": 335256300.0, "step": 8790 }, { "epoch": 1.1183055590891744, "ewc_loss": 0.026531411334872246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6531412004260346e-05, "grad_norm": 16.563331604003906, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8762431740760803, "num_tokens": 335291035.0, "step": 8791 }, { "epoch": 1.118432769367765, "ewc_loss": 0.026643851771950722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.664385101525113e-05, "grad_norm": 16.516983032226562, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8786352872848511, "num_tokens": 335330092.0, "step": 8792 }, { "epoch": 1.1185599796463555, "ewc_loss": 0.026562845334410667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6562845960143022e-05, "grad_norm": 16.54102325439453, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8531275987625122, "num_tokens": 335367146.0, "step": 8793 }, { "epoch": 1.118687189924946, "ewc_loss": 0.02658623829483986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6586238163872622e-05, "grad_norm": 16.54254913330078, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.866793155670166, "num_tokens": 335405069.0, "step": 8794 }, { "epoch": 1.1188144002035365, "ewc_loss": 0.02652524970471859, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6525249268161133e-05, "grad_norm": 16.475446701049805, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8565797805786133, "num_tokens": 335443779.0, "step": 8795 }, { "epoch": 1.118941610482127, "ewc_loss": 0.0265627633780241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6562764105619863e-05, "grad_norm": 16.46621322631836, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8592301607131958, "num_tokens": 335485978.0, "step": 8796 }, { "epoch": 1.1190688207607176, "ewc_loss": 0.026538550853729248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6538551537669264e-05, "grad_norm": 16.52014923095703, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.870734453201294, "num_tokens": 335530375.0, "step": 8797 }, { "epoch": 1.119196031039308, "ewc_loss": 0.02653099223971367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6530991817708127e-05, "grad_norm": 16.462865829467773, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8720793724060059, "num_tokens": 335569109.0, "step": 8798 }, { "epoch": 1.1193232413178984, "ewc_loss": 0.026478061452507973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6478061045054346e-05, "grad_norm": 16.567771911621094, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8531184792518616, "num_tokens": 335599283.0, "step": 8799 }, { "epoch": 1.119450451596489, "ewc_loss": 0.026539959013462067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6539959435467608e-05, "grad_norm": 16.524173736572266, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8735731244087219, "num_tokens": 335634792.0, "step": 8800 }, { "epoch": 1.1195776618750795, "ewc_loss": 0.026503875851631165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.650387614266947e-05, "grad_norm": 16.57643699645996, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8798604607582092, "num_tokens": 335672869.0, "step": 8801 }, { "epoch": 1.11970487215367, "ewc_loss": 0.026519257575273514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6519257517065853e-05, "grad_norm": 16.50716781616211, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8715897798538208, "num_tokens": 335714996.0, "step": 8802 }, { "epoch": 1.1198320824322605, "ewc_loss": 0.026483479887247086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.648347981448751e-05, "grad_norm": 16.607316970825195, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8589993715286255, "num_tokens": 335756884.0, "step": 8803 }, { "epoch": 1.119959292710851, "ewc_loss": 0.02650907076895237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6509071176405996e-05, "grad_norm": 16.510705947875977, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8767313957214355, "num_tokens": 335793935.0, "step": 8804 }, { "epoch": 1.1200865029894416, "ewc_loss": 0.02650124952197075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.650124952197075e-05, "grad_norm": 16.524486541748047, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8781906366348267, "num_tokens": 335834169.0, "step": 8805 }, { "epoch": 1.120213713268032, "ewc_loss": 0.026521246880292892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6521247491473332e-05, "grad_norm": 16.55715560913086, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.8845875859260559, "num_tokens": 335872302.0, "step": 8806 }, { "epoch": 1.1203409235466226, "ewc_loss": 0.02651757001876831, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6517569494899362e-05, "grad_norm": 16.52987289428711, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8667405843734741, "num_tokens": 335908074.0, "step": 8807 }, { "epoch": 1.1204681338252132, "ewc_loss": 0.026519641280174255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.651964132383e-05, "grad_norm": 16.52600860595703, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8775904774665833, "num_tokens": 335946268.0, "step": 8808 }, { "epoch": 1.1205953441038037, "ewc_loss": 0.026484817266464233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6484816771699116e-05, "grad_norm": 16.535980224609375, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8684865236282349, "num_tokens": 335985364.0, "step": 8809 }, { "epoch": 1.120722554382394, "ewc_loss": 0.026497235521674156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6497235012357123e-05, "grad_norm": 16.504093170166016, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8703484535217285, "num_tokens": 336025300.0, "step": 8810 }, { "epoch": 1.1208497646609845, "ewc_loss": 0.02649260312318802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6492603865335695e-05, "grad_norm": 16.509431838989258, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8711010813713074, "num_tokens": 336062105.0, "step": 8811 }, { "epoch": 1.120976974939575, "ewc_loss": 0.026495210826396942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6495210477150977e-05, "grad_norm": 16.569623947143555, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8557707071304321, "num_tokens": 336102944.0, "step": 8812 }, { "epoch": 1.1211041852181656, "ewc_loss": 0.026502756401896477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6502755645196885e-05, "grad_norm": 16.53723907470703, "learning_rate": 1e-06, "loss": 0.3325, "mean_token_accuracy": 0.8923503160476685, "num_tokens": 336142964.0, "step": 8813 }, { "epoch": 1.121231395496756, "ewc_loss": 0.026432814076542854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6432813683641143e-05, "grad_norm": 16.47134780883789, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8696225881576538, "num_tokens": 336186962.0, "step": 8814 }, { "epoch": 1.1213586057753466, "ewc_loss": 0.02642033062875271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6420329959364608e-05, "grad_norm": 16.468067169189453, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8765890598297119, "num_tokens": 336220775.0, "step": 8815 }, { "epoch": 1.1214858160539372, "ewc_loss": 0.02646605484187603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6466055714990944e-05, "grad_norm": 16.53782081604004, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8634154796600342, "num_tokens": 336263067.0, "step": 8816 }, { "epoch": 1.1216130263325277, "ewc_loss": 0.026495620608329773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6495619749766774e-05, "grad_norm": 16.566198348999023, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8711287975311279, "num_tokens": 336308463.0, "step": 8817 }, { "epoch": 1.1217402366111182, "ewc_loss": 0.026395922526717186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.639592275954783e-05, "grad_norm": 16.414779663085938, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8803520798683167, "num_tokens": 336346965.0, "step": 8818 }, { "epoch": 1.1218674468897087, "ewc_loss": 0.026431461796164513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.643146217451431e-05, "grad_norm": 16.538341522216797, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8684393167495728, "num_tokens": 336382114.0, "step": 8819 }, { "epoch": 1.1219946571682993, "ewc_loss": 0.026442790403962135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6442790840519592e-05, "grad_norm": 16.516136169433594, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8710760474205017, "num_tokens": 336414020.0, "step": 8820 }, { "epoch": 1.1221218674468898, "ewc_loss": 0.02640584670007229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6405847165733576e-05, "grad_norm": 16.527828216552734, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.865424633026123, "num_tokens": 336452522.0, "step": 8821 }, { "epoch": 1.1222490777254803, "ewc_loss": 0.026449913159012794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6449914003023878e-05, "grad_norm": 16.512248992919922, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8775544166564941, "num_tokens": 336484593.0, "step": 8822 }, { "epoch": 1.1223762880040706, "ewc_loss": 0.02645176835358143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.645176755322609e-05, "grad_norm": 16.583206176757812, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8514515161514282, "num_tokens": 336518964.0, "step": 8823 }, { "epoch": 1.1225034982826612, "ewc_loss": 0.0264778770506382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6477877327124588e-05, "grad_norm": 16.530498504638672, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8550268411636353, "num_tokens": 336556320.0, "step": 8824 }, { "epoch": 1.1226307085612517, "ewc_loss": 0.026468425989151, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6468425858183764e-05, "grad_norm": 16.568836212158203, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8576003313064575, "num_tokens": 336595473.0, "step": 8825 }, { "epoch": 1.1227579188398422, "ewc_loss": 0.026468072086572647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6468071155250072e-05, "grad_norm": 16.513364791870117, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8668506145477295, "num_tokens": 336631793.0, "step": 8826 }, { "epoch": 1.1228851291184327, "ewc_loss": 0.026471301913261414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.647130168043077e-05, "grad_norm": 16.482168197631836, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8530462384223938, "num_tokens": 336671230.0, "step": 8827 }, { "epoch": 1.1230123393970233, "ewc_loss": 0.026481542736291885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6481542590772733e-05, "grad_norm": 16.52274513244629, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8690390586853027, "num_tokens": 336712015.0, "step": 8828 }, { "epoch": 1.1231395496756138, "ewc_loss": 0.02651272527873516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.651272552611772e-05, "grad_norm": 16.598596572875977, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8506187200546265, "num_tokens": 336741206.0, "step": 8829 }, { "epoch": 1.1232667599542043, "ewc_loss": 0.02653978392481804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6539782993495464e-05, "grad_norm": 16.501646041870117, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8599759936332703, "num_tokens": 336778838.0, "step": 8830 }, { "epoch": 1.1233939702327949, "ewc_loss": 0.026526570320129395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6526569854468107e-05, "grad_norm": 16.53980255126953, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8726552724838257, "num_tokens": 336815471.0, "step": 8831 }, { "epoch": 1.1235211805113854, "ewc_loss": 0.026524480432271957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6524479835643433e-05, "grad_norm": 16.523086547851562, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.869621753692627, "num_tokens": 336850870.0, "step": 8832 }, { "epoch": 1.123648390789976, "ewc_loss": 0.026534339413046837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.653433875821065e-05, "grad_norm": 16.490234375, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8717814683914185, "num_tokens": 336891779.0, "step": 8833 }, { "epoch": 1.1237756010685664, "ewc_loss": 0.026595165953040123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6595165763865225e-05, "grad_norm": 16.558815002441406, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.8780513405799866, "num_tokens": 336931171.0, "step": 8834 }, { "epoch": 1.1239028113471567, "ewc_loss": 0.026564447209239006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6564446670818143e-05, "grad_norm": 16.580703735351562, "learning_rate": 1e-06, "loss": 0.3756, "mean_token_accuracy": 0.8775823712348938, "num_tokens": 336963080.0, "step": 8835 }, { "epoch": 1.1240300216257473, "ewc_loss": 0.02661002241075039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6610023269313388e-05, "grad_norm": 16.563650131225586, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8533375263214111, "num_tokens": 337000817.0, "step": 8836 }, { "epoch": 1.1241572319043378, "ewc_loss": 0.0265496876090765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.654968739079777e-05, "grad_norm": 16.559297561645508, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8620508313179016, "num_tokens": 337037249.0, "step": 8837 }, { "epoch": 1.1242844421829283, "ewc_loss": 0.026590412482619286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.659041274455376e-05, "grad_norm": 16.441293716430664, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8753831386566162, "num_tokens": 337077752.0, "step": 8838 }, { "epoch": 1.1244116524615189, "ewc_loss": 0.026568152010440826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6568151952233166e-05, "grad_norm": 16.519763946533203, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8643979430198669, "num_tokens": 337119649.0, "step": 8839 }, { "epoch": 1.1245388627401094, "ewc_loss": 0.026610558852553368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.661055805219803e-05, "grad_norm": 16.595558166503906, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8511487245559692, "num_tokens": 337154376.0, "step": 8840 }, { "epoch": 1.1246660730187, "ewc_loss": 0.026577526703476906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.657752702361904e-05, "grad_norm": 16.49030303955078, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8761869668960571, "num_tokens": 337193377.0, "step": 8841 }, { "epoch": 1.1247932832972904, "ewc_loss": 0.026586750522255898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6586751118884422e-05, "grad_norm": 16.53923797607422, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8580965995788574, "num_tokens": 337233176.0, "step": 8842 }, { "epoch": 1.124920493575881, "ewc_loss": 0.02660929597914219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.660929567355197e-05, "grad_norm": 16.58323860168457, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8528861999511719, "num_tokens": 337262995.0, "step": 8843 }, { "epoch": 1.1250477038544715, "ewc_loss": 0.02654353156685829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6543531930656172e-05, "grad_norm": 16.472801208496094, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.866013765335083, "num_tokens": 337302466.0, "step": 8844 }, { "epoch": 1.125174914133062, "ewc_loss": 0.026545017957687378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.654501804499887e-05, "grad_norm": 16.48332977294922, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8785480260848999, "num_tokens": 337341197.0, "step": 8845 }, { "epoch": 1.1253021244116526, "ewc_loss": 0.026608722284436226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6608722691889852e-05, "grad_norm": 16.542537689208984, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8682681322097778, "num_tokens": 337375602.0, "step": 8846 }, { "epoch": 1.125429334690243, "ewc_loss": 0.026571355760097504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.657135519257281e-05, "grad_norm": 16.551252365112305, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8572777509689331, "num_tokens": 337406685.0, "step": 8847 }, { "epoch": 1.1255565449688334, "ewc_loss": 0.026578059419989586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.657805998751428e-05, "grad_norm": 16.55246925354004, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8619423508644104, "num_tokens": 337447364.0, "step": 8848 }, { "epoch": 1.125683755247424, "ewc_loss": 0.026635929942131042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6635929316398688e-05, "grad_norm": 16.549354553222656, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8663913607597351, "num_tokens": 337487840.0, "step": 8849 }, { "epoch": 1.1258109655260145, "ewc_loss": 0.026635488495230675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.663548912096303e-05, "grad_norm": 16.56963539123535, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8705443143844604, "num_tokens": 337525117.0, "step": 8850 }, { "epoch": 1.125938175804605, "ewc_loss": 0.02659497782588005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.659497840795666e-05, "grad_norm": 16.536556243896484, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8601168394088745, "num_tokens": 337567024.0, "step": 8851 }, { "epoch": 1.1260653860831955, "ewc_loss": 0.026579128578305244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.657912773429416e-05, "grad_norm": 16.556671142578125, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.864926278591156, "num_tokens": 337600117.0, "step": 8852 }, { "epoch": 1.126192596361786, "ewc_loss": 0.026607826352119446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6607825930113904e-05, "grad_norm": 16.559648513793945, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8525110483169556, "num_tokens": 337637188.0, "step": 8853 }, { "epoch": 1.1263198066403766, "ewc_loss": 0.026583252474665642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6583253202261403e-05, "grad_norm": 16.571516036987305, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8622031211853027, "num_tokens": 337676492.0, "step": 8854 }, { "epoch": 1.126447016918967, "ewc_loss": 0.026621298864483833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.662129918462597e-05, "grad_norm": 16.552501678466797, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8801142573356628, "num_tokens": 337713468.0, "step": 8855 }, { "epoch": 1.1265742271975576, "ewc_loss": 0.026530753821134567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6530753530096263e-05, "grad_norm": 16.562904357910156, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8535488843917847, "num_tokens": 337750944.0, "step": 8856 }, { "epoch": 1.1267014374761481, "ewc_loss": 0.026595041155815125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.659504025359638e-05, "grad_norm": 16.466156005859375, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8812768459320068, "num_tokens": 337793802.0, "step": 8857 }, { "epoch": 1.1268286477547387, "ewc_loss": 0.026531551033258438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6531550247455016e-05, "grad_norm": 16.56502342224121, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8792654871940613, "num_tokens": 337828550.0, "step": 8858 }, { "epoch": 1.126955858033329, "ewc_loss": 0.02660497836768627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6604979211697355e-05, "grad_norm": 16.498411178588867, "learning_rate": 1e-06, "loss": 0.3782, "mean_token_accuracy": 0.8762543201446533, "num_tokens": 337862168.0, "step": 8859 }, { "epoch": 1.1270830683119195, "ewc_loss": 0.026526648551225662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.652664807101246e-05, "grad_norm": 16.55937385559082, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8626539707183838, "num_tokens": 337898837.0, "step": 8860 }, { "epoch": 1.12721027859051, "ewc_loss": 0.026634465903043747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6634465029928833e-05, "grad_norm": 16.545578002929688, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.853813648223877, "num_tokens": 337939608.0, "step": 8861 }, { "epoch": 1.1273374888691006, "ewc_loss": 0.026539022102952003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6539022655924782e-05, "grad_norm": 16.50202751159668, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.866182267665863, "num_tokens": 337983412.0, "step": 8862 }, { "epoch": 1.127464699147691, "ewc_loss": 0.026596976444125175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6596975658321753e-05, "grad_norm": 16.65032196044922, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8760230541229248, "num_tokens": 338022171.0, "step": 8863 }, { "epoch": 1.1275919094262816, "ewc_loss": 0.02656533569097519, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6565336156636477e-05, "grad_norm": 16.50832748413086, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8640106916427612, "num_tokens": 338058838.0, "step": 8864 }, { "epoch": 1.1277191197048722, "ewc_loss": 0.026481730863451958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.64817299466813e-05, "grad_norm": 16.524606704711914, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8632887601852417, "num_tokens": 338095080.0, "step": 8865 }, { "epoch": 1.1278463299834627, "ewc_loss": 0.02661862224340439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.661862163222395e-05, "grad_norm": 16.602943420410156, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8485758304595947, "num_tokens": 338134206.0, "step": 8866 }, { "epoch": 1.1279735402620532, "ewc_loss": 0.0265515074133873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.655150819919072e-05, "grad_norm": 16.4848575592041, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8729658126831055, "num_tokens": 338173794.0, "step": 8867 }, { "epoch": 1.1281007505406437, "ewc_loss": 0.026484817266464233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6484816771699116e-05, "grad_norm": 16.586240768432617, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8641453981399536, "num_tokens": 338206591.0, "step": 8868 }, { "epoch": 1.1282279608192343, "ewc_loss": 0.02660064771771431, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6600648197927512e-05, "grad_norm": 16.5637264251709, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8793603777885437, "num_tokens": 338238728.0, "step": 8869 }, { "epoch": 1.1283551710978248, "ewc_loss": 0.026537472382187843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.653747287695296e-05, "grad_norm": 16.49307632446289, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8755749464035034, "num_tokens": 338282372.0, "step": 8870 }, { "epoch": 1.1284823813764153, "ewc_loss": 0.02651912160217762, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6519121092860587e-05, "grad_norm": 16.453277587890625, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8618024587631226, "num_tokens": 338319175.0, "step": 8871 }, { "epoch": 1.1286095916550056, "ewc_loss": 0.02653728611767292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6537285521044396e-05, "grad_norm": 16.533849716186523, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8637380599975586, "num_tokens": 338357967.0, "step": 8872 }, { "epoch": 1.1287368019335962, "ewc_loss": 0.02656516619026661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6565166990621947e-05, "grad_norm": 16.530651092529297, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8734062910079956, "num_tokens": 338391431.0, "step": 8873 }, { "epoch": 1.1288640122121867, "ewc_loss": 0.026613835245370865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.661383587110322e-05, "grad_norm": 16.551504135131836, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8746955394744873, "num_tokens": 338426298.0, "step": 8874 }, { "epoch": 1.1289912224907772, "ewc_loss": 0.02658914215862751, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.658914127096068e-05, "grad_norm": 16.485210418701172, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8489751815795898, "num_tokens": 338468147.0, "step": 8875 }, { "epoch": 1.1291184327693677, "ewc_loss": 0.02657378651201725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.657378718140535e-05, "grad_norm": 16.538610458374023, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8504292964935303, "num_tokens": 338517095.0, "step": 8876 }, { "epoch": 1.1292456430479583, "ewc_loss": 0.026574090123176575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6574090952635743e-05, "grad_norm": 16.466981887817383, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8743886947631836, "num_tokens": 338549925.0, "step": 8877 }, { "epoch": 1.1293728533265488, "ewc_loss": 0.02654365263879299, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6543651983956806e-05, "grad_norm": 16.49790382385254, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8692882061004639, "num_tokens": 338583966.0, "step": 8878 }, { "epoch": 1.1295000636051393, "ewc_loss": 0.026637181639671326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6637180781108327e-05, "grad_norm": 16.496957778930664, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8713359832763672, "num_tokens": 338622715.0, "step": 8879 }, { "epoch": 1.1296272738837299, "ewc_loss": 0.02661643736064434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.661643702595029e-05, "grad_norm": 16.506189346313477, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.854559063911438, "num_tokens": 338659920.0, "step": 8880 }, { "epoch": 1.1297544841623204, "ewc_loss": 0.02661215513944626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6612155124894343e-05, "grad_norm": 16.58366584777832, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8614157438278198, "num_tokens": 338698183.0, "step": 8881 }, { "epoch": 1.129881694440911, "ewc_loss": 0.02663450688123703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6634506866685115e-05, "grad_norm": 16.51151466369629, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8558407425880432, "num_tokens": 338735853.0, "step": 8882 }, { "epoch": 1.1300089047195012, "ewc_loss": 0.026572858914732933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6572859496809542e-05, "grad_norm": 16.485759735107422, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.856695294380188, "num_tokens": 338771714.0, "step": 8883 }, { "epoch": 1.1301361149980917, "ewc_loss": 0.026622094213962555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6622094082995318e-05, "grad_norm": 16.48241424560547, "learning_rate": 1e-06, "loss": 0.3599, "mean_token_accuracy": 0.8833327293395996, "num_tokens": 338813823.0, "step": 8884 }, { "epoch": 1.1302633252766823, "ewc_loss": 0.026680346578359604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6680347218643874e-05, "grad_norm": 16.59575080871582, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8701779842376709, "num_tokens": 338858137.0, "step": 8885 }, { "epoch": 1.1303905355552728, "ewc_loss": 0.026658808812499046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6658808565116487e-05, "grad_norm": 16.564870834350586, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8568586111068726, "num_tokens": 338893653.0, "step": 8886 }, { "epoch": 1.1305177458338633, "ewc_loss": 0.026603516191244125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6603516744216904e-05, "grad_norm": 16.531911849975586, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8478801250457764, "num_tokens": 338931230.0, "step": 8887 }, { "epoch": 1.1306449561124539, "ewc_loss": 0.026612499728798866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6612498913891613e-05, "grad_norm": 16.571483612060547, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8652238845825195, "num_tokens": 338966921.0, "step": 8888 }, { "epoch": 1.1307721663910444, "ewc_loss": 0.026631897315382957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6631896616891026e-05, "grad_norm": 16.637062072753906, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8469407558441162, "num_tokens": 339000957.0, "step": 8889 }, { "epoch": 1.130899376669635, "ewc_loss": 0.02664705365896225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6647054255590774e-05, "grad_norm": 16.649412155151367, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8596749305725098, "num_tokens": 339035876.0, "step": 8890 }, { "epoch": 1.1310265869482254, "ewc_loss": 0.026565076783299446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.656507604115177e-05, "grad_norm": 16.496356964111328, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8680945038795471, "num_tokens": 339073700.0, "step": 8891 }, { "epoch": 1.131153797226816, "ewc_loss": 0.0266286488622427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6628649720805697e-05, "grad_norm": 16.520095825195312, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8713663816452026, "num_tokens": 339108222.0, "step": 8892 }, { "epoch": 1.1312810075054065, "ewc_loss": 0.026661863550543785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.666186264832504e-05, "grad_norm": 16.56113624572754, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.8773806095123291, "num_tokens": 339145749.0, "step": 8893 }, { "epoch": 1.131408217783997, "ewc_loss": 0.026667136698961258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.666713589860592e-05, "grad_norm": 16.546905517578125, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8525554537773132, "num_tokens": 339183852.0, "step": 8894 }, { "epoch": 1.1315354280625876, "ewc_loss": 0.02663564682006836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6635647373041138e-05, "grad_norm": 16.57779884338379, "learning_rate": 1e-06, "loss": 0.3655, "mean_token_accuracy": 0.8808705806732178, "num_tokens": 339218766.0, "step": 8895 }, { "epoch": 1.131662638341178, "ewc_loss": 0.026676712557673454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.667671287781559e-05, "grad_norm": 16.508893966674805, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8664878606796265, "num_tokens": 339259932.0, "step": 8896 }, { "epoch": 1.1317898486197684, "ewc_loss": 0.02662932127714157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6629320927895606e-05, "grad_norm": 16.509862899780273, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8697865009307861, "num_tokens": 339302201.0, "step": 8897 }, { "epoch": 1.131917058898359, "ewc_loss": 0.026639094576239586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6639094357960857e-05, "grad_norm": 16.551860809326172, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.8370247483253479, "num_tokens": 339342091.0, "step": 8898 }, { "epoch": 1.1320442691769494, "ewc_loss": 0.026669854298233986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6669853468774818e-05, "grad_norm": 16.620697021484375, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8617432713508606, "num_tokens": 339379951.0, "step": 8899 }, { "epoch": 1.13217147945554, "ewc_loss": 0.026657354086637497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.665735337359365e-05, "grad_norm": 16.53463363647461, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8619153499603271, "num_tokens": 339422243.0, "step": 8900 }, { "epoch": 1.1322986897341305, "ewc_loss": 0.02661859802901745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6618597985361703e-05, "grad_norm": 16.54440689086914, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8812520503997803, "num_tokens": 339464969.0, "step": 8901 }, { "epoch": 1.132425900012721, "ewc_loss": 0.026640981435775757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6640980649972335e-05, "grad_norm": 16.495594024658203, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8699356317520142, "num_tokens": 339502382.0, "step": 8902 }, { "epoch": 1.1325531102913116, "ewc_loss": 0.026639698073267937, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6639698262442835e-05, "grad_norm": 16.555835723876953, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8513349890708923, "num_tokens": 339542875.0, "step": 8903 }, { "epoch": 1.132680320569902, "ewc_loss": 0.026698732748627663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6698733563534915e-05, "grad_norm": 16.615938186645508, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8526017069816589, "num_tokens": 339580776.0, "step": 8904 }, { "epoch": 1.1328075308484926, "ewc_loss": 0.02662748284637928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.662748192960862e-05, "grad_norm": 16.52146339416504, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.8793754577636719, "num_tokens": 339620195.0, "step": 8905 }, { "epoch": 1.1329347411270831, "ewc_loss": 0.026591043919324875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.659104393387679e-05, "grad_norm": 16.59756851196289, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8737980127334595, "num_tokens": 339659234.0, "step": 8906 }, { "epoch": 1.1330619514056737, "ewc_loss": 0.026651522144675255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6651521693565883e-05, "grad_norm": 16.509262084960938, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8567367792129517, "num_tokens": 339703259.0, "step": 8907 }, { "epoch": 1.133189161684264, "ewc_loss": 0.026563521474599838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6563520805211738e-05, "grad_norm": 16.582855224609375, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8725789189338684, "num_tokens": 339745901.0, "step": 8908 }, { "epoch": 1.1333163719628545, "ewc_loss": 0.026642387732863426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.664238854777068e-05, "grad_norm": 16.493675231933594, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8569425344467163, "num_tokens": 339790417.0, "step": 8909 }, { "epoch": 1.133443582241445, "ewc_loss": 0.026507632806897163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.650763235578779e-05, "grad_norm": 16.597299575805664, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.857910692691803, "num_tokens": 339827224.0, "step": 8910 }, { "epoch": 1.1335707925200356, "ewc_loss": 0.026647143065929413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6647143386071548e-05, "grad_norm": 16.531436920166016, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8603920936584473, "num_tokens": 339859932.0, "step": 8911 }, { "epoch": 1.133698002798626, "ewc_loss": 0.02653789520263672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6537894882494584e-05, "grad_norm": 16.582473754882812, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8541114926338196, "num_tokens": 339902416.0, "step": 8912 }, { "epoch": 1.1338252130772166, "ewc_loss": 0.02660319022834301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6603189326124266e-05, "grad_norm": 16.490158081054688, "learning_rate": 1e-06, "loss": 0.3883, "mean_token_accuracy": 0.8743141293525696, "num_tokens": 339940495.0, "step": 8913 }, { "epoch": 1.1339524233558071, "ewc_loss": 0.026554064825177193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6554065698292106e-05, "grad_norm": 16.55413818359375, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8484645485877991, "num_tokens": 339983010.0, "step": 8914 }, { "epoch": 1.1340796336343977, "ewc_loss": 0.026618510484695435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6618510673870333e-05, "grad_norm": 16.57439613342285, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8666243553161621, "num_tokens": 340020683.0, "step": 8915 }, { "epoch": 1.1342068439129882, "ewc_loss": 0.02659417875111103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.65941780526191e-05, "grad_norm": 16.5512752532959, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8575471639633179, "num_tokens": 340063179.0, "step": 8916 }, { "epoch": 1.1343340541915787, "ewc_loss": 0.026540838181972504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.654083800734952e-05, "grad_norm": 16.48309898376465, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8754727840423584, "num_tokens": 340100166.0, "step": 8917 }, { "epoch": 1.1344612644701693, "ewc_loss": 0.02661345712840557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6613457521307282e-05, "grad_norm": 16.614023208618164, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8624132871627808, "num_tokens": 340144325.0, "step": 8918 }, { "epoch": 1.1345884747487598, "ewc_loss": 0.026619888842105865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.661988946783822e-05, "grad_norm": 16.515966415405273, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.866003155708313, "num_tokens": 340180563.0, "step": 8919 }, { "epoch": 1.1347156850273503, "ewc_loss": 0.026552628725767136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6552628696663305e-05, "grad_norm": 16.52838706970215, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8532941341400146, "num_tokens": 340224621.0, "step": 8920 }, { "epoch": 1.1348428953059406, "ewc_loss": 0.02661893144249916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.661893086042255e-05, "grad_norm": 16.569517135620117, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8709545731544495, "num_tokens": 340264810.0, "step": 8921 }, { "epoch": 1.1349701055845312, "ewc_loss": 0.026586119085550308, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.658611992956139e-05, "grad_norm": 16.529991149902344, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8788692355155945, "num_tokens": 340301158.0, "step": 8922 }, { "epoch": 1.1350973158631217, "ewc_loss": 0.02660963498055935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6609635824570432e-05, "grad_norm": 16.583276748657227, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.866332471370697, "num_tokens": 340339806.0, "step": 8923 }, { "epoch": 1.1352245261417122, "ewc_loss": 0.02660098299384117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6600982891977765e-05, "grad_norm": 16.491024017333984, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8717536330223083, "num_tokens": 340386412.0, "step": 8924 }, { "epoch": 1.1353517364203027, "ewc_loss": 0.02660059928894043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6600599085213616e-05, "grad_norm": 16.500276565551758, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8750492334365845, "num_tokens": 340431391.0, "step": 8925 }, { "epoch": 1.1354789466988933, "ewc_loss": 0.026666123420000076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6666122721508145e-05, "grad_norm": 16.584653854370117, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8691360950469971, "num_tokens": 340465904.0, "step": 8926 }, { "epoch": 1.1356061569774838, "ewc_loss": 0.026635082438588142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6635081667336635e-05, "grad_norm": 16.568849563598633, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8697346448898315, "num_tokens": 340504290.0, "step": 8927 }, { "epoch": 1.1357333672560743, "ewc_loss": 0.026612907648086548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.661290818650741e-05, "grad_norm": 16.503952026367188, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8607144951820374, "num_tokens": 340539100.0, "step": 8928 }, { "epoch": 1.1358605775346649, "ewc_loss": 0.026628844439983368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6628844352671877e-05, "grad_norm": 16.6328125, "learning_rate": 1e-06, "loss": 0.3587, "mean_token_accuracy": 0.8805185556411743, "num_tokens": 340569005.0, "step": 8929 }, { "epoch": 1.1359877878132554, "ewc_loss": 0.0266735702753067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6673569664126262e-05, "grad_norm": 16.564477920532227, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8651518821716309, "num_tokens": 340613476.0, "step": 8930 }, { "epoch": 1.136114998091846, "ewc_loss": 0.026667458936572075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.666745967871975e-05, "grad_norm": 16.676898956298828, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8586204051971436, "num_tokens": 340647982.0, "step": 8931 }, { "epoch": 1.1362422083704362, "ewc_loss": 0.02668273076415062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.668273009476252e-05, "grad_norm": 16.5477352142334, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.844146728515625, "num_tokens": 340683270.0, "step": 8932 }, { "epoch": 1.1363694186490267, "ewc_loss": 0.02660254016518593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.66025399469072e-05, "grad_norm": 16.606473922729492, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.8696997165679932, "num_tokens": 340719955.0, "step": 8933 }, { "epoch": 1.1364966289276173, "ewc_loss": 0.02672025002539158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.672025038918946e-05, "grad_norm": 16.538896560668945, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8667784929275513, "num_tokens": 340753934.0, "step": 8934 }, { "epoch": 1.1366238392062078, "ewc_loss": 0.026649711653590202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6649711799109355e-05, "grad_norm": 16.629230499267578, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8721895217895508, "num_tokens": 340787545.0, "step": 8935 }, { "epoch": 1.1367510494847983, "ewc_loss": 0.026748834177851677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.674883398867678e-05, "grad_norm": 16.585338592529297, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8760207891464233, "num_tokens": 340826362.0, "step": 8936 }, { "epoch": 1.1368782597633889, "ewc_loss": 0.026687009260058403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6687010176829062e-05, "grad_norm": 16.629674911499023, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8706982731819153, "num_tokens": 340857871.0, "step": 8937 }, { "epoch": 1.1370054700419794, "ewc_loss": 0.02672235481441021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6722354959929362e-05, "grad_norm": 16.5268497467041, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8720833659172058, "num_tokens": 340897736.0, "step": 8938 }, { "epoch": 1.13713268032057, "ewc_loss": 0.026695972308516502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6695972337620333e-05, "grad_norm": 16.564605712890625, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.861945629119873, "num_tokens": 340935774.0, "step": 8939 }, { "epoch": 1.1372598905991604, "ewc_loss": 0.026751376688480377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6751376935862936e-05, "grad_norm": 16.496976852416992, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8700284361839294, "num_tokens": 340971497.0, "step": 8940 }, { "epoch": 1.137387100877751, "ewc_loss": 0.026759309694170952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.67593095486518e-05, "grad_norm": 16.592002868652344, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8673862218856812, "num_tokens": 341016209.0, "step": 8941 }, { "epoch": 1.1375143111563415, "ewc_loss": 0.02673972211778164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6739722670754418e-05, "grad_norm": 16.551528930664062, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8615756034851074, "num_tokens": 341052137.0, "step": 8942 }, { "epoch": 1.137641521434932, "ewc_loss": 0.026702191680669785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6702191462391056e-05, "grad_norm": 16.59730339050293, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8687769174575806, "num_tokens": 341083417.0, "step": 8943 }, { "epoch": 1.1377687317135226, "ewc_loss": 0.02675759792327881, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6757597879623063e-05, "grad_norm": 16.51028823852539, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.869107723236084, "num_tokens": 341127798.0, "step": 8944 }, { "epoch": 1.137895941992113, "ewc_loss": 0.026689356192946434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6689356673159637e-05, "grad_norm": 16.499086380004883, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8527719974517822, "num_tokens": 341173485.0, "step": 8945 }, { "epoch": 1.1380231522707034, "ewc_loss": 0.026742754504084587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.674275492609013e-05, "grad_norm": 16.554624557495117, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8551223874092102, "num_tokens": 341212834.0, "step": 8946 }, { "epoch": 1.138150362549294, "ewc_loss": 0.026758382096886635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.675838186405599e-05, "grad_norm": 16.626605987548828, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8597902059555054, "num_tokens": 341246497.0, "step": 8947 }, { "epoch": 1.1382775728278844, "ewc_loss": 0.026774153113365173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6774152502184734e-05, "grad_norm": 16.52616310119629, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8504475951194763, "num_tokens": 341291498.0, "step": 8948 }, { "epoch": 1.138404783106475, "ewc_loss": 0.0267321839928627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6732184778666124e-05, "grad_norm": 16.635251998901367, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8601830005645752, "num_tokens": 341328104.0, "step": 8949 }, { "epoch": 1.1385319933850655, "ewc_loss": 0.026787538081407547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6787538445205428e-05, "grad_norm": 16.529233932495117, "learning_rate": 1e-06, "loss": 0.3676, "mean_token_accuracy": 0.878128170967102, "num_tokens": 341359042.0, "step": 8950 }, { "epoch": 1.138659203663656, "ewc_loss": 0.026685457676649094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6685458578867838e-05, "grad_norm": 16.58696937561035, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8665268421173096, "num_tokens": 341403529.0, "step": 8951 }, { "epoch": 1.1387864139422466, "ewc_loss": 0.026755185797810555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.675518589967396e-05, "grad_norm": 16.523239135742188, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8734945058822632, "num_tokens": 341442509.0, "step": 8952 }, { "epoch": 1.138913624220837, "ewc_loss": 0.026674631983041763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6674631953937933e-05, "grad_norm": 16.549152374267578, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.864284873008728, "num_tokens": 341483015.0, "step": 8953 }, { "epoch": 1.1390408344994276, "ewc_loss": 0.02676469087600708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6764690119307488e-05, "grad_norm": 16.532052993774414, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.867034375667572, "num_tokens": 341519676.0, "step": 8954 }, { "epoch": 1.1391680447780181, "ewc_loss": 0.026692558079957962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6692558094509877e-05, "grad_norm": 16.575502395629883, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8664611577987671, "num_tokens": 341563483.0, "step": 8955 }, { "epoch": 1.1392952550566087, "ewc_loss": 0.026753589510917664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6753588826977648e-05, "grad_norm": 16.63100242614746, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8591086864471436, "num_tokens": 341598387.0, "step": 8956 }, { "epoch": 1.139422465335199, "ewc_loss": 0.026661589741706848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.666158979991451e-05, "grad_norm": 16.559494018554688, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8690128326416016, "num_tokens": 341640253.0, "step": 8957 }, { "epoch": 1.1395496756137895, "ewc_loss": 0.02670060656964779, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6700607122620568e-05, "grad_norm": 16.593843460083008, "learning_rate": 1e-06, "loss": 0.3371, "mean_token_accuracy": 0.892547070980072, "num_tokens": 341675788.0, "step": 8958 }, { "epoch": 1.13967688589238, "ewc_loss": 0.026656508445739746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6656507543521002e-05, "grad_norm": 16.599559783935547, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8662241697311401, "num_tokens": 341716917.0, "step": 8959 }, { "epoch": 1.1398040961709706, "ewc_loss": 0.026641739532351494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6641739168553613e-05, "grad_norm": 16.515254974365234, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8654841184616089, "num_tokens": 341758936.0, "step": 8960 }, { "epoch": 1.139931306449561, "ewc_loss": 0.026645811274647713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6645811885828152e-05, "grad_norm": 16.634553909301758, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.842619776725769, "num_tokens": 341797612.0, "step": 8961 }, { "epoch": 1.1400585167281516, "ewc_loss": 0.026686621829867363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6686620913096704e-05, "grad_norm": 16.530057907104492, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8619282245635986, "num_tokens": 341836061.0, "step": 8962 }, { "epoch": 1.1401857270067421, "ewc_loss": 0.026636911556124687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6636911570676602e-05, "grad_norm": 16.562297821044922, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8652287125587463, "num_tokens": 341875465.0, "step": 8963 }, { "epoch": 1.1403129372853327, "ewc_loss": 0.026684515178203583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6684514523367397e-05, "grad_norm": 16.5636043548584, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8610594868659973, "num_tokens": 341919795.0, "step": 8964 }, { "epoch": 1.1404401475639232, "ewc_loss": 0.026656195521354675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6656194677343592e-05, "grad_norm": 16.57442855834961, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8621573448181152, "num_tokens": 341953291.0, "step": 8965 }, { "epoch": 1.1405673578425137, "ewc_loss": 0.026684723794460297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6684723707148805e-05, "grad_norm": 16.511415481567383, "learning_rate": 1e-06, "loss": 0.3629, "mean_token_accuracy": 0.8850298523902893, "num_tokens": 341988364.0, "step": 8966 }, { "epoch": 1.1406945681211043, "ewc_loss": 0.02665608562529087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.665608553797938e-05, "grad_norm": 16.568363189697266, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8697526454925537, "num_tokens": 342028478.0, "step": 8967 }, { "epoch": 1.1408217783996948, "ewc_loss": 0.02670017071068287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6700170565163717e-05, "grad_norm": 16.6279354095459, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.874020516872406, "num_tokens": 342067348.0, "step": 8968 }, { "epoch": 1.1409489886782853, "ewc_loss": 0.026655498892068863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6655498004402034e-05, "grad_norm": 16.533241271972656, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8627058863639832, "num_tokens": 342105845.0, "step": 8969 }, { "epoch": 1.1410761989568756, "ewc_loss": 0.026652660220861435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6652660380932502e-05, "grad_norm": 16.59076690673828, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8661667704582214, "num_tokens": 342142072.0, "step": 8970 }, { "epoch": 1.1412034092354661, "ewc_loss": 0.026693783700466156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6693784093367867e-05, "grad_norm": 16.61275291442871, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8690805435180664, "num_tokens": 342176592.0, "step": 8971 }, { "epoch": 1.1413306195140567, "ewc_loss": 0.026650503277778625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6650503059499897e-05, "grad_norm": 16.516979217529297, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8692225813865662, "num_tokens": 342211896.0, "step": 8972 }, { "epoch": 1.1414578297926472, "ewc_loss": 0.02665594220161438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.66559418378165e-05, "grad_norm": 16.57747459411621, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8695770502090454, "num_tokens": 342256078.0, "step": 8973 }, { "epoch": 1.1415850400712377, "ewc_loss": 0.026688165962696075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6688165235100314e-05, "grad_norm": 16.57354164123535, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8687901496887207, "num_tokens": 342296766.0, "step": 8974 }, { "epoch": 1.1417122503498283, "ewc_loss": 0.026683878153562546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6683877877076156e-05, "grad_norm": 16.673931121826172, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.865938127040863, "num_tokens": 342333689.0, "step": 8975 }, { "epoch": 1.1418394606284188, "ewc_loss": 0.026678593829274178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6678593712858856e-05, "grad_norm": 16.58899688720703, "learning_rate": 1e-06, "loss": 0.3777, "mean_token_accuracy": 0.8764010071754456, "num_tokens": 342369605.0, "step": 8976 }, { "epoch": 1.1419666709070093, "ewc_loss": 0.02661360427737236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.661360485944897e-05, "grad_norm": 16.577598571777344, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8753669857978821, "num_tokens": 342410806.0, "step": 8977 }, { "epoch": 1.1420938811855998, "ewc_loss": 0.026673544198274612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6673544198274612e-05, "grad_norm": 16.601655960083008, "learning_rate": 1e-06, "loss": 0.3372, "mean_token_accuracy": 0.8916572332382202, "num_tokens": 342446957.0, "step": 8978 }, { "epoch": 1.1422210914641904, "ewc_loss": 0.026623329147696495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6623329176800326e-05, "grad_norm": 16.61241912841797, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8549906611442566, "num_tokens": 342480717.0, "step": 8979 }, { "epoch": 1.142348301742781, "ewc_loss": 0.02665729820728302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6657298803911544e-05, "grad_norm": 16.57693099975586, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8744053244590759, "num_tokens": 342520431.0, "step": 8980 }, { "epoch": 1.1424755120213712, "ewc_loss": 0.026596061885356903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6596062525641173e-05, "grad_norm": 16.570722579956055, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8629961013793945, "num_tokens": 342561735.0, "step": 8981 }, { "epoch": 1.1426027222999617, "ewc_loss": 0.026676060631871223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6676059860619716e-05, "grad_norm": 16.655799865722656, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8784356117248535, "num_tokens": 342593899.0, "step": 8982 }, { "epoch": 1.1427299325785523, "ewc_loss": 0.02660333923995495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6603338483255357e-05, "grad_norm": 16.5191707611084, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8651059865951538, "num_tokens": 342628044.0, "step": 8983 }, { "epoch": 1.1428571428571428, "ewc_loss": 0.026651471853256226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6651472580851987e-05, "grad_norm": 16.72942543029785, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8646785020828247, "num_tokens": 342665018.0, "step": 8984 }, { "epoch": 1.1429843531357333, "ewc_loss": 0.02667039819061756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6670397346606478e-05, "grad_norm": 16.579845428466797, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8551516532897949, "num_tokens": 342704832.0, "step": 8985 }, { "epoch": 1.1431115634143239, "ewc_loss": 0.026614660397171974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6614659873303026e-05, "grad_norm": 16.58940887451172, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8728452920913696, "num_tokens": 342741598.0, "step": 8986 }, { "epoch": 1.1432387736929144, "ewc_loss": 0.026714643463492393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.671464426384773e-05, "grad_norm": 16.74464988708496, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8612031936645508, "num_tokens": 342777302.0, "step": 8987 }, { "epoch": 1.143365983971505, "ewc_loss": 0.02665563113987446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6655630790628493e-05, "grad_norm": 16.55722999572754, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8463422656059265, "num_tokens": 342809578.0, "step": 8988 }, { "epoch": 1.1434931942500954, "ewc_loss": 0.02657254971563816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.657255026861094e-05, "grad_norm": 16.59916114807129, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8674271106719971, "num_tokens": 342845809.0, "step": 8989 }, { "epoch": 1.143620404528686, "ewc_loss": 0.02671256847679615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6712568796938285e-05, "grad_norm": 16.648195266723633, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8525893092155457, "num_tokens": 342886118.0, "step": 8990 }, { "epoch": 1.1437476148072765, "ewc_loss": 0.026654822751879692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.665482315933332e-05, "grad_norm": 16.548330307006836, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8620637655258179, "num_tokens": 342922517.0, "step": 8991 }, { "epoch": 1.143874825085867, "ewc_loss": 0.02667088992893696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.667089029273484e-05, "grad_norm": 16.60565185546875, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8602474927902222, "num_tokens": 342959741.0, "step": 8992 }, { "epoch": 1.1440020353644575, "ewc_loss": 0.02672155387699604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6721554604591802e-05, "grad_norm": 16.573081970214844, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8802613019943237, "num_tokens": 342991418.0, "step": 8993 }, { "epoch": 1.144129245643048, "ewc_loss": 0.026727093383669853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.67270934273256e-05, "grad_norm": 16.591798782348633, "learning_rate": 1e-06, "loss": 0.3569, "mean_token_accuracy": 0.8835050463676453, "num_tokens": 343024263.0, "step": 8994 }, { "epoch": 1.1442564559216384, "ewc_loss": 0.026790574193000793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6790574338519946e-05, "grad_norm": 16.612092971801758, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8675476312637329, "num_tokens": 343066388.0, "step": 8995 }, { "epoch": 1.144383666200229, "ewc_loss": 0.02677313983440399, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.677313932508696e-05, "grad_norm": 16.578218460083008, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8673447370529175, "num_tokens": 343107631.0, "step": 8996 }, { "epoch": 1.1445108764788194, "ewc_loss": 0.026773542165756226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6773541321745142e-05, "grad_norm": 16.618249893188477, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.872594952583313, "num_tokens": 343143645.0, "step": 8997 }, { "epoch": 1.14463808675741, "ewc_loss": 0.026763681322336197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.676368058018852e-05, "grad_norm": 16.593856811523438, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.853085994720459, "num_tokens": 343182076.0, "step": 8998 }, { "epoch": 1.1447652970360005, "ewc_loss": 0.026828454807400703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6828454792848788e-05, "grad_norm": 16.618080139160156, "learning_rate": 1e-06, "loss": 0.3554, "mean_token_accuracy": 0.8848255276679993, "num_tokens": 343222808.0, "step": 8999 }, { "epoch": 1.144892507314591, "ewc_loss": 0.026763172820210457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.676317308214493e-05, "grad_norm": 16.60945701599121, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.86297208070755, "num_tokens": 343268229.0, "step": 9000 }, { "epoch": 1.1450197175931816, "ewc_loss": 0.02680657058954239, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6806570531334728e-05, "grad_norm": 16.581748962402344, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8593528866767883, "num_tokens": 343304459.0, "step": 9001 }, { "epoch": 1.145146927871772, "ewc_loss": 0.02677110955119133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.67711093329126e-05, "grad_norm": 16.72587776184082, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8298239707946777, "num_tokens": 343343211.0, "step": 9002 }, { "epoch": 1.1452741381503626, "ewc_loss": 0.02677137590944767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6771376724354923e-05, "grad_norm": 16.595731735229492, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8622665405273438, "num_tokens": 343388498.0, "step": 9003 }, { "epoch": 1.1454013484289531, "ewc_loss": 0.026670891791582108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6670892111724243e-05, "grad_norm": 16.587955474853516, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8642316460609436, "num_tokens": 343430101.0, "step": 9004 }, { "epoch": 1.1455285587075437, "ewc_loss": 0.026760974898934364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6760975742945448e-05, "grad_norm": 16.630475997924805, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.854134738445282, "num_tokens": 343466454.0, "step": 9005 }, { "epoch": 1.145655768986134, "ewc_loss": 0.026738230139017105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6738229280454107e-05, "grad_norm": 16.64130973815918, "learning_rate": 1e-06, "loss": 0.3668, "mean_token_accuracy": 0.8816909193992615, "num_tokens": 343506960.0, "step": 9006 }, { "epoch": 1.1457829792647245, "ewc_loss": 0.02668786235153675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6687861463869922e-05, "grad_norm": 16.585115432739258, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8738555908203125, "num_tokens": 343541133.0, "step": 9007 }, { "epoch": 1.145910189543315, "ewc_loss": 0.02672410197556019, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6724101189756766e-05, "grad_norm": 16.65773582458496, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8748906850814819, "num_tokens": 343580266.0, "step": 9008 }, { "epoch": 1.1460373998219056, "ewc_loss": 0.026712141931056976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6712141334428452e-05, "grad_norm": 16.6175594329834, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8638300895690918, "num_tokens": 343611691.0, "step": 9009 }, { "epoch": 1.146164610100496, "ewc_loss": 0.026648065075278282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6648065613699146e-05, "grad_norm": 16.628965377807617, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8721908926963806, "num_tokens": 343649355.0, "step": 9010 }, { "epoch": 1.1462918203790866, "ewc_loss": 0.026745809242129326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6745809009298682e-05, "grad_norm": 16.61956214904785, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8808478116989136, "num_tokens": 343681606.0, "step": 9011 }, { "epoch": 1.1464190306576771, "ewc_loss": 0.026705322787165642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.670532194315456e-05, "grad_norm": 16.600725173950195, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8776649832725525, "num_tokens": 343721592.0, "step": 9012 }, { "epoch": 1.1465462409362677, "ewc_loss": 0.026707887649536133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6707886718213558e-05, "grad_norm": 16.637725830078125, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8634810447692871, "num_tokens": 343764084.0, "step": 9013 }, { "epoch": 1.1466734512148582, "ewc_loss": 0.026746751740574837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.674675124580972e-05, "grad_norm": 16.5889949798584, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8676862716674805, "num_tokens": 343801213.0, "step": 9014 }, { "epoch": 1.1468006614934487, "ewc_loss": 0.026664627715945244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.666462751221843e-05, "grad_norm": 16.55559730529785, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8681243062019348, "num_tokens": 343839761.0, "step": 9015 }, { "epoch": 1.1469278717720393, "ewc_loss": 0.026669979095458984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6669978979043663e-05, "grad_norm": 16.60099983215332, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8661733269691467, "num_tokens": 343877792.0, "step": 9016 }, { "epoch": 1.1470550820506298, "ewc_loss": 0.02678355947136879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6783560315379873e-05, "grad_norm": 16.581436157226562, "learning_rate": 1e-06, "loss": 0.3846, "mean_token_accuracy": 0.872374951839447, "num_tokens": 343915107.0, "step": 9017 }, { "epoch": 1.1471822923292203, "ewc_loss": 0.026718953624367714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.671895344974473e-05, "grad_norm": 16.668418884277344, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8719277381896973, "num_tokens": 343949783.0, "step": 9018 }, { "epoch": 1.1473095026078106, "ewc_loss": 0.02678292617201805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6782925488078035e-05, "grad_norm": 16.630741119384766, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8485924005508423, "num_tokens": 343991058.0, "step": 9019 }, { "epoch": 1.1474367128864011, "ewc_loss": 0.026695357635617256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6695357519201934e-05, "grad_norm": 16.600112915039062, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8672124743461609, "num_tokens": 344030424.0, "step": 9020 }, { "epoch": 1.1475639231649917, "ewc_loss": 0.02671756222844124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.671756192285102e-05, "grad_norm": 16.53261375427246, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.868563175201416, "num_tokens": 344071385.0, "step": 9021 }, { "epoch": 1.1476911334435822, "ewc_loss": 0.02668096497654915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.668096567504108e-05, "grad_norm": 16.604835510253906, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.8764452338218689, "num_tokens": 344109725.0, "step": 9022 }, { "epoch": 1.1478183437221727, "ewc_loss": 0.026723483577370644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.672348273335956e-05, "grad_norm": 16.57006072998047, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8742826581001282, "num_tokens": 344143947.0, "step": 9023 }, { "epoch": 1.1479455540007633, "ewc_loss": 0.026682624593377113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6682624593377113e-05, "grad_norm": 16.59541130065918, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8564730882644653, "num_tokens": 344179969.0, "step": 9024 }, { "epoch": 1.1480727642793538, "ewc_loss": 0.026733633130788803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6733632694231346e-05, "grad_norm": 16.541465759277344, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8778858184814453, "num_tokens": 344212279.0, "step": 9025 }, { "epoch": 1.1481999745579443, "ewc_loss": 0.026706857606768608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.670685717021115e-05, "grad_norm": 16.6076602935791, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8734150528907776, "num_tokens": 344248402.0, "step": 9026 }, { "epoch": 1.1483271848365348, "ewc_loss": 0.02673928812146187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.673928793228697e-05, "grad_norm": 16.529911041259766, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8643823862075806, "num_tokens": 344286557.0, "step": 9027 }, { "epoch": 1.1484543951151254, "ewc_loss": 0.0266902856528759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.669028617674485e-05, "grad_norm": 16.569974899291992, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8673139810562134, "num_tokens": 344331014.0, "step": 9028 }, { "epoch": 1.148581605393716, "ewc_loss": 0.026788394898176193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6788395189214498e-05, "grad_norm": 16.707599639892578, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8637213706970215, "num_tokens": 344373239.0, "step": 9029 }, { "epoch": 1.1487088156723062, "ewc_loss": 0.026706822216510773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6706822609412484e-05, "grad_norm": 16.589427947998047, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.8803978562355042, "num_tokens": 344409871.0, "step": 9030 }, { "epoch": 1.1488360259508967, "ewc_loss": 0.02668415755033493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6684158001444302e-05, "grad_norm": 16.644182205200195, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8563187122344971, "num_tokens": 344452108.0, "step": 9031 }, { "epoch": 1.1489632362294873, "ewc_loss": 0.02676958590745926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6769586838781834e-05, "grad_norm": 16.664810180664062, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8605332374572754, "num_tokens": 344489798.0, "step": 9032 }, { "epoch": 1.1490904465080778, "ewc_loss": 0.02669581212103367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.669581226655282e-05, "grad_norm": 16.556198120117188, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.874850869178772, "num_tokens": 344528194.0, "step": 9033 }, { "epoch": 1.1492176567866683, "ewc_loss": 0.026687797158956528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6687797799240798e-05, "grad_norm": 16.67661476135254, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8524488806724548, "num_tokens": 344559699.0, "step": 9034 }, { "epoch": 1.1493448670652588, "ewc_loss": 0.026746029034256935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.674602910701651e-05, "grad_norm": 16.5603084564209, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8590487241744995, "num_tokens": 344598234.0, "step": 9035 }, { "epoch": 1.1494720773438494, "ewc_loss": 0.02665533870458603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6655337933334522e-05, "grad_norm": 16.608882904052734, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8605408668518066, "num_tokens": 344634780.0, "step": 9036 }, { "epoch": 1.14959928762244, "ewc_loss": 0.026797477155923843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6797477403306402e-05, "grad_norm": 16.61020851135254, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8711388111114502, "num_tokens": 344677167.0, "step": 9037 }, { "epoch": 1.1497264979010304, "ewc_loss": 0.02673668973147869, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6736690415418707e-05, "grad_norm": 16.58648109436035, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.854084849357605, "num_tokens": 344713824.0, "step": 9038 }, { "epoch": 1.149853708179621, "ewc_loss": 0.026713667437434196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6713667466538027e-05, "grad_norm": 16.556406021118164, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.85358726978302, "num_tokens": 344755814.0, "step": 9039 }, { "epoch": 1.1499809184582115, "ewc_loss": 0.026742812246084213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6742813133751042e-05, "grad_norm": 16.60086441040039, "learning_rate": 1e-06, "loss": 0.3646, "mean_token_accuracy": 0.8811477422714233, "num_tokens": 344795022.0, "step": 9040 }, { "epoch": 1.150108128736802, "ewc_loss": 0.026756154373288155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6756153602036647e-05, "grad_norm": 16.579866409301758, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8754801750183105, "num_tokens": 344833906.0, "step": 9041 }, { "epoch": 1.1502353390153925, "ewc_loss": 0.026699408888816833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.669940840860363e-05, "grad_norm": 16.492029190063477, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8727745413780212, "num_tokens": 344875975.0, "step": 9042 }, { "epoch": 1.150362549293983, "ewc_loss": 0.02674272656440735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6742725822259672e-05, "grad_norm": 16.52182960510254, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8699156045913696, "num_tokens": 344915433.0, "step": 9043 }, { "epoch": 1.1504897595725734, "ewc_loss": 0.026768717914819717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.676871736184694e-05, "grad_norm": 16.64195442199707, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.8711883425712585, "num_tokens": 344949038.0, "step": 9044 }, { "epoch": 1.150616969851164, "ewc_loss": 0.026778586208820343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6778585379361175e-05, "grad_norm": 16.600149154663086, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8675706386566162, "num_tokens": 344988260.0, "step": 9045 }, { "epoch": 1.1507441801297544, "ewc_loss": 0.026728764176368713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6728763259598054e-05, "grad_norm": 16.54381561279297, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8794182538986206, "num_tokens": 345024355.0, "step": 9046 }, { "epoch": 1.150871390408345, "ewc_loss": 0.026774458587169647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.677445809240453e-05, "grad_norm": 16.663057327270508, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8764947652816772, "num_tokens": 345063041.0, "step": 9047 }, { "epoch": 1.1509986006869355, "ewc_loss": 0.02681480161845684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6814801458385773e-05, "grad_norm": 16.604490280151367, "learning_rate": 1e-06, "loss": 0.3578, "mean_token_accuracy": 0.886941134929657, "num_tokens": 345097813.0, "step": 9048 }, { "epoch": 1.151125810965526, "ewc_loss": 0.02671043388545513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6710433303378522e-05, "grad_norm": 16.635456085205078, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8632540702819824, "num_tokens": 345134534.0, "step": 9049 }, { "epoch": 1.1512530212441165, "ewc_loss": 0.026716914027929306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6716914362623356e-05, "grad_norm": 16.64887809753418, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8706658482551575, "num_tokens": 345171018.0, "step": 9050 }, { "epoch": 1.151380231522707, "ewc_loss": 0.026740696281194687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6740695830085315e-05, "grad_norm": 16.655488967895508, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.878208577632904, "num_tokens": 345208006.0, "step": 9051 }, { "epoch": 1.1515074418012976, "ewc_loss": 0.026692213490605354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6692214305512607e-05, "grad_norm": 16.601425170898438, "learning_rate": 1e-06, "loss": 0.3582, "mean_token_accuracy": 0.8852157592773438, "num_tokens": 345246616.0, "step": 9052 }, { "epoch": 1.1516346520798881, "ewc_loss": 0.026698989793658257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6698990041040815e-05, "grad_norm": 16.615949630737305, "learning_rate": 1e-06, "loss": 0.3731, "mean_token_accuracy": 0.8830376863479614, "num_tokens": 345288095.0, "step": 9053 }, { "epoch": 1.1517618623584787, "ewc_loss": 0.02673986181616783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.673986273293849e-05, "grad_norm": 16.6049747467041, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.853018045425415, "num_tokens": 345333291.0, "step": 9054 }, { "epoch": 1.151889072637069, "ewc_loss": 0.026708384975790977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.670838512131013e-05, "grad_norm": 16.562705993652344, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8654982447624207, "num_tokens": 345375093.0, "step": 9055 }, { "epoch": 1.1520162829156595, "ewc_loss": 0.02668514847755432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6685149350669235e-05, "grad_norm": 16.680078506469727, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8602757453918457, "num_tokens": 345415547.0, "step": 9056 }, { "epoch": 1.15214349319425, "ewc_loss": 0.02672833949327469, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6728339435067028e-05, "grad_norm": 16.623186111450195, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8716381192207336, "num_tokens": 345457340.0, "step": 9057 }, { "epoch": 1.1522707034728406, "ewc_loss": 0.026658963412046432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.665896317921579e-05, "grad_norm": 16.573284149169922, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8631872534751892, "num_tokens": 345494711.0, "step": 9058 }, { "epoch": 1.152397913751431, "ewc_loss": 0.02670901082456112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.670901085366495e-05, "grad_norm": 16.674840927124023, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8597365021705627, "num_tokens": 345534249.0, "step": 9059 }, { "epoch": 1.1525251240300216, "ewc_loss": 0.026714619249105453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.671461879799608e-05, "grad_norm": 16.603179931640625, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8660147190093994, "num_tokens": 345570832.0, "step": 9060 }, { "epoch": 1.1526523343086121, "ewc_loss": 0.026643257588148117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6643258024705574e-05, "grad_norm": 16.653671264648438, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8615732192993164, "num_tokens": 345609007.0, "step": 9061 }, { "epoch": 1.1527795445872027, "ewc_loss": 0.026645824313163757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6645824618753977e-05, "grad_norm": 16.583984375, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.8778877258300781, "num_tokens": 345647979.0, "step": 9062 }, { "epoch": 1.1529067548657932, "ewc_loss": 0.02661793865263462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.661793951119762e-05, "grad_norm": 16.657432556152344, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8598918914794922, "num_tokens": 345681821.0, "step": 9063 }, { "epoch": 1.1530339651443837, "ewc_loss": 0.026697717607021332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6697716748458333e-05, "grad_norm": 16.571977615356445, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8573195338249207, "num_tokens": 345724274.0, "step": 9064 }, { "epoch": 1.1531611754229742, "ewc_loss": 0.026647944003343582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6647943741409108e-05, "grad_norm": 16.717416763305664, "learning_rate": 1e-06, "loss": 0.3641, "mean_token_accuracy": 0.8825327157974243, "num_tokens": 345763558.0, "step": 9065 }, { "epoch": 1.1532883857015648, "ewc_loss": 0.026701634749770164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.670163485163357e-05, "grad_norm": 16.599332809448242, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.86229008436203, "num_tokens": 345798486.0, "step": 9066 }, { "epoch": 1.1534155959801553, "ewc_loss": 0.026627492159605026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6627492843545042e-05, "grad_norm": 16.67975616455078, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8636469841003418, "num_tokens": 345839379.0, "step": 9067 }, { "epoch": 1.1535428062587456, "ewc_loss": 0.02678031660616398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.678031705727335e-05, "grad_norm": 16.686172485351562, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8629037737846375, "num_tokens": 345876675.0, "step": 9068 }, { "epoch": 1.1536700165373361, "ewc_loss": 0.026602180674672127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6602179787005298e-05, "grad_norm": 16.54375457763672, "learning_rate": 1e-06, "loss": 0.361, "mean_token_accuracy": 0.8855874538421631, "num_tokens": 345913482.0, "step": 9069 }, { "epoch": 1.1537972268159267, "ewc_loss": 0.026700902730226517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6700903617893346e-05, "grad_norm": 16.66650390625, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8627258539199829, "num_tokens": 345947671.0, "step": 9070 }, { "epoch": 1.1539244370945172, "ewc_loss": 0.026707254350185394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6707253709901124e-05, "grad_norm": 16.56331443786621, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8693293929100037, "num_tokens": 345981520.0, "step": 9071 }, { "epoch": 1.1540516473731077, "ewc_loss": 0.026725832372903824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.672583286766894e-05, "grad_norm": 16.632064819335938, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8498611450195312, "num_tokens": 346020050.0, "step": 9072 }, { "epoch": 1.1541788576516983, "ewc_loss": 0.026779914274811745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6779915060615167e-05, "grad_norm": 16.66670036315918, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8587023019790649, "num_tokens": 346050000.0, "step": 9073 }, { "epoch": 1.1543060679302888, "ewc_loss": 0.02671084925532341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6710849851951934e-05, "grad_norm": 16.62557029724121, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8621267080307007, "num_tokens": 346085038.0, "step": 9074 }, { "epoch": 1.1544332782088793, "ewc_loss": 0.026757286861538887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6757286832435057e-05, "grad_norm": 16.560977935791016, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.876757025718689, "num_tokens": 346125871.0, "step": 9075 }, { "epoch": 1.1545604884874698, "ewc_loss": 0.026748530566692352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6748530217446387e-05, "grad_norm": 16.60240364074707, "learning_rate": 1e-06, "loss": 0.3735, "mean_token_accuracy": 0.8754571676254272, "num_tokens": 346160460.0, "step": 9076 }, { "epoch": 1.1546876987660604, "ewc_loss": 0.02676786109805107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.676786061783787e-05, "grad_norm": 16.549667358398438, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8565670251846313, "num_tokens": 346201621.0, "step": 9077 }, { "epoch": 1.154814909044651, "ewc_loss": 0.026826055720448494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.682605554582551e-05, "grad_norm": 16.633214950561523, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8691114783287048, "num_tokens": 346242680.0, "step": 9078 }, { "epoch": 1.1549421193232412, "ewc_loss": 0.02680106647312641, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6801066269399598e-05, "grad_norm": 16.616472244262695, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8576810359954834, "num_tokens": 346275774.0, "step": 9079 }, { "epoch": 1.1550693296018317, "ewc_loss": 0.026770176365971565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6770176191348583e-05, "grad_norm": 16.539634704589844, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8698186874389648, "num_tokens": 346317648.0, "step": 9080 }, { "epoch": 1.1551965398804223, "ewc_loss": 0.026844266802072525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6844267267733812e-05, "grad_norm": 16.6372127532959, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8702079653739929, "num_tokens": 346357736.0, "step": 9081 }, { "epoch": 1.1553237501590128, "ewc_loss": 0.0268759336322546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6875934054260142e-05, "grad_norm": 16.631513595581055, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8659740090370178, "num_tokens": 346397961.0, "step": 9082 }, { "epoch": 1.1554509604376033, "ewc_loss": 0.02676982246339321, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6769823307404295e-05, "grad_norm": 16.592918395996094, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8782693147659302, "num_tokens": 346438268.0, "step": 9083 }, { "epoch": 1.1555781707161938, "ewc_loss": 0.02681874856352806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6818748665391468e-05, "grad_norm": 16.64288330078125, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8585220575332642, "num_tokens": 346479024.0, "step": 9084 }, { "epoch": 1.1557053809947844, "ewc_loss": 0.02681564912199974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6815649107447825e-05, "grad_norm": 16.580278396606445, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8737642765045166, "num_tokens": 346523526.0, "step": 9085 }, { "epoch": 1.155832591273375, "ewc_loss": 0.026749638840556145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6749637981993146e-05, "grad_norm": 16.536314010620117, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8740199208259583, "num_tokens": 346561985.0, "step": 9086 }, { "epoch": 1.1559598015519654, "ewc_loss": 0.02680346556007862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6803465516422875e-05, "grad_norm": 16.626930236816406, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.853546142578125, "num_tokens": 346605600.0, "step": 9087 }, { "epoch": 1.156087011830556, "ewc_loss": 0.026766298338770866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6766298105940223e-05, "grad_norm": 16.55243492126465, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.8803107142448425, "num_tokens": 346640798.0, "step": 9088 }, { "epoch": 1.1562142221091465, "ewc_loss": 0.026794487610459328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6794486984726973e-05, "grad_norm": 16.57889175415039, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8614692091941833, "num_tokens": 346678483.0, "step": 9089 }, { "epoch": 1.156341432387737, "ewc_loss": 0.02680378407239914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6803783839568496e-05, "grad_norm": 16.612451553344727, "learning_rate": 1e-06, "loss": 0.3623, "mean_token_accuracy": 0.8838672637939453, "num_tokens": 346714604.0, "step": 9090 }, { "epoch": 1.1564686426663275, "ewc_loss": 0.02681477554142475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6814775992534123e-05, "grad_norm": 16.634849548339844, "learning_rate": 1e-06, "loss": 0.3672, "mean_token_accuracy": 0.8778849244117737, "num_tokens": 346747203.0, "step": 9091 }, { "epoch": 1.156595852944918, "ewc_loss": 0.026792118325829506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6792118660523556e-05, "grad_norm": 16.624242782592773, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8657309412956238, "num_tokens": 346790650.0, "step": 9092 }, { "epoch": 1.1567230632235084, "ewc_loss": 0.026728671044111252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6728670491138473e-05, "grad_norm": 16.592639923095703, "learning_rate": 1e-06, "loss": 0.3455, "mean_token_accuracy": 0.8906404972076416, "num_tokens": 346822872.0, "step": 9093 }, { "epoch": 1.156850273502099, "ewc_loss": 0.02681073732674122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.681073783605825e-05, "grad_norm": 16.599681854248047, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8734973073005676, "num_tokens": 346857999.0, "step": 9094 }, { "epoch": 1.1569774837806894, "ewc_loss": 0.026782093569636345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6782094209920615e-05, "grad_norm": 16.649635314941406, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8742266893386841, "num_tokens": 346891804.0, "step": 9095 }, { "epoch": 1.15710469405928, "ewc_loss": 0.026807384565472603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6807385438587517e-05, "grad_norm": 16.53841209411621, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8657862544059753, "num_tokens": 346931554.0, "step": 9096 }, { "epoch": 1.1572319043378705, "ewc_loss": 0.02677036076784134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.677035990927834e-05, "grad_norm": 16.625587463378906, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8538779020309448, "num_tokens": 346965214.0, "step": 9097 }, { "epoch": 1.157359114616461, "ewc_loss": 0.026833554729819298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.683355523913633e-05, "grad_norm": 16.526521682739258, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8787070512771606, "num_tokens": 347001219.0, "step": 9098 }, { "epoch": 1.1574863248950515, "ewc_loss": 0.02681485190987587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6814852390089072e-05, "grad_norm": 16.64987564086914, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8499382734298706, "num_tokens": 347043212.0, "step": 9099 }, { "epoch": 1.157613535173642, "ewc_loss": 0.026892241090536118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.689224129426293e-05, "grad_norm": 16.515052795410156, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8792682886123657, "num_tokens": 347082249.0, "step": 9100 }, { "epoch": 1.1577407454522326, "ewc_loss": 0.02678733505308628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.678733471839223e-05, "grad_norm": 16.588350296020508, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8654260039329529, "num_tokens": 347119286.0, "step": 9101 }, { "epoch": 1.1578679557308231, "ewc_loss": 0.02692025899887085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6920259188045748e-05, "grad_norm": 16.60539436340332, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8799176216125488, "num_tokens": 347155549.0, "step": 9102 }, { "epoch": 1.1579951660094137, "ewc_loss": 0.026826689019799232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6826688554137945e-05, "grad_norm": 16.583362579345703, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8701839447021484, "num_tokens": 347196846.0, "step": 9103 }, { "epoch": 1.158122376288004, "ewc_loss": 0.026870569214224815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6870569854509085e-05, "grad_norm": 16.618736267089844, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.8809117674827576, "num_tokens": 347234028.0, "step": 9104 }, { "epoch": 1.1582495865665945, "ewc_loss": 0.026874233037233353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6874233299167827e-05, "grad_norm": 16.643461227416992, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.878902792930603, "num_tokens": 347276931.0, "step": 9105 }, { "epoch": 1.158376796845185, "ewc_loss": 0.026812216266989708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6812216674443334e-05, "grad_norm": 16.607675552368164, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8645891547203064, "num_tokens": 347317139.0, "step": 9106 }, { "epoch": 1.1585040071237755, "ewc_loss": 0.026819773018360138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6819772756425664e-05, "grad_norm": 16.650291442871094, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8704861402511597, "num_tokens": 347355718.0, "step": 9107 }, { "epoch": 1.158631217402366, "ewc_loss": 0.026816362515091896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6816362151294015e-05, "grad_norm": 16.622989654541016, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8584798574447632, "num_tokens": 347393403.0, "step": 9108 }, { "epoch": 1.1587584276809566, "ewc_loss": 0.02675296925008297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.675296855159104e-05, "grad_norm": 16.602489471435547, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.875754714012146, "num_tokens": 347433736.0, "step": 9109 }, { "epoch": 1.1588856379595471, "ewc_loss": 0.026815539225935936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6815539968083613e-05, "grad_norm": 16.575218200683594, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8687446117401123, "num_tokens": 347474574.0, "step": 9110 }, { "epoch": 1.1590128482381377, "ewc_loss": 0.02677994780242443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.677994780242443e-05, "grad_norm": 16.63083839416504, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8737179040908813, "num_tokens": 347518785.0, "step": 9111 }, { "epoch": 1.1591400585167282, "ewc_loss": 0.02681989036500454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6819890990736894e-05, "grad_norm": 16.578468322753906, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8557187914848328, "num_tokens": 347557518.0, "step": 9112 }, { "epoch": 1.1592672687953187, "ewc_loss": 0.02677280642092228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.677280645002611e-05, "grad_norm": 16.63412857055664, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8383374214172363, "num_tokens": 347600094.0, "step": 9113 }, { "epoch": 1.1593944790739092, "ewc_loss": 0.026799354702234268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.679935460037086e-05, "grad_norm": 16.591386795043945, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8663525581359863, "num_tokens": 347633709.0, "step": 9114 }, { "epoch": 1.1595216893524998, "ewc_loss": 0.02677306905388832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.677306838450022e-05, "grad_norm": 16.64038848876953, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8746980428695679, "num_tokens": 347670027.0, "step": 9115 }, { "epoch": 1.1596488996310903, "ewc_loss": 0.026790251955389977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6790252377395518e-05, "grad_norm": 16.600780487060547, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8727244734764099, "num_tokens": 347707488.0, "step": 9116 }, { "epoch": 1.1597761099096806, "ewc_loss": 0.02674069255590439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6740692192106508e-05, "grad_norm": 16.581260681152344, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8625121116638184, "num_tokens": 347746030.0, "step": 9117 }, { "epoch": 1.1599033201882711, "ewc_loss": 0.026839686557650566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6839687052415684e-05, "grad_norm": 16.63015365600586, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8598494529724121, "num_tokens": 347783409.0, "step": 9118 }, { "epoch": 1.1600305304668617, "ewc_loss": 0.026761967688798904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.676196709217038e-05, "grad_norm": 16.599504470825195, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8750422596931458, "num_tokens": 347821335.0, "step": 9119 }, { "epoch": 1.1601577407454522, "ewc_loss": 0.026821717619895935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6821717256098054e-05, "grad_norm": 16.56974220275879, "learning_rate": 1e-06, "loss": 0.372, "mean_token_accuracy": 0.8782405853271484, "num_tokens": 347857590.0, "step": 9120 }, { "epoch": 1.1602849510240427, "ewc_loss": 0.026798447594046593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6798446924658492e-05, "grad_norm": 16.62632942199707, "learning_rate": 1e-06, "loss": 0.3686, "mean_token_accuracy": 0.879753589630127, "num_tokens": 347897000.0, "step": 9121 }, { "epoch": 1.1604121613026332, "ewc_loss": 0.02685399539768696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6853995223063976e-05, "grad_norm": 16.68494415283203, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8674094676971436, "num_tokens": 347928947.0, "step": 9122 }, { "epoch": 1.1605393715812238, "ewc_loss": 0.026829229667782784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.68292296823347e-05, "grad_norm": 16.671659469604492, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8377881050109863, "num_tokens": 347959294.0, "step": 9123 }, { "epoch": 1.1606665818598143, "ewc_loss": 0.026846127584576607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.684612809389364e-05, "grad_norm": 16.61089515686035, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8733216524124146, "num_tokens": 347993103.0, "step": 9124 }, { "epoch": 1.1607937921384048, "ewc_loss": 0.026864800602197647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6864800020121038e-05, "grad_norm": 16.586692810058594, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.8805112242698669, "num_tokens": 348028412.0, "step": 9125 }, { "epoch": 1.1609210024169954, "ewc_loss": 0.02690119296312332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.690119254111778e-05, "grad_norm": 16.691036224365234, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8507038354873657, "num_tokens": 348071429.0, "step": 9126 }, { "epoch": 1.161048212695586, "ewc_loss": 0.0268817450851202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.688174572540447e-05, "grad_norm": 16.5135498046875, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8716986179351807, "num_tokens": 348113222.0, "step": 9127 }, { "epoch": 1.1611754229741762, "ewc_loss": 0.026820560917258263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.68205603788374e-05, "grad_norm": 16.62212562561035, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8664460778236389, "num_tokens": 348150188.0, "step": 9128 }, { "epoch": 1.1613026332527667, "ewc_loss": 0.02696290984749794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6962909032590687e-05, "grad_norm": 16.588045120239258, "learning_rate": 1e-06, "loss": 0.3893, "mean_token_accuracy": 0.8707125186920166, "num_tokens": 348188547.0, "step": 9129 }, { "epoch": 1.1614298435313573, "ewc_loss": 0.026857202872633934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6857202101382427e-05, "grad_norm": 16.6137752532959, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8628609776496887, "num_tokens": 348228624.0, "step": 9130 }, { "epoch": 1.1615570538099478, "ewc_loss": 0.026960354298353195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.696035517146811e-05, "grad_norm": 16.58278465270996, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8640227913856506, "num_tokens": 348264932.0, "step": 9131 }, { "epoch": 1.1616842640885383, "ewc_loss": 0.02686246484518051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6862464437726885e-05, "grad_norm": 16.601320266723633, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8690089583396912, "num_tokens": 348307680.0, "step": 9132 }, { "epoch": 1.1618114743671288, "ewc_loss": 0.02696123532950878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6961235562339425e-05, "grad_norm": 16.637510299682617, "learning_rate": 1e-06, "loss": 0.3776, "mean_token_accuracy": 0.8822042346000671, "num_tokens": 348346449.0, "step": 9133 }, { "epoch": 1.1619386846457194, "ewc_loss": 0.02687365561723709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.68736548605375e-05, "grad_norm": 16.53322410583496, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.87839674949646, "num_tokens": 348386393.0, "step": 9134 }, { "epoch": 1.16206589492431, "ewc_loss": 0.026873568072915077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.687356754904613e-05, "grad_norm": 16.601285934448242, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8613139390945435, "num_tokens": 348429458.0, "step": 9135 }, { "epoch": 1.1621931052029004, "ewc_loss": 0.026921812444925308, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6921812604996376e-05, "grad_norm": 16.61454200744629, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8555337190628052, "num_tokens": 348471365.0, "step": 9136 }, { "epoch": 1.162320315481491, "ewc_loss": 0.026835588738322258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6835588869289495e-05, "grad_norm": 16.566328048706055, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8534916639328003, "num_tokens": 348513425.0, "step": 9137 }, { "epoch": 1.1624475257600815, "ewc_loss": 0.02685393951833248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6853938834392466e-05, "grad_norm": 16.6210994720459, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8649393320083618, "num_tokens": 348551181.0, "step": 9138 }, { "epoch": 1.162574736038672, "ewc_loss": 0.02688995562493801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6889954824582674e-05, "grad_norm": 16.606168746948242, "learning_rate": 1e-06, "loss": 0.337, "mean_token_accuracy": 0.8891206383705139, "num_tokens": 348583611.0, "step": 9139 }, { "epoch": 1.1627019463172625, "ewc_loss": 0.0268239788711071, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6823978259926662e-05, "grad_norm": 16.580846786499023, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8655292987823486, "num_tokens": 348624281.0, "step": 9140 }, { "epoch": 1.162829156595853, "ewc_loss": 0.02685694396495819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6856943804887123e-05, "grad_norm": 16.540124893188477, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8667126893997192, "num_tokens": 348661416.0, "step": 9141 }, { "epoch": 1.1629563668744434, "ewc_loss": 0.026848362758755684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6848363631870598e-05, "grad_norm": 16.66697883605957, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8473385572433472, "num_tokens": 348703907.0, "step": 9142 }, { "epoch": 1.163083577153034, "ewc_loss": 0.026889771223068237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6889771106652915e-05, "grad_norm": 16.55389976501465, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8801530599594116, "num_tokens": 348735617.0, "step": 9143 }, { "epoch": 1.1632107874316244, "ewc_loss": 0.026807084679603577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.680708530533593e-05, "grad_norm": 16.6031551361084, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.865455150604248, "num_tokens": 348773552.0, "step": 9144 }, { "epoch": 1.163337997710215, "ewc_loss": 0.026919636875391006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6919637093669735e-05, "grad_norm": 16.618860244750977, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.868972897529602, "num_tokens": 348811274.0, "step": 9145 }, { "epoch": 1.1634652079888055, "ewc_loss": 0.026810409501194954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.681040859897621e-05, "grad_norm": 16.614133834838867, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8557678461074829, "num_tokens": 348847773.0, "step": 9146 }, { "epoch": 1.163592418267396, "ewc_loss": 0.026942240074276924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6942239855998196e-05, "grad_norm": 16.667884826660156, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8587477207183838, "num_tokens": 348886356.0, "step": 9147 }, { "epoch": 1.1637196285459865, "ewc_loss": 0.026808738708496094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6808738766703755e-05, "grad_norm": 16.67345428466797, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8786934614181519, "num_tokens": 348922892.0, "step": 9148 }, { "epoch": 1.163846838824577, "ewc_loss": 0.026860922574996948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6860921934712678e-05, "grad_norm": 16.64801788330078, "learning_rate": 1e-06, "loss": 0.3512, "mean_token_accuracy": 0.8867533802986145, "num_tokens": 348954924.0, "step": 9149 }, { "epoch": 1.1639740491031676, "ewc_loss": 0.026839537546038628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6839537895284593e-05, "grad_norm": 16.575305938720703, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8738835453987122, "num_tokens": 348997335.0, "step": 9150 }, { "epoch": 1.1641012593817581, "ewc_loss": 0.026843607425689697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6843606974580325e-05, "grad_norm": 16.60101890563965, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8575662970542908, "num_tokens": 349038221.0, "step": 9151 }, { "epoch": 1.1642284696603487, "ewc_loss": 0.02681836113333702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6818361220648512e-05, "grad_norm": 16.531429290771484, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8635025024414062, "num_tokens": 349076559.0, "step": 9152 }, { "epoch": 1.164355679938939, "ewc_loss": 0.02685149572789669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6851495931623504e-05, "grad_norm": 16.634958267211914, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8456851243972778, "num_tokens": 349112264.0, "step": 9153 }, { "epoch": 1.1644828902175295, "ewc_loss": 0.026907462626695633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6907462597591802e-05, "grad_norm": 16.573144912719727, "learning_rate": 1e-06, "loss": 0.3679, "mean_token_accuracy": 0.8809865117073059, "num_tokens": 349146571.0, "step": 9154 }, { "epoch": 1.16461010049612, "ewc_loss": 0.02683611400425434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.683611455722712e-05, "grad_norm": 16.553874969482422, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8746068477630615, "num_tokens": 349183674.0, "step": 9155 }, { "epoch": 1.1647373107747105, "ewc_loss": 0.02690429799258709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6904297556029633e-05, "grad_norm": 16.56814956665039, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8721996545791626, "num_tokens": 349227042.0, "step": 9156 }, { "epoch": 1.164864521053301, "ewc_loss": 0.02690155804157257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6901558157987893e-05, "grad_norm": 16.611003875732422, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8632148504257202, "num_tokens": 349264072.0, "step": 9157 }, { "epoch": 1.1649917313318916, "ewc_loss": 0.026856478303670883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6856478143599816e-05, "grad_norm": 16.548513412475586, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8690565824508667, "num_tokens": 349304427.0, "step": 9158 }, { "epoch": 1.1651189416104821, "ewc_loss": 0.026868058368563652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.686805783014279e-05, "grad_norm": 16.626991271972656, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8675646781921387, "num_tokens": 349346708.0, "step": 9159 }, { "epoch": 1.1652461518890727, "ewc_loss": 0.02686907909810543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6869078283198178e-05, "grad_norm": 16.597530364990234, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8659353256225586, "num_tokens": 349387716.0, "step": 9160 }, { "epoch": 1.1653733621676632, "ewc_loss": 0.02685118466615677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6851184884435497e-05, "grad_norm": 16.61730194091797, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8641020059585571, "num_tokens": 349425757.0, "step": 9161 }, { "epoch": 1.1655005724462537, "ewc_loss": 0.026869140565395355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.68691401288379e-05, "grad_norm": 16.600921630859375, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8743427991867065, "num_tokens": 349472097.0, "step": 9162 }, { "epoch": 1.1656277827248442, "ewc_loss": 0.026834968477487564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6834968593902886e-05, "grad_norm": 16.672101974487305, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8651559948921204, "num_tokens": 349513809.0, "step": 9163 }, { "epoch": 1.1657549930034348, "ewc_loss": 0.026867883279919624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6867883207160048e-05, "grad_norm": 16.603023529052734, "learning_rate": 1e-06, "loss": 0.3598, "mean_token_accuracy": 0.8833701014518738, "num_tokens": 349544906.0, "step": 9164 }, { "epoch": 1.1658822032820253, "ewc_loss": 0.02680349163711071, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6803490982274525e-05, "grad_norm": 16.60012435913086, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8625649213790894, "num_tokens": 349582529.0, "step": 9165 }, { "epoch": 1.1660094135606156, "ewc_loss": 0.026896944269537926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6896943381871097e-05, "grad_norm": 16.678897857666016, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8631032705307007, "num_tokens": 349624871.0, "step": 9166 }, { "epoch": 1.1661366238392061, "ewc_loss": 0.02686438336968422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6864383471547626e-05, "grad_norm": 16.61448097229004, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8318693041801453, "num_tokens": 349660783.0, "step": 9167 }, { "epoch": 1.1662638341177967, "ewc_loss": 0.02682756818830967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.682756894500926e-05, "grad_norm": 16.623085021972656, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8774024248123169, "num_tokens": 349698430.0, "step": 9168 }, { "epoch": 1.1663910443963872, "ewc_loss": 0.026885302737355232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6885301849688403e-05, "grad_norm": 16.622447967529297, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8608152866363525, "num_tokens": 349734981.0, "step": 9169 }, { "epoch": 1.1665182546749777, "ewc_loss": 0.0268750861287117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.687508640519809e-05, "grad_norm": 16.674571990966797, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8623687028884888, "num_tokens": 349774267.0, "step": 9170 }, { "epoch": 1.1666454649535682, "ewc_loss": 0.026841668412089348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6841667931876145e-05, "grad_norm": 16.568552017211914, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8639380931854248, "num_tokens": 349811400.0, "step": 9171 }, { "epoch": 1.1667726752321588, "ewc_loss": 0.026822196319699287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6822195650311187e-05, "grad_norm": 16.60149574279785, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8589097857475281, "num_tokens": 349850031.0, "step": 9172 }, { "epoch": 1.1668998855107493, "ewc_loss": 0.026863591745495796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6863592211157084e-05, "grad_norm": 16.56174659729004, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8696988821029663, "num_tokens": 349889967.0, "step": 9173 }, { "epoch": 1.1670270957893398, "ewc_loss": 0.026852909475564957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.685290928639006e-05, "grad_norm": 16.638416290283203, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8618056178092957, "num_tokens": 349930276.0, "step": 9174 }, { "epoch": 1.1671543060679304, "ewc_loss": 0.026921715587377548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6921716198557988e-05, "grad_norm": 16.512420654296875, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8624826669692993, "num_tokens": 349969889.0, "step": 9175 }, { "epoch": 1.1672815163465209, "ewc_loss": 0.02687492035329342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6874920877162367e-05, "grad_norm": 16.665395736694336, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8645530343055725, "num_tokens": 350005729.0, "step": 9176 }, { "epoch": 1.1674087266251112, "ewc_loss": 0.027003014460206032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7003014110960066e-05, "grad_norm": 16.62825584411621, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8744980692863464, "num_tokens": 350043130.0, "step": 9177 }, { "epoch": 1.1675359369037017, "ewc_loss": 0.02687624655663967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6876246920437552e-05, "grad_norm": 16.558982849121094, "learning_rate": 1e-06, "loss": 0.3562, "mean_token_accuracy": 0.885626494884491, "num_tokens": 350081233.0, "step": 9178 }, { "epoch": 1.1676631471822922, "ewc_loss": 0.026960914954543114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.69609154202044e-05, "grad_norm": 16.652944564819336, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8542698621749878, "num_tokens": 350119403.0, "step": 9179 }, { "epoch": 1.1677903574608828, "ewc_loss": 0.026911042630672455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.691104236873798e-05, "grad_norm": 16.556049346923828, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8546862602233887, "num_tokens": 350159099.0, "step": 9180 }, { "epoch": 1.1679175677394733, "ewc_loss": 0.026896165683865547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.689616485440638e-05, "grad_norm": 16.58112907409668, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.867776095867157, "num_tokens": 350195642.0, "step": 9181 }, { "epoch": 1.1680447780180638, "ewc_loss": 0.027017250657081604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.701725134102162e-05, "grad_norm": 16.637718200683594, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8741084337234497, "num_tokens": 350236721.0, "step": 9182 }, { "epoch": 1.1681719882966544, "ewc_loss": 0.026950884610414505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.695088551263325e-05, "grad_norm": 16.599931716918945, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8636583089828491, "num_tokens": 350272691.0, "step": 9183 }, { "epoch": 1.168299198575245, "ewc_loss": 0.026932010427117348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6932009859592654e-05, "grad_norm": 16.621225357055664, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.866694986820221, "num_tokens": 350318299.0, "step": 9184 }, { "epoch": 1.1684264088538354, "ewc_loss": 0.02695251628756523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.695251714612823e-05, "grad_norm": 16.59862518310547, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8667608499526978, "num_tokens": 350360973.0, "step": 9185 }, { "epoch": 1.168553619132426, "ewc_loss": 0.026952356100082397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6952355256071314e-05, "grad_norm": 16.67194938659668, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8753281235694885, "num_tokens": 350400516.0, "step": 9186 }, { "epoch": 1.1686808294110165, "ewc_loss": 0.02694292552769184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6942925615003332e-05, "grad_norm": 16.599374771118164, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8628131151199341, "num_tokens": 350438147.0, "step": 9187 }, { "epoch": 1.168808039689607, "ewc_loss": 0.0269345510751009, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6934550987789407e-05, "grad_norm": 16.57811164855957, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.872305154800415, "num_tokens": 350481604.0, "step": 9188 }, { "epoch": 1.1689352499681975, "ewc_loss": 0.026955602690577507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6955602152156644e-05, "grad_norm": 16.656442642211914, "learning_rate": 1e-06, "loss": 0.3556, "mean_token_accuracy": 0.8874672055244446, "num_tokens": 350518225.0, "step": 9189 }, { "epoch": 1.169062460246788, "ewc_loss": 0.02691570296883583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6915702619589865e-05, "grad_norm": 16.57505989074707, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8657265901565552, "num_tokens": 350553752.0, "step": 9190 }, { "epoch": 1.1691896705253784, "ewc_loss": 0.026881128549575806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.688112908799667e-05, "grad_norm": 16.515357971191406, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8720917701721191, "num_tokens": 350593676.0, "step": 9191 }, { "epoch": 1.169316880803969, "ewc_loss": 0.026954224333167076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.695422517717816e-05, "grad_norm": 16.705652236938477, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8662714958190918, "num_tokens": 350633618.0, "step": 9192 }, { "epoch": 1.1694440910825594, "ewc_loss": 0.02693965658545494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.693965689104516e-05, "grad_norm": 16.566017150878906, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8652739524841309, "num_tokens": 350678221.0, "step": 9193 }, { "epoch": 1.16957130136115, "ewc_loss": 0.026853058487176895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.685305844352115e-05, "grad_norm": 16.609697341918945, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8774435520172119, "num_tokens": 350718258.0, "step": 9194 }, { "epoch": 1.1696985116397405, "ewc_loss": 0.026963260024785995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.696326009754557e-05, "grad_norm": 16.698619842529297, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.8781258463859558, "num_tokens": 350757370.0, "step": 9195 }, { "epoch": 1.169825721918331, "ewc_loss": 0.02691659890115261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6916599381365813e-05, "grad_norm": 16.65143394470215, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8675220012664795, "num_tokens": 350801394.0, "step": 9196 }, { "epoch": 1.1699529321969215, "ewc_loss": 0.026815194636583328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.681519436009694e-05, "grad_norm": 16.6177978515625, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.859549880027771, "num_tokens": 350840635.0, "step": 9197 }, { "epoch": 1.170080142475512, "ewc_loss": 0.02687697857618332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6876978154177777e-05, "grad_norm": 16.670093536376953, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.8829758763313293, "num_tokens": 350879616.0, "step": 9198 }, { "epoch": 1.1702073527541026, "ewc_loss": 0.026817554607987404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.681755540834274e-05, "grad_norm": 16.652219772338867, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8677003383636475, "num_tokens": 350921710.0, "step": 9199 }, { "epoch": 1.1703345630326931, "ewc_loss": 0.026831382885575294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6831383365788497e-05, "grad_norm": 16.623441696166992, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8552509546279907, "num_tokens": 350964218.0, "step": 9200 }, { "epoch": 1.1704617733112836, "ewc_loss": 0.026775287464261055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6775287551572546e-05, "grad_norm": 16.592191696166992, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.877710223197937, "num_tokens": 351004511.0, "step": 9201 }, { "epoch": 1.170588983589874, "ewc_loss": 0.026831431314349174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.683143065951299e-05, "grad_norm": 16.625070571899414, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8628677129745483, "num_tokens": 351044583.0, "step": 9202 }, { "epoch": 1.1707161938684645, "ewc_loss": 0.026827961206436157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6827961846720427e-05, "grad_norm": 16.6441650390625, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8507863283157349, "num_tokens": 351083613.0, "step": 9203 }, { "epoch": 1.170843404147055, "ewc_loss": 0.0268909502774477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6890949811786413e-05, "grad_norm": 16.629514694213867, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8597259521484375, "num_tokens": 351117412.0, "step": 9204 }, { "epoch": 1.1709706144256455, "ewc_loss": 0.02680802159011364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6808022084878758e-05, "grad_norm": 16.62884521484375, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8535180687904358, "num_tokens": 351154779.0, "step": 9205 }, { "epoch": 1.171097824704236, "ewc_loss": 0.026856163516640663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6856163458433002e-05, "grad_norm": 16.67957878112793, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8775326609611511, "num_tokens": 351191412.0, "step": 9206 }, { "epoch": 1.1712250349828266, "ewc_loss": 0.02681031823158264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6810317649506032e-05, "grad_norm": 16.589820861816406, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8700433969497681, "num_tokens": 351235465.0, "step": 9207 }, { "epoch": 1.1713522452614171, "ewc_loss": 0.026876842603087425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.687684172997251e-05, "grad_norm": 16.703550338745117, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8729050159454346, "num_tokens": 351279326.0, "step": 9208 }, { "epoch": 1.1714794555400077, "ewc_loss": 0.02681409753859043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.68140975094866e-05, "grad_norm": 16.574739456176758, "learning_rate": 1e-06, "loss": 0.3539, "mean_token_accuracy": 0.8846495151519775, "num_tokens": 351316678.0, "step": 9209 }, { "epoch": 1.1716066658185982, "ewc_loss": 0.026753751561045647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6753750717034563e-05, "grad_norm": 16.609458923339844, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8632780313491821, "num_tokens": 351352641.0, "step": 9210 }, { "epoch": 1.1717338760971887, "ewc_loss": 0.026873065158724785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.687306550797075e-05, "grad_norm": 16.630739212036133, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8583076596260071, "num_tokens": 351385845.0, "step": 9211 }, { "epoch": 1.1718610863757792, "ewc_loss": 0.026804976165294647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6804975277627818e-05, "grad_norm": 16.64919090270996, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8732772469520569, "num_tokens": 351425220.0, "step": 9212 }, { "epoch": 1.1719882966543698, "ewc_loss": 0.026858072727918625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6858073397306725e-05, "grad_norm": 16.602497100830078, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8624657392501831, "num_tokens": 351467515.0, "step": 9213 }, { "epoch": 1.1721155069329603, "ewc_loss": 0.026875397190451622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6875397452386096e-05, "grad_norm": 16.69279670715332, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8650730848312378, "num_tokens": 351507571.0, "step": 9214 }, { "epoch": 1.1722427172115506, "ewc_loss": 0.026882978156208992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6882979000220075e-05, "grad_norm": 16.59861183166504, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8648713231086731, "num_tokens": 351546150.0, "step": 9215 }, { "epoch": 1.1723699274901411, "ewc_loss": 0.026818079873919487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6818079277290963e-05, "grad_norm": 16.590688705444336, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8608899712562561, "num_tokens": 351588574.0, "step": 9216 }, { "epoch": 1.1724971377687317, "ewc_loss": 0.026899470016360283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6899469958152622e-05, "grad_norm": 16.623275756835938, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8684813976287842, "num_tokens": 351626914.0, "step": 9217 }, { "epoch": 1.1726243480473222, "ewc_loss": 0.026893334463238716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6893334506894462e-05, "grad_norm": 16.626590728759766, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8579593896865845, "num_tokens": 351663574.0, "step": 9218 }, { "epoch": 1.1727515583259127, "ewc_loss": 0.0269327312707901, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.693273199838586e-05, "grad_norm": 16.645971298217773, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8511345386505127, "num_tokens": 351703275.0, "step": 9219 }, { "epoch": 1.1728787686045032, "ewc_loss": 0.02693510800600052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6935107598546892e-05, "grad_norm": 16.596160888671875, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8578346967697144, "num_tokens": 351745713.0, "step": 9220 }, { "epoch": 1.1730059788830938, "ewc_loss": 0.02690110169351101, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6901101591647603e-05, "grad_norm": 16.649675369262695, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8532252311706543, "num_tokens": 351780789.0, "step": 9221 }, { "epoch": 1.1731331891616843, "ewc_loss": 0.026940058916807175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6940058887703344e-05, "grad_norm": 16.65329933166504, "learning_rate": 1e-06, "loss": 0.3776, "mean_token_accuracy": 0.8771672248840332, "num_tokens": 351820352.0, "step": 9222 }, { "epoch": 1.1732603994402748, "ewc_loss": 0.02692841738462448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.692841735552065e-05, "grad_norm": 16.59682846069336, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.8744615316390991, "num_tokens": 351860011.0, "step": 9223 }, { "epoch": 1.1733876097188654, "ewc_loss": 0.02694137953221798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.694137947401032e-05, "grad_norm": 16.690025329589844, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8614072799682617, "num_tokens": 351898742.0, "step": 9224 }, { "epoch": 1.1735148199974559, "ewc_loss": 0.026985732838511467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.698573371162638e-05, "grad_norm": 16.617202758789062, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8546790480613708, "num_tokens": 351940201.0, "step": 9225 }, { "epoch": 1.1736420302760462, "ewc_loss": 0.02694045379757881, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6940453608403914e-05, "grad_norm": 16.757761001586914, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8663036823272705, "num_tokens": 351971041.0, "step": 9226 }, { "epoch": 1.1737692405546367, "ewc_loss": 0.026995956897735596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.699595643207431e-05, "grad_norm": 16.66677474975586, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8498243689537048, "num_tokens": 352013002.0, "step": 9227 }, { "epoch": 1.1738964508332272, "ewc_loss": 0.026924367994070053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6924368285108358e-05, "grad_norm": 16.687088012695312, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8658391237258911, "num_tokens": 352051435.0, "step": 9228 }, { "epoch": 1.1740236611118178, "ewc_loss": 0.02696128562092781, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6961286494042724e-05, "grad_norm": 16.631010055541992, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.8732640743255615, "num_tokens": 352093295.0, "step": 9229 }, { "epoch": 1.1741508713904083, "ewc_loss": 0.02691892720758915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6918927687802352e-05, "grad_norm": 16.650108337402344, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8637737035751343, "num_tokens": 352136401.0, "step": 9230 }, { "epoch": 1.1742780816689988, "ewc_loss": 0.02700699493288994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7006994059775025e-05, "grad_norm": 16.723163604736328, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8777183294296265, "num_tokens": 352178696.0, "step": 9231 }, { "epoch": 1.1744052919475894, "ewc_loss": 0.026911307126283646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.69113079411909e-05, "grad_norm": 16.66766929626465, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.847758948802948, "num_tokens": 352214759.0, "step": 9232 }, { "epoch": 1.1745325022261799, "ewc_loss": 0.02692895382642746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6928953957394697e-05, "grad_norm": 16.6669864654541, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.8796073198318481, "num_tokens": 352250520.0, "step": 9233 }, { "epoch": 1.1746597125047704, "ewc_loss": 0.02697085775434971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6970858016284183e-05, "grad_norm": 16.717065811157227, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.865136981010437, "num_tokens": 352288773.0, "step": 9234 }, { "epoch": 1.174786922783361, "ewc_loss": 0.026910683140158653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6910684027825482e-05, "grad_norm": 16.64688491821289, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8743267059326172, "num_tokens": 352326548.0, "step": 9235 }, { "epoch": 1.1749141330619515, "ewc_loss": 0.026907682418823242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.690768269530963e-05, "grad_norm": 16.638744354248047, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8757539391517639, "num_tokens": 352366537.0, "step": 9236 }, { "epoch": 1.175041343340542, "ewc_loss": 0.026906121522188187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.690612200240139e-05, "grad_norm": 16.586984634399414, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.849547266960144, "num_tokens": 352402390.0, "step": 9237 }, { "epoch": 1.1751685536191325, "ewc_loss": 0.026954175904393196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6954176064464264e-05, "grad_norm": 16.657581329345703, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8651474714279175, "num_tokens": 352442357.0, "step": 9238 }, { "epoch": 1.175295763897723, "ewc_loss": 0.026955686509609222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6955685825669207e-05, "grad_norm": 16.55640983581543, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8643215894699097, "num_tokens": 352476884.0, "step": 9239 }, { "epoch": 1.1754229741763134, "ewc_loss": 0.026910532265901566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6910533051704988e-05, "grad_norm": 16.631324768066406, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8670439720153809, "num_tokens": 352513448.0, "step": 9240 }, { "epoch": 1.175550184454904, "ewc_loss": 0.026940839365124702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6940839234157465e-05, "grad_norm": 16.56968116760254, "learning_rate": 1e-06, "loss": 0.3563, "mean_token_accuracy": 0.8850739002227783, "num_tokens": 352548420.0, "step": 9241 }, { "epoch": 1.1756773947334944, "ewc_loss": 0.026957165449857712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.695716466405429e-05, "grad_norm": 16.626970291137695, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8699873685836792, "num_tokens": 352592157.0, "step": 9242 }, { "epoch": 1.175804605012085, "ewc_loss": 0.02702074870467186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.702074925764464e-05, "grad_norm": 16.684932708740234, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8578574657440186, "num_tokens": 352626427.0, "step": 9243 }, { "epoch": 1.1759318152906755, "ewc_loss": 0.026935748755931854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.693574788281694e-05, "grad_norm": 16.63406753540039, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8765643239021301, "num_tokens": 352667075.0, "step": 9244 }, { "epoch": 1.176059025569266, "ewc_loss": 0.026938987895846367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6938987502944656e-05, "grad_norm": 16.594696044921875, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8644009828567505, "num_tokens": 352707114.0, "step": 9245 }, { "epoch": 1.1761862358478565, "ewc_loss": 0.02691981941461563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.691981899261009e-05, "grad_norm": 16.578407287597656, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.873069167137146, "num_tokens": 352738064.0, "step": 9246 }, { "epoch": 1.176313446126447, "ewc_loss": 0.026975825428962708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6975825676345266e-05, "grad_norm": 16.61763572692871, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8516390323638916, "num_tokens": 352779964.0, "step": 9247 }, { "epoch": 1.1764406564050376, "ewc_loss": 0.026928570121526718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.692857015063055e-05, "grad_norm": 16.524560928344727, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8754575252532959, "num_tokens": 352819574.0, "step": 9248 }, { "epoch": 1.1765678666836281, "ewc_loss": 0.026986878365278244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.698687785596121e-05, "grad_norm": 16.65469741821289, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.87596195936203, "num_tokens": 352859151.0, "step": 9249 }, { "epoch": 1.1766950769622184, "ewc_loss": 0.026981493458151817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6981493647326715e-05, "grad_norm": 16.613887786865234, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8456589579582214, "num_tokens": 352901119.0, "step": 9250 }, { "epoch": 1.176822287240809, "ewc_loss": 0.026961054652929306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6961055482388474e-05, "grad_norm": 16.611814498901367, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.866366446018219, "num_tokens": 352945878.0, "step": 9251 }, { "epoch": 1.1769494975193995, "ewc_loss": 0.02696353942155838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6963540221913718e-05, "grad_norm": 16.672170639038086, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8616535663604736, "num_tokens": 352979819.0, "step": 9252 }, { "epoch": 1.17707670779799, "ewc_loss": 0.026938965544104576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6938965675071813e-05, "grad_norm": 16.59174156188965, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8555102348327637, "num_tokens": 353023552.0, "step": 9253 }, { "epoch": 1.1772039180765805, "ewc_loss": 0.02693823166191578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.693823080335278e-05, "grad_norm": 16.669193267822266, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8680253028869629, "num_tokens": 353058178.0, "step": 9254 }, { "epoch": 1.177331128355171, "ewc_loss": 0.026981525123119354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6981524570146576e-05, "grad_norm": 16.59164047241211, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8759299516677856, "num_tokens": 353091428.0, "step": 9255 }, { "epoch": 1.1774583386337616, "ewc_loss": 0.0269851665943861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6985166186932474e-05, "grad_norm": 16.619401931762695, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8703763484954834, "num_tokens": 353131943.0, "step": 9256 }, { "epoch": 1.1775855489123521, "ewc_loss": 0.02696814015507698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6968140446115285e-05, "grad_norm": 16.61075782775879, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8599612712860107, "num_tokens": 353171213.0, "step": 9257 }, { "epoch": 1.1777127591909426, "ewc_loss": 0.026938356459140778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6938356313621625e-05, "grad_norm": 16.623855590820312, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.875175952911377, "num_tokens": 353205057.0, "step": 9258 }, { "epoch": 1.1778399694695332, "ewc_loss": 0.026977889239788055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6977888410328887e-05, "grad_norm": 16.618175506591797, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8522401452064514, "num_tokens": 353251164.0, "step": 9259 }, { "epoch": 1.1779671797481237, "ewc_loss": 0.026983508840203285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6983509087585844e-05, "grad_norm": 16.622806549072266, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.871723473072052, "num_tokens": 353288202.0, "step": 9260 }, { "epoch": 1.1780943900267142, "ewc_loss": 0.026997607201337814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6997608074452728e-05, "grad_norm": 16.69202423095703, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8657888174057007, "num_tokens": 353327396.0, "step": 9261 }, { "epoch": 1.1782216003053048, "ewc_loss": 0.02698606438934803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6986064767697826e-05, "grad_norm": 16.67852783203125, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8548880815505981, "num_tokens": 353366512.0, "step": 9262 }, { "epoch": 1.1783488105838953, "ewc_loss": 0.026983387768268585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6983387215295807e-05, "grad_norm": 16.642017364501953, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8587432503700256, "num_tokens": 353408546.0, "step": 9263 }, { "epoch": 1.1784760208624856, "ewc_loss": 0.026977332308888435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6977331799571402e-05, "grad_norm": 16.677104949951172, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8740087747573853, "num_tokens": 353444950.0, "step": 9264 }, { "epoch": 1.1786032311410761, "ewc_loss": 0.0269791092723608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6979108952218667e-05, "grad_norm": 16.671184539794922, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8744198083877563, "num_tokens": 353483306.0, "step": 9265 }, { "epoch": 1.1787304414196667, "ewc_loss": 0.027007170021533966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.700717050174717e-05, "grad_norm": 16.704038619995117, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8654452562332153, "num_tokens": 353523038.0, "step": 9266 }, { "epoch": 1.1788576516982572, "ewc_loss": 0.026977304369211197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.697730451473035e-05, "grad_norm": 16.631704330444336, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8763872981071472, "num_tokens": 353564266.0, "step": 9267 }, { "epoch": 1.1789848619768477, "ewc_loss": 0.026960203424096107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6960204195347615e-05, "grad_norm": 16.692461013793945, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8704516291618347, "num_tokens": 353603558.0, "step": 9268 }, { "epoch": 1.1791120722554382, "ewc_loss": 0.02695462480187416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.695462535484694e-05, "grad_norm": 16.695205688476562, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8742287158966064, "num_tokens": 353648097.0, "step": 9269 }, { "epoch": 1.1792392825340288, "ewc_loss": 0.02692311629652977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6923116820398718e-05, "grad_norm": 16.721418380737305, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.851190447807312, "num_tokens": 353688970.0, "step": 9270 }, { "epoch": 1.1793664928126193, "ewc_loss": 0.026923293247818947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6923293262370862e-05, "grad_norm": 16.670455932617188, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8559268116950989, "num_tokens": 353732505.0, "step": 9271 }, { "epoch": 1.1794937030912098, "ewc_loss": 0.02688111551105976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6881116355070844e-05, "grad_norm": 16.756513595581055, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.8749990463256836, "num_tokens": 353767460.0, "step": 9272 }, { "epoch": 1.1796209133698004, "ewc_loss": 0.026893477886915207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6893478207057342e-05, "grad_norm": 16.647048950195312, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8758655786514282, "num_tokens": 353801951.0, "step": 9273 }, { "epoch": 1.1797481236483909, "ewc_loss": 0.02683371491730213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6833715310203843e-05, "grad_norm": 16.680017471313477, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8739650249481201, "num_tokens": 353836815.0, "step": 9274 }, { "epoch": 1.1798753339269812, "ewc_loss": 0.026926357299089432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6926356440526433e-05, "grad_norm": 16.707094192504883, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8585705757141113, "num_tokens": 353876296.0, "step": 9275 }, { "epoch": 1.1800025442055717, "ewc_loss": 0.02688012458384037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6880125005845912e-05, "grad_norm": 16.62984275817871, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8502388000488281, "num_tokens": 353911792.0, "step": 9276 }, { "epoch": 1.1801297544841622, "ewc_loss": 0.026872869580984116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6872869057115167e-05, "grad_norm": 16.640050888061523, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8562570810317993, "num_tokens": 353949532.0, "step": 9277 }, { "epoch": 1.1802569647627528, "ewc_loss": 0.02696118876338005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6961188268614933e-05, "grad_norm": 16.68752098083496, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8541553020477295, "num_tokens": 353988622.0, "step": 9278 }, { "epoch": 1.1803841750413433, "ewc_loss": 0.026922723278403282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6922723918687552e-05, "grad_norm": 16.683231353759766, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8707107901573181, "num_tokens": 354028136.0, "step": 9279 }, { "epoch": 1.1805113853199338, "ewc_loss": 0.02692347951233387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6923478799290024e-05, "grad_norm": 16.613693237304688, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8691796660423279, "num_tokens": 354065029.0, "step": 9280 }, { "epoch": 1.1806385955985244, "ewc_loss": 0.02694060280919075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6940602765535004e-05, "grad_norm": 16.68613624572754, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8778389096260071, "num_tokens": 354099279.0, "step": 9281 }, { "epoch": 1.1807658058771149, "ewc_loss": 0.027025489136576653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.702548954403028e-05, "grad_norm": 16.707603454589844, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8626508712768555, "num_tokens": 354138870.0, "step": 9282 }, { "epoch": 1.1808930161557054, "ewc_loss": 0.026939693838357925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.693969327083323e-05, "grad_norm": 16.682621002197266, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.854289710521698, "num_tokens": 354176498.0, "step": 9283 }, { "epoch": 1.181020226434296, "ewc_loss": 0.026983482763171196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6983483621734194e-05, "grad_norm": 16.70326042175293, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8476499915122986, "num_tokens": 354210063.0, "step": 9284 }, { "epoch": 1.1811474367128865, "ewc_loss": 0.02689261920750141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6892619644058868e-05, "grad_norm": 16.600263595581055, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8597220182418823, "num_tokens": 354248095.0, "step": 9285 }, { "epoch": 1.181274646991477, "ewc_loss": 0.02693023532629013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6930234525934793e-05, "grad_norm": 16.60708236694336, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8632044792175293, "num_tokens": 354286897.0, "step": 9286 }, { "epoch": 1.1814018572700675, "ewc_loss": 0.026996387168765068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.699638753256295e-05, "grad_norm": 16.712209701538086, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8751237392425537, "num_tokens": 354330685.0, "step": 9287 }, { "epoch": 1.181529067548658, "ewc_loss": 0.02697121724486351, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6971218176186085e-05, "grad_norm": 16.610008239746094, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8587809205055237, "num_tokens": 354371120.0, "step": 9288 }, { "epoch": 1.1816562778272484, "ewc_loss": 0.026927893981337547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.692789348657243e-05, "grad_norm": 16.7186279296875, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8642293214797974, "num_tokens": 354410825.0, "step": 9289 }, { "epoch": 1.1817834881058389, "ewc_loss": 0.026954667642712593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.695466719160322e-05, "grad_norm": 16.565961837768555, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8635897636413574, "num_tokens": 354453623.0, "step": 9290 }, { "epoch": 1.1819106983844294, "ewc_loss": 0.026932869106531143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6932868422591127e-05, "grad_norm": 16.75612449645996, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8675757646560669, "num_tokens": 354491854.0, "step": 9291 }, { "epoch": 1.18203790866302, "ewc_loss": 0.02702280879020691, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7022808353649452e-05, "grad_norm": 16.667110443115234, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8523024320602417, "num_tokens": 354530788.0, "step": 9292 }, { "epoch": 1.1821651189416105, "ewc_loss": 0.02689248137176037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.68924814008642e-05, "grad_norm": 16.689289093017578, "learning_rate": 1e-06, "loss": 0.3531, "mean_token_accuracy": 0.8858051300048828, "num_tokens": 354568576.0, "step": 9293 }, { "epoch": 1.182292329220201, "ewc_loss": 0.026996178552508354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.699617834878154e-05, "grad_norm": 16.59217643737793, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8664023876190186, "num_tokens": 354611307.0, "step": 9294 }, { "epoch": 1.1824195394987915, "ewc_loss": 0.02689656801521778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6896568670053966e-05, "grad_norm": 16.639129638671875, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.873361349105835, "num_tokens": 354648810.0, "step": 9295 }, { "epoch": 1.182546749777382, "ewc_loss": 0.02699957601726055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6999576220987365e-05, "grad_norm": 16.651485443115234, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.868575930595398, "num_tokens": 354688411.0, "step": 9296 }, { "epoch": 1.1826739600559726, "ewc_loss": 0.02695888839662075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.695888906600885e-05, "grad_norm": 16.594593048095703, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8647786378860474, "num_tokens": 354734700.0, "step": 9297 }, { "epoch": 1.1828011703345631, "ewc_loss": 0.02698480151593685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6984802389051765e-05, "grad_norm": 16.66950798034668, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8635212779045105, "num_tokens": 354765517.0, "step": 9298 }, { "epoch": 1.1829283806131534, "ewc_loss": 0.02698448672890663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6984485884895548e-05, "grad_norm": 16.608877182006836, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8702706098556519, "num_tokens": 354808563.0, "step": 9299 }, { "epoch": 1.183055590891744, "ewc_loss": 0.027009937912225723, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7009937184629962e-05, "grad_norm": 16.672731399536133, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8495145440101624, "num_tokens": 354846618.0, "step": 9300 }, { "epoch": 1.1831828011703345, "ewc_loss": 0.02700049616396427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.700049662962556e-05, "grad_norm": 16.60578727722168, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8695058226585388, "num_tokens": 354881293.0, "step": 9301 }, { "epoch": 1.183310011448925, "ewc_loss": 0.02697118930518627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6971189072355628e-05, "grad_norm": 16.59955596923828, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8642376661300659, "num_tokens": 354925239.0, "step": 9302 }, { "epoch": 1.1834372217275155, "ewc_loss": 0.026966003701090813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.696600313356612e-05, "grad_norm": 16.720407485961914, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.853571355342865, "num_tokens": 354967836.0, "step": 9303 }, { "epoch": 1.183564432006106, "ewc_loss": 0.026983274146914482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6983274437952787e-05, "grad_norm": 16.674182891845703, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8514450192451477, "num_tokens": 355003386.0, "step": 9304 }, { "epoch": 1.1836916422846966, "ewc_loss": 0.02695610746741295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.695610783121083e-05, "grad_norm": 16.73914909362793, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.844738245010376, "num_tokens": 355043391.0, "step": 9305 }, { "epoch": 1.1838188525632871, "ewc_loss": 0.02699934132397175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6999341571354307e-05, "grad_norm": 16.663162231445312, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8679342269897461, "num_tokens": 355082820.0, "step": 9306 }, { "epoch": 1.1839460628418776, "ewc_loss": 0.026954513043165207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.695451257750392e-05, "grad_norm": 16.650022506713867, "learning_rate": 1e-06, "loss": 0.3528, "mean_token_accuracy": 0.8871771693229675, "num_tokens": 355117531.0, "step": 9307 }, { "epoch": 1.1840732731204682, "ewc_loss": 0.02701820246875286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7018202672479674e-05, "grad_norm": 16.692289352416992, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8543614745140076, "num_tokens": 355159485.0, "step": 9308 }, { "epoch": 1.1842004833990587, "ewc_loss": 0.02700957842171192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7009578843717463e-05, "grad_norm": 16.681211471557617, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8616390228271484, "num_tokens": 355206051.0, "step": 9309 }, { "epoch": 1.1843276936776492, "ewc_loss": 0.02696017548441887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6960175091517158e-05, "grad_norm": 16.64791488647461, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8578121662139893, "num_tokens": 355242933.0, "step": 9310 }, { "epoch": 1.1844549039562398, "ewc_loss": 0.02696344070136547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6963440177496523e-05, "grad_norm": 16.638652801513672, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8786699771881104, "num_tokens": 355286311.0, "step": 9311 }, { "epoch": 1.1845821142348303, "ewc_loss": 0.02704663760960102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7046637114835903e-05, "grad_norm": 16.654842376708984, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8663665056228638, "num_tokens": 355327844.0, "step": 9312 }, { "epoch": 1.1847093245134206, "ewc_loss": 0.026967570185661316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6967571102431975e-05, "grad_norm": 16.590669631958008, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8750907778739929, "num_tokens": 355368977.0, "step": 9313 }, { "epoch": 1.1848365347920111, "ewc_loss": 0.027034303173422813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.703430254769046e-05, "grad_norm": 16.671051025390625, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8565152287483215, "num_tokens": 355411479.0, "step": 9314 }, { "epoch": 1.1849637450706016, "ewc_loss": 0.027046823874115944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7046824470744468e-05, "grad_norm": 16.66556739807129, "learning_rate": 1e-06, "loss": 0.3653, "mean_token_accuracy": 0.8839192986488342, "num_tokens": 355447370.0, "step": 9315 }, { "epoch": 1.1850909553491922, "ewc_loss": 0.02698924019932747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6989240723196417e-05, "grad_norm": 16.600419998168945, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8637277483940125, "num_tokens": 355490335.0, "step": 9316 }, { "epoch": 1.1852181656277827, "ewc_loss": 0.026955077424645424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6955078283208422e-05, "grad_norm": 16.66975975036621, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8669531345367432, "num_tokens": 355527843.0, "step": 9317 }, { "epoch": 1.1853453759063732, "ewc_loss": 0.027087586000561714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7087586204288527e-05, "grad_norm": 16.743127822875977, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8645800948143005, "num_tokens": 355566094.0, "step": 9318 }, { "epoch": 1.1854725861849638, "ewc_loss": 0.026946797966957092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.694679824344348e-05, "grad_norm": 16.58246421813965, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8743235468864441, "num_tokens": 355609505.0, "step": 9319 }, { "epoch": 1.1855997964635543, "ewc_loss": 0.026965538039803505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.696553747227881e-05, "grad_norm": 16.708175659179688, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.841162919998169, "num_tokens": 355650930.0, "step": 9320 }, { "epoch": 1.1857270067421448, "ewc_loss": 0.027012772858142853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.701277298911009e-05, "grad_norm": 16.715225219726562, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8758864402770996, "num_tokens": 355684075.0, "step": 9321 }, { "epoch": 1.1858542170207353, "ewc_loss": 0.02696654014289379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6966539735440165e-05, "grad_norm": 16.6750545501709, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8713314533233643, "num_tokens": 355723567.0, "step": 9322 }, { "epoch": 1.1859814272993259, "ewc_loss": 0.026955679059028625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6955678549711592e-05, "grad_norm": 16.638765335083008, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.852460503578186, "num_tokens": 355764915.0, "step": 9323 }, { "epoch": 1.1861086375779162, "ewc_loss": 0.02696230076253414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6962301490129903e-05, "grad_norm": 16.74951934814453, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.867929220199585, "num_tokens": 355804950.0, "step": 9324 }, { "epoch": 1.1862358478565067, "ewc_loss": 0.026981333270668983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6981333576259203e-05, "grad_norm": 16.65579605102539, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.874142587184906, "num_tokens": 355844155.0, "step": 9325 }, { "epoch": 1.1863630581350972, "ewc_loss": 0.02692868933081627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6928690203931183e-05, "grad_norm": 16.707632064819336, "learning_rate": 1e-06, "loss": 0.3665, "mean_token_accuracy": 0.8822489976882935, "num_tokens": 355883971.0, "step": 9326 }, { "epoch": 1.1864902684136878, "ewc_loss": 0.02700888365507126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.700888398976531e-05, "grad_norm": 16.747159957885742, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8610036373138428, "num_tokens": 355922271.0, "step": 9327 }, { "epoch": 1.1866174786922783, "ewc_loss": 0.026904551312327385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.690455221454613e-05, "grad_norm": 16.611379623413086, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8655996918678284, "num_tokens": 355961087.0, "step": 9328 }, { "epoch": 1.1867446889708688, "ewc_loss": 0.026940232142806053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.694023169169668e-05, "grad_norm": 16.676462173461914, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8699856400489807, "num_tokens": 355999553.0, "step": 9329 }, { "epoch": 1.1868718992494594, "ewc_loss": 0.026994969695806503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6994970539817587e-05, "grad_norm": 16.677288055419922, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8712509870529175, "num_tokens": 356038503.0, "step": 9330 }, { "epoch": 1.1869991095280499, "ewc_loss": 0.026922520250082016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6922520191874355e-05, "grad_norm": 16.651254653930664, "learning_rate": 1e-06, "loss": 0.3822, "mean_token_accuracy": 0.8767259120941162, "num_tokens": 356078538.0, "step": 9331 }, { "epoch": 1.1871263198066404, "ewc_loss": 0.02696545235812664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6965451979776844e-05, "grad_norm": 16.619333267211914, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.8791131973266602, "num_tokens": 356115031.0, "step": 9332 }, { "epoch": 1.187253530085231, "ewc_loss": 0.026928920298814774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.692891939659603e-05, "grad_norm": 16.676551818847656, "learning_rate": 1e-06, "loss": 0.3668, "mean_token_accuracy": 0.8807817697525024, "num_tokens": 356154751.0, "step": 9333 }, { "epoch": 1.1873807403638215, "ewc_loss": 0.026988474652171135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6988474928657524e-05, "grad_norm": 16.617538452148438, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8514269590377808, "num_tokens": 356192105.0, "step": 9334 }, { "epoch": 1.187507950642412, "ewc_loss": 0.02695460058748722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.695459988899529e-05, "grad_norm": 16.669105529785156, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8565307259559631, "num_tokens": 356224964.0, "step": 9335 }, { "epoch": 1.1876351609210025, "ewc_loss": 0.02699270099401474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.699270044104196e-05, "grad_norm": 16.677886962890625, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8712993264198303, "num_tokens": 356261565.0, "step": 9336 }, { "epoch": 1.187762371199593, "ewc_loss": 0.026972107589244843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.697210766200442e-05, "grad_norm": 16.60624122619629, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8677232265472412, "num_tokens": 356301886.0, "step": 9337 }, { "epoch": 1.1878895814781834, "ewc_loss": 0.027027631178498268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7027630494558252e-05, "grad_norm": 16.654129028320312, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8745238780975342, "num_tokens": 356339481.0, "step": 9338 }, { "epoch": 1.1880167917567739, "ewc_loss": 0.027055174112319946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7055173632106744e-05, "grad_norm": 16.665971755981445, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8431118130683899, "num_tokens": 356387038.0, "step": 9339 }, { "epoch": 1.1881440020353644, "ewc_loss": 0.027048511430621147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7048510673921555e-05, "grad_norm": 16.640911102294922, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8631169199943542, "num_tokens": 356433256.0, "step": 9340 }, { "epoch": 1.188271212313955, "ewc_loss": 0.02702423743903637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.702423807932064e-05, "grad_norm": 16.706958770751953, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8555954098701477, "num_tokens": 356471282.0, "step": 9341 }, { "epoch": 1.1883984225925455, "ewc_loss": 0.027058083564043045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7058084015152417e-05, "grad_norm": 16.630802154541016, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8723350167274475, "num_tokens": 356508951.0, "step": 9342 }, { "epoch": 1.188525632871136, "ewc_loss": 0.027039041742682457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7039041015086696e-05, "grad_norm": 16.701147079467773, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8693108558654785, "num_tokens": 356544487.0, "step": 9343 }, { "epoch": 1.1886528431497265, "ewc_loss": 0.027133818715810776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7133819457958452e-05, "grad_norm": 16.665708541870117, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8726847171783447, "num_tokens": 356580873.0, "step": 9344 }, { "epoch": 1.188780053428317, "ewc_loss": 0.02705051563680172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7050515200244263e-05, "grad_norm": 16.71741485595703, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.863999605178833, "num_tokens": 356613622.0, "step": 9345 }, { "epoch": 1.1889072637069076, "ewc_loss": 0.027097593992948532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7097594283986837e-05, "grad_norm": 16.709842681884766, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8439333438873291, "num_tokens": 356660058.0, "step": 9346 }, { "epoch": 1.189034473985498, "ewc_loss": 0.027042297646403313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7042297006119043e-05, "grad_norm": 16.684465408325195, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.8663858771324158, "num_tokens": 356701614.0, "step": 9347 }, { "epoch": 1.1891616842640884, "ewc_loss": 0.027035875245928764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7035875973524526e-05, "grad_norm": 16.647506713867188, "learning_rate": 1e-06, "loss": 0.3616, "mean_token_accuracy": 0.881459653377533, "num_tokens": 356737105.0, "step": 9348 }, { "epoch": 1.189288894542679, "ewc_loss": 0.02707587368786335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7075873731519096e-05, "grad_norm": 16.706096649169922, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.857377290725708, "num_tokens": 356779382.0, "step": 9349 }, { "epoch": 1.1894161048212695, "ewc_loss": 0.027042562142014503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.704256257857196e-05, "grad_norm": 16.64434051513672, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8608887791633606, "num_tokens": 356813836.0, "step": 9350 }, { "epoch": 1.18954331509986, "ewc_loss": 0.02706034481525421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7060345018981025e-05, "grad_norm": 16.680395126342773, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8572642803192139, "num_tokens": 356852591.0, "step": 9351 }, { "epoch": 1.1896705253784505, "ewc_loss": 0.027088504284620285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7088504793937318e-05, "grad_norm": 16.671911239624023, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8643602132797241, "num_tokens": 356889940.0, "step": 9352 }, { "epoch": 1.189797735657041, "ewc_loss": 0.02702602744102478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.702602796489373e-05, "grad_norm": 16.639772415161133, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8696193695068359, "num_tokens": 356927878.0, "step": 9353 }, { "epoch": 1.1899249459356316, "ewc_loss": 0.027046507224440575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.704650796658825e-05, "grad_norm": 16.674877166748047, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8761037588119507, "num_tokens": 356963339.0, "step": 9354 }, { "epoch": 1.1900521562142221, "ewc_loss": 0.027069946750998497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7069947464042343e-05, "grad_norm": 16.620786666870117, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8700810670852661, "num_tokens": 357002999.0, "step": 9355 }, { "epoch": 1.1901793664928126, "ewc_loss": 0.027049757540225983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7049756681662984e-05, "grad_norm": 16.649261474609375, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8691712617874146, "num_tokens": 357044305.0, "step": 9356 }, { "epoch": 1.1903065767714032, "ewc_loss": 0.027043962851166725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.704396320041269e-05, "grad_norm": 16.618207931518555, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8682351112365723, "num_tokens": 357082197.0, "step": 9357 }, { "epoch": 1.1904337870499937, "ewc_loss": 0.0270672794431448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.706727900658734e-05, "grad_norm": 16.640920639038086, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8484082818031311, "num_tokens": 357119541.0, "step": 9358 }, { "epoch": 1.1905609973285842, "ewc_loss": 0.02704448252916336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7044483431382105e-05, "grad_norm": 16.589996337890625, "learning_rate": 1e-06, "loss": 0.3502, "mean_token_accuracy": 0.8879032135009766, "num_tokens": 357159176.0, "step": 9359 }, { "epoch": 1.1906882076071748, "ewc_loss": 0.027076024562120438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.707602470763959e-05, "grad_norm": 16.70522689819336, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8664714097976685, "num_tokens": 357199204.0, "step": 9360 }, { "epoch": 1.1908154178857653, "ewc_loss": 0.027064485475420952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7064485038863495e-05, "grad_norm": 16.597915649414062, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.8809900283813477, "num_tokens": 357236900.0, "step": 9361 }, { "epoch": 1.1909426281643556, "ewc_loss": 0.027023935690522194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.702393612707965e-05, "grad_norm": 16.654796600341797, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.866765022277832, "num_tokens": 357271622.0, "step": 9362 }, { "epoch": 1.1910698384429461, "ewc_loss": 0.027103547006845474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7103547836304642e-05, "grad_norm": 16.68014144897461, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8776007294654846, "num_tokens": 357311145.0, "step": 9363 }, { "epoch": 1.1911970487215366, "ewc_loss": 0.027049345895648003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7049345590057783e-05, "grad_norm": 16.57931137084961, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8835259079933167, "num_tokens": 357344609.0, "step": 9364 }, { "epoch": 1.1913242590001272, "ewc_loss": 0.027091994881629944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7091995434602723e-05, "grad_norm": 16.720701217651367, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8642836809158325, "num_tokens": 357378896.0, "step": 9365 }, { "epoch": 1.1914514692787177, "ewc_loss": 0.027135878801345825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7135878553963266e-05, "grad_norm": 16.608049392700195, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.870762825012207, "num_tokens": 357424730.0, "step": 9366 }, { "epoch": 1.1915786795573082, "ewc_loss": 0.02703286148607731, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.703286190808285e-05, "grad_norm": 16.6318302154541, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8483974933624268, "num_tokens": 357461152.0, "step": 9367 }, { "epoch": 1.1917058898358988, "ewc_loss": 0.027115043252706528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7115043849335052e-05, "grad_norm": 16.684619903564453, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8602921366691589, "num_tokens": 357502305.0, "step": 9368 }, { "epoch": 1.1918331001144893, "ewc_loss": 0.027066243812441826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7066244001616724e-05, "grad_norm": 16.589754104614258, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8616961240768433, "num_tokens": 357539187.0, "step": 9369 }, { "epoch": 1.1919603103930798, "ewc_loss": 0.027126494795084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7126494387630373e-05, "grad_norm": 16.735597610473633, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8565570116043091, "num_tokens": 357569520.0, "step": 9370 }, { "epoch": 1.1920875206716703, "ewc_loss": 0.027119262143969536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7119262085761875e-05, "grad_norm": 16.632814407348633, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8664088249206543, "num_tokens": 357605253.0, "step": 9371 }, { "epoch": 1.1922147309502609, "ewc_loss": 0.027119124308228493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7119123842567205e-05, "grad_norm": 16.687421798706055, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8775651454925537, "num_tokens": 357639701.0, "step": 9372 }, { "epoch": 1.1923419412288512, "ewc_loss": 0.02716195210814476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7161951948073693e-05, "grad_norm": 16.616506576538086, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8751177191734314, "num_tokens": 357675611.0, "step": 9373 }, { "epoch": 1.1924691515074417, "ewc_loss": 0.027109520509839058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7109521397505887e-05, "grad_norm": 16.65391731262207, "learning_rate": 1e-06, "loss": 0.38, "mean_token_accuracy": 0.8780556321144104, "num_tokens": 357708521.0, "step": 9374 }, { "epoch": 1.1925963617860322, "ewc_loss": 0.027191275730729103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7191275876248255e-05, "grad_norm": 16.649324417114258, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8569438457489014, "num_tokens": 357749970.0, "step": 9375 }, { "epoch": 1.1927235720646228, "ewc_loss": 0.02714303322136402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7143032639287412e-05, "grad_norm": 16.72765350341797, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8761805295944214, "num_tokens": 357783287.0, "step": 9376 }, { "epoch": 1.1928507823432133, "ewc_loss": 0.02720063552260399, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7200636395718902e-05, "grad_norm": 16.674102783203125, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8679248094558716, "num_tokens": 357818883.0, "step": 9377 }, { "epoch": 1.1929779926218038, "ewc_loss": 0.027118364349007607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7118363504996523e-05, "grad_norm": 16.588300704956055, "learning_rate": 1e-06, "loss": 0.3727, "mean_token_accuracy": 0.8792514801025391, "num_tokens": 357862862.0, "step": 9378 }, { "epoch": 1.1931052029003943, "ewc_loss": 0.027168305590748787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7168305678060278e-05, "grad_norm": 16.667068481445312, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8558335900306702, "num_tokens": 357901574.0, "step": 9379 }, { "epoch": 1.1932324131789849, "ewc_loss": 0.027163993567228317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.716399285418447e-05, "grad_norm": 16.694913864135742, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8648307919502258, "num_tokens": 357941822.0, "step": 9380 }, { "epoch": 1.1933596234575754, "ewc_loss": 0.0271979458630085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7197946110391058e-05, "grad_norm": 16.696022033691406, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8596667051315308, "num_tokens": 357982820.0, "step": 9381 }, { "epoch": 1.193486833736166, "ewc_loss": 0.02715124934911728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.715124901442323e-05, "grad_norm": 16.685317993164062, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8643839955329895, "num_tokens": 358020491.0, "step": 9382 }, { "epoch": 1.1936140440147565, "ewc_loss": 0.027101190760731697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7101190426037647e-05, "grad_norm": 16.573503494262695, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8682003021240234, "num_tokens": 358059775.0, "step": 9383 }, { "epoch": 1.193741254293347, "ewc_loss": 0.02717607654631138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7176076400792226e-05, "grad_norm": 16.72442054748535, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8741216659545898, "num_tokens": 358098672.0, "step": 9384 }, { "epoch": 1.1938684645719375, "ewc_loss": 0.027238350361585617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.723834950302262e-05, "grad_norm": 16.699819564819336, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8588409423828125, "num_tokens": 358140850.0, "step": 9385 }, { "epoch": 1.193995674850528, "ewc_loss": 0.02706768922507763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7067690098192543e-05, "grad_norm": 16.675891876220703, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8731001615524292, "num_tokens": 358175503.0, "step": 9386 }, { "epoch": 1.1941228851291183, "ewc_loss": 0.027183465659618378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.718346513574943e-05, "grad_norm": 16.694971084594727, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8701891899108887, "num_tokens": 358210615.0, "step": 9387 }, { "epoch": 1.1942500954077089, "ewc_loss": 0.027130935341119766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.713093454076443e-05, "grad_norm": 16.691490173339844, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.869092583656311, "num_tokens": 358246605.0, "step": 9388 }, { "epoch": 1.1943773056862994, "ewc_loss": 0.02709011547267437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7090114599559456e-05, "grad_norm": 16.708459854125977, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8635563254356384, "num_tokens": 358283136.0, "step": 9389 }, { "epoch": 1.19450451596489, "ewc_loss": 0.027163328602910042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7163328923052177e-05, "grad_norm": 16.711156845092773, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8752427101135254, "num_tokens": 358325437.0, "step": 9390 }, { "epoch": 1.1946317262434805, "ewc_loss": 0.02712903916835785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7129039153805934e-05, "grad_norm": 16.678808212280273, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8666661381721497, "num_tokens": 358365360.0, "step": 9391 }, { "epoch": 1.194758936522071, "ewc_loss": 0.027118144556879997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7118145226268098e-05, "grad_norm": 16.676603317260742, "learning_rate": 1e-06, "loss": 0.3537, "mean_token_accuracy": 0.8834390640258789, "num_tokens": 358406502.0, "step": 9392 }, { "epoch": 1.1948861468006615, "ewc_loss": 0.027101580053567886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7101579689770006e-05, "grad_norm": 16.809730529785156, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.861788272857666, "num_tokens": 358437220.0, "step": 9393 }, { "epoch": 1.195013357079252, "ewc_loss": 0.027109552174806595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7109552320325747e-05, "grad_norm": 16.712669372558594, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8560433387756348, "num_tokens": 358475314.0, "step": 9394 }, { "epoch": 1.1951405673578426, "ewc_loss": 0.027060730382800102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7060730644734576e-05, "grad_norm": 16.805635452270508, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8768499493598938, "num_tokens": 358520777.0, "step": 9395 }, { "epoch": 1.195267777636433, "ewc_loss": 0.02711138129234314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7111382223665714e-05, "grad_norm": 16.670011520385742, "learning_rate": 1e-06, "loss": 0.3578, "mean_token_accuracy": 0.8869093656539917, "num_tokens": 358557973.0, "step": 9396 }, { "epoch": 1.1953949879150234, "ewc_loss": 0.027001023292541504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7001024136552587e-05, "grad_norm": 16.846303939819336, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8738305568695068, "num_tokens": 358596795.0, "step": 9397 }, { "epoch": 1.195522198193614, "ewc_loss": 0.027086744084954262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7086744012194686e-05, "grad_norm": 16.67574119567871, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8677079677581787, "num_tokens": 358632146.0, "step": 9398 }, { "epoch": 1.1956494084722045, "ewc_loss": 0.026978323236107826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6978323148796335e-05, "grad_norm": 16.742029190063477, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8750838041305542, "num_tokens": 358672194.0, "step": 9399 }, { "epoch": 1.195776618750795, "ewc_loss": 0.027091681957244873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7091682568425313e-05, "grad_norm": 16.74472427368164, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8741729855537415, "num_tokens": 358710192.0, "step": 9400 }, { "epoch": 1.1959038290293855, "ewc_loss": 0.027028966695070267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.702896745176986e-05, "grad_norm": 16.711776733398438, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8538332581520081, "num_tokens": 358749284.0, "step": 9401 }, { "epoch": 1.196031039307976, "ewc_loss": 0.02704653888940811, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.704653888940811e-05, "grad_norm": 16.803586959838867, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8635539412498474, "num_tokens": 358782830.0, "step": 9402 }, { "epoch": 1.1961582495865666, "ewc_loss": 0.027018830180168152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7018830223823898e-05, "grad_norm": 16.708824157714844, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8644417524337769, "num_tokens": 358820696.0, "step": 9403 }, { "epoch": 1.196285459865157, "ewc_loss": 0.027023140341043472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7023139409720898e-05, "grad_norm": 16.810453414916992, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8740830421447754, "num_tokens": 358853616.0, "step": 9404 }, { "epoch": 1.1964126701437476, "ewc_loss": 0.027146771550178528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.71467706625117e-05, "grad_norm": 16.762508392333984, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8755089044570923, "num_tokens": 358893420.0, "step": 9405 }, { "epoch": 1.1965398804223382, "ewc_loss": 0.026971755549311638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.697175477806013e-05, "grad_norm": 16.7891902923584, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8685169219970703, "num_tokens": 358930704.0, "step": 9406 }, { "epoch": 1.1966670907009287, "ewc_loss": 0.027026282623410225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7026282623410225e-05, "grad_norm": 16.652990341186523, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.8807109594345093, "num_tokens": 358967730.0, "step": 9407 }, { "epoch": 1.1967943009795192, "ewc_loss": 0.026958387345075607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.6958387024933472e-05, "grad_norm": 16.720956802368164, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8792005777359009, "num_tokens": 359007241.0, "step": 9408 }, { "epoch": 1.1969215112581097, "ewc_loss": 0.0270445104688406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7044510716223158e-05, "grad_norm": 16.724607467651367, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8548343181610107, "num_tokens": 359045573.0, "step": 9409 }, { "epoch": 1.1970487215367003, "ewc_loss": 0.027021095156669617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7021094865631312e-05, "grad_norm": 16.728111267089844, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8707631230354309, "num_tokens": 359083420.0, "step": 9410 }, { "epoch": 1.1971759318152906, "ewc_loss": 0.02703830413520336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.703830432437826e-05, "grad_norm": 16.70902442932129, "learning_rate": 1e-06, "loss": 0.3655, "mean_token_accuracy": 0.8801473379135132, "num_tokens": 359117555.0, "step": 9411 }, { "epoch": 1.1973031420938811, "ewc_loss": 0.02699677273631096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.69967731583165e-05, "grad_norm": 16.68647003173828, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8646197319030762, "num_tokens": 359154597.0, "step": 9412 }, { "epoch": 1.1974303523724716, "ewc_loss": 0.027047963812947273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7047963158111088e-05, "grad_norm": 16.74380111694336, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8628032803535461, "num_tokens": 359190074.0, "step": 9413 }, { "epoch": 1.1975575626510622, "ewc_loss": 0.02707124873995781, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.707124804146588e-05, "grad_norm": 16.64989471435547, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8565353751182556, "num_tokens": 359232711.0, "step": 9414 }, { "epoch": 1.1976847729296527, "ewc_loss": 0.027073659002780914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7073658202425577e-05, "grad_norm": 16.763385772705078, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8674712181091309, "num_tokens": 359265582.0, "step": 9415 }, { "epoch": 1.1978119832082432, "ewc_loss": 0.02709241770207882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7092417440144345e-05, "grad_norm": 16.667394638061523, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8750891089439392, "num_tokens": 359303177.0, "step": 9416 }, { "epoch": 1.1979391934868338, "ewc_loss": 0.02705969475209713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.705969563976396e-05, "grad_norm": 16.72635269165039, "learning_rate": 1e-06, "loss": 0.3741, "mean_token_accuracy": 0.8768596649169922, "num_tokens": 359338712.0, "step": 9417 }, { "epoch": 1.1980664037654243, "ewc_loss": 0.027093222364783287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7093223252450116e-05, "grad_norm": 16.7293758392334, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8760805726051331, "num_tokens": 359372849.0, "step": 9418 }, { "epoch": 1.1981936140440148, "ewc_loss": 0.027104554697871208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7104555556434207e-05, "grad_norm": 16.777732849121094, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8727445602416992, "num_tokens": 359407981.0, "step": 9419 }, { "epoch": 1.1983208243226053, "ewc_loss": 0.02710863947868347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7108639187645167e-05, "grad_norm": 16.690061569213867, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8465499877929688, "num_tokens": 359450180.0, "step": 9420 }, { "epoch": 1.1984480346011959, "ewc_loss": 0.02707395702600479, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7073956516687758e-05, "grad_norm": 16.779064178466797, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8668475151062012, "num_tokens": 359487909.0, "step": 9421 }, { "epoch": 1.1985752448797862, "ewc_loss": 0.027042802423238754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.704280268517323e-05, "grad_norm": 16.685321807861328, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8795228004455566, "num_tokens": 359522609.0, "step": 9422 }, { "epoch": 1.1987024551583767, "ewc_loss": 0.02707321010529995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7073210731032304e-05, "grad_norm": 16.723215103149414, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8644505739212036, "num_tokens": 359560735.0, "step": 9423 }, { "epoch": 1.1988296654369672, "ewc_loss": 0.02705291099846363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7052910809288733e-05, "grad_norm": 16.70156478881836, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8612892031669617, "num_tokens": 359602994.0, "step": 9424 }, { "epoch": 1.1989568757155578, "ewc_loss": 0.027058418840169907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.705841870920267e-05, "grad_norm": 16.614168167114258, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8676599860191345, "num_tokens": 359638864.0, "step": 9425 }, { "epoch": 1.1990840859941483, "ewc_loss": 0.02710040658712387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.710040644160472e-05, "grad_norm": 16.71359634399414, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8755698800086975, "num_tokens": 359674202.0, "step": 9426 }, { "epoch": 1.1992112962727388, "ewc_loss": 0.027113940566778183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.71139397227671e-05, "grad_norm": 16.68522071838379, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8735495209693909, "num_tokens": 359710408.0, "step": 9427 }, { "epoch": 1.1993385065513293, "ewc_loss": 0.027107205241918564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7107205823995173e-05, "grad_norm": 16.7077579498291, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.875612735748291, "num_tokens": 359749068.0, "step": 9428 }, { "epoch": 1.1994657168299199, "ewc_loss": 0.027127422392368317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.712742207222618e-05, "grad_norm": 16.647693634033203, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.868357241153717, "num_tokens": 359788449.0, "step": 9429 }, { "epoch": 1.1995929271085104, "ewc_loss": 0.027084844186902046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7084844987257384e-05, "grad_norm": 16.71145248413086, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8621747493743896, "num_tokens": 359824167.0, "step": 9430 }, { "epoch": 1.199720137387101, "ewc_loss": 0.027171431109309196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.717143070185557e-05, "grad_norm": 16.781118392944336, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8628078103065491, "num_tokens": 359861476.0, "step": 9431 }, { "epoch": 1.1998473476656915, "ewc_loss": 0.027160294353961945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7160294848727062e-05, "grad_norm": 16.68821907043457, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8660678267478943, "num_tokens": 359906463.0, "step": 9432 }, { "epoch": 1.199974557944282, "ewc_loss": 0.027108119800686836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7108118956675753e-05, "grad_norm": 16.718786239624023, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8711405396461487, "num_tokens": 359940576.0, "step": 9433 }, { "epoch": 1.2001017682228725, "ewc_loss": 0.027166664600372314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.716666494961828e-05, "grad_norm": 16.737462997436523, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8775948286056519, "num_tokens": 359973616.0, "step": 9434 }, { "epoch": 1.200228978501463, "ewc_loss": 0.02712462842464447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7124628104502335e-05, "grad_norm": 16.723922729492188, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8798437118530273, "num_tokens": 360011059.0, "step": 9435 }, { "epoch": 1.2003561887800533, "ewc_loss": 0.027133021503686905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7133020921610296e-05, "grad_norm": 16.724929809570312, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.876708447933197, "num_tokens": 360048307.0, "step": 9436 }, { "epoch": 1.2004833990586439, "ewc_loss": 0.027082454413175583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7082454835181125e-05, "grad_norm": 16.728342056274414, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8677988052368164, "num_tokens": 360086946.0, "step": 9437 }, { "epoch": 1.2006106093372344, "ewc_loss": 0.027117934077978134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7117934223497286e-05, "grad_norm": 16.705270767211914, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8719005584716797, "num_tokens": 360121908.0, "step": 9438 }, { "epoch": 1.200737819615825, "ewc_loss": 0.02708078734576702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7080786821898073e-05, "grad_norm": 16.690515518188477, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8582871556282043, "num_tokens": 360164349.0, "step": 9439 }, { "epoch": 1.2008650298944155, "ewc_loss": 0.027165941894054413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.716594099183567e-05, "grad_norm": 16.762495040893555, "learning_rate": 1e-06, "loss": 0.3538, "mean_token_accuracy": 0.8830113410949707, "num_tokens": 360197660.0, "step": 9440 }, { "epoch": 1.200992240173006, "ewc_loss": 0.027117077261209488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7117077479488216e-05, "grad_norm": 16.712444305419922, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8463265895843506, "num_tokens": 360240297.0, "step": 9441 }, { "epoch": 1.2011194504515965, "ewc_loss": 0.02710125222802162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7101252271677367e-05, "grad_norm": 16.690597534179688, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8577231168746948, "num_tokens": 360281566.0, "step": 9442 }, { "epoch": 1.201246660730187, "ewc_loss": 0.027160286903381348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7160287572769448e-05, "grad_norm": 16.729557037353516, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8733616471290588, "num_tokens": 360313927.0, "step": 9443 }, { "epoch": 1.2013738710087776, "ewc_loss": 0.02711750566959381, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.711750494199805e-05, "grad_norm": 16.71485710144043, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8721994161605835, "num_tokens": 360349369.0, "step": 9444 }, { "epoch": 1.201501081287368, "ewc_loss": 0.027149811387062073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7149812012794428e-05, "grad_norm": 16.687602996826172, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8668551445007324, "num_tokens": 360386996.0, "step": 9445 }, { "epoch": 1.2016282915659584, "ewc_loss": 0.027136126533150673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.713612593652215e-05, "grad_norm": 16.704811096191406, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8728339076042175, "num_tokens": 360424883.0, "step": 9446 }, { "epoch": 1.201755501844549, "ewc_loss": 0.027172625064849854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.71726257778937e-05, "grad_norm": 16.710084915161133, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8731186389923096, "num_tokens": 360457992.0, "step": 9447 }, { "epoch": 1.2018827121231395, "ewc_loss": 0.027140142396092415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7140142265125178e-05, "grad_norm": 16.709596633911133, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8530610799789429, "num_tokens": 360497934.0, "step": 9448 }, { "epoch": 1.20200992240173, "ewc_loss": 0.02715030498802662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.715030495892279e-05, "grad_norm": 16.72943115234375, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8655257225036621, "num_tokens": 360533928.0, "step": 9449 }, { "epoch": 1.2021371326803205, "ewc_loss": 0.027183517813682556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7183517886442132e-05, "grad_norm": 16.814180374145508, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8707172274589539, "num_tokens": 360571076.0, "step": 9450 }, { "epoch": 1.202264342958911, "ewc_loss": 0.027157442644238472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7157442673342302e-05, "grad_norm": 16.745956420898438, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8598432540893555, "num_tokens": 360611957.0, "step": 9451 }, { "epoch": 1.2023915532375016, "ewc_loss": 0.027158204466104507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7158204829902388e-05, "grad_norm": 16.747722625732422, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.861725389957428, "num_tokens": 360647823.0, "step": 9452 }, { "epoch": 1.202518763516092, "ewc_loss": 0.02716594934463501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7165950086782686e-05, "grad_norm": 16.786685943603516, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8592677712440491, "num_tokens": 360688341.0, "step": 9453 }, { "epoch": 1.2026459737946826, "ewc_loss": 0.027091309428215027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7091309675597586e-05, "grad_norm": 16.670785903930664, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8603692054748535, "num_tokens": 360730417.0, "step": 9454 }, { "epoch": 1.2027731840732732, "ewc_loss": 0.027159955352544785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.71599546977086e-05, "grad_norm": 16.74751091003418, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8489190340042114, "num_tokens": 360771931.0, "step": 9455 }, { "epoch": 1.2029003943518637, "ewc_loss": 0.027142858132719994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7142858016304672e-05, "grad_norm": 16.74857521057129, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8657177686691284, "num_tokens": 360811460.0, "step": 9456 }, { "epoch": 1.2030276046304542, "ewc_loss": 0.027077198028564453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7077197955804877e-05, "grad_norm": 16.69559669494629, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8801382780075073, "num_tokens": 360849476.0, "step": 9457 }, { "epoch": 1.2031548149090447, "ewc_loss": 0.02714444138109684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7144440537085757e-05, "grad_norm": 16.766027450561523, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8574575781822205, "num_tokens": 360891178.0, "step": 9458 }, { "epoch": 1.2032820251876353, "ewc_loss": 0.027129612863063812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.712961213546805e-05, "grad_norm": 16.743850708007812, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8637000322341919, "num_tokens": 360930503.0, "step": 9459 }, { "epoch": 1.2034092354662256, "ewc_loss": 0.02713344618678093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7133446565130726e-05, "grad_norm": 16.709442138671875, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8647609353065491, "num_tokens": 360963730.0, "step": 9460 }, { "epoch": 1.203536445744816, "ewc_loss": 0.027144767343997955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7144767955178395e-05, "grad_norm": 16.6285400390625, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8534192442893982, "num_tokens": 361003796.0, "step": 9461 }, { "epoch": 1.2036636560234066, "ewc_loss": 0.027120191603899002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7120191589347087e-05, "grad_norm": 16.757169723510742, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8680709004402161, "num_tokens": 361044818.0, "step": 9462 }, { "epoch": 1.2037908663019972, "ewc_loss": 0.027202175930142403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7202175260754302e-05, "grad_norm": 16.647010803222656, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8778764009475708, "num_tokens": 361080692.0, "step": 9463 }, { "epoch": 1.2039180765805877, "ewc_loss": 0.0270888302475214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7088830393040553e-05, "grad_norm": 16.75743865966797, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.859397828578949, "num_tokens": 361116609.0, "step": 9464 }, { "epoch": 1.2040452868591782, "ewc_loss": 0.027219438925385475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7219439289183356e-05, "grad_norm": 16.730098724365234, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8608126640319824, "num_tokens": 361154559.0, "step": 9465 }, { "epoch": 1.2041724971377687, "ewc_loss": 0.02713198959827423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7131989554618485e-05, "grad_norm": 16.75465965270996, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8766884207725525, "num_tokens": 361191005.0, "step": 9466 }, { "epoch": 1.2042997074163593, "ewc_loss": 0.02719770558178425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.719770600378979e-05, "grad_norm": 16.781431198120117, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8652149438858032, "num_tokens": 361230191.0, "step": 9467 }, { "epoch": 1.2044269176949498, "ewc_loss": 0.027115505188703537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7115505872643553e-05, "grad_norm": 16.731760025024414, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8608709573745728, "num_tokens": 361269132.0, "step": 9468 }, { "epoch": 1.2045541279735403, "ewc_loss": 0.02715301886200905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.715301889111288e-05, "grad_norm": 16.801753997802734, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8601083755493164, "num_tokens": 361306674.0, "step": 9469 }, { "epoch": 1.2046813382521309, "ewc_loss": 0.02713192254304886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7131922252010554e-05, "grad_norm": 16.69550323486328, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8693591356277466, "num_tokens": 361348309.0, "step": 9470 }, { "epoch": 1.2048085485307212, "ewc_loss": 0.02714022994041443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7140229576616548e-05, "grad_norm": 16.73027992248535, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8631571531295776, "num_tokens": 361386824.0, "step": 9471 }, { "epoch": 1.2049357588093117, "ewc_loss": 0.02714901976287365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.714901893341448e-05, "grad_norm": 16.722591400146484, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8483461141586304, "num_tokens": 361426590.0, "step": 9472 }, { "epoch": 1.2050629690879022, "ewc_loss": 0.02714141085743904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.714141010073945e-05, "grad_norm": 16.752681732177734, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8527624607086182, "num_tokens": 361463259.0, "step": 9473 }, { "epoch": 1.2051901793664928, "ewc_loss": 0.02720389887690544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7203899662708864e-05, "grad_norm": 16.775344848632812, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.876022458076477, "num_tokens": 361498750.0, "step": 9474 }, { "epoch": 1.2053173896450833, "ewc_loss": 0.027135523036122322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7135523851029575e-05, "grad_norm": 16.755474090576172, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8568363785743713, "num_tokens": 361537101.0, "step": 9475 }, { "epoch": 1.2054445999236738, "ewc_loss": 0.027111398056149483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7111398594570346e-05, "grad_norm": 16.782808303833008, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8702700138092041, "num_tokens": 361572130.0, "step": 9476 }, { "epoch": 1.2055718102022643, "ewc_loss": 0.027149002999067307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.714900256250985e-05, "grad_norm": 16.74380874633789, "learning_rate": 1e-06, "loss": 0.379, "mean_token_accuracy": 0.8833990097045898, "num_tokens": 361608484.0, "step": 9477 }, { "epoch": 1.2056990204808549, "ewc_loss": 0.027161533012986183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7161533580510877e-05, "grad_norm": 16.744647979736328, "learning_rate": 1e-06, "loss": 0.3551, "mean_token_accuracy": 0.8838487863540649, "num_tokens": 361650361.0, "step": 9478 }, { "epoch": 1.2058262307594454, "ewc_loss": 0.027136389166116714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7136389689985663e-05, "grad_norm": 16.75522804260254, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8742296099662781, "num_tokens": 361680670.0, "step": 9479 }, { "epoch": 1.205953441038036, "ewc_loss": 0.027087291702628136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7087291528005153e-05, "grad_norm": 16.696678161621094, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.873335599899292, "num_tokens": 361723504.0, "step": 9480 }, { "epoch": 1.2060806513166265, "ewc_loss": 0.0270979180932045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7097918064100668e-05, "grad_norm": 16.703794479370117, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8730671405792236, "num_tokens": 361765850.0, "step": 9481 }, { "epoch": 1.206207861595217, "ewc_loss": 0.027148647233843803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7148647859576158e-05, "grad_norm": 16.72190284729004, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8587221503257751, "num_tokens": 361804375.0, "step": 9482 }, { "epoch": 1.2063350718738075, "ewc_loss": 0.027155432850122452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7155432690051384e-05, "grad_norm": 16.70888900756836, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8644258975982666, "num_tokens": 361834714.0, "step": 9483 }, { "epoch": 1.206462282152398, "ewc_loss": 0.02713995985686779, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7139960366184823e-05, "grad_norm": 16.745670318603516, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8666592836380005, "num_tokens": 361877568.0, "step": 9484 }, { "epoch": 1.2065894924309883, "ewc_loss": 0.02715291827917099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7152918846695684e-05, "grad_norm": 16.633853912353516, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.862760603427887, "num_tokens": 361914785.0, "step": 9485 }, { "epoch": 1.2067167027095789, "ewc_loss": 0.027129288762807846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.712928835535422e-05, "grad_norm": 16.756149291992188, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8747447729110718, "num_tokens": 361956781.0, "step": 9486 }, { "epoch": 1.2068439129881694, "ewc_loss": 0.027230871841311455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.723087163758464e-05, "grad_norm": 16.689218521118164, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8767654299736023, "num_tokens": 361997408.0, "step": 9487 }, { "epoch": 1.20697112326676, "ewc_loss": 0.027106964960694313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7106965717393905e-05, "grad_norm": 16.719858169555664, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8676705360412598, "num_tokens": 362030226.0, "step": 9488 }, { "epoch": 1.2070983335453505, "ewc_loss": 0.027168408036231995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7168407541466877e-05, "grad_norm": 16.72461700439453, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.877551257610321, "num_tokens": 362068370.0, "step": 9489 }, { "epoch": 1.207225543823941, "ewc_loss": 0.027150537818670273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7150537789566442e-05, "grad_norm": 16.76485824584961, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8761028051376343, "num_tokens": 362108569.0, "step": 9490 }, { "epoch": 1.2073527541025315, "ewc_loss": 0.027131404727697372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7131405659019947e-05, "grad_norm": 16.714168548583984, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8603100180625916, "num_tokens": 362140476.0, "step": 9491 }, { "epoch": 1.207479964381122, "ewc_loss": 0.027171140536665916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7171139663551003e-05, "grad_norm": 16.753408432006836, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8762285709381104, "num_tokens": 362171132.0, "step": 9492 }, { "epoch": 1.2076071746597126, "ewc_loss": 0.027244243770837784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7244243028690107e-05, "grad_norm": 16.75078582763672, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8657463788986206, "num_tokens": 362204791.0, "step": 9493 }, { "epoch": 1.207734384938303, "ewc_loss": 0.027230991050601006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7230991690885276e-05, "grad_norm": 16.743942260742188, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8710819482803345, "num_tokens": 362242360.0, "step": 9494 }, { "epoch": 1.2078615952168934, "ewc_loss": 0.02725319191813469, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7253192456555553e-05, "grad_norm": 16.767019271850586, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8699735403060913, "num_tokens": 362276558.0, "step": 9495 }, { "epoch": 1.207988805495484, "ewc_loss": 0.02721102349460125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.721102282521315e-05, "grad_norm": 16.812162399291992, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8534475564956665, "num_tokens": 362307055.0, "step": 9496 }, { "epoch": 1.2081160157740745, "ewc_loss": 0.027225926518440247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.72259258053964e-05, "grad_norm": 16.74452018737793, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8778256177902222, "num_tokens": 362342433.0, "step": 9497 }, { "epoch": 1.208243226052665, "ewc_loss": 0.027217403054237366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7217403840040788e-05, "grad_norm": 16.76079559326172, "learning_rate": 1e-06, "loss": 0.3383, "mean_token_accuracy": 0.889211893081665, "num_tokens": 362377008.0, "step": 9498 }, { "epoch": 1.2083704363312555, "ewc_loss": 0.027268629521131516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7268630219623446e-05, "grad_norm": 16.761642456054688, "learning_rate": 1e-06, "loss": 0.3511, "mean_token_accuracy": 0.8817006349563599, "num_tokens": 362412089.0, "step": 9499 }, { "epoch": 1.208497646609846, "ewc_loss": 0.02725089155137539, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7250891434960067e-05, "grad_norm": 16.80214500427246, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.858815610408783, "num_tokens": 362455817.0, "step": 9500 }, { "epoch": 1.2086248568884366, "ewc_loss": 0.027201121672987938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.720112206588965e-05, "grad_norm": 16.694860458374023, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.867331862449646, "num_tokens": 362498859.0, "step": 9501 }, { "epoch": 1.208752067167027, "ewc_loss": 0.027170248329639435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7170248358743265e-05, "grad_norm": 16.69007110595703, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8668612241744995, "num_tokens": 362542346.0, "step": 9502 }, { "epoch": 1.2088792774456176, "ewc_loss": 0.027268061414361, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7268060875940137e-05, "grad_norm": 16.754440307617188, "learning_rate": 1e-06, "loss": 0.3399, "mean_token_accuracy": 0.8869075775146484, "num_tokens": 362575360.0, "step": 9503 }, { "epoch": 1.2090064877242082, "ewc_loss": 0.027241984382271767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7241983843850903e-05, "grad_norm": 16.777740478515625, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8687366247177124, "num_tokens": 362614737.0, "step": 9504 }, { "epoch": 1.2091336980027987, "ewc_loss": 0.02724895440042019, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.724895421124529e-05, "grad_norm": 16.787660598754883, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8624151945114136, "num_tokens": 362653756.0, "step": 9505 }, { "epoch": 1.2092609082813892, "ewc_loss": 0.027183569967746735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7183570637134835e-05, "grad_norm": 16.719148635864258, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8698534965515137, "num_tokens": 362690745.0, "step": 9506 }, { "epoch": 1.2093881185599797, "ewc_loss": 0.02719000354409218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7190004402655177e-05, "grad_norm": 16.763208389282227, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8771663308143616, "num_tokens": 362726328.0, "step": 9507 }, { "epoch": 1.2095153288385703, "ewc_loss": 0.027235817164182663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7235817469772883e-05, "grad_norm": 16.718610763549805, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8711646795272827, "num_tokens": 362764285.0, "step": 9508 }, { "epoch": 1.2096425391171606, "ewc_loss": 0.027213195338845253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7213194698560983e-05, "grad_norm": 16.889507293701172, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8486846685409546, "num_tokens": 362794697.0, "step": 9509 }, { "epoch": 1.209769749395751, "ewc_loss": 0.027251947671175003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7251948267803527e-05, "grad_norm": 16.729835510253906, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8676669597625732, "num_tokens": 362831618.0, "step": 9510 }, { "epoch": 1.2098969596743416, "ewc_loss": 0.02716941572725773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.716941526159644e-05, "grad_norm": 16.744609832763672, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8756123185157776, "num_tokens": 362866404.0, "step": 9511 }, { "epoch": 1.2100241699529322, "ewc_loss": 0.027218803763389587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.721880446188152e-05, "grad_norm": 16.74967384338379, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.8847436904907227, "num_tokens": 362902554.0, "step": 9512 }, { "epoch": 1.2101513802315227, "ewc_loss": 0.027215059846639633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7215059162699617e-05, "grad_norm": 16.69982147216797, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8628569841384888, "num_tokens": 362944169.0, "step": 9513 }, { "epoch": 1.2102785905101132, "ewc_loss": 0.027156691998243332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7156691430718638e-05, "grad_norm": 16.68912696838379, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8476556539535522, "num_tokens": 362983789.0, "step": 9514 }, { "epoch": 1.2104058007887037, "ewc_loss": 0.027180487290024757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7180487450095825e-05, "grad_norm": 16.70070457458496, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8734784126281738, "num_tokens": 363021551.0, "step": 9515 }, { "epoch": 1.2105330110672943, "ewc_loss": 0.027243977412581444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.724397745623719e-05, "grad_norm": 16.780597686767578, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8517382740974426, "num_tokens": 363064951.0, "step": 9516 }, { "epoch": 1.2106602213458848, "ewc_loss": 0.02726891078054905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7268910343991593e-05, "grad_norm": 16.68049430847168, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8676435947418213, "num_tokens": 363110876.0, "step": 9517 }, { "epoch": 1.2107874316244753, "ewc_loss": 0.02715570107102394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.715570190048311e-05, "grad_norm": 16.680816650390625, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8645315170288086, "num_tokens": 363155310.0, "step": 9518 }, { "epoch": 1.2109146419030659, "ewc_loss": 0.027262600138783455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7262600269750692e-05, "grad_norm": 16.729284286499023, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8786022663116455, "num_tokens": 363193166.0, "step": 9519 }, { "epoch": 1.2110418521816562, "ewc_loss": 0.02726173773407936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.726173806877341e-05, "grad_norm": 16.77288246154785, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8500319719314575, "num_tokens": 363225163.0, "step": 9520 }, { "epoch": 1.2111690624602467, "ewc_loss": 0.027221225202083588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7221225536777638e-05, "grad_norm": 16.76365089416504, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8663128614425659, "num_tokens": 363264720.0, "step": 9521 }, { "epoch": 1.2112962727388372, "ewc_loss": 0.027180951088666916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.718095129239373e-05, "grad_norm": 16.709856033325195, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8522182703018188, "num_tokens": 363305029.0, "step": 9522 }, { "epoch": 1.2114234830174277, "ewc_loss": 0.027174990624189377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.717499046411831e-05, "grad_norm": 16.69257354736328, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8745619058609009, "num_tokens": 363344128.0, "step": 9523 }, { "epoch": 1.2115506932960183, "ewc_loss": 0.027235042303800583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7235042580286972e-05, "grad_norm": 16.63336944580078, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8661497831344604, "num_tokens": 363385903.0, "step": 9524 }, { "epoch": 1.2116779035746088, "ewc_loss": 0.02718956768512726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7189567845198326e-05, "grad_norm": 16.788347244262695, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8691933155059814, "num_tokens": 363425216.0, "step": 9525 }, { "epoch": 1.2118051138531993, "ewc_loss": 0.027297040447592735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.729704101511743e-05, "grad_norm": 16.717391967773438, "learning_rate": 1e-06, "loss": 0.3723, "mean_token_accuracy": 0.8788242340087891, "num_tokens": 363462691.0, "step": 9526 }, { "epoch": 1.2119323241317899, "ewc_loss": 0.027186375111341476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7186375518795103e-05, "grad_norm": 16.73691177368164, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8534034490585327, "num_tokens": 363495849.0, "step": 9527 }, { "epoch": 1.2120595344103804, "ewc_loss": 0.02728903293609619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.728903382376302e-05, "grad_norm": 16.742746353149414, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8672150373458862, "num_tokens": 363535268.0, "step": 9528 }, { "epoch": 1.212186744688971, "ewc_loss": 0.02716434746980667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7164347557118163e-05, "grad_norm": 16.65147590637207, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8500887155532837, "num_tokens": 363573523.0, "step": 9529 }, { "epoch": 1.2123139549675614, "ewc_loss": 0.027251701802015305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.725170270423405e-05, "grad_norm": 16.742799758911133, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8667818307876587, "num_tokens": 363607545.0, "step": 9530 }, { "epoch": 1.212441165246152, "ewc_loss": 0.02723531238734722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7235311790718697e-05, "grad_norm": 16.72231101989746, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8695884943008423, "num_tokens": 363648456.0, "step": 9531 }, { "epoch": 1.2125683755247425, "ewc_loss": 0.027206290513277054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7206289814785123e-05, "grad_norm": 16.643230438232422, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.876267671585083, "num_tokens": 363688726.0, "step": 9532 }, { "epoch": 1.212695585803333, "ewc_loss": 0.027256470173597336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7256470275460742e-05, "grad_norm": 16.72254753112793, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8606466054916382, "num_tokens": 363728557.0, "step": 9533 }, { "epoch": 1.2128227960819233, "ewc_loss": 0.02725967764854431, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7259677153779194e-05, "grad_norm": 16.749099731445312, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8667057752609253, "num_tokens": 363764191.0, "step": 9534 }, { "epoch": 1.2129500063605139, "ewc_loss": 0.02721768617630005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7217685783398338e-05, "grad_norm": 16.762582778930664, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8560411930084229, "num_tokens": 363798126.0, "step": 9535 }, { "epoch": 1.2130772166391044, "ewc_loss": 0.027226941660046577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.722694080148358e-05, "grad_norm": 16.726062774658203, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8520992994308472, "num_tokens": 363836037.0, "step": 9536 }, { "epoch": 1.213204426917695, "ewc_loss": 0.02726087160408497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7260872229817323e-05, "grad_norm": 16.753984451293945, "learning_rate": 1e-06, "loss": 0.3513, "mean_token_accuracy": 0.8880167007446289, "num_tokens": 363872359.0, "step": 9537 }, { "epoch": 1.2133316371962855, "ewc_loss": 0.027224799618124962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7224799850955606e-05, "grad_norm": 16.710613250732422, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8621415495872498, "num_tokens": 363914206.0, "step": 9538 }, { "epoch": 1.213458847474876, "ewc_loss": 0.027218421921133995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7218422474106774e-05, "grad_norm": 16.793617248535156, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8686615228652954, "num_tokens": 363956147.0, "step": 9539 }, { "epoch": 1.2135860577534665, "ewc_loss": 0.027241300791502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7241301722824574e-05, "grad_norm": 16.73939323425293, "learning_rate": 1e-06, "loss": 0.3569, "mean_token_accuracy": 0.8845481872558594, "num_tokens": 363986328.0, "step": 9540 }, { "epoch": 1.213713268032057, "ewc_loss": 0.027173036709427834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.71730368694989e-05, "grad_norm": 16.680774688720703, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8648253679275513, "num_tokens": 364024905.0, "step": 9541 }, { "epoch": 1.2138404783106476, "ewc_loss": 0.027218660339713097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7218660761718638e-05, "grad_norm": 16.719131469726562, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8684960007667542, "num_tokens": 364068628.0, "step": 9542 }, { "epoch": 1.213967688589238, "ewc_loss": 0.027221420779824257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7221420168643817e-05, "grad_norm": 16.69635772705078, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8646020889282227, "num_tokens": 364104261.0, "step": 9543 }, { "epoch": 1.2140948988678284, "ewc_loss": 0.02721056155860424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.721056080190465e-05, "grad_norm": 16.73702621459961, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8649017214775085, "num_tokens": 364147524.0, "step": 9544 }, { "epoch": 1.214222109146419, "ewc_loss": 0.027296552434563637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7296551706967875e-05, "grad_norm": 16.739768981933594, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8694939613342285, "num_tokens": 364187715.0, "step": 9545 }, { "epoch": 1.2143493194250095, "ewc_loss": 0.02724453993141651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7244539523962885e-05, "grad_norm": 16.70025062561035, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8773212432861328, "num_tokens": 364226081.0, "step": 9546 }, { "epoch": 1.2144765297036, "ewc_loss": 0.027257828041911125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7257827241555788e-05, "grad_norm": 16.819116592407227, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8682902455329895, "num_tokens": 364268390.0, "step": 9547 }, { "epoch": 1.2146037399821905, "ewc_loss": 0.02729496732354164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7294967367197387e-05, "grad_norm": 16.77294158935547, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8702231645584106, "num_tokens": 364310176.0, "step": 9548 }, { "epoch": 1.214730950260781, "ewc_loss": 0.027154618874192238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7154619601788e-05, "grad_norm": 16.690771102905273, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.8789435625076294, "num_tokens": 364345652.0, "step": 9549 }, { "epoch": 1.2148581605393716, "ewc_loss": 0.027202408760786057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7202408091397956e-05, "grad_norm": 16.74854278564453, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8633652925491333, "num_tokens": 364383810.0, "step": 9550 }, { "epoch": 1.214985370817962, "ewc_loss": 0.027230123057961464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.723012221395038e-05, "grad_norm": 16.731081008911133, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8715394735336304, "num_tokens": 364421169.0, "step": 9551 }, { "epoch": 1.2151125810965526, "ewc_loss": 0.02719060704112053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.719060648814775e-05, "grad_norm": 16.76289176940918, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8617552518844604, "num_tokens": 364452552.0, "step": 9552 }, { "epoch": 1.2152397913751432, "ewc_loss": 0.027236592024564743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7236592359258793e-05, "grad_norm": 16.777353286743164, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8656291365623474, "num_tokens": 364486779.0, "step": 9553 }, { "epoch": 1.2153670016537337, "ewc_loss": 0.027185581624507904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7185582439415157e-05, "grad_norm": 16.701997756958008, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8770743608474731, "num_tokens": 364526237.0, "step": 9554 }, { "epoch": 1.2154942119323242, "ewc_loss": 0.0272429957985878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7242995201959275e-05, "grad_norm": 16.732961654663086, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8738467693328857, "num_tokens": 364568446.0, "step": 9555 }, { "epoch": 1.2156214222109147, "ewc_loss": 0.027277693152427673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7277692424831912e-05, "grad_norm": 16.970483779907227, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8746960163116455, "num_tokens": 364604448.0, "step": 9556 }, { "epoch": 1.2157486324895053, "ewc_loss": 0.027251582592725754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7251582650933415e-05, "grad_norm": 16.676624298095703, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8650991916656494, "num_tokens": 364642707.0, "step": 9557 }, { "epoch": 1.2158758427680956, "ewc_loss": 0.027198338881134987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7198339012102224e-05, "grad_norm": 16.74678611755371, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8631584048271179, "num_tokens": 364678428.0, "step": 9558 }, { "epoch": 1.216003053046686, "ewc_loss": 0.027362050488591194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7362049877410755e-05, "grad_norm": 16.723129272460938, "learning_rate": 1e-06, "loss": 0.3675, "mean_token_accuracy": 0.8810763359069824, "num_tokens": 364718915.0, "step": 9559 }, { "epoch": 1.2161302633252766, "ewc_loss": 0.02728426456451416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7284264433546923e-05, "grad_norm": 16.70148468017578, "learning_rate": 1e-06, "loss": 0.3627, "mean_token_accuracy": 0.8844587802886963, "num_tokens": 364756030.0, "step": 9560 }, { "epoch": 1.2162574736038672, "ewc_loss": 0.027369258925318718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7369258532417007e-05, "grad_norm": 16.768491744995117, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8647108674049377, "num_tokens": 364796158.0, "step": 9561 }, { "epoch": 1.2163846838824577, "ewc_loss": 0.027325257658958435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7325257178745233e-05, "grad_norm": 16.71051025390625, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8761040568351746, "num_tokens": 364832722.0, "step": 9562 }, { "epoch": 1.2165118941610482, "ewc_loss": 0.02731890231370926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7318901629769243e-05, "grad_norm": 16.749067306518555, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8656874895095825, "num_tokens": 364874120.0, "step": 9563 }, { "epoch": 1.2166391044396387, "ewc_loss": 0.027347616851329803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7347616196493618e-05, "grad_norm": 16.75581932067871, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.869248628616333, "num_tokens": 364913166.0, "step": 9564 }, { "epoch": 1.2167663147182293, "ewc_loss": 0.027322586625814438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7322586902300827e-05, "grad_norm": 16.769126892089844, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8503552079200745, "num_tokens": 364949586.0, "step": 9565 }, { "epoch": 1.2168935249968198, "ewc_loss": 0.02735157683491707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7351576136425138e-05, "grad_norm": 16.751001358032227, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8504344820976257, "num_tokens": 364989070.0, "step": 9566 }, { "epoch": 1.2170207352754103, "ewc_loss": 0.027323905378580093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7323905669618398e-05, "grad_norm": 16.769426345825195, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8712543845176697, "num_tokens": 365027050.0, "step": 9567 }, { "epoch": 1.2171479455540009, "ewc_loss": 0.027310242876410484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7310243240208365e-05, "grad_norm": 16.76796531677246, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8678661584854126, "num_tokens": 365060050.0, "step": 9568 }, { "epoch": 1.2172751558325912, "ewc_loss": 0.02731027454137802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7310274163028225e-05, "grad_norm": 16.794910430908203, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8368919491767883, "num_tokens": 365098030.0, "step": 9569 }, { "epoch": 1.2174023661111817, "ewc_loss": 0.027322981506586075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7322981623001397e-05, "grad_norm": 16.758378982543945, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8701346516609192, "num_tokens": 365141143.0, "step": 9570 }, { "epoch": 1.2175295763897722, "ewc_loss": 0.027314506471157074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7314506951370277e-05, "grad_norm": 16.854385375976562, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8455929756164551, "num_tokens": 365174204.0, "step": 9571 }, { "epoch": 1.2176567866683627, "ewc_loss": 0.02734415791928768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7344158297637478e-05, "grad_norm": 16.795719146728516, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8686192631721497, "num_tokens": 365207170.0, "step": 9572 }, { "epoch": 1.2177839969469533, "ewc_loss": 0.02726353146135807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7263531592325307e-05, "grad_norm": 16.754283905029297, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8714430332183838, "num_tokens": 365248926.0, "step": 9573 }, { "epoch": 1.2179112072255438, "ewc_loss": 0.027271384373307228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7271384169580415e-05, "grad_norm": 16.72672462463379, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8704670071601868, "num_tokens": 365283588.0, "step": 9574 }, { "epoch": 1.2180384175041343, "ewc_loss": 0.027315502986311913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.731550375756342e-05, "grad_norm": 16.804731369018555, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8655886650085449, "num_tokens": 365324869.0, "step": 9575 }, { "epoch": 1.2181656277827249, "ewc_loss": 0.027297912165522575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7297912311041728e-05, "grad_norm": 16.724716186523438, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8645349740982056, "num_tokens": 365366832.0, "step": 9576 }, { "epoch": 1.2182928380613154, "ewc_loss": 0.027316920459270477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7316920750308782e-05, "grad_norm": 16.79690933227539, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8655238747596741, "num_tokens": 365401148.0, "step": 9577 }, { "epoch": 1.218420048339906, "ewc_loss": 0.027357330545783043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7357331418897957e-05, "grad_norm": 16.724884033203125, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8691854476928711, "num_tokens": 365442303.0, "step": 9578 }, { "epoch": 1.2185472586184964, "ewc_loss": 0.027286918833851814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7286918339086697e-05, "grad_norm": 16.788171768188477, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8798830509185791, "num_tokens": 365477877.0, "step": 9579 }, { "epoch": 1.218674468897087, "ewc_loss": 0.027392376214265823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.739237606874667e-05, "grad_norm": 16.811185836791992, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8561016321182251, "num_tokens": 365520648.0, "step": 9580 }, { "epoch": 1.2188016791756775, "ewc_loss": 0.027299685403704643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7299685825710185e-05, "grad_norm": 16.737340927124023, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8504352569580078, "num_tokens": 365555129.0, "step": 9581 }, { "epoch": 1.218928889454268, "ewc_loss": 0.027278954163193703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7278954803477973e-05, "grad_norm": 16.76034927368164, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8727452754974365, "num_tokens": 365594368.0, "step": 9582 }, { "epoch": 1.2190560997328583, "ewc_loss": 0.02733457088470459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7334570404491387e-05, "grad_norm": 16.77217674255371, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8662189245223999, "num_tokens": 365634763.0, "step": 9583 }, { "epoch": 1.2191833100114489, "ewc_loss": 0.027323974296450615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7323974791215733e-05, "grad_norm": 16.739891052246094, "learning_rate": 1e-06, "loss": 0.3659, "mean_token_accuracy": 0.8833315968513489, "num_tokens": 365675951.0, "step": 9584 }, { "epoch": 1.2193105202900394, "ewc_loss": 0.027265561744570732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7265561584499665e-05, "grad_norm": 16.715044021606445, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8754638433456421, "num_tokens": 365710348.0, "step": 9585 }, { "epoch": 1.21943773056863, "ewc_loss": 0.027295880019664764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7295880499877967e-05, "grad_norm": 16.79450035095215, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.870376706123352, "num_tokens": 365745166.0, "step": 9586 }, { "epoch": 1.2195649408472204, "ewc_loss": 0.02733786031603813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7337860956322402e-05, "grad_norm": 16.81621742248535, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.8801490664482117, "num_tokens": 365780122.0, "step": 9587 }, { "epoch": 1.219692151125811, "ewc_loss": 0.02733011171221733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7330112061463296e-05, "grad_norm": 16.75328826904297, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8673973083496094, "num_tokens": 365816326.0, "step": 9588 }, { "epoch": 1.2198193614044015, "ewc_loss": 0.02727450057864189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.727450009842869e-05, "grad_norm": 16.738012313842773, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8413113355636597, "num_tokens": 365858938.0, "step": 9589 }, { "epoch": 1.219946571682992, "ewc_loss": 0.027286281809210777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7286281692795455e-05, "grad_norm": 16.760530471801758, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.857530951499939, "num_tokens": 365893892.0, "step": 9590 }, { "epoch": 1.2200737819615826, "ewc_loss": 0.027304425835609436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7304426112095825e-05, "grad_norm": 16.707538604736328, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8580762147903442, "num_tokens": 365938787.0, "step": 9591 }, { "epoch": 1.220200992240173, "ewc_loss": 0.027341054752469063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7341055101715028e-05, "grad_norm": 16.783010482788086, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8672754764556885, "num_tokens": 365972664.0, "step": 9592 }, { "epoch": 1.2203282025187634, "ewc_loss": 0.027303464710712433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.730346386670135e-05, "grad_norm": 16.753578186035156, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8734129667282104, "num_tokens": 366013367.0, "step": 9593 }, { "epoch": 1.220455412797354, "ewc_loss": 0.027310524135828018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.731052336457651e-05, "grad_norm": 16.726215362548828, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8734407424926758, "num_tokens": 366051637.0, "step": 9594 }, { "epoch": 1.2205826230759445, "ewc_loss": 0.027293721213936806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7293721359455958e-05, "grad_norm": 16.748218536376953, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8644680976867676, "num_tokens": 366092713.0, "step": 9595 }, { "epoch": 1.220709833354535, "ewc_loss": 0.027305101975798607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7305102776153944e-05, "grad_norm": 16.767606735229492, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8680144548416138, "num_tokens": 366131594.0, "step": 9596 }, { "epoch": 1.2208370436331255, "ewc_loss": 0.027326706796884537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.732670691329986e-05, "grad_norm": 16.790502548217773, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8616836667060852, "num_tokens": 366170871.0, "step": 9597 }, { "epoch": 1.220964253911716, "ewc_loss": 0.02727312222123146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7273123123450205e-05, "grad_norm": 16.71971321105957, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8568081855773926, "num_tokens": 366212974.0, "step": 9598 }, { "epoch": 1.2210914641903066, "ewc_loss": 0.027249163016676903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.72491633950267e-05, "grad_norm": 16.78969955444336, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8594987988471985, "num_tokens": 366255904.0, "step": 9599 }, { "epoch": 1.221218674468897, "ewc_loss": 0.02732388861477375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7323889298713766e-05, "grad_norm": 16.840198516845703, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8730610609054565, "num_tokens": 366288887.0, "step": 9600 }, { "epoch": 1.2213458847474876, "ewc_loss": 0.027234727516770363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.723472789512016e-05, "grad_norm": 16.686370849609375, "learning_rate": 1e-06, "loss": 0.3676, "mean_token_accuracy": 0.8799720406532288, "num_tokens": 366328796.0, "step": 9601 }, { "epoch": 1.2214730950260781, "ewc_loss": 0.027257220819592476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7257221518084407e-05, "grad_norm": 16.860675811767578, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8644348978996277, "num_tokens": 366373093.0, "step": 9602 }, { "epoch": 1.2216003053046687, "ewc_loss": 0.02734222076833248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.73422210739227e-05, "grad_norm": 16.736600875854492, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8760707974433899, "num_tokens": 366414353.0, "step": 9603 }, { "epoch": 1.2217275155832592, "ewc_loss": 0.027226954698562622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7226955353398807e-05, "grad_norm": 16.821409225463867, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8806686401367188, "num_tokens": 366450683.0, "step": 9604 }, { "epoch": 1.2218547258618497, "ewc_loss": 0.02726208232343197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.726208185777068e-05, "grad_norm": 16.792369842529297, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8720242381095886, "num_tokens": 366487158.0, "step": 9605 }, { "epoch": 1.2219819361404403, "ewc_loss": 0.027198603376746178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7198602765565738e-05, "grad_norm": 16.769412994384766, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8696269989013672, "num_tokens": 366529984.0, "step": 9606 }, { "epoch": 1.2221091464190306, "ewc_loss": 0.027273794636130333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7273794330540113e-05, "grad_norm": 16.785175323486328, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8712480068206787, "num_tokens": 366564100.0, "step": 9607 }, { "epoch": 1.222236356697621, "ewc_loss": 0.02721182443201542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7211824999540113e-05, "grad_norm": 16.69668960571289, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.874415397644043, "num_tokens": 366603752.0, "step": 9608 }, { "epoch": 1.2223635669762116, "ewc_loss": 0.02719186618924141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7191867047804408e-05, "grad_norm": 16.77251434326172, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8692417144775391, "num_tokens": 366642170.0, "step": 9609 }, { "epoch": 1.2224907772548022, "ewc_loss": 0.02728920243680477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.728920298977755e-05, "grad_norm": 16.746793746948242, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.854424774646759, "num_tokens": 366684038.0, "step": 9610 }, { "epoch": 1.2226179875333927, "ewc_loss": 0.027232153341174126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.723215402511414e-05, "grad_norm": 16.731433868408203, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8645808696746826, "num_tokens": 366717832.0, "step": 9611 }, { "epoch": 1.2227451978119832, "ewc_loss": 0.027254683896899223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.725468402786646e-05, "grad_norm": 16.72045135498047, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8619270324707031, "num_tokens": 366757999.0, "step": 9612 }, { "epoch": 1.2228724080905737, "ewc_loss": 0.02726406417787075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7264064556220546e-05, "grad_norm": 16.752437591552734, "learning_rate": 1e-06, "loss": 0.3873, "mean_token_accuracy": 0.8766930103302002, "num_tokens": 366798533.0, "step": 9613 }, { "epoch": 1.2229996183691643, "ewc_loss": 0.02728497050702572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.72849702014355e-05, "grad_norm": 16.695690155029297, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8647295236587524, "num_tokens": 366843066.0, "step": 9614 }, { "epoch": 1.2231268286477548, "ewc_loss": 0.027255909517407417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.725591002672445e-05, "grad_norm": 16.788488388061523, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.8831212520599365, "num_tokens": 366883190.0, "step": 9615 }, { "epoch": 1.2232540389263453, "ewc_loss": 0.027307193726301193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.730719279497862e-05, "grad_norm": 16.752988815307617, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8719923496246338, "num_tokens": 366920642.0, "step": 9616 }, { "epoch": 1.2233812492049359, "ewc_loss": 0.027235208079218864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7235208108322695e-05, "grad_norm": 16.78464698791504, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8683613538742065, "num_tokens": 366954468.0, "step": 9617 }, { "epoch": 1.2235084594835262, "ewc_loss": 0.027270814403891563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7270814825897105e-05, "grad_norm": 16.741497039794922, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8521704077720642, "num_tokens": 366991170.0, "step": 9618 }, { "epoch": 1.2236356697621167, "ewc_loss": 0.02725657820701599, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.725657759583555e-05, "grad_norm": 16.767488479614258, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8722521066665649, "num_tokens": 367031833.0, "step": 9619 }, { "epoch": 1.2237628800407072, "ewc_loss": 0.02733444795012474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.733444853220135e-05, "grad_norm": 16.82805061340332, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8634372353553772, "num_tokens": 367074709.0, "step": 9620 }, { "epoch": 1.2238900903192977, "ewc_loss": 0.027248991653323174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7248992410022765e-05, "grad_norm": 16.784090042114258, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8644679188728333, "num_tokens": 367106820.0, "step": 9621 }, { "epoch": 1.2240173005978883, "ewc_loss": 0.0272879246622324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7287924240226857e-05, "grad_norm": 16.745737075805664, "learning_rate": 1e-06, "loss": 0.3685, "mean_token_accuracy": 0.8815335631370544, "num_tokens": 367142298.0, "step": 9622 }, { "epoch": 1.2241445108764788, "ewc_loss": 0.027237921953201294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7237922040512785e-05, "grad_norm": 16.73039436340332, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8578978180885315, "num_tokens": 367179957.0, "step": 9623 }, { "epoch": 1.2242717211550693, "ewc_loss": 0.02730415388941765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7304153263685293e-05, "grad_norm": 16.797168731689453, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8703770637512207, "num_tokens": 367226800.0, "step": 9624 }, { "epoch": 1.2243989314336599, "ewc_loss": 0.02730514295399189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7305142793920822e-05, "grad_norm": 16.847639083862305, "learning_rate": 1e-06, "loss": 0.3771, "mean_token_accuracy": 0.871699333190918, "num_tokens": 367257168.0, "step": 9625 }, { "epoch": 1.2245261417122504, "ewc_loss": 0.027292294427752495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7292295271763578e-05, "grad_norm": 16.73736572265625, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8695987462997437, "num_tokens": 367295254.0, "step": 9626 }, { "epoch": 1.224653351990841, "ewc_loss": 0.02722952328622341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7229523766436614e-05, "grad_norm": 16.77765655517578, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.854999840259552, "num_tokens": 367335814.0, "step": 9627 }, { "epoch": 1.2247805622694314, "ewc_loss": 0.027313338592648506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.73133391601732e-05, "grad_norm": 16.744752883911133, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.864493727684021, "num_tokens": 367377592.0, "step": 9628 }, { "epoch": 1.224907772548022, "ewc_loss": 0.027305208146572113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.730520827753935e-05, "grad_norm": 16.86929702758789, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8595656156539917, "num_tokens": 367408069.0, "step": 9629 }, { "epoch": 1.2250349828266125, "ewc_loss": 0.02732151746749878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7321517336531542e-05, "grad_norm": 16.8085880279541, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8692430257797241, "num_tokens": 367444787.0, "step": 9630 }, { "epoch": 1.225162193105203, "ewc_loss": 0.02728950046002865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.728950130403973e-05, "grad_norm": 16.778217315673828, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8543127775192261, "num_tokens": 367486848.0, "step": 9631 }, { "epoch": 1.2252894033837933, "ewc_loss": 0.027283191680908203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.728319122979883e-05, "grad_norm": 16.77635955810547, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8779654502868652, "num_tokens": 367530456.0, "step": 9632 }, { "epoch": 1.2254166136623839, "ewc_loss": 0.027276376262307167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7276375476503745e-05, "grad_norm": 16.806236267089844, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8706923723220825, "num_tokens": 367566513.0, "step": 9633 }, { "epoch": 1.2255438239409744, "ewc_loss": 0.027315376326441765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.731537642830517e-05, "grad_norm": 16.827016830444336, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8471649289131165, "num_tokens": 367607757.0, "step": 9634 }, { "epoch": 1.225671034219565, "ewc_loss": 0.027260759845376015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7260759452474304e-05, "grad_norm": 16.76395606994629, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.869658350944519, "num_tokens": 367645525.0, "step": 9635 }, { "epoch": 1.2257982444981554, "ewc_loss": 0.027263445779681206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.726344609982334e-05, "grad_norm": 16.785045623779297, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8770164251327515, "num_tokens": 367680840.0, "step": 9636 }, { "epoch": 1.225925454776746, "ewc_loss": 0.027273764833807945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7273765226709656e-05, "grad_norm": 16.791595458984375, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8755712509155273, "num_tokens": 367717444.0, "step": 9637 }, { "epoch": 1.2260526650553365, "ewc_loss": 0.02726665884256363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7266658435110003e-05, "grad_norm": 16.78960418701172, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8468216061592102, "num_tokens": 367756548.0, "step": 9638 }, { "epoch": 1.226179875333927, "ewc_loss": 0.027273209765553474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7273210434941575e-05, "grad_norm": 16.829469680786133, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8621037602424622, "num_tokens": 367798341.0, "step": 9639 }, { "epoch": 1.2263070856125176, "ewc_loss": 0.027275709435343742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7275709726382047e-05, "grad_norm": 16.824087142944336, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8733880519866943, "num_tokens": 367835087.0, "step": 9640 }, { "epoch": 1.226434295891108, "ewc_loss": 0.02726365439593792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.726365528360475e-05, "grad_norm": 16.754175186157227, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.86928391456604, "num_tokens": 367873816.0, "step": 9641 }, { "epoch": 1.2265615061696984, "ewc_loss": 0.02725236304104328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7252362997387536e-05, "grad_norm": 16.84425926208496, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8637159466743469, "num_tokens": 367913621.0, "step": 9642 }, { "epoch": 1.226688716448289, "ewc_loss": 0.027279406785964966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7279405912850052e-05, "grad_norm": 16.820837020874023, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8588565587997437, "num_tokens": 367946881.0, "step": 9643 }, { "epoch": 1.2268159267268794, "ewc_loss": 0.02723822556436062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7238225811743177e-05, "grad_norm": 16.857833862304688, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8716865181922913, "num_tokens": 367980806.0, "step": 9644 }, { "epoch": 1.22694313700547, "ewc_loss": 0.027321256697177887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7321257221046835e-05, "grad_norm": 16.800695419311523, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.874565839767456, "num_tokens": 368016041.0, "step": 9645 }, { "epoch": 1.2270703472840605, "ewc_loss": 0.02720622345805168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7206224331166595e-05, "grad_norm": 16.80158233642578, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8651846051216125, "num_tokens": 368052331.0, "step": 9646 }, { "epoch": 1.227197557562651, "ewc_loss": 0.027270035818219185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7270036298432387e-05, "grad_norm": 16.785457611083984, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8764919638633728, "num_tokens": 368091458.0, "step": 9647 }, { "epoch": 1.2273247678412416, "ewc_loss": 0.02719944901764393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7199448595638387e-05, "grad_norm": 16.804391860961914, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8564233183860779, "num_tokens": 368132433.0, "step": 9648 }, { "epoch": 1.227451978119832, "ewc_loss": 0.027271050959825516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7271051294519566e-05, "grad_norm": 16.83283042907715, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8678951263427734, "num_tokens": 368171030.0, "step": 9649 }, { "epoch": 1.2275791883984226, "ewc_loss": 0.02724556252360344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7245561796007678e-05, "grad_norm": 16.809581756591797, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8566418886184692, "num_tokens": 368211555.0, "step": 9650 }, { "epoch": 1.2277063986770131, "ewc_loss": 0.027242062613368034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7242062060395256e-05, "grad_norm": 16.87735366821289, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8733856678009033, "num_tokens": 368252076.0, "step": 9651 }, { "epoch": 1.2278336089556037, "ewc_loss": 0.02725537307560444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7255373424850404e-05, "grad_norm": 16.772205352783203, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8612972497940063, "num_tokens": 368291265.0, "step": 9652 }, { "epoch": 1.2279608192341942, "ewc_loss": 0.027168935164809227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7168935048393905e-05, "grad_norm": 16.90612030029297, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8603482246398926, "num_tokens": 368332178.0, "step": 9653 }, { "epoch": 1.2280880295127847, "ewc_loss": 0.027262065559625626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.726206548686605e-05, "grad_norm": 16.7724609375, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8656797409057617, "num_tokens": 368374103.0, "step": 9654 }, { "epoch": 1.2282152397913753, "ewc_loss": 0.02712148241698742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7121483071823604e-05, "grad_norm": 16.837886810302734, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8700684905052185, "num_tokens": 368411928.0, "step": 9655 }, { "epoch": 1.2283424500699656, "ewc_loss": 0.027294056490063667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.729405605350621e-05, "grad_norm": 16.816110610961914, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8435473442077637, "num_tokens": 368449200.0, "step": 9656 }, { "epoch": 1.228469660348556, "ewc_loss": 0.02717871218919754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7178712116437964e-05, "grad_norm": 16.77098274230957, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8504717350006104, "num_tokens": 368489935.0, "step": 9657 }, { "epoch": 1.2285968706271466, "ewc_loss": 0.027287505567073822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7287505872664042e-05, "grad_norm": 16.839570999145508, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8810670375823975, "num_tokens": 368522227.0, "step": 9658 }, { "epoch": 1.2287240809057371, "ewc_loss": 0.02730187214910984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7301872250973247e-05, "grad_norm": 16.881488800048828, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8667552471160889, "num_tokens": 368555084.0, "step": 9659 }, { "epoch": 1.2288512911843277, "ewc_loss": 0.027256548404693604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7256548492005095e-05, "grad_norm": 16.789081573486328, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.863991379737854, "num_tokens": 368591845.0, "step": 9660 }, { "epoch": 1.2289785014629182, "ewc_loss": 0.027242183685302734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7242183932685293e-05, "grad_norm": 16.768001556396484, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8567709922790527, "num_tokens": 368637841.0, "step": 9661 }, { "epoch": 1.2291057117415087, "ewc_loss": 0.027260614559054375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.726061393332202e-05, "grad_norm": 16.78955841064453, "learning_rate": 1e-06, "loss": 0.3587, "mean_token_accuracy": 0.8819737434387207, "num_tokens": 368677470.0, "step": 9662 }, { "epoch": 1.2292329220200993, "ewc_loss": 0.02728859707713127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.728859726630617e-05, "grad_norm": 16.854101181030273, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.8803307414054871, "num_tokens": 368718412.0, "step": 9663 }, { "epoch": 1.2293601322986898, "ewc_loss": 0.027288038283586502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.728803883655928e-05, "grad_norm": 16.80967903137207, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8561925292015076, "num_tokens": 368759267.0, "step": 9664 }, { "epoch": 1.2294873425772803, "ewc_loss": 0.027265213429927826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7265214157523587e-05, "grad_norm": 16.846664428710938, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.858301043510437, "num_tokens": 368797586.0, "step": 9665 }, { "epoch": 1.2296145528558708, "ewc_loss": 0.027261873707175255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7261874492978677e-05, "grad_norm": 16.824628829956055, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8593740463256836, "num_tokens": 368833801.0, "step": 9666 }, { "epoch": 1.2297417631344612, "ewc_loss": 0.02727012149989605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7270121790934354e-05, "grad_norm": 16.797298431396484, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8510225415229797, "num_tokens": 368878513.0, "step": 9667 }, { "epoch": 1.2298689734130517, "ewc_loss": 0.02726001851260662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.726001912378706e-05, "grad_norm": 16.86802864074707, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8596282601356506, "num_tokens": 368917815.0, "step": 9668 }, { "epoch": 1.2299961836916422, "ewc_loss": 0.027288734912872314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.728873550950084e-05, "grad_norm": 16.721555709838867, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8562077283859253, "num_tokens": 368954971.0, "step": 9669 }, { "epoch": 1.2301233939702327, "ewc_loss": 0.027219576761126518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7219577532378025e-05, "grad_norm": 16.823814392089844, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8721213340759277, "num_tokens": 368996212.0, "step": 9670 }, { "epoch": 1.2302506042488233, "ewc_loss": 0.027251478284597397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7251478968537413e-05, "grad_norm": 16.78778076171875, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8808500170707703, "num_tokens": 369032186.0, "step": 9671 }, { "epoch": 1.2303778145274138, "ewc_loss": 0.02725939080119133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7259391572442837e-05, "grad_norm": 16.795591354370117, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8711373209953308, "num_tokens": 369077565.0, "step": 9672 }, { "epoch": 1.2305050248060043, "ewc_loss": 0.027301905676722527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.730190499278251e-05, "grad_norm": 16.835275650024414, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.873587429523468, "num_tokens": 369111935.0, "step": 9673 }, { "epoch": 1.2306322350845948, "ewc_loss": 0.027218520641326904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7218520699534565e-05, "grad_norm": 16.771114349365234, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8659368753433228, "num_tokens": 369154374.0, "step": 9674 }, { "epoch": 1.2307594453631854, "ewc_loss": 0.02725384384393692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7253843654762022e-05, "grad_norm": 16.813505172729492, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8584644794464111, "num_tokens": 369198557.0, "step": 9675 }, { "epoch": 1.230886655641776, "ewc_loss": 0.027257703244686127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7257703550276347e-05, "grad_norm": 16.772449493408203, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8507505655288696, "num_tokens": 369237756.0, "step": 9676 }, { "epoch": 1.2310138659203664, "ewc_loss": 0.02721385657787323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7213856810703874e-05, "grad_norm": 16.796199798583984, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8543975353240967, "num_tokens": 369270155.0, "step": 9677 }, { "epoch": 1.231141076198957, "ewc_loss": 0.027276983484625816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.727698301896453e-05, "grad_norm": 16.797719955444336, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8591792583465576, "num_tokens": 369310870.0, "step": 9678 }, { "epoch": 1.2312682864775475, "ewc_loss": 0.027239253744482994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.723925354075618e-05, "grad_norm": 16.786354064941406, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8482484221458435, "num_tokens": 369344679.0, "step": 9679 }, { "epoch": 1.231395496756138, "ewc_loss": 0.027266204357147217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7266203687759116e-05, "grad_norm": 16.759323120117188, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8664265871047974, "num_tokens": 369380388.0, "step": 9680 }, { "epoch": 1.2315227070347283, "ewc_loss": 0.027260782197117805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7260781280347146e-05, "grad_norm": 16.74789047241211, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.879051148891449, "num_tokens": 369422327.0, "step": 9681 }, { "epoch": 1.2316499173133189, "ewc_loss": 0.027266617864370346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7266618417343125e-05, "grad_norm": 16.756643295288086, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8741209506988525, "num_tokens": 369457160.0, "step": 9682 }, { "epoch": 1.2317771275919094, "ewc_loss": 0.027291616424918175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7291616788716055e-05, "grad_norm": 16.791671752929688, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8666769862174988, "num_tokens": 369497883.0, "step": 9683 }, { "epoch": 1.2319043378705, "ewc_loss": 0.027294129133224487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7294128813082352e-05, "grad_norm": 16.746349334716797, "learning_rate": 1e-06, "loss": 0.3257, "mean_token_accuracy": 0.8961656093597412, "num_tokens": 369540014.0, "step": 9684 }, { "epoch": 1.2320315481490904, "ewc_loss": 0.02727428637444973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.727428545767907e-05, "grad_norm": 16.788314819335938, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.865416944026947, "num_tokens": 369580424.0, "step": 9685 }, { "epoch": 1.232158758427681, "ewc_loss": 0.027309967204928398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7309966753819026e-05, "grad_norm": 16.787343978881836, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8632389307022095, "num_tokens": 369623615.0, "step": 9686 }, { "epoch": 1.2322859687062715, "ewc_loss": 0.02728094719350338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7280946596874855e-05, "grad_norm": 16.727405548095703, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.855320155620575, "num_tokens": 369658408.0, "step": 9687 }, { "epoch": 1.232413178984862, "ewc_loss": 0.027241671457886696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7241670977673493e-05, "grad_norm": 16.77235221862793, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8638232350349426, "num_tokens": 369696311.0, "step": 9688 }, { "epoch": 1.2325403892634526, "ewc_loss": 0.027332914993166924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.733291512413416e-05, "grad_norm": 16.804080963134766, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8540350198745728, "num_tokens": 369737473.0, "step": 9689 }, { "epoch": 1.232667599542043, "ewc_loss": 0.027311835438013077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7311834855936468e-05, "grad_norm": 16.82809066772461, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8639522194862366, "num_tokens": 369770151.0, "step": 9690 }, { "epoch": 1.2327948098206334, "ewc_loss": 0.02732277661561966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7322776077198796e-05, "grad_norm": 16.775035858154297, "learning_rate": 1e-06, "loss": 0.3587, "mean_token_accuracy": 0.884680986404419, "num_tokens": 369813503.0, "step": 9691 }, { "epoch": 1.232922020099224, "ewc_loss": 0.027279332280158997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.727933315327391e-05, "grad_norm": 16.834877014160156, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.868607759475708, "num_tokens": 369848768.0, "step": 9692 }, { "epoch": 1.2330492303778144, "ewc_loss": 0.027280382812023163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7280382710159756e-05, "grad_norm": 16.67170524597168, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8694807291030884, "num_tokens": 369892510.0, "step": 9693 }, { "epoch": 1.233176440656405, "ewc_loss": 0.027234813198447227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7234813387622125e-05, "grad_norm": 16.80733871459961, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8741229176521301, "num_tokens": 369929638.0, "step": 9694 }, { "epoch": 1.2333036509349955, "ewc_loss": 0.027370737865567207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.737073737080209e-05, "grad_norm": 16.706205368041992, "learning_rate": 1e-06, "loss": 0.3555, "mean_token_accuracy": 0.8874890804290771, "num_tokens": 369971201.0, "step": 9695 }, { "epoch": 1.233430861213586, "ewc_loss": 0.027257001027464867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7257001420366578e-05, "grad_norm": 16.845483779907227, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8606904149055481, "num_tokens": 370011713.0, "step": 9696 }, { "epoch": 1.2335580714921766, "ewc_loss": 0.027362221851944923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.736222268140409e-05, "grad_norm": 16.712583541870117, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.846729040145874, "num_tokens": 370042647.0, "step": 9697 }, { "epoch": 1.233685281770767, "ewc_loss": 0.02721947431564331, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7219473849982023e-05, "grad_norm": 16.776729583740234, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.86549973487854, "num_tokens": 370083976.0, "step": 9698 }, { "epoch": 1.2338124920493576, "ewc_loss": 0.02737731672823429, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7377316655474715e-05, "grad_norm": 16.73996353149414, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8625919818878174, "num_tokens": 370126435.0, "step": 9699 }, { "epoch": 1.2339397023279481, "ewc_loss": 0.027290252968668938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7290252546663396e-05, "grad_norm": 16.745500564575195, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8755730390548706, "num_tokens": 370165038.0, "step": 9700 }, { "epoch": 1.2340669126065387, "ewc_loss": 0.02741030417382717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.741030402830802e-05, "grad_norm": 16.798585891723633, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.858684241771698, "num_tokens": 370203438.0, "step": 9701 }, { "epoch": 1.2341941228851292, "ewc_loss": 0.027326209470629692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.732621032919269e-05, "grad_norm": 16.80145835876465, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8819806575775146, "num_tokens": 370235144.0, "step": 9702 }, { "epoch": 1.2343213331637197, "ewc_loss": 0.0273333378136158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7333337129675783e-05, "grad_norm": 16.746097564697266, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.861414909362793, "num_tokens": 370278936.0, "step": 9703 }, { "epoch": 1.2344485434423103, "ewc_loss": 0.027298804372549057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7298803615849465e-05, "grad_norm": 16.78346061706543, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.870323896408081, "num_tokens": 370312595.0, "step": 9704 }, { "epoch": 1.2345757537209006, "ewc_loss": 0.027351828292012215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.735182897595223e-05, "grad_norm": 16.80123519897461, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8636085987091064, "num_tokens": 370351697.0, "step": 9705 }, { "epoch": 1.234702963999491, "ewc_loss": 0.027308005839586258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7308005883242004e-05, "grad_norm": 16.75809097290039, "learning_rate": 1e-06, "loss": 0.384, "mean_token_accuracy": 0.8803456425666809, "num_tokens": 370391264.0, "step": 9706 }, { "epoch": 1.2348301742780816, "ewc_loss": 0.027410166338086128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.741016578511335e-05, "grad_norm": 16.856637954711914, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8731885552406311, "num_tokens": 370425248.0, "step": 9707 }, { "epoch": 1.2349573845566721, "ewc_loss": 0.027369419112801552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.736941860348452e-05, "grad_norm": 16.846426010131836, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8458121418952942, "num_tokens": 370462670.0, "step": 9708 }, { "epoch": 1.2350845948352627, "ewc_loss": 0.027344239875674248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7344240152160637e-05, "grad_norm": 16.766521453857422, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8466231822967529, "num_tokens": 370506126.0, "step": 9709 }, { "epoch": 1.2352118051138532, "ewc_loss": 0.027322659268975258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.732265966187697e-05, "grad_norm": 16.838687896728516, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8765365481376648, "num_tokens": 370546895.0, "step": 9710 }, { "epoch": 1.2353390153924437, "ewc_loss": 0.02734302543103695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.734302506723907e-05, "grad_norm": 16.776338577270508, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.871025562286377, "num_tokens": 370590645.0, "step": 9711 }, { "epoch": 1.2354662256710343, "ewc_loss": 0.027255035936832428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7255035092821345e-05, "grad_norm": 16.75307846069336, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.869953989982605, "num_tokens": 370628614.0, "step": 9712 }, { "epoch": 1.2355934359496248, "ewc_loss": 0.027315160259604454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.731515996856615e-05, "grad_norm": 16.8170166015625, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8688672780990601, "num_tokens": 370666082.0, "step": 9713 }, { "epoch": 1.2357206462282153, "ewc_loss": 0.027323799207806587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.732379834924359e-05, "grad_norm": 16.803726196289062, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8526978492736816, "num_tokens": 370705780.0, "step": 9714 }, { "epoch": 1.2358478565068058, "ewc_loss": 0.027327410876750946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.732741086219903e-05, "grad_norm": 16.82748794555664, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8723225593566895, "num_tokens": 370742550.0, "step": 9715 }, { "epoch": 1.2359750667853961, "ewc_loss": 0.027299487963318825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.72994875558652e-05, "grad_norm": 16.76604461669922, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8705507516860962, "num_tokens": 370784406.0, "step": 9716 }, { "epoch": 1.2361022770639867, "ewc_loss": 0.027301141992211342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7301142836222425e-05, "grad_norm": 16.81050682067871, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8617235422134399, "num_tokens": 370820911.0, "step": 9717 }, { "epoch": 1.2362294873425772, "ewc_loss": 0.02729829214513302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.729829247982707e-05, "grad_norm": 16.838607788085938, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8780791759490967, "num_tokens": 370857427.0, "step": 9718 }, { "epoch": 1.2363566976211677, "ewc_loss": 0.027286970987915993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.72869710897794e-05, "grad_norm": 16.841419219970703, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8567209243774414, "num_tokens": 370897989.0, "step": 9719 }, { "epoch": 1.2364839078997583, "ewc_loss": 0.02729732170701027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7297321139485575e-05, "grad_norm": 16.900634765625, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8540566563606262, "num_tokens": 370932672.0, "step": 9720 }, { "epoch": 1.2366111181783488, "ewc_loss": 0.027288416400551796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7288417186355218e-05, "grad_norm": 16.793745040893555, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8586674928665161, "num_tokens": 370971027.0, "step": 9721 }, { "epoch": 1.2367383284569393, "ewc_loss": 0.027282778173685074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7282778319204226e-05, "grad_norm": 16.81324577331543, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8774809837341309, "num_tokens": 371014742.0, "step": 9722 }, { "epoch": 1.2368655387355298, "ewc_loss": 0.02729124389588833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.729124389588833e-05, "grad_norm": 16.92791748046875, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8508286476135254, "num_tokens": 371052065.0, "step": 9723 }, { "epoch": 1.2369927490141204, "ewc_loss": 0.02730889804661274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.730889718804974e-05, "grad_norm": 16.745628356933594, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8454326391220093, "num_tokens": 371093897.0, "step": 9724 }, { "epoch": 1.237119959292711, "ewc_loss": 0.027292579412460327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.729257903411053e-05, "grad_norm": 16.844486236572266, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8667372465133667, "num_tokens": 371132390.0, "step": 9725 }, { "epoch": 1.2372471695713014, "ewc_loss": 0.027390586212277412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7390586183173582e-05, "grad_norm": 16.800865173339844, "learning_rate": 1e-06, "loss": 0.3643, "mean_token_accuracy": 0.8793243765830994, "num_tokens": 371170468.0, "step": 9726 }, { "epoch": 1.237374379849892, "ewc_loss": 0.027308691293001175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.730869164224714e-05, "grad_norm": 16.801677703857422, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8637653589248657, "num_tokens": 371209323.0, "step": 9727 }, { "epoch": 1.2375015901284825, "ewc_loss": 0.02732819877564907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7328198484610766e-05, "grad_norm": 16.793865203857422, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8666267395019531, "num_tokens": 371246526.0, "step": 9728 }, { "epoch": 1.237628800407073, "ewc_loss": 0.02733626589179039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7336265702615492e-05, "grad_norm": 16.78877067565918, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8709930181503296, "num_tokens": 371282850.0, "step": 9729 }, { "epoch": 1.2377560106856633, "ewc_loss": 0.027347054332494736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7347054128767923e-05, "grad_norm": 16.872638702392578, "learning_rate": 1e-06, "loss": 0.3727, "mean_token_accuracy": 0.8746150732040405, "num_tokens": 371312614.0, "step": 9730 }, { "epoch": 1.2378832209642538, "ewc_loss": 0.02736959233880043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.736959322646726e-05, "grad_norm": 16.812580108642578, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8683854937553406, "num_tokens": 371350170.0, "step": 9731 }, { "epoch": 1.2380104312428444, "ewc_loss": 0.027353959158062935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7353959012543783e-05, "grad_norm": 16.792713165283203, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8732672929763794, "num_tokens": 371389295.0, "step": 9732 }, { "epoch": 1.238137641521435, "ewc_loss": 0.027369506657123566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.736950591497589e-05, "grad_norm": 16.841764450073242, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.866893470287323, "num_tokens": 371430579.0, "step": 9733 }, { "epoch": 1.2382648518000254, "ewc_loss": 0.027386179193854332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.738617877184879e-05, "grad_norm": 16.80678367614746, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8574751615524292, "num_tokens": 371465087.0, "step": 9734 }, { "epoch": 1.238392062078616, "ewc_loss": 0.02733837254345417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7338372092344798e-05, "grad_norm": 16.883718490600586, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8751847743988037, "num_tokens": 371503549.0, "step": 9735 }, { "epoch": 1.2385192723572065, "ewc_loss": 0.02732965536415577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7329655495123006e-05, "grad_norm": 16.7979793548584, "learning_rate": 1e-06, "loss": 0.3541, "mean_token_accuracy": 0.886364221572876, "num_tokens": 371542953.0, "step": 9736 }, { "epoch": 1.238646482635797, "ewc_loss": 0.027305619791150093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.730561936914455e-05, "grad_norm": 16.802108764648438, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8707537651062012, "num_tokens": 371582541.0, "step": 9737 }, { "epoch": 1.2387736929143875, "ewc_loss": 0.027351362630724907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7351363314664923e-05, "grad_norm": 16.802257537841797, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8771706819534302, "num_tokens": 371617221.0, "step": 9738 }, { "epoch": 1.238900903192978, "ewc_loss": 0.027381984516978264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7381984182284214e-05, "grad_norm": 16.84406280517578, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8703004121780396, "num_tokens": 371660773.0, "step": 9739 }, { "epoch": 1.2390281134715684, "ewc_loss": 0.027348477393388748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7348476578481495e-05, "grad_norm": 16.862821578979492, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8614988923072815, "num_tokens": 371694763.0, "step": 9740 }, { "epoch": 1.239155323750159, "ewc_loss": 0.027339225634932518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.733922519837506e-05, "grad_norm": 16.75413703918457, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8664018511772156, "num_tokens": 371734793.0, "step": 9741 }, { "epoch": 1.2392825340287494, "ewc_loss": 0.027307238429784775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7307238269713707e-05, "grad_norm": 16.79991340637207, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8624691963195801, "num_tokens": 371775691.0, "step": 9742 }, { "epoch": 1.23940974430734, "ewc_loss": 0.027340145781636238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7340145607013255e-05, "grad_norm": 16.74230194091797, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8590987920761108, "num_tokens": 371817319.0, "step": 9743 }, { "epoch": 1.2395369545859305, "ewc_loss": 0.027307188138365746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7307187338010408e-05, "grad_norm": 16.76308250427246, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8665395975112915, "num_tokens": 371847431.0, "step": 9744 }, { "epoch": 1.239664164864521, "ewc_loss": 0.027432270348072052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.743227014434524e-05, "grad_norm": 16.783790588378906, "learning_rate": 1e-06, "loss": 0.3781, "mean_token_accuracy": 0.8799389600753784, "num_tokens": 371881788.0, "step": 9745 }, { "epoch": 1.2397913751431116, "ewc_loss": 0.027358179911971092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.735817906796001e-05, "grad_norm": 16.772092819213867, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8626395463943481, "num_tokens": 371917388.0, "step": 9746 }, { "epoch": 1.239918585421702, "ewc_loss": 0.027430200949311256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7430200134404004e-05, "grad_norm": 16.81180191040039, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8763551712036133, "num_tokens": 371957561.0, "step": 9747 }, { "epoch": 1.2400457957002926, "ewc_loss": 0.027396807447075844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.739680712693371e-05, "grad_norm": 16.81683349609375, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8595026731491089, "num_tokens": 371987322.0, "step": 9748 }, { "epoch": 1.2401730059788831, "ewc_loss": 0.02739417739212513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.739417686825618e-05, "grad_norm": 16.803571701049805, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8690744638442993, "num_tokens": 372027263.0, "step": 9749 }, { "epoch": 1.2403002162574737, "ewc_loss": 0.027458971366286278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.745897108979989e-05, "grad_norm": 16.796152114868164, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8711657524108887, "num_tokens": 372061441.0, "step": 9750 }, { "epoch": 1.2404274265360642, "ewc_loss": 0.027434753254055977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.743475306488108e-05, "grad_norm": 16.758676528930664, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8729210495948792, "num_tokens": 372099085.0, "step": 9751 }, { "epoch": 1.2405546368146547, "ewc_loss": 0.027445513755083084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7445514206192456e-05, "grad_norm": 16.786483764648438, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8705880641937256, "num_tokens": 372134106.0, "step": 9752 }, { "epoch": 1.2406818470932452, "ewc_loss": 0.027489421889185905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.748942279140465e-05, "grad_norm": 16.829294204711914, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.875735342502594, "num_tokens": 372164635.0, "step": 9753 }, { "epoch": 1.2408090573718356, "ewc_loss": 0.02748784050345421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7487840270623565e-05, "grad_norm": 16.89068031311035, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8735362887382507, "num_tokens": 372204924.0, "step": 9754 }, { "epoch": 1.240936267650426, "ewc_loss": 0.027511190623044968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7511190637596883e-05, "grad_norm": 16.78522491455078, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8664811253547668, "num_tokens": 372240939.0, "step": 9755 }, { "epoch": 1.2410634779290166, "ewc_loss": 0.027465011924505234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7465011953609064e-05, "grad_norm": 16.950780868530273, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8628854751586914, "num_tokens": 372272355.0, "step": 9756 }, { "epoch": 1.2411906882076071, "ewc_loss": 0.02748926170170307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7489260901347734e-05, "grad_norm": 16.8303165435791, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8649119734764099, "num_tokens": 372309566.0, "step": 9757 }, { "epoch": 1.2413178984861977, "ewc_loss": 0.027437327429652214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7437326934887096e-05, "grad_norm": 16.93060874938965, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8613090515136719, "num_tokens": 372347768.0, "step": 9758 }, { "epoch": 1.2414451087647882, "ewc_loss": 0.027539754286408424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7539754228200763e-05, "grad_norm": 16.81222152709961, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8756923675537109, "num_tokens": 372381020.0, "step": 9759 }, { "epoch": 1.2415723190433787, "ewc_loss": 0.027437930926680565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7437930839369074e-05, "grad_norm": 16.92626953125, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8566054701805115, "num_tokens": 372422434.0, "step": 9760 }, { "epoch": 1.2416995293219693, "ewc_loss": 0.027478056028485298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.747805592662189e-05, "grad_norm": 16.740446090698242, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.858518660068512, "num_tokens": 372459342.0, "step": 9761 }, { "epoch": 1.2418267396005598, "ewc_loss": 0.02740180306136608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.740180389082525e-05, "grad_norm": 16.814163208007812, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8708806037902832, "num_tokens": 372496637.0, "step": 9762 }, { "epoch": 1.2419539498791503, "ewc_loss": 0.027541643008589745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7541642339201644e-05, "grad_norm": 16.79134750366211, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8796118497848511, "num_tokens": 372532068.0, "step": 9763 }, { "epoch": 1.2420811601577408, "ewc_loss": 0.027481073513627052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7481073630042374e-05, "grad_norm": 16.89847183227539, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8737515807151794, "num_tokens": 372565460.0, "step": 9764 }, { "epoch": 1.2422083704363311, "ewc_loss": 0.027569644153118134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.756964386207983e-05, "grad_norm": 16.812814712524414, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8707820177078247, "num_tokens": 372605998.0, "step": 9765 }, { "epoch": 1.2423355807149217, "ewc_loss": 0.02748185209929943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7481852157507092e-05, "grad_norm": 16.823211669921875, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8575377464294434, "num_tokens": 372642038.0, "step": 9766 }, { "epoch": 1.2424627909935122, "ewc_loss": 0.02744731679558754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.744731682469137e-05, "grad_norm": 16.758922576904297, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.864149272441864, "num_tokens": 372678192.0, "step": 9767 }, { "epoch": 1.2425900012721027, "ewc_loss": 0.027493013069033623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.749301347648725e-05, "grad_norm": 16.810386657714844, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8640071153640747, "num_tokens": 372712867.0, "step": 9768 }, { "epoch": 1.2427172115506933, "ewc_loss": 0.027491511777043343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.749151099123992e-05, "grad_norm": 16.812055587768555, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8496770262718201, "num_tokens": 372745790.0, "step": 9769 }, { "epoch": 1.2428444218292838, "ewc_loss": 0.02751343324780464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7513433451531455e-05, "grad_norm": 16.868282318115234, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8791183233261108, "num_tokens": 372777634.0, "step": 9770 }, { "epoch": 1.2429716321078743, "ewc_loss": 0.02749745175242424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.74974518106319e-05, "grad_norm": 16.7587833404541, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.852676510810852, "num_tokens": 372820527.0, "step": 9771 }, { "epoch": 1.2430988423864648, "ewc_loss": 0.027483133599162102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7483132726047188e-05, "grad_norm": 16.846818923950195, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8770021200180054, "num_tokens": 372858479.0, "step": 9772 }, { "epoch": 1.2432260526650554, "ewc_loss": 0.027590658515691757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7590658646658994e-05, "grad_norm": 16.89829444885254, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.873568058013916, "num_tokens": 372895219.0, "step": 9773 }, { "epoch": 1.243353262943646, "ewc_loss": 0.02746991068124771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7469910492072813e-05, "grad_norm": 16.79434585571289, "learning_rate": 1e-06, "loss": 0.3724, "mean_token_accuracy": 0.8792561292648315, "num_tokens": 372934491.0, "step": 9774 }, { "epoch": 1.2434804732222364, "ewc_loss": 0.02746395766735077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7463956939755008e-05, "grad_norm": 16.796266555786133, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8581883311271667, "num_tokens": 372972892.0, "step": 9775 }, { "epoch": 1.243607683500827, "ewc_loss": 0.027547182515263557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7547182980924845e-05, "grad_norm": 16.89740562438965, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8706780672073364, "num_tokens": 373011564.0, "step": 9776 }, { "epoch": 1.2437348937794175, "ewc_loss": 0.027519591152668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7519590730662458e-05, "grad_norm": 16.838430404663086, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8693256974220276, "num_tokens": 373049444.0, "step": 9777 }, { "epoch": 1.243862104058008, "ewc_loss": 0.027473146095871925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.747314647422172e-05, "grad_norm": 16.830089569091797, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8668196201324463, "num_tokens": 373085466.0, "step": 9778 }, { "epoch": 1.2439893143365983, "ewc_loss": 0.027463024482131004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.746302379819099e-05, "grad_norm": 16.791950225830078, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8452909588813782, "num_tokens": 373124712.0, "step": 9779 }, { "epoch": 1.2441165246151888, "ewc_loss": 0.027455836534500122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.745583697105758e-05, "grad_norm": 16.818946838378906, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8727979063987732, "num_tokens": 373160008.0, "step": 9780 }, { "epoch": 1.2442437348937794, "ewc_loss": 0.027464281767606735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7464282538858242e-05, "grad_norm": 16.795875549316406, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8672378659248352, "num_tokens": 373192110.0, "step": 9781 }, { "epoch": 1.24437094517237, "ewc_loss": 0.02744598127901554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7445981686469167e-05, "grad_norm": 16.825572967529297, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8689168095588684, "num_tokens": 373234432.0, "step": 9782 }, { "epoch": 1.2444981554509604, "ewc_loss": 0.02750164270401001, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.750164276221767e-05, "grad_norm": 16.821826934814453, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8581393361091614, "num_tokens": 373270725.0, "step": 9783 }, { "epoch": 1.244625365729551, "ewc_loss": 0.027425047010183334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7425046937423758e-05, "grad_norm": 16.80678939819336, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8458433747291565, "num_tokens": 373307531.0, "step": 9784 }, { "epoch": 1.2447525760081415, "ewc_loss": 0.027506409212946892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.750640851445496e-05, "grad_norm": 16.827392578125, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.87114417552948, "num_tokens": 373351657.0, "step": 9785 }, { "epoch": 1.244879786286732, "ewc_loss": 0.027478402480483055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.747840335359797e-05, "grad_norm": 16.868770599365234, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8723688721656799, "num_tokens": 373388811.0, "step": 9786 }, { "epoch": 1.2450069965653225, "ewc_loss": 0.027528757229447365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7528756618266925e-05, "grad_norm": 16.90849494934082, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8762993812561035, "num_tokens": 373423378.0, "step": 9787 }, { "epoch": 1.245134206843913, "ewc_loss": 0.027403537184000015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.740353738772683e-05, "grad_norm": 16.738807678222656, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8689230680465698, "num_tokens": 373460982.0, "step": 9788 }, { "epoch": 1.2452614171225034, "ewc_loss": 0.027489308267831802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7489308195072226e-05, "grad_norm": 16.86087989807129, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.874017059803009, "num_tokens": 373501474.0, "step": 9789 }, { "epoch": 1.245388627401094, "ewc_loss": 0.02746869996190071, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7468700864119455e-05, "grad_norm": 16.798980712890625, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8723578453063965, "num_tokens": 373537830.0, "step": 9790 }, { "epoch": 1.2455158376796844, "ewc_loss": 0.02738572284579277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.73857222055085e-05, "grad_norm": 16.811586380004883, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8607749938964844, "num_tokens": 373573975.0, "step": 9791 }, { "epoch": 1.245643047958275, "ewc_loss": 0.0274613369256258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.74613375950139e-05, "grad_norm": 16.814041137695312, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.847309410572052, "num_tokens": 373612157.0, "step": 9792 }, { "epoch": 1.2457702582368655, "ewc_loss": 0.02753228321671486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.75322836387204e-05, "grad_norm": 16.861249923706055, "learning_rate": 1e-06, "loss": 0.3483, "mean_token_accuracy": 0.8867063522338867, "num_tokens": 373645065.0, "step": 9793 }, { "epoch": 1.245897468515456, "ewc_loss": 0.027475781738758087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.747578218986746e-05, "grad_norm": 16.816370010375977, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8619678020477295, "num_tokens": 373687658.0, "step": 9794 }, { "epoch": 1.2460246787940465, "ewc_loss": 0.027438761666417122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7438762117526494e-05, "grad_norm": 16.813037872314453, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8692122101783752, "num_tokens": 373732262.0, "step": 9795 }, { "epoch": 1.246151889072637, "ewc_loss": 0.02743072807788849, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7430727641331032e-05, "grad_norm": 16.808732986450195, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8477464914321899, "num_tokens": 373772410.0, "step": 9796 }, { "epoch": 1.2462790993512276, "ewc_loss": 0.02746265009045601, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7462650905363262e-05, "grad_norm": 16.859312057495117, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.8675899505615234, "num_tokens": 373808773.0, "step": 9797 }, { "epoch": 1.2464063096298181, "ewc_loss": 0.027428293600678444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7428293833509088e-05, "grad_norm": 16.726232528686523, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8622154593467712, "num_tokens": 373848945.0, "step": 9798 }, { "epoch": 1.2465335199084087, "ewc_loss": 0.027447910979390144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.744791163422633e-05, "grad_norm": 16.83724594116211, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8622963428497314, "num_tokens": 373892716.0, "step": 9799 }, { "epoch": 1.2466607301869992, "ewc_loss": 0.027490336447954178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.749033592408523e-05, "grad_norm": 16.77976417541504, "learning_rate": 1e-06, "loss": 0.3581, "mean_token_accuracy": 0.8827857375144958, "num_tokens": 373929443.0, "step": 9800 }, { "epoch": 1.2467879404655897, "ewc_loss": 0.027468234300613403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7468235202832147e-05, "grad_norm": 16.84688377380371, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8680209517478943, "num_tokens": 373968882.0, "step": 9801 }, { "epoch": 1.2469151507441802, "ewc_loss": 0.027555938810110092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7555939595913514e-05, "grad_norm": 16.84164810180664, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8797124624252319, "num_tokens": 374009265.0, "step": 9802 }, { "epoch": 1.2470423610227706, "ewc_loss": 0.02741045504808426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7410455004428513e-05, "grad_norm": 16.766498565673828, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.852871298789978, "num_tokens": 374047152.0, "step": 9803 }, { "epoch": 1.247169571301361, "ewc_loss": 0.0275062695145607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.750627027126029e-05, "grad_norm": 16.887283325195312, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.863531231880188, "num_tokens": 374083127.0, "step": 9804 }, { "epoch": 1.2472967815799516, "ewc_loss": 0.027498750016093254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7498750569066033e-05, "grad_norm": 16.79173469543457, "learning_rate": 1e-06, "loss": 0.3798, "mean_token_accuracy": 0.8761146068572998, "num_tokens": 374127086.0, "step": 9805 }, { "epoch": 1.2474239918585421, "ewc_loss": 0.02743247151374817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7432472052169032e-05, "grad_norm": 16.85538673400879, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8622122406959534, "num_tokens": 374163879.0, "step": 9806 }, { "epoch": 1.2475512021371327, "ewc_loss": 0.027515731751918793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7515732654137537e-05, "grad_norm": 16.840166091918945, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8718208074569702, "num_tokens": 374206639.0, "step": 9807 }, { "epoch": 1.2476784124157232, "ewc_loss": 0.027421535924077034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7421536287874915e-05, "grad_norm": 16.770431518554688, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8649975061416626, "num_tokens": 374244106.0, "step": 9808 }, { "epoch": 1.2478056226943137, "ewc_loss": 0.027449024841189384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.74490248557413e-05, "grad_norm": 16.786888122558594, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8647798299789429, "num_tokens": 374285776.0, "step": 9809 }, { "epoch": 1.2479328329729042, "ewc_loss": 0.027491463348269463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7491463697515428e-05, "grad_norm": 16.813610076904297, "learning_rate": 1e-06, "loss": 0.3771, "mean_token_accuracy": 0.8774354457855225, "num_tokens": 374323036.0, "step": 9810 }, { "epoch": 1.2480600432514948, "ewc_loss": 0.027434613555669785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7434613002697006e-05, "grad_norm": 16.805557250976562, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8742778897285461, "num_tokens": 374360204.0, "step": 9811 }, { "epoch": 1.2481872535300853, "ewc_loss": 0.02746012434363365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7460124329081737e-05, "grad_norm": 16.87932014465332, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8762955665588379, "num_tokens": 374394961.0, "step": 9812 }, { "epoch": 1.2483144638086758, "ewc_loss": 0.02744945138692856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7449452318251133e-05, "grad_norm": 16.8528995513916, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8697179555892944, "num_tokens": 374436142.0, "step": 9813 }, { "epoch": 1.2484416740872661, "ewc_loss": 0.027414239943027496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7414240321377292e-05, "grad_norm": 16.824687957763672, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8641071915626526, "num_tokens": 374474263.0, "step": 9814 }, { "epoch": 1.2485688843658567, "ewc_loss": 0.027395199984312057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7395199140300974e-05, "grad_norm": 16.879985809326172, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8619987964630127, "num_tokens": 374510250.0, "step": 9815 }, { "epoch": 1.2486960946444472, "ewc_loss": 0.027450447902083397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7450447305454873e-05, "grad_norm": 16.86894416809082, "learning_rate": 1e-06, "loss": 0.3548, "mean_token_accuracy": 0.8856630325317383, "num_tokens": 374549579.0, "step": 9816 }, { "epoch": 1.2488233049230377, "ewc_loss": 0.027391089126467705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.739108822424896e-05, "grad_norm": 16.852313995361328, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8656712770462036, "num_tokens": 374588631.0, "step": 9817 }, { "epoch": 1.2489505152016283, "ewc_loss": 0.02741076983511448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7410769689595327e-05, "grad_norm": 16.815458297729492, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8637312650680542, "num_tokens": 374627092.0, "step": 9818 }, { "epoch": 1.2490777254802188, "ewc_loss": 0.027400178834795952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7400179533287883e-05, "grad_norm": 16.929183959960938, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8723686933517456, "num_tokens": 374657797.0, "step": 9819 }, { "epoch": 1.2492049357588093, "ewc_loss": 0.027447937056422234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.744793710007798e-05, "grad_norm": 16.884601593017578, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8719290494918823, "num_tokens": 374694011.0, "step": 9820 }, { "epoch": 1.2493321460373998, "ewc_loss": 0.027354594320058823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.735459383984562e-05, "grad_norm": 16.78194236755371, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8693504333496094, "num_tokens": 374735127.0, "step": 9821 }, { "epoch": 1.2494593563159904, "ewc_loss": 0.027373570948839188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7373571356292814e-05, "grad_norm": 16.81463623046875, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8648185729980469, "num_tokens": 374774170.0, "step": 9822 }, { "epoch": 1.249586566594581, "ewc_loss": 0.0274500735104084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7450072593637742e-05, "grad_norm": 16.928543090820312, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8681461215019226, "num_tokens": 374809162.0, "step": 9823 }, { "epoch": 1.2497137768731714, "ewc_loss": 0.027434706687927246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.743470759014599e-05, "grad_norm": 16.783756256103516, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8597491383552551, "num_tokens": 374847974.0, "step": 9824 }, { "epoch": 1.249840987151762, "ewc_loss": 0.027367398142814636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.736739770625718e-05, "grad_norm": 16.816614151000977, "learning_rate": 1e-06, "loss": 0.3593, "mean_token_accuracy": 0.8840430974960327, "num_tokens": 374894911.0, "step": 9825 }, { "epoch": 1.2499681974303525, "ewc_loss": 0.027480099350214005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7480098651722074e-05, "grad_norm": 16.84315299987793, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.861886739730835, "num_tokens": 374933534.0, "step": 9826 }, { "epoch": 1.250095407708943, "ewc_loss": 0.027418704703450203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7418704121373594e-05, "grad_norm": 16.85466766357422, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8552330732345581, "num_tokens": 374968183.0, "step": 9827 }, { "epoch": 1.2502226179875333, "ewc_loss": 0.027406001463532448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7406002118368633e-05, "grad_norm": 16.801856994628906, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8604355454444885, "num_tokens": 375005973.0, "step": 9828 }, { "epoch": 1.2503498282661238, "ewc_loss": 0.02737865410745144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.737865361268632e-05, "grad_norm": 16.825862884521484, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8620590567588806, "num_tokens": 375043446.0, "step": 9829 }, { "epoch": 1.2504770385447144, "ewc_loss": 0.0274409968405962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7440997655503452e-05, "grad_norm": 16.789770126342773, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8707951903343201, "num_tokens": 375081518.0, "step": 9830 }, { "epoch": 1.250604248823305, "ewc_loss": 0.027448687702417374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7448688342701644e-05, "grad_norm": 16.836471557617188, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.860673725605011, "num_tokens": 375117767.0, "step": 9831 }, { "epoch": 1.2507314591018954, "ewc_loss": 0.02743283472955227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7432834031060338e-05, "grad_norm": 16.79201316833496, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8436552286148071, "num_tokens": 375152061.0, "step": 9832 }, { "epoch": 1.250858669380486, "ewc_loss": 0.027483629062771797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7483629310154356e-05, "grad_norm": 16.81285858154297, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8608697652816772, "num_tokens": 375195677.0, "step": 9833 }, { "epoch": 1.2509858796590765, "ewc_loss": 0.02747063711285591, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7470636268844828e-05, "grad_norm": 16.74762535095215, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8604466915130615, "num_tokens": 375230319.0, "step": 9834 }, { "epoch": 1.251113089937667, "ewc_loss": 0.027510683983564377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7510683139553294e-05, "grad_norm": 16.848064422607422, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8716935515403748, "num_tokens": 375268970.0, "step": 9835 }, { "epoch": 1.2512403002162575, "ewc_loss": 0.02757730521261692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7577305445447564e-05, "grad_norm": 16.819393157958984, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8742176294326782, "num_tokens": 375303661.0, "step": 9836 }, { "epoch": 1.2513675104948478, "ewc_loss": 0.02747519500553608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7475194656290114e-05, "grad_norm": 16.8824405670166, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.8747684955596924, "num_tokens": 375343278.0, "step": 9837 }, { "epoch": 1.2514947207734384, "ewc_loss": 0.027579186484217644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.757918628049083e-05, "grad_norm": 16.839786529541016, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8731203079223633, "num_tokens": 375378574.0, "step": 9838 }, { "epoch": 1.251621931052029, "ewc_loss": 0.027538981288671494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7538981157704256e-05, "grad_norm": 16.897075653076172, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8624191284179688, "num_tokens": 375412781.0, "step": 9839 }, { "epoch": 1.2517491413306194, "ewc_loss": 0.02755717560648918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7557174689718522e-05, "grad_norm": 16.8247127532959, "learning_rate": 1e-06, "loss": 0.3724, "mean_token_accuracy": 0.8791612386703491, "num_tokens": 375450431.0, "step": 9840 }, { "epoch": 1.25187635160921, "ewc_loss": 0.027508344501256943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7508343919180334e-05, "grad_norm": 16.794416427612305, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8614450097084045, "num_tokens": 375490692.0, "step": 9841 }, { "epoch": 1.2520035618878005, "ewc_loss": 0.027569221332669258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7569221856538206e-05, "grad_norm": 16.842796325683594, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8725184202194214, "num_tokens": 375535278.0, "step": 9842 }, { "epoch": 1.252130772166391, "ewc_loss": 0.02751936763525009, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.751936699496582e-05, "grad_norm": 16.83901596069336, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8609554767608643, "num_tokens": 375575859.0, "step": 9843 }, { "epoch": 1.2522579824449815, "ewc_loss": 0.027547042816877365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7547042918740772e-05, "grad_norm": 16.828760147094727, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8598069548606873, "num_tokens": 375611084.0, "step": 9844 }, { "epoch": 1.252385192723572, "ewc_loss": 0.02754451334476471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.754451270448044e-05, "grad_norm": 16.852108001708984, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.877666711807251, "num_tokens": 375644926.0, "step": 9845 }, { "epoch": 1.2525124030021626, "ewc_loss": 0.027570370584726334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7570371457841247e-05, "grad_norm": 16.8642635345459, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8756215572357178, "num_tokens": 375686970.0, "step": 9846 }, { "epoch": 1.2526396132807531, "ewc_loss": 0.027548789978027344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7548789148568176e-05, "grad_norm": 16.813873291015625, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8789341449737549, "num_tokens": 375727372.0, "step": 9847 }, { "epoch": 1.2527668235593437, "ewc_loss": 0.027563514187932014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.756351386778988e-05, "grad_norm": 16.880613327026367, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8561961650848389, "num_tokens": 375763568.0, "step": 9848 }, { "epoch": 1.2528940338379342, "ewc_loss": 0.027469441294670105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.74694411928067e-05, "grad_norm": 16.815174102783203, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8511325716972351, "num_tokens": 375797954.0, "step": 9849 }, { "epoch": 1.2530212441165247, "ewc_loss": 0.027543023228645325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7543022952158935e-05, "grad_norm": 16.91653823852539, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8652564883232117, "num_tokens": 375836362.0, "step": 9850 }, { "epoch": 1.2531484543951152, "ewc_loss": 0.027572741732001305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7572741601034068e-05, "grad_norm": 16.8230037689209, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8620603084564209, "num_tokens": 375879403.0, "step": 9851 }, { "epoch": 1.2532756646737058, "ewc_loss": 0.027476290240883827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.747628968791105e-05, "grad_norm": 16.815166473388672, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.870111346244812, "num_tokens": 375916728.0, "step": 9852 }, { "epoch": 1.253402874952296, "ewc_loss": 0.02756449021399021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7564490665099584e-05, "grad_norm": 16.87868881225586, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8617743849754333, "num_tokens": 375951743.0, "step": 9853 }, { "epoch": 1.2535300852308866, "ewc_loss": 0.027541007846593857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7541007511899807e-05, "grad_norm": 16.851118087768555, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8424227833747864, "num_tokens": 375988872.0, "step": 9854 }, { "epoch": 1.2536572955094771, "ewc_loss": 0.027567215263843536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7567215511226095e-05, "grad_norm": 16.912395477294922, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8564320802688599, "num_tokens": 376030388.0, "step": 9855 }, { "epoch": 1.2537845057880677, "ewc_loss": 0.027583885937929153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7583886549109593e-05, "grad_norm": 16.863134384155273, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8758708238601685, "num_tokens": 376062975.0, "step": 9856 }, { "epoch": 1.2539117160666582, "ewc_loss": 0.02755831368267536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7558313377085142e-05, "grad_norm": 16.904253005981445, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8602396249771118, "num_tokens": 376101873.0, "step": 9857 }, { "epoch": 1.2540389263452487, "ewc_loss": 0.02753940224647522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.753940316324588e-05, "grad_norm": 16.841169357299805, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8467888236045837, "num_tokens": 376138774.0, "step": 9858 }, { "epoch": 1.2541661366238392, "ewc_loss": 0.027583710849285126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.758371010713745e-05, "grad_norm": 16.970008850097656, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8587142825126648, "num_tokens": 376172357.0, "step": 9859 }, { "epoch": 1.2542933469024298, "ewc_loss": 0.02755836583673954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7558366127777845e-05, "grad_norm": 16.813663482666016, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8568810224533081, "num_tokens": 376212977.0, "step": 9860 }, { "epoch": 1.2544205571810203, "ewc_loss": 0.027491804212331772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.749180384853389e-05, "grad_norm": 16.91263198852539, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8574932813644409, "num_tokens": 376256989.0, "step": 9861 }, { "epoch": 1.2545477674596106, "ewc_loss": 0.0275623369961977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7562336981645785e-05, "grad_norm": 16.82135009765625, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8710129857063293, "num_tokens": 376300130.0, "step": 9862 }, { "epoch": 1.2546749777382011, "ewc_loss": 0.0274594034999609, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7459404009277932e-05, "grad_norm": 16.856950759887695, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8701013922691345, "num_tokens": 376337475.0, "step": 9863 }, { "epoch": 1.2548021880167917, "ewc_loss": 0.027590284124016762, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7590283934841864e-05, "grad_norm": 16.91231918334961, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8577311038970947, "num_tokens": 376376485.0, "step": 9864 }, { "epoch": 1.2549293982953822, "ewc_loss": 0.027499740943312645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7499740099301562e-05, "grad_norm": 16.858247756958008, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8724817633628845, "num_tokens": 376413393.0, "step": 9865 }, { "epoch": 1.2550566085739727, "ewc_loss": 0.027544628828763962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7544629119802266e-05, "grad_norm": 16.906919479370117, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8574520349502563, "num_tokens": 376448192.0, "step": 9866 }, { "epoch": 1.2551838188525632, "ewc_loss": 0.02753792703151703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.75379261438502e-05, "grad_norm": 16.803712844848633, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8704354763031006, "num_tokens": 376489382.0, "step": 9867 }, { "epoch": 1.2553110291311538, "ewc_loss": 0.027520010247826576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7520010917214677e-05, "grad_norm": 16.8687744140625, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8713629245758057, "num_tokens": 376527221.0, "step": 9868 }, { "epoch": 1.2554382394097443, "ewc_loss": 0.027567101642489433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7567100914893672e-05, "grad_norm": 16.867116928100586, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8628778457641602, "num_tokens": 376567045.0, "step": 9869 }, { "epoch": 1.2555654496883348, "ewc_loss": 0.027536530047655106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7536529159988277e-05, "grad_norm": 16.885099411010742, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8739728331565857, "num_tokens": 376607039.0, "step": 9870 }, { "epoch": 1.2556926599669254, "ewc_loss": 0.027521127834916115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7521127776708454e-05, "grad_norm": 16.878446578979492, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8690186142921448, "num_tokens": 376643220.0, "step": 9871 }, { "epoch": 1.255819870245516, "ewc_loss": 0.02752845175564289, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.752845102804713e-05, "grad_norm": 16.858905792236328, "learning_rate": 1e-06, "loss": 0.3557, "mean_token_accuracy": 0.8860301375389099, "num_tokens": 376681417.0, "step": 9872 }, { "epoch": 1.2559470805241064, "ewc_loss": 0.027504559606313705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7504560421220958e-05, "grad_norm": 16.827293395996094, "learning_rate": 1e-06, "loss": 0.356, "mean_token_accuracy": 0.8847205638885498, "num_tokens": 376722836.0, "step": 9873 }, { "epoch": 1.256074290802697, "ewc_loss": 0.027522478252649307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7522477466845885e-05, "grad_norm": 16.858396530151367, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8596709370613098, "num_tokens": 376763194.0, "step": 9874 }, { "epoch": 1.2562015010812875, "ewc_loss": 0.027492327615618706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7492327717482112e-05, "grad_norm": 16.895490646362305, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8618556261062622, "num_tokens": 376803701.0, "step": 9875 }, { "epoch": 1.256328711359878, "ewc_loss": 0.027496278285980225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7496278562466614e-05, "grad_norm": 16.90610122680664, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8761395215988159, "num_tokens": 376835296.0, "step": 9876 }, { "epoch": 1.2564559216384683, "ewc_loss": 0.027464451268315315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7464451704872772e-05, "grad_norm": 16.91150665283203, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8707250356674194, "num_tokens": 376876317.0, "step": 9877 }, { "epoch": 1.2565831319170588, "ewc_loss": 0.027465971186757088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7465970561024733e-05, "grad_norm": 16.890926361083984, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8659281134605408, "num_tokens": 376916403.0, "step": 9878 }, { "epoch": 1.2567103421956494, "ewc_loss": 0.027426159009337425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7426158339949325e-05, "grad_norm": 16.895362854003906, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8764184713363647, "num_tokens": 376956079.0, "step": 9879 }, { "epoch": 1.25683755247424, "ewc_loss": 0.027362031862139702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.736203168751672e-05, "grad_norm": 16.776348114013672, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8576942682266235, "num_tokens": 376991618.0, "step": 9880 }, { "epoch": 1.2569647627528304, "ewc_loss": 0.027424005791544914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.742400647548493e-05, "grad_norm": 16.871614456176758, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8740898370742798, "num_tokens": 377033964.0, "step": 9881 }, { "epoch": 1.257091973031421, "ewc_loss": 0.027468785643577576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7468786356621422e-05, "grad_norm": 16.832189559936523, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8744729161262512, "num_tokens": 377070091.0, "step": 9882 }, { "epoch": 1.2572191833100115, "ewc_loss": 0.02741994336247444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.741994285315741e-05, "grad_norm": 16.864084243774414, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8750876188278198, "num_tokens": 377111185.0, "step": 9883 }, { "epoch": 1.257346393588602, "ewc_loss": 0.02746705524623394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7467054678709246e-05, "grad_norm": 16.771648406982422, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.872268795967102, "num_tokens": 377147637.0, "step": 9884 }, { "epoch": 1.2574736038671925, "ewc_loss": 0.027431070804595947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7431071430328302e-05, "grad_norm": 16.90702247619629, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8667043447494507, "num_tokens": 377181918.0, "step": 9885 }, { "epoch": 1.2576008141457828, "ewc_loss": 0.027523012831807137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7523012249730527e-05, "grad_norm": 16.87869644165039, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8616200089454651, "num_tokens": 377222900.0, "step": 9886 }, { "epoch": 1.2577280244243734, "ewc_loss": 0.027433861047029495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.743386176007334e-05, "grad_norm": 16.88681411743164, "learning_rate": 1e-06, "loss": 0.3655, "mean_token_accuracy": 0.8819916248321533, "num_tokens": 377263107.0, "step": 9887 }, { "epoch": 1.257855234702964, "ewc_loss": 0.027495747432112694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.749574741756078e-05, "grad_norm": 16.840011596679688, "learning_rate": 1e-06, "loss": 0.3649, "mean_token_accuracy": 0.8828097581863403, "num_tokens": 377301384.0, "step": 9888 }, { "epoch": 1.2579824449815544, "ewc_loss": 0.027415307238698006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7415308068157174e-05, "grad_norm": 16.84135627746582, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8608659505844116, "num_tokens": 377339865.0, "step": 9889 }, { "epoch": 1.258109655260145, "ewc_loss": 0.02747880108654499, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7478801712277345e-05, "grad_norm": 16.86445426940918, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8759742975234985, "num_tokens": 377381582.0, "step": 9890 }, { "epoch": 1.2582368655387355, "ewc_loss": 0.027466343715786934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.746634345385246e-05, "grad_norm": 16.864376068115234, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8437018394470215, "num_tokens": 377425548.0, "step": 9891 }, { "epoch": 1.258364075817326, "ewc_loss": 0.027448620647192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7448621040093713e-05, "grad_norm": 16.866233825683594, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8661832213401794, "num_tokens": 377462298.0, "step": 9892 }, { "epoch": 1.2584912860959165, "ewc_loss": 0.02746138721704483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7461386707727797e-05, "grad_norm": 16.86675453186035, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8590213060379028, "num_tokens": 377503873.0, "step": 9893 }, { "epoch": 1.258618496374507, "ewc_loss": 0.027418024837970734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.741802563832607e-05, "grad_norm": 16.76553726196289, "learning_rate": 1e-06, "loss": 0.3623, "mean_token_accuracy": 0.8821467161178589, "num_tokens": 377540497.0, "step": 9894 }, { "epoch": 1.2587457066530976, "ewc_loss": 0.027442265301942825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7442265491117723e-05, "grad_norm": 16.903827667236328, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8675812482833862, "num_tokens": 377582242.0, "step": 9895 }, { "epoch": 1.2588729169316881, "ewc_loss": 0.027458271011710167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7458270778879523e-05, "grad_norm": 16.789520263671875, "learning_rate": 1e-06, "loss": 0.3653, "mean_token_accuracy": 0.8827049136161804, "num_tokens": 377614972.0, "step": 9896 }, { "epoch": 1.2590001272102787, "ewc_loss": 0.027445169165730476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7445168598205782e-05, "grad_norm": 16.977352142333984, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.864176869392395, "num_tokens": 377645392.0, "step": 9897 }, { "epoch": 1.2591273374888692, "ewc_loss": 0.02750769630074501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.750769635895267e-05, "grad_norm": 16.79302215576172, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8748648166656494, "num_tokens": 377680317.0, "step": 9898 }, { "epoch": 1.2592545477674597, "ewc_loss": 0.027423763647675514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.742376454989426e-05, "grad_norm": 16.898073196411133, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8663420677185059, "num_tokens": 377714879.0, "step": 9899 }, { "epoch": 1.2593817580460502, "ewc_loss": 0.02751537412405014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.751537431322504e-05, "grad_norm": 16.873136520385742, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.85231614112854, "num_tokens": 377751404.0, "step": 9900 }, { "epoch": 1.2595089683246408, "ewc_loss": 0.027475418522953987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.747541839198675e-05, "grad_norm": 16.84756851196289, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8709251880645752, "num_tokens": 377788354.0, "step": 9901 }, { "epoch": 1.259636178603231, "ewc_loss": 0.027510885149240494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7510885047377087e-05, "grad_norm": 16.91253089904785, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.856846809387207, "num_tokens": 377827259.0, "step": 9902 }, { "epoch": 1.2597633888818216, "ewc_loss": 0.027502765879034996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.750276507867966e-05, "grad_norm": 16.85956573486328, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.858196496963501, "num_tokens": 377872477.0, "step": 9903 }, { "epoch": 1.2598905991604121, "ewc_loss": 0.027509402483701706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7509402571013197e-05, "grad_norm": 16.91897964477539, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8608489036560059, "num_tokens": 377911159.0, "step": 9904 }, { "epoch": 1.2600178094390027, "ewc_loss": 0.02754470892250538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7544709155336022e-05, "grad_norm": 16.876161575317383, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8707853555679321, "num_tokens": 377951919.0, "step": 9905 }, { "epoch": 1.2601450197175932, "ewc_loss": 0.02750384248793125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7503841920406558e-05, "grad_norm": 16.938573837280273, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8488339185714722, "num_tokens": 377997567.0, "step": 9906 }, { "epoch": 1.2602722299961837, "ewc_loss": 0.027520328760147095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7520329240360297e-05, "grad_norm": 16.82610511779785, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8659476041793823, "num_tokens": 378033811.0, "step": 9907 }, { "epoch": 1.2603994402747742, "ewc_loss": 0.02747694030404091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7476940886117518e-05, "grad_norm": 16.867263793945312, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8549181222915649, "num_tokens": 378070136.0, "step": 9908 }, { "epoch": 1.2605266505533648, "ewc_loss": 0.02754669450223446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.754669367277529e-05, "grad_norm": 16.913951873779297, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8643292784690857, "num_tokens": 378106902.0, "step": 9909 }, { "epoch": 1.2606538608319553, "ewc_loss": 0.027520600706338882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7520600269781426e-05, "grad_norm": 16.867630004882812, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8714981079101562, "num_tokens": 378144638.0, "step": 9910 }, { "epoch": 1.2607810711105456, "ewc_loss": 0.027492627501487732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7492627850733697e-05, "grad_norm": 16.86933708190918, "learning_rate": 1e-06, "loss": 0.3774, "mean_token_accuracy": 0.8759140968322754, "num_tokens": 378179059.0, "step": 9911 }, { "epoch": 1.2609082813891361, "ewc_loss": 0.027548590674996376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.754859087872319e-05, "grad_norm": 16.919158935546875, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8545846343040466, "num_tokens": 378210346.0, "step": 9912 }, { "epoch": 1.2610354916677267, "ewc_loss": 0.027533240616321564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7533240427146666e-05, "grad_norm": 16.821706771850586, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8729211091995239, "num_tokens": 378242687.0, "step": 9913 }, { "epoch": 1.2611627019463172, "ewc_loss": 0.027568142861127853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7568143195821904e-05, "grad_norm": 16.956134796142578, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8727242946624756, "num_tokens": 378279421.0, "step": 9914 }, { "epoch": 1.2612899122249077, "ewc_loss": 0.02762339636683464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7623396817944013e-05, "grad_norm": 16.90108299255371, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8619576692581177, "num_tokens": 378318133.0, "step": 9915 }, { "epoch": 1.2614171225034982, "ewc_loss": 0.02752665989100933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7526659323484637e-05, "grad_norm": 16.890268325805664, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.849392831325531, "num_tokens": 378362987.0, "step": 9916 }, { "epoch": 1.2615443327820888, "ewc_loss": 0.02758650667965412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.75865058938507e-05, "grad_norm": 16.85501480102539, "learning_rate": 1e-06, "loss": 0.3588, "mean_token_accuracy": 0.8837286829948425, "num_tokens": 378396197.0, "step": 9917 }, { "epoch": 1.2616715430606793, "ewc_loss": 0.027556534856557846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7556534405448474e-05, "grad_norm": 16.90941619873047, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.875938892364502, "num_tokens": 378433691.0, "step": 9918 }, { "epoch": 1.2617987533392698, "ewc_loss": 0.027589671313762665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.758967093541287e-05, "grad_norm": 16.8726749420166, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8705588579177856, "num_tokens": 378470277.0, "step": 9919 }, { "epoch": 1.2619259636178604, "ewc_loss": 0.027555391192436218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7555390261113644e-05, "grad_norm": 16.856887817382812, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8609745502471924, "num_tokens": 378510500.0, "step": 9920 }, { "epoch": 1.2620531738964509, "ewc_loss": 0.02758617140352726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7586171199800447e-05, "grad_norm": 16.9057674407959, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8653731346130371, "num_tokens": 378552893.0, "step": 9921 }, { "epoch": 1.2621803841750414, "ewc_loss": 0.027572082355618477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7572083126869984e-05, "grad_norm": 16.887657165527344, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.873824954032898, "num_tokens": 378590917.0, "step": 9922 }, { "epoch": 1.262307594453632, "ewc_loss": 0.027567058801651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.756705907813739e-05, "grad_norm": 16.892648696899414, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.875462532043457, "num_tokens": 378637764.0, "step": 9923 }, { "epoch": 1.2624348047322225, "ewc_loss": 0.027543405070900917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.754340493993368e-05, "grad_norm": 16.81177520751953, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8641675710678101, "num_tokens": 378675497.0, "step": 9924 }, { "epoch": 1.262562015010813, "ewc_loss": 0.027505580335855484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7505580874276347e-05, "grad_norm": 16.91864776611328, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8585044145584106, "num_tokens": 378710470.0, "step": 9925 }, { "epoch": 1.2626892252894033, "ewc_loss": 0.02761642076075077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7616420993581414e-05, "grad_norm": 16.880334854125977, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8632918000221252, "num_tokens": 378758116.0, "step": 9926 }, { "epoch": 1.2628164355679938, "ewc_loss": 0.02755296416580677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7552963729249313e-05, "grad_norm": 16.82158851623535, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8640977144241333, "num_tokens": 378798166.0, "step": 9927 }, { "epoch": 1.2629436458465844, "ewc_loss": 0.027622893452644348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.762289295787923e-05, "grad_norm": 16.907264709472656, "learning_rate": 1e-06, "loss": 0.3675, "mean_token_accuracy": 0.8824496865272522, "num_tokens": 378839899.0, "step": 9928 }, { "epoch": 1.263070856125175, "ewc_loss": 0.02760178968310356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.760178904281929e-05, "grad_norm": 16.861072540283203, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.868110716342926, "num_tokens": 378886022.0, "step": 9929 }, { "epoch": 1.2631980664037654, "ewc_loss": 0.027578309178352356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.757830952759832e-05, "grad_norm": 16.874284744262695, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8577004671096802, "num_tokens": 378925097.0, "step": 9930 }, { "epoch": 1.263325276682356, "ewc_loss": 0.027559952810406685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7559952286537737e-05, "grad_norm": 16.843080520629883, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8791426420211792, "num_tokens": 378965242.0, "step": 9931 }, { "epoch": 1.2634524869609465, "ewc_loss": 0.027542149648070335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7542149837245233e-05, "grad_norm": 16.951335906982422, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8469284176826477, "num_tokens": 379005932.0, "step": 9932 }, { "epoch": 1.263579697239537, "ewc_loss": 0.027617085725069046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.761708492471371e-05, "grad_norm": 16.88654899597168, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8642624616622925, "num_tokens": 379045817.0, "step": 9933 }, { "epoch": 1.2637069075181275, "ewc_loss": 0.02753601409494877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7536014385987073e-05, "grad_norm": 16.898765563964844, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8606943488121033, "num_tokens": 379086481.0, "step": 9934 }, { "epoch": 1.2638341177967178, "ewc_loss": 0.027548691257834435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7548690923140384e-05, "grad_norm": 16.8951358795166, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8498963117599487, "num_tokens": 379129782.0, "step": 9935 }, { "epoch": 1.2639613280753084, "ewc_loss": 0.02751947194337845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7519472496351227e-05, "grad_norm": 16.810876846313477, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8649926781654358, "num_tokens": 379171525.0, "step": 9936 }, { "epoch": 1.264088538353899, "ewc_loss": 0.027523957192897797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7523956305230968e-05, "grad_norm": 16.958505630493164, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8606478571891785, "num_tokens": 379213887.0, "step": 9937 }, { "epoch": 1.2642157486324894, "ewc_loss": 0.027594538405537605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7594538551056758e-05, "grad_norm": 16.85388946533203, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8668230772018433, "num_tokens": 379252634.0, "step": 9938 }, { "epoch": 1.26434295891108, "ewc_loss": 0.027491379529237747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7491380024002865e-05, "grad_norm": 16.98350715637207, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8661918640136719, "num_tokens": 379287402.0, "step": 9939 }, { "epoch": 1.2644701691896705, "ewc_loss": 0.02752447873353958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7524478355189785e-05, "grad_norm": 16.808177947998047, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8728480339050293, "num_tokens": 379325592.0, "step": 9940 }, { "epoch": 1.264597379468261, "ewc_loss": 0.027467520907521248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7467520339996554e-05, "grad_norm": 16.940357208251953, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8546295762062073, "num_tokens": 379359010.0, "step": 9941 }, { "epoch": 1.2647245897468515, "ewc_loss": 0.027526067569851875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7526068151928484e-05, "grad_norm": 16.87529945373535, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8594802618026733, "num_tokens": 379399783.0, "step": 9942 }, { "epoch": 1.264851800025442, "ewc_loss": 0.027482064440846443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7482064979267307e-05, "grad_norm": 16.9079647064209, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8711482286453247, "num_tokens": 379442207.0, "step": 9943 }, { "epoch": 1.2649790103040326, "ewc_loss": 0.027526356279850006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7526355552254245e-05, "grad_norm": 16.926740646362305, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8664066791534424, "num_tokens": 379479561.0, "step": 9944 }, { "epoch": 1.2651062205826231, "ewc_loss": 0.027487369254231453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7487369152368046e-05, "grad_norm": 16.85282325744629, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.872378945350647, "num_tokens": 379516413.0, "step": 9945 }, { "epoch": 1.2652334308612136, "ewc_loss": 0.02748437598347664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.748437509580981e-05, "grad_norm": 16.935338973999023, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8578073382377625, "num_tokens": 379557993.0, "step": 9946 }, { "epoch": 1.2653606411398042, "ewc_loss": 0.027539262548089027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7539263101061806e-05, "grad_norm": 16.873401641845703, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8485873937606812, "num_tokens": 379594028.0, "step": 9947 }, { "epoch": 1.2654878514183947, "ewc_loss": 0.027459515258669853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.745951496763155e-05, "grad_norm": 16.875316619873047, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8605381846427917, "num_tokens": 379631235.0, "step": 9948 }, { "epoch": 1.2656150616969852, "ewc_loss": 0.027478110045194626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7478110496303998e-05, "grad_norm": 16.874691009521484, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.855596661567688, "num_tokens": 379668714.0, "step": 9949 }, { "epoch": 1.2657422719755758, "ewc_loss": 0.02753593772649765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7535937988432124e-05, "grad_norm": 16.8974666595459, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.871722936630249, "num_tokens": 379710233.0, "step": 9950 }, { "epoch": 1.265869482254166, "ewc_loss": 0.027474427595734596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7474427042761818e-05, "grad_norm": 16.86939811706543, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8633239269256592, "num_tokens": 379746208.0, "step": 9951 }, { "epoch": 1.2659966925327566, "ewc_loss": 0.027545956894755363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7545956982066855e-05, "grad_norm": 16.83678436279297, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8763860464096069, "num_tokens": 379796420.0, "step": 9952 }, { "epoch": 1.2661239028113471, "ewc_loss": 0.027534689754247665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7534690161701292e-05, "grad_norm": 16.863420486450195, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8680996298789978, "num_tokens": 379830027.0, "step": 9953 }, { "epoch": 1.2662511130899377, "ewc_loss": 0.027564823627471924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7564823540160432e-05, "grad_norm": 16.87868309020996, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8668957948684692, "num_tokens": 379871751.0, "step": 9954 }, { "epoch": 1.2663783233685282, "ewc_loss": 0.027519283816218376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.751928332145326e-05, "grad_norm": 16.877227783203125, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8595058917999268, "num_tokens": 379908255.0, "step": 9955 }, { "epoch": 1.2665055336471187, "ewc_loss": 0.02756592445075512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7565924028749578e-05, "grad_norm": 16.896121978759766, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8689084649085999, "num_tokens": 379944502.0, "step": 9956 }, { "epoch": 1.2666327439257092, "ewc_loss": 0.027589311823248863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.758931259450037e-05, "grad_norm": 16.892539978027344, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8639802932739258, "num_tokens": 379981929.0, "step": 9957 }, { "epoch": 1.2667599542042998, "ewc_loss": 0.027563629671931267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7563630283111706e-05, "grad_norm": 16.899290084838867, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8617613315582275, "num_tokens": 380018876.0, "step": 9958 }, { "epoch": 1.2668871644828903, "ewc_loss": 0.02754327282309532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.754327215370722e-05, "grad_norm": 16.85079002380371, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.861596941947937, "num_tokens": 380054153.0, "step": 9959 }, { "epoch": 1.2670143747614806, "ewc_loss": 0.027571309357881546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7571310056373477e-05, "grad_norm": 16.943254470825195, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8575772047042847, "num_tokens": 380086550.0, "step": 9960 }, { "epoch": 1.2671415850400711, "ewc_loss": 0.027576403692364693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7576403226703405e-05, "grad_norm": 16.95419692993164, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8603072166442871, "num_tokens": 380120308.0, "step": 9961 }, { "epoch": 1.2672687953186617, "ewc_loss": 0.02751602791249752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.751602733042091e-05, "grad_norm": 16.85798454284668, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8498243093490601, "num_tokens": 380160204.0, "step": 9962 }, { "epoch": 1.2673960055972522, "ewc_loss": 0.027560513466596603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7560514354263432e-05, "grad_norm": 16.96221351623535, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8699970245361328, "num_tokens": 380196687.0, "step": 9963 }, { "epoch": 1.2675232158758427, "ewc_loss": 0.02754896879196167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7548969228519127e-05, "grad_norm": 16.83796501159668, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.881293773651123, "num_tokens": 380237451.0, "step": 9964 }, { "epoch": 1.2676504261544332, "ewc_loss": 0.02750101499259472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7501015210873447e-05, "grad_norm": 16.880220413208008, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8622980117797852, "num_tokens": 380276631.0, "step": 9965 }, { "epoch": 1.2677776364330238, "ewc_loss": 0.02758876234292984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.75887632597005e-05, "grad_norm": 16.888622283935547, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8712506294250488, "num_tokens": 380319432.0, "step": 9966 }, { "epoch": 1.2679048467116143, "ewc_loss": 0.027545969933271408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.754596971499268e-05, "grad_norm": 16.885473251342773, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8746642470359802, "num_tokens": 380356666.0, "step": 9967 }, { "epoch": 1.2680320569902048, "ewc_loss": 0.02760183997452259, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.760183997452259e-05, "grad_norm": 16.91767692565918, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.8747413158416748, "num_tokens": 380390347.0, "step": 9968 }, { "epoch": 1.2681592672687954, "ewc_loss": 0.027549300342798233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7549300284590572e-05, "grad_norm": 16.829853057861328, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8695985078811646, "num_tokens": 380424083.0, "step": 9969 }, { "epoch": 1.2682864775473859, "ewc_loss": 0.027598967775702477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7598967790254392e-05, "grad_norm": 16.95803451538086, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8611729741096497, "num_tokens": 380471179.0, "step": 9970 }, { "epoch": 1.2684136878259764, "ewc_loss": 0.027588525786995888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7588524972088635e-05, "grad_norm": 16.82386589050293, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8676598072052002, "num_tokens": 380510291.0, "step": 9971 }, { "epoch": 1.268540898104567, "ewc_loss": 0.02756219543516636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.756219510047231e-05, "grad_norm": 16.888166427612305, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8712807893753052, "num_tokens": 380556297.0, "step": 9972 }, { "epoch": 1.2686681083831575, "ewc_loss": 0.027646450325846672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7646450689644553e-05, "grad_norm": 16.91138458251953, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8769407868385315, "num_tokens": 380592113.0, "step": 9973 }, { "epoch": 1.268795318661748, "ewc_loss": 0.02753128483891487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7531285013537854e-05, "grad_norm": 16.84122657775879, "learning_rate": 1e-06, "loss": 0.3731, "mean_token_accuracy": 0.8798011541366577, "num_tokens": 380623942.0, "step": 9974 }, { "epoch": 1.2689225289403383, "ewc_loss": 0.027532169595360756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7532169042387977e-05, "grad_norm": 16.846942901611328, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8607930541038513, "num_tokens": 380662300.0, "step": 9975 }, { "epoch": 1.2690497392189288, "ewc_loss": 0.027592815458774567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.75928159680916e-05, "grad_norm": 16.862167358398438, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8682870864868164, "num_tokens": 380701452.0, "step": 9976 }, { "epoch": 1.2691769494975194, "ewc_loss": 0.02762470953166485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7624710128293373e-05, "grad_norm": 16.92675018310547, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8773354291915894, "num_tokens": 380741248.0, "step": 9977 }, { "epoch": 1.2693041597761099, "ewc_loss": 0.027583133429288864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7583133487496525e-05, "grad_norm": 16.852041244506836, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8598264455795288, "num_tokens": 380781455.0, "step": 9978 }, { "epoch": 1.2694313700547004, "ewc_loss": 0.027570592239499092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7570591555559076e-05, "grad_norm": 16.86304473876953, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.85767662525177, "num_tokens": 380818139.0, "step": 9979 }, { "epoch": 1.269558580333291, "ewc_loss": 0.027622157707810402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7622158086160198e-05, "grad_norm": 16.871864318847656, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8712388277053833, "num_tokens": 380860530.0, "step": 9980 }, { "epoch": 1.2696857906118815, "ewc_loss": 0.027569783851504326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7569783924263902e-05, "grad_norm": 16.880775451660156, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.879167914390564, "num_tokens": 380900825.0, "step": 9981 }, { "epoch": 1.269813000890472, "ewc_loss": 0.027579396963119507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7579397283261642e-05, "grad_norm": 16.860876083374023, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8646613359451294, "num_tokens": 380939270.0, "step": 9982 }, { "epoch": 1.2699402111690625, "ewc_loss": 0.027650728821754456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7650728952721693e-05, "grad_norm": 16.94345474243164, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8444759845733643, "num_tokens": 380981708.0, "step": 9983 }, { "epoch": 1.2700674214476528, "ewc_loss": 0.027613142505288124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7613143174676225e-05, "grad_norm": 16.891237258911133, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8713788986206055, "num_tokens": 381015186.0, "step": 9984 }, { "epoch": 1.2701946317262434, "ewc_loss": 0.02758387289941311, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7583873816183768e-05, "grad_norm": 16.9898738861084, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8765106797218323, "num_tokens": 381055503.0, "step": 9985 }, { "epoch": 1.270321842004834, "ewc_loss": 0.027644308283925056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7644307920127176e-05, "grad_norm": 16.98141098022461, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.878926157951355, "num_tokens": 381091935.0, "step": 9986 }, { "epoch": 1.2704490522834244, "ewc_loss": 0.02748912014067173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.748912083916366e-05, "grad_norm": 16.87959861755371, "learning_rate": 1e-06, "loss": 0.3995, "mean_token_accuracy": 0.8716253042221069, "num_tokens": 381127951.0, "step": 9987 }, { "epoch": 1.270576262562015, "ewc_loss": 0.027576571330428123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.757657057372853e-05, "grad_norm": 16.950611114501953, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8608428239822388, "num_tokens": 381161918.0, "step": 9988 }, { "epoch": 1.2707034728406055, "ewc_loss": 0.02758467011153698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.758467053354252e-05, "grad_norm": 16.901649475097656, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.85483318567276, "num_tokens": 381198747.0, "step": 9989 }, { "epoch": 1.270830683119196, "ewc_loss": 0.027563614770770073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7563613912207074e-05, "grad_norm": 16.88262367248535, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8675028085708618, "num_tokens": 381242714.0, "step": 9990 }, { "epoch": 1.2709578933977865, "ewc_loss": 0.02757362835109234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7573627448873594e-05, "grad_norm": 16.868722915649414, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8785766363143921, "num_tokens": 381274667.0, "step": 9991 }, { "epoch": 1.271085103676377, "ewc_loss": 0.02761322259902954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.761322321020998e-05, "grad_norm": 16.967376708984375, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8761923313140869, "num_tokens": 381318110.0, "step": 9992 }, { "epoch": 1.2712123139549676, "ewc_loss": 0.02757531963288784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7575319109018892e-05, "grad_norm": 16.8447265625, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8680559396743774, "num_tokens": 381356694.0, "step": 9993 }, { "epoch": 1.2713395242335581, "ewc_loss": 0.027557577937841415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.755757850536611e-05, "grad_norm": 16.883853912353516, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8562626242637634, "num_tokens": 381395218.0, "step": 9994 }, { "epoch": 1.2714667345121486, "ewc_loss": 0.027602145448327065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7602145564742386e-05, "grad_norm": 16.897441864013672, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8804111480712891, "num_tokens": 381429195.0, "step": 9995 }, { "epoch": 1.2715939447907392, "ewc_loss": 0.027530867606401443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7530868464964442e-05, "grad_norm": 16.839412689208984, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8609278798103333, "num_tokens": 381468276.0, "step": 9996 }, { "epoch": 1.2717211550693297, "ewc_loss": 0.027627356350421906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7627356757875532e-05, "grad_norm": 16.963302612304688, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8747885227203369, "num_tokens": 381503400.0, "step": 9997 }, { "epoch": 1.2718483653479202, "ewc_loss": 0.02759249322116375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7592494006967172e-05, "grad_norm": 16.82851219177246, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8722313642501831, "num_tokens": 381542126.0, "step": 9998 }, { "epoch": 1.2719755756265108, "ewc_loss": 0.027562526986002922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7562526156543754e-05, "grad_norm": 16.927459716796875, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8761540651321411, "num_tokens": 381579169.0, "step": 9999 }, { "epoch": 1.272102785905101, "ewc_loss": 0.02764752134680748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.764752207440324e-05, "grad_norm": 16.868125915527344, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8530840873718262, "num_tokens": 381615501.0, "step": 10000 }, { "epoch": 1.2722299961836916, "ewc_loss": 0.02752590924501419, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7525909899850376e-05, "grad_norm": 16.955236434936523, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8622556924819946, "num_tokens": 381649737.0, "step": 10001 }, { "epoch": 1.2723572064622821, "ewc_loss": 0.027638552710413933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7638552637654357e-05, "grad_norm": 16.857074737548828, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8706148266792297, "num_tokens": 381693776.0, "step": 10002 }, { "epoch": 1.2724844167408726, "ewc_loss": 0.027502937242388725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7502937882672995e-05, "grad_norm": 16.892919540405273, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8771226406097412, "num_tokens": 381733324.0, "step": 10003 }, { "epoch": 1.2726116270194632, "ewc_loss": 0.027614813297986984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.761481300694868e-05, "grad_norm": 16.850915908813477, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.863067626953125, "num_tokens": 381765893.0, "step": 10004 }, { "epoch": 1.2727388372980537, "ewc_loss": 0.027590062469244003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.759006201813463e-05, "grad_norm": 16.8775577545166, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8655687570571899, "num_tokens": 381802782.0, "step": 10005 }, { "epoch": 1.2728660475766442, "ewc_loss": 0.027618639171123505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7618638341664337e-05, "grad_norm": 16.863481521606445, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8701207041740417, "num_tokens": 381843448.0, "step": 10006 }, { "epoch": 1.2729932578552348, "ewc_loss": 0.02765556424856186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7655563826556318e-05, "grad_norm": 17.00237274169922, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8564159274101257, "num_tokens": 381878850.0, "step": 10007 }, { "epoch": 1.2731204681338253, "ewc_loss": 0.027663011103868484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7663010769174434e-05, "grad_norm": 16.89377784729004, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8457168340682983, "num_tokens": 381915508.0, "step": 10008 }, { "epoch": 1.2732476784124156, "ewc_loss": 0.02762554958462715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.762555050139781e-05, "grad_norm": 16.8781681060791, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.857507050037384, "num_tokens": 381952699.0, "step": 10009 }, { "epoch": 1.2733748886910061, "ewc_loss": 0.027716105803847313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7716105250874534e-05, "grad_norm": 16.938968658447266, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8605284690856934, "num_tokens": 381989947.0, "step": 10010 }, { "epoch": 1.2735020989695967, "ewc_loss": 0.027632519602775574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7632519049802795e-05, "grad_norm": 16.91728401184082, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8664078712463379, "num_tokens": 382027701.0, "step": 10011 }, { "epoch": 1.2736293092481872, "ewc_loss": 0.027670005336403847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7670004783431068e-05, "grad_norm": 16.86042022705078, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.864464521408081, "num_tokens": 382066415.0, "step": 10012 }, { "epoch": 1.2737565195267777, "ewc_loss": 0.027651876211166382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.765187673503533e-05, "grad_norm": 16.901748657226562, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8617545366287231, "num_tokens": 382108402.0, "step": 10013 }, { "epoch": 1.2738837298053682, "ewc_loss": 0.027692707255482674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7692707590176724e-05, "grad_norm": 16.883445739746094, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8671740293502808, "num_tokens": 382146865.0, "step": 10014 }, { "epoch": 1.2740109400839588, "ewc_loss": 0.027688879519701004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.768887861748226e-05, "grad_norm": 16.961313247680664, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8641735911369324, "num_tokens": 382186746.0, "step": 10015 }, { "epoch": 1.2741381503625493, "ewc_loss": 0.027697166427969933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7697165933204815e-05, "grad_norm": 16.933792114257812, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8798213601112366, "num_tokens": 382220693.0, "step": 10016 }, { "epoch": 1.2742653606411398, "ewc_loss": 0.027622833847999573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7622832931228913e-05, "grad_norm": 16.919340133666992, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8611901998519897, "num_tokens": 382259549.0, "step": 10017 }, { "epoch": 1.2743925709197303, "ewc_loss": 0.02770681492984295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7706815671990626e-05, "grad_norm": 16.897018432617188, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8765538930892944, "num_tokens": 382299164.0, "step": 10018 }, { "epoch": 1.2745197811983209, "ewc_loss": 0.02770712599158287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7707126719178632e-05, "grad_norm": 16.899152755737305, "learning_rate": 1e-06, "loss": 0.3711, "mean_token_accuracy": 0.8801664113998413, "num_tokens": 382333596.0, "step": 10019 }, { "epoch": 1.2746469914769114, "ewc_loss": 0.027675725519657135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.767572550510522e-05, "grad_norm": 16.88155746459961, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8682972192764282, "num_tokens": 382372321.0, "step": 10020 }, { "epoch": 1.274774201755502, "ewc_loss": 0.027670564129948616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.767056503216736e-05, "grad_norm": 16.86440658569336, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8697314262390137, "num_tokens": 382411125.0, "step": 10021 }, { "epoch": 1.2749014120340925, "ewc_loss": 0.02765899896621704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7658998078550212e-05, "grad_norm": 16.830997467041016, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8573899865150452, "num_tokens": 382450935.0, "step": 10022 }, { "epoch": 1.275028622312683, "ewc_loss": 0.027724238112568855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7724237952497788e-05, "grad_norm": 16.926870346069336, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.865009069442749, "num_tokens": 382488232.0, "step": 10023 }, { "epoch": 1.2751558325912733, "ewc_loss": 0.02771495282649994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7714952011592686e-05, "grad_norm": 16.857742309570312, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8626912236213684, "num_tokens": 382522280.0, "step": 10024 }, { "epoch": 1.2752830428698638, "ewc_loss": 0.02764626406133175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7646263333735988e-05, "grad_norm": 16.8366641998291, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8771768808364868, "num_tokens": 382562788.0, "step": 10025 }, { "epoch": 1.2754102531484544, "ewc_loss": 0.027730457484722137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.773045707726851e-05, "grad_norm": 16.988452911376953, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8601512312889099, "num_tokens": 382598534.0, "step": 10026 }, { "epoch": 1.2755374634270449, "ewc_loss": 0.027685612440109253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.768561171251349e-05, "grad_norm": 16.810928344726562, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8612972497940063, "num_tokens": 382641803.0, "step": 10027 }, { "epoch": 1.2756646737056354, "ewc_loss": 0.027595695108175278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7595695428317413e-05, "grad_norm": 16.886791229248047, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8776722550392151, "num_tokens": 382677871.0, "step": 10028 }, { "epoch": 1.275791883984226, "ewc_loss": 0.02772090770304203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7720907382899895e-05, "grad_norm": 16.89830780029297, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8754907846450806, "num_tokens": 382714989.0, "step": 10029 }, { "epoch": 1.2759190942628165, "ewc_loss": 0.02765456587076187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.765456520137377e-05, "grad_norm": 16.883865356445312, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8700360655784607, "num_tokens": 382751919.0, "step": 10030 }, { "epoch": 1.276046304541407, "ewc_loss": 0.027667038142681122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7667038011713885e-05, "grad_norm": 16.919431686401367, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.8783898949623108, "num_tokens": 382793550.0, "step": 10031 }, { "epoch": 1.2761735148199975, "ewc_loss": 0.027617761865258217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7617761588771828e-05, "grad_norm": 16.86870765686035, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8741154670715332, "num_tokens": 382833142.0, "step": 10032 }, { "epoch": 1.2763007250985878, "ewc_loss": 0.027609368786215782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7609368771663867e-05, "grad_norm": 16.861234664916992, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8741254806518555, "num_tokens": 382869934.0, "step": 10033 }, { "epoch": 1.2764279353771784, "ewc_loss": 0.027611231431365013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7611231416813098e-05, "grad_norm": 16.860214233398438, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8539042472839355, "num_tokens": 382909533.0, "step": 10034 }, { "epoch": 1.2765551456557689, "ewc_loss": 0.02763659693300724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7636597224045545e-05, "grad_norm": 16.903383255004883, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8727781772613525, "num_tokens": 382947462.0, "step": 10035 }, { "epoch": 1.2766823559343594, "ewc_loss": 0.027646638453006744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7646638045553118e-05, "grad_norm": 16.908639907836914, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8683964014053345, "num_tokens": 382989973.0, "step": 10036 }, { "epoch": 1.27680956621295, "ewc_loss": 0.027593623846769333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7593623599386774e-05, "grad_norm": 16.929014205932617, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8651961088180542, "num_tokens": 383033052.0, "step": 10037 }, { "epoch": 1.2769367764915405, "ewc_loss": 0.027614085003733635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.761408541118726e-05, "grad_norm": 16.92443084716797, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8657283782958984, "num_tokens": 383075981.0, "step": 10038 }, { "epoch": 1.277063986770131, "ewc_loss": 0.02755037322640419, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7550373488338664e-05, "grad_norm": 16.925443649291992, "learning_rate": 1e-06, "loss": 0.3626, "mean_token_accuracy": 0.882901668548584, "num_tokens": 383111984.0, "step": 10039 }, { "epoch": 1.2771911970487215, "ewc_loss": 0.027537301182746887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.753730041149538e-05, "grad_norm": 16.89170265197754, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8683521151542664, "num_tokens": 383155323.0, "step": 10040 }, { "epoch": 1.277318407327312, "ewc_loss": 0.027585070580244064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.75850707112113e-05, "grad_norm": 16.980243682861328, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.865756094455719, "num_tokens": 383190069.0, "step": 10041 }, { "epoch": 1.2774456176059026, "ewc_loss": 0.02758793905377388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7587939257500693e-05, "grad_norm": 17.005390167236328, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8594497442245483, "num_tokens": 383224999.0, "step": 10042 }, { "epoch": 1.2775728278844931, "ewc_loss": 0.02754693478345871, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7546935598365963e-05, "grad_norm": 16.90886116027832, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.860141396522522, "num_tokens": 383267764.0, "step": 10043 }, { "epoch": 1.2777000381630836, "ewc_loss": 0.027519280090928078, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.751927968347445e-05, "grad_norm": 17.004589080810547, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8759599924087524, "num_tokens": 383306148.0, "step": 10044 }, { "epoch": 1.2778272484416742, "ewc_loss": 0.027550630271434784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7550629965844564e-05, "grad_norm": 16.954730987548828, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8713474869728088, "num_tokens": 383343408.0, "step": 10045 }, { "epoch": 1.2779544587202647, "ewc_loss": 0.027487486600875854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7487487386679277e-05, "grad_norm": 16.960844039916992, "learning_rate": 1e-06, "loss": 0.3355, "mean_token_accuracy": 0.8905206322669983, "num_tokens": 383383626.0, "step": 10046 }, { "epoch": 1.2780816689988552, "ewc_loss": 0.027544669806957245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7544669137569144e-05, "grad_norm": 17.00794219970703, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.851272463798523, "num_tokens": 383423218.0, "step": 10047 }, { "epoch": 1.2782088792774458, "ewc_loss": 0.027473602443933487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.747360304056201e-05, "grad_norm": 16.894893646240234, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8755170702934265, "num_tokens": 383461126.0, "step": 10048 }, { "epoch": 1.278336089556036, "ewc_loss": 0.02753341943025589, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7533418688108213e-05, "grad_norm": 16.924072265625, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8590314984321594, "num_tokens": 383501808.0, "step": 10049 }, { "epoch": 1.2784632998346266, "ewc_loss": 0.027528977021574974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7528976715984754e-05, "grad_norm": 16.925676345825195, "learning_rate": 1e-06, "loss": 0.3547, "mean_token_accuracy": 0.8875762224197388, "num_tokens": 383542395.0, "step": 10050 }, { "epoch": 1.2785905101132171, "ewc_loss": 0.027528150007128716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7528150894795544e-05, "grad_norm": 16.91796875, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8640071153640747, "num_tokens": 383579968.0, "step": 10051 }, { "epoch": 1.2787177203918076, "ewc_loss": 0.027565591037273407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.756559115368873e-05, "grad_norm": 16.98831558227539, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8667599558830261, "num_tokens": 383617861.0, "step": 10052 }, { "epoch": 1.2788449306703982, "ewc_loss": 0.027545826509594917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.75458260148298e-05, "grad_norm": 16.855314254760742, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8631777167320251, "num_tokens": 383652534.0, "step": 10053 }, { "epoch": 1.2789721409489887, "ewc_loss": 0.02755686827003956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7556869099498726e-05, "grad_norm": 16.969600677490234, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.862251877784729, "num_tokens": 383693120.0, "step": 10054 }, { "epoch": 1.2790993512275792, "ewc_loss": 0.027626611292362213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7626610972220078e-05, "grad_norm": 16.952390670776367, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8580794930458069, "num_tokens": 383734958.0, "step": 10055 }, { "epoch": 1.2792265615061698, "ewc_loss": 0.027549983933568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7549984224606305e-05, "grad_norm": 16.949708938598633, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8601418733596802, "num_tokens": 383768873.0, "step": 10056 }, { "epoch": 1.2793537717847603, "ewc_loss": 0.027591723948717117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7591724574449472e-05, "grad_norm": 16.972496032714844, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8689095973968506, "num_tokens": 383804063.0, "step": 10057 }, { "epoch": 1.2794809820633506, "ewc_loss": 0.027579553425312042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7579553716350347e-05, "grad_norm": 16.939002990722656, "learning_rate": 1e-06, "loss": 0.3622, "mean_token_accuracy": 0.8802378177642822, "num_tokens": 383843098.0, "step": 10058 }, { "epoch": 1.2796081923419411, "ewc_loss": 0.027589207515120506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7589207093114965e-05, "grad_norm": 16.92978858947754, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8753761053085327, "num_tokens": 383879443.0, "step": 10059 }, { "epoch": 1.2797354026205316, "ewc_loss": 0.02755597047507763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7555970518733375e-05, "grad_norm": 16.935588836669922, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8668156266212463, "num_tokens": 383918071.0, "step": 10060 }, { "epoch": 1.2798626128991222, "ewc_loss": 0.02758745849132538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7587459044298157e-05, "grad_norm": 16.910717010498047, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8608167171478271, "num_tokens": 383953052.0, "step": 10061 }, { "epoch": 1.2799898231777127, "ewc_loss": 0.027621857821941376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7621857952908613e-05, "grad_norm": 16.957782745361328, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8653329014778137, "num_tokens": 383992835.0, "step": 10062 }, { "epoch": 1.2801170334563032, "ewc_loss": 0.027646902948617935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7646903618006036e-05, "grad_norm": 16.854597091674805, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8545207381248474, "num_tokens": 384031234.0, "step": 10063 }, { "epoch": 1.2802442437348938, "ewc_loss": 0.027631867676973343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7631867851596326e-05, "grad_norm": 17.007234573364258, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8585343956947327, "num_tokens": 384063611.0, "step": 10064 }, { "epoch": 1.2803714540134843, "ewc_loss": 0.027672085911035538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7672085707308725e-05, "grad_norm": 16.88801383972168, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8754261136054993, "num_tokens": 384106757.0, "step": 10065 }, { "epoch": 1.2804986642920748, "ewc_loss": 0.02765858732163906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.765858698694501e-05, "grad_norm": 17.012117385864258, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.871134877204895, "num_tokens": 384143094.0, "step": 10066 }, { "epoch": 1.2806258745706653, "ewc_loss": 0.02770296484231949, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.770296487142332e-05, "grad_norm": 16.87659454345703, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8674567937850952, "num_tokens": 384181498.0, "step": 10067 }, { "epoch": 1.2807530848492559, "ewc_loss": 0.02759625017642975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7596250220085494e-05, "grad_norm": 16.983060836791992, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8713592290878296, "num_tokens": 384209784.0, "step": 10068 }, { "epoch": 1.2808802951278464, "ewc_loss": 0.027752680703997612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7752681489801034e-05, "grad_norm": 16.93924903869629, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8721137046813965, "num_tokens": 384248000.0, "step": 10069 }, { "epoch": 1.281007505406437, "ewc_loss": 0.02761928364634514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7619284082902595e-05, "grad_norm": 16.93495750427246, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8575928807258606, "num_tokens": 384290897.0, "step": 10070 }, { "epoch": 1.2811347156850275, "ewc_loss": 0.027725568041205406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.772556763375178e-05, "grad_norm": 16.967554092407227, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8608027696609497, "num_tokens": 384329464.0, "step": 10071 }, { "epoch": 1.281261925963618, "ewc_loss": 0.027673624455928802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7673624572344124e-05, "grad_norm": 16.900728225708008, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8706716299057007, "num_tokens": 384368915.0, "step": 10072 }, { "epoch": 1.2813891362422083, "ewc_loss": 0.027668580412864685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.766858051472809e-05, "grad_norm": 16.961198806762695, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.8770984411239624, "num_tokens": 384407445.0, "step": 10073 }, { "epoch": 1.2815163465207988, "ewc_loss": 0.027711153030395508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7711153961718082e-05, "grad_norm": 16.990266799926758, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8630803227424622, "num_tokens": 384444384.0, "step": 10074 }, { "epoch": 1.2816435567993893, "ewc_loss": 0.02765807881951332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.765807948890142e-05, "grad_norm": 16.86654281616211, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8642411231994629, "num_tokens": 384479670.0, "step": 10075 }, { "epoch": 1.2817707670779799, "ewc_loss": 0.027651555836200714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7651556592900306e-05, "grad_norm": 16.940614700317383, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8553581833839417, "num_tokens": 384522436.0, "step": 10076 }, { "epoch": 1.2818979773565704, "ewc_loss": 0.02777453139424324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7774531190516427e-05, "grad_norm": 16.96329116821289, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8602890968322754, "num_tokens": 384559754.0, "step": 10077 }, { "epoch": 1.282025187635161, "ewc_loss": 0.027657337486743927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7657337341224775e-05, "grad_norm": 16.920246124267578, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8642265796661377, "num_tokens": 384595153.0, "step": 10078 }, { "epoch": 1.2821523979137515, "ewc_loss": 0.02770676091313362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.770676110230852e-05, "grad_norm": 16.952831268310547, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.86604905128479, "num_tokens": 384628345.0, "step": 10079 }, { "epoch": 1.282279608192342, "ewc_loss": 0.027671996504068375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.767199657682795e-05, "grad_norm": 16.868284225463867, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8623213768005371, "num_tokens": 384666725.0, "step": 10080 }, { "epoch": 1.2824068184709325, "ewc_loss": 0.02769680880010128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.769680941128172e-05, "grad_norm": 16.958080291748047, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8584803342819214, "num_tokens": 384706217.0, "step": 10081 }, { "epoch": 1.2825340287495228, "ewc_loss": 0.027733968570828438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7733967726817355e-05, "grad_norm": 16.896879196166992, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8631255626678467, "num_tokens": 384741835.0, "step": 10082 }, { "epoch": 1.2826612390281134, "ewc_loss": 0.027692198753356934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.769219827314373e-05, "grad_norm": 17.029775619506836, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8685661554336548, "num_tokens": 384777273.0, "step": 10083 }, { "epoch": 1.2827884493067039, "ewc_loss": 0.0277896448969841, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.778964517347049e-05, "grad_norm": 16.89523696899414, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8809489607810974, "num_tokens": 384814950.0, "step": 10084 }, { "epoch": 1.2829156595852944, "ewc_loss": 0.027693016454577446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7693016818375327e-05, "grad_norm": 16.929893493652344, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.858026385307312, "num_tokens": 384848088.0, "step": 10085 }, { "epoch": 1.283042869863885, "ewc_loss": 0.027817536145448685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7817535737995058e-05, "grad_norm": 16.861602783203125, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8583892583847046, "num_tokens": 384887361.0, "step": 10086 }, { "epoch": 1.2831700801424755, "ewc_loss": 0.02781939134001732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7819391107186675e-05, "grad_norm": 16.997575759887695, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8724771738052368, "num_tokens": 384930459.0, "step": 10087 }, { "epoch": 1.283297290421066, "ewc_loss": 0.027840865775942802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7840866096084937e-05, "grad_norm": 16.856706619262695, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8610023856163025, "num_tokens": 384969506.0, "step": 10088 }, { "epoch": 1.2834245006996565, "ewc_loss": 0.027750303968787193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.77503040706506e-05, "grad_norm": 16.972332000732422, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8724688291549683, "num_tokens": 385004368.0, "step": 10089 }, { "epoch": 1.283551710978247, "ewc_loss": 0.02790030837059021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7900308850803412e-05, "grad_norm": 16.869112014770508, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8694244027137756, "num_tokens": 385044553.0, "step": 10090 }, { "epoch": 1.2836789212568376, "ewc_loss": 0.027823496609926224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7823496566270478e-05, "grad_norm": 16.99594497680664, "learning_rate": 1e-06, "loss": 0.3635, "mean_token_accuracy": 0.8785540461540222, "num_tokens": 385076873.0, "step": 10091 }, { "epoch": 1.283806131535428, "ewc_loss": 0.027855345979332924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7855345251737162e-05, "grad_norm": 16.869768142700195, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8690616488456726, "num_tokens": 385113969.0, "step": 10092 }, { "epoch": 1.2839333418140186, "ewc_loss": 0.027830561622977257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.783056152111385e-05, "grad_norm": 16.956295013427734, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8767292499542236, "num_tokens": 385149460.0, "step": 10093 }, { "epoch": 1.2840605520926092, "ewc_loss": 0.027870049700140953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7870049962075427e-05, "grad_norm": 16.965497970581055, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8496364951133728, "num_tokens": 385187240.0, "step": 10094 }, { "epoch": 1.2841877623711997, "ewc_loss": 0.027821188792586327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.782118826871738e-05, "grad_norm": 16.959413528442383, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.858502209186554, "num_tokens": 385225532.0, "step": 10095 }, { "epoch": 1.2843149726497902, "ewc_loss": 0.027862532064318657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7862532078870572e-05, "grad_norm": 16.95279312133789, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8694548606872559, "num_tokens": 385267404.0, "step": 10096 }, { "epoch": 1.2844421829283807, "ewc_loss": 0.027755888178944588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7755888368119486e-05, "grad_norm": 16.895671844482422, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8557450175285339, "num_tokens": 385314008.0, "step": 10097 }, { "epoch": 1.284569393206971, "ewc_loss": 0.027825096622109413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.78250972769456e-05, "grad_norm": 17.033540725708008, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8551128506660461, "num_tokens": 385352204.0, "step": 10098 }, { "epoch": 1.2846966034855616, "ewc_loss": 0.02773592434823513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.773592495941557e-05, "grad_norm": 16.8498477935791, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8584990501403809, "num_tokens": 385393307.0, "step": 10099 }, { "epoch": 1.2848238137641521, "ewc_loss": 0.027685286477208138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7685286113410257e-05, "grad_norm": 16.976346969604492, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8678461313247681, "num_tokens": 385426124.0, "step": 10100 }, { "epoch": 1.2849510240427426, "ewc_loss": 0.027787433937191963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7787433282355778e-05, "grad_norm": 16.91505241394043, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8718387484550476, "num_tokens": 385467471.0, "step": 10101 }, { "epoch": 1.2850782343213332, "ewc_loss": 0.027717724442481995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.771772415144369e-05, "grad_norm": 16.989166259765625, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8579707145690918, "num_tokens": 385508780.0, "step": 10102 }, { "epoch": 1.2852054445999237, "ewc_loss": 0.027790779247879982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.77907784038689e-05, "grad_norm": 16.902381896972656, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.873254656791687, "num_tokens": 385552646.0, "step": 10103 }, { "epoch": 1.2853326548785142, "ewc_loss": 0.027637315914034843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7637315724859945e-05, "grad_norm": 16.971281051635742, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.855047345161438, "num_tokens": 385586676.0, "step": 10104 }, { "epoch": 1.2854598651571048, "ewc_loss": 0.027749180793762207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.774917993519921e-05, "grad_norm": 16.89883804321289, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8550127744674683, "num_tokens": 385623201.0, "step": 10105 }, { "epoch": 1.2855870754356953, "ewc_loss": 0.027730487287044525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.773048800008837e-05, "grad_norm": 17.00119972229004, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.877738356590271, "num_tokens": 385662716.0, "step": 10106 }, { "epoch": 1.2857142857142856, "ewc_loss": 0.02773149497807026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7731495720217936e-05, "grad_norm": 16.809900283813477, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8563969135284424, "num_tokens": 385706986.0, "step": 10107 }, { "epoch": 1.2858414959928761, "ewc_loss": 0.02768469974398613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7684700398822315e-05, "grad_norm": 16.995811462402344, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.866016685962677, "num_tokens": 385744240.0, "step": 10108 }, { "epoch": 1.2859687062714666, "ewc_loss": 0.027837678790092468, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7837679226649925e-05, "grad_norm": 16.95325469970703, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8516930341720581, "num_tokens": 385778583.0, "step": 10109 }, { "epoch": 1.2860959165500572, "ewc_loss": 0.027693642303347588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7693642550730146e-05, "grad_norm": 16.974668502807617, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8677629232406616, "num_tokens": 385819910.0, "step": 10110 }, { "epoch": 1.2862231268286477, "ewc_loss": 0.02776665799319744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.776665860437788e-05, "grad_norm": 16.88020896911621, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8643718957901001, "num_tokens": 385860068.0, "step": 10111 }, { "epoch": 1.2863503371072382, "ewc_loss": 0.027704358100891113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7704358217306435e-05, "grad_norm": 17.039949417114258, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8586509227752686, "num_tokens": 385890460.0, "step": 10112 }, { "epoch": 1.2864775473858288, "ewc_loss": 0.027827026322484016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.782702722470276e-05, "grad_norm": 16.965715408325195, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8638288378715515, "num_tokens": 385926174.0, "step": 10113 }, { "epoch": 1.2866047576644193, "ewc_loss": 0.02769264578819275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7692645744537003e-05, "grad_norm": 16.98272132873535, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8765151500701904, "num_tokens": 385962928.0, "step": 10114 }, { "epoch": 1.2867319679430098, "ewc_loss": 0.02775087021291256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7750869776355103e-05, "grad_norm": 16.868318557739258, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8747524619102478, "num_tokens": 386002509.0, "step": 10115 }, { "epoch": 1.2868591782216003, "ewc_loss": 0.02773246169090271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.773246160359122e-05, "grad_norm": 16.92563819885254, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8662099838256836, "num_tokens": 386043304.0, "step": 10116 }, { "epoch": 1.2869863885001909, "ewc_loss": 0.027794547379016876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7794547349913046e-05, "grad_norm": 16.955331802368164, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8735626935958862, "num_tokens": 386081607.0, "step": 10117 }, { "epoch": 1.2871135987787814, "ewc_loss": 0.027760963886976242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.776096334855538e-05, "grad_norm": 16.922239303588867, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.8853412866592407, "num_tokens": 386117762.0, "step": 10118 }, { "epoch": 1.287240809057372, "ewc_loss": 0.02773965708911419, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7739657525671646e-05, "grad_norm": 16.9172420501709, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8586817979812622, "num_tokens": 386155908.0, "step": 10119 }, { "epoch": 1.2873680193359625, "ewc_loss": 0.02775471843779087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7754718757933006e-05, "grad_norm": 17.188180923461914, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8641771078109741, "num_tokens": 386199385.0, "step": 10120 }, { "epoch": 1.287495229614553, "ewc_loss": 0.02778216265141964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7782161851064302e-05, "grad_norm": 16.857934951782227, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.861697793006897, "num_tokens": 386231892.0, "step": 10121 }, { "epoch": 1.2876224398931433, "ewc_loss": 0.027654573321342468, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7654572477331385e-05, "grad_norm": 16.931673049926758, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8506872057914734, "num_tokens": 386269657.0, "step": 10122 }, { "epoch": 1.2877496501717338, "ewc_loss": 0.027898574247956276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.789857353491243e-05, "grad_norm": 16.987218856811523, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8712334632873535, "num_tokens": 386306936.0, "step": 10123 }, { "epoch": 1.2878768604503243, "ewc_loss": 0.027800237759947777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7800237148767337e-05, "grad_norm": 16.906003952026367, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8663933277130127, "num_tokens": 386342260.0, "step": 10124 }, { "epoch": 1.2880040707289149, "ewc_loss": 0.027851860970258713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.785186006803997e-05, "grad_norm": 16.930097579956055, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8623270392417908, "num_tokens": 386381784.0, "step": 10125 }, { "epoch": 1.2881312810075054, "ewc_loss": 0.027852799743413925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7852800485561602e-05, "grad_norm": 16.98650360107422, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8652265071868896, "num_tokens": 386422350.0, "step": 10126 }, { "epoch": 1.288258491286096, "ewc_loss": 0.027872102335095406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7872101782122627e-05, "grad_norm": 16.88154411315918, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8622730374336243, "num_tokens": 386467951.0, "step": 10127 }, { "epoch": 1.2883857015646865, "ewc_loss": 0.027888989076018333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7888989279745147e-05, "grad_norm": 16.98674774169922, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.875446081161499, "num_tokens": 386511896.0, "step": 10128 }, { "epoch": 1.288512911843277, "ewc_loss": 0.027920709922909737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7920710635953583e-05, "grad_norm": 17.004192352294922, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8536792397499084, "num_tokens": 386550704.0, "step": 10129 }, { "epoch": 1.2886401221218675, "ewc_loss": 0.027824129909276962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7824129574582912e-05, "grad_norm": 17.009918212890625, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8498141765594482, "num_tokens": 386589295.0, "step": 10130 }, { "epoch": 1.2887673324004578, "ewc_loss": 0.027871856465935707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7871856218553148e-05, "grad_norm": 17.025272369384766, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8665486574172974, "num_tokens": 386623462.0, "step": 10131 }, { "epoch": 1.2888945426790483, "ewc_loss": 0.02777852490544319, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7778525691246614e-05, "grad_norm": 16.916746139526367, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8694900274276733, "num_tokens": 386663036.0, "step": 10132 }, { "epoch": 1.2890217529576389, "ewc_loss": 0.027846839278936386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7846839657286182e-05, "grad_norm": 16.984514236450195, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8660338521003723, "num_tokens": 386702530.0, "step": 10133 }, { "epoch": 1.2891489632362294, "ewc_loss": 0.027832627296447754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7832627893076278e-05, "grad_norm": 16.94373321533203, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8807067275047302, "num_tokens": 386739758.0, "step": 10134 }, { "epoch": 1.28927617351482, "ewc_loss": 0.027768339961767197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7768339350586757e-05, "grad_norm": 17.034265518188477, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8761942982673645, "num_tokens": 386778287.0, "step": 10135 }, { "epoch": 1.2894033837934105, "ewc_loss": 0.027768593281507492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7768594009103253e-05, "grad_norm": 16.961746215820312, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8770616054534912, "num_tokens": 386812348.0, "step": 10136 }, { "epoch": 1.289530594072001, "ewc_loss": 0.027732418850064278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7732417947845533e-05, "grad_norm": 17.041975021362305, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8742083311080933, "num_tokens": 386855540.0, "step": 10137 }, { "epoch": 1.2896578043505915, "ewc_loss": 0.027771906927227974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.777190638880711e-05, "grad_norm": 17.009220123291016, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8624213933944702, "num_tokens": 386893775.0, "step": 10138 }, { "epoch": 1.289785014629182, "ewc_loss": 0.027689732611179352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7689731723512523e-05, "grad_norm": 17.058025360107422, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8669491410255432, "num_tokens": 386929553.0, "step": 10139 }, { "epoch": 1.2899122249077726, "ewc_loss": 0.02771291323006153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.771291292447131e-05, "grad_norm": 17.0210018157959, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8612276315689087, "num_tokens": 386968033.0, "step": 10140 }, { "epoch": 1.290039435186363, "ewc_loss": 0.027658985927700996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7658985345624387e-05, "grad_norm": 17.043148040771484, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8652748465538025, "num_tokens": 387004967.0, "step": 10141 }, { "epoch": 1.2901666454649536, "ewc_loss": 0.027681894600391388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7681893698172644e-05, "grad_norm": 16.992965698242188, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8548297882080078, "num_tokens": 387038086.0, "step": 10142 }, { "epoch": 1.2902938557435442, "ewc_loss": 0.02766355499625206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7663554647006094e-05, "grad_norm": 17.030893325805664, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8416808247566223, "num_tokens": 387077112.0, "step": 10143 }, { "epoch": 1.2904210660221347, "ewc_loss": 0.027730632573366165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.773063170025125e-05, "grad_norm": 16.891462326049805, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.849824845790863, "num_tokens": 387118800.0, "step": 10144 }, { "epoch": 1.2905482763007252, "ewc_loss": 0.02767457440495491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7674574084812775e-05, "grad_norm": 17.03063201904297, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8608282804489136, "num_tokens": 387160321.0, "step": 10145 }, { "epoch": 1.2906754865793157, "ewc_loss": 0.027759989723563194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.775999018922448e-05, "grad_norm": 16.958005905151367, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8645641803741455, "num_tokens": 387199387.0, "step": 10146 }, { "epoch": 1.290802696857906, "ewc_loss": 0.027658449485898018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.765844874375034e-05, "grad_norm": 17.033523559570312, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.87232506275177, "num_tokens": 387238859.0, "step": 10147 }, { "epoch": 1.2909299071364966, "ewc_loss": 0.02769523113965988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7695230528479442e-05, "grad_norm": 16.9862003326416, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8718022704124451, "num_tokens": 387272149.0, "step": 10148 }, { "epoch": 1.291057117415087, "ewc_loss": 0.02762892283499241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7628922907751985e-05, "grad_norm": 16.92850685119629, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8652156591415405, "num_tokens": 387311617.0, "step": 10149 }, { "epoch": 1.2911843276936776, "ewc_loss": 0.027654871344566345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7654870791593567e-05, "grad_norm": 17.007251739501953, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8532187938690186, "num_tokens": 387344281.0, "step": 10150 }, { "epoch": 1.2913115379722682, "ewc_loss": 0.027693672105669975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7693671654560603e-05, "grad_norm": 16.985671997070312, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8556807041168213, "num_tokens": 387385959.0, "step": 10151 }, { "epoch": 1.2914387482508587, "ewc_loss": 0.02767331339418888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7673313525156118e-05, "grad_norm": 17.018007278442383, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8608214855194092, "num_tokens": 387426197.0, "step": 10152 }, { "epoch": 1.2915659585294492, "ewc_loss": 0.02769838273525238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7698382837115787e-05, "grad_norm": 16.953144073486328, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8722915649414062, "num_tokens": 387462250.0, "step": 10153 }, { "epoch": 1.2916931688080397, "ewc_loss": 0.027662934735417366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7662934371619485e-05, "grad_norm": 17.04111099243164, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8654796481132507, "num_tokens": 387499903.0, "step": 10154 }, { "epoch": 1.2918203790866303, "ewc_loss": 0.027767520397901535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.776752080535516e-05, "grad_norm": 17.01997184753418, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8724661469459534, "num_tokens": 387535361.0, "step": 10155 }, { "epoch": 1.2919475893652206, "ewc_loss": 0.027637772262096405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7637772291200235e-05, "grad_norm": 16.973621368408203, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8625986576080322, "num_tokens": 387574300.0, "step": 10156 }, { "epoch": 1.2920747996438111, "ewc_loss": 0.02776367962360382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7763679099734873e-05, "grad_norm": 17.00289535522461, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8671258687973022, "num_tokens": 387606860.0, "step": 10157 }, { "epoch": 1.2922020099224016, "ewc_loss": 0.0277316365391016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.773163578240201e-05, "grad_norm": 16.958988189697266, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8566909432411194, "num_tokens": 387646519.0, "step": 10158 }, { "epoch": 1.2923292202009922, "ewc_loss": 0.02771323174238205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7713231247616932e-05, "grad_norm": 16.977554321289062, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8771151900291443, "num_tokens": 387681100.0, "step": 10159 }, { "epoch": 1.2924564304795827, "ewc_loss": 0.02775297313928604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.77529725281056e-05, "grad_norm": 16.974119186401367, "learning_rate": 1e-06, "loss": 0.3547, "mean_token_accuracy": 0.8882287740707397, "num_tokens": 387715962.0, "step": 10160 }, { "epoch": 1.2925836407581732, "ewc_loss": 0.027711031958460808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7711032089428045e-05, "grad_norm": 16.964027404785156, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.864843487739563, "num_tokens": 387755856.0, "step": 10161 }, { "epoch": 1.2927108510367638, "ewc_loss": 0.02774554304778576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.774554377538152e-05, "grad_norm": 17.01133155822754, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8743276000022888, "num_tokens": 387793256.0, "step": 10162 }, { "epoch": 1.2928380613153543, "ewc_loss": 0.027774985879659653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7774985937867314e-05, "grad_norm": 16.913949966430664, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8618432283401489, "num_tokens": 387834696.0, "step": 10163 }, { "epoch": 1.2929652715939448, "ewc_loss": 0.027706243097782135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.770624269032851e-05, "grad_norm": 16.981626510620117, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8549776077270508, "num_tokens": 387874973.0, "step": 10164 }, { "epoch": 1.2930924818725353, "ewc_loss": 0.02781388722360134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7813886845251545e-05, "grad_norm": 16.98794937133789, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8565207123756409, "num_tokens": 387904887.0, "step": 10165 }, { "epoch": 1.2932196921511259, "ewc_loss": 0.02777116559445858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7771166060119867e-05, "grad_norm": 16.985755920410156, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8731096982955933, "num_tokens": 387944676.0, "step": 10166 }, { "epoch": 1.2933469024297164, "ewc_loss": 0.02777360938489437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.777360896288883e-05, "grad_norm": 17.02562713623047, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8674744367599487, "num_tokens": 387982234.0, "step": 10167 }, { "epoch": 1.293474112708307, "ewc_loss": 0.027743037790060043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7743037207983434e-05, "grad_norm": 16.900541305541992, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8687564730644226, "num_tokens": 388018001.0, "step": 10168 }, { "epoch": 1.2936013229868975, "ewc_loss": 0.0277429036796093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7742904421756975e-05, "grad_norm": 16.93177032470703, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.861622154712677, "num_tokens": 388057476.0, "step": 10169 }, { "epoch": 1.293728533265488, "ewc_loss": 0.027798820286989212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7798820156021975e-05, "grad_norm": 16.979860305786133, "learning_rate": 1e-06, "loss": 0.361, "mean_token_accuracy": 0.8842509984970093, "num_tokens": 388095351.0, "step": 10170 }, { "epoch": 1.2938557435440783, "ewc_loss": 0.027797194197773933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7797193979495205e-05, "grad_norm": 17.02457046508789, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8657503128051758, "num_tokens": 388135366.0, "step": 10171 }, { "epoch": 1.2939829538226688, "ewc_loss": 0.027780327945947647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7780328309745528e-05, "grad_norm": 16.920719146728516, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8556650876998901, "num_tokens": 388175003.0, "step": 10172 }, { "epoch": 1.2941101641012593, "ewc_loss": 0.027764003723859787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7764002879848704e-05, "grad_norm": 16.98687171936035, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8478009700775146, "num_tokens": 388207083.0, "step": 10173 }, { "epoch": 1.2942373743798499, "ewc_loss": 0.027824796736240387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7824797143694013e-05, "grad_norm": 16.982378005981445, "learning_rate": 1e-06, "loss": 0.3647, "mean_token_accuracy": 0.8822687864303589, "num_tokens": 388245038.0, "step": 10174 }, { "epoch": 1.2943645846584404, "ewc_loss": 0.02778170257806778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.778170346573461e-05, "grad_norm": 17.022571563720703, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.862446665763855, "num_tokens": 388285035.0, "step": 10175 }, { "epoch": 1.294491794937031, "ewc_loss": 0.02777032181620598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7770322049036622e-05, "grad_norm": 16.932987213134766, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8710945844650269, "num_tokens": 388323971.0, "step": 10176 }, { "epoch": 1.2946190052156215, "ewc_loss": 0.027732377871870995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7732377930078655e-05, "grad_norm": 16.955297470092773, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8612250089645386, "num_tokens": 388362256.0, "step": 10177 }, { "epoch": 1.294746215494212, "ewc_loss": 0.02780531346797943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7805313948192634e-05, "grad_norm": 16.980201721191406, "learning_rate": 1e-06, "loss": 0.3541, "mean_token_accuracy": 0.8866803646087646, "num_tokens": 388401120.0, "step": 10178 }, { "epoch": 1.2948734257728025, "ewc_loss": 0.027780184522271156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7780184609582648e-05, "grad_norm": 16.949787139892578, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8687206506729126, "num_tokens": 388446833.0, "step": 10179 }, { "epoch": 1.2950006360513928, "ewc_loss": 0.02777317725121975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.777317786240019e-05, "grad_norm": 17.004440307617188, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8786449432373047, "num_tokens": 388482266.0, "step": 10180 }, { "epoch": 1.2951278463299833, "ewc_loss": 0.02778097614645958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.778097586997319e-05, "grad_norm": 16.956174850463867, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8678479790687561, "num_tokens": 388518024.0, "step": 10181 }, { "epoch": 1.2952550566085739, "ewc_loss": 0.027746882289648056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.774688255158253e-05, "grad_norm": 16.984567642211914, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8692299723625183, "num_tokens": 388564246.0, "step": 10182 }, { "epoch": 1.2953822668871644, "ewc_loss": 0.027781346812844276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7781346943811513e-05, "grad_norm": 16.966516494750977, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8710856437683105, "num_tokens": 388603556.0, "step": 10183 }, { "epoch": 1.295509477165755, "ewc_loss": 0.027746057137846947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.774605673039332e-05, "grad_norm": 17.014862060546875, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8750184774398804, "num_tokens": 388643615.0, "step": 10184 }, { "epoch": 1.2956366874443455, "ewc_loss": 0.027779320254921913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7779320589615963e-05, "grad_norm": 17.013656616210938, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.869982123374939, "num_tokens": 388682917.0, "step": 10185 }, { "epoch": 1.295763897722936, "ewc_loss": 0.027693573385477066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7693573429132812e-05, "grad_norm": 16.979684829711914, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8702492713928223, "num_tokens": 388720269.0, "step": 10186 }, { "epoch": 1.2958911080015265, "ewc_loss": 0.0277163814753294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7716381737263873e-05, "grad_norm": 16.9833927154541, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.859544575214386, "num_tokens": 388759245.0, "step": 10187 }, { "epoch": 1.296018318280117, "ewc_loss": 0.027764346450567245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7764346668845974e-05, "grad_norm": 16.966737747192383, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8633074164390564, "num_tokens": 388795673.0, "step": 10188 }, { "epoch": 1.2961455285587076, "ewc_loss": 0.027708716690540314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.770871651591733e-05, "grad_norm": 16.986249923706055, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8708293437957764, "num_tokens": 388832926.0, "step": 10189 }, { "epoch": 1.296272738837298, "ewc_loss": 0.027747735381126404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7747735657612793e-05, "grad_norm": 16.962305068969727, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8702558279037476, "num_tokens": 388869767.0, "step": 10190 }, { "epoch": 1.2963999491158886, "ewc_loss": 0.02774285525083542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.774285530904308e-05, "grad_norm": 17.00231170654297, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8696951270103455, "num_tokens": 388913347.0, "step": 10191 }, { "epoch": 1.2965271593944792, "ewc_loss": 0.02770416997373104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.770417086139787e-05, "grad_norm": 16.90215301513672, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8788495659828186, "num_tokens": 388948211.0, "step": 10192 }, { "epoch": 1.2966543696730697, "ewc_loss": 0.027724917978048325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7724918254534714e-05, "grad_norm": 16.92970085144043, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8719922304153442, "num_tokens": 388989198.0, "step": 10193 }, { "epoch": 1.2967815799516602, "ewc_loss": 0.027717266231775284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7717265766113997e-05, "grad_norm": 16.90163230895996, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8584511280059814, "num_tokens": 389029076.0, "step": 10194 }, { "epoch": 1.2969087902302507, "ewc_loss": 0.02770812064409256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7708119887392968e-05, "grad_norm": 16.931455612182617, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8665602207183838, "num_tokens": 389066751.0, "step": 10195 }, { "epoch": 1.297036000508841, "ewc_loss": 0.027812447398900986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.781244802463334e-05, "grad_norm": 17.016563415527344, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8650884628295898, "num_tokens": 389104207.0, "step": 10196 }, { "epoch": 1.2971632107874316, "ewc_loss": 0.02776218205690384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7762182071455754e-05, "grad_norm": 16.93014144897461, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8706538677215576, "num_tokens": 389141941.0, "step": 10197 }, { "epoch": 1.297290421066022, "ewc_loss": 0.027755538001656532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.77555373031646e-05, "grad_norm": 17.01724624633789, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.876963198184967, "num_tokens": 389185326.0, "step": 10198 }, { "epoch": 1.2974176313446126, "ewc_loss": 0.027785029262304306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.778502857836429e-05, "grad_norm": 16.94731903076172, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.876600980758667, "num_tokens": 389221512.0, "step": 10199 }, { "epoch": 1.2975448416232032, "ewc_loss": 0.02772338315844536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.772338302747812e-05, "grad_norm": 17.026586532592773, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8753923773765564, "num_tokens": 389259388.0, "step": 10200 }, { "epoch": 1.2976720519017937, "ewc_loss": 0.027733255177736282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7733254682971165e-05, "grad_norm": 16.96581268310547, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8565275073051453, "num_tokens": 389298773.0, "step": 10201 }, { "epoch": 1.2977992621803842, "ewc_loss": 0.027720803394913673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7720803700503893e-05, "grad_norm": 16.99818229675293, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8683558702468872, "num_tokens": 389339680.0, "step": 10202 }, { "epoch": 1.2979264724589747, "ewc_loss": 0.027758093550801277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7758092983276583e-05, "grad_norm": 16.98763656616211, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8703840970993042, "num_tokens": 389380755.0, "step": 10203 }, { "epoch": 1.2980536827375653, "ewc_loss": 0.02767918072640896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7679181584971957e-05, "grad_norm": 16.993375778198242, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8726733922958374, "num_tokens": 389412031.0, "step": 10204 }, { "epoch": 1.2981808930161556, "ewc_loss": 0.02769629843533039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7696298275259323e-05, "grad_norm": 16.964998245239258, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8771857619285583, "num_tokens": 389448299.0, "step": 10205 }, { "epoch": 1.298308103294746, "ewc_loss": 0.02769393101334572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.769393177004531e-05, "grad_norm": 17.002399444580078, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8615577220916748, "num_tokens": 389483962.0, "step": 10206 }, { "epoch": 1.2984353135733366, "ewc_loss": 0.02769097499549389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7690975912264548e-05, "grad_norm": 16.96472930908203, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8599740266799927, "num_tokens": 389521989.0, "step": 10207 }, { "epoch": 1.2985625238519272, "ewc_loss": 0.027678102254867554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7678102924255654e-05, "grad_norm": 16.966428756713867, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8620713353157043, "num_tokens": 389558228.0, "step": 10208 }, { "epoch": 1.2986897341305177, "ewc_loss": 0.02771543711423874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7715437681763433e-05, "grad_norm": 16.97090721130371, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8674145340919495, "num_tokens": 389597164.0, "step": 10209 }, { "epoch": 1.2988169444091082, "ewc_loss": 0.027739187702536583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.773918822640553e-05, "grad_norm": 16.986217498779297, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8677203059196472, "num_tokens": 389636435.0, "step": 10210 }, { "epoch": 1.2989441546876987, "ewc_loss": 0.02767365239560604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7673651857185178e-05, "grad_norm": 16.9431209564209, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8602936267852783, "num_tokens": 389670498.0, "step": 10211 }, { "epoch": 1.2990713649662893, "ewc_loss": 0.027736129239201546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7736128686228767e-05, "grad_norm": 17.02952766418457, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8688927888870239, "num_tokens": 389707433.0, "step": 10212 }, { "epoch": 1.2991985752448798, "ewc_loss": 0.027748342603445053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7748343200073577e-05, "grad_norm": 16.98538589477539, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8679128885269165, "num_tokens": 389747300.0, "step": 10213 }, { "epoch": 1.2993257855234703, "ewc_loss": 0.027778228744864464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7778229195973836e-05, "grad_norm": 17.076335906982422, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8614394664764404, "num_tokens": 389788381.0, "step": 10214 }, { "epoch": 1.2994529958020609, "ewc_loss": 0.027805496007204056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7805495847132988e-05, "grad_norm": 16.999618530273438, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.870844304561615, "num_tokens": 389821792.0, "step": 10215 }, { "epoch": 1.2995802060806514, "ewc_loss": 0.027715250849723816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7715250325854868e-05, "grad_norm": 17.017629623413086, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8787108063697815, "num_tokens": 389859370.0, "step": 10216 }, { "epoch": 1.299707416359242, "ewc_loss": 0.027731461450457573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.773146115941927e-05, "grad_norm": 17.030881881713867, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8590068817138672, "num_tokens": 389897020.0, "step": 10217 }, { "epoch": 1.2998346266378324, "ewc_loss": 0.02776559814810753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7765598133555613e-05, "grad_norm": 17.0447940826416, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8404352068901062, "num_tokens": 389927107.0, "step": 10218 }, { "epoch": 1.299961836916423, "ewc_loss": 0.027785401791334152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7785401471192017e-05, "grad_norm": 17.035709381103516, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.864689290523529, "num_tokens": 389966323.0, "step": 10219 }, { "epoch": 1.3000890471950133, "ewc_loss": 0.02775839902460575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.775839857349638e-05, "grad_norm": 17.013704299926758, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8667575716972351, "num_tokens": 390001416.0, "step": 10220 }, { "epoch": 1.3002162574736038, "ewc_loss": 0.027813689783215523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7813690394395962e-05, "grad_norm": 17.09786605834961, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8704550266265869, "num_tokens": 390042326.0, "step": 10221 }, { "epoch": 1.3003434677521943, "ewc_loss": 0.027728363871574402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.772836342046503e-05, "grad_norm": 16.966350555419922, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8582024574279785, "num_tokens": 390084443.0, "step": 10222 }, { "epoch": 1.3004706780307849, "ewc_loss": 0.027775399386882782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.777539884846192e-05, "grad_norm": 17.00969696044922, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8608317375183105, "num_tokens": 390126309.0, "step": 10223 }, { "epoch": 1.3005978883093754, "ewc_loss": 0.027807042002677917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7807041988126002e-05, "grad_norm": 16.959911346435547, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8722404837608337, "num_tokens": 390161463.0, "step": 10224 }, { "epoch": 1.300725098587966, "ewc_loss": 0.027768200263381004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7768201107392088e-05, "grad_norm": 17.009902954101562, "learning_rate": 1e-06, "loss": 0.38, "mean_token_accuracy": 0.8775997161865234, "num_tokens": 390203221.0, "step": 10225 }, { "epoch": 1.3008523088665565, "ewc_loss": 0.027788490056991577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7788490115199238e-05, "grad_norm": 16.9681339263916, "learning_rate": 1e-06, "loss": 0.3605, "mean_token_accuracy": 0.8831243515014648, "num_tokens": 390243269.0, "step": 10226 }, { "epoch": 1.300979519145147, "ewc_loss": 0.027762053534388542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7762052923208103e-05, "grad_norm": 16.979637145996094, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8684710264205933, "num_tokens": 390282488.0, "step": 10227 }, { "epoch": 1.3011067294237375, "ewc_loss": 0.02771162986755371, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.771163053694181e-05, "grad_norm": 16.91939926147461, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8627305030822754, "num_tokens": 390320268.0, "step": 10228 }, { "epoch": 1.3012339397023278, "ewc_loss": 0.027761446312069893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7761447199736722e-05, "grad_norm": 17.037498474121094, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8580389022827148, "num_tokens": 390354733.0, "step": 10229 }, { "epoch": 1.3013611499809183, "ewc_loss": 0.02779420092701912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7794201741926372e-05, "grad_norm": 16.97887420654297, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.866546630859375, "num_tokens": 390388161.0, "step": 10230 }, { "epoch": 1.3014883602595089, "ewc_loss": 0.027751963585615158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7751962988986634e-05, "grad_norm": 17.0166015625, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8772188425064087, "num_tokens": 390424609.0, "step": 10231 }, { "epoch": 1.3016155705380994, "ewc_loss": 0.027752045542001724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7752044843509793e-05, "grad_norm": 16.97452163696289, "learning_rate": 1e-06, "loss": 0.3562, "mean_token_accuracy": 0.8838436007499695, "num_tokens": 390460809.0, "step": 10232 }, { "epoch": 1.30174278081669, "ewc_loss": 0.027773194015026093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.777319423330482e-05, "grad_norm": 16.963272094726562, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8566972613334656, "num_tokens": 390500198.0, "step": 10233 }, { "epoch": 1.3018699910952805, "ewc_loss": 0.027745747938752174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7745747502194718e-05, "grad_norm": 16.9814510345459, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8607441782951355, "num_tokens": 390545588.0, "step": 10234 }, { "epoch": 1.301997201373871, "ewc_loss": 0.027800463140010834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7800462703453377e-05, "grad_norm": 16.973167419433594, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.85752272605896, "num_tokens": 390581681.0, "step": 10235 }, { "epoch": 1.3021244116524615, "ewc_loss": 0.027759375050663948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7759375370806083e-05, "grad_norm": 17.018762588500977, "learning_rate": 1e-06, "loss": 0.3635, "mean_token_accuracy": 0.8851755857467651, "num_tokens": 390619122.0, "step": 10236 }, { "epoch": 1.302251621931052, "ewc_loss": 0.027741316705942154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.774131644400768e-05, "grad_norm": 16.920217514038086, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8552606105804443, "num_tokens": 390667974.0, "step": 10237 }, { "epoch": 1.3023788322096426, "ewc_loss": 0.027761930599808693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7761931050918065e-05, "grad_norm": 17.044635772705078, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8695082664489746, "num_tokens": 390705188.0, "step": 10238 }, { "epoch": 1.302506042488233, "ewc_loss": 0.027796685695648193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7796686481451616e-05, "grad_norm": 16.963367462158203, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8771145343780518, "num_tokens": 390746892.0, "step": 10239 }, { "epoch": 1.3026332527668236, "ewc_loss": 0.02771560288965702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7715603209799156e-05, "grad_norm": 16.984811782836914, "learning_rate": 1e-06, "loss": 0.366, "mean_token_accuracy": 0.8828226923942566, "num_tokens": 390787727.0, "step": 10240 }, { "epoch": 1.3027604630454142, "ewc_loss": 0.02775007300078869, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.775007305899635e-05, "grad_norm": 16.97391700744629, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.858814001083374, "num_tokens": 390823702.0, "step": 10241 }, { "epoch": 1.3028876733240047, "ewc_loss": 0.027732400223612785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7732399757951498e-05, "grad_norm": 17.02100944519043, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8667594790458679, "num_tokens": 390860256.0, "step": 10242 }, { "epoch": 1.3030148836025952, "ewc_loss": 0.027758672833442688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7758673240896314e-05, "grad_norm": 16.942752838134766, "learning_rate": 1e-06, "loss": 0.3735, "mean_token_accuracy": 0.881095290184021, "num_tokens": 390901676.0, "step": 10243 }, { "epoch": 1.3031420938811857, "ewc_loss": 0.027751265093684196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7751264497055672e-05, "grad_norm": 17.007062911987305, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8506776690483093, "num_tokens": 390936911.0, "step": 10244 }, { "epoch": 1.303269304159776, "ewc_loss": 0.027795307338237762, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7795307687483728e-05, "grad_norm": 17.0231990814209, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8556699752807617, "num_tokens": 390976185.0, "step": 10245 }, { "epoch": 1.3033965144383666, "ewc_loss": 0.02773706614971161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7737065465771593e-05, "grad_norm": 16.99371910095215, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8676177859306335, "num_tokens": 391014661.0, "step": 10246 }, { "epoch": 1.303523724716957, "ewc_loss": 0.027707073837518692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.770707396848593e-05, "grad_norm": 16.979694366455078, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8598807454109192, "num_tokens": 391056205.0, "step": 10247 }, { "epoch": 1.3036509349955476, "ewc_loss": 0.027695246040821075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7695246899384074e-05, "grad_norm": 16.982112884521484, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8692584037780762, "num_tokens": 391092187.0, "step": 10248 }, { "epoch": 1.3037781452741382, "ewc_loss": 0.027715157717466354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7715157557395287e-05, "grad_norm": 16.91231346130371, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8821123838424683, "num_tokens": 391130890.0, "step": 10249 }, { "epoch": 1.3039053555527287, "ewc_loss": 0.0277552492916584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.775524990283884e-05, "grad_norm": 16.967954635620117, "learning_rate": 1e-06, "loss": 0.3798, "mean_token_accuracy": 0.8768825531005859, "num_tokens": 391172783.0, "step": 10250 }, { "epoch": 1.3040325658313192, "ewc_loss": 0.027792885899543762, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.779288661258761e-05, "grad_norm": 16.946882247924805, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8510161638259888, "num_tokens": 391215031.0, "step": 10251 }, { "epoch": 1.3041597761099097, "ewc_loss": 0.027728373184800148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7728372515412048e-05, "grad_norm": 16.994646072387695, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8703372478485107, "num_tokens": 391251400.0, "step": 10252 }, { "epoch": 1.3042869863885003, "ewc_loss": 0.027759965509176254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7759964723372832e-05, "grad_norm": 16.923099517822266, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8590906858444214, "num_tokens": 391285895.0, "step": 10253 }, { "epoch": 1.3044141966670906, "ewc_loss": 0.02775649167597294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7756492272601463e-05, "grad_norm": 17.01667594909668, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8687062859535217, "num_tokens": 391328361.0, "step": 10254 }, { "epoch": 1.304541406945681, "ewc_loss": 0.02778596803545952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.778596717689652e-05, "grad_norm": 16.938852310180664, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8659693002700806, "num_tokens": 391365019.0, "step": 10255 }, { "epoch": 1.3046686172242716, "ewc_loss": 0.027743518352508545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.774351742118597e-05, "grad_norm": 16.999439239501953, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8513316512107849, "num_tokens": 391400407.0, "step": 10256 }, { "epoch": 1.3047958275028622, "ewc_loss": 0.02785763330757618, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7857633540406823e-05, "grad_norm": 16.993154525756836, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.849037766456604, "num_tokens": 391440660.0, "step": 10257 }, { "epoch": 1.3049230377814527, "ewc_loss": 0.02781977877020836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.781977855192963e-05, "grad_norm": 17.01207160949707, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8700425624847412, "num_tokens": 391481096.0, "step": 10258 }, { "epoch": 1.3050502480600432, "ewc_loss": 0.0278118085116148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7811807740363292e-05, "grad_norm": 16.992692947387695, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8671342134475708, "num_tokens": 391516356.0, "step": 10259 }, { "epoch": 1.3051774583386337, "ewc_loss": 0.027793031185865402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.779303031275049e-05, "grad_norm": 16.981176376342773, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8561442494392395, "num_tokens": 391554691.0, "step": 10260 }, { "epoch": 1.3053046686172243, "ewc_loss": 0.02780218981206417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7802188924397342e-05, "grad_norm": 16.9641170501709, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8535367250442505, "num_tokens": 391592473.0, "step": 10261 }, { "epoch": 1.3054318788958148, "ewc_loss": 0.027830541133880615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.783054151223041e-05, "grad_norm": 16.85617446899414, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8725346326828003, "num_tokens": 391632091.0, "step": 10262 }, { "epoch": 1.3055590891744053, "ewc_loss": 0.02781115286052227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7811152904178016e-05, "grad_norm": 16.979772567749023, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.86040860414505, "num_tokens": 391676121.0, "step": 10263 }, { "epoch": 1.3056862994529959, "ewc_loss": 0.027940118685364723, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7940119252889417e-05, "grad_norm": 16.916601181030273, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.864188551902771, "num_tokens": 391709664.0, "step": 10264 }, { "epoch": 1.3058135097315864, "ewc_loss": 0.02782324329018593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7823243726743385e-05, "grad_norm": 17.010095596313477, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8561272621154785, "num_tokens": 391755828.0, "step": 10265 }, { "epoch": 1.305940720010177, "ewc_loss": 0.027898484840989113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7898484404431656e-05, "grad_norm": 16.97855567932129, "learning_rate": 1e-06, "loss": 0.3653, "mean_token_accuracy": 0.8833975195884705, "num_tokens": 391800232.0, "step": 10266 }, { "epoch": 1.3060679302887674, "ewc_loss": 0.027871811762452126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7871812562807463e-05, "grad_norm": 16.991989135742188, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8678504228591919, "num_tokens": 391837612.0, "step": 10267 }, { "epoch": 1.306195140567358, "ewc_loss": 0.027850527316331863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.785052674880717e-05, "grad_norm": 16.990327835083008, "learning_rate": 1e-06, "loss": 0.3757, "mean_token_accuracy": 0.8771499991416931, "num_tokens": 391871193.0, "step": 10268 }, { "epoch": 1.3063223508459483, "ewc_loss": 0.02785808965563774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7858090106747113e-05, "grad_norm": 17.03023910522461, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8588414192199707, "num_tokens": 391908203.0, "step": 10269 }, { "epoch": 1.3064495611245388, "ewc_loss": 0.02788238599896431, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7882386348210275e-05, "grad_norm": 17.03330421447754, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8729722499847412, "num_tokens": 391938744.0, "step": 10270 }, { "epoch": 1.3065767714031293, "ewc_loss": 0.027829928323626518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7829928512801416e-05, "grad_norm": 16.934465408325195, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8689520359039307, "num_tokens": 391976020.0, "step": 10271 }, { "epoch": 1.3067039816817199, "ewc_loss": 0.027852041646838188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7852041966980323e-05, "grad_norm": 16.976064682006836, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8580765128135681, "num_tokens": 392018006.0, "step": 10272 }, { "epoch": 1.3068311919603104, "ewc_loss": 0.02787531167268753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7875312298419885e-05, "grad_norm": 16.96855354309082, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8651096820831299, "num_tokens": 392055358.0, "step": 10273 }, { "epoch": 1.306958402238901, "ewc_loss": 0.02781873196363449, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.781873263302259e-05, "grad_norm": 16.993677139282227, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8657588362693787, "num_tokens": 392097727.0, "step": 10274 }, { "epoch": 1.3070856125174914, "ewc_loss": 0.027835708111524582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.783570744213648e-05, "grad_norm": 16.986108779907227, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8720061779022217, "num_tokens": 392144641.0, "step": 10275 }, { "epoch": 1.307212822796082, "ewc_loss": 0.027793770655989647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.779377064143773e-05, "grad_norm": 17.0798282623291, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8741292953491211, "num_tokens": 392178810.0, "step": 10276 }, { "epoch": 1.3073400330746725, "ewc_loss": 0.027837416157126427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.783741547318641e-05, "grad_norm": 17.059043884277344, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8632539510726929, "num_tokens": 392213571.0, "step": 10277 }, { "epoch": 1.3074672433532628, "ewc_loss": 0.02780979312956333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7809792300104164e-05, "grad_norm": 17.021812438964844, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8628195524215698, "num_tokens": 392254628.0, "step": 10278 }, { "epoch": 1.3075944536318533, "ewc_loss": 0.027790449559688568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7790449166786857e-05, "grad_norm": 17.024927139282227, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8660737872123718, "num_tokens": 392297243.0, "step": 10279 }, { "epoch": 1.3077216639104439, "ewc_loss": 0.027821633964776993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7821633921121247e-05, "grad_norm": 16.99870491027832, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8695851564407349, "num_tokens": 392329116.0, "step": 10280 }, { "epoch": 1.3078488741890344, "ewc_loss": 0.027803048491477966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.780304930638522e-05, "grad_norm": 17.015094757080078, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.864708423614502, "num_tokens": 392370542.0, "step": 10281 }, { "epoch": 1.307976084467625, "ewc_loss": 0.027860337868332863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7860338377649896e-05, "grad_norm": 16.976892471313477, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8722429275512695, "num_tokens": 392411155.0, "step": 10282 }, { "epoch": 1.3081032947462155, "ewc_loss": 0.0277975182980299, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7797517759609036e-05, "grad_norm": 16.970834732055664, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8673509955406189, "num_tokens": 392450440.0, "step": 10283 }, { "epoch": 1.308230505024806, "ewc_loss": 0.027794281020760536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7794281777460128e-05, "grad_norm": 16.960601806640625, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8774202466011047, "num_tokens": 392488545.0, "step": 10284 }, { "epoch": 1.3083577153033965, "ewc_loss": 0.027833709493279457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7833710191771388e-05, "grad_norm": 17.013931274414062, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8662292957305908, "num_tokens": 392527138.0, "step": 10285 }, { "epoch": 1.308484925581987, "ewc_loss": 0.027861889451742172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.786188997561112e-05, "grad_norm": 17.042020797729492, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8674802780151367, "num_tokens": 392568181.0, "step": 10286 }, { "epoch": 1.3086121358605776, "ewc_loss": 0.027811480686068535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7811480322270654e-05, "grad_norm": 17.02741241455078, "learning_rate": 1e-06, "loss": 0.3675, "mean_token_accuracy": 0.8818984031677246, "num_tokens": 392610900.0, "step": 10287 }, { "epoch": 1.308739346139168, "ewc_loss": 0.02782374806702137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7823747586808167e-05, "grad_norm": 17.083290100097656, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.859315812587738, "num_tokens": 392646624.0, "step": 10288 }, { "epoch": 1.3088665564177586, "ewc_loss": 0.027812926098704338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7812926418846473e-05, "grad_norm": 16.977521896362305, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8670933246612549, "num_tokens": 392683103.0, "step": 10289 }, { "epoch": 1.3089937666963491, "ewc_loss": 0.02779475226998329, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7794752895715646e-05, "grad_norm": 17.03662109375, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8690067529678345, "num_tokens": 392723169.0, "step": 10290 }, { "epoch": 1.3091209769749397, "ewc_loss": 0.027841337025165558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7841337214340456e-05, "grad_norm": 17.01283836364746, "learning_rate": 1e-06, "loss": 0.3646, "mean_token_accuracy": 0.8814436197280884, "num_tokens": 392762323.0, "step": 10291 }, { "epoch": 1.3092481872535302, "ewc_loss": 0.027775023132562637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7775022317655385e-05, "grad_norm": 16.99819564819336, "learning_rate": 1e-06, "loss": 0.3507, "mean_token_accuracy": 0.8843702673912048, "num_tokens": 392797555.0, "step": 10292 }, { "epoch": 1.3093753975321207, "ewc_loss": 0.0278047826141119, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7804782803286798e-05, "grad_norm": 16.991281509399414, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8779236078262329, "num_tokens": 392840001.0, "step": 10293 }, { "epoch": 1.309502607810711, "ewc_loss": 0.02775779739022255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.775779830699321e-05, "grad_norm": 16.9981746673584, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8667974472045898, "num_tokens": 392880871.0, "step": 10294 }, { "epoch": 1.3096298180893016, "ewc_loss": 0.027744170278310776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7744170438381843e-05, "grad_norm": 16.95827865600586, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8773552179336548, "num_tokens": 392917554.0, "step": 10295 }, { "epoch": 1.309757028367892, "ewc_loss": 0.027783963829278946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7783964469563216e-05, "grad_norm": 16.94700050354004, "learning_rate": 1e-06, "loss": 0.3562, "mean_token_accuracy": 0.8846662044525146, "num_tokens": 392958601.0, "step": 10296 }, { "epoch": 1.3098842386464826, "ewc_loss": 0.0277785062789917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7778505682363175e-05, "grad_norm": 17.024625778198242, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8690527677536011, "num_tokens": 393000108.0, "step": 10297 }, { "epoch": 1.3100114489250732, "ewc_loss": 0.027819858863949776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7819858587463386e-05, "grad_norm": 17.114107131958008, "learning_rate": 1e-06, "loss": 0.3505, "mean_token_accuracy": 0.8888485431671143, "num_tokens": 393032321.0, "step": 10298 }, { "epoch": 1.3101386592036637, "ewc_loss": 0.027763858437538147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7763859179685824e-05, "grad_norm": 17.01607322692871, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8655678629875183, "num_tokens": 393068347.0, "step": 10299 }, { "epoch": 1.3102658694822542, "ewc_loss": 0.027748486027121544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7748486900236458e-05, "grad_norm": 17.08523178100586, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8845845460891724, "num_tokens": 393102509.0, "step": 10300 }, { "epoch": 1.3103930797608447, "ewc_loss": 0.027799390256404877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7799389499705285e-05, "grad_norm": 17.00490379333496, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8757349848747253, "num_tokens": 393140974.0, "step": 10301 }, { "epoch": 1.3105202900394353, "ewc_loss": 0.027774907648563385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.777490772132296e-05, "grad_norm": 17.073772430419922, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8770063519477844, "num_tokens": 393177214.0, "step": 10302 }, { "epoch": 1.3106475003180256, "ewc_loss": 0.027827754616737366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.782775482046418e-05, "grad_norm": 17.015642166137695, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8708117604255676, "num_tokens": 393215801.0, "step": 10303 }, { "epoch": 1.310774710596616, "ewc_loss": 0.027707688510417938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7707688786904328e-05, "grad_norm": 17.007478713989258, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8796223402023315, "num_tokens": 393256444.0, "step": 10304 }, { "epoch": 1.3109019208752066, "ewc_loss": 0.02778819389641285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.778819361992646e-05, "grad_norm": 16.92926025390625, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8640632629394531, "num_tokens": 393300238.0, "step": 10305 }, { "epoch": 1.3110291311537972, "ewc_loss": 0.027773939073085785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.777393819997087e-05, "grad_norm": 17.01689910888672, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8539004325866699, "num_tokens": 393344087.0, "step": 10306 }, { "epoch": 1.3111563414323877, "ewc_loss": 0.027781378477811813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7781377866631374e-05, "grad_norm": 16.92487335205078, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8735752105712891, "num_tokens": 393381678.0, "step": 10307 }, { "epoch": 1.3112835517109782, "ewc_loss": 0.02776382490992546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7763824618887156e-05, "grad_norm": 17.011011123657227, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8661301732063293, "num_tokens": 393415740.0, "step": 10308 }, { "epoch": 1.3114107619895687, "ewc_loss": 0.027805270627141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.780527029244695e-05, "grad_norm": 16.930767059326172, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8442577123641968, "num_tokens": 393457692.0, "step": 10309 }, { "epoch": 1.3115379722681593, "ewc_loss": 0.027813127264380455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7813126507680863e-05, "grad_norm": 16.96668815612793, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.86297607421875, "num_tokens": 393499003.0, "step": 10310 }, { "epoch": 1.3116651825467498, "ewc_loss": 0.027859695255756378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.785969445540104e-05, "grad_norm": 16.99102020263672, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8578771352767944, "num_tokens": 393533842.0, "step": 10311 }, { "epoch": 1.3117923928253403, "ewc_loss": 0.027843324467539787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.784332536975853e-05, "grad_norm": 16.991294860839844, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8757823705673218, "num_tokens": 393573906.0, "step": 10312 }, { "epoch": 1.3119196031039309, "ewc_loss": 0.027829399332404137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7829399186884984e-05, "grad_norm": 17.010698318481445, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8668050765991211, "num_tokens": 393615298.0, "step": 10313 }, { "epoch": 1.3120468133825214, "ewc_loss": 0.0278759878128767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.78759871434886e-05, "grad_norm": 17.091814041137695, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8578901290893555, "num_tokens": 393654690.0, "step": 10314 }, { "epoch": 1.312174023661112, "ewc_loss": 0.027838561683893204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7838561436510645e-05, "grad_norm": 17.047544479370117, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8704246282577515, "num_tokens": 393696222.0, "step": 10315 }, { "epoch": 1.3123012339397024, "ewc_loss": 0.027733350172638893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7733351089409553e-05, "grad_norm": 16.899137496948242, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8646684288978577, "num_tokens": 393736748.0, "step": 10316 }, { "epoch": 1.312428444218293, "ewc_loss": 0.027823274955153465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7823274649563245e-05, "grad_norm": 17.094696044921875, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8596725463867188, "num_tokens": 393773077.0, "step": 10317 }, { "epoch": 1.3125556544968833, "ewc_loss": 0.02781631611287594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.781631519610528e-05, "grad_norm": 16.93358039855957, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.8797551989555359, "num_tokens": 393806466.0, "step": 10318 }, { "epoch": 1.3126828647754738, "ewc_loss": 0.02772814966738224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.772814877971541e-05, "grad_norm": 16.981975555419922, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.864414632320404, "num_tokens": 393840922.0, "step": 10319 }, { "epoch": 1.3128100750540643, "ewc_loss": 0.027842005714774132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.784200660244096e-05, "grad_norm": 16.985050201416016, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8719465732574463, "num_tokens": 393874544.0, "step": 10320 }, { "epoch": 1.3129372853326549, "ewc_loss": 0.027799198403954506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7799198505817913e-05, "grad_norm": 16.982908248901367, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.864065408706665, "num_tokens": 393917773.0, "step": 10321 }, { "epoch": 1.3130644956112454, "ewc_loss": 0.027824603021144867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7824602511827834e-05, "grad_norm": 17.021026611328125, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.863823413848877, "num_tokens": 393951525.0, "step": 10322 }, { "epoch": 1.313191705889836, "ewc_loss": 0.02781265787780285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.781265720841475e-05, "grad_norm": 17.031736373901367, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8719245195388794, "num_tokens": 393989796.0, "step": 10323 }, { "epoch": 1.3133189161684264, "ewc_loss": 0.027799533680081367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7799533199868165e-05, "grad_norm": 17.063640594482422, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8651390075683594, "num_tokens": 394028746.0, "step": 10324 }, { "epoch": 1.313446126447017, "ewc_loss": 0.027863632887601852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7863632567459717e-05, "grad_norm": 17.03346061706543, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8765297532081604, "num_tokens": 394065241.0, "step": 10325 }, { "epoch": 1.3135733367256075, "ewc_loss": 0.027857007458806038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7857007808052003e-05, "grad_norm": 16.99650764465332, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.862879753112793, "num_tokens": 394110113.0, "step": 10326 }, { "epoch": 1.3137005470041978, "ewc_loss": 0.02784043364226818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7840433176606894e-05, "grad_norm": 17.007843017578125, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8517806529998779, "num_tokens": 394146577.0, "step": 10327 }, { "epoch": 1.3138277572827883, "ewc_loss": 0.027887724339962006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7887725082109682e-05, "grad_norm": 17.04362678527832, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8678469657897949, "num_tokens": 394185392.0, "step": 10328 }, { "epoch": 1.3139549675613789, "ewc_loss": 0.027847958728671074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7847958335769363e-05, "grad_norm": 16.971710205078125, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8590115904808044, "num_tokens": 394226301.0, "step": 10329 }, { "epoch": 1.3140821778399694, "ewc_loss": 0.027870751917362213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7870752091985196e-05, "grad_norm": 17.05424690246582, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8592523336410522, "num_tokens": 394262654.0, "step": 10330 }, { "epoch": 1.31420938811856, "ewc_loss": 0.027923010289669037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7923009838559665e-05, "grad_norm": 17.006023406982422, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8789868354797363, "num_tokens": 394299185.0, "step": 10331 }, { "epoch": 1.3143365983971504, "ewc_loss": 0.027904780581593513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7904779926757328e-05, "grad_norm": 17.050323486328125, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8684085011482239, "num_tokens": 394331364.0, "step": 10332 }, { "epoch": 1.314463808675741, "ewc_loss": 0.027951274067163467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.795127329591196e-05, "grad_norm": 17.01145362854004, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8562244176864624, "num_tokens": 394375479.0, "step": 10333 }, { "epoch": 1.3145910189543315, "ewc_loss": 0.027886081486940384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7886080715688877e-05, "grad_norm": 16.956735610961914, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8737735748291016, "num_tokens": 394414913.0, "step": 10334 }, { "epoch": 1.314718229232922, "ewc_loss": 0.027929777279496193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7929776479140855e-05, "grad_norm": 17.046205520629883, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8686481714248657, "num_tokens": 394450889.0, "step": 10335 }, { "epoch": 1.3148454395115126, "ewc_loss": 0.027895666658878326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7895666789845563e-05, "grad_norm": 16.98121452331543, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8514399528503418, "num_tokens": 394495280.0, "step": 10336 }, { "epoch": 1.314972649790103, "ewc_loss": 0.027908697724342346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7908698029932566e-05, "grad_norm": 17.052494049072266, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8764660358428955, "num_tokens": 394536811.0, "step": 10337 }, { "epoch": 1.3150998600686936, "ewc_loss": 0.027907410636544228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7907410185434856e-05, "grad_norm": 16.982030868530273, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8648369312286377, "num_tokens": 394573353.0, "step": 10338 }, { "epoch": 1.3152270703472841, "ewc_loss": 0.027866654098033905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7866653908859007e-05, "grad_norm": 17.072908401489258, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.859799861907959, "num_tokens": 394603517.0, "step": 10339 }, { "epoch": 1.3153542806258747, "ewc_loss": 0.02793739177286625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7937392587773502e-05, "grad_norm": 17.04014778137207, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8740938305854797, "num_tokens": 394650547.0, "step": 10340 }, { "epoch": 1.3154814909044652, "ewc_loss": 0.027853570878505707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7853571737068705e-05, "grad_norm": 16.923322677612305, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8706066012382507, "num_tokens": 394684460.0, "step": 10341 }, { "epoch": 1.3156087011830557, "ewc_loss": 0.027854150161147118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7854150175699033e-05, "grad_norm": 16.973499298095703, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.8830308318138123, "num_tokens": 394733520.0, "step": 10342 }, { "epoch": 1.315735911461646, "ewc_loss": 0.027884233742952347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7884234441444278e-05, "grad_norm": 17.007389068603516, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.862134575843811, "num_tokens": 394775498.0, "step": 10343 }, { "epoch": 1.3158631217402366, "ewc_loss": 0.027877025306224823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7877025786438026e-05, "grad_norm": 16.969266891479492, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.854641318321228, "num_tokens": 394815138.0, "step": 10344 }, { "epoch": 1.315990332018827, "ewc_loss": 0.027858486399054527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7858486646437086e-05, "grad_norm": 16.982187271118164, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8595156073570251, "num_tokens": 394850447.0, "step": 10345 }, { "epoch": 1.3161175422974176, "ewc_loss": 0.027860211208462715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7860211048391648e-05, "grad_norm": 16.973426818847656, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8609665632247925, "num_tokens": 394891800.0, "step": 10346 }, { "epoch": 1.3162447525760081, "ewc_loss": 0.027865663170814514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7865662559634075e-05, "grad_norm": 17.00615882873535, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8749959468841553, "num_tokens": 394930087.0, "step": 10347 }, { "epoch": 1.3163719628545987, "ewc_loss": 0.027887867763638496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.788786696328316e-05, "grad_norm": 16.98761558532715, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8769272565841675, "num_tokens": 394964497.0, "step": 10348 }, { "epoch": 1.3164991731331892, "ewc_loss": 0.02788628078997135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7886280804523267e-05, "grad_norm": 17.029478073120117, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8499910831451416, "num_tokens": 395003127.0, "step": 10349 }, { "epoch": 1.3166263834117797, "ewc_loss": 0.02792980894446373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.792980922095012e-05, "grad_norm": 16.967927932739258, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8570907711982727, "num_tokens": 395040570.0, "step": 10350 }, { "epoch": 1.3167535936903703, "ewc_loss": 0.02788235992193222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7882359063369222e-05, "grad_norm": 17.020410537719727, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8691501021385193, "num_tokens": 395075205.0, "step": 10351 }, { "epoch": 1.3168808039689606, "ewc_loss": 0.027938859537243843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.793885869323276e-05, "grad_norm": 16.9798526763916, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8670703768730164, "num_tokens": 395117492.0, "step": 10352 }, { "epoch": 1.317008014247551, "ewc_loss": 0.027840452268719673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.784045136650093e-05, "grad_norm": 16.993051528930664, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8642586469650269, "num_tokens": 395162449.0, "step": 10353 }, { "epoch": 1.3171352245261416, "ewc_loss": 0.027902856469154358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.790285725495778e-05, "grad_norm": 17.025390625, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8760961294174194, "num_tokens": 395197310.0, "step": 10354 }, { "epoch": 1.3172624348047322, "ewc_loss": 0.027912627905607224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7912627047044225e-05, "grad_norm": 16.979963302612305, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8782278895378113, "num_tokens": 395232420.0, "step": 10355 }, { "epoch": 1.3173896450833227, "ewc_loss": 0.027896912768483162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7896912797586992e-05, "grad_norm": 16.989343643188477, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.846560001373291, "num_tokens": 395269392.0, "step": 10356 }, { "epoch": 1.3175168553619132, "ewc_loss": 0.027902929112315178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.790292819554452e-05, "grad_norm": 17.0257625579834, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8560799360275269, "num_tokens": 395305496.0, "step": 10357 }, { "epoch": 1.3176440656405037, "ewc_loss": 0.027909673750400543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7909673008252867e-05, "grad_norm": 16.961158752441406, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.876159131526947, "num_tokens": 395347089.0, "step": 10358 }, { "epoch": 1.3177712759190943, "ewc_loss": 0.02789553813636303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7895537641597912e-05, "grad_norm": 16.915407180786133, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8775426745414734, "num_tokens": 395382885.0, "step": 10359 }, { "epoch": 1.3178984861976848, "ewc_loss": 0.02793174609541893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7931746444664896e-05, "grad_norm": 17.03464698791504, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8817057609558105, "num_tokens": 395416260.0, "step": 10360 }, { "epoch": 1.3180256964762753, "ewc_loss": 0.02793901413679123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.793901330733206e-05, "grad_norm": 17.000879287719727, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8583148717880249, "num_tokens": 395454179.0, "step": 10361 }, { "epoch": 1.3181529067548658, "ewc_loss": 0.027919922024011612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7919921194552444e-05, "grad_norm": 17.039554595947266, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8579421043395996, "num_tokens": 395489173.0, "step": 10362 }, { "epoch": 1.3182801170334564, "ewc_loss": 0.027927614748477936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7927615519729443e-05, "grad_norm": 16.94009017944336, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8607954978942871, "num_tokens": 395532057.0, "step": 10363 }, { "epoch": 1.318407327312047, "ewc_loss": 0.027900680899620056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.790068174363114e-05, "grad_norm": 17.00377655029297, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8735248446464539, "num_tokens": 395570321.0, "step": 10364 }, { "epoch": 1.3185345375906374, "ewc_loss": 0.027980413287878036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7980413506156765e-05, "grad_norm": 16.97931671142578, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.8870733976364136, "num_tokens": 395609041.0, "step": 10365 }, { "epoch": 1.318661747869228, "ewc_loss": 0.02793772704899311, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7937727281823754e-05, "grad_norm": 17.02816390991211, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8443127870559692, "num_tokens": 395649961.0, "step": 10366 }, { "epoch": 1.3187889581478183, "ewc_loss": 0.027966443449258804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7966443667537533e-05, "grad_norm": 16.982358932495117, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8684676885604858, "num_tokens": 395691882.0, "step": 10367 }, { "epoch": 1.3189161684264088, "ewc_loss": 0.027927227318286896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7927228074986488e-05, "grad_norm": 17.015548706054688, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8548935651779175, "num_tokens": 395731496.0, "step": 10368 }, { "epoch": 1.3190433787049993, "ewc_loss": 0.027984026819467545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7984026019112207e-05, "grad_norm": 17.070327758789062, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8715065121650696, "num_tokens": 395772618.0, "step": 10369 }, { "epoch": 1.3191705889835899, "ewc_loss": 0.027900107204914093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.790010694297962e-05, "grad_norm": 16.974103927612305, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8509382009506226, "num_tokens": 395811907.0, "step": 10370 }, { "epoch": 1.3192977992621804, "ewc_loss": 0.027927357703447342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.792735722323414e-05, "grad_norm": 17.09362030029297, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8619693517684937, "num_tokens": 395851186.0, "step": 10371 }, { "epoch": 1.319425009540771, "ewc_loss": 0.027911504730582237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7911504730582237e-05, "grad_norm": 16.944299697875977, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8676655888557434, "num_tokens": 395893868.0, "step": 10372 }, { "epoch": 1.3195522198193614, "ewc_loss": 0.02786123938858509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.786123877740465e-05, "grad_norm": 17.057777404785156, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8765401244163513, "num_tokens": 395930519.0, "step": 10373 }, { "epoch": 1.319679430097952, "ewc_loss": 0.02794186770915985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7941867301706225e-05, "grad_norm": 16.99609375, "learning_rate": 1e-06, "loss": 0.3611, "mean_token_accuracy": 0.8828198313713074, "num_tokens": 395965692.0, "step": 10374 }, { "epoch": 1.3198066403765425, "ewc_loss": 0.027848882600665092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7848882382386364e-05, "grad_norm": 17.004323959350586, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.86947101354599, "num_tokens": 396002316.0, "step": 10375 }, { "epoch": 1.3199338506551328, "ewc_loss": 0.027890261262655258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7890260753338225e-05, "grad_norm": 17.03568458557129, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8736305236816406, "num_tokens": 396043392.0, "step": 10376 }, { "epoch": 1.3200610609337233, "ewc_loss": 0.027852743864059448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7852744096890092e-05, "grad_norm": 17.0339412689209, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.864921510219574, "num_tokens": 396078603.0, "step": 10377 }, { "epoch": 1.3201882712123139, "ewc_loss": 0.027822352945804596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7822352421935648e-05, "grad_norm": 17.016860961914062, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.858803391456604, "num_tokens": 396121132.0, "step": 10378 }, { "epoch": 1.3203154814909044, "ewc_loss": 0.027827193960547447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7827194571727887e-05, "grad_norm": 17.040674209594727, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8768796324729919, "num_tokens": 396155730.0, "step": 10379 }, { "epoch": 1.320442691769495, "ewc_loss": 0.027808869257569313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7808870072476566e-05, "grad_norm": 16.98817253112793, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8531461954116821, "num_tokens": 396190976.0, "step": 10380 }, { "epoch": 1.3205699020480854, "ewc_loss": 0.027869055047631264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7869054974871688e-05, "grad_norm": 17.05828094482422, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8662192821502686, "num_tokens": 396229571.0, "step": 10381 }, { "epoch": 1.320697112326676, "ewc_loss": 0.027866849675774574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.786685035971459e-05, "grad_norm": 16.992115020751953, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8662882447242737, "num_tokens": 396265586.0, "step": 10382 }, { "epoch": 1.3208243226052665, "ewc_loss": 0.027881454676389694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.788145502563566e-05, "grad_norm": 17.07138442993164, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8719446659088135, "num_tokens": 396303810.0, "step": 10383 }, { "epoch": 1.320951532883857, "ewc_loss": 0.02788979932665825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7889798730029725e-05, "grad_norm": 16.973726272583008, "learning_rate": 1e-06, "loss": 0.3477, "mean_token_accuracy": 0.8877285718917847, "num_tokens": 396345671.0, "step": 10384 }, { "epoch": 1.3210787431624476, "ewc_loss": 0.027805054560303688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7805053832707927e-05, "grad_norm": 17.014949798583984, "learning_rate": 1e-06, "loss": 0.3577, "mean_token_accuracy": 0.884576678276062, "num_tokens": 396388062.0, "step": 10385 }, { "epoch": 1.321205953441038, "ewc_loss": 0.02789299562573433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7892996513401158e-05, "grad_norm": 16.99981117248535, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.880570650100708, "num_tokens": 396425189.0, "step": 10386 }, { "epoch": 1.3213331637196286, "ewc_loss": 0.027824940159916878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7824940843856893e-05, "grad_norm": 17.058177947998047, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8579444885253906, "num_tokens": 396461861.0, "step": 10387 }, { "epoch": 1.3214603739982191, "ewc_loss": 0.027909215539693832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7909216441912577e-05, "grad_norm": 17.004056930541992, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8791585564613342, "num_tokens": 396506190.0, "step": 10388 }, { "epoch": 1.3215875842768097, "ewc_loss": 0.027835510671138763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.78355109912809e-05, "grad_norm": 17.028501510620117, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8695875406265259, "num_tokens": 396538900.0, "step": 10389 }, { "epoch": 1.3217147945554002, "ewc_loss": 0.027847014367580414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7847014280268922e-05, "grad_norm": 16.987953186035156, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.876860499382019, "num_tokens": 396577029.0, "step": 10390 }, { "epoch": 1.3218420048339907, "ewc_loss": 0.02781202457845211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7812024200102314e-05, "grad_norm": 16.949270248413086, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8522706031799316, "num_tokens": 396616917.0, "step": 10391 }, { "epoch": 1.321969215112581, "ewc_loss": 0.02786894328892231, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.786894401651807e-05, "grad_norm": 16.94881248474121, "learning_rate": 1e-06, "loss": 0.372, "mean_token_accuracy": 0.8770488500595093, "num_tokens": 396651726.0, "step": 10392 }, { "epoch": 1.3220964253911716, "ewc_loss": 0.027902774512767792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7902773581445217e-05, "grad_norm": 17.054462432861328, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8555489182472229, "num_tokens": 396693796.0, "step": 10393 }, { "epoch": 1.322223635669762, "ewc_loss": 0.027841096743941307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7841097107739188e-05, "grad_norm": 16.909896850585938, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8580451607704163, "num_tokens": 396731947.0, "step": 10394 }, { "epoch": 1.3223508459483526, "ewc_loss": 0.027855398133397102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7855398002429865e-05, "grad_norm": 17.022014617919922, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8706567883491516, "num_tokens": 396767062.0, "step": 10395 }, { "epoch": 1.3224780562269431, "ewc_loss": 0.027994906529784203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.799490721372422e-05, "grad_norm": 16.95207977294922, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8704496026039124, "num_tokens": 396803918.0, "step": 10396 }, { "epoch": 1.3226052665055337, "ewc_loss": 0.02784554846584797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7845548174809664e-05, "grad_norm": 16.993976593017578, "learning_rate": 1e-06, "loss": 0.5022, "mean_token_accuracy": 0.8404417634010315, "num_tokens": 396850314.0, "step": 10397 }, { "epoch": 1.3227324767841242, "ewc_loss": 0.02798629179596901, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7986292479909025e-05, "grad_norm": 16.9500789642334, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8611852526664734, "num_tokens": 396887920.0, "step": 10398 }, { "epoch": 1.3228596870627147, "ewc_loss": 0.027877790853381157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.787779158097692e-05, "grad_norm": 17.024520874023438, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8565670847892761, "num_tokens": 396923369.0, "step": 10399 }, { "epoch": 1.3229868973413053, "ewc_loss": 0.027965134009718895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.796513399516698e-05, "grad_norm": 17.015411376953125, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8661624193191528, "num_tokens": 396962338.0, "step": 10400 }, { "epoch": 1.3231141076198956, "ewc_loss": 0.0279543437063694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7954343750025146e-05, "grad_norm": 16.983671188354492, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8779346942901611, "num_tokens": 396998275.0, "step": 10401 }, { "epoch": 1.323241317898486, "ewc_loss": 0.027897458523511887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7897458494408056e-05, "grad_norm": 16.967479705810547, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8656178712844849, "num_tokens": 397040034.0, "step": 10402 }, { "epoch": 1.3233685281770766, "ewc_loss": 0.027974147349596024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.797414708766155e-05, "grad_norm": 17.00651741027832, "learning_rate": 1e-06, "loss": 0.3592, "mean_token_accuracy": 0.8853448629379272, "num_tokens": 397080549.0, "step": 10403 }, { "epoch": 1.3234957384556671, "ewc_loss": 0.02794838696718216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7948386559728533e-05, "grad_norm": 16.975069046020508, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.8804057240486145, "num_tokens": 397120803.0, "step": 10404 }, { "epoch": 1.3236229487342577, "ewc_loss": 0.027963202446699142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7963202228420414e-05, "grad_norm": 16.978185653686523, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8717672824859619, "num_tokens": 397160463.0, "step": 10405 }, { "epoch": 1.3237501590128482, "ewc_loss": 0.027914827689528465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7914828024222516e-05, "grad_norm": 16.970596313476562, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8610135316848755, "num_tokens": 397205425.0, "step": 10406 }, { "epoch": 1.3238773692914387, "ewc_loss": 0.02796822413802147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7968224458163604e-05, "grad_norm": 17.044050216674805, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8636161088943481, "num_tokens": 397238907.0, "step": 10407 }, { "epoch": 1.3240045795700293, "ewc_loss": 0.027976201847195625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7976202545687556e-05, "grad_norm": 17.039772033691406, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8679139614105225, "num_tokens": 397281639.0, "step": 10408 }, { "epoch": 1.3241317898486198, "ewc_loss": 0.027902161702513695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7902162401005626e-05, "grad_norm": 16.982799530029297, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8567418456077576, "num_tokens": 397323350.0, "step": 10409 }, { "epoch": 1.3242590001272103, "ewc_loss": 0.02792133204638958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7921332730329596e-05, "grad_norm": 17.00027084350586, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8610306978225708, "num_tokens": 397364438.0, "step": 10410 }, { "epoch": 1.3243862104058008, "ewc_loss": 0.027917547151446342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7917547413380817e-05, "grad_norm": 17.029905319213867, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8554568290710449, "num_tokens": 397406881.0, "step": 10411 }, { "epoch": 1.3245134206843914, "ewc_loss": 0.02786378562450409, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7863785362569615e-05, "grad_norm": 16.9521541595459, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8686662912368774, "num_tokens": 397451190.0, "step": 10412 }, { "epoch": 1.324640630962982, "ewc_loss": 0.027850011363625526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7850011974805966e-05, "grad_norm": 16.987913131713867, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8685668706893921, "num_tokens": 397491063.0, "step": 10413 }, { "epoch": 1.3247678412415724, "ewc_loss": 0.027900541201233864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7900541681447066e-05, "grad_norm": 16.984769821166992, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8687209486961365, "num_tokens": 397529961.0, "step": 10414 }, { "epoch": 1.324895051520163, "ewc_loss": 0.027856914326548576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7856915039592423e-05, "grad_norm": 17.01390838623047, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8784348368644714, "num_tokens": 397565616.0, "step": 10415 }, { "epoch": 1.3250222617987533, "ewc_loss": 0.027874920517206192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7874921215698123e-05, "grad_norm": 17.023576736450195, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8743113279342651, "num_tokens": 397602223.0, "step": 10416 }, { "epoch": 1.3251494720773438, "ewc_loss": 0.027948511764407158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7948512069997378e-05, "grad_norm": 17.041231155395508, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8460683822631836, "num_tokens": 397638910.0, "step": 10417 }, { "epoch": 1.3252766823559343, "ewc_loss": 0.027875909581780434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.787590892694425e-05, "grad_norm": 17.054506301879883, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8607886433601379, "num_tokens": 397681200.0, "step": 10418 }, { "epoch": 1.3254038926345248, "ewc_loss": 0.027923742309212685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7923742891289294e-05, "grad_norm": 17.02167320251465, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8698042035102844, "num_tokens": 397724832.0, "step": 10419 }, { "epoch": 1.3255311029131154, "ewc_loss": 0.027822665870189667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7822665288113058e-05, "grad_norm": 16.9487361907959, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8594815731048584, "num_tokens": 397765537.0, "step": 10420 }, { "epoch": 1.325658313191706, "ewc_loss": 0.027886386960744858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7886386305908673e-05, "grad_norm": 17.086454391479492, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8582247495651245, "num_tokens": 397800945.0, "step": 10421 }, { "epoch": 1.3257855234702964, "ewc_loss": 0.02793067879974842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7930678697885014e-05, "grad_norm": 16.974853515625, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8743605613708496, "num_tokens": 397840786.0, "step": 10422 }, { "epoch": 1.325912733748887, "ewc_loss": 0.027820928022265434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.782092815323267e-05, "grad_norm": 17.051334381103516, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8723013401031494, "num_tokens": 397875607.0, "step": 10423 }, { "epoch": 1.3260399440274775, "ewc_loss": 0.02793312445282936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7933125238632783e-05, "grad_norm": 17.07062339782715, "learning_rate": 1e-06, "loss": 0.3609, "mean_token_accuracy": 0.8836936950683594, "num_tokens": 397909524.0, "step": 10424 }, { "epoch": 1.3261671543060678, "ewc_loss": 0.027826903387904167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.782690353342332e-05, "grad_norm": 17.027873992919922, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8521571755409241, "num_tokens": 397947862.0, "step": 10425 }, { "epoch": 1.3262943645846583, "ewc_loss": 0.02786669321358204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7866693926625885e-05, "grad_norm": 17.034025192260742, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8816690444946289, "num_tokens": 397980958.0, "step": 10426 }, { "epoch": 1.3264215748632489, "ewc_loss": 0.027910133823752403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7910133212571964e-05, "grad_norm": 17.067943572998047, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8655902743339539, "num_tokens": 398021231.0, "step": 10427 }, { "epoch": 1.3265487851418394, "ewc_loss": 0.027830177918076515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7830177714349702e-05, "grad_norm": 16.980117797851562, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.865073561668396, "num_tokens": 398059202.0, "step": 10428 }, { "epoch": 1.32667599542043, "ewc_loss": 0.027897585183382034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7897585823666304e-05, "grad_norm": 17.0965576171875, "learning_rate": 1e-06, "loss": 0.3419, "mean_token_accuracy": 0.8896663784980774, "num_tokens": 398097121.0, "step": 10429 }, { "epoch": 1.3268032056990204, "ewc_loss": 0.027908606454730034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.790860708046239e-05, "grad_norm": 17.071182250976562, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8670393824577332, "num_tokens": 398130413.0, "step": 10430 }, { "epoch": 1.326930415977611, "ewc_loss": 0.02783859148621559, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7838592359330505e-05, "grad_norm": 17.041561126708984, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8665639162063599, "num_tokens": 398172535.0, "step": 10431 }, { "epoch": 1.3270576262562015, "ewc_loss": 0.027875464409589767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7875465093529783e-05, "grad_norm": 17.078231811523438, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8685232400894165, "num_tokens": 398206681.0, "step": 10432 }, { "epoch": 1.327184836534792, "ewc_loss": 0.02788502909243107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7885029339813627e-05, "grad_norm": 17.1074275970459, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8716303110122681, "num_tokens": 398245206.0, "step": 10433 }, { "epoch": 1.3273120468133826, "ewc_loss": 0.027844013646245003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7844012947753072e-05, "grad_norm": 17.01751708984375, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8640685081481934, "num_tokens": 398278902.0, "step": 10434 }, { "epoch": 1.327439257091973, "ewc_loss": 0.027897989377379417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.789798963931389e-05, "grad_norm": 17.114765167236328, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8683305382728577, "num_tokens": 398315833.0, "step": 10435 }, { "epoch": 1.3275664673705636, "ewc_loss": 0.027902446687221527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.790244616335258e-05, "grad_norm": 16.98377799987793, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8494964241981506, "num_tokens": 398356496.0, "step": 10436 }, { "epoch": 1.3276936776491541, "ewc_loss": 0.027834231033921242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7834230422740802e-05, "grad_norm": 17.087810516357422, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8738933801651001, "num_tokens": 398393661.0, "step": 10437 }, { "epoch": 1.3278208879277447, "ewc_loss": 0.02787676267325878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.787676203297451e-05, "grad_norm": 17.019845962524414, "learning_rate": 1e-06, "loss": 0.3506, "mean_token_accuracy": 0.8848190307617188, "num_tokens": 398430463.0, "step": 10438 }, { "epoch": 1.3279480982063352, "ewc_loss": 0.027823762968182564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7823762138723396e-05, "grad_norm": 17.092586517333984, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8668113946914673, "num_tokens": 398470382.0, "step": 10439 }, { "epoch": 1.3280753084849257, "ewc_loss": 0.02789304032921791, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7893040169146843e-05, "grad_norm": 17.062341690063477, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.850612998008728, "num_tokens": 398504793.0, "step": 10440 }, { "epoch": 1.328202518763516, "ewc_loss": 0.02783641591668129, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.783641502901446e-05, "grad_norm": 16.9481143951416, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.85844486951828, "num_tokens": 398541038.0, "step": 10441 }, { "epoch": 1.3283297290421066, "ewc_loss": 0.027899859473109245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7899859560420737e-05, "grad_norm": 17.005678176879883, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8505464792251587, "num_tokens": 398578047.0, "step": 10442 }, { "epoch": 1.328456939320697, "ewc_loss": 0.02797459252178669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7974592740065418e-05, "grad_norm": 17.049833297729492, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8570219278335571, "num_tokens": 398618946.0, "step": 10443 }, { "epoch": 1.3285841495992876, "ewc_loss": 0.027916453778743744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7916454200749286e-05, "grad_norm": 16.94832992553711, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8721327781677246, "num_tokens": 398656895.0, "step": 10444 }, { "epoch": 1.3287113598778781, "ewc_loss": 0.027989961206912994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7989961381535977e-05, "grad_norm": 17.038217544555664, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8668489456176758, "num_tokens": 398696521.0, "step": 10445 }, { "epoch": 1.3288385701564687, "ewc_loss": 0.028022512793540955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.802251219691243e-05, "grad_norm": 17.027236938476562, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8681329488754272, "num_tokens": 398735275.0, "step": 10446 }, { "epoch": 1.3289657804350592, "ewc_loss": 0.02792130596935749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7921305445488542e-05, "grad_norm": 16.991565704345703, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8527655601501465, "num_tokens": 398774753.0, "step": 10447 }, { "epoch": 1.3290929907136497, "ewc_loss": 0.02797630988061428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7976309866062365e-05, "grad_norm": 17.017532348632812, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8617337942123413, "num_tokens": 398810893.0, "step": 10448 }, { "epoch": 1.3292202009922403, "ewc_loss": 0.028016388416290283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.801638765959069e-05, "grad_norm": 17.07000732421875, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8615733981132507, "num_tokens": 398849741.0, "step": 10449 }, { "epoch": 1.3293474112708306, "ewc_loss": 0.027960022911429405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7960022634943016e-05, "grad_norm": 16.99321937561035, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8611060380935669, "num_tokens": 398881186.0, "step": 10450 }, { "epoch": 1.329474621549421, "ewc_loss": 0.02803610824048519, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8036109142703936e-05, "grad_norm": 17.119895935058594, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.8843660950660706, "num_tokens": 398918677.0, "step": 10451 }, { "epoch": 1.3296018318280116, "ewc_loss": 0.027993157505989075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7993157345918007e-05, "grad_norm": 16.97976303100586, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.864035964012146, "num_tokens": 398953435.0, "step": 10452 }, { "epoch": 1.3297290421066021, "ewc_loss": 0.02802332118153572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8023321647197008e-05, "grad_norm": 17.115726470947266, "learning_rate": 1e-06, "loss": 0.3914, "mean_token_accuracy": 0.8743808269500732, "num_tokens": 398991799.0, "step": 10453 }, { "epoch": 1.3298562523851927, "ewc_loss": 0.02805091068148613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8050910259480588e-05, "grad_norm": 16.978029251098633, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8720818758010864, "num_tokens": 399023211.0, "step": 10454 }, { "epoch": 1.3299834626637832, "ewc_loss": 0.02798938751220703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7989386580884457e-05, "grad_norm": 17.086252212524414, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8639708757400513, "num_tokens": 399061431.0, "step": 10455 }, { "epoch": 1.3301106729423737, "ewc_loss": 0.028101781383156776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8101781936129555e-05, "grad_norm": 16.99386215209961, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.869591236114502, "num_tokens": 399097384.0, "step": 10456 }, { "epoch": 1.3302378832209643, "ewc_loss": 0.02801685221493244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8016851501888596e-05, "grad_norm": 17.0358829498291, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8698927164077759, "num_tokens": 399133935.0, "step": 10457 }, { "epoch": 1.3303650934995548, "ewc_loss": 0.028094597160816193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.809459692798555e-05, "grad_norm": 16.95572853088379, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8729616403579712, "num_tokens": 399165709.0, "step": 10458 }, { "epoch": 1.3304923037781453, "ewc_loss": 0.02800692804157734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8006928914692253e-05, "grad_norm": 17.09142303466797, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8652275800704956, "num_tokens": 399203548.0, "step": 10459 }, { "epoch": 1.3306195140567358, "ewc_loss": 0.02809770591557026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.809770558087621e-05, "grad_norm": 17.026660919189453, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8661812543869019, "num_tokens": 399244341.0, "step": 10460 }, { "epoch": 1.3307467243353264, "ewc_loss": 0.028007332235574722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.800733273033984e-05, "grad_norm": 17.109664916992188, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.87198805809021, "num_tokens": 399282284.0, "step": 10461 }, { "epoch": 1.330873934613917, "ewc_loss": 0.02805968187749386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8059681426384486e-05, "grad_norm": 16.9578914642334, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8807620406150818, "num_tokens": 399320523.0, "step": 10462 }, { "epoch": 1.3310011448925074, "ewc_loss": 0.027957040816545486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7957041311310604e-05, "grad_norm": 17.02393341064453, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.867896556854248, "num_tokens": 399357489.0, "step": 10463 }, { "epoch": 1.331128355171098, "ewc_loss": 0.02808007225394249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8080072297598235e-05, "grad_norm": 17.088138580322266, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8553370237350464, "num_tokens": 399393561.0, "step": 10464 }, { "epoch": 1.3312555654496883, "ewc_loss": 0.02803073264658451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8030732210027054e-05, "grad_norm": 17.122894287109375, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8721291422843933, "num_tokens": 399424514.0, "step": 10465 }, { "epoch": 1.3313827757282788, "ewc_loss": 0.027996990829706192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.799698995659128e-05, "grad_norm": 16.997560501098633, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8732939958572388, "num_tokens": 399461372.0, "step": 10466 }, { "epoch": 1.3315099860068693, "ewc_loss": 0.027942614629864693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7942614906351082e-05, "grad_norm": 17.12042808532715, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8811527490615845, "num_tokens": 399502503.0, "step": 10467 }, { "epoch": 1.3316371962854598, "ewc_loss": 0.028045404702425003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8045404178556055e-05, "grad_norm": 16.99380874633789, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8617755174636841, "num_tokens": 399537836.0, "step": 10468 }, { "epoch": 1.3317644065640504, "ewc_loss": 0.02794032357633114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7940322979702614e-05, "grad_norm": 17.11720848083496, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8637069463729858, "num_tokens": 399575189.0, "step": 10469 }, { "epoch": 1.331891616842641, "ewc_loss": 0.02801489271223545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8014892450300977e-05, "grad_norm": 17.04271697998047, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8497180938720703, "num_tokens": 399614406.0, "step": 10470 }, { "epoch": 1.3320188271212314, "ewc_loss": 0.027901874855160713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7901875000679865e-05, "grad_norm": 17.02264976501465, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8736475110054016, "num_tokens": 399652847.0, "step": 10471 }, { "epoch": 1.332146037399822, "ewc_loss": 0.02803969569504261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8039696189807728e-05, "grad_norm": 17.11501693725586, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8569749593734741, "num_tokens": 399691628.0, "step": 10472 }, { "epoch": 1.3322732476784125, "ewc_loss": 0.02799210697412491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.799210778903216e-05, "grad_norm": 17.05777359008789, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8780497312545776, "num_tokens": 399732875.0, "step": 10473 }, { "epoch": 1.3324004579570028, "ewc_loss": 0.027986066415905952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7986066925222985e-05, "grad_norm": 17.0911865234375, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8787562847137451, "num_tokens": 399769537.0, "step": 10474 }, { "epoch": 1.3325276682355933, "ewc_loss": 0.027924858033657074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7924857931793667e-05, "grad_norm": 17.054777145385742, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8521275520324707, "num_tokens": 399807757.0, "step": 10475 }, { "epoch": 1.3326548785141838, "ewc_loss": 0.027962220832705498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.79622199741425e-05, "grad_norm": 17.093448638916016, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8734793663024902, "num_tokens": 399837351.0, "step": 10476 }, { "epoch": 1.3327820887927744, "ewc_loss": 0.02791690267622471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.791690349113196e-05, "grad_norm": 17.046756744384766, "learning_rate": 1e-06, "loss": 0.3586, "mean_token_accuracy": 0.8856920003890991, "num_tokens": 399873678.0, "step": 10477 }, { "epoch": 1.332909299071365, "ewc_loss": 0.027918102219700813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7918102205148898e-05, "grad_norm": 17.037681579589844, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8699613213539124, "num_tokens": 399916206.0, "step": 10478 }, { "epoch": 1.3330365093499554, "ewc_loss": 0.027937347069382668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7937347113038413e-05, "grad_norm": 16.993247985839844, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8796161413192749, "num_tokens": 399954144.0, "step": 10479 }, { "epoch": 1.333163719628546, "ewc_loss": 0.027915820479393005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.791582119243685e-05, "grad_norm": 17.02500343322754, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8753658533096313, "num_tokens": 399995865.0, "step": 10480 }, { "epoch": 1.3332909299071365, "ewc_loss": 0.027990272268652916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7990272428723983e-05, "grad_norm": 17.14153289794922, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8701267242431641, "num_tokens": 400033234.0, "step": 10481 }, { "epoch": 1.333418140185727, "ewc_loss": 0.02793055772781372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7930556825594977e-05, "grad_norm": 16.935157775878906, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8771330118179321, "num_tokens": 400066842.0, "step": 10482 }, { "epoch": 1.3335453504643175, "ewc_loss": 0.02794477716088295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7944777684751898e-05, "grad_norm": 17.12066650390625, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.8794935941696167, "num_tokens": 400099380.0, "step": 10483 }, { "epoch": 1.333672560742908, "ewc_loss": 0.028007274493575096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8007274522678927e-05, "grad_norm": 17.037639617919922, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8796535134315491, "num_tokens": 400137419.0, "step": 10484 }, { "epoch": 1.3337997710214986, "ewc_loss": 0.027909988537430763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.790998769341968e-05, "grad_norm": 17.050859451293945, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8685125708580017, "num_tokens": 400171086.0, "step": 10485 }, { "epoch": 1.3339269813000891, "ewc_loss": 0.027949854731559753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7949854484177195e-05, "grad_norm": 17.065351486206055, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8664799928665161, "num_tokens": 400206763.0, "step": 10486 }, { "epoch": 1.3340541915786797, "ewc_loss": 0.027993090450763702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7993090043310076e-05, "grad_norm": 17.005586624145508, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8526583313941956, "num_tokens": 400242708.0, "step": 10487 }, { "epoch": 1.3341814018572702, "ewc_loss": 0.0279769878834486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7976988349109888e-05, "grad_norm": 17.09981918334961, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8485552072525024, "num_tokens": 400277417.0, "step": 10488 }, { "epoch": 1.3343086121358605, "ewc_loss": 0.028058357536792755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8058357202098705e-05, "grad_norm": 17.03721046447754, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.867377519607544, "num_tokens": 400315811.0, "step": 10489 }, { "epoch": 1.334435822414451, "ewc_loss": 0.027905212715268135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7905212846235372e-05, "grad_norm": 16.992267608642578, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8484784364700317, "num_tokens": 400356321.0, "step": 10490 }, { "epoch": 1.3345630326930416, "ewc_loss": 0.028021594509482384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.802159360726364e-05, "grad_norm": 17.071298599243164, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8683915138244629, "num_tokens": 400392382.0, "step": 10491 }, { "epoch": 1.334690242971632, "ewc_loss": 0.028045615181326866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8045615181326866e-05, "grad_norm": 17.057435989379883, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8747797608375549, "num_tokens": 400434947.0, "step": 10492 }, { "epoch": 1.3348174532502226, "ewc_loss": 0.02799959108233452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.799959111143835e-05, "grad_norm": 17.076814651489258, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8493653535842896, "num_tokens": 400472742.0, "step": 10493 }, { "epoch": 1.3349446635288131, "ewc_loss": 0.028022926300764084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.802292692649644e-05, "grad_norm": 17.02835464477539, "learning_rate": 1e-06, "loss": 0.3635, "mean_token_accuracy": 0.8817237019538879, "num_tokens": 400510029.0, "step": 10494 }, { "epoch": 1.3350718738074037, "ewc_loss": 0.027989665046334267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.79896648862632e-05, "grad_norm": 17.01064109802246, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8651652336120605, "num_tokens": 400547383.0, "step": 10495 }, { "epoch": 1.3351990840859942, "ewc_loss": 0.028090130537748337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8090131308999844e-05, "grad_norm": 17.119815826416016, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8788861036300659, "num_tokens": 400585242.0, "step": 10496 }, { "epoch": 1.3353262943645847, "ewc_loss": 0.028083190321922302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8083190045435913e-05, "grad_norm": 17.114730834960938, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8789191842079163, "num_tokens": 400623117.0, "step": 10497 }, { "epoch": 1.3354535046431752, "ewc_loss": 0.027955656871199608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7955657060374506e-05, "grad_norm": 17.004528045654297, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8574988842010498, "num_tokens": 400660170.0, "step": 10498 }, { "epoch": 1.3355807149217656, "ewc_loss": 0.02803531102836132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.803531060635578e-05, "grad_norm": 17.097488403320312, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.874409556388855, "num_tokens": 400700467.0, "step": 10499 }, { "epoch": 1.335707925200356, "ewc_loss": 0.028011970221996307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.801196933432948e-05, "grad_norm": 17.06460952758789, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8623359203338623, "num_tokens": 400740269.0, "step": 10500 }, { "epoch": 1.3358351354789466, "ewc_loss": 0.027943316847085953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.794331703626085e-05, "grad_norm": 17.02708625793457, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8783444166183472, "num_tokens": 400776465.0, "step": 10501 }, { "epoch": 1.3359623457575371, "ewc_loss": 0.028000306338071823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8000305974273942e-05, "grad_norm": 17.069059371948242, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8660649657249451, "num_tokens": 400811210.0, "step": 10502 }, { "epoch": 1.3360895560361277, "ewc_loss": 0.028050502762198448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8050502805854194e-05, "grad_norm": 17.130611419677734, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8601292371749878, "num_tokens": 400848820.0, "step": 10503 }, { "epoch": 1.3362167663147182, "ewc_loss": 0.02797205001115799, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.797204979287926e-05, "grad_norm": 16.945268630981445, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8695889711380005, "num_tokens": 400888774.0, "step": 10504 }, { "epoch": 1.3363439765933087, "ewc_loss": 0.027954787015914917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.795478758343961e-05, "grad_norm": 17.101394653320312, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8699798583984375, "num_tokens": 400935067.0, "step": 10505 }, { "epoch": 1.3364711868718993, "ewc_loss": 0.02807331085205078, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8073311113985255e-05, "grad_norm": 17.022584915161133, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8775601387023926, "num_tokens": 400974122.0, "step": 10506 }, { "epoch": 1.3365983971504898, "ewc_loss": 0.027934730052947998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.793472958728671e-05, "grad_norm": 17.027355194091797, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8777619004249573, "num_tokens": 401008911.0, "step": 10507 }, { "epoch": 1.3367256074290803, "ewc_loss": 0.02801062911748886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8010628739139065e-05, "grad_norm": 17.02957534790039, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8743725419044495, "num_tokens": 401039453.0, "step": 10508 }, { "epoch": 1.3368528177076708, "ewc_loss": 0.027955008670687675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7955009500146843e-05, "grad_norm": 16.981157302856445, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8771508932113647, "num_tokens": 401075854.0, "step": 10509 }, { "epoch": 1.3369800279862614, "ewc_loss": 0.028013786301016808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.801378650474362e-05, "grad_norm": 17.052440643310547, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8667385578155518, "num_tokens": 401114922.0, "step": 10510 }, { "epoch": 1.337107238264852, "ewc_loss": 0.02804090827703476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.804090763675049e-05, "grad_norm": 17.035133361816406, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.8763087391853333, "num_tokens": 401155833.0, "step": 10511 }, { "epoch": 1.3372344485434424, "ewc_loss": 0.027975227683782578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7975227567367256e-05, "grad_norm": 17.024003982543945, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.874119222164154, "num_tokens": 401196971.0, "step": 10512 }, { "epoch": 1.337361658822033, "ewc_loss": 0.027979938313364983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.797993874992244e-05, "grad_norm": 17.054113388061523, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8370752334594727, "num_tokens": 401237312.0, "step": 10513 }, { "epoch": 1.3374888691006233, "ewc_loss": 0.027987854555249214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.798785499180667e-05, "grad_norm": 17.05887794494629, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8659867644309998, "num_tokens": 401274630.0, "step": 10514 }, { "epoch": 1.3376160793792138, "ewc_loss": 0.0279913991689682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.799139838316478e-05, "grad_norm": 17.006214141845703, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.873681902885437, "num_tokens": 401311780.0, "step": 10515 }, { "epoch": 1.3377432896578043, "ewc_loss": 0.027981603518128395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7981603125226684e-05, "grad_norm": 17.029390335083008, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8767828941345215, "num_tokens": 401352614.0, "step": 10516 }, { "epoch": 1.3378704999363948, "ewc_loss": 0.028031235560774803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8031236070091836e-05, "grad_norm": 17.049468994140625, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8707095980644226, "num_tokens": 401394495.0, "step": 10517 }, { "epoch": 1.3379977102149854, "ewc_loss": 0.02798250876367092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.798250898194965e-05, "grad_norm": 17.125713348388672, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8548616766929626, "num_tokens": 401427758.0, "step": 10518 }, { "epoch": 1.338124920493576, "ewc_loss": 0.027978472411632538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.797847264446318e-05, "grad_norm": 16.8951473236084, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8536461591720581, "num_tokens": 401473868.0, "step": 10519 }, { "epoch": 1.3382521307721664, "ewc_loss": 0.027935149148106575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.793514977383893e-05, "grad_norm": 17.0841064453125, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8507940769195557, "num_tokens": 401510459.0, "step": 10520 }, { "epoch": 1.338379341050757, "ewc_loss": 0.028091059997677803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8091060812585056e-05, "grad_norm": 17.021663665771484, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8398218154907227, "num_tokens": 401545283.0, "step": 10521 }, { "epoch": 1.3385065513293475, "ewc_loss": 0.02796284481883049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7962843887507915e-05, "grad_norm": 16.956613540649414, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8698222637176514, "num_tokens": 401577166.0, "step": 10522 }, { "epoch": 1.3386337616079378, "ewc_loss": 0.02808529883623123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8085298254154623e-05, "grad_norm": 17.00923728942871, "learning_rate": 1e-06, "loss": 0.3659, "mean_token_accuracy": 0.8810747265815735, "num_tokens": 401619075.0, "step": 10523 }, { "epoch": 1.3387609718865283, "ewc_loss": 0.02807333692908287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8073336579836905e-05, "grad_norm": 16.979318618774414, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8612414598464966, "num_tokens": 401661286.0, "step": 10524 }, { "epoch": 1.3388881821651188, "ewc_loss": 0.028110383078455925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8110383937018923e-05, "grad_norm": 17.04871368408203, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8590371608734131, "num_tokens": 401699672.0, "step": 10525 }, { "epoch": 1.3390153924437094, "ewc_loss": 0.02810298092663288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8102980650146492e-05, "grad_norm": 17.033267974853516, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.862470269203186, "num_tokens": 401732676.0, "step": 10526 }, { "epoch": 1.3391426027223, "ewc_loss": 0.028100596740841866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8100595955038443e-05, "grad_norm": 17.04802894592285, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8659558296203613, "num_tokens": 401771593.0, "step": 10527 }, { "epoch": 1.3392698130008904, "ewc_loss": 0.028111150488257408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8111149731557816e-05, "grad_norm": 17.017589569091797, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8547797799110413, "num_tokens": 401809605.0, "step": 10528 }, { "epoch": 1.339397023279481, "ewc_loss": 0.02811211533844471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.81121156149311e-05, "grad_norm": 17.059123992919922, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8583030104637146, "num_tokens": 401845349.0, "step": 10529 }, { "epoch": 1.3395242335580715, "ewc_loss": 0.0281020849943161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8102085707359947e-05, "grad_norm": 17.0463809967041, "learning_rate": 1e-06, "loss": 0.3323, "mean_token_accuracy": 0.890792727470398, "num_tokens": 401883186.0, "step": 10530 }, { "epoch": 1.339651443836662, "ewc_loss": 0.028115831315517426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8115831810282543e-05, "grad_norm": 17.103818893432617, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8521676063537598, "num_tokens": 401921149.0, "step": 10531 }, { "epoch": 1.3397786541152525, "ewc_loss": 0.028145985677838326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8145985197625123e-05, "grad_norm": 17.047658920288086, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8620333075523376, "num_tokens": 401961813.0, "step": 10532 }, { "epoch": 1.339905864393843, "ewc_loss": 0.028135327622294426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8135327738709748e-05, "grad_norm": 17.021577835083008, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8508235812187195, "num_tokens": 402001212.0, "step": 10533 }, { "epoch": 1.3400330746724336, "ewc_loss": 0.0281386561691761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8138656489318237e-05, "grad_norm": 17.082101821899414, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8609676361083984, "num_tokens": 402044648.0, "step": 10534 }, { "epoch": 1.3401602849510241, "ewc_loss": 0.02809346653521061, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8093467335565947e-05, "grad_norm": 17.07720375061035, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8710846900939941, "num_tokens": 402087391.0, "step": 10535 }, { "epoch": 1.3402874952296147, "ewc_loss": 0.028093047440052032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8093047149013728e-05, "grad_norm": 17.056650161743164, "learning_rate": 1e-06, "loss": 0.3515, "mean_token_accuracy": 0.8875249624252319, "num_tokens": 402119598.0, "step": 10536 }, { "epoch": 1.3404147055082052, "ewc_loss": 0.028083890676498413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8083890356356278e-05, "grad_norm": 17.10144805908203, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8425514698028564, "num_tokens": 402153628.0, "step": 10537 }, { "epoch": 1.3405419157867955, "ewc_loss": 0.028081277385354042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8081278287572786e-05, "grad_norm": 17.000261306762695, "learning_rate": 1e-06, "loss": 0.3583, "mean_token_accuracy": 0.8847576975822449, "num_tokens": 402190876.0, "step": 10538 }, { "epoch": 1.340669126065386, "ewc_loss": 0.028037022799253464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8037022275384516e-05, "grad_norm": 17.087047576904297, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.876318633556366, "num_tokens": 402227168.0, "step": 10539 }, { "epoch": 1.3407963363439765, "ewc_loss": 0.028098804876208305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.809880425047595e-05, "grad_norm": 17.059024810791016, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8765958547592163, "num_tokens": 402262731.0, "step": 10540 }, { "epoch": 1.340923546622567, "ewc_loss": 0.02802978828549385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8029788154526614e-05, "grad_norm": 16.992698669433594, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8690572381019592, "num_tokens": 402299992.0, "step": 10541 }, { "epoch": 1.3410507569011576, "ewc_loss": 0.028076594695448875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8076594389858656e-05, "grad_norm": 17.106210708618164, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8608804941177368, "num_tokens": 402339129.0, "step": 10542 }, { "epoch": 1.3411779671797481, "ewc_loss": 0.02808983251452446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8089832994737662e-05, "grad_norm": 17.076263427734375, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.851283073425293, "num_tokens": 402375380.0, "step": 10543 }, { "epoch": 1.3413051774583387, "ewc_loss": 0.028048304840922356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.804830546665471e-05, "grad_norm": 17.029138565063477, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8561850190162659, "num_tokens": 402412159.0, "step": 10544 }, { "epoch": 1.3414323877369292, "ewc_loss": 0.028091810643672943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8091810236219317e-05, "grad_norm": 17.183801651000977, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8641785383224487, "num_tokens": 402446676.0, "step": 10545 }, { "epoch": 1.3415595980155197, "ewc_loss": 0.0281129889190197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.81129887298448e-05, "grad_norm": 17.05210304260254, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8684952855110168, "num_tokens": 402490741.0, "step": 10546 }, { "epoch": 1.3416868082941102, "ewc_loss": 0.028006993234157562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8006992579321377e-05, "grad_norm": 17.0844669342041, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.8766480088233948, "num_tokens": 402527398.0, "step": 10547 }, { "epoch": 1.3418140185727006, "ewc_loss": 0.028090769425034523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.809076977428049e-05, "grad_norm": 17.076242446899414, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8673524260520935, "num_tokens": 402565914.0, "step": 10548 }, { "epoch": 1.341941228851291, "ewc_loss": 0.028068335726857185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8068336177966557e-05, "grad_norm": 17.094539642333984, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8706867098808289, "num_tokens": 402600671.0, "step": 10549 }, { "epoch": 1.3420684391298816, "ewc_loss": 0.02807990461587906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.807990495057311e-05, "grad_norm": 16.982585906982422, "learning_rate": 1e-06, "loss": 0.3737, "mean_token_accuracy": 0.8801838755607605, "num_tokens": 402637381.0, "step": 10550 }, { "epoch": 1.3421956494084721, "ewc_loss": 0.02807537093758583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.807537020999007e-05, "grad_norm": 17.1070613861084, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8716502785682678, "num_tokens": 402671190.0, "step": 10551 }, { "epoch": 1.3423228596870627, "ewc_loss": 0.028090700507164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8090700652683154e-05, "grad_norm": 16.964412689208984, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8676344156265259, "num_tokens": 402721518.0, "step": 10552 }, { "epoch": 1.3424500699656532, "ewc_loss": 0.02807442471385002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8074424335500225e-05, "grad_norm": 17.16633415222168, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8686386346817017, "num_tokens": 402758949.0, "step": 10553 }, { "epoch": 1.3425772802442437, "ewc_loss": 0.028132151812314987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8132151783211157e-05, "grad_norm": 17.013477325439453, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8622500896453857, "num_tokens": 402794342.0, "step": 10554 }, { "epoch": 1.3427044905228342, "ewc_loss": 0.027967369183897972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7967369533143938e-05, "grad_norm": 17.03528594970703, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.879277229309082, "num_tokens": 402831031.0, "step": 10555 }, { "epoch": 1.3428317008014248, "ewc_loss": 0.028115399181842804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.81153988908045e-05, "grad_norm": 17.034835815429688, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8779332637786865, "num_tokens": 402871486.0, "step": 10556 }, { "epoch": 1.3429589110800153, "ewc_loss": 0.028050608932971954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.80506083072396e-05, "grad_norm": 17.050800323486328, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.8803104758262634, "num_tokens": 402916024.0, "step": 10557 }, { "epoch": 1.3430861213586058, "ewc_loss": 0.02808530256152153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.808530189213343e-05, "grad_norm": 17.153812408447266, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.850407063961029, "num_tokens": 402952116.0, "step": 10558 }, { "epoch": 1.3432133316371964, "ewc_loss": 0.028050588443875313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.805058829835616e-05, "grad_norm": 16.93559455871582, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8665801882743835, "num_tokens": 402998064.0, "step": 10559 }, { "epoch": 1.343340541915787, "ewc_loss": 0.02798708714544773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7987087378278375e-05, "grad_norm": 17.07379913330078, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8726369142532349, "num_tokens": 403041763.0, "step": 10560 }, { "epoch": 1.3434677521943774, "ewc_loss": 0.02811143919825554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.811143895087298e-05, "grad_norm": 16.9848690032959, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8798056840896606, "num_tokens": 403080956.0, "step": 10561 }, { "epoch": 1.343594962472968, "ewc_loss": 0.028012637048959732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.801263690344058e-05, "grad_norm": 17.10936164855957, "learning_rate": 1e-06, "loss": 0.3852, "mean_token_accuracy": 0.8772836923599243, "num_tokens": 403117200.0, "step": 10562 }, { "epoch": 1.3437221727515583, "ewc_loss": 0.02813459187746048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8134591048001312e-05, "grad_norm": 17.116558074951172, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8679991364479065, "num_tokens": 403154112.0, "step": 10563 }, { "epoch": 1.3438493830301488, "ewc_loss": 0.02797914482653141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7979145670542493e-05, "grad_norm": 17.018619537353516, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8662030696868896, "num_tokens": 403188265.0, "step": 10564 }, { "epoch": 1.3439765933087393, "ewc_loss": 0.028018701821565628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8018701414112e-05, "grad_norm": 17.07779312133789, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8469042778015137, "num_tokens": 403231236.0, "step": 10565 }, { "epoch": 1.3441038035873298, "ewc_loss": 0.02802576869726181, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8025768187944777e-05, "grad_norm": 17.002132415771484, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8700418472290039, "num_tokens": 403268642.0, "step": 10566 }, { "epoch": 1.3442310138659204, "ewc_loss": 0.027941599488258362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7941599910263903e-05, "grad_norm": 17.043272018432617, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.8755029439926147, "num_tokens": 403307679.0, "step": 10567 }, { "epoch": 1.344358224144511, "ewc_loss": 0.027993982657790184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7993983167107217e-05, "grad_norm": 17.021987915039062, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8730253577232361, "num_tokens": 403346395.0, "step": 10568 }, { "epoch": 1.3444854344231014, "ewc_loss": 0.02799679897725582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7996798962703906e-05, "grad_norm": 17.042949676513672, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8771594762802124, "num_tokens": 403383001.0, "step": 10569 }, { "epoch": 1.344612644701692, "ewc_loss": 0.027983494102954865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.798349487420637e-05, "grad_norm": 17.027042388916016, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.86036217212677, "num_tokens": 403426090.0, "step": 10570 }, { "epoch": 1.3447398549802825, "ewc_loss": 0.02802208438515663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8022084734402597e-05, "grad_norm": 17.039100646972656, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8614206910133362, "num_tokens": 403470231.0, "step": 10571 }, { "epoch": 1.3448670652588728, "ewc_loss": 0.028026465326547623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8026464860886335e-05, "grad_norm": 17.09123992919922, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8733876943588257, "num_tokens": 403508677.0, "step": 10572 }, { "epoch": 1.3449942755374633, "ewc_loss": 0.02798309363424778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7983092877548188e-05, "grad_norm": 17.0565185546875, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.849402904510498, "num_tokens": 403545175.0, "step": 10573 }, { "epoch": 1.3451214858160538, "ewc_loss": 0.02797200158238411, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7972000680165365e-05, "grad_norm": 17.046157836914062, "learning_rate": 1e-06, "loss": 0.3652, "mean_token_accuracy": 0.8822433352470398, "num_tokens": 403583321.0, "step": 10574 }, { "epoch": 1.3452486960946444, "ewc_loss": 0.02798989973962307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7989899535896257e-05, "grad_norm": 17.094566345214844, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8615937232971191, "num_tokens": 403618547.0, "step": 10575 }, { "epoch": 1.345375906373235, "ewc_loss": 0.027992548421025276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.799254798446782e-05, "grad_norm": 17.05084991455078, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8717862963676453, "num_tokens": 403657552.0, "step": 10576 }, { "epoch": 1.3455031166518254, "ewc_loss": 0.02799631468951702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7996315111522563e-05, "grad_norm": 17.136272430419922, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8708085417747498, "num_tokens": 403697021.0, "step": 10577 }, { "epoch": 1.345630326930416, "ewc_loss": 0.027989987283945084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7989986847387627e-05, "grad_norm": 17.085168838500977, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8586919903755188, "num_tokens": 403736480.0, "step": 10578 }, { "epoch": 1.3457575372090065, "ewc_loss": 0.027958378195762634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.795837826852221e-05, "grad_norm": 16.962703704833984, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8751492500305176, "num_tokens": 403772056.0, "step": 10579 }, { "epoch": 1.345884747487597, "ewc_loss": 0.027988767251372337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.798876812448725e-05, "grad_norm": 17.15138053894043, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8624727725982666, "num_tokens": 403813646.0, "step": 10580 }, { "epoch": 1.3460119577661875, "ewc_loss": 0.028013013303279877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8013013434247114e-05, "grad_norm": 16.951885223388672, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8715011477470398, "num_tokens": 403853904.0, "step": 10581 }, { "epoch": 1.346139168044778, "ewc_loss": 0.02788548171520233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.788548226817511e-05, "grad_norm": 16.984455108642578, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8526256084442139, "num_tokens": 403898528.0, "step": 10582 }, { "epoch": 1.3462663783233686, "ewc_loss": 0.02808264084160328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8082640710636042e-05, "grad_norm": 17.183021545410156, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8535333871841431, "num_tokens": 403929518.0, "step": 10583 }, { "epoch": 1.3463935886019591, "ewc_loss": 0.028027798980474472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8027798180119134e-05, "grad_norm": 16.988903045654297, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8700850605964661, "num_tokens": 403970323.0, "step": 10584 }, { "epoch": 1.3465207988805497, "ewc_loss": 0.02793772704899311, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7937727281823754e-05, "grad_norm": 17.057443618774414, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8716966509819031, "num_tokens": 404012499.0, "step": 10585 }, { "epoch": 1.3466480091591402, "ewc_loss": 0.028047727420926094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8047727028024383e-05, "grad_norm": 17.079248428344727, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8630892634391785, "num_tokens": 404055370.0, "step": 10586 }, { "epoch": 1.3467752194377305, "ewc_loss": 0.02798689343035221, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7986892746412195e-05, "grad_norm": 16.95932960510254, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8675636649131775, "num_tokens": 404092498.0, "step": 10587 }, { "epoch": 1.346902429716321, "ewc_loss": 0.027973119169473648, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7973119358648546e-05, "grad_norm": 17.013565063476562, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8746914267539978, "num_tokens": 404132466.0, "step": 10588 }, { "epoch": 1.3470296399949115, "ewc_loss": 0.02806021086871624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8060210752300918e-05, "grad_norm": 17.02379035949707, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8679233193397522, "num_tokens": 404171913.0, "step": 10589 }, { "epoch": 1.347156850273502, "ewc_loss": 0.02807222492992878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.807222517731134e-05, "grad_norm": 17.13521385192871, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.881731390953064, "num_tokens": 404205817.0, "step": 10590 }, { "epoch": 1.3472840605520926, "ewc_loss": 0.02807411551475525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8074115107301623e-05, "grad_norm": 17.08013916015625, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8719609975814819, "num_tokens": 404243922.0, "step": 10591 }, { "epoch": 1.3474112708306831, "ewc_loss": 0.02801911160349846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.80191106867278e-05, "grad_norm": 17.092439651489258, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8711835145950317, "num_tokens": 404283392.0, "step": 10592 }, { "epoch": 1.3475384811092737, "ewc_loss": 0.028100082650780678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8100083000026643e-05, "grad_norm": 17.087392807006836, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8426268100738525, "num_tokens": 404323044.0, "step": 10593 }, { "epoch": 1.3476656913878642, "ewc_loss": 0.028069106861948967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.806910742947366e-05, "grad_norm": 17.08271026611328, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8746631145477295, "num_tokens": 404359761.0, "step": 10594 }, { "epoch": 1.3477929016664547, "ewc_loss": 0.028034208342432976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.803420829877723e-05, "grad_norm": 17.079063415527344, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8559021949768066, "num_tokens": 404395354.0, "step": 10595 }, { "epoch": 1.3479201119450452, "ewc_loss": 0.028103210031986237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.810320984281134e-05, "grad_norm": 17.148778915405273, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8513795137405396, "num_tokens": 404432052.0, "step": 10596 }, { "epoch": 1.3480473222236355, "ewc_loss": 0.02805987186729908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.805987242027186e-05, "grad_norm": 17.083698272705078, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8603535890579224, "num_tokens": 404472689.0, "step": 10597 }, { "epoch": 1.348174532502226, "ewc_loss": 0.028051814064383507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.805181429721415e-05, "grad_norm": 17.090335845947266, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8529294729232788, "num_tokens": 404515916.0, "step": 10598 }, { "epoch": 1.3483017427808166, "ewc_loss": 0.028076820075511932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8076819944544695e-05, "grad_norm": 17.056678771972656, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8675861954689026, "num_tokens": 404557399.0, "step": 10599 }, { "epoch": 1.3484289530594071, "ewc_loss": 0.028089364990592003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.808936551446095e-05, "grad_norm": 17.132749557495117, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8644402027130127, "num_tokens": 404592573.0, "step": 10600 }, { "epoch": 1.3485561633379977, "ewc_loss": 0.028076687827706337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8076687158318236e-05, "grad_norm": 17.084531784057617, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8650200366973877, "num_tokens": 404635266.0, "step": 10601 }, { "epoch": 1.3486833736165882, "ewc_loss": 0.02804410643875599, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8044107239111327e-05, "grad_norm": 17.095382690429688, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8766964077949524, "num_tokens": 404673004.0, "step": 10602 }, { "epoch": 1.3488105838951787, "ewc_loss": 0.028115354478359222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8115355235058814e-05, "grad_norm": 17.10531234741211, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8529757261276245, "num_tokens": 404705153.0, "step": 10603 }, { "epoch": 1.3489377941737692, "ewc_loss": 0.028059236705303192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.805923759297002e-05, "grad_norm": 17.115489959716797, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8592385649681091, "num_tokens": 404737068.0, "step": 10604 }, { "epoch": 1.3490650044523598, "ewc_loss": 0.028144346550107002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8144346288172528e-05, "grad_norm": 17.104328155517578, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8462182283401489, "num_tokens": 404773680.0, "step": 10605 }, { "epoch": 1.3491922147309503, "ewc_loss": 0.028118764981627464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8118765840190463e-05, "grad_norm": 17.057939529418945, "learning_rate": 1e-06, "loss": 0.3798, "mean_token_accuracy": 0.8738961219787598, "num_tokens": 404809937.0, "step": 10606 }, { "epoch": 1.3493194250095408, "ewc_loss": 0.02810709737241268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8107097023166716e-05, "grad_norm": 17.13789176940918, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8586705327033997, "num_tokens": 404847146.0, "step": 10607 }, { "epoch": 1.3494466352881314, "ewc_loss": 0.028147408738732338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.81474094663281e-05, "grad_norm": 17.031070709228516, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8686574697494507, "num_tokens": 404882952.0, "step": 10608 }, { "epoch": 1.3495738455667219, "ewc_loss": 0.028093652799725533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.809365287248511e-05, "grad_norm": 17.095457077026367, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8761857151985168, "num_tokens": 404925347.0, "step": 10609 }, { "epoch": 1.3497010558453124, "ewc_loss": 0.028144780546426773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8144781026639976e-05, "grad_norm": 17.06968116760254, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8526694774627686, "num_tokens": 404963411.0, "step": 10610 }, { "epoch": 1.349828266123903, "ewc_loss": 0.028119413182139397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8119413400418125e-05, "grad_norm": 17.08854103088379, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8651724457740784, "num_tokens": 404997522.0, "step": 10611 }, { "epoch": 1.3499554764024932, "ewc_loss": 0.028173793107271194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8173793907626532e-05, "grad_norm": 17.135753631591797, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8652271628379822, "num_tokens": 405034049.0, "step": 10612 }, { "epoch": 1.3500826866810838, "ewc_loss": 0.028164617717266083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8164617106085643e-05, "grad_norm": 17.086191177368164, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8649542331695557, "num_tokens": 405074985.0, "step": 10613 }, { "epoch": 1.3502098969596743, "ewc_loss": 0.028140805661678314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8140806534793228e-05, "grad_norm": 17.112123489379883, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.8821022510528564, "num_tokens": 405116133.0, "step": 10614 }, { "epoch": 1.3503371072382648, "ewc_loss": 0.028139598667621613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8139598725829273e-05, "grad_norm": 17.122589111328125, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8660112619400024, "num_tokens": 405155572.0, "step": 10615 }, { "epoch": 1.3504643175168554, "ewc_loss": 0.028171436861157417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8171436497359537e-05, "grad_norm": 17.190547943115234, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8692284822463989, "num_tokens": 405194700.0, "step": 10616 }, { "epoch": 1.350591527795446, "ewc_loss": 0.02809309773147106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8093098080717027e-05, "grad_norm": 17.13442611694336, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8634243011474609, "num_tokens": 405233582.0, "step": 10617 }, { "epoch": 1.3507187380740364, "ewc_loss": 0.02806025743484497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.806025804602541e-05, "grad_norm": 17.123878479003906, "learning_rate": 1e-06, "loss": 0.3731, "mean_token_accuracy": 0.880281925201416, "num_tokens": 405274987.0, "step": 10618 }, { "epoch": 1.350845948352627, "ewc_loss": 0.028040610253810883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.804061114147771e-05, "grad_norm": 17.093013763427734, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8651201128959656, "num_tokens": 405308151.0, "step": 10619 }, { "epoch": 1.3509731586312175, "ewc_loss": 0.02803804725408554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8038048185408115e-05, "grad_norm": 17.11095428466797, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8678703904151917, "num_tokens": 405343939.0, "step": 10620 }, { "epoch": 1.3511003689098078, "ewc_loss": 0.028115879744291306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8115879104007035e-05, "grad_norm": 17.13315200805664, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8663671016693115, "num_tokens": 405384211.0, "step": 10621 }, { "epoch": 1.3512275791883983, "ewc_loss": 0.028050418943166733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.805041913234163e-05, "grad_norm": 17.070850372314453, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8749327063560486, "num_tokens": 405419971.0, "step": 10622 }, { "epoch": 1.3513547894669888, "ewc_loss": 0.028012612834572792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8012613256578334e-05, "grad_norm": 17.07471466064453, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8610172867774963, "num_tokens": 405451322.0, "step": 10623 }, { "epoch": 1.3514819997455794, "ewc_loss": 0.02808435633778572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8084356017643586e-05, "grad_norm": 17.04483413696289, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8596941828727722, "num_tokens": 405496101.0, "step": 10624 }, { "epoch": 1.35160921002417, "ewc_loss": 0.028065256774425507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8065256628906354e-05, "grad_norm": 17.093196868896484, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8510944843292236, "num_tokens": 405530034.0, "step": 10625 }, { "epoch": 1.3517364203027604, "ewc_loss": 0.028148123994469643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8148124329163693e-05, "grad_norm": 17.086000442504883, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.8792983293533325, "num_tokens": 405570893.0, "step": 10626 }, { "epoch": 1.351863630581351, "ewc_loss": 0.028088221326470375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.808822137012612e-05, "grad_norm": 17.058420181274414, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8542271852493286, "num_tokens": 405610120.0, "step": 10627 }, { "epoch": 1.3519908408599415, "ewc_loss": 0.02811952494084835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.811952435877174e-05, "grad_norm": 17.122966766357422, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8668742179870605, "num_tokens": 405651878.0, "step": 10628 }, { "epoch": 1.352118051138532, "ewc_loss": 0.028157148510217667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8157148335594684e-05, "grad_norm": 17.121761322021484, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.872602105140686, "num_tokens": 405694578.0, "step": 10629 }, { "epoch": 1.3522452614171225, "ewc_loss": 0.02810426615178585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.81042666756548e-05, "grad_norm": 17.077091217041016, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8738958835601807, "num_tokens": 405734128.0, "step": 10630 }, { "epoch": 1.352372471695713, "ewc_loss": 0.02810623124241829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8106231184210628e-05, "grad_norm": 17.13848114013672, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8730028867721558, "num_tokens": 405776588.0, "step": 10631 }, { "epoch": 1.3524996819743036, "ewc_loss": 0.028128501027822495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.812850107147824e-05, "grad_norm": 17.13939666748047, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8633654117584229, "num_tokens": 405810867.0, "step": 10632 }, { "epoch": 1.3526268922528941, "ewc_loss": 0.028100986033678055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8100985218770802e-05, "grad_norm": 17.06544303894043, "learning_rate": 1e-06, "loss": 0.3539, "mean_token_accuracy": 0.8849982023239136, "num_tokens": 405848884.0, "step": 10633 }, { "epoch": 1.3527541025314846, "ewc_loss": 0.02805837243795395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8058371754013933e-05, "grad_norm": 17.116907119750977, "learning_rate": 1e-06, "loss": 0.3499, "mean_token_accuracy": 0.887010931968689, "num_tokens": 405888759.0, "step": 10634 }, { "epoch": 1.3528813128100752, "ewc_loss": 0.028193630278110504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.81936299870722e-05, "grad_norm": 17.185344696044922, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.8809839487075806, "num_tokens": 405925686.0, "step": 10635 }, { "epoch": 1.3530085230886655, "ewc_loss": 0.028094008564949036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8094009394408204e-05, "grad_norm": 17.068910598754883, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8800442218780518, "num_tokens": 405960836.0, "step": 10636 }, { "epoch": 1.353135733367256, "ewc_loss": 0.02803988568484783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8039885364705697e-05, "grad_norm": 17.077533721923828, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.8786558508872986, "num_tokens": 406000314.0, "step": 10637 }, { "epoch": 1.3532629436458465, "ewc_loss": 0.028101297095417976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.810129626595881e-05, "grad_norm": 17.14011001586914, "learning_rate": 1e-06, "loss": 0.38, "mean_token_accuracy": 0.8776896595954895, "num_tokens": 406039468.0, "step": 10638 }, { "epoch": 1.353390153924437, "ewc_loss": 0.028086140751838684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8086140446248464e-05, "grad_norm": 17.089937210083008, "learning_rate": 1e-06, "loss": 0.3422, "mean_token_accuracy": 0.8916161060333252, "num_tokens": 406076828.0, "step": 10639 }, { "epoch": 1.3535173642030276, "ewc_loss": 0.028027478605508804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.802747803798411e-05, "grad_norm": 17.129322052001953, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8667713403701782, "num_tokens": 406116504.0, "step": 10640 }, { "epoch": 1.3536445744816181, "ewc_loss": 0.028039835393428802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.80398362519918e-05, "grad_norm": 17.05322265625, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8672729134559631, "num_tokens": 406155184.0, "step": 10641 }, { "epoch": 1.3537717847602087, "ewc_loss": 0.02800045907497406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.800045876938384e-05, "grad_norm": 17.113313674926758, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8490951657295227, "num_tokens": 406195411.0, "step": 10642 }, { "epoch": 1.3538989950387992, "ewc_loss": 0.028138423338532448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8138423658674583e-05, "grad_norm": 17.137619018554688, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8586866855621338, "num_tokens": 406230159.0, "step": 10643 }, { "epoch": 1.3540262053173897, "ewc_loss": 0.02800377458333969, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8003774787066504e-05, "grad_norm": 17.06769371032715, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8571710586547852, "num_tokens": 406264541.0, "step": 10644 }, { "epoch": 1.3541534155959802, "ewc_loss": 0.02806149609386921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8061496777809225e-05, "grad_norm": 17.08088493347168, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8739791512489319, "num_tokens": 406301869.0, "step": 10645 }, { "epoch": 1.3542806258745705, "ewc_loss": 0.028071952983736992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8071952328900807e-05, "grad_norm": 17.079753875732422, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8693017363548279, "num_tokens": 406338819.0, "step": 10646 }, { "epoch": 1.354407836153161, "ewc_loss": 0.02806844748556614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8068447136320174e-05, "grad_norm": 17.074687957763672, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8694040775299072, "num_tokens": 406375914.0, "step": 10647 }, { "epoch": 1.3545350464317516, "ewc_loss": 0.02800704725086689, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8007047149003483e-05, "grad_norm": 17.0299015045166, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8728379607200623, "num_tokens": 406418543.0, "step": 10648 }, { "epoch": 1.3546622567103421, "ewc_loss": 0.028078870847821236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.807886994560249e-05, "grad_norm": 17.06392478942871, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8663456439971924, "num_tokens": 406455731.0, "step": 10649 }, { "epoch": 1.3547894669889327, "ewc_loss": 0.0281250961124897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8125095923314802e-05, "grad_norm": 17.1015625, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8675286769866943, "num_tokens": 406496266.0, "step": 10650 }, { "epoch": 1.3549166772675232, "ewc_loss": 0.02806757763028145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.806757765938528e-05, "grad_norm": 17.034759521484375, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8632329106330872, "num_tokens": 406541676.0, "step": 10651 }, { "epoch": 1.3550438875461137, "ewc_loss": 0.028064759448170662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8064760044799186e-05, "grad_norm": 17.074583053588867, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8647683262825012, "num_tokens": 406578997.0, "step": 10652 }, { "epoch": 1.3551710978247042, "ewc_loss": 0.02810097113251686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8100970666855574e-05, "grad_norm": 17.02173614501953, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.872394323348999, "num_tokens": 406619404.0, "step": 10653 }, { "epoch": 1.3552983081032948, "ewc_loss": 0.028130006045103073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8130005375714973e-05, "grad_norm": 17.163227081298828, "learning_rate": 1e-06, "loss": 0.3505, "mean_token_accuracy": 0.884901762008667, "num_tokens": 406656092.0, "step": 10654 }, { "epoch": 1.3554255183818853, "ewc_loss": 0.028122859075665474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.812285856634844e-05, "grad_norm": 17.05181312561035, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8518410921096802, "num_tokens": 406701600.0, "step": 10655 }, { "epoch": 1.3555527286604758, "ewc_loss": 0.028067946434020996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.80679469142342e-05, "grad_norm": 17.12842559814453, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8671436309814453, "num_tokens": 406739649.0, "step": 10656 }, { "epoch": 1.3556799389390664, "ewc_loss": 0.028096720576286316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8096719688619487e-05, "grad_norm": 17.057758331298828, "learning_rate": 1e-06, "loss": 0.3482, "mean_token_accuracy": 0.8867169618606567, "num_tokens": 406774962.0, "step": 10657 }, { "epoch": 1.3558071492176569, "ewc_loss": 0.028090961277484894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.809096076816786e-05, "grad_norm": 17.17470359802246, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8655833601951599, "num_tokens": 406818047.0, "step": 10658 }, { "epoch": 1.3559343594962474, "ewc_loss": 0.02810983546078205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8109834602219053e-05, "grad_norm": 17.073522567749023, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8612030744552612, "num_tokens": 406859464.0, "step": 10659 }, { "epoch": 1.356061569774838, "ewc_loss": 0.028047991916537285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.80479926004773e-05, "grad_norm": 17.15348243713379, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8660147190093994, "num_tokens": 406896444.0, "step": 10660 }, { "epoch": 1.3561887800534282, "ewc_loss": 0.028106221929192543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.810622208926361e-05, "grad_norm": 17.136184692382812, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8623530268669128, "num_tokens": 406934623.0, "step": 10661 }, { "epoch": 1.3563159903320188, "ewc_loss": 0.02805282361805439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.805282383633312e-05, "grad_norm": 17.11334991455078, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.861548125743866, "num_tokens": 406974290.0, "step": 10662 }, { "epoch": 1.3564432006106093, "ewc_loss": 0.028108257800340652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.810825753840618e-05, "grad_norm": 17.088409423828125, "learning_rate": 1e-06, "loss": 0.3562, "mean_token_accuracy": 0.8865160942077637, "num_tokens": 407011279.0, "step": 10663 }, { "epoch": 1.3565704108891998, "ewc_loss": 0.028060391545295715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.806039083225187e-05, "grad_norm": 17.142709732055664, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8574459552764893, "num_tokens": 407054733.0, "step": 10664 }, { "epoch": 1.3566976211677904, "ewc_loss": 0.028050366789102554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8050366381648928e-05, "grad_norm": 17.114011764526367, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8685371279716492, "num_tokens": 407098410.0, "step": 10665 }, { "epoch": 1.3568248314463809, "ewc_loss": 0.028050629422068596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8050630135112442e-05, "grad_norm": 17.14438247680664, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8618850708007812, "num_tokens": 407134231.0, "step": 10666 }, { "epoch": 1.3569520417249714, "ewc_loss": 0.028102055191993713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8102054784540087e-05, "grad_norm": 17.09221839904785, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8538199067115784, "num_tokens": 407167231.0, "step": 10667 }, { "epoch": 1.357079252003562, "ewc_loss": 0.02803165651857853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8031656256644055e-05, "grad_norm": 17.189620971679688, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8709380626678467, "num_tokens": 407206604.0, "step": 10668 }, { "epoch": 1.3572064622821525, "ewc_loss": 0.028063999488949776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8063999707228504e-05, "grad_norm": 17.08195686340332, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8704186677932739, "num_tokens": 407248247.0, "step": 10669 }, { "epoch": 1.3573336725607428, "ewc_loss": 0.028048565611243248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8048565582139418e-05, "grad_norm": 17.1954288482666, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8778042197227478, "num_tokens": 407282870.0, "step": 10670 }, { "epoch": 1.3574608828393333, "ewc_loss": 0.02806522697210312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8065227525075898e-05, "grad_norm": 17.070165634155273, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8741358518600464, "num_tokens": 407318704.0, "step": 10671 }, { "epoch": 1.3575880931179238, "ewc_loss": 0.028010476380586624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8010475944029167e-05, "grad_norm": 17.161792755126953, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8701539039611816, "num_tokens": 407360003.0, "step": 10672 }, { "epoch": 1.3577153033965144, "ewc_loss": 0.028043536469340324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8043536076438613e-05, "grad_norm": 16.9921932220459, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8623418211936951, "num_tokens": 407399473.0, "step": 10673 }, { "epoch": 1.357842513675105, "ewc_loss": 0.02806137315928936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8061373086529784e-05, "grad_norm": 17.07511329650879, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8378328084945679, "num_tokens": 407440576.0, "step": 10674 }, { "epoch": 1.3579697239536954, "ewc_loss": 0.028116542845964432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.811654303513933e-05, "grad_norm": 17.12358856201172, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8648372292518616, "num_tokens": 407481057.0, "step": 10675 }, { "epoch": 1.358096934232286, "ewc_loss": 0.028165549039840698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.816554842866026e-05, "grad_norm": 17.13471794128418, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8802427053451538, "num_tokens": 407517104.0, "step": 10676 }, { "epoch": 1.3582241445108765, "ewc_loss": 0.028109150007367134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.810915066220332e-05, "grad_norm": 17.102245330810547, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.875198483467102, "num_tokens": 407552604.0, "step": 10677 }, { "epoch": 1.358351354789467, "ewc_loss": 0.028070421889424324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8070422558812425e-05, "grad_norm": 17.08405113220215, "learning_rate": 1e-06, "loss": 0.3667, "mean_token_accuracy": 0.8811520338058472, "num_tokens": 407588282.0, "step": 10678 }, { "epoch": 1.3584785650680575, "ewc_loss": 0.028098996728658676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8098997063352726e-05, "grad_norm": 17.108604431152344, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8632667064666748, "num_tokens": 407619837.0, "step": 10679 }, { "epoch": 1.358605775346648, "ewc_loss": 0.028115496039390564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8115495297242887e-05, "grad_norm": 17.12523078918457, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8652697205543518, "num_tokens": 407656939.0, "step": 10680 }, { "epoch": 1.3587329856252386, "ewc_loss": 0.028195803984999657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8195803679409437e-05, "grad_norm": 17.166141510009766, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8581481575965881, "num_tokens": 407702756.0, "step": 10681 }, { "epoch": 1.3588601959038291, "ewc_loss": 0.028111154213547707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8111153369536623e-05, "grad_norm": 17.063669204711914, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8766262531280518, "num_tokens": 407736142.0, "step": 10682 }, { "epoch": 1.3589874061824196, "ewc_loss": 0.028118887916207314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.81188877124805e-05, "grad_norm": 17.12590980529785, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8646434545516968, "num_tokens": 407773184.0, "step": 10683 }, { "epoch": 1.3591146164610102, "ewc_loss": 0.02816189080476761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8161890440969728e-05, "grad_norm": 17.134292602539062, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8531370162963867, "num_tokens": 407806543.0, "step": 10684 }, { "epoch": 1.3592418267396005, "ewc_loss": 0.028119616210460663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.811961530824192e-05, "grad_norm": 17.1395263671875, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8671203851699829, "num_tokens": 407841371.0, "step": 10685 }, { "epoch": 1.359369037018191, "ewc_loss": 0.028135480359196663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8135480533819646e-05, "grad_norm": 17.127370834350586, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8712358474731445, "num_tokens": 407879862.0, "step": 10686 }, { "epoch": 1.3594962472967815, "ewc_loss": 0.02811252512037754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8112524887546897e-05, "grad_norm": 17.08281707763672, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8614953756332397, "num_tokens": 407917947.0, "step": 10687 }, { "epoch": 1.359623457575372, "ewc_loss": 0.028100455179810524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.810045589285437e-05, "grad_norm": 17.078563690185547, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8755013942718506, "num_tokens": 407953987.0, "step": 10688 }, { "epoch": 1.3597506678539626, "ewc_loss": 0.02815230004489422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8152300728834234e-05, "grad_norm": 17.158964157104492, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8517659902572632, "num_tokens": 407993166.0, "step": 10689 }, { "epoch": 1.3598778781325531, "ewc_loss": 0.028180450201034546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8180449589854106e-05, "grad_norm": 17.097244262695312, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8762063980102539, "num_tokens": 408037063.0, "step": 10690 }, { "epoch": 1.3600050884111436, "ewc_loss": 0.028118176385760307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8118176487623714e-05, "grad_norm": 17.142229080200195, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8703325986862183, "num_tokens": 408078526.0, "step": 10691 }, { "epoch": 1.3601322986897342, "ewc_loss": 0.028164157643914223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8164156901766546e-05, "grad_norm": 17.106510162353516, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.875041127204895, "num_tokens": 408116002.0, "step": 10692 }, { "epoch": 1.3602595089683247, "ewc_loss": 0.028139298781752586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.813929859257769e-05, "grad_norm": 17.21040153503418, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8637471199035645, "num_tokens": 408155885.0, "step": 10693 }, { "epoch": 1.3603867192469152, "ewc_loss": 0.02816254273056984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.81625434581656e-05, "grad_norm": 17.11774253845215, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.872956395149231, "num_tokens": 408190085.0, "step": 10694 }, { "epoch": 1.3605139295255055, "ewc_loss": 0.028103385120630264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.810338446579408e-05, "grad_norm": 17.191200256347656, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8698385953903198, "num_tokens": 408227289.0, "step": 10695 }, { "epoch": 1.360641139804096, "ewc_loss": 0.028106365352869034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.810636578942649e-05, "grad_norm": 17.073040008544922, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8762364387512207, "num_tokens": 408271853.0, "step": 10696 }, { "epoch": 1.3607683500826866, "ewc_loss": 0.02805348113179207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.80534804915078e-05, "grad_norm": 17.155223846435547, "learning_rate": 1e-06, "loss": 0.3711, "mean_token_accuracy": 0.8803793787956238, "num_tokens": 408311133.0, "step": 10697 }, { "epoch": 1.3608955603612771, "ewc_loss": 0.02807586081326008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8075861337129027e-05, "grad_norm": 17.09356689453125, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8701885938644409, "num_tokens": 408342775.0, "step": 10698 }, { "epoch": 1.3610227706398677, "ewc_loss": 0.028072327375411987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8072327040717937e-05, "grad_norm": 17.17354393005371, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8807856440544128, "num_tokens": 408379063.0, "step": 10699 }, { "epoch": 1.3611499809184582, "ewc_loss": 0.02811598777770996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8115988243371248e-05, "grad_norm": 17.145631790161133, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8562527894973755, "num_tokens": 408415856.0, "step": 10700 }, { "epoch": 1.3612771911970487, "ewc_loss": 0.028074538335204124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.807453893183265e-05, "grad_norm": 17.128267288208008, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8727864027023315, "num_tokens": 408451522.0, "step": 10701 }, { "epoch": 1.3614044014756392, "ewc_loss": 0.028091374784708023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.809137549775187e-05, "grad_norm": 17.09425926208496, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8734276294708252, "num_tokens": 408494605.0, "step": 10702 }, { "epoch": 1.3615316117542298, "ewc_loss": 0.028020089492201805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8020089303026907e-05, "grad_norm": 17.08918571472168, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8670281767845154, "num_tokens": 408536373.0, "step": 10703 }, { "epoch": 1.3616588220328203, "ewc_loss": 0.028060827404260635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.806082738970872e-05, "grad_norm": 17.10184669494629, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8558253049850464, "num_tokens": 408574724.0, "step": 10704 }, { "epoch": 1.3617860323114108, "ewc_loss": 0.02809137850999832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8091379135730676e-05, "grad_norm": 17.171337127685547, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.861047089099884, "num_tokens": 408615057.0, "step": 10705 }, { "epoch": 1.3619132425900013, "ewc_loss": 0.027997305616736412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.7997306460747495e-05, "grad_norm": 17.03693962097168, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8557276725769043, "num_tokens": 408651324.0, "step": 10706 }, { "epoch": 1.3620404528685919, "ewc_loss": 0.028092259541153908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8092259526601993e-05, "grad_norm": 17.163856506347656, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.873956561088562, "num_tokens": 408690533.0, "step": 10707 }, { "epoch": 1.3621676631471824, "ewc_loss": 0.028097089380025864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8097088943468407e-05, "grad_norm": 17.009082794189453, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8611544370651245, "num_tokens": 408725970.0, "step": 10708 }, { "epoch": 1.362294873425773, "ewc_loss": 0.028103521093726158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8103520889999345e-05, "grad_norm": 17.159194946289062, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8764715194702148, "num_tokens": 408766550.0, "step": 10709 }, { "epoch": 1.3624220837043632, "ewc_loss": 0.028119612485170364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.811961167026311e-05, "grad_norm": 17.062986373901367, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8721178770065308, "num_tokens": 408804320.0, "step": 10710 }, { "epoch": 1.3625492939829538, "ewc_loss": 0.028128664940595627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.812866478052456e-05, "grad_norm": 17.111682891845703, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8709186315536499, "num_tokens": 408840902.0, "step": 10711 }, { "epoch": 1.3626765042615443, "ewc_loss": 0.028159521520137787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.815952211676631e-05, "grad_norm": 17.0658016204834, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8667300939559937, "num_tokens": 408879346.0, "step": 10712 }, { "epoch": 1.3628037145401348, "ewc_loss": 0.0281129851937294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8112985091865994e-05, "grad_norm": 17.082687377929688, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8801129460334778, "num_tokens": 408919336.0, "step": 10713 }, { "epoch": 1.3629309248187254, "ewc_loss": 0.028149835765361786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.814983599819243e-05, "grad_norm": 17.089204788208008, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8634625673294067, "num_tokens": 408961547.0, "step": 10714 }, { "epoch": 1.3630581350973159, "ewc_loss": 0.02815195918083191, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8151958758826368e-05, "grad_norm": 17.0831356048584, "learning_rate": 1e-06, "loss": 0.3613, "mean_token_accuracy": 0.8833790421485901, "num_tokens": 409003573.0, "step": 10715 }, { "epoch": 1.3631853453759064, "ewc_loss": 0.028086470440030098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8086469683330506e-05, "grad_norm": 17.067228317260742, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8612627983093262, "num_tokens": 409045560.0, "step": 10716 }, { "epoch": 1.363312555654497, "ewc_loss": 0.028133945539593697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8133945306763053e-05, "grad_norm": 17.111740112304688, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8717703819274902, "num_tokens": 409087245.0, "step": 10717 }, { "epoch": 1.3634397659330875, "ewc_loss": 0.02818709798157215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8187097996124066e-05, "grad_norm": 17.1444034576416, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8766646385192871, "num_tokens": 409134177.0, "step": 10718 }, { "epoch": 1.3635669762116778, "ewc_loss": 0.028115786612033844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8115786335547455e-05, "grad_norm": 17.108177185058594, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8547555208206177, "num_tokens": 409169495.0, "step": 10719 }, { "epoch": 1.3636941864902683, "ewc_loss": 0.028156105428934097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8156106054666452e-05, "grad_norm": 17.178525924682617, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8442537784576416, "num_tokens": 409207781.0, "step": 10720 }, { "epoch": 1.3638213967688588, "ewc_loss": 0.02815009094774723, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8150090656708926e-05, "grad_norm": 17.136110305786133, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8682104349136353, "num_tokens": 409245411.0, "step": 10721 }, { "epoch": 1.3639486070474494, "ewc_loss": 0.0280916728079319, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8091671993024647e-05, "grad_norm": 17.102331161499023, "learning_rate": 1e-06, "loss": 0.3735, "mean_token_accuracy": 0.8757867217063904, "num_tokens": 409281757.0, "step": 10722 }, { "epoch": 1.3640758173260399, "ewc_loss": 0.028129225596785545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.812922502926085e-05, "grad_norm": 17.181394577026367, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8571513295173645, "num_tokens": 409321317.0, "step": 10723 }, { "epoch": 1.3642030276046304, "ewc_loss": 0.028133980929851532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8133981686551124e-05, "grad_norm": 17.128347396850586, "learning_rate": 1e-06, "loss": 0.3811, "mean_token_accuracy": 0.8750361204147339, "num_tokens": 409358919.0, "step": 10724 }, { "epoch": 1.364330237883221, "ewc_loss": 0.02805504761636257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8055048460373655e-05, "grad_norm": 17.11029624938965, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8594101667404175, "num_tokens": 409394355.0, "step": 10725 }, { "epoch": 1.3644574481618115, "ewc_loss": 0.028122983872890472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8122984076617286e-05, "grad_norm": 17.207862854003906, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8776102662086487, "num_tokens": 409433781.0, "step": 10726 }, { "epoch": 1.364584658440402, "ewc_loss": 0.02810254506766796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8102545911679044e-05, "grad_norm": 17.07394790649414, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8587528467178345, "num_tokens": 409471728.0, "step": 10727 }, { "epoch": 1.3647118687189925, "ewc_loss": 0.028067832812666893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8067832317901775e-05, "grad_norm": 17.162094116210938, "learning_rate": 1e-06, "loss": 0.3487, "mean_token_accuracy": 0.8859093189239502, "num_tokens": 409502294.0, "step": 10728 }, { "epoch": 1.364839078997583, "ewc_loss": 0.028201187029480934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8201187888043933e-05, "grad_norm": 17.125093460083008, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8700203895568848, "num_tokens": 409545878.0, "step": 10729 }, { "epoch": 1.3649662892761736, "ewc_loss": 0.028058070689439774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8058069801772945e-05, "grad_norm": 17.05307388305664, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8707962036132812, "num_tokens": 409584753.0, "step": 10730 }, { "epoch": 1.3650934995547641, "ewc_loss": 0.028184853494167328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.818485336320009e-05, "grad_norm": 17.165557861328125, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8731939792633057, "num_tokens": 409613725.0, "step": 10731 }, { "epoch": 1.3652207098333546, "ewc_loss": 0.02815348096191883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8153481252957135e-05, "grad_norm": 17.15456771850586, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.861139178276062, "num_tokens": 409652443.0, "step": 10732 }, { "epoch": 1.3653479201119452, "ewc_loss": 0.02816636487841606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.816636515490245e-05, "grad_norm": 17.108112335205078, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.874720573425293, "num_tokens": 409690683.0, "step": 10733 }, { "epoch": 1.3654751303905355, "ewc_loss": 0.02813156507909298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.813156424963381e-05, "grad_norm": 17.08151626586914, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8688212633132935, "num_tokens": 409727502.0, "step": 10734 }, { "epoch": 1.365602340669126, "ewc_loss": 0.02819790504872799, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8197904612170532e-05, "grad_norm": 17.094308853149414, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8641185164451599, "num_tokens": 409764592.0, "step": 10735 }, { "epoch": 1.3657295509477165, "ewc_loss": 0.028208067640662193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8208067305968143e-05, "grad_norm": 17.186023712158203, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8470019698143005, "num_tokens": 409809645.0, "step": 10736 }, { "epoch": 1.365856761226307, "ewc_loss": 0.028191855177283287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.819185465341434e-05, "grad_norm": 17.202665328979492, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8680664300918579, "num_tokens": 409848048.0, "step": 10737 }, { "epoch": 1.3659839715048976, "ewc_loss": 0.028145920485258102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8145919714006595e-05, "grad_norm": 17.10247230529785, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8621947765350342, "num_tokens": 409884176.0, "step": 10738 }, { "epoch": 1.3661111817834881, "ewc_loss": 0.028147857636213303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.814785693772137e-05, "grad_norm": 17.1286563873291, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8707431554794312, "num_tokens": 409922630.0, "step": 10739 }, { "epoch": 1.3662383920620786, "ewc_loss": 0.0281498022377491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8149801437393762e-05, "grad_norm": 17.095760345458984, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8742027282714844, "num_tokens": 409961876.0, "step": 10740 }, { "epoch": 1.3663656023406692, "ewc_loss": 0.028208307921886444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.820830741256941e-05, "grad_norm": 17.113691329956055, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.877025842666626, "num_tokens": 409995432.0, "step": 10741 }, { "epoch": 1.3664928126192597, "ewc_loss": 0.028182025998830795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.818202665366698e-05, "grad_norm": 17.08247184753418, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8663032054901123, "num_tokens": 410034876.0, "step": 10742 }, { "epoch": 1.3666200228978502, "ewc_loss": 0.02816447988152504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8164480681880377e-05, "grad_norm": 17.09236717224121, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8717467188835144, "num_tokens": 410079190.0, "step": 10743 }, { "epoch": 1.3667472331764405, "ewc_loss": 0.02824084646999836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8240847314009443e-05, "grad_norm": 17.11703109741211, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8714218735694885, "num_tokens": 410112475.0, "step": 10744 }, { "epoch": 1.366874443455031, "ewc_loss": 0.028191588819026947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.819158908096142e-05, "grad_norm": 17.088281631469727, "learning_rate": 1e-06, "loss": 0.358, "mean_token_accuracy": 0.8832379579544067, "num_tokens": 410154583.0, "step": 10745 }, { "epoch": 1.3670016537336216, "ewc_loss": 0.028197169303894043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.81971697404515e-05, "grad_norm": 17.071605682373047, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8685065507888794, "num_tokens": 410191631.0, "step": 10746 }, { "epoch": 1.3671288640122121, "ewc_loss": 0.02816391922533512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.816391861415468e-05, "grad_norm": 17.045734405517578, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8465113043785095, "num_tokens": 410232435.0, "step": 10747 }, { "epoch": 1.3672560742908026, "ewc_loss": 0.028211291879415512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.821129237418063e-05, "grad_norm": 17.05259895324707, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.872903048992157, "num_tokens": 410272940.0, "step": 10748 }, { "epoch": 1.3673832845693932, "ewc_loss": 0.028213346377015114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8213346013217233e-05, "grad_norm": 17.156394958496094, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.864121675491333, "num_tokens": 410312807.0, "step": 10749 }, { "epoch": 1.3675104948479837, "ewc_loss": 0.028255533427000046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8255533834453672e-05, "grad_norm": 17.124038696289062, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8746494054794312, "num_tokens": 410352433.0, "step": 10750 }, { "epoch": 1.3676377051265742, "ewc_loss": 0.02816554717719555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8165546609670855e-05, "grad_norm": 17.14314079284668, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8689843416213989, "num_tokens": 410394843.0, "step": 10751 }, { "epoch": 1.3677649154051648, "ewc_loss": 0.028223751112818718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8223750632605515e-05, "grad_norm": 17.10647964477539, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8698140978813171, "num_tokens": 410439353.0, "step": 10752 }, { "epoch": 1.3678921256837553, "ewc_loss": 0.028104552999138832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8104552256991155e-05, "grad_norm": 17.078645706176758, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8467323780059814, "num_tokens": 410485789.0, "step": 10753 }, { "epoch": 1.3680193359623458, "ewc_loss": 0.028239639475941658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.823963950504549e-05, "grad_norm": 17.115436553955078, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8522834181785583, "num_tokens": 410523385.0, "step": 10754 }, { "epoch": 1.3681465462409363, "ewc_loss": 0.02813461422920227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8134614694863558e-05, "grad_norm": 17.073633193969727, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8619910478591919, "num_tokens": 410553340.0, "step": 10755 }, { "epoch": 1.3682737565195269, "ewc_loss": 0.02821248769760132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.821248745021876e-05, "grad_norm": 17.184064865112305, "learning_rate": 1e-06, "loss": 0.379, "mean_token_accuracy": 0.8768869638442993, "num_tokens": 410590041.0, "step": 10756 }, { "epoch": 1.3684009667981174, "ewc_loss": 0.028168819844722748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8168818971607834e-05, "grad_norm": 17.0208740234375, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8701050281524658, "num_tokens": 410627703.0, "step": 10757 }, { "epoch": 1.368528177076708, "ewc_loss": 0.028204865753650665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8204865884617902e-05, "grad_norm": 17.16404914855957, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8603689670562744, "num_tokens": 410662555.0, "step": 10758 }, { "epoch": 1.3686553873552982, "ewc_loss": 0.028196467086672783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.819646761054173e-05, "grad_norm": 17.040075302124023, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8505704402923584, "num_tokens": 410706026.0, "step": 10759 }, { "epoch": 1.3687825976338888, "ewc_loss": 0.028115548193454742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.811554804793559e-05, "grad_norm": 17.13053321838379, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8657145500183105, "num_tokens": 410743275.0, "step": 10760 }, { "epoch": 1.3689098079124793, "ewc_loss": 0.028268881142139435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.826888157869689e-05, "grad_norm": 17.16422462463379, "learning_rate": 1e-06, "loss": 0.3765, "mean_token_accuracy": 0.8770592212677002, "num_tokens": 410781220.0, "step": 10761 }, { "epoch": 1.3690370181910698, "ewc_loss": 0.028168663382530212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.816866253851913e-05, "grad_norm": 17.10260009765625, "learning_rate": 1e-06, "loss": 0.3767, "mean_token_accuracy": 0.8764543533325195, "num_tokens": 410820014.0, "step": 10762 }, { "epoch": 1.3691642284696603, "ewc_loss": 0.028205983340740204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.820598274411168e-05, "grad_norm": 17.175439834594727, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8635318875312805, "num_tokens": 410858721.0, "step": 10763 }, { "epoch": 1.3692914387482509, "ewc_loss": 0.028219830244779587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8219830710440874e-05, "grad_norm": 17.13361167907715, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8735370635986328, "num_tokens": 410898527.0, "step": 10764 }, { "epoch": 1.3694186490268414, "ewc_loss": 0.028167711570858955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8167711207061075e-05, "grad_norm": 17.133180618286133, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8685648441314697, "num_tokens": 410938426.0, "step": 10765 }, { "epoch": 1.369545859305432, "ewc_loss": 0.02818172425031662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8181724701425992e-05, "grad_norm": 17.1390323638916, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8738547563552856, "num_tokens": 410969940.0, "step": 10766 }, { "epoch": 1.3696730695840225, "ewc_loss": 0.028204450383782387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8204451155033894e-05, "grad_norm": 17.16585350036621, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8531723618507385, "num_tokens": 411008859.0, "step": 10767 }, { "epoch": 1.3698002798626128, "ewc_loss": 0.02816445380449295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8164453397039324e-05, "grad_norm": 17.08782386779785, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8815673589706421, "num_tokens": 411043516.0, "step": 10768 }, { "epoch": 1.3699274901412033, "ewc_loss": 0.028134789317846298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8134789317846298e-05, "grad_norm": 17.121562957763672, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8614553809165955, "num_tokens": 411079092.0, "step": 10769 }, { "epoch": 1.3700547004197938, "ewc_loss": 0.028238872066140175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8238871891517192e-05, "grad_norm": 17.214632034301758, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.872221827507019, "num_tokens": 411117360.0, "step": 10770 }, { "epoch": 1.3701819106983844, "ewc_loss": 0.028207939118146896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8207939976709895e-05, "grad_norm": 17.144451141357422, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8577266931533813, "num_tokens": 411153615.0, "step": 10771 }, { "epoch": 1.3703091209769749, "ewc_loss": 0.028182698413729668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.818269786075689e-05, "grad_norm": 17.150392532348633, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8700082302093506, "num_tokens": 411193571.0, "step": 10772 }, { "epoch": 1.3704363312555654, "ewc_loss": 0.028192562982439995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8192562240292318e-05, "grad_norm": 17.129459381103516, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8489724397659302, "num_tokens": 411230427.0, "step": 10773 }, { "epoch": 1.370563541534156, "ewc_loss": 0.028159603476524353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.815960397128947e-05, "grad_norm": 17.084436416625977, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8663066625595093, "num_tokens": 411271251.0, "step": 10774 }, { "epoch": 1.3706907518127465, "ewc_loss": 0.028209801763296127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8209802621859126e-05, "grad_norm": 17.26753807067871, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8707377910614014, "num_tokens": 411304906.0, "step": 10775 }, { "epoch": 1.370817962091337, "ewc_loss": 0.02827305532991886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.827305615937803e-05, "grad_norm": 17.048362731933594, "learning_rate": 1e-06, "loss": 0.3615, "mean_token_accuracy": 0.885079026222229, "num_tokens": 411341417.0, "step": 10776 }, { "epoch": 1.3709451723699275, "ewc_loss": 0.028147343546152115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8147343982709572e-05, "grad_norm": 17.063533782958984, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8686982989311218, "num_tokens": 411382118.0, "step": 10777 }, { "epoch": 1.371072382648518, "ewc_loss": 0.028327399864792824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8327400286798365e-05, "grad_norm": 17.126646041870117, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8572746515274048, "num_tokens": 411426702.0, "step": 10778 }, { "epoch": 1.3711995929271086, "ewc_loss": 0.028268683701753616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8268683308851905e-05, "grad_norm": 17.181182861328125, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8675124645233154, "num_tokens": 411461689.0, "step": 10779 }, { "epoch": 1.371326803205699, "ewc_loss": 0.02825338952243328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.825338924594689e-05, "grad_norm": 17.207679748535156, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8685541152954102, "num_tokens": 411499613.0, "step": 10780 }, { "epoch": 1.3714540134842896, "ewc_loss": 0.028255123645067215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.825512274284847e-05, "grad_norm": 17.08041000366211, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8668365478515625, "num_tokens": 411538422.0, "step": 10781 }, { "epoch": 1.3715812237628802, "ewc_loss": 0.028241746127605438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8241745894774795e-05, "grad_norm": 17.171768188476562, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8803668022155762, "num_tokens": 411572401.0, "step": 10782 }, { "epoch": 1.3717084340414705, "ewc_loss": 0.028266659006476402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.826665877364576e-05, "grad_norm": 17.0904598236084, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8752883672714233, "num_tokens": 411611268.0, "step": 10783 }, { "epoch": 1.371835644320061, "ewc_loss": 0.028185278177261353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.818527900672052e-05, "grad_norm": 17.08877944946289, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8680540919303894, "num_tokens": 411648837.0, "step": 10784 }, { "epoch": 1.3719628545986515, "ewc_loss": 0.028266243636608124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.826624404406175e-05, "grad_norm": 17.17806625366211, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8722174167633057, "num_tokens": 411690327.0, "step": 10785 }, { "epoch": 1.372090064877242, "ewc_loss": 0.028311429545283318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8311429559835233e-05, "grad_norm": 17.22332191467285, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8667190074920654, "num_tokens": 411729665.0, "step": 10786 }, { "epoch": 1.3722172751558326, "ewc_loss": 0.028199724853038788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8199725420563482e-05, "grad_norm": 17.185754776000977, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8830291628837585, "num_tokens": 411764716.0, "step": 10787 }, { "epoch": 1.3723444854344231, "ewc_loss": 0.028163786977529526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8163787646917626e-05, "grad_norm": 17.15503692626953, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8700252771377563, "num_tokens": 411805560.0, "step": 10788 }, { "epoch": 1.3724716957130136, "ewc_loss": 0.028178930282592773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8178930733702146e-05, "grad_norm": 17.20577049255371, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8718769550323486, "num_tokens": 411841151.0, "step": 10789 }, { "epoch": 1.3725989059916042, "ewc_loss": 0.02820546366274357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.820546433213167e-05, "grad_norm": 17.15041732788086, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.8751423954963684, "num_tokens": 411875088.0, "step": 10790 }, { "epoch": 1.3727261162701947, "ewc_loss": 0.028148379176855087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.814837898768019e-05, "grad_norm": 17.156864166259766, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8686533570289612, "num_tokens": 411914554.0, "step": 10791 }, { "epoch": 1.3728533265487852, "ewc_loss": 0.028184035792946815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.81840366369579e-05, "grad_norm": 17.159313201904297, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8690958619117737, "num_tokens": 411952237.0, "step": 10792 }, { "epoch": 1.3729805368273755, "ewc_loss": 0.02815905585885048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8159056455479003e-05, "grad_norm": 17.109033584594727, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8632988333702087, "num_tokens": 411989676.0, "step": 10793 }, { "epoch": 1.373107747105966, "ewc_loss": 0.028144821524620056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8144821044406854e-05, "grad_norm": 17.149980545043945, "learning_rate": 1e-06, "loss": 0.3635, "mean_token_accuracy": 0.8811286091804504, "num_tokens": 412025804.0, "step": 10794 }, { "epoch": 1.3732349573845566, "ewc_loss": 0.028143227100372314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8143227609689347e-05, "grad_norm": 17.093647003173828, "learning_rate": 1e-06, "loss": 0.358, "mean_token_accuracy": 0.8843132257461548, "num_tokens": 412060085.0, "step": 10795 }, { "epoch": 1.3733621676631471, "ewc_loss": 0.02815227396786213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.815227344399318e-05, "grad_norm": 17.152923583984375, "learning_rate": 1e-06, "loss": 0.3811, "mean_token_accuracy": 0.8747298717498779, "num_tokens": 412100750.0, "step": 10796 }, { "epoch": 1.3734893779417376, "ewc_loss": 0.028192592784762383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.819259316311218e-05, "grad_norm": 17.12751007080078, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8702372312545776, "num_tokens": 412145829.0, "step": 10797 }, { "epoch": 1.3736165882203282, "ewc_loss": 0.028141595423221588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8141595976194367e-05, "grad_norm": 17.11371421813965, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8733699917793274, "num_tokens": 412187607.0, "step": 10798 }, { "epoch": 1.3737437984989187, "ewc_loss": 0.02819492481648922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8194925107527524e-05, "grad_norm": 17.131370544433594, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8564232587814331, "num_tokens": 412227214.0, "step": 10799 }, { "epoch": 1.3738710087775092, "ewc_loss": 0.02817930094897747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.817930180754047e-05, "grad_norm": 17.1514892578125, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8724902868270874, "num_tokens": 412261929.0, "step": 10800 }, { "epoch": 1.3739982190560998, "ewc_loss": 0.028255924582481384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8255924917175435e-05, "grad_norm": 17.149084091186523, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8647520542144775, "num_tokens": 412299441.0, "step": 10801 }, { "epoch": 1.3741254293346903, "ewc_loss": 0.028213923797011375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.821392445184756e-05, "grad_norm": 17.129314422607422, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8667967915534973, "num_tokens": 412339248.0, "step": 10802 }, { "epoch": 1.3742526396132808, "ewc_loss": 0.028199153020977974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8199152438901365e-05, "grad_norm": 17.17655372619629, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.868277907371521, "num_tokens": 412377224.0, "step": 10803 }, { "epoch": 1.3743798498918713, "ewc_loss": 0.028215141966938972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8215141355758533e-05, "grad_norm": 17.087629318237305, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8457513451576233, "num_tokens": 412415331.0, "step": 10804 }, { "epoch": 1.3745070601704619, "ewc_loss": 0.028237808495759964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8237807782716118e-05, "grad_norm": 17.142608642578125, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8680217862129211, "num_tokens": 412455604.0, "step": 10805 }, { "epoch": 1.3746342704490524, "ewc_loss": 0.028218412771821022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.821841189870611e-05, "grad_norm": 17.12261962890625, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8761621713638306, "num_tokens": 412489295.0, "step": 10806 }, { "epoch": 1.374761480727643, "ewc_loss": 0.028247497975826263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8247497539268807e-05, "grad_norm": 17.150650024414062, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8639779090881348, "num_tokens": 412525750.0, "step": 10807 }, { "epoch": 1.3748886910062332, "ewc_loss": 0.028243856504559517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8243855922482908e-05, "grad_norm": 17.080219268798828, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.8785396814346313, "num_tokens": 412567163.0, "step": 10808 }, { "epoch": 1.3750159012848238, "ewc_loss": 0.02817031554877758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8170315999886952e-05, "grad_norm": 17.091981887817383, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.875953197479248, "num_tokens": 412598789.0, "step": 10809 }, { "epoch": 1.3751431115634143, "ewc_loss": 0.02828916721045971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8289166948525235e-05, "grad_norm": 17.166688919067383, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8717367649078369, "num_tokens": 412632588.0, "step": 10810 }, { "epoch": 1.3752703218420048, "ewc_loss": 0.028257932513952255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.825793308147695e-05, "grad_norm": 17.14557647705078, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8676180243492126, "num_tokens": 412676369.0, "step": 10811 }, { "epoch": 1.3753975321205953, "ewc_loss": 0.028220005333423615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8220005333423615e-05, "grad_norm": 17.036785125732422, "learning_rate": 1e-06, "loss": 0.362, "mean_token_accuracy": 0.8829419612884521, "num_tokens": 412712674.0, "step": 10812 }, { "epoch": 1.3755247423991859, "ewc_loss": 0.02819729968905449, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.819729888869915e-05, "grad_norm": 17.16525650024414, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8632805943489075, "num_tokens": 412753531.0, "step": 10813 }, { "epoch": 1.3756519526777764, "ewc_loss": 0.02828439697623253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8284397558309138e-05, "grad_norm": 17.04584503173828, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8662877678871155, "num_tokens": 412799450.0, "step": 10814 }, { "epoch": 1.375779162956367, "ewc_loss": 0.028161918744444847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8161919544800185e-05, "grad_norm": 17.093839645385742, "learning_rate": 1e-06, "loss": 0.341, "mean_token_accuracy": 0.8896582126617432, "num_tokens": 412836858.0, "step": 10815 }, { "epoch": 1.3759063732349575, "ewc_loss": 0.02828584425151348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8285843654884957e-05, "grad_norm": 17.09869956970215, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8717217445373535, "num_tokens": 412871169.0, "step": 10816 }, { "epoch": 1.3760335835135478, "ewc_loss": 0.028216667473316193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8216667487868108e-05, "grad_norm": 17.095966339111328, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8738955855369568, "num_tokens": 412909281.0, "step": 10817 }, { "epoch": 1.3761607937921383, "ewc_loss": 0.02825232222676277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.825232149916701e-05, "grad_norm": 17.133878707885742, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8708881735801697, "num_tokens": 412950126.0, "step": 10818 }, { "epoch": 1.3762880040707288, "ewc_loss": 0.028219271451234818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8219272280693986e-05, "grad_norm": 17.109647750854492, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8561588525772095, "num_tokens": 412988644.0, "step": 10819 }, { "epoch": 1.3764152143493193, "ewc_loss": 0.028215477243065834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.821547786879819e-05, "grad_norm": 17.152496337890625, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8558061122894287, "num_tokens": 413028403.0, "step": 10820 }, { "epoch": 1.3765424246279099, "ewc_loss": 0.028244592249393463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8244592613191344e-05, "grad_norm": 17.20420265197754, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8597126007080078, "num_tokens": 413067429.0, "step": 10821 }, { "epoch": 1.3766696349065004, "ewc_loss": 0.028241854161024094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8241855034139007e-05, "grad_norm": 17.227834701538086, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8655756711959839, "num_tokens": 413101630.0, "step": 10822 }, { "epoch": 1.376796845185091, "ewc_loss": 0.028249427676200867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.824942748702597e-05, "grad_norm": 17.169113159179688, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8623526096343994, "num_tokens": 413147757.0, "step": 10823 }, { "epoch": 1.3769240554636815, "ewc_loss": 0.02821001224219799, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8210011805640534e-05, "grad_norm": 17.207441329956055, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8715723752975464, "num_tokens": 413186803.0, "step": 10824 }, { "epoch": 1.377051265742272, "ewc_loss": 0.028203286230564117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8203287001815625e-05, "grad_norm": 17.223690032958984, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8626120090484619, "num_tokens": 413227201.0, "step": 10825 }, { "epoch": 1.3771784760208625, "ewc_loss": 0.028194570913910866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8194570404593833e-05, "grad_norm": 17.170745849609375, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.867053747177124, "num_tokens": 413270594.0, "step": 10826 }, { "epoch": 1.377305686299453, "ewc_loss": 0.028144454583525658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.814445542753674e-05, "grad_norm": 17.23798179626465, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8507393598556519, "num_tokens": 413307345.0, "step": 10827 }, { "epoch": 1.3774328965780436, "ewc_loss": 0.028204679489135742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.820468034769874e-05, "grad_norm": 17.13970375061035, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8728957772254944, "num_tokens": 413344310.0, "step": 10828 }, { "epoch": 1.377560106856634, "ewc_loss": 0.028160633519291878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8160633519291878e-05, "grad_norm": 17.204696655273438, "learning_rate": 1e-06, "loss": 0.3557, "mean_token_accuracy": 0.8873666524887085, "num_tokens": 413382047.0, "step": 10829 }, { "epoch": 1.3776873171352246, "ewc_loss": 0.028212742879986763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8212742108735256e-05, "grad_norm": 17.29108428955078, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8659573197364807, "num_tokens": 413424658.0, "step": 10830 }, { "epoch": 1.3778145274138152, "ewc_loss": 0.028140081092715263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8140080758021213e-05, "grad_norm": 17.11620330810547, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.8763071298599243, "num_tokens": 413458386.0, "step": 10831 }, { "epoch": 1.3779417376924055, "ewc_loss": 0.028123553842306137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8123553420300595e-05, "grad_norm": 17.234601974487305, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.855668842792511, "num_tokens": 413492530.0, "step": 10832 }, { "epoch": 1.378068947970996, "ewc_loss": 0.02824176475405693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.824176408466883e-05, "grad_norm": 17.172666549682617, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8799052238464355, "num_tokens": 413528894.0, "step": 10833 }, { "epoch": 1.3781961582495865, "ewc_loss": 0.028114182874560356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8114181986893527e-05, "grad_norm": 17.22767448425293, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8655003309249878, "num_tokens": 413564765.0, "step": 10834 }, { "epoch": 1.378323368528177, "ewc_loss": 0.02818381041288376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.818381108227186e-05, "grad_norm": 17.159282684326172, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8412327170372009, "num_tokens": 413598122.0, "step": 10835 }, { "epoch": 1.3784505788067676, "ewc_loss": 0.028161179274320602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.816117921611294e-05, "grad_norm": 17.189041137695312, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8519837260246277, "num_tokens": 413634519.0, "step": 10836 }, { "epoch": 1.378577789085358, "ewc_loss": 0.028280125930905342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8280126571189612e-05, "grad_norm": 17.177690505981445, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8610357642173767, "num_tokens": 413674021.0, "step": 10837 }, { "epoch": 1.3787049993639486, "ewc_loss": 0.028197158128023148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.819715882651508e-05, "grad_norm": 17.162105560302734, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8744016289710999, "num_tokens": 413713804.0, "step": 10838 }, { "epoch": 1.3788322096425392, "ewc_loss": 0.028231550008058548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.823155045916792e-05, "grad_norm": 17.06134033203125, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8758470416069031, "num_tokens": 413755665.0, "step": 10839 }, { "epoch": 1.3789594199211297, "ewc_loss": 0.02821032889187336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.821032830979675e-05, "grad_norm": 17.128095626831055, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8677354454994202, "num_tokens": 413790872.0, "step": 10840 }, { "epoch": 1.3790866301997202, "ewc_loss": 0.02823876030743122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8238760933163576e-05, "grad_norm": 17.050424575805664, "learning_rate": 1e-06, "loss": 0.3469, "mean_token_accuracy": 0.8885381817817688, "num_tokens": 413830798.0, "step": 10841 }, { "epoch": 1.3792138404783105, "ewc_loss": 0.02828805521130562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.828805554599967e-05, "grad_norm": 17.18452262878418, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8731554746627808, "num_tokens": 413865960.0, "step": 10842 }, { "epoch": 1.379341050756901, "ewc_loss": 0.02836182527244091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.836182466126047e-05, "grad_norm": 17.11980438232422, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8790370225906372, "num_tokens": 413898222.0, "step": 10843 }, { "epoch": 1.3794682610354916, "ewc_loss": 0.028318431228399277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.831843085004948e-05, "grad_norm": 17.11444854736328, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8635761141777039, "num_tokens": 413931839.0, "step": 10844 }, { "epoch": 1.3795954713140821, "ewc_loss": 0.02833636663854122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8336366085568443e-05, "grad_norm": 17.182632446289062, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.858964204788208, "num_tokens": 413963815.0, "step": 10845 }, { "epoch": 1.3797226815926726, "ewc_loss": 0.028329450637102127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.832945028785616e-05, "grad_norm": 17.1610107421875, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8768568634986877, "num_tokens": 414002476.0, "step": 10846 }, { "epoch": 1.3798498918712632, "ewc_loss": 0.028302786871790886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8302787541178986e-05, "grad_norm": 17.11602783203125, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8634434938430786, "num_tokens": 414041121.0, "step": 10847 }, { "epoch": 1.3799771021498537, "ewc_loss": 0.028377963230013847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.837796273524873e-05, "grad_norm": 17.175676345825195, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8560118079185486, "num_tokens": 414080447.0, "step": 10848 }, { "epoch": 1.3801043124284442, "ewc_loss": 0.028361838310956955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.83618392131757e-05, "grad_norm": 17.19270896911621, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8587275743484497, "num_tokens": 414108925.0, "step": 10849 }, { "epoch": 1.3802315227070348, "ewc_loss": 0.028361167758703232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.836116800608579e-05, "grad_norm": 17.154367446899414, "learning_rate": 1e-06, "loss": 0.3995, "mean_token_accuracy": 0.873253345489502, "num_tokens": 414154296.0, "step": 10850 }, { "epoch": 1.3803587329856253, "ewc_loss": 0.028346318751573563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8346319595584646e-05, "grad_norm": 17.135602951049805, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8563129901885986, "num_tokens": 414194612.0, "step": 10851 }, { "epoch": 1.3804859432642158, "ewc_loss": 0.028326185420155525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8326185201876797e-05, "grad_norm": 17.09794807434082, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8709632158279419, "num_tokens": 414229939.0, "step": 10852 }, { "epoch": 1.3806131535428063, "ewc_loss": 0.028385523706674576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.838552427419927e-05, "grad_norm": 17.159263610839844, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8656593561172485, "num_tokens": 414265193.0, "step": 10853 }, { "epoch": 1.3807403638213969, "ewc_loss": 0.028377670794725418, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.837766987795476e-05, "grad_norm": 17.111560821533203, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8663148283958435, "num_tokens": 414301471.0, "step": 10854 }, { "epoch": 1.3808675740999874, "ewc_loss": 0.028334597125649452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8334598027868196e-05, "grad_norm": 17.086156845092773, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8395885229110718, "num_tokens": 414338755.0, "step": 10855 }, { "epoch": 1.380994784378578, "ewc_loss": 0.02841106429696083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8411064704414457e-05, "grad_norm": 17.14703941345215, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8693728446960449, "num_tokens": 414379996.0, "step": 10856 }, { "epoch": 1.3811219946571682, "ewc_loss": 0.028392931446433067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.839293119905051e-05, "grad_norm": 17.167724609375, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8710625171661377, "num_tokens": 414421905.0, "step": 10857 }, { "epoch": 1.3812492049357588, "ewc_loss": 0.028375951573252678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8375950932968408e-05, "grad_norm": 17.108205795288086, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.873611330986023, "num_tokens": 414465149.0, "step": 10858 }, { "epoch": 1.3813764152143493, "ewc_loss": 0.028349673375487328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8349673812044784e-05, "grad_norm": 17.121753692626953, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8788024187088013, "num_tokens": 414504086.0, "step": 10859 }, { "epoch": 1.3815036254929398, "ewc_loss": 0.0283951535820961, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.839515400410164e-05, "grad_norm": 17.196489334106445, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8518370389938354, "num_tokens": 414545392.0, "step": 10860 }, { "epoch": 1.3816308357715303, "ewc_loss": 0.028332877904176712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8332877263892442e-05, "grad_norm": 17.205495834350586, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8334907293319702, "num_tokens": 414586445.0, "step": 10861 }, { "epoch": 1.3817580460501209, "ewc_loss": 0.02833268791437149, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8332688088994473e-05, "grad_norm": 17.185134887695312, "learning_rate": 1e-06, "loss": 0.384, "mean_token_accuracy": 0.8779579997062683, "num_tokens": 414618467.0, "step": 10862 }, { "epoch": 1.3818852563287114, "ewc_loss": 0.02833917737007141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8339178243186325e-05, "grad_norm": 17.176441192626953, "learning_rate": 1e-06, "loss": 0.3676, "mean_token_accuracy": 0.8790789842605591, "num_tokens": 414651253.0, "step": 10863 }, { "epoch": 1.382012466607302, "ewc_loss": 0.02832658775150776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.832658719853498e-05, "grad_norm": 17.207563400268555, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.867162823677063, "num_tokens": 414692016.0, "step": 10864 }, { "epoch": 1.3821396768858925, "ewc_loss": 0.028309175744652748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.830917583196424e-05, "grad_norm": 17.161197662353516, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8730453252792358, "num_tokens": 414731840.0, "step": 10865 }, { "epoch": 1.3822668871644828, "ewc_loss": 0.028283409774303436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8283409847063012e-05, "grad_norm": 17.169841766357422, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8696380853652954, "num_tokens": 414767110.0, "step": 10866 }, { "epoch": 1.3823940974430733, "ewc_loss": 0.028298068791627884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.829806908266619e-05, "grad_norm": 17.173994064331055, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8740337491035461, "num_tokens": 414799340.0, "step": 10867 }, { "epoch": 1.3825213077216638, "ewc_loss": 0.028274066746234894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.82740675174864e-05, "grad_norm": 17.12636947631836, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8711540699005127, "num_tokens": 414840915.0, "step": 10868 }, { "epoch": 1.3826485180002543, "ewc_loss": 0.028225313872098923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.822531314450316e-05, "grad_norm": 17.160215377807617, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8567953109741211, "num_tokens": 414877298.0, "step": 10869 }, { "epoch": 1.3827757282788449, "ewc_loss": 0.028300736099481583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8300735721131787e-05, "grad_norm": 17.09286117553711, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8756636381149292, "num_tokens": 414916431.0, "step": 10870 }, { "epoch": 1.3829029385574354, "ewc_loss": 0.0282963365316391, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8296337404754013e-05, "grad_norm": 17.205698013305664, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8672128319740295, "num_tokens": 414958148.0, "step": 10871 }, { "epoch": 1.383030148836026, "ewc_loss": 0.02831588312983513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8315882445895113e-05, "grad_norm": 17.06575584411621, "learning_rate": 1e-06, "loss": 0.3362, "mean_token_accuracy": 0.8936789035797119, "num_tokens": 414997008.0, "step": 10872 }, { "epoch": 1.3831573591146165, "ewc_loss": 0.028270749375224113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8270749680814333e-05, "grad_norm": 17.195096969604492, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8548544645309448, "num_tokens": 415042893.0, "step": 10873 }, { "epoch": 1.383284569393207, "ewc_loss": 0.028296103700995445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.829610457411036e-05, "grad_norm": 17.115692138671875, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.865947425365448, "num_tokens": 415077476.0, "step": 10874 }, { "epoch": 1.3834117796717975, "ewc_loss": 0.028285983949899673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.828598371706903e-05, "grad_norm": 17.136322021484375, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8635851740837097, "num_tokens": 415124329.0, "step": 10875 }, { "epoch": 1.383538989950388, "ewc_loss": 0.028257174417376518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.825717456289567e-05, "grad_norm": 17.078176498413086, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8802281618118286, "num_tokens": 415164101.0, "step": 10876 }, { "epoch": 1.3836662002289786, "ewc_loss": 0.028304031118750572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8304031729931012e-05, "grad_norm": 17.128990173339844, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8685488700866699, "num_tokens": 415203792.0, "step": 10877 }, { "epoch": 1.383793410507569, "ewc_loss": 0.028316492214798927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.83164918073453e-05, "grad_norm": 17.189624786376953, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8657557964324951, "num_tokens": 415233754.0, "step": 10878 }, { "epoch": 1.3839206207861596, "ewc_loss": 0.028322765603661537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.832276550179813e-05, "grad_norm": 17.114017486572266, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8758299946784973, "num_tokens": 415269567.0, "step": 10879 }, { "epoch": 1.3840478310647502, "ewc_loss": 0.028269102796912193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8269103495404124e-05, "grad_norm": 17.13107681274414, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8714298605918884, "num_tokens": 415312660.0, "step": 10880 }, { "epoch": 1.3841750413433405, "ewc_loss": 0.028359318152070045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8359318093862385e-05, "grad_norm": 17.219484329223633, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8789612054824829, "num_tokens": 415355377.0, "step": 10881 }, { "epoch": 1.384302251621931, "ewc_loss": 0.028293730691075325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.829373079293873e-05, "grad_norm": 17.075489044189453, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.8804954290390015, "num_tokens": 415393109.0, "step": 10882 }, { "epoch": 1.3844294619005215, "ewc_loss": 0.028272278606891632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8272279450902715e-05, "grad_norm": 17.223451614379883, "learning_rate": 1e-06, "loss": 0.3767, "mean_token_accuracy": 0.8797482252120972, "num_tokens": 415428650.0, "step": 10883 }, { "epoch": 1.384556672179112, "ewc_loss": 0.028380965813994408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8380965886753984e-05, "grad_norm": 17.173580169677734, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.865612268447876, "num_tokens": 415467166.0, "step": 10884 }, { "epoch": 1.3846838824577026, "ewc_loss": 0.028234193101525307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8234193450771272e-05, "grad_norm": 17.154870986938477, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.881454586982727, "num_tokens": 415505633.0, "step": 10885 }, { "epoch": 1.384811092736293, "ewc_loss": 0.0282998476177454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8299848054302856e-05, "grad_norm": 17.14813804626465, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8689966201782227, "num_tokens": 415546254.0, "step": 10886 }, { "epoch": 1.3849383030148836, "ewc_loss": 0.02824699878692627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8246999136172235e-05, "grad_norm": 17.139928817749023, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8672833442687988, "num_tokens": 415584999.0, "step": 10887 }, { "epoch": 1.3850655132934742, "ewc_loss": 0.028246454894542694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8246455258340575e-05, "grad_norm": 17.077516555786133, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8714132905006409, "num_tokens": 415624608.0, "step": 10888 }, { "epoch": 1.3851927235720647, "ewc_loss": 0.028288111090660095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.828811193467118e-05, "grad_norm": 17.195199966430664, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8555575609207153, "num_tokens": 415663405.0, "step": 10889 }, { "epoch": 1.385319933850655, "ewc_loss": 0.028300343081355095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.830034281942062e-05, "grad_norm": 17.174348831176758, "learning_rate": 1e-06, "loss": 0.3618, "mean_token_accuracy": 0.882943868637085, "num_tokens": 415698934.0, "step": 10890 }, { "epoch": 1.3854471441292455, "ewc_loss": 0.0282675139605999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8267513698665425e-05, "grad_norm": 17.14145278930664, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8401500582695007, "num_tokens": 415736973.0, "step": 10891 }, { "epoch": 1.385574354407836, "ewc_loss": 0.028219034895300865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.821903399308212e-05, "grad_norm": 17.132104873657227, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8714977502822876, "num_tokens": 415778364.0, "step": 10892 }, { "epoch": 1.3857015646864266, "ewc_loss": 0.028275050222873688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8275049771764316e-05, "grad_norm": 17.152965545654297, "learning_rate": 1e-06, "loss": 0.356, "mean_token_accuracy": 0.8849344253540039, "num_tokens": 415817502.0, "step": 10893 }, { "epoch": 1.385828774965017, "ewc_loss": 0.0282425694167614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.82425698969746e-05, "grad_norm": 17.0943603515625, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8640804290771484, "num_tokens": 415860364.0, "step": 10894 }, { "epoch": 1.3859559852436076, "ewc_loss": 0.028268318623304367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8268319510971196e-05, "grad_norm": 17.21893882751465, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8725491762161255, "num_tokens": 415896816.0, "step": 10895 }, { "epoch": 1.3860831955221982, "ewc_loss": 0.028301436454057693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8301436032052152e-05, "grad_norm": 17.203506469726562, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8514678478240967, "num_tokens": 415935173.0, "step": 10896 }, { "epoch": 1.3862104058007887, "ewc_loss": 0.02824283577501774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.824283546942752e-05, "grad_norm": 17.17051887512207, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8788168430328369, "num_tokens": 415977043.0, "step": 10897 }, { "epoch": 1.3863376160793792, "ewc_loss": 0.028283920139074326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.828392098308541e-05, "grad_norm": 17.25225067138672, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8777194619178772, "num_tokens": 416014376.0, "step": 10898 }, { "epoch": 1.3864648263579697, "ewc_loss": 0.02826538123190403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.826538184308447e-05, "grad_norm": 17.190725326538086, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8557819724082947, "num_tokens": 416055648.0, "step": 10899 }, { "epoch": 1.3865920366365603, "ewc_loss": 0.028244731947779655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8244732675375417e-05, "grad_norm": 17.242694854736328, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8610858917236328, "num_tokens": 416087062.0, "step": 10900 }, { "epoch": 1.3867192469151508, "ewc_loss": 0.028293153271079063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8293152354308404e-05, "grad_norm": 17.20387077331543, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8755236864089966, "num_tokens": 416122559.0, "step": 10901 }, { "epoch": 1.3868464571937413, "ewc_loss": 0.028271201997995377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8271202609175816e-05, "grad_norm": 17.169776916503906, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8618618845939636, "num_tokens": 416155228.0, "step": 10902 }, { "epoch": 1.3869736674723319, "ewc_loss": 0.028240758925676346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.824075818352867e-05, "grad_norm": 17.186077117919922, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8556725978851318, "num_tokens": 416191190.0, "step": 10903 }, { "epoch": 1.3871008777509224, "ewc_loss": 0.028285441920161247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8285441658226773e-05, "grad_norm": 17.091341018676758, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8728749752044678, "num_tokens": 416228009.0, "step": 10904 }, { "epoch": 1.387228088029513, "ewc_loss": 0.02828390523791313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8283904612180777e-05, "grad_norm": 17.204225540161133, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8642640113830566, "num_tokens": 416263657.0, "step": 10905 }, { "epoch": 1.3873552983081032, "ewc_loss": 0.028350964188575745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.83509634755319e-05, "grad_norm": 17.12451171875, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.862174928188324, "num_tokens": 416305187.0, "step": 10906 }, { "epoch": 1.3874825085866938, "ewc_loss": 0.02830497920513153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.830497942341026e-05, "grad_norm": 17.22527313232422, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8667414784431458, "num_tokens": 416342578.0, "step": 10907 }, { "epoch": 1.3876097188652843, "ewc_loss": 0.02839822508394718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8398224458214827e-05, "grad_norm": 17.177963256835938, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8667892813682556, "num_tokens": 416378969.0, "step": 10908 }, { "epoch": 1.3877369291438748, "ewc_loss": 0.028348607942461967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8348607884254307e-05, "grad_norm": 17.131772994995117, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8660557866096497, "num_tokens": 416412958.0, "step": 10909 }, { "epoch": 1.3878641394224653, "ewc_loss": 0.028354892507195473, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8354892492643557e-05, "grad_norm": 17.130043029785156, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8495666980743408, "num_tokens": 416445643.0, "step": 10910 }, { "epoch": 1.3879913497010559, "ewc_loss": 0.028317395597696304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8317395845078863e-05, "grad_norm": 17.087175369262695, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8550950288772583, "num_tokens": 416484891.0, "step": 10911 }, { "epoch": 1.3881185599796464, "ewc_loss": 0.028388069942593575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.838806904037483e-05, "grad_norm": 17.129413604736328, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.854059100151062, "num_tokens": 416520504.0, "step": 10912 }, { "epoch": 1.388245770258237, "ewc_loss": 0.028440145775675774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8440144888008945e-05, "grad_norm": 17.17508888244629, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.868181049823761, "num_tokens": 416561877.0, "step": 10913 }, { "epoch": 1.3883729805368275, "ewc_loss": 0.028425656259059906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8425656637409702e-05, "grad_norm": 17.15565299987793, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8696999549865723, "num_tokens": 416597676.0, "step": 10914 }, { "epoch": 1.3885001908154178, "ewc_loss": 0.028374740853905678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.837474130501505e-05, "grad_norm": 17.102943420410156, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8748123645782471, "num_tokens": 416630557.0, "step": 10915 }, { "epoch": 1.3886274010940083, "ewc_loss": 0.02840092033147812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8400920200510882e-05, "grad_norm": 17.138042449951172, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8708932399749756, "num_tokens": 416668604.0, "step": 10916 }, { "epoch": 1.3887546113725988, "ewc_loss": 0.028411351144313812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8411350285750814e-05, "grad_norm": 17.14117431640625, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8479338884353638, "num_tokens": 416710246.0, "step": 10917 }, { "epoch": 1.3888818216511893, "ewc_loss": 0.028426459059119225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8426458811736666e-05, "grad_norm": 17.091793060302734, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8683977127075195, "num_tokens": 416744962.0, "step": 10918 }, { "epoch": 1.3890090319297799, "ewc_loss": 0.02842845767736435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8428457881091163e-05, "grad_norm": 17.198383331298828, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8694902658462524, "num_tokens": 416779483.0, "step": 10919 }, { "epoch": 1.3891362422083704, "ewc_loss": 0.028439881280064583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.843988113454543e-05, "grad_norm": 17.084976196289062, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8806122541427612, "num_tokens": 416822135.0, "step": 10920 }, { "epoch": 1.389263452486961, "ewc_loss": 0.028414541855454445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8414542612154037e-05, "grad_norm": 17.149965286254883, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8668310642242432, "num_tokens": 416860383.0, "step": 10921 }, { "epoch": 1.3893906627655515, "ewc_loss": 0.02843584679067135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8435846616048366e-05, "grad_norm": 17.138071060180664, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8639588356018066, "num_tokens": 416896960.0, "step": 10922 }, { "epoch": 1.389517873044142, "ewc_loss": 0.028432883322238922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.843288348230999e-05, "grad_norm": 17.158349990844727, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8787612318992615, "num_tokens": 416931981.0, "step": 10923 }, { "epoch": 1.3896450833227325, "ewc_loss": 0.028384419158101082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8384418328641914e-05, "grad_norm": 17.12638282775879, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8772856593132019, "num_tokens": 416968179.0, "step": 10924 }, { "epoch": 1.389772293601323, "ewc_loss": 0.02844947949051857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8449479941627942e-05, "grad_norm": 17.165977478027344, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.872001588344574, "num_tokens": 417008879.0, "step": 10925 }, { "epoch": 1.3898995038799136, "ewc_loss": 0.028387468308210373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.838746877387166e-05, "grad_norm": 17.1868896484375, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8793593645095825, "num_tokens": 417042497.0, "step": 10926 }, { "epoch": 1.390026714158504, "ewc_loss": 0.028434116393327713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8434116757125594e-05, "grad_norm": 17.160947799682617, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8696511387825012, "num_tokens": 417084396.0, "step": 10927 }, { "epoch": 1.3901539244370946, "ewc_loss": 0.02836649864912033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8366499464027584e-05, "grad_norm": 17.155349731445312, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8821548223495483, "num_tokens": 417121014.0, "step": 10928 }, { "epoch": 1.3902811347156852, "ewc_loss": 0.028341008350253105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8341008146526292e-05, "grad_norm": 17.171932220458984, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8669999837875366, "num_tokens": 417160169.0, "step": 10929 }, { "epoch": 1.3904083449942755, "ewc_loss": 0.028307363390922546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8307364118518308e-05, "grad_norm": 17.145315170288086, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8527237176895142, "num_tokens": 417195024.0, "step": 10930 }, { "epoch": 1.390535555272866, "ewc_loss": 0.028375940397381783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8375940019031987e-05, "grad_norm": 17.094411849975586, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8747551441192627, "num_tokens": 417231344.0, "step": 10931 }, { "epoch": 1.3906627655514565, "ewc_loss": 0.028352344408631325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.835234408848919e-05, "grad_norm": 17.187326431274414, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8605639338493347, "num_tokens": 417270266.0, "step": 10932 }, { "epoch": 1.390789975830047, "ewc_loss": 0.028391283005475998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8391283194650896e-05, "grad_norm": 17.159624099731445, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8673115968704224, "num_tokens": 417308292.0, "step": 10933 }, { "epoch": 1.3909171861086376, "ewc_loss": 0.028339896351099014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8339896744000725e-05, "grad_norm": 17.1695556640625, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8656788468360901, "num_tokens": 417346358.0, "step": 10934 }, { "epoch": 1.391044396387228, "ewc_loss": 0.028307415544986725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8307415050221607e-05, "grad_norm": 17.166744232177734, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.868977963924408, "num_tokens": 417381924.0, "step": 10935 }, { "epoch": 1.3911716066658186, "ewc_loss": 0.0283778365701437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8377837224979885e-05, "grad_norm": 17.190242767333984, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8419193029403687, "num_tokens": 417417779.0, "step": 10936 }, { "epoch": 1.3912988169444092, "ewc_loss": 0.028367239981889725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8367239792714827e-05, "grad_norm": 17.204877853393555, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8612017631530762, "num_tokens": 417459076.0, "step": 10937 }, { "epoch": 1.3914260272229997, "ewc_loss": 0.028364643454551697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8364644094835967e-05, "grad_norm": 17.192045211791992, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8682746291160583, "num_tokens": 417499422.0, "step": 10938 }, { "epoch": 1.39155323750159, "ewc_loss": 0.0283095370978117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8309537810855545e-05, "grad_norm": 17.157182693481445, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.868627667427063, "num_tokens": 417541776.0, "step": 10939 }, { "epoch": 1.3916804477801805, "ewc_loss": 0.028374798595905304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8374799512675963e-05, "grad_norm": 17.22390365600586, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8561978340148926, "num_tokens": 417576469.0, "step": 10940 }, { "epoch": 1.391807658058771, "ewc_loss": 0.02838311530649662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8383115932228975e-05, "grad_norm": 17.160924911499023, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8700340986251831, "num_tokens": 417611726.0, "step": 10941 }, { "epoch": 1.3919348683373616, "ewc_loss": 0.028315816074609756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8315815143287182e-05, "grad_norm": 17.10780143737793, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8608967661857605, "num_tokens": 417650135.0, "step": 10942 }, { "epoch": 1.392062078615952, "ewc_loss": 0.02840948849916458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8409487640601583e-05, "grad_norm": 17.294790267944336, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8497903347015381, "num_tokens": 417690981.0, "step": 10943 }, { "epoch": 1.3921892888945426, "ewc_loss": 0.028422299772500992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.842230060196016e-05, "grad_norm": 17.15840721130371, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8560800552368164, "num_tokens": 417732152.0, "step": 10944 }, { "epoch": 1.3923164991731332, "ewc_loss": 0.028292527422308922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8292526621953584e-05, "grad_norm": 17.100677490234375, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8569471836090088, "num_tokens": 417771814.0, "step": 10945 }, { "epoch": 1.3924437094517237, "ewc_loss": 0.02842378057539463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8423781259334646e-05, "grad_norm": 17.21742820739746, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8581913709640503, "num_tokens": 417810663.0, "step": 10946 }, { "epoch": 1.3925709197303142, "ewc_loss": 0.028386646881699562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8386646590661258e-05, "grad_norm": 17.08065414428711, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8713811635971069, "num_tokens": 417845664.0, "step": 10947 }, { "epoch": 1.3926981300089047, "ewc_loss": 0.028353611007332802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.835361192410346e-05, "grad_norm": 17.190404891967773, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8600261807441711, "num_tokens": 417889617.0, "step": 10948 }, { "epoch": 1.3928253402874953, "ewc_loss": 0.028438502922654152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8438502340577543e-05, "grad_norm": 17.11526107788086, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8648077249526978, "num_tokens": 417930820.0, "step": 10949 }, { "epoch": 1.3929525505660858, "ewc_loss": 0.02836536057293415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8365360776660964e-05, "grad_norm": 17.25434684753418, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8694239854812622, "num_tokens": 417966799.0, "step": 10950 }, { "epoch": 1.3930797608446763, "ewc_loss": 0.028439803048968315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8439802918001078e-05, "grad_norm": 17.12575340270996, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8604251146316528, "num_tokens": 418009328.0, "step": 10951 }, { "epoch": 1.3932069711232669, "ewc_loss": 0.02838512510061264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8385125915519893e-05, "grad_norm": 17.261394500732422, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8696402311325073, "num_tokens": 418048548.0, "step": 10952 }, { "epoch": 1.3933341814018574, "ewc_loss": 0.028439505025744438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8439504603738897e-05, "grad_norm": 17.1500186920166, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8655267953872681, "num_tokens": 418094947.0, "step": 10953 }, { "epoch": 1.393461391680448, "ewc_loss": 0.02833458036184311, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.833457983797416e-05, "grad_norm": 17.153289794921875, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8581809997558594, "num_tokens": 418137144.0, "step": 10954 }, { "epoch": 1.3935886019590382, "ewc_loss": 0.02844264544546604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.844264599843882e-05, "grad_norm": 17.181602478027344, "learning_rate": 1e-06, "loss": 0.3454, "mean_token_accuracy": 0.8883540034294128, "num_tokens": 418171449.0, "step": 10955 }, { "epoch": 1.3937158122376287, "ewc_loss": 0.02842017449438572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8420174203347415e-05, "grad_norm": 17.16407585144043, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8697924613952637, "num_tokens": 418209854.0, "step": 10956 }, { "epoch": 1.3938430225162193, "ewc_loss": 0.028429007157683372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8429007215891033e-05, "grad_norm": 17.178226470947266, "learning_rate": 1e-06, "loss": 0.3776, "mean_token_accuracy": 0.8747802972793579, "num_tokens": 418245121.0, "step": 10957 }, { "epoch": 1.3939702327948098, "ewc_loss": 0.028371015563607216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8371016014716588e-05, "grad_norm": 17.13633918762207, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8711298704147339, "num_tokens": 418284158.0, "step": 10958 }, { "epoch": 1.3940974430734003, "ewc_loss": 0.02838766574859619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8387665224727243e-05, "grad_norm": 17.116933822631836, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8750368356704712, "num_tokens": 418324091.0, "step": 10959 }, { "epoch": 1.3942246533519909, "ewc_loss": 0.02837584912776947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.837584906956181e-05, "grad_norm": 17.130165100097656, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8682314157485962, "num_tokens": 418364764.0, "step": 10960 }, { "epoch": 1.3943518636305814, "ewc_loss": 0.02840653620660305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8406535420799628e-05, "grad_norm": 17.189424514770508, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8641023635864258, "num_tokens": 418403974.0, "step": 10961 }, { "epoch": 1.394479073909172, "ewc_loss": 0.02840331383049488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8403313990565948e-05, "grad_norm": 17.12240982055664, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8573411703109741, "num_tokens": 418446341.0, "step": 10962 }, { "epoch": 1.3946062841877624, "ewc_loss": 0.02840229496359825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8402295356499963e-05, "grad_norm": 17.170795440673828, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.867754340171814, "num_tokens": 418490989.0, "step": 10963 }, { "epoch": 1.3947334944663528, "ewc_loss": 0.02832288108766079, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8322881917119958e-05, "grad_norm": 17.17538070678711, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8623794913291931, "num_tokens": 418524633.0, "step": 10964 }, { "epoch": 1.3948607047449433, "ewc_loss": 0.028368109837174416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.836810926964972e-05, "grad_norm": 17.172245025634766, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8667638897895813, "num_tokens": 418562576.0, "step": 10965 }, { "epoch": 1.3949879150235338, "ewc_loss": 0.02831357717514038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.831357778632082e-05, "grad_norm": 17.115158081054688, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8676592707633972, "num_tokens": 418596537.0, "step": 10966 }, { "epoch": 1.3951151253021243, "ewc_loss": 0.028401058167219162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.840105844370555e-05, "grad_norm": 17.238727569580078, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8651677370071411, "num_tokens": 418625800.0, "step": 10967 }, { "epoch": 1.3952423355807149, "ewc_loss": 0.02837153896689415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.837153988366481e-05, "grad_norm": 17.213743209838867, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.853965163230896, "num_tokens": 418663818.0, "step": 10968 }, { "epoch": 1.3953695458593054, "ewc_loss": 0.028361979871988297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8361979275359772e-05, "grad_norm": 17.193387985229492, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8728214502334595, "num_tokens": 418700607.0, "step": 10969 }, { "epoch": 1.395496756137896, "ewc_loss": 0.028363872319459915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8363872843328863e-05, "grad_norm": 17.166744232177734, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8614948987960815, "num_tokens": 418739357.0, "step": 10970 }, { "epoch": 1.3956239664164865, "ewc_loss": 0.028337884694337845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8337884941720404e-05, "grad_norm": 17.148773193359375, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8739815354347229, "num_tokens": 418769752.0, "step": 10971 }, { "epoch": 1.395751176695077, "ewc_loss": 0.028413888067007065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.841388777596876e-05, "grad_norm": 17.185293197631836, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.849288821220398, "num_tokens": 418808196.0, "step": 10972 }, { "epoch": 1.3958783869736675, "ewc_loss": 0.02838789112865925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8387890779413283e-05, "grad_norm": 17.165851593017578, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8828530311584473, "num_tokens": 418843335.0, "step": 10973 }, { "epoch": 1.396005597252258, "ewc_loss": 0.028407994657754898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.840799425030127e-05, "grad_norm": 17.119709014892578, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8763449192047119, "num_tokens": 418876591.0, "step": 10974 }, { "epoch": 1.3961328075308486, "ewc_loss": 0.02842336893081665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.842336834874004e-05, "grad_norm": 17.196727752685547, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.855622410774231, "num_tokens": 418910482.0, "step": 10975 }, { "epoch": 1.396260017809439, "ewc_loss": 0.028452102094888687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8452102924347855e-05, "grad_norm": 17.126312255859375, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8590754866600037, "num_tokens": 418942248.0, "step": 10976 }, { "epoch": 1.3963872280880296, "ewc_loss": 0.02845877781510353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.845877861545887e-05, "grad_norm": 17.199909210205078, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8713080883026123, "num_tokens": 418977770.0, "step": 10977 }, { "epoch": 1.3965144383666201, "ewc_loss": 0.028522569686174393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.852257057384122e-05, "grad_norm": 17.147550582885742, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8610403537750244, "num_tokens": 419016464.0, "step": 10978 }, { "epoch": 1.3966416486452105, "ewc_loss": 0.028453757986426353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8453758204705082e-05, "grad_norm": 17.089065551757812, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.858768880367279, "num_tokens": 419053254.0, "step": 10979 }, { "epoch": 1.396768858923801, "ewc_loss": 0.02849353477358818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.849353404599242e-05, "grad_norm": 17.234739303588867, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8618712425231934, "num_tokens": 419088940.0, "step": 10980 }, { "epoch": 1.3968960692023915, "ewc_loss": 0.028519924730062485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8519923944259062e-05, "grad_norm": 17.04863929748535, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8670819997787476, "num_tokens": 419127897.0, "step": 10981 }, { "epoch": 1.397023279480982, "ewc_loss": 0.028429677709937096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8429678422980942e-05, "grad_norm": 17.146915435791016, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8696408271789551, "num_tokens": 419170921.0, "step": 10982 }, { "epoch": 1.3971504897595726, "ewc_loss": 0.028584826737642288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8584827305166982e-05, "grad_norm": 17.10141372680664, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.873866081237793, "num_tokens": 419206032.0, "step": 10983 }, { "epoch": 1.397277700038163, "ewc_loss": 0.02853495441377163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8534954253700562e-05, "grad_norm": 17.161888122558594, "learning_rate": 1e-06, "loss": 0.3564, "mean_token_accuracy": 0.8826059103012085, "num_tokens": 419244240.0, "step": 10984 }, { "epoch": 1.3974049103167536, "ewc_loss": 0.028580065816640854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.85800651909085e-05, "grad_norm": 17.162479400634766, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.8750709295272827, "num_tokens": 419274902.0, "step": 10985 }, { "epoch": 1.3975321205953442, "ewc_loss": 0.028574304655194283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.857430445146747e-05, "grad_norm": 17.20475959777832, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.880039632320404, "num_tokens": 419312295.0, "step": 10986 }, { "epoch": 1.3976593308739347, "ewc_loss": 0.028536198660731316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8536198442452587e-05, "grad_norm": 17.142353057861328, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8501279354095459, "num_tokens": 419350999.0, "step": 10987 }, { "epoch": 1.397786541152525, "ewc_loss": 0.028521331027150154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8521331842057407e-05, "grad_norm": 17.164087295532227, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8684706687927246, "num_tokens": 419386739.0, "step": 10988 }, { "epoch": 1.3979137514311155, "ewc_loss": 0.028580352663993835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.858035259123426e-05, "grad_norm": 17.14375877380371, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8742571473121643, "num_tokens": 419430353.0, "step": 10989 }, { "epoch": 1.398040961709706, "ewc_loss": 0.028501400724053383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8501401175162755e-05, "grad_norm": 17.221162796020508, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.863895058631897, "num_tokens": 419462910.0, "step": 10990 }, { "epoch": 1.3981681719882966, "ewc_loss": 0.02846059575676918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.846059578587301e-05, "grad_norm": 17.067157745361328, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8780911564826965, "num_tokens": 419498727.0, "step": 10991 }, { "epoch": 1.398295382266887, "ewc_loss": 0.028485780581831932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8485779694165103e-05, "grad_norm": 17.251249313354492, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8588989973068237, "num_tokens": 419534496.0, "step": 10992 }, { "epoch": 1.3984225925454776, "ewc_loss": 0.028534457087516785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8534457669593394e-05, "grad_norm": 17.111291885375977, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8640068769454956, "num_tokens": 419575760.0, "step": 10993 }, { "epoch": 1.3985498028240682, "ewc_loss": 0.02850099466741085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8500995540525764e-05, "grad_norm": 17.23212432861328, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.874785840511322, "num_tokens": 419616822.0, "step": 10994 }, { "epoch": 1.3986770131026587, "ewc_loss": 0.02854909934103489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8549098715302534e-05, "grad_norm": 17.526519775390625, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8743550777435303, "num_tokens": 419650425.0, "step": 10995 }, { "epoch": 1.3988042233812492, "ewc_loss": 0.028509942814707756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8509943149401806e-05, "grad_norm": 17.045143127441406, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8752176761627197, "num_tokens": 419685501.0, "step": 10996 }, { "epoch": 1.3989314336598397, "ewc_loss": 0.028420323505997658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8420323360478505e-05, "grad_norm": 17.08487892150879, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8696737289428711, "num_tokens": 419724471.0, "step": 10997 }, { "epoch": 1.3990586439384303, "ewc_loss": 0.028683152049779892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8683152777375653e-05, "grad_norm": 17.18497085571289, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8757017850875854, "num_tokens": 419763683.0, "step": 10998 }, { "epoch": 1.3991858542170208, "ewc_loss": 0.028573352843523026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8573353120009415e-05, "grad_norm": 17.131010055541992, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8716535568237305, "num_tokens": 419801238.0, "step": 10999 }, { "epoch": 1.3993130644956113, "ewc_loss": 0.02865641377866268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.865641363314353e-05, "grad_norm": 17.202722549438477, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8640661239624023, "num_tokens": 419832787.0, "step": 11000 }, { "epoch": 1.3994402747742019, "ewc_loss": 0.028675852343440056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8675853172899224e-05, "grad_norm": 17.223833084106445, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8581639528274536, "num_tokens": 419864294.0, "step": 11001 }, { "epoch": 1.3995674850527924, "ewc_loss": 0.02865183725953102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8651837055804208e-05, "grad_norm": 17.16515350341797, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8700871467590332, "num_tokens": 419900493.0, "step": 11002 }, { "epoch": 1.399694695331383, "ewc_loss": 0.028642650693655014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.86426511593163e-05, "grad_norm": 17.190109252929688, "learning_rate": 1e-06, "loss": 0.359, "mean_token_accuracy": 0.8863705396652222, "num_tokens": 419938917.0, "step": 11003 }, { "epoch": 1.3998219056099732, "ewc_loss": 0.02870521880686283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8705218937830068e-05, "grad_norm": 17.24322509765625, "learning_rate": 1e-06, "loss": 0.3557, "mean_token_accuracy": 0.8835117816925049, "num_tokens": 419978771.0, "step": 11004 }, { "epoch": 1.3999491158885637, "ewc_loss": 0.02871098555624485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.871098513423931e-05, "grad_norm": 17.274673461914062, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8665146827697754, "num_tokens": 420011836.0, "step": 11005 }, { "epoch": 1.4000763261671543, "ewc_loss": 0.028661035001277924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.866103568521794e-05, "grad_norm": 17.244054794311523, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.847310483455658, "num_tokens": 420051618.0, "step": 11006 }, { "epoch": 1.4002035364457448, "ewc_loss": 0.02864115685224533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.864115776901599e-05, "grad_norm": 17.296173095703125, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8549859523773193, "num_tokens": 420089119.0, "step": 11007 }, { "epoch": 1.4003307467243353, "ewc_loss": 0.028605883941054344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.860588392650243e-05, "grad_norm": 17.162214279174805, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8794863224029541, "num_tokens": 420123706.0, "step": 11008 }, { "epoch": 1.4004579570029259, "ewc_loss": 0.028535917401313782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.853591831808444e-05, "grad_norm": 17.27712631225586, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8655930757522583, "num_tokens": 420156723.0, "step": 11009 }, { "epoch": 1.4005851672815164, "ewc_loss": 0.028643762692809105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8643762561841868e-05, "grad_norm": 17.222103118896484, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8680796027183533, "num_tokens": 420198519.0, "step": 11010 }, { "epoch": 1.400712377560107, "ewc_loss": 0.02851364202797413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8513641154859215e-05, "grad_norm": 17.256006240844727, "learning_rate": 1e-06, "loss": 0.3662, "mean_token_accuracy": 0.8806621432304382, "num_tokens": 420231722.0, "step": 11011 }, { "epoch": 1.4008395878386974, "ewc_loss": 0.028567630797624588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.856763057934586e-05, "grad_norm": 17.25105094909668, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8711889386177063, "num_tokens": 420272059.0, "step": 11012 }, { "epoch": 1.4009667981172877, "ewc_loss": 0.028511513024568558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8511512937257066e-05, "grad_norm": 17.237815856933594, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8750200867652893, "num_tokens": 420307695.0, "step": 11013 }, { "epoch": 1.4010940083958783, "ewc_loss": 0.02848701737821102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8487016606959514e-05, "grad_norm": 17.212623596191406, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8781397342681885, "num_tokens": 420345989.0, "step": 11014 }, { "epoch": 1.4012212186744688, "ewc_loss": 0.0285112876445055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8511287382571027e-05, "grad_norm": 17.305700302124023, "learning_rate": 1e-06, "loss": 0.3914, "mean_token_accuracy": 0.8744544982910156, "num_tokens": 420387194.0, "step": 11015 }, { "epoch": 1.4013484289530593, "ewc_loss": 0.028540220111608505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8540220228023827e-05, "grad_norm": 17.199979782104492, "learning_rate": 1e-06, "loss": 0.3579, "mean_token_accuracy": 0.8834574222564697, "num_tokens": 420427608.0, "step": 11016 }, { "epoch": 1.4014756392316499, "ewc_loss": 0.02846117690205574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8461176043492742e-05, "grad_norm": 17.26757049560547, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8745982646942139, "num_tokens": 420461363.0, "step": 11017 }, { "epoch": 1.4016028495102404, "ewc_loss": 0.028539013117551804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8539012419059873e-05, "grad_norm": 17.21473503112793, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8501531481742859, "num_tokens": 420503577.0, "step": 11018 }, { "epoch": 1.401730059788831, "ewc_loss": 0.028460904955863953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8460905014071614e-05, "grad_norm": 17.227327346801758, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.8786410689353943, "num_tokens": 420538852.0, "step": 11019 }, { "epoch": 1.4018572700674214, "ewc_loss": 0.02850918471813202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8509184630820528e-05, "grad_norm": 17.224105834960938, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8657759428024292, "num_tokens": 420577495.0, "step": 11020 }, { "epoch": 1.401984480346012, "ewc_loss": 0.028482303023338318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8482303605414927e-05, "grad_norm": 17.205747604370117, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.868205726146698, "num_tokens": 420616123.0, "step": 11021 }, { "epoch": 1.4021116906246025, "ewc_loss": 0.028490468859672546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8490469048847444e-05, "grad_norm": 17.283689498901367, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8779369592666626, "num_tokens": 420653904.0, "step": 11022 }, { "epoch": 1.402238900903193, "ewc_loss": 0.02849975787103176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8499758627731353e-05, "grad_norm": 17.17975425720215, "learning_rate": 1e-06, "loss": 0.3471, "mean_token_accuracy": 0.8921558260917664, "num_tokens": 420693539.0, "step": 11023 }, { "epoch": 1.4023661111817836, "ewc_loss": 0.028381086885929108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.838108775904402e-05, "grad_norm": 17.22077178955078, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8728637099266052, "num_tokens": 420729046.0, "step": 11024 }, { "epoch": 1.402493321460374, "ewc_loss": 0.028494689613580704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.849468910426367e-05, "grad_norm": 17.253786087036133, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8722310662269592, "num_tokens": 420766143.0, "step": 11025 }, { "epoch": 1.4026205317389646, "ewc_loss": 0.028408223763108253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.840822344296612e-05, "grad_norm": 17.185108184814453, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8655985593795776, "num_tokens": 420806367.0, "step": 11026 }, { "epoch": 1.4027477420175551, "ewc_loss": 0.02843090333044529, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8430902602849528e-05, "grad_norm": 17.16900062561035, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8755910396575928, "num_tokens": 420845081.0, "step": 11027 }, { "epoch": 1.4028749522961454, "ewc_loss": 0.028415609151124954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8415608539944515e-05, "grad_norm": 17.158422470092773, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8737183809280396, "num_tokens": 420885833.0, "step": 11028 }, { "epoch": 1.403002162574736, "ewc_loss": 0.02842114493250847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8421145543688908e-05, "grad_norm": 17.280397415161133, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8742212653160095, "num_tokens": 420916313.0, "step": 11029 }, { "epoch": 1.4031293728533265, "ewc_loss": 0.028459720313549042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8459720851969905e-05, "grad_norm": 17.19050407409668, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8516793251037598, "num_tokens": 420957011.0, "step": 11030 }, { "epoch": 1.403256583131917, "ewc_loss": 0.028361478820443153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8361479053273797e-05, "grad_norm": 17.211511611938477, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.872744083404541, "num_tokens": 420994815.0, "step": 11031 }, { "epoch": 1.4033837934105076, "ewc_loss": 0.028439978137612343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.843997754098382e-05, "grad_norm": 17.246795654296875, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.876214861869812, "num_tokens": 421033977.0, "step": 11032 }, { "epoch": 1.403511003689098, "ewc_loss": 0.028406936675310135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.840693741745781e-05, "grad_norm": 17.208969116210938, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8728558421134949, "num_tokens": 421068871.0, "step": 11033 }, { "epoch": 1.4036382139676886, "ewc_loss": 0.028339577838778496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8339578420855105e-05, "grad_norm": 17.22620964050293, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8700982332229614, "num_tokens": 421105680.0, "step": 11034 }, { "epoch": 1.4037654242462791, "ewc_loss": 0.028430424630641937, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8430424208636396e-05, "grad_norm": 17.26382827758789, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.8813164234161377, "num_tokens": 421149519.0, "step": 11035 }, { "epoch": 1.4038926345248697, "ewc_loss": 0.02830958552658558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8309585104580037e-05, "grad_norm": 17.188438415527344, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.870658278465271, "num_tokens": 421190195.0, "step": 11036 }, { "epoch": 1.40401984480346, "ewc_loss": 0.028330720961093903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.833072176144924e-05, "grad_norm": 17.276676177978516, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8643895983695984, "num_tokens": 421218637.0, "step": 11037 }, { "epoch": 1.4041470550820505, "ewc_loss": 0.02835862524807453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8358625058899634e-05, "grad_norm": 17.2087345123291, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8730177879333496, "num_tokens": 421261300.0, "step": 11038 }, { "epoch": 1.404274265360641, "ewc_loss": 0.028297817334532738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.82978180621285e-05, "grad_norm": 17.205169677734375, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.865280032157898, "num_tokens": 421297250.0, "step": 11039 }, { "epoch": 1.4044014756392316, "ewc_loss": 0.028375767171382904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.837576721503865e-05, "grad_norm": 17.262773513793945, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8615544438362122, "num_tokens": 421334213.0, "step": 11040 }, { "epoch": 1.404528685917822, "ewc_loss": 0.028362037613987923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8362037483020686e-05, "grad_norm": 17.18589973449707, "learning_rate": 1e-06, "loss": 0.3694, "mean_token_accuracy": 0.8758447766304016, "num_tokens": 421368341.0, "step": 11041 }, { "epoch": 1.4046558961964126, "ewc_loss": 0.028341952711343765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8341952202026732e-05, "grad_norm": 17.220760345458984, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8564199209213257, "num_tokens": 421408278.0, "step": 11042 }, { "epoch": 1.4047831064750032, "ewc_loss": 0.02836630865931511, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.836630847014021e-05, "grad_norm": 17.264978408813477, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8617011308670044, "num_tokens": 421442559.0, "step": 11043 }, { "epoch": 1.4049103167535937, "ewc_loss": 0.028375940397381783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8375940019031987e-05, "grad_norm": 17.207319259643555, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8668828010559082, "num_tokens": 421474014.0, "step": 11044 }, { "epoch": 1.4050375270321842, "ewc_loss": 0.02832220308482647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8322203434072435e-05, "grad_norm": 17.127262115478516, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8443624377250671, "num_tokens": 421511193.0, "step": 11045 }, { "epoch": 1.4051647373107747, "ewc_loss": 0.02844790741801262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.844790833478328e-05, "grad_norm": 17.233060836791992, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8695566654205322, "num_tokens": 421547895.0, "step": 11046 }, { "epoch": 1.4052919475893653, "ewc_loss": 0.02841511368751526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.841511377482675e-05, "grad_norm": 17.17756462097168, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.868331789970398, "num_tokens": 421582883.0, "step": 11047 }, { "epoch": 1.4054191578679558, "ewc_loss": 0.02845042757689953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.845042763510719e-05, "grad_norm": 17.24418830871582, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8705874681472778, "num_tokens": 421619838.0, "step": 11048 }, { "epoch": 1.4055463681465463, "ewc_loss": 0.028449134901165962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.844913433364127e-05, "grad_norm": 17.27218246459961, "learning_rate": 1e-06, "loss": 0.3822, "mean_token_accuracy": 0.8782316446304321, "num_tokens": 421659451.0, "step": 11049 }, { "epoch": 1.4056735784251368, "ewc_loss": 0.028377551585435867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8377551643643528e-05, "grad_norm": 17.17274284362793, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.866595983505249, "num_tokens": 421704826.0, "step": 11050 }, { "epoch": 1.4058007887037274, "ewc_loss": 0.028480859473347664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.848085932782851e-05, "grad_norm": 17.354394912719727, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8651493787765503, "num_tokens": 421748953.0, "step": 11051 }, { "epoch": 1.405927998982318, "ewc_loss": 0.028454739600419998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8454740458982997e-05, "grad_norm": 17.25464630126953, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8517502546310425, "num_tokens": 421786614.0, "step": 11052 }, { "epoch": 1.4060552092609082, "ewc_loss": 0.028380602598190308, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8380602088873275e-05, "grad_norm": 17.264997482299805, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8628499507904053, "num_tokens": 421817291.0, "step": 11053 }, { "epoch": 1.4061824195394987, "ewc_loss": 0.02843083068728447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8430829843273386e-05, "grad_norm": 17.209754943847656, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.866495668888092, "num_tokens": 421858200.0, "step": 11054 }, { "epoch": 1.4063096298180893, "ewc_loss": 0.028407389298081398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.840738852682989e-05, "grad_norm": 17.21396827697754, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8689860701560974, "num_tokens": 421903343.0, "step": 11055 }, { "epoch": 1.4064368400966798, "ewc_loss": 0.028448021039366722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8448021112126298e-05, "grad_norm": 17.201679229736328, "learning_rate": 1e-06, "loss": 0.3618, "mean_token_accuracy": 0.8853050470352173, "num_tokens": 421941426.0, "step": 11056 }, { "epoch": 1.4065640503752703, "ewc_loss": 0.028419917449355125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8419917725841515e-05, "grad_norm": 17.19536781311035, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8651396632194519, "num_tokens": 421983992.0, "step": 11057 }, { "epoch": 1.4066912606538609, "ewc_loss": 0.02841527760028839, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.841527748387307e-05, "grad_norm": 17.253677368164062, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8765690326690674, "num_tokens": 422015066.0, "step": 11058 }, { "epoch": 1.4068184709324514, "ewc_loss": 0.02840554341673851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8405544071574695e-05, "grad_norm": 17.17620277404785, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8704811334609985, "num_tokens": 422052392.0, "step": 11059 }, { "epoch": 1.406945681211042, "ewc_loss": 0.0284290574491024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8429058147594333e-05, "grad_norm": 17.249713897705078, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8639895915985107, "num_tokens": 422091453.0, "step": 11060 }, { "epoch": 1.4070728914896324, "ewc_loss": 0.028423670679330826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.842367030098103e-05, "grad_norm": 17.222881317138672, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.852888822555542, "num_tokens": 422131019.0, "step": 11061 }, { "epoch": 1.4072001017682227, "ewc_loss": 0.028369275853037834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8369275241857395e-05, "grad_norm": 17.17221450805664, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8624399304389954, "num_tokens": 422172139.0, "step": 11062 }, { "epoch": 1.4073273120468133, "ewc_loss": 0.028389902785420418, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8389902581693605e-05, "grad_norm": 17.17993927001953, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8753772974014282, "num_tokens": 422209708.0, "step": 11063 }, { "epoch": 1.4074545223254038, "ewc_loss": 0.028405895456671715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.840589513652958e-05, "grad_norm": 17.2438907623291, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8697682619094849, "num_tokens": 422245091.0, "step": 11064 }, { "epoch": 1.4075817326039943, "ewc_loss": 0.028454219922423363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8454220228013583e-05, "grad_norm": 17.28080940246582, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8696454763412476, "num_tokens": 422280054.0, "step": 11065 }, { "epoch": 1.4077089428825849, "ewc_loss": 0.028400521725416183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8400521841831505e-05, "grad_norm": 17.16542625427246, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8748091459274292, "num_tokens": 422310036.0, "step": 11066 }, { "epoch": 1.4078361531611754, "ewc_loss": 0.02841748297214508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8417482099030167e-05, "grad_norm": 17.167930603027344, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8529355525970459, "num_tokens": 422347914.0, "step": 11067 }, { "epoch": 1.407963363439766, "ewc_loss": 0.02845901809632778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8459018722060136e-05, "grad_norm": 17.281538009643555, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8705226182937622, "num_tokens": 422383207.0, "step": 11068 }, { "epoch": 1.4080905737183564, "ewc_loss": 0.028494171798229218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8494172511273064e-05, "grad_norm": 17.2272891998291, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8731644749641418, "num_tokens": 422414675.0, "step": 11069 }, { "epoch": 1.408217783996947, "ewc_loss": 0.028453079983592033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.845307972165756e-05, "grad_norm": 17.255966186523438, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8573247790336609, "num_tokens": 422449746.0, "step": 11070 }, { "epoch": 1.4083449942755375, "ewc_loss": 0.02847244404256344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.847244468284771e-05, "grad_norm": 17.21381950378418, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8621420860290527, "num_tokens": 422487789.0, "step": 11071 }, { "epoch": 1.408472204554128, "ewc_loss": 0.028446806594729424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.844680602720473e-05, "grad_norm": 17.184297561645508, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.8744808435440063, "num_tokens": 422521750.0, "step": 11072 }, { "epoch": 1.4085994148327186, "ewc_loss": 0.028478218242526054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8478218155214563e-05, "grad_norm": 17.198925018310547, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8778243660926819, "num_tokens": 422562923.0, "step": 11073 }, { "epoch": 1.408726625111309, "ewc_loss": 0.02848728932440281, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8487289455370046e-05, "grad_norm": 17.097280502319336, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.857831597328186, "num_tokens": 422606832.0, "step": 11074 }, { "epoch": 1.4088538353898996, "ewc_loss": 0.028510434553027153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8510434276540764e-05, "grad_norm": 17.236623764038086, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8724033832550049, "num_tokens": 422644465.0, "step": 11075 }, { "epoch": 1.4089810456684901, "ewc_loss": 0.02858574688434601, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8585747713805176e-05, "grad_norm": 17.23422622680664, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.876977801322937, "num_tokens": 422681204.0, "step": 11076 }, { "epoch": 1.4091082559470804, "ewc_loss": 0.02851785346865654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8517853934317827e-05, "grad_norm": 17.229631423950195, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8583283424377441, "num_tokens": 422716325.0, "step": 11077 }, { "epoch": 1.409235466225671, "ewc_loss": 0.028513124212622643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8513124561868608e-05, "grad_norm": 17.183687210083008, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8576375842094421, "num_tokens": 422753644.0, "step": 11078 }, { "epoch": 1.4093626765042615, "ewc_loss": 0.02852499671280384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8524997105705552e-05, "grad_norm": 17.207786560058594, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.868527352809906, "num_tokens": 422790893.0, "step": 11079 }, { "epoch": 1.409489886782852, "ewc_loss": 0.028574110940098763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8574111638590693e-05, "grad_norm": 17.207212448120117, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8604210615158081, "num_tokens": 422829400.0, "step": 11080 }, { "epoch": 1.4096170970614426, "ewc_loss": 0.02849726751446724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.84972684312379e-05, "grad_norm": 17.148916244506836, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8779901266098022, "num_tokens": 422867386.0, "step": 11081 }, { "epoch": 1.409744307340033, "ewc_loss": 0.02852792851626873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8527929316624068e-05, "grad_norm": 17.179058074951172, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8710894584655762, "num_tokens": 422910736.0, "step": 11082 }, { "epoch": 1.4098715176186236, "ewc_loss": 0.028520645573735237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.852064608305227e-05, "grad_norm": 17.19736099243164, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8730152249336243, "num_tokens": 422950302.0, "step": 11083 }, { "epoch": 1.4099987278972141, "ewc_loss": 0.028539272025227547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.853927253454458e-05, "grad_norm": 17.211471557617188, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8676701188087463, "num_tokens": 422985892.0, "step": 11084 }, { "epoch": 1.4101259381758047, "ewc_loss": 0.028558773919939995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8558773919939995e-05, "grad_norm": 17.197036743164062, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8604589700698853, "num_tokens": 423020075.0, "step": 11085 }, { "epoch": 1.410253148454395, "ewc_loss": 0.02857823669910431, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8578237106557935e-05, "grad_norm": 17.158573150634766, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8715674877166748, "num_tokens": 423058260.0, "step": 11086 }, { "epoch": 1.4103803587329855, "ewc_loss": 0.028545837849378586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8545837267301977e-05, "grad_norm": 17.211191177368164, "learning_rate": 1e-06, "loss": 0.3361, "mean_token_accuracy": 0.8918951153755188, "num_tokens": 423092158.0, "step": 11087 }, { "epoch": 1.410507569011576, "ewc_loss": 0.028562072664499283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8562071747728623e-05, "grad_norm": 17.19536781311035, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8657331466674805, "num_tokens": 423123999.0, "step": 11088 }, { "epoch": 1.4106347792901666, "ewc_loss": 0.028559992089867592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.855999264284037e-05, "grad_norm": 17.214799880981445, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8648036122322083, "num_tokens": 423167111.0, "step": 11089 }, { "epoch": 1.410761989568757, "ewc_loss": 0.028579125180840492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8579124773386866e-05, "grad_norm": 17.262638092041016, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8612546324729919, "num_tokens": 423202060.0, "step": 11090 }, { "epoch": 1.4108891998473476, "ewc_loss": 0.028531162068247795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.853116166079417e-05, "grad_norm": 17.177001953125, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8597919344902039, "num_tokens": 423241200.0, "step": 11091 }, { "epoch": 1.4110164101259381, "ewc_loss": 0.02855456806719303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8554568416438997e-05, "grad_norm": 17.260744094848633, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.8775590062141418, "num_tokens": 423284713.0, "step": 11092 }, { "epoch": 1.4111436204045287, "ewc_loss": 0.02860029973089695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8600299629033543e-05, "grad_norm": 17.20438575744629, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8710604310035706, "num_tokens": 423322524.0, "step": 11093 }, { "epoch": 1.4112708306831192, "ewc_loss": 0.02857269160449505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8572691007866524e-05, "grad_norm": 17.242422103881836, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8694416880607605, "num_tokens": 423360551.0, "step": 11094 }, { "epoch": 1.4113980409617097, "ewc_loss": 0.028524965047836304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8524964363896288e-05, "grad_norm": 17.151498794555664, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8789794445037842, "num_tokens": 423395227.0, "step": 11095 }, { "epoch": 1.4115252512403003, "ewc_loss": 0.028547123074531555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8547123292810284e-05, "grad_norm": 17.244625091552734, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.855290412902832, "num_tokens": 423434757.0, "step": 11096 }, { "epoch": 1.4116524615188908, "ewc_loss": 0.02858109585940838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.858109655790031e-05, "grad_norm": 17.17231559753418, "learning_rate": 1e-06, "loss": 0.3544, "mean_token_accuracy": 0.886115550994873, "num_tokens": 423464599.0, "step": 11097 }, { "epoch": 1.4117796717974813, "ewc_loss": 0.028563862666487694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8563863452291116e-05, "grad_norm": 17.206748962402344, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8619983792304993, "num_tokens": 423504327.0, "step": 11098 }, { "epoch": 1.4119068820760718, "ewc_loss": 0.02860773541033268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.860773565771524e-05, "grad_norm": 17.312236785888672, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8492525219917297, "num_tokens": 423543114.0, "step": 11099 }, { "epoch": 1.4120340923546624, "ewc_loss": 0.02859756536781788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8597565687960014e-05, "grad_norm": 17.2325382232666, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8568629622459412, "num_tokens": 423581727.0, "step": 11100 }, { "epoch": 1.412161302633253, "ewc_loss": 0.02851613610982895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.851613680832088e-05, "grad_norm": 17.162715911865234, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8724640011787415, "num_tokens": 423617073.0, "step": 11101 }, { "epoch": 1.4122885129118432, "ewc_loss": 0.028597218915820122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8597218260983936e-05, "grad_norm": 17.180944442749023, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8721933960914612, "num_tokens": 423664922.0, "step": 11102 }, { "epoch": 1.4124157231904337, "ewc_loss": 0.02862590178847313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.862590190488845e-05, "grad_norm": 17.26668930053711, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8600273728370667, "num_tokens": 423706990.0, "step": 11103 }, { "epoch": 1.4125429334690243, "ewc_loss": 0.028553767129778862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8553766242112033e-05, "grad_norm": 17.211376190185547, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.852017879486084, "num_tokens": 423745945.0, "step": 11104 }, { "epoch": 1.4126701437476148, "ewc_loss": 0.028528226539492607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8528225811896846e-05, "grad_norm": 17.278366088867188, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.874016523361206, "num_tokens": 423780824.0, "step": 11105 }, { "epoch": 1.4127973540262053, "ewc_loss": 0.028541356325149536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8541357096401043e-05, "grad_norm": 17.217517852783203, "learning_rate": 1e-06, "loss": 0.3649, "mean_token_accuracy": 0.8805948495864868, "num_tokens": 423818640.0, "step": 11106 }, { "epoch": 1.4129245643047958, "ewc_loss": 0.028515057638287544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8515058147604577e-05, "grad_norm": 17.30922508239746, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8777933716773987, "num_tokens": 423856657.0, "step": 11107 }, { "epoch": 1.4130517745833864, "ewc_loss": 0.028511200100183487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8511200071079656e-05, "grad_norm": 17.160083770751953, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.8800344467163086, "num_tokens": 423900288.0, "step": 11108 }, { "epoch": 1.413178984861977, "ewc_loss": 0.02840700000524521, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8406999263097532e-05, "grad_norm": 17.158000946044922, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8744399547576904, "num_tokens": 423935929.0, "step": 11109 }, { "epoch": 1.4133061951405674, "ewc_loss": 0.02850007452070713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.850007513188757e-05, "grad_norm": 17.301410675048828, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8672522902488708, "num_tokens": 423970719.0, "step": 11110 }, { "epoch": 1.4134334054191577, "ewc_loss": 0.028480734676122665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8480733817559667e-05, "grad_norm": 17.199810028076172, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8705933690071106, "num_tokens": 424012338.0, "step": 11111 }, { "epoch": 1.4135606156977483, "ewc_loss": 0.028418544679880142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8418544388841838e-05, "grad_norm": 17.241840362548828, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8519093990325928, "num_tokens": 424050112.0, "step": 11112 }, { "epoch": 1.4136878259763388, "ewc_loss": 0.02849605120718479, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8496051527326927e-05, "grad_norm": 17.144468307495117, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8702799081802368, "num_tokens": 424092688.0, "step": 11113 }, { "epoch": 1.4138150362549293, "ewc_loss": 0.028425032272934914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8425032724044286e-05, "grad_norm": 17.26020622253418, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8660251498222351, "num_tokens": 424131355.0, "step": 11114 }, { "epoch": 1.4139422465335199, "ewc_loss": 0.02852211520075798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8522115826490335e-05, "grad_norm": 17.210424423217773, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8591219782829285, "num_tokens": 424177768.0, "step": 11115 }, { "epoch": 1.4140694568121104, "ewc_loss": 0.02842528186738491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.842528192559257e-05, "grad_norm": 17.196849822998047, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8817040324211121, "num_tokens": 424215834.0, "step": 11116 }, { "epoch": 1.414196667090701, "ewc_loss": 0.02844412252306938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.84441230178345e-05, "grad_norm": 17.218172073364258, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8728533983230591, "num_tokens": 424260002.0, "step": 11117 }, { "epoch": 1.4143238773692914, "ewc_loss": 0.0283975750207901, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.839757507899776e-05, "grad_norm": 17.213134765625, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8673005104064941, "num_tokens": 424299343.0, "step": 11118 }, { "epoch": 1.414451087647882, "ewc_loss": 0.028413567692041397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8413567633833736e-05, "grad_norm": 17.21246910095215, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8621981143951416, "num_tokens": 424340312.0, "step": 11119 }, { "epoch": 1.4145782979264725, "ewc_loss": 0.028372760862112045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.837276042555459e-05, "grad_norm": 17.16849136352539, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8772518634796143, "num_tokens": 424376658.0, "step": 11120 }, { "epoch": 1.414705508205063, "ewc_loss": 0.028466390445828438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8466391086112708e-05, "grad_norm": 17.196300506591797, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.8802910447120667, "num_tokens": 424415499.0, "step": 11121 }, { "epoch": 1.4148327184836536, "ewc_loss": 0.028443315997719765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8443315386539325e-05, "grad_norm": 17.25933837890625, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8701625466346741, "num_tokens": 424453401.0, "step": 11122 }, { "epoch": 1.414959928762244, "ewc_loss": 0.028475433588027954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8475433282437734e-05, "grad_norm": 17.187498092651367, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8724780082702637, "num_tokens": 424490654.0, "step": 11123 }, { "epoch": 1.4150871390408346, "ewc_loss": 0.028372200205922127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8372200176818296e-05, "grad_norm": 17.174962997436523, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8781812191009521, "num_tokens": 424534469.0, "step": 11124 }, { "epoch": 1.4152143493194251, "ewc_loss": 0.02842794358730316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.842794310708996e-05, "grad_norm": 17.18783950805664, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8601683378219604, "num_tokens": 424570187.0, "step": 11125 }, { "epoch": 1.4153415595980154, "ewc_loss": 0.028462231159210205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.84622310573468e-05, "grad_norm": 17.2907657623291, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8498165607452393, "num_tokens": 424607898.0, "step": 11126 }, { "epoch": 1.415468769876606, "ewc_loss": 0.028434790670871735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8434789783204906e-05, "grad_norm": 17.20742416381836, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8718222379684448, "num_tokens": 424652156.0, "step": 11127 }, { "epoch": 1.4155959801551965, "ewc_loss": 0.028401236981153488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.84012367046671e-05, "grad_norm": 17.227081298828125, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8588874340057373, "num_tokens": 424688735.0, "step": 11128 }, { "epoch": 1.415723190433787, "ewc_loss": 0.028475236147642136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.847523683158215e-05, "grad_norm": 17.2396297454834, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8693782687187195, "num_tokens": 424732859.0, "step": 11129 }, { "epoch": 1.4158504007123776, "ewc_loss": 0.02841760218143463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.84176021523308e-05, "grad_norm": 17.176179885864258, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8607429265975952, "num_tokens": 424769381.0, "step": 11130 }, { "epoch": 1.415977610990968, "ewc_loss": 0.02846740372478962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8467404263210483e-05, "grad_norm": 17.23799705505371, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.869852602481842, "num_tokens": 424802362.0, "step": 11131 }, { "epoch": 1.4161048212695586, "ewc_loss": 0.028469379991292953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8469379685702734e-05, "grad_norm": 17.212553024291992, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8661720752716064, "num_tokens": 424837897.0, "step": 11132 }, { "epoch": 1.4162320315481491, "ewc_loss": 0.028394414111971855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.83944136754144e-05, "grad_norm": 17.095348358154297, "learning_rate": 1e-06, "loss": 0.3662, "mean_token_accuracy": 0.8807512521743774, "num_tokens": 424878154.0, "step": 11133 }, { "epoch": 1.4163592418267397, "ewc_loss": 0.028467655181884766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8467655283748172e-05, "grad_norm": 17.260570526123047, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8750501871109009, "num_tokens": 424915063.0, "step": 11134 }, { "epoch": 1.41648645210533, "ewc_loss": 0.028491878882050514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8491878765635192e-05, "grad_norm": 17.189472198486328, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8677370548248291, "num_tokens": 424955159.0, "step": 11135 }, { "epoch": 1.4166136623839205, "ewc_loss": 0.02847919426858425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8479194952524267e-05, "grad_norm": 17.201156616210938, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8610540628433228, "num_tokens": 424992573.0, "step": 11136 }, { "epoch": 1.416740872662511, "ewc_loss": 0.02851315587759018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8513155484688468e-05, "grad_norm": 17.186914443969727, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8735828399658203, "num_tokens": 425030452.0, "step": 11137 }, { "epoch": 1.4168680829411016, "ewc_loss": 0.028451843187212944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8451842808863148e-05, "grad_norm": 17.23474884033203, "learning_rate": 1e-06, "loss": 0.3667, "mean_token_accuracy": 0.8824305534362793, "num_tokens": 425067118.0, "step": 11138 }, { "epoch": 1.416995293219692, "ewc_loss": 0.028517501428723335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.851750105037354e-05, "grad_norm": 17.203235626220703, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8600278496742249, "num_tokens": 425101684.0, "step": 11139 }, { "epoch": 1.4171225034982826, "ewc_loss": 0.02847198024392128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8471980840549804e-05, "grad_norm": 17.222537994384766, "learning_rate": 1e-06, "loss": 0.3771, "mean_token_accuracy": 0.8776301145553589, "num_tokens": 425143127.0, "step": 11140 }, { "epoch": 1.4172497137768731, "ewc_loss": 0.028540197759866714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8540198400150985e-05, "grad_norm": 17.240943908691406, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8698955774307251, "num_tokens": 425173912.0, "step": 11141 }, { "epoch": 1.4173769240554637, "ewc_loss": 0.028522612527012825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8522612410597503e-05, "grad_norm": 17.303359985351562, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8739819526672363, "num_tokens": 425210146.0, "step": 11142 }, { "epoch": 1.4175041343340542, "ewc_loss": 0.028473325073719025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8473325073719025e-05, "grad_norm": 17.133464813232422, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8811814188957214, "num_tokens": 425246082.0, "step": 11143 }, { "epoch": 1.4176313446126447, "ewc_loss": 0.028446562588214874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8446562282624654e-05, "grad_norm": 17.23596954345703, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.865746259689331, "num_tokens": 425287470.0, "step": 11144 }, { "epoch": 1.4177585548912353, "ewc_loss": 0.028561312705278397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8561313229147345e-05, "grad_norm": 17.2402400970459, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8657339811325073, "num_tokens": 425326553.0, "step": 11145 }, { "epoch": 1.4178857651698258, "ewc_loss": 0.02849910967051983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8499109248514287e-05, "grad_norm": 17.248023986816406, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8623375296592712, "num_tokens": 425363105.0, "step": 11146 }, { "epoch": 1.4180129754484163, "ewc_loss": 0.028546636924147606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8546637622639537e-05, "grad_norm": 17.19916534423828, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8638095855712891, "num_tokens": 425402513.0, "step": 11147 }, { "epoch": 1.4181401857270068, "ewc_loss": 0.02852650173008442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8526501409942284e-05, "grad_norm": 17.197040557861328, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8702865839004517, "num_tokens": 425440132.0, "step": 11148 }, { "epoch": 1.4182673960055974, "ewc_loss": 0.02854090929031372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.854090962500777e-05, "grad_norm": 17.247947692871094, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8683575391769409, "num_tokens": 425481738.0, "step": 11149 }, { "epoch": 1.418394606284188, "ewc_loss": 0.028562812134623528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8562812076415867e-05, "grad_norm": 17.22274398803711, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8678466081619263, "num_tokens": 425516916.0, "step": 11150 }, { "epoch": 1.4185218165627782, "ewc_loss": 0.02853935770690441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8539358027046546e-05, "grad_norm": 17.221738815307617, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8686363697052002, "num_tokens": 425555951.0, "step": 11151 }, { "epoch": 1.4186490268413687, "ewc_loss": 0.028562959283590317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8562959414557554e-05, "grad_norm": 17.245697021484375, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8695105314254761, "num_tokens": 425595302.0, "step": 11152 }, { "epoch": 1.4187762371199593, "ewc_loss": 0.028613632544875145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8613632821361534e-05, "grad_norm": 17.34002113342285, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8725736141204834, "num_tokens": 425623281.0, "step": 11153 }, { "epoch": 1.4189034473985498, "ewc_loss": 0.02860717475414276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8607175408978947e-05, "grad_norm": 17.236486434936523, "learning_rate": 1e-06, "loss": 0.3798, "mean_token_accuracy": 0.8775749802589417, "num_tokens": 425659124.0, "step": 11154 }, { "epoch": 1.4190306576771403, "ewc_loss": 0.02852681651711464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8526816095109098e-05, "grad_norm": 17.242900848388672, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8566983938217163, "num_tokens": 425695809.0, "step": 11155 }, { "epoch": 1.4191578679557308, "ewc_loss": 0.028549112379550934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.854911144822836e-05, "grad_norm": 17.223329544067383, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8595312833786011, "num_tokens": 425731890.0, "step": 11156 }, { "epoch": 1.4192850782343214, "ewc_loss": 0.02855815552175045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.855815546354279e-05, "grad_norm": 17.2492618560791, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.877564549446106, "num_tokens": 425770899.0, "step": 11157 }, { "epoch": 1.419412288512912, "ewc_loss": 0.02858617901802063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8586178814293817e-05, "grad_norm": 17.239513397216797, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.8798489570617676, "num_tokens": 425807812.0, "step": 11158 }, { "epoch": 1.4195394987915024, "ewc_loss": 0.02857462875545025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.85746282315813e-05, "grad_norm": 17.283456802368164, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8650062084197998, "num_tokens": 425848997.0, "step": 11159 }, { "epoch": 1.4196667090700927, "ewc_loss": 0.02861875668168068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8618756914511323e-05, "grad_norm": 17.231975555419922, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8796171545982361, "num_tokens": 425887816.0, "step": 11160 }, { "epoch": 1.4197939193486833, "ewc_loss": 0.028554178774356842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8554179152706638e-05, "grad_norm": 17.234479904174805, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8609827756881714, "num_tokens": 425925488.0, "step": 11161 }, { "epoch": 1.4199211296272738, "ewc_loss": 0.028606673702597618, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8606673367903568e-05, "grad_norm": 17.269344329833984, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.875008761882782, "num_tokens": 425965616.0, "step": 11162 }, { "epoch": 1.4200483399058643, "ewc_loss": 0.028561405837535858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8561405997606926e-05, "grad_norm": 17.209697723388672, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8625541925430298, "num_tokens": 426005308.0, "step": 11163 }, { "epoch": 1.4201755501844548, "ewc_loss": 0.02858082950115204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.858082916645799e-05, "grad_norm": 17.286348342895508, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8650543689727783, "num_tokens": 426042339.0, "step": 11164 }, { "epoch": 1.4203027604630454, "ewc_loss": 0.028628967702388763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8628966902033426e-05, "grad_norm": 17.298616409301758, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8686476349830627, "num_tokens": 426081925.0, "step": 11165 }, { "epoch": 1.420429970741636, "ewc_loss": 0.02853565476834774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8535654564620927e-05, "grad_norm": 17.214651107788086, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8634768128395081, "num_tokens": 426120918.0, "step": 11166 }, { "epoch": 1.4205571810202264, "ewc_loss": 0.028557565063238144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8557564291986637e-05, "grad_norm": 17.240310668945312, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8574299216270447, "num_tokens": 426158711.0, "step": 11167 }, { "epoch": 1.420684391298817, "ewc_loss": 0.02854592725634575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.854592639778275e-05, "grad_norm": 17.18684196472168, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.865188479423523, "num_tokens": 426197034.0, "step": 11168 }, { "epoch": 1.4208116015774075, "ewc_loss": 0.028503483161330223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8503483918029815e-05, "grad_norm": 17.257610321044922, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8616725206375122, "num_tokens": 426239390.0, "step": 11169 }, { "epoch": 1.420938811855998, "ewc_loss": 0.028583379462361336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.858337938960176e-05, "grad_norm": 17.25265884399414, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8583080172538757, "num_tokens": 426285488.0, "step": 11170 }, { "epoch": 1.4210660221345885, "ewc_loss": 0.028509026393294334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.850902637874242e-05, "grad_norm": 17.2899169921875, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8558749556541443, "num_tokens": 426323955.0, "step": 11171 }, { "epoch": 1.421193232413179, "ewc_loss": 0.02852717600762844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8527176255011e-05, "grad_norm": 17.186349868774414, "learning_rate": 1e-06, "loss": 0.3641, "mean_token_accuracy": 0.8867539167404175, "num_tokens": 426359460.0, "step": 11172 }, { "epoch": 1.4213204426917696, "ewc_loss": 0.028478456661105156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8478456442826428e-05, "grad_norm": 17.280824661254883, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8618383407592773, "num_tokens": 426397104.0, "step": 11173 }, { "epoch": 1.4214476529703601, "ewc_loss": 0.028530512005090714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8530512281577103e-05, "grad_norm": 17.203685760498047, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8773081302642822, "num_tokens": 426441349.0, "step": 11174 }, { "epoch": 1.4215748632489504, "ewc_loss": 0.028474068269133568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8474069040385075e-05, "grad_norm": 17.16709327697754, "learning_rate": 1e-06, "loss": 0.3635, "mean_token_accuracy": 0.8767132759094238, "num_tokens": 426478410.0, "step": 11175 }, { "epoch": 1.421702073527541, "ewc_loss": 0.028553130105137825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8553129595820792e-05, "grad_norm": 17.257339477539062, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8668913841247559, "num_tokens": 426519874.0, "step": 11176 }, { "epoch": 1.4218292838061315, "ewc_loss": 0.028515147045254707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.851514727808535e-05, "grad_norm": 17.195499420166016, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8570501804351807, "num_tokens": 426559970.0, "step": 11177 }, { "epoch": 1.421956494084722, "ewc_loss": 0.0285053551197052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8505355658126064e-05, "grad_norm": 17.289424896240234, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8700795769691467, "num_tokens": 426593897.0, "step": 11178 }, { "epoch": 1.4220837043633126, "ewc_loss": 0.028570903465151787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.857090294128284e-05, "grad_norm": 17.267648696899414, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8686254620552063, "num_tokens": 426633712.0, "step": 11179 }, { "epoch": 1.422210914641903, "ewc_loss": 0.028522755950689316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8522756110760383e-05, "grad_norm": 17.258211135864258, "learning_rate": 1e-06, "loss": 0.3665, "mean_token_accuracy": 0.883365273475647, "num_tokens": 426669085.0, "step": 11180 }, { "epoch": 1.4223381249204936, "ewc_loss": 0.028505032882094383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8505033697001636e-05, "grad_norm": 17.206029891967773, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8814064264297485, "num_tokens": 426709260.0, "step": 11181 }, { "epoch": 1.4224653351990841, "ewc_loss": 0.028456443920731544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8456443033064716e-05, "grad_norm": 17.153636932373047, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8577214479446411, "num_tokens": 426747747.0, "step": 11182 }, { "epoch": 1.4225925454776747, "ewc_loss": 0.028552966192364693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8552965886774473e-05, "grad_norm": 17.284391403198242, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8557693362236023, "num_tokens": 426788185.0, "step": 11183 }, { "epoch": 1.422719755756265, "ewc_loss": 0.028579827398061752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8579826903296635e-05, "grad_norm": 17.258636474609375, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8705246448516846, "num_tokens": 426828663.0, "step": 11184 }, { "epoch": 1.4228469660348555, "ewc_loss": 0.028460051864385605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.846005190804135e-05, "grad_norm": 17.22336196899414, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8589775562286377, "num_tokens": 426869205.0, "step": 11185 }, { "epoch": 1.422974176313446, "ewc_loss": 0.028551191091537476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8551190553116612e-05, "grad_norm": 17.211145401000977, "learning_rate": 1e-06, "loss": 0.3552, "mean_token_accuracy": 0.8846272230148315, "num_tokens": 426900561.0, "step": 11186 }, { "epoch": 1.4231013865920366, "ewc_loss": 0.028493257239460945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.849325755960308e-05, "grad_norm": 17.266765594482422, "learning_rate": 1e-06, "loss": 0.3593, "mean_token_accuracy": 0.883004903793335, "num_tokens": 426940618.0, "step": 11187 }, { "epoch": 1.423228596870627, "ewc_loss": 0.028574224561452866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8574224415933713e-05, "grad_norm": 17.342134475708008, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8602023720741272, "num_tokens": 426975509.0, "step": 11188 }, { "epoch": 1.4233558071492176, "ewc_loss": 0.028514418751001358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8514419682323933e-05, "grad_norm": 17.257591247558594, "learning_rate": 1e-06, "loss": 0.38, "mean_token_accuracy": 0.8774603605270386, "num_tokens": 427011647.0, "step": 11189 }, { "epoch": 1.4234830174278081, "ewc_loss": 0.028426066040992737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.84260659100255e-05, "grad_norm": 17.21414566040039, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8659895658493042, "num_tokens": 427046631.0, "step": 11190 }, { "epoch": 1.4236102277063987, "ewc_loss": 0.02853708155453205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.853708247130271e-05, "grad_norm": 17.352643966674805, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8663052916526794, "num_tokens": 427081364.0, "step": 11191 }, { "epoch": 1.4237374379849892, "ewc_loss": 0.02851717174053192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8517171813291498e-05, "grad_norm": 17.23340606689453, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8708198070526123, "num_tokens": 427118667.0, "step": 11192 }, { "epoch": 1.4238646482635797, "ewc_loss": 0.028432950377464294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.843295078491792e-05, "grad_norm": 17.263141632080078, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8740143775939941, "num_tokens": 427155017.0, "step": 11193 }, { "epoch": 1.4239918585421703, "ewc_loss": 0.028534820303320885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.85348196484847e-05, "grad_norm": 17.275062561035156, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8622918128967285, "num_tokens": 427191741.0, "step": 11194 }, { "epoch": 1.4241190688207608, "ewc_loss": 0.02845895104110241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8458951419452205e-05, "grad_norm": 17.245492935180664, "learning_rate": 1e-06, "loss": 0.3914, "mean_token_accuracy": 0.8724813461303711, "num_tokens": 427226210.0, "step": 11195 }, { "epoch": 1.4242462790993513, "ewc_loss": 0.028483960777521133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8483960704761557e-05, "grad_norm": 17.317258834838867, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8653002381324768, "num_tokens": 427261699.0, "step": 11196 }, { "epoch": 1.4243734893779418, "ewc_loss": 0.028532058000564575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8532058422570117e-05, "grad_norm": 17.289073944091797, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8588305711746216, "num_tokens": 427297158.0, "step": 11197 }, { "epoch": 1.4245006996565324, "ewc_loss": 0.028426388278603554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8426387871149927e-05, "grad_norm": 17.19583511352539, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8546780347824097, "num_tokens": 427338234.0, "step": 11198 }, { "epoch": 1.424627909935123, "ewc_loss": 0.028578441590070724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8578440833371133e-05, "grad_norm": 17.324058532714844, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8632803559303284, "num_tokens": 427376727.0, "step": 11199 }, { "epoch": 1.4247551202137132, "ewc_loss": 0.028538797050714493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8538797778310254e-05, "grad_norm": 17.206584930419922, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8804998397827148, "num_tokens": 427417095.0, "step": 11200 }, { "epoch": 1.4248823304923037, "ewc_loss": 0.02848052605986595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8480526452767663e-05, "grad_norm": 17.23989486694336, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8520534634590149, "num_tokens": 427453243.0, "step": 11201 }, { "epoch": 1.4250095407708943, "ewc_loss": 0.028635680675506592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8635680791921914e-05, "grad_norm": 17.207698822021484, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.871605396270752, "num_tokens": 427490961.0, "step": 11202 }, { "epoch": 1.4251367510494848, "ewc_loss": 0.02852385863661766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8523858418338932e-05, "grad_norm": 17.2136287689209, "learning_rate": 1e-06, "loss": 0.3718, "mean_token_accuracy": 0.8780405521392822, "num_tokens": 427521487.0, "step": 11203 }, { "epoch": 1.4252639613280753, "ewc_loss": 0.02860303968191147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8603039027075283e-05, "grad_norm": 17.271398544311523, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8654600977897644, "num_tokens": 427556607.0, "step": 11204 }, { "epoch": 1.4253911716066658, "ewc_loss": 0.028576917946338654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8576918339240365e-05, "grad_norm": 17.276763916015625, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.869351863861084, "num_tokens": 427598940.0, "step": 11205 }, { "epoch": 1.4255183818852564, "ewc_loss": 0.028605829924345016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8605829356820323e-05, "grad_norm": 17.2708797454834, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8588140606880188, "num_tokens": 427641337.0, "step": 11206 }, { "epoch": 1.425645592163847, "ewc_loss": 0.028618933632969856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8618933356483467e-05, "grad_norm": 17.25059700012207, "learning_rate": 1e-06, "loss": 0.384, "mean_token_accuracy": 0.8789882063865662, "num_tokens": 427680235.0, "step": 11207 }, { "epoch": 1.4257728024424374, "ewc_loss": 0.028549406677484512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8549406124511734e-05, "grad_norm": 17.272829055786133, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8690330386161804, "num_tokens": 427719918.0, "step": 11208 }, { "epoch": 1.4259000127210277, "ewc_loss": 0.028592029586434364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8592030503205024e-05, "grad_norm": 17.25910758972168, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8713169097900391, "num_tokens": 427758042.0, "step": 11209 }, { "epoch": 1.4260272229996183, "ewc_loss": 0.028530526906251907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.853052683349233e-05, "grad_norm": 17.28042221069336, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8707647323608398, "num_tokens": 427794591.0, "step": 11210 }, { "epoch": 1.4261544332782088, "ewc_loss": 0.028630083426833153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8630083761527203e-05, "grad_norm": 17.305870056152344, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8769500255584717, "num_tokens": 427828190.0, "step": 11211 }, { "epoch": 1.4262816435567993, "ewc_loss": 0.02852492779493332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8524927984108217e-05, "grad_norm": 17.307994842529297, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8609534502029419, "num_tokens": 427870363.0, "step": 11212 }, { "epoch": 1.4264088538353898, "ewc_loss": 0.028583046048879623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.858304651454091e-05, "grad_norm": 17.20745277404785, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8832278251647949, "num_tokens": 427910037.0, "step": 11213 }, { "epoch": 1.4265360641139804, "ewc_loss": 0.028480956330895424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.84809557342669e-05, "grad_norm": 17.31403160095215, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8682202100753784, "num_tokens": 427945782.0, "step": 11214 }, { "epoch": 1.426663274392571, "ewc_loss": 0.028525756672024727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8525757443276234e-05, "grad_norm": 17.15459632873535, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8766039609909058, "num_tokens": 427984635.0, "step": 11215 }, { "epoch": 1.4267904846711614, "ewc_loss": 0.02851238287985325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.851238241419196e-05, "grad_norm": 17.311872482299805, "learning_rate": 1e-06, "loss": 0.3523, "mean_token_accuracy": 0.8846778273582458, "num_tokens": 428024289.0, "step": 11216 }, { "epoch": 1.426917694949752, "ewc_loss": 0.028587903827428818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8587903216248378e-05, "grad_norm": 17.246904373168945, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8611451387405396, "num_tokens": 428067119.0, "step": 11217 }, { "epoch": 1.4270449052283425, "ewc_loss": 0.02846485935151577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8464859497034922e-05, "grad_norm": 17.321735382080078, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8674675226211548, "num_tokens": 428108150.0, "step": 11218 }, { "epoch": 1.427172115506933, "ewc_loss": 0.028519216924905777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8519216357381083e-05, "grad_norm": 17.222288131713867, "learning_rate": 1e-06, "loss": 0.3774, "mean_token_accuracy": 0.8798373341560364, "num_tokens": 428150839.0, "step": 11219 }, { "epoch": 1.4272993257855235, "ewc_loss": 0.028511883690953255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.851188401109539e-05, "grad_norm": 17.363399505615234, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8579308390617371, "num_tokens": 428194325.0, "step": 11220 }, { "epoch": 1.427426536064114, "ewc_loss": 0.028550904244184494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8550904971780255e-05, "grad_norm": 17.24108123779297, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8684990406036377, "num_tokens": 428230879.0, "step": 11221 }, { "epoch": 1.4275537463427046, "ewc_loss": 0.028440378606319427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.84403777186526e-05, "grad_norm": 17.3227481842041, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8799877762794495, "num_tokens": 428265795.0, "step": 11222 }, { "epoch": 1.4276809566212951, "ewc_loss": 0.028546232730150223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8546231988002546e-05, "grad_norm": 17.311458587646484, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.875190019607544, "num_tokens": 428300413.0, "step": 11223 }, { "epoch": 1.4278081668998854, "ewc_loss": 0.028419703245162964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8419703085091896e-05, "grad_norm": 17.21956443786621, "learning_rate": 1e-06, "loss": 0.3451, "mean_token_accuracy": 0.89039146900177, "num_tokens": 428339293.0, "step": 11224 }, { "epoch": 1.427935377178476, "ewc_loss": 0.02846108376979828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.846108327503316e-05, "grad_norm": 17.27329444885254, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8811259269714355, "num_tokens": 428374158.0, "step": 11225 }, { "epoch": 1.4280625874570665, "ewc_loss": 0.028497004881501198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8497004677774385e-05, "grad_norm": 17.278125762939453, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8699167966842651, "num_tokens": 428411715.0, "step": 11226 }, { "epoch": 1.428189797735657, "ewc_loss": 0.028463222086429596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.846322240657173e-05, "grad_norm": 17.234525680541992, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8587118983268738, "num_tokens": 428450000.0, "step": 11227 }, { "epoch": 1.4283170080142475, "ewc_loss": 0.028506437316536903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8506437956821173e-05, "grad_norm": 17.25904083251953, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8754917979240417, "num_tokens": 428484546.0, "step": 11228 }, { "epoch": 1.428444218292838, "ewc_loss": 0.02849932201206684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8499322070274502e-05, "grad_norm": 17.322010040283203, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8708077073097229, "num_tokens": 428527519.0, "step": 11229 }, { "epoch": 1.4285714285714286, "ewc_loss": 0.02853710763156414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.853710793715436e-05, "grad_norm": 17.225566864013672, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8660153150558472, "num_tokens": 428568744.0, "step": 11230 }, { "epoch": 1.4286986388500191, "ewc_loss": 0.02845245972275734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.845245944627095e-05, "grad_norm": 17.248403549194336, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8657706379890442, "num_tokens": 428613050.0, "step": 11231 }, { "epoch": 1.4288258491286097, "ewc_loss": 0.028507040813565254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8507040042313747e-05, "grad_norm": 17.20703887939453, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.86687171459198, "num_tokens": 428647334.0, "step": 11232 }, { "epoch": 1.4289530594072, "ewc_loss": 0.02850424125790596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.850424061762169e-05, "grad_norm": 17.317773818969727, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8756231665611267, "num_tokens": 428681015.0, "step": 11233 }, { "epoch": 1.4290802696857905, "ewc_loss": 0.028569266200065613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8569265850819647e-05, "grad_norm": 17.263364791870117, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8624330759048462, "num_tokens": 428718903.0, "step": 11234 }, { "epoch": 1.429207479964381, "ewc_loss": 0.02852601185441017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.852601210179273e-05, "grad_norm": 17.271976470947266, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.877748429775238, "num_tokens": 428756038.0, "step": 11235 }, { "epoch": 1.4293346902429716, "ewc_loss": 0.02857128717005253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8571286748046987e-05, "grad_norm": 17.277875900268555, "learning_rate": 1e-06, "loss": 0.3533, "mean_token_accuracy": 0.8857537508010864, "num_tokens": 428793908.0, "step": 11236 }, { "epoch": 1.429461900521562, "ewc_loss": 0.028505761176347733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8505761292763054e-05, "grad_norm": 17.221799850463867, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8627583980560303, "num_tokens": 428830924.0, "step": 11237 }, { "epoch": 1.4295891108001526, "ewc_loss": 0.028531743213534355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8531743737403303e-05, "grad_norm": 17.292007446289062, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8663967251777649, "num_tokens": 428869083.0, "step": 11238 }, { "epoch": 1.4297163210787431, "ewc_loss": 0.02854832448065281, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8548323825816624e-05, "grad_norm": 17.29855728149414, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8440968990325928, "num_tokens": 428903970.0, "step": 11239 }, { "epoch": 1.4298435313573337, "ewc_loss": 0.02854117378592491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8541173378471285e-05, "grad_norm": 17.242389678955078, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8744666576385498, "num_tokens": 428940710.0, "step": 11240 }, { "epoch": 1.4299707416359242, "ewc_loss": 0.028593428432941437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.859342930605635e-05, "grad_norm": 17.316953659057617, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8717197775840759, "num_tokens": 428981160.0, "step": 11241 }, { "epoch": 1.4300979519145147, "ewc_loss": 0.028571560978889465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8571561415446922e-05, "grad_norm": 17.26923370361328, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8711503148078918, "num_tokens": 429022464.0, "step": 11242 }, { "epoch": 1.4302251621931052, "ewc_loss": 0.02847701869904995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8477019441197626e-05, "grad_norm": 17.24570083618164, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.871103048324585, "num_tokens": 429060964.0, "step": 11243 }, { "epoch": 1.4303523724716958, "ewc_loss": 0.028549669310450554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8549669877975248e-05, "grad_norm": 17.233734130859375, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8709158897399902, "num_tokens": 429101389.0, "step": 11244 }, { "epoch": 1.4304795827502863, "ewc_loss": 0.028544636443257332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8544636734295636e-05, "grad_norm": 17.250202178955078, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8669048547744751, "num_tokens": 429135547.0, "step": 11245 }, { "epoch": 1.4306067930288768, "ewc_loss": 0.028509696945548058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8509697585832328e-05, "grad_norm": 17.272314071655273, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8728197813034058, "num_tokens": 429177504.0, "step": 11246 }, { "epoch": 1.4307340033074674, "ewc_loss": 0.028524208813905716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8524209483293816e-05, "grad_norm": 17.213756561279297, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8745350241661072, "num_tokens": 429217923.0, "step": 11247 }, { "epoch": 1.430861213586058, "ewc_loss": 0.02849750965833664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.849751035682857e-05, "grad_norm": 17.25161361694336, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8699017763137817, "num_tokens": 429251514.0, "step": 11248 }, { "epoch": 1.4309884238646482, "ewc_loss": 0.02855277806520462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8552778530865908e-05, "grad_norm": 17.20903205871582, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8544996976852417, "num_tokens": 429291747.0, "step": 11249 }, { "epoch": 1.4311156341432387, "ewc_loss": 0.02856418490409851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8564185413415544e-05, "grad_norm": 17.265377044677734, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8665211796760559, "num_tokens": 429333480.0, "step": 11250 }, { "epoch": 1.4312428444218293, "ewc_loss": 0.028595006093382835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8595006369869225e-05, "grad_norm": 17.279674530029297, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8810135722160339, "num_tokens": 429366482.0, "step": 11251 }, { "epoch": 1.4313700547004198, "ewc_loss": 0.02856968529522419, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8569686037371866e-05, "grad_norm": 17.276531219482422, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8617361783981323, "num_tokens": 429400704.0, "step": 11252 }, { "epoch": 1.4314972649790103, "ewc_loss": 0.028535757213830948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8535756428027526e-05, "grad_norm": 17.211416244506836, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8580713272094727, "num_tokens": 429437463.0, "step": 11253 }, { "epoch": 1.4316244752576008, "ewc_loss": 0.0285949744284153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.859497362805996e-05, "grad_norm": 17.2921199798584, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8583709001541138, "num_tokens": 429474496.0, "step": 11254 }, { "epoch": 1.4317516855361914, "ewc_loss": 0.028586894273757935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.858689367712941e-05, "grad_norm": 17.29331398010254, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8568814992904663, "num_tokens": 429514641.0, "step": 11255 }, { "epoch": 1.431878895814782, "ewc_loss": 0.028567753732204437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.85677542706253e-05, "grad_norm": 17.28142738342285, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.868855357170105, "num_tokens": 429554054.0, "step": 11256 }, { "epoch": 1.4320061060933724, "ewc_loss": 0.028562571853399277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.85625719698146e-05, "grad_norm": 17.25813102722168, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8646875619888306, "num_tokens": 429594004.0, "step": 11257 }, { "epoch": 1.4321333163719627, "ewc_loss": 0.02857060357928276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8570602808031254e-05, "grad_norm": 17.26494026184082, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8766021728515625, "num_tokens": 429633019.0, "step": 11258 }, { "epoch": 1.4322605266505533, "ewc_loss": 0.028585413470864296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8585413019754924e-05, "grad_norm": 17.21881866455078, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8707084655761719, "num_tokens": 429666869.0, "step": 11259 }, { "epoch": 1.4323877369291438, "ewc_loss": 0.028643431141972542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8643431505770423e-05, "grad_norm": 17.36992645263672, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8787315487861633, "num_tokens": 429700049.0, "step": 11260 }, { "epoch": 1.4325149472077343, "ewc_loss": 0.028599556535482407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8599555662367493e-05, "grad_norm": 17.24504280090332, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8678759336471558, "num_tokens": 429737517.0, "step": 11261 }, { "epoch": 1.4326421574863248, "ewc_loss": 0.02856455370783806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.856455284927506e-05, "grad_norm": 17.25264549255371, "learning_rate": 1e-06, "loss": 0.3665, "mean_token_accuracy": 0.881667971611023, "num_tokens": 429778697.0, "step": 11262 }, { "epoch": 1.4327693677649154, "ewc_loss": 0.028612958267331123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.861295797629282e-05, "grad_norm": 17.306222915649414, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.8693913817405701, "num_tokens": 429811883.0, "step": 11263 }, { "epoch": 1.432896578043506, "ewc_loss": 0.02858204022049904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.858204061340075e-05, "grad_norm": 17.295490264892578, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8754181861877441, "num_tokens": 429851748.0, "step": 11264 }, { "epoch": 1.4330237883220964, "ewc_loss": 0.028575044125318527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8575044780154712e-05, "grad_norm": 17.253620147705078, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8597278594970703, "num_tokens": 429891033.0, "step": 11265 }, { "epoch": 1.433150998600687, "ewc_loss": 0.028559178113937378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8559177735587582e-05, "grad_norm": 17.204099655151367, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.871633768081665, "num_tokens": 429935980.0, "step": 11266 }, { "epoch": 1.4332782088792775, "ewc_loss": 0.02860749326646328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8607493732124567e-05, "grad_norm": 17.334657669067383, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8582463264465332, "num_tokens": 429970874.0, "step": 11267 }, { "epoch": 1.433405419157868, "ewc_loss": 0.028667984530329704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8667984224739484e-05, "grad_norm": 17.190542221069336, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8544975519180298, "num_tokens": 430017909.0, "step": 11268 }, { "epoch": 1.4335326294364585, "ewc_loss": 0.02858263999223709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8582639060914516e-05, "grad_norm": 17.307771682739258, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8435145020484924, "num_tokens": 430050641.0, "step": 11269 }, { "epoch": 1.433659839715049, "ewc_loss": 0.02870464324951172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8704642318189144e-05, "grad_norm": 17.32573699951172, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8727660179138184, "num_tokens": 430082985.0, "step": 11270 }, { "epoch": 1.4337870499936396, "ewc_loss": 0.02857416495680809, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8574164389283396e-05, "grad_norm": 17.248716354370117, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8703254461288452, "num_tokens": 430122340.0, "step": 11271 }, { "epoch": 1.4339142602722301, "ewc_loss": 0.02862715721130371, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8627157007576898e-05, "grad_norm": 17.201452255249023, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.855351984500885, "num_tokens": 430162140.0, "step": 11272 }, { "epoch": 1.4340414705508204, "ewc_loss": 0.02865412086248398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.865412170649506e-05, "grad_norm": 17.348079681396484, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.87996506690979, "num_tokens": 430201672.0, "step": 11273 }, { "epoch": 1.434168680829411, "ewc_loss": 0.028693703934550285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8693704734905623e-05, "grad_norm": 17.244882583618164, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8590579032897949, "num_tokens": 430239760.0, "step": 11274 }, { "epoch": 1.4342958911080015, "ewc_loss": 0.028652019798755646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8652018954744563e-05, "grad_norm": 17.241464614868164, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8519672155380249, "num_tokens": 430280527.0, "step": 11275 }, { "epoch": 1.434423101386592, "ewc_loss": 0.02870352193713188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.870352182071656e-05, "grad_norm": 17.301485061645508, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8646208047866821, "num_tokens": 430312788.0, "step": 11276 }, { "epoch": 1.4345503116651825, "ewc_loss": 0.02868642657995224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8686426958302036e-05, "grad_norm": 17.231691360473633, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8762141466140747, "num_tokens": 430348044.0, "step": 11277 }, { "epoch": 1.434677521943773, "ewc_loss": 0.02868831902742386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8688318707281724e-05, "grad_norm": 17.231534957885742, "learning_rate": 1e-06, "loss": 0.3601, "mean_token_accuracy": 0.884476363658905, "num_tokens": 430385676.0, "step": 11278 }, { "epoch": 1.4348047322223636, "ewc_loss": 0.028671324253082275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8671323889284395e-05, "grad_norm": 17.21399688720703, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8665915727615356, "num_tokens": 430416976.0, "step": 11279 }, { "epoch": 1.4349319425009541, "ewc_loss": 0.028745291754603386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8745291274390183e-05, "grad_norm": 17.25609588623047, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8802487254142761, "num_tokens": 430460002.0, "step": 11280 }, { "epoch": 1.4350591527795447, "ewc_loss": 0.0287474412471056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8747441319865175e-05, "grad_norm": 17.269611358642578, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8484737873077393, "num_tokens": 430503172.0, "step": 11281 }, { "epoch": 1.435186363058135, "ewc_loss": 0.0287281796336174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8728180041071028e-05, "grad_norm": 17.33221435546875, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8560515642166138, "num_tokens": 430541221.0, "step": 11282 }, { "epoch": 1.4353135733367255, "ewc_loss": 0.028702620416879654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.87026196019724e-05, "grad_norm": 17.24478530883789, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.862236499786377, "num_tokens": 430579527.0, "step": 11283 }, { "epoch": 1.435440783615316, "ewc_loss": 0.02867518924176693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.867518924176693e-05, "grad_norm": 17.29172706604004, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8707847595214844, "num_tokens": 430617263.0, "step": 11284 }, { "epoch": 1.4355679938939065, "ewc_loss": 0.028676968067884445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8676968213403597e-05, "grad_norm": 17.28411102294922, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8824745416641235, "num_tokens": 430649961.0, "step": 11285 }, { "epoch": 1.435695204172497, "ewc_loss": 0.02869535982608795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.869536001526285e-05, "grad_norm": 17.318912506103516, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8625128269195557, "num_tokens": 430682599.0, "step": 11286 }, { "epoch": 1.4358224144510876, "ewc_loss": 0.028663303703069687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.866330396500416e-05, "grad_norm": 17.304800033569336, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.841514527797699, "num_tokens": 430721014.0, "step": 11287 }, { "epoch": 1.4359496247296781, "ewc_loss": 0.02866649068892002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8666490834439173e-05, "grad_norm": 17.252233505249023, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8670865297317505, "num_tokens": 430757014.0, "step": 11288 }, { "epoch": 1.4360768350082687, "ewc_loss": 0.028650356456637383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.865035639842972e-05, "grad_norm": 17.258840560913086, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8695223927497864, "num_tokens": 430796169.0, "step": 11289 }, { "epoch": 1.4362040452868592, "ewc_loss": 0.028666215017437935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8666214348049834e-05, "grad_norm": 17.275957107543945, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8673009276390076, "num_tokens": 430832726.0, "step": 11290 }, { "epoch": 1.4363312555654497, "ewc_loss": 0.028664322569966316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8664322599070147e-05, "grad_norm": 17.293657302856445, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8573781251907349, "num_tokens": 430866343.0, "step": 11291 }, { "epoch": 1.4364584658440402, "ewc_loss": 0.02865670621395111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.86567064904375e-05, "grad_norm": 17.215465545654297, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8740043640136719, "num_tokens": 430901622.0, "step": 11292 }, { "epoch": 1.4365856761226308, "ewc_loss": 0.02864581160247326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.864581074391026e-05, "grad_norm": 17.252939224243164, "learning_rate": 1e-06, "loss": 0.38, "mean_token_accuracy": 0.8826068043708801, "num_tokens": 430937499.0, "step": 11293 }, { "epoch": 1.4367128864012213, "ewc_loss": 0.028763892129063606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8763892260030843e-05, "grad_norm": 17.193588256835938, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8561075329780579, "num_tokens": 430974048.0, "step": 11294 }, { "epoch": 1.4368400966798118, "ewc_loss": 0.028695620596408844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8695620130747557e-05, "grad_norm": 17.19498062133789, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8763318061828613, "num_tokens": 431010927.0, "step": 11295 }, { "epoch": 1.4369673069584024, "ewc_loss": 0.028746936470270157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.874693564081099e-05, "grad_norm": 17.320165634155273, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.853158712387085, "num_tokens": 431051731.0, "step": 11296 }, { "epoch": 1.4370945172369929, "ewc_loss": 0.02875310555100441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8753105652867816e-05, "grad_norm": 17.2777099609375, "learning_rate": 1e-06, "loss": 0.3542, "mean_token_accuracy": 0.8838194608688354, "num_tokens": 431087876.0, "step": 11297 }, { "epoch": 1.4372217275155832, "ewc_loss": 0.02867032401263714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8670323445112444e-05, "grad_norm": 17.30873680114746, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8628549575805664, "num_tokens": 431126217.0, "step": 11298 }, { "epoch": 1.4373489377941737, "ewc_loss": 0.0287029929459095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8702992494800128e-05, "grad_norm": 17.244787216186523, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.868561863899231, "num_tokens": 431168049.0, "step": 11299 }, { "epoch": 1.4374761480727642, "ewc_loss": 0.028705749660730362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8705750082735904e-05, "grad_norm": 17.335857391357422, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8669263124465942, "num_tokens": 431211143.0, "step": 11300 }, { "epoch": 1.4376033583513548, "ewc_loss": 0.02871110662817955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8711107006529346e-05, "grad_norm": 17.238698959350586, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8752739429473877, "num_tokens": 431255747.0, "step": 11301 }, { "epoch": 1.4377305686299453, "ewc_loss": 0.028665078803896904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8665079298662022e-05, "grad_norm": 17.31419563293457, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8710458278656006, "num_tokens": 431296618.0, "step": 11302 }, { "epoch": 1.4378577789085358, "ewc_loss": 0.02868357114493847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.868357114493847e-05, "grad_norm": 17.28877830505371, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8697108030319214, "num_tokens": 431336493.0, "step": 11303 }, { "epoch": 1.4379849891871264, "ewc_loss": 0.028602100908756256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8602100428543054e-05, "grad_norm": 17.283035278320312, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8664681911468506, "num_tokens": 431376200.0, "step": 11304 }, { "epoch": 1.438112199465717, "ewc_loss": 0.028638996183872223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8638996809604578e-05, "grad_norm": 17.340566635131836, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8485074043273926, "num_tokens": 431414015.0, "step": 11305 }, { "epoch": 1.4382394097443074, "ewc_loss": 0.028582265600562096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.858226616808679e-05, "grad_norm": 17.305160522460938, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8636866211891174, "num_tokens": 431453498.0, "step": 11306 }, { "epoch": 1.4383666200228977, "ewc_loss": 0.0286578219383955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8657821530941874e-05, "grad_norm": 17.376930236816406, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.8683828115463257, "num_tokens": 431491126.0, "step": 11307 }, { "epoch": 1.4384938303014883, "ewc_loss": 0.028595756739377975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.859575761249289e-05, "grad_norm": 17.26482582092285, "learning_rate": 1e-06, "loss": 0.3382, "mean_token_accuracy": 0.8905935287475586, "num_tokens": 431528549.0, "step": 11308 }, { "epoch": 1.4386210405800788, "ewc_loss": 0.028547463938593864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8547463443828747e-05, "grad_norm": 17.28606414794922, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8743363618850708, "num_tokens": 431569435.0, "step": 11309 }, { "epoch": 1.4387482508586693, "ewc_loss": 0.028584761545062065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8584761821548454e-05, "grad_norm": 17.33859634399414, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8743823170661926, "num_tokens": 431606125.0, "step": 11310 }, { "epoch": 1.4388754611372598, "ewc_loss": 0.028500966727733612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8500966436695307e-05, "grad_norm": 17.258380889892578, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8637992143630981, "num_tokens": 431643517.0, "step": 11311 }, { "epoch": 1.4390026714158504, "ewc_loss": 0.028569383546710014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8569384085130878e-05, "grad_norm": 17.251684188842773, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8779687881469727, "num_tokens": 431682058.0, "step": 11312 }, { "epoch": 1.439129881694441, "ewc_loss": 0.028590349480509758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8590349756996147e-05, "grad_norm": 17.350322723388672, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.8796523809432983, "num_tokens": 431721241.0, "step": 11313 }, { "epoch": 1.4392570919730314, "ewc_loss": 0.0285673588514328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.856735954992473e-05, "grad_norm": 17.302762985229492, "learning_rate": 1e-06, "loss": 0.3763, "mean_token_accuracy": 0.8807476758956909, "num_tokens": 431757809.0, "step": 11314 }, { "epoch": 1.439384302251622, "ewc_loss": 0.028560951352119446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.856095125025604e-05, "grad_norm": 17.351652145385742, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8575806617736816, "num_tokens": 431795931.0, "step": 11315 }, { "epoch": 1.4395115125302125, "ewc_loss": 0.028544647619128227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8544647648232058e-05, "grad_norm": 17.238937377929688, "learning_rate": 1e-06, "loss": 0.3682, "mean_token_accuracy": 0.8778480887413025, "num_tokens": 431830517.0, "step": 11316 }, { "epoch": 1.439638722808803, "ewc_loss": 0.028481777757406235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8481777917477302e-05, "grad_norm": 17.311328887939453, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8704223036766052, "num_tokens": 431866047.0, "step": 11317 }, { "epoch": 1.4397659330873935, "ewc_loss": 0.02857515588402748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.857515573850833e-05, "grad_norm": 17.26746368408203, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8576663732528687, "num_tokens": 431901602.0, "step": 11318 }, { "epoch": 1.439893143365984, "ewc_loss": 0.028500396758317947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8500397093011998e-05, "grad_norm": 17.306211471557617, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8640364408493042, "num_tokens": 431934989.0, "step": 11319 }, { "epoch": 1.4400203536445746, "ewc_loss": 0.028575612232089043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.857561230484862e-05, "grad_norm": 17.345237731933594, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8724983930587769, "num_tokens": 431968664.0, "step": 11320 }, { "epoch": 1.4401475639231651, "ewc_loss": 0.028605099767446518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.86050999420695e-05, "grad_norm": 17.2668399810791, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.879449188709259, "num_tokens": 432008025.0, "step": 11321 }, { "epoch": 1.4402747742017554, "ewc_loss": 0.02851058542728424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8510585252661258e-05, "grad_norm": 17.27739715576172, "learning_rate": 1e-06, "loss": 0.3782, "mean_token_accuracy": 0.8736726641654968, "num_tokens": 432043526.0, "step": 11322 }, { "epoch": 1.440401984480346, "ewc_loss": 0.028589406982064247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.858940752048511e-05, "grad_norm": 17.260421752929688, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8691830039024353, "num_tokens": 432085175.0, "step": 11323 }, { "epoch": 1.4405291947589365, "ewc_loss": 0.0285708699375391, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8570870199473575e-05, "grad_norm": 17.27654266357422, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8593826293945312, "num_tokens": 432122379.0, "step": 11324 }, { "epoch": 1.440656405037527, "ewc_loss": 0.028654539957642555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8654540074057877e-05, "grad_norm": 17.314144134521484, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8709554672241211, "num_tokens": 432157690.0, "step": 11325 }, { "epoch": 1.4407836153161175, "ewc_loss": 0.0285696629434824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.856966239050962e-05, "grad_norm": 17.24637222290039, "learning_rate": 1e-06, "loss": 0.3789, "mean_token_accuracy": 0.8772225379943848, "num_tokens": 432188885.0, "step": 11326 }, { "epoch": 1.440910825594708, "ewc_loss": 0.028628796339035034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8628795917029493e-05, "grad_norm": 17.248416900634766, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8679473400115967, "num_tokens": 432228804.0, "step": 11327 }, { "epoch": 1.4410380358732986, "ewc_loss": 0.02867583930492401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8675838620983995e-05, "grad_norm": 17.304691314697266, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8513079881668091, "num_tokens": 432264150.0, "step": 11328 }, { "epoch": 1.4411652461518891, "ewc_loss": 0.02866467833518982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.866467912099324e-05, "grad_norm": 17.30432891845703, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8623155355453491, "num_tokens": 432306759.0, "step": 11329 }, { "epoch": 1.4412924564304797, "ewc_loss": 0.028672389686107635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8672389817074873e-05, "grad_norm": 17.25772476196289, "learning_rate": 1e-06, "loss": 0.3782, "mean_token_accuracy": 0.8794407844543457, "num_tokens": 432346928.0, "step": 11330 }, { "epoch": 1.44141966670907, "ewc_loss": 0.028612062335014343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8612063033506274e-05, "grad_norm": 17.374208450317383, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8744349479675293, "num_tokens": 432386630.0, "step": 11331 }, { "epoch": 1.4415468769876605, "ewc_loss": 0.028628770262002945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8628770451177843e-05, "grad_norm": 17.261552810668945, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8565375804901123, "num_tokens": 432427331.0, "step": 11332 }, { "epoch": 1.441674087266251, "ewc_loss": 0.028644083067774773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8644082703976892e-05, "grad_norm": 17.362770080566406, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8612596988677979, "num_tokens": 432462901.0, "step": 11333 }, { "epoch": 1.4418012975448415, "ewc_loss": 0.028621038421988487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.862103792722337e-05, "grad_norm": 17.219314575195312, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8651041984558105, "num_tokens": 432494232.0, "step": 11334 }, { "epoch": 1.441928507823432, "ewc_loss": 0.028616292402148247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.861629218386952e-05, "grad_norm": 17.35663414001465, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8811843395233154, "num_tokens": 432534276.0, "step": 11335 }, { "epoch": 1.4420557181020226, "ewc_loss": 0.028713181614875793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8713182473438792e-05, "grad_norm": 17.284711837768555, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8604812622070312, "num_tokens": 432577337.0, "step": 11336 }, { "epoch": 1.4421829283806131, "ewc_loss": 0.028547732159495354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.854773265426047e-05, "grad_norm": 17.347402572631836, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8595396280288696, "num_tokens": 432614290.0, "step": 11337 }, { "epoch": 1.4423101386592037, "ewc_loss": 0.02864256501197815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8642565666814335e-05, "grad_norm": 17.324979782104492, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8693835735321045, "num_tokens": 432648558.0, "step": 11338 }, { "epoch": 1.4424373489377942, "ewc_loss": 0.028582213446497917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8582213417394087e-05, "grad_norm": 17.311016082763672, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8416709899902344, "num_tokens": 432684243.0, "step": 11339 }, { "epoch": 1.4425645592163847, "ewc_loss": 0.02863161265850067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8631613531615585e-05, "grad_norm": 17.306665420532227, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.867522656917572, "num_tokens": 432719355.0, "step": 11340 }, { "epoch": 1.4426917694949752, "ewc_loss": 0.02865665964782238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8656659196713008e-05, "grad_norm": 17.27800178527832, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8679607510566711, "num_tokens": 432758675.0, "step": 11341 }, { "epoch": 1.4428189797735658, "ewc_loss": 0.028644075617194176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8644075428019278e-05, "grad_norm": 17.30814552307129, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8655509352684021, "num_tokens": 432801677.0, "step": 11342 }, { "epoch": 1.4429461900521563, "ewc_loss": 0.02865845151245594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.86584509012755e-05, "grad_norm": 17.289457321166992, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8665262460708618, "num_tokens": 432836299.0, "step": 11343 }, { "epoch": 1.4430734003307468, "ewc_loss": 0.028635509312152863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.863550980691798e-05, "grad_norm": 17.321767807006836, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8668369650840759, "num_tokens": 432868838.0, "step": 11344 }, { "epoch": 1.4432006106093374, "ewc_loss": 0.028698531910777092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8698532332782634e-05, "grad_norm": 17.295351028442383, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8694027066230774, "num_tokens": 432904895.0, "step": 11345 }, { "epoch": 1.4433278208879279, "ewc_loss": 0.028629004955291748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.86290051008109e-05, "grad_norm": 17.26057243347168, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8631203770637512, "num_tokens": 432944785.0, "step": 11346 }, { "epoch": 1.4434550311665182, "ewc_loss": 0.02869255281984806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.869255331461318e-05, "grad_norm": 17.332815170288086, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8586770296096802, "num_tokens": 432975645.0, "step": 11347 }, { "epoch": 1.4435822414451087, "ewc_loss": 0.028654780238866806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8654780180659145e-05, "grad_norm": 17.24463653564453, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8636782169342041, "num_tokens": 433016887.0, "step": 11348 }, { "epoch": 1.4437094517236992, "ewc_loss": 0.028706669807434082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8706670491374098e-05, "grad_norm": 17.346765518188477, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8696343898773193, "num_tokens": 433053666.0, "step": 11349 }, { "epoch": 1.4438366620022898, "ewc_loss": 0.028694313019514084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8694312277366407e-05, "grad_norm": 17.253528594970703, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8636934757232666, "num_tokens": 433101131.0, "step": 11350 }, { "epoch": 1.4439638722808803, "ewc_loss": 0.028687966987490654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.868796764232684e-05, "grad_norm": 17.33867073059082, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8753138184547424, "num_tokens": 433132535.0, "step": 11351 }, { "epoch": 1.4440910825594708, "ewc_loss": 0.02870841510593891, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8708414902212098e-05, "grad_norm": 17.30284881591797, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8534754514694214, "num_tokens": 433168229.0, "step": 11352 }, { "epoch": 1.4442182928380614, "ewc_loss": 0.028637880459427834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.86378799501108e-05, "grad_norm": 17.339256286621094, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8734821081161499, "num_tokens": 433206094.0, "step": 11353 }, { "epoch": 1.4443455031166519, "ewc_loss": 0.028648756444454193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.86487556877546e-05, "grad_norm": 17.235933303833008, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8565213680267334, "num_tokens": 433241038.0, "step": 11354 }, { "epoch": 1.4444727133952424, "ewc_loss": 0.028655709698796272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8655709684244357e-05, "grad_norm": 17.35505485534668, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8609857559204102, "num_tokens": 433274674.0, "step": 11355 }, { "epoch": 1.4445999236738327, "ewc_loss": 0.028725888580083847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.872588811442256e-05, "grad_norm": 17.2596378326416, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8714459538459778, "num_tokens": 433314511.0, "step": 11356 }, { "epoch": 1.4447271339524232, "ewc_loss": 0.028650345280766487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.86503454844933e-05, "grad_norm": 17.35243034362793, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8704860210418701, "num_tokens": 433349338.0, "step": 11357 }, { "epoch": 1.4448543442310138, "ewc_loss": 0.02869827300310135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8698272217297927e-05, "grad_norm": 17.227636337280273, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8779476881027222, "num_tokens": 433382609.0, "step": 11358 }, { "epoch": 1.4449815545096043, "ewc_loss": 0.028633326292037964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8633327019633725e-05, "grad_norm": 17.361772537231445, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.8825741410255432, "num_tokens": 433411553.0, "step": 11359 }, { "epoch": 1.4451087647881948, "ewc_loss": 0.0287689920514822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8768992706318386e-05, "grad_norm": 17.236032485961914, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8656882047653198, "num_tokens": 433450571.0, "step": 11360 }, { "epoch": 1.4452359750667854, "ewc_loss": 0.028652122244238853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8652122637140565e-05, "grad_norm": 17.286907196044922, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.863090455532074, "num_tokens": 433486994.0, "step": 11361 }, { "epoch": 1.445363185345376, "ewc_loss": 0.028766624629497528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.876662438211497e-05, "grad_norm": 17.264741897583008, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8812494277954102, "num_tokens": 433521407.0, "step": 11362 }, { "epoch": 1.4454903956239664, "ewc_loss": 0.0287335105240345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8733509680023417e-05, "grad_norm": 17.329639434814453, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8797159790992737, "num_tokens": 433561338.0, "step": 11363 }, { "epoch": 1.445617605902557, "ewc_loss": 0.02872631698846817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8726317395921797e-05, "grad_norm": 17.309207916259766, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8754267692565918, "num_tokens": 433598892.0, "step": 11364 }, { "epoch": 1.4457448161811475, "ewc_loss": 0.028678685426712036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8678685339400545e-05, "grad_norm": 17.26714515686035, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8580182790756226, "num_tokens": 433641423.0, "step": 11365 }, { "epoch": 1.445872026459738, "ewc_loss": 0.02871289663016796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8712896892102435e-05, "grad_norm": 17.31338882446289, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8706894516944885, "num_tokens": 433677385.0, "step": 11366 }, { "epoch": 1.4459992367383285, "ewc_loss": 0.028676843270659447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8676842703134753e-05, "grad_norm": 17.319616317749023, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8670440912246704, "num_tokens": 433716901.0, "step": 11367 }, { "epoch": 1.446126447016919, "ewc_loss": 0.02866220660507679, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8662207114393823e-05, "grad_norm": 17.2498836517334, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.864544153213501, "num_tokens": 433755099.0, "step": 11368 }, { "epoch": 1.4462536572955096, "ewc_loss": 0.02866995707154274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8669957828242332e-05, "grad_norm": 17.365489959716797, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8647423982620239, "num_tokens": 433794012.0, "step": 11369 }, { "epoch": 1.4463808675741001, "ewc_loss": 0.028775136917829514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8775137252523564e-05, "grad_norm": 17.271604537963867, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8721193075180054, "num_tokens": 433831965.0, "step": 11370 }, { "epoch": 1.4465080778526904, "ewc_loss": 0.028629211708903313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8629212465602905e-05, "grad_norm": 17.31477928161621, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8684701919555664, "num_tokens": 433870819.0, "step": 11371 }, { "epoch": 1.446635288131281, "ewc_loss": 0.028676651418209076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.867665170924738e-05, "grad_norm": 17.247081756591797, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.863116443157196, "num_tokens": 433909197.0, "step": 11372 }, { "epoch": 1.4467624984098715, "ewc_loss": 0.02866634726524353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8666347134276293e-05, "grad_norm": 17.319711685180664, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8755861520767212, "num_tokens": 433946026.0, "step": 11373 }, { "epoch": 1.446889708688462, "ewc_loss": 0.028677713125944138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.867771399905905e-05, "grad_norm": 17.258527755737305, "learning_rate": 1e-06, "loss": 0.3484, "mean_token_accuracy": 0.8896792531013489, "num_tokens": 433986239.0, "step": 11374 }, { "epoch": 1.4470169189670525, "ewc_loss": 0.028685295954346657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.868529554689303e-05, "grad_norm": 17.293773651123047, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8731709122657776, "num_tokens": 434022557.0, "step": 11375 }, { "epoch": 1.447144129245643, "ewc_loss": 0.02869025059044361, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.869025047402829e-05, "grad_norm": 17.311283111572266, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8722057342529297, "num_tokens": 434059258.0, "step": 11376 }, { "epoch": 1.4472713395242336, "ewc_loss": 0.02866022102534771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.866022077796515e-05, "grad_norm": 17.2739315032959, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8548632860183716, "num_tokens": 434094872.0, "step": 11377 }, { "epoch": 1.4473985498028241, "ewc_loss": 0.028657348826527596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8657348593696952e-05, "grad_norm": 17.240829467773438, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8459144830703735, "num_tokens": 434134842.0, "step": 11378 }, { "epoch": 1.4475257600814146, "ewc_loss": 0.028686771169304848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8686770747299306e-05, "grad_norm": 17.364686965942383, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8799466490745544, "num_tokens": 434173152.0, "step": 11379 }, { "epoch": 1.447652970360005, "ewc_loss": 0.028695762157440186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8695762011921033e-05, "grad_norm": 17.242990493774414, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8709782958030701, "num_tokens": 434208318.0, "step": 11380 }, { "epoch": 1.4477801806385955, "ewc_loss": 0.028663352131843567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8663351258728653e-05, "grad_norm": 17.34020233154297, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8658425211906433, "num_tokens": 434243693.0, "step": 11381 }, { "epoch": 1.447907390917186, "ewc_loss": 0.028707893565297127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.870789285225328e-05, "grad_norm": 17.2667236328125, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8568640947341919, "num_tokens": 434283172.0, "step": 11382 }, { "epoch": 1.4480346011957765, "ewc_loss": 0.0286625474691391, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8662547265412286e-05, "grad_norm": 17.31369400024414, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8696833848953247, "num_tokens": 434318789.0, "step": 11383 }, { "epoch": 1.448161811474367, "ewc_loss": 0.028668547049164772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.866854629246518e-05, "grad_norm": 17.33419418334961, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8596509695053101, "num_tokens": 434358943.0, "step": 11384 }, { "epoch": 1.4482890217529576, "ewc_loss": 0.028683047741651535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8683047275990248e-05, "grad_norm": 17.33066177368164, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8695396780967712, "num_tokens": 434398264.0, "step": 11385 }, { "epoch": 1.4484162320315481, "ewc_loss": 0.028651895001530647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.865189526346512e-05, "grad_norm": 17.290334701538086, "learning_rate": 1e-06, "loss": 0.3822, "mean_token_accuracy": 0.8775939345359802, "num_tokens": 434433602.0, "step": 11386 }, { "epoch": 1.4485434423101387, "ewc_loss": 0.02867588959634304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8675889552687295e-05, "grad_norm": 17.310264587402344, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8617082834243774, "num_tokens": 434480001.0, "step": 11387 }, { "epoch": 1.4486706525887292, "ewc_loss": 0.028652125969529152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8652126275119372e-05, "grad_norm": 17.26478385925293, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.884497880935669, "num_tokens": 434519285.0, "step": 11388 }, { "epoch": 1.4487978628673197, "ewc_loss": 0.028646981343626976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8646982173086144e-05, "grad_norm": 17.405658721923828, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8680680394172668, "num_tokens": 434556118.0, "step": 11389 }, { "epoch": 1.4489250731459102, "ewc_loss": 0.02869516983628273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8695169021375477e-05, "grad_norm": 17.29950714111328, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8523358106613159, "num_tokens": 434595350.0, "step": 11390 }, { "epoch": 1.4490522834245008, "ewc_loss": 0.028613844886422157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.861384564312175e-05, "grad_norm": 17.283388137817383, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8660883903503418, "num_tokens": 434632988.0, "step": 11391 }, { "epoch": 1.4491794937030913, "ewc_loss": 0.028604526072740555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.860452514141798e-05, "grad_norm": 17.267568588256836, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8755972385406494, "num_tokens": 434673040.0, "step": 11392 }, { "epoch": 1.4493067039816818, "ewc_loss": 0.028605906292796135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.860590575437527e-05, "grad_norm": 17.328033447265625, "learning_rate": 1e-06, "loss": 0.3649, "mean_token_accuracy": 0.8841065168380737, "num_tokens": 434712201.0, "step": 11393 }, { "epoch": 1.4494339142602723, "ewc_loss": 0.02866065874695778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8660659154411405e-05, "grad_norm": 17.382844924926758, "learning_rate": 1e-06, "loss": 0.3707, "mean_token_accuracy": 0.882081151008606, "num_tokens": 434756676.0, "step": 11394 }, { "epoch": 1.4495611245388629, "ewc_loss": 0.028601787984371185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8601787562365644e-05, "grad_norm": 17.250452041625977, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8721883296966553, "num_tokens": 434797890.0, "step": 11395 }, { "epoch": 1.4496883348174532, "ewc_loss": 0.02860117331147194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8601172743947245e-05, "grad_norm": 17.451045989990234, "learning_rate": 1e-06, "loss": 0.3673, "mean_token_accuracy": 0.8830676078796387, "num_tokens": 434836933.0, "step": 11396 }, { "epoch": 1.4498155450960437, "ewc_loss": 0.028633296489715576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8633296096813865e-05, "grad_norm": 17.32512664794922, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.8817586898803711, "num_tokens": 434868098.0, "step": 11397 }, { "epoch": 1.4499427553746342, "ewc_loss": 0.02850860357284546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8508604373200797e-05, "grad_norm": 17.33224105834961, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8647664189338684, "num_tokens": 434907239.0, "step": 11398 }, { "epoch": 1.4500699656532248, "ewc_loss": 0.02861659787595272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8616597774089314e-05, "grad_norm": 17.331327438354492, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8747529983520508, "num_tokens": 434945672.0, "step": 11399 }, { "epoch": 1.4501971759318153, "ewc_loss": 0.02853146381676197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8531463613035157e-05, "grad_norm": 17.309011459350586, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8584257364273071, "num_tokens": 434981693.0, "step": 11400 }, { "epoch": 1.4503243862104058, "ewc_loss": 0.028567058965563774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8567059416673146e-05, "grad_norm": 17.284711837768555, "learning_rate": 1e-06, "loss": 0.3348, "mean_token_accuracy": 0.89012610912323, "num_tokens": 435012452.0, "step": 11401 }, { "epoch": 1.4504515964889964, "ewc_loss": 0.028570180758833885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.857018080248963e-05, "grad_norm": 17.289499282836914, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8629960417747498, "num_tokens": 435052711.0, "step": 11402 }, { "epoch": 1.4505788067675869, "ewc_loss": 0.028603648766875267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.860364838852547e-05, "grad_norm": 17.22136688232422, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8669342398643494, "num_tokens": 435092766.0, "step": 11403 }, { "epoch": 1.4507060170461774, "ewc_loss": 0.028578324243426323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8578324418049306e-05, "grad_norm": 17.3084716796875, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8538341522216797, "num_tokens": 435132093.0, "step": 11404 }, { "epoch": 1.4508332273247677, "ewc_loss": 0.02867918275296688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8679181923507713e-05, "grad_norm": 17.31401824951172, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8654035329818726, "num_tokens": 435169071.0, "step": 11405 }, { "epoch": 1.4509604376033582, "ewc_loss": 0.0286264568567276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8626456696656533e-05, "grad_norm": 17.268484115600586, "learning_rate": 1e-06, "loss": 0.3763, "mean_token_accuracy": 0.8795201778411865, "num_tokens": 435202665.0, "step": 11406 }, { "epoch": 1.4510876478819488, "ewc_loss": 0.028615394607186317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.861539542209357e-05, "grad_norm": 17.24899673461914, "learning_rate": 1e-06, "loss": 0.366, "mean_token_accuracy": 0.8830392956733704, "num_tokens": 435234719.0, "step": 11407 }, { "epoch": 1.4512148581605393, "ewc_loss": 0.028678761795163155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8678761736955494e-05, "grad_norm": 17.360183715820312, "learning_rate": 1e-06, "loss": 0.3679, "mean_token_accuracy": 0.8832038044929504, "num_tokens": 435267090.0, "step": 11408 }, { "epoch": 1.4513420684391298, "ewc_loss": 0.02871706523001194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8717066015815362e-05, "grad_norm": 17.232707977294922, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8763102293014526, "num_tokens": 435307243.0, "step": 11409 }, { "epoch": 1.4514692787177204, "ewc_loss": 0.028656503185629845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8656502763624303e-05, "grad_norm": 17.476665496826172, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.8766263723373413, "num_tokens": 435347507.0, "step": 11410 }, { "epoch": 1.4515964889963109, "ewc_loss": 0.028705207630991936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8705208023893647e-05, "grad_norm": 17.23813247680664, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8599646687507629, "num_tokens": 435383499.0, "step": 11411 }, { "epoch": 1.4517236992749014, "ewc_loss": 0.028584733605384827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.85847345367074e-05, "grad_norm": 17.276077270507812, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8692078590393066, "num_tokens": 435423500.0, "step": 11412 }, { "epoch": 1.451850909553492, "ewc_loss": 0.028703441843390465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8703441785182804e-05, "grad_norm": 17.3138427734375, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8732909560203552, "num_tokens": 435457075.0, "step": 11413 }, { "epoch": 1.4519781198320825, "ewc_loss": 0.02872389554977417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8723896321025677e-05, "grad_norm": 17.336166381835938, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8670781850814819, "num_tokens": 435498933.0, "step": 11414 }, { "epoch": 1.452105330110673, "ewc_loss": 0.028656020760536194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8656020731432363e-05, "grad_norm": 17.282363891601562, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8608489036560059, "num_tokens": 435535949.0, "step": 11415 }, { "epoch": 1.4522325403892635, "ewc_loss": 0.02871219627559185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.871219658118207e-05, "grad_norm": 17.391597747802734, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.862798810005188, "num_tokens": 435565252.0, "step": 11416 }, { "epoch": 1.452359750667854, "ewc_loss": 0.028708653524518013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8708653189823963e-05, "grad_norm": 17.304767608642578, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8507077097892761, "num_tokens": 435604663.0, "step": 11417 }, { "epoch": 1.4524869609464446, "ewc_loss": 0.028671300038695335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.867130024242215e-05, "grad_norm": 17.289287567138672, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8641209602355957, "num_tokens": 435640883.0, "step": 11418 }, { "epoch": 1.4526141712250351, "ewc_loss": 0.028716394677758217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8716394808725454e-05, "grad_norm": 17.30832862854004, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8720384836196899, "num_tokens": 435682787.0, "step": 11419 }, { "epoch": 1.4527413815036254, "ewc_loss": 0.028734203428030014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8734202714986168e-05, "grad_norm": 17.286746978759766, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8662611246109009, "num_tokens": 435719982.0, "step": 11420 }, { "epoch": 1.452868591782216, "ewc_loss": 0.028685586526989937, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8685586585197598e-05, "grad_norm": 17.427947998046875, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.851207435131073, "num_tokens": 435755153.0, "step": 11421 }, { "epoch": 1.4529958020608065, "ewc_loss": 0.028732260689139366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.873226003430318e-05, "grad_norm": 17.36417007446289, "learning_rate": 1e-06, "loss": 0.3763, "mean_token_accuracy": 0.8780983090400696, "num_tokens": 435797513.0, "step": 11422 }, { "epoch": 1.453123012339397, "ewc_loss": 0.028629139065742493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8629139706026763e-05, "grad_norm": 17.348791122436523, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8685965538024902, "num_tokens": 435836835.0, "step": 11423 }, { "epoch": 1.4532502226179875, "ewc_loss": 0.0287515576928854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.87515576928854e-05, "grad_norm": 17.40291404724121, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8595571517944336, "num_tokens": 435875124.0, "step": 11424 }, { "epoch": 1.453377432896578, "ewc_loss": 0.028671713545918465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8671713153016753e-05, "grad_norm": 17.428152084350586, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8804675340652466, "num_tokens": 435914993.0, "step": 11425 }, { "epoch": 1.4535046431751686, "ewc_loss": 0.0287184938788414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8718493922497146e-05, "grad_norm": 17.302326202392578, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8768088817596436, "num_tokens": 435950916.0, "step": 11426 }, { "epoch": 1.4536318534537591, "ewc_loss": 0.02863471210002899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8634711270569824e-05, "grad_norm": 17.382844924926758, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8637256026268005, "num_tokens": 435991986.0, "step": 11427 }, { "epoch": 1.4537590637323496, "ewc_loss": 0.02870349772274494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8703498173854314e-05, "grad_norm": 17.27048683166504, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8771038055419922, "num_tokens": 436024603.0, "step": 11428 }, { "epoch": 1.45388627401094, "ewc_loss": 0.028631672263145447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.86316717392765e-05, "grad_norm": 17.305095672607422, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8640353679656982, "num_tokens": 436065540.0, "step": 11429 }, { "epoch": 1.4540134842895305, "ewc_loss": 0.028740111738443375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.874011261155829e-05, "grad_norm": 17.35954475402832, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8545812368392944, "num_tokens": 436104394.0, "step": 11430 }, { "epoch": 1.454140694568121, "ewc_loss": 0.028653213754296303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8653214030782692e-05, "grad_norm": 17.354087829589844, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8559170961380005, "num_tokens": 436139977.0, "step": 11431 }, { "epoch": 1.4542679048467115, "ewc_loss": 0.028682870790362358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8682870834018104e-05, "grad_norm": 17.301137924194336, "learning_rate": 1e-06, "loss": 0.3569, "mean_token_accuracy": 0.8839461207389832, "num_tokens": 436178047.0, "step": 11432 }, { "epoch": 1.454395115125302, "ewc_loss": 0.028678232803940773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8678232411039062e-05, "grad_norm": 17.35260581970215, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8668359518051147, "num_tokens": 436218331.0, "step": 11433 }, { "epoch": 1.4545223254038926, "ewc_loss": 0.028782334178686142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8782334993593395e-05, "grad_norm": 17.351062774658203, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8662225008010864, "num_tokens": 436255663.0, "step": 11434 }, { "epoch": 1.4546495356824831, "ewc_loss": 0.028676625341176987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8676624424406327e-05, "grad_norm": 17.36501693725586, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8687419891357422, "num_tokens": 436301328.0, "step": 11435 }, { "epoch": 1.4547767459610736, "ewc_loss": 0.028668930754065514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.866893009922933e-05, "grad_norm": 17.332740783691406, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.8771836161613464, "num_tokens": 436336487.0, "step": 11436 }, { "epoch": 1.4549039562396642, "ewc_loss": 0.028610192239284515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.861019311239943e-05, "grad_norm": 17.33294105529785, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8716986775398254, "num_tokens": 436377629.0, "step": 11437 }, { "epoch": 1.4550311665182547, "ewc_loss": 0.028647320345044136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8647320505115204e-05, "grad_norm": 17.30625343322754, "learning_rate": 1e-06, "loss": 0.3765, "mean_token_accuracy": 0.8819726705551147, "num_tokens": 436421590.0, "step": 11438 }, { "epoch": 1.4551583767968452, "ewc_loss": 0.028675150126218796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.867514922400005e-05, "grad_norm": 17.33414649963379, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8617626428604126, "num_tokens": 436461172.0, "step": 11439 }, { "epoch": 1.4552855870754358, "ewc_loss": 0.028681999072432518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8681999538093805e-05, "grad_norm": 17.320871353149414, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8770794868469238, "num_tokens": 436497536.0, "step": 11440 }, { "epoch": 1.4554127973540263, "ewc_loss": 0.028669461607933044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8669461244135164e-05, "grad_norm": 17.38663673400879, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8652862310409546, "num_tokens": 436543906.0, "step": 11441 }, { "epoch": 1.4555400076326168, "ewc_loss": 0.02860029973089695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8600299629033543e-05, "grad_norm": 17.266563415527344, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8701149821281433, "num_tokens": 436579462.0, "step": 11442 }, { "epoch": 1.4556672179112073, "ewc_loss": 0.028590600937604904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8590600777533837e-05, "grad_norm": 17.415855407714844, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8614383935928345, "num_tokens": 436618840.0, "step": 11443 }, { "epoch": 1.4557944281897979, "ewc_loss": 0.028647543862462044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.864754424081184e-05, "grad_norm": 17.294696807861328, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8573144674301147, "num_tokens": 436654176.0, "step": 11444 }, { "epoch": 1.4559216384683882, "ewc_loss": 0.028629174456000328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.862917426682543e-05, "grad_norm": 17.396650314331055, "learning_rate": 1e-06, "loss": 0.3598, "mean_token_accuracy": 0.8807033896446228, "num_tokens": 436685507.0, "step": 11445 }, { "epoch": 1.4560488487469787, "ewc_loss": 0.028666013851761818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8666014259215444e-05, "grad_norm": 17.439626693725586, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8773424625396729, "num_tokens": 436729881.0, "step": 11446 }, { "epoch": 1.4561760590255692, "ewc_loss": 0.028548382222652435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8548382033477537e-05, "grad_norm": 17.255390167236328, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8679347634315491, "num_tokens": 436772355.0, "step": 11447 }, { "epoch": 1.4563032693041598, "ewc_loss": 0.02859630063176155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.859630149032455e-05, "grad_norm": 17.416852951049805, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8625143766403198, "num_tokens": 436808701.0, "step": 11448 }, { "epoch": 1.4564304795827503, "ewc_loss": 0.028648290783166885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8648290026467294e-05, "grad_norm": 17.263214111328125, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.877799391746521, "num_tokens": 436845441.0, "step": 11449 }, { "epoch": 1.4565576898613408, "ewc_loss": 0.028544463217258453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.85444639303023e-05, "grad_norm": 17.32721519470215, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8710700869560242, "num_tokens": 436885316.0, "step": 11450 }, { "epoch": 1.4566849001399313, "ewc_loss": 0.028689919039607048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8689919417956844e-05, "grad_norm": 17.372333526611328, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8630406856536865, "num_tokens": 436918315.0, "step": 11451 }, { "epoch": 1.4568121104185219, "ewc_loss": 0.028640547767281532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8640548407565802e-05, "grad_norm": 17.3292179107666, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8606485724449158, "num_tokens": 436956039.0, "step": 11452 }, { "epoch": 1.4569393206971124, "ewc_loss": 0.02863960526883602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8639606171054766e-05, "grad_norm": 17.27475929260254, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8538116216659546, "num_tokens": 436991274.0, "step": 11453 }, { "epoch": 1.4570665309757027, "ewc_loss": 0.028674159198999405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8674159693764523e-05, "grad_norm": 17.2902889251709, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8800716400146484, "num_tokens": 437029963.0, "step": 11454 }, { "epoch": 1.4571937412542932, "ewc_loss": 0.02870248071849346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.870248135877773e-05, "grad_norm": 17.37666130065918, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8613671660423279, "num_tokens": 437067030.0, "step": 11455 }, { "epoch": 1.4573209515328838, "ewc_loss": 0.02866269461810589, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8662694603553973e-05, "grad_norm": 17.332796096801758, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8764504194259644, "num_tokens": 437103363.0, "step": 11456 }, { "epoch": 1.4574481618114743, "ewc_loss": 0.028709478676319122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8709479011013173e-05, "grad_norm": 17.36156463623047, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.8745055794715881, "num_tokens": 437131574.0, "step": 11457 }, { "epoch": 1.4575753720900648, "ewc_loss": 0.028669560328125954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8669559469562955e-05, "grad_norm": 17.30961799621582, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.861182689666748, "num_tokens": 437169537.0, "step": 11458 }, { "epoch": 1.4577025823686554, "ewc_loss": 0.028714699670672417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.871469951060135e-05, "grad_norm": 17.38160514831543, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8733892440795898, "num_tokens": 437208025.0, "step": 11459 }, { "epoch": 1.4578297926472459, "ewc_loss": 0.02875550650060177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8755506718880497e-05, "grad_norm": 17.38300132751465, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8801261782646179, "num_tokens": 437246642.0, "step": 11460 }, { "epoch": 1.4579570029258364, "ewc_loss": 0.028737133368849754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.873713310691528e-05, "grad_norm": 17.322280883789062, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.866158664226532, "num_tokens": 437285274.0, "step": 11461 }, { "epoch": 1.458084213204427, "ewc_loss": 0.02868865802884102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8688658858300187e-05, "grad_norm": 17.314510345458984, "learning_rate": 1e-06, "loss": 0.3513, "mean_token_accuracy": 0.8861502408981323, "num_tokens": 437325865.0, "step": 11462 }, { "epoch": 1.4582114234830175, "ewc_loss": 0.028696255758404732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8696254958049394e-05, "grad_norm": 17.34918785095215, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8738617897033691, "num_tokens": 437364093.0, "step": 11463 }, { "epoch": 1.458338633761608, "ewc_loss": 0.028701890259981155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.870189018722158e-05, "grad_norm": 17.288990020751953, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8558185696601868, "num_tokens": 437402413.0, "step": 11464 }, { "epoch": 1.4584658440401985, "ewc_loss": 0.028692275285720825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8692275009234436e-05, "grad_norm": 17.290555953979492, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8549147844314575, "num_tokens": 437442583.0, "step": 11465 }, { "epoch": 1.458593054318789, "ewc_loss": 0.028779419139027596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.877941915357951e-05, "grad_norm": 17.297203063964844, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8686584830284119, "num_tokens": 437485682.0, "step": 11466 }, { "epoch": 1.4587202645973796, "ewc_loss": 0.02873917855322361, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.873917946999427e-05, "grad_norm": 17.333250045776367, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8591234683990479, "num_tokens": 437524859.0, "step": 11467 }, { "epoch": 1.45884747487597, "ewc_loss": 0.028650358319282532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8650358217419125e-05, "grad_norm": 17.324485778808594, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8606703877449036, "num_tokens": 437563590.0, "step": 11468 }, { "epoch": 1.4589746851545604, "ewc_loss": 0.02873867191374302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.873867197195068e-05, "grad_norm": 17.31062889099121, "learning_rate": 1e-06, "loss": 0.3692, "mean_token_accuracy": 0.8854290246963501, "num_tokens": 437601429.0, "step": 11469 }, { "epoch": 1.459101895433151, "ewc_loss": 0.028676263988018036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8676264264504425e-05, "grad_norm": 17.355012893676758, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8569003343582153, "num_tokens": 437639144.0, "step": 11470 }, { "epoch": 1.4592291057117415, "ewc_loss": 0.02874850295484066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8748503609676845e-05, "grad_norm": 17.329336166381836, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8767884969711304, "num_tokens": 437680846.0, "step": 11471 }, { "epoch": 1.459356315990332, "ewc_loss": 0.02867755852639675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.867755938495975e-05, "grad_norm": 17.404075622558594, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8707127571105957, "num_tokens": 437722156.0, "step": 11472 }, { "epoch": 1.4594835262689225, "ewc_loss": 0.028675133362412453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.867513285309542e-05, "grad_norm": 17.28716278076172, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8444784879684448, "num_tokens": 437766905.0, "step": 11473 }, { "epoch": 1.459610736547513, "ewc_loss": 0.028649700805544853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8649701562244445e-05, "grad_norm": 17.38006591796875, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8682777285575867, "num_tokens": 437801425.0, "step": 11474 }, { "epoch": 1.4597379468261036, "ewc_loss": 0.028721090406179428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8721089620376006e-05, "grad_norm": 17.337621688842773, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8575512170791626, "num_tokens": 437838126.0, "step": 11475 }, { "epoch": 1.4598651571046941, "ewc_loss": 0.02862895093858242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8628950531128794e-05, "grad_norm": 17.38156509399414, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8629627227783203, "num_tokens": 437876597.0, "step": 11476 }, { "epoch": 1.4599923673832846, "ewc_loss": 0.028698813170194626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.869881245715078e-05, "grad_norm": 17.2913875579834, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.8792397975921631, "num_tokens": 437911852.0, "step": 11477 }, { "epoch": 1.460119577661875, "ewc_loss": 0.02863818034529686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8638180083362386e-05, "grad_norm": 17.40334129333496, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8692638874053955, "num_tokens": 437953418.0, "step": 11478 }, { "epoch": 1.4602467879404655, "ewc_loss": 0.0287470780313015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8747077521984465e-05, "grad_norm": 17.411989212036133, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8738268613815308, "num_tokens": 437989561.0, "step": 11479 }, { "epoch": 1.460373998219056, "ewc_loss": 0.028632551431655884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8632552130147815e-05, "grad_norm": 17.33707618713379, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.877819299697876, "num_tokens": 438033464.0, "step": 11480 }, { "epoch": 1.4605012084976465, "ewc_loss": 0.028630798682570457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8630798624362797e-05, "grad_norm": 17.308568954467773, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8725039958953857, "num_tokens": 438068676.0, "step": 11481 }, { "epoch": 1.460628418776237, "ewc_loss": 0.028656499460339546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8656499125645496e-05, "grad_norm": 17.41499137878418, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.850312352180481, "num_tokens": 438107867.0, "step": 11482 }, { "epoch": 1.4607556290548276, "ewc_loss": 0.02867700904607773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8677008231170475e-05, "grad_norm": 17.43811798095703, "learning_rate": 1e-06, "loss": 0.362, "mean_token_accuracy": 0.8845259547233582, "num_tokens": 438144256.0, "step": 11483 }, { "epoch": 1.4608828393334181, "ewc_loss": 0.02864680625498295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8646805731114e-05, "grad_norm": 17.32442855834961, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8787856698036194, "num_tokens": 438183458.0, "step": 11484 }, { "epoch": 1.4610100496120086, "ewc_loss": 0.028581304475665092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8581303922692314e-05, "grad_norm": 17.412569046020508, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.856392502784729, "num_tokens": 438216025.0, "step": 11485 }, { "epoch": 1.4611372598905992, "ewc_loss": 0.028633056208491325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8633055990212597e-05, "grad_norm": 17.274446487426758, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8789973258972168, "num_tokens": 438258185.0, "step": 11486 }, { "epoch": 1.4612644701691897, "ewc_loss": 0.028674984350800514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.867498369596433e-05, "grad_norm": 17.442331314086914, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8645573854446411, "num_tokens": 438298189.0, "step": 11487 }, { "epoch": 1.4613916804477802, "ewc_loss": 0.028672149404883385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8672149710473605e-05, "grad_norm": 17.342697143554688, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8575443029403687, "num_tokens": 438336593.0, "step": 11488 }, { "epoch": 1.4615188907263708, "ewc_loss": 0.028642136603593826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8642136385315098e-05, "grad_norm": 17.355876922607422, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8757333159446716, "num_tokens": 438374110.0, "step": 11489 }, { "epoch": 1.4616461010049613, "ewc_loss": 0.028636934235692024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8636934075620957e-05, "grad_norm": 17.307796478271484, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8528175950050354, "num_tokens": 438413062.0, "step": 11490 }, { "epoch": 1.4617733112835518, "ewc_loss": 0.028604330494999886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.86043305095518e-05, "grad_norm": 17.26148223876953, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.87115079164505, "num_tokens": 438452055.0, "step": 11491 }, { "epoch": 1.4619005215621423, "ewc_loss": 0.028705744072794914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8705744625767693e-05, "grad_norm": 17.39046859741211, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8710291385650635, "num_tokens": 438494224.0, "step": 11492 }, { "epoch": 1.4620277318407329, "ewc_loss": 0.02872380241751671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8723801733576693e-05, "grad_norm": 17.291183471679688, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8649412393569946, "num_tokens": 438529790.0, "step": 11493 }, { "epoch": 1.4621549421193232, "ewc_loss": 0.02861180528998375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.861180473701097e-05, "grad_norm": 17.331518173217773, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8728067874908447, "num_tokens": 438571371.0, "step": 11494 }, { "epoch": 1.4622821523979137, "ewc_loss": 0.02871425449848175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.871425385819748e-05, "grad_norm": 17.34770393371582, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8550150394439697, "num_tokens": 438612383.0, "step": 11495 }, { "epoch": 1.4624093626765042, "ewc_loss": 0.02872188203036785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8721882699755952e-05, "grad_norm": 17.313655853271484, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8585180044174194, "num_tokens": 438653443.0, "step": 11496 }, { "epoch": 1.4625365729550948, "ewc_loss": 0.028672512620687485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.867251168936491e-05, "grad_norm": 17.32076072692871, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8567528128623962, "num_tokens": 438689565.0, "step": 11497 }, { "epoch": 1.4626637832336853, "ewc_loss": 0.028743332251906395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8743332222802565e-05, "grad_norm": 17.45915412902832, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8694581985473633, "num_tokens": 438727753.0, "step": 11498 }, { "epoch": 1.4627909935122758, "ewc_loss": 0.028692757710814476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8692757041426376e-05, "grad_norm": 17.27100372314453, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8663316965103149, "num_tokens": 438764667.0, "step": 11499 }, { "epoch": 1.4629182037908663, "ewc_loss": 0.028685588389635086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8685588404187e-05, "grad_norm": 17.359811782836914, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8737383484840393, "num_tokens": 438801739.0, "step": 11500 }, { "epoch": 1.4630454140694569, "ewc_loss": 0.02879861742258072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8798616767744534e-05, "grad_norm": 17.312179565429688, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8703149557113647, "num_tokens": 438840028.0, "step": 11501 }, { "epoch": 1.4631726243480474, "ewc_loss": 0.028702469542622566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8702468625851907e-05, "grad_norm": 17.315109252929688, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8661975860595703, "num_tokens": 438880632.0, "step": 11502 }, { "epoch": 1.4632998346266377, "ewc_loss": 0.028740249574184418, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8740249035763554e-05, "grad_norm": 17.26263427734375, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.865625262260437, "num_tokens": 438923040.0, "step": 11503 }, { "epoch": 1.4634270449052282, "ewc_loss": 0.028745513409376144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8745513191097416e-05, "grad_norm": 17.390777587890625, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8683589696884155, "num_tokens": 438958597.0, "step": 11504 }, { "epoch": 1.4635542551838188, "ewc_loss": 0.02878301590681076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.878301529563032e-05, "grad_norm": 17.347116470336914, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.857205331325531, "num_tokens": 438997909.0, "step": 11505 }, { "epoch": 1.4636814654624093, "ewc_loss": 0.028716878965497017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8716878659906797e-05, "grad_norm": 17.32145118713379, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8763291835784912, "num_tokens": 439030084.0, "step": 11506 }, { "epoch": 1.4638086757409998, "ewc_loss": 0.02874170057475567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8741700589307584e-05, "grad_norm": 17.27561378479004, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8716577887535095, "num_tokens": 439067770.0, "step": 11507 }, { "epoch": 1.4639358860195903, "ewc_loss": 0.028713002800941467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.871300239348784e-05, "grad_norm": 17.348064422607422, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8706979155540466, "num_tokens": 439102476.0, "step": 11508 }, { "epoch": 1.4640630962981809, "ewc_loss": 0.028786558657884598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8786558686988428e-05, "grad_norm": 17.34563636779785, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8785572052001953, "num_tokens": 439138660.0, "step": 11509 }, { "epoch": 1.4641903065767714, "ewc_loss": 0.02875075861811638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8750759156537242e-05, "grad_norm": 17.384775161743164, "learning_rate": 1e-06, "loss": 0.3782, "mean_token_accuracy": 0.8761894702911377, "num_tokens": 439181690.0, "step": 11510 }, { "epoch": 1.464317516855362, "ewc_loss": 0.028767118230462074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8767119147232734e-05, "grad_norm": 17.387989044189453, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8753877282142639, "num_tokens": 439218825.0, "step": 11511 }, { "epoch": 1.4644447271339525, "ewc_loss": 0.02872551791369915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8725517040584236e-05, "grad_norm": 17.371654510498047, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8736162185668945, "num_tokens": 439253505.0, "step": 11512 }, { "epoch": 1.464571937412543, "ewc_loss": 0.028706571087241173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8706570446956903e-05, "grad_norm": 17.31747817993164, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.862689197063446, "num_tokens": 439292764.0, "step": 11513 }, { "epoch": 1.4646991476911335, "ewc_loss": 0.028759963810443878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8759963242919184e-05, "grad_norm": 17.338876724243164, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8781570196151733, "num_tokens": 439335269.0, "step": 11514 }, { "epoch": 1.464826357969724, "ewc_loss": 0.028750911355018616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.875091195164714e-05, "grad_norm": 17.358491897583008, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8697692155838013, "num_tokens": 439371977.0, "step": 11515 }, { "epoch": 1.4649535682483146, "ewc_loss": 0.0287115927785635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8711592676700093e-05, "grad_norm": 17.302690505981445, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8683434724807739, "num_tokens": 439412710.0, "step": 11516 }, { "epoch": 1.465080778526905, "ewc_loss": 0.028681021183729172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8681020921794698e-05, "grad_norm": 17.35682487487793, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8685569763183594, "num_tokens": 439449637.0, "step": 11517 }, { "epoch": 1.4652079888054954, "ewc_loss": 0.028760435059666634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8760434361174703e-05, "grad_norm": 17.313405990600586, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8625690937042236, "num_tokens": 439495143.0, "step": 11518 }, { "epoch": 1.465335199084086, "ewc_loss": 0.02871100604534149, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.871100696211215e-05, "grad_norm": 17.3472843170166, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8569795489311218, "num_tokens": 439532721.0, "step": 11519 }, { "epoch": 1.4654624093626765, "ewc_loss": 0.028785085305571556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8785085305571556e-05, "grad_norm": 17.330780029296875, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8781991004943848, "num_tokens": 439567938.0, "step": 11520 }, { "epoch": 1.465589619641267, "ewc_loss": 0.02877264842391014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8772648875019513e-05, "grad_norm": 17.402002334594727, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8776981234550476, "num_tokens": 439595720.0, "step": 11521 }, { "epoch": 1.4657168299198575, "ewc_loss": 0.028741054236888885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8741054848069325e-05, "grad_norm": 17.331037521362305, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.868588924407959, "num_tokens": 439634091.0, "step": 11522 }, { "epoch": 1.465844040198448, "ewc_loss": 0.028780445456504822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.878044506360311e-05, "grad_norm": 17.32294464111328, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8584120869636536, "num_tokens": 439671978.0, "step": 11523 }, { "epoch": 1.4659712504770386, "ewc_loss": 0.028812594711780548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.881259388232138e-05, "grad_norm": 17.420055389404297, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8641369342803955, "num_tokens": 439707748.0, "step": 11524 }, { "epoch": 1.466098460755629, "ewc_loss": 0.028815677389502525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.881567706936039e-05, "grad_norm": 17.414283752441406, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8757800459861755, "num_tokens": 439744060.0, "step": 11525 }, { "epoch": 1.4662256710342196, "ewc_loss": 0.028694087639451027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.869408854166977e-05, "grad_norm": 17.31830596923828, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8610167503356934, "num_tokens": 439789277.0, "step": 11526 }, { "epoch": 1.46635288131281, "ewc_loss": 0.028770241886377335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8770242352038622e-05, "grad_norm": 17.353519439697266, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.8856850266456604, "num_tokens": 439827524.0, "step": 11527 }, { "epoch": 1.4664800915914005, "ewc_loss": 0.02877722680568695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8777227271348238e-05, "grad_norm": 17.3568058013916, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8664727210998535, "num_tokens": 439865181.0, "step": 11528 }, { "epoch": 1.466607301869991, "ewc_loss": 0.028798971325159073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8798971470678225e-05, "grad_norm": 17.408401489257812, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8613780736923218, "num_tokens": 439900071.0, "step": 11529 }, { "epoch": 1.4667345121485815, "ewc_loss": 0.028819043189287186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8819044018746354e-05, "grad_norm": 17.371856689453125, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8689716458320618, "num_tokens": 439940839.0, "step": 11530 }, { "epoch": 1.466861722427172, "ewc_loss": 0.02877720631659031, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8777205443475395e-05, "grad_norm": 17.37450408935547, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8656649589538574, "num_tokens": 439978908.0, "step": 11531 }, { "epoch": 1.4669889327057626, "ewc_loss": 0.02883334644138813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8833346732426435e-05, "grad_norm": 17.375577926635742, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8678104877471924, "num_tokens": 440013448.0, "step": 11532 }, { "epoch": 1.467116142984353, "ewc_loss": 0.028809109702706337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.880911051761359e-05, "grad_norm": 17.291221618652344, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.872348427772522, "num_tokens": 440048377.0, "step": 11533 }, { "epoch": 1.4672433532629436, "ewc_loss": 0.0287917573004961, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8791757358703762e-05, "grad_norm": 17.3143253326416, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8634243011474609, "num_tokens": 440089806.0, "step": 11534 }, { "epoch": 1.4673705635415342, "ewc_loss": 0.02885584719479084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8855847631348297e-05, "grad_norm": 17.37259864807129, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8742681741714478, "num_tokens": 440124209.0, "step": 11535 }, { "epoch": 1.4674977738201247, "ewc_loss": 0.028814276680350304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.881427644751966e-05, "grad_norm": 17.390094757080078, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8697638511657715, "num_tokens": 440158349.0, "step": 11536 }, { "epoch": 1.4676249840987152, "ewc_loss": 0.02881500869989395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.881500950024929e-05, "grad_norm": 17.364593505859375, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8715473413467407, "num_tokens": 440195220.0, "step": 11537 }, { "epoch": 1.4677521943773058, "ewc_loss": 0.028800183907151222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.880018473661039e-05, "grad_norm": 17.389360427856445, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.868519127368927, "num_tokens": 440235158.0, "step": 11538 }, { "epoch": 1.4678794046558963, "ewc_loss": 0.02879900112748146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8799000574508682e-05, "grad_norm": 17.255481719970703, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8646055459976196, "num_tokens": 440271879.0, "step": 11539 }, { "epoch": 1.4680066149344868, "ewc_loss": 0.02880685217678547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8806851332774386e-05, "grad_norm": 17.393075942993164, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8677381873130798, "num_tokens": 440306454.0, "step": 11540 }, { "epoch": 1.4681338252130773, "ewc_loss": 0.028824815526604652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8824815672123805e-05, "grad_norm": 17.30698013305664, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8565071225166321, "num_tokens": 440341185.0, "step": 11541 }, { "epoch": 1.4682610354916679, "ewc_loss": 0.028792228549718857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.879222847695928e-05, "grad_norm": 17.28152847290039, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8676137328147888, "num_tokens": 440379168.0, "step": 11542 }, { "epoch": 1.4683882457702582, "ewc_loss": 0.028897050768136978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8897051379317418e-05, "grad_norm": 17.366092681884766, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8705556392669678, "num_tokens": 440417945.0, "step": 11543 }, { "epoch": 1.4685154560488487, "ewc_loss": 0.02891317941248417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8913178539369255e-05, "grad_norm": 17.32492446899414, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8537108898162842, "num_tokens": 440457864.0, "step": 11544 }, { "epoch": 1.4686426663274392, "ewc_loss": 0.028920980170369148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8920980184921063e-05, "grad_norm": 17.366451263427734, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8645538091659546, "num_tokens": 440491709.0, "step": 11545 }, { "epoch": 1.4687698766060298, "ewc_loss": 0.028875352814793587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.887535265472252e-05, "grad_norm": 17.27461814880371, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8755298852920532, "num_tokens": 440533570.0, "step": 11546 }, { "epoch": 1.4688970868846203, "ewc_loss": 0.028897810727357864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8897809897898696e-05, "grad_norm": 17.35779571533203, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8741753697395325, "num_tokens": 440568360.0, "step": 11547 }, { "epoch": 1.4690242971632108, "ewc_loss": 0.0288777407258749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8877740987809375e-05, "grad_norm": 17.317258834838867, "learning_rate": 1e-06, "loss": 0.37, "mean_token_accuracy": 0.8784593343734741, "num_tokens": 440606848.0, "step": 11548 }, { "epoch": 1.4691515074418013, "ewc_loss": 0.028889048844575882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.888904964493122e-05, "grad_norm": 17.354534149169922, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.8816728591918945, "num_tokens": 440638254.0, "step": 11549 }, { "epoch": 1.4692787177203919, "ewc_loss": 0.02889699675142765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.889699680963531e-05, "grad_norm": 17.37540626525879, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8746614456176758, "num_tokens": 440674122.0, "step": 11550 }, { "epoch": 1.4694059279989824, "ewc_loss": 0.028922151774168015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8922151614096947e-05, "grad_norm": 17.33527183532715, "learning_rate": 1e-06, "loss": 0.3653, "mean_token_accuracy": 0.8833459615707397, "num_tokens": 440718362.0, "step": 11551 }, { "epoch": 1.4695331382775727, "ewc_loss": 0.028862476348876953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8862476028734818e-05, "grad_norm": 17.36276626586914, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8762713670730591, "num_tokens": 440755971.0, "step": 11552 }, { "epoch": 1.4696603485561632, "ewc_loss": 0.02889132872223854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8891328838653862e-05, "grad_norm": 17.29098892211914, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.8664954304695129, "num_tokens": 440794328.0, "step": 11553 }, { "epoch": 1.4697875588347538, "ewc_loss": 0.02885078266263008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8850781745859422e-05, "grad_norm": 17.412302017211914, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8560380339622498, "num_tokens": 440833383.0, "step": 11554 }, { "epoch": 1.4699147691133443, "ewc_loss": 0.02889801748096943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.88980172626907e-05, "grad_norm": 17.337509155273438, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8749691247940063, "num_tokens": 440875502.0, "step": 11555 }, { "epoch": 1.4700419793919348, "ewc_loss": 0.028771402314305305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8771402867278084e-05, "grad_norm": 17.330219268798828, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8722271919250488, "num_tokens": 440915259.0, "step": 11556 }, { "epoch": 1.4701691896705253, "ewc_loss": 0.028870048001408577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.887004848162178e-05, "grad_norm": 17.445472717285156, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.86638343334198, "num_tokens": 440951660.0, "step": 11557 }, { "epoch": 1.4702963999491159, "ewc_loss": 0.028810016810894012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8810016374336556e-05, "grad_norm": 17.36687660217285, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8692716956138611, "num_tokens": 440992533.0, "step": 11558 }, { "epoch": 1.4704236102277064, "ewc_loss": 0.0287884883582592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.878848863474559e-05, "grad_norm": 17.3861083984375, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8575546741485596, "num_tokens": 441027360.0, "step": 11559 }, { "epoch": 1.470550820506297, "ewc_loss": 0.028795257210731506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8795257094316185e-05, "grad_norm": 17.397783279418945, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.8750545978546143, "num_tokens": 441071746.0, "step": 11560 }, { "epoch": 1.4706780307848875, "ewc_loss": 0.028753647580742836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8753647711710073e-05, "grad_norm": 17.395368576049805, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8679941892623901, "num_tokens": 441105378.0, "step": 11561 }, { "epoch": 1.470805241063478, "ewc_loss": 0.02873215638101101, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.873215635190718e-05, "grad_norm": 17.38243865966797, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8689742088317871, "num_tokens": 441150000.0, "step": 11562 }, { "epoch": 1.4709324513420685, "ewc_loss": 0.02876746468245983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8767464755219407e-05, "grad_norm": 17.390602111816406, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8619552850723267, "num_tokens": 441188426.0, "step": 11563 }, { "epoch": 1.471059661620659, "ewc_loss": 0.028751172125339508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8751172067131847e-05, "grad_norm": 17.43474769592285, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8763988018035889, "num_tokens": 441226028.0, "step": 11564 }, { "epoch": 1.4711868718992496, "ewc_loss": 0.028794990852475166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8794991521863267e-05, "grad_norm": 17.339033126831055, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8700269460678101, "num_tokens": 441265788.0, "step": 11565 }, { "epoch": 1.47131408217784, "ewc_loss": 0.0286557674407959, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.865576789190527e-05, "grad_norm": 17.403215408325195, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8779606819152832, "num_tokens": 441297280.0, "step": 11566 }, { "epoch": 1.4714412924564304, "ewc_loss": 0.028781989589333534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.878198938560672e-05, "grad_norm": 17.396434783935547, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8675501942634583, "num_tokens": 441331229.0, "step": 11567 }, { "epoch": 1.471568502735021, "ewc_loss": 0.028729580342769623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8729580662911758e-05, "grad_norm": 17.46258544921875, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8726340532302856, "num_tokens": 441371064.0, "step": 11568 }, { "epoch": 1.4716957130136115, "ewc_loss": 0.02872134931385517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8721349735860713e-05, "grad_norm": 17.379587173461914, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8507721424102783, "num_tokens": 441405706.0, "step": 11569 }, { "epoch": 1.471822923292202, "ewc_loss": 0.02867061458528042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8670614483417012e-05, "grad_norm": 17.442089080810547, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8601090312004089, "num_tokens": 441445966.0, "step": 11570 }, { "epoch": 1.4719501335707925, "ewc_loss": 0.02871725708246231, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8717257009702735e-05, "grad_norm": 17.420122146606445, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8525760173797607, "num_tokens": 441478916.0, "step": 11571 }, { "epoch": 1.472077343849383, "ewc_loss": 0.028682589530944824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8682588890660554e-05, "grad_norm": 17.32784652709961, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8659082651138306, "num_tokens": 441519245.0, "step": 11572 }, { "epoch": 1.4722045541279736, "ewc_loss": 0.028703598305583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.870359821827151e-05, "grad_norm": 17.412433624267578, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8612439632415771, "num_tokens": 441557485.0, "step": 11573 }, { "epoch": 1.472331764406564, "ewc_loss": 0.02876347117125988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.876347025448922e-05, "grad_norm": 17.359655380249023, "learning_rate": 1e-06, "loss": 0.3798, "mean_token_accuracy": 0.878750205039978, "num_tokens": 441596042.0, "step": 11574 }, { "epoch": 1.4724589746851546, "ewc_loss": 0.028717176988720894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.871717697416898e-05, "grad_norm": 17.343032836914062, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8784657716751099, "num_tokens": 441634391.0, "step": 11575 }, { "epoch": 1.472586184963745, "ewc_loss": 0.02880518138408661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.880518150050193e-05, "grad_norm": 17.439176559448242, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8606083989143372, "num_tokens": 441672201.0, "step": 11576 }, { "epoch": 1.4727133952423355, "ewc_loss": 0.028749976307153702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8749976991093718e-05, "grad_norm": 17.386859893798828, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8674496412277222, "num_tokens": 441713350.0, "step": 11577 }, { "epoch": 1.472840605520926, "ewc_loss": 0.028713399544358253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8713398933177814e-05, "grad_norm": 17.386030197143555, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8679117560386658, "num_tokens": 441754677.0, "step": 11578 }, { "epoch": 1.4729678157995165, "ewc_loss": 0.028785936534404755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8785936592612416e-05, "grad_norm": 17.436124801635742, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8583687543869019, "num_tokens": 441797037.0, "step": 11579 }, { "epoch": 1.473095026078107, "ewc_loss": 0.02879481576383114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8794815079891123e-05, "grad_norm": 17.404035568237305, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8665606379508972, "num_tokens": 441831661.0, "step": 11580 }, { "epoch": 1.4732222363566976, "ewc_loss": 0.02874414063990116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.874413985409774e-05, "grad_norm": 17.410024642944336, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8722518086433411, "num_tokens": 441876933.0, "step": 11581 }, { "epoch": 1.473349446635288, "ewc_loss": 0.028772860765457153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8772861696779728e-05, "grad_norm": 17.375587463378906, "learning_rate": 1e-06, "loss": 0.3521, "mean_token_accuracy": 0.8905034065246582, "num_tokens": 441918015.0, "step": 11582 }, { "epoch": 1.4734766569138786, "ewc_loss": 0.028714081272482872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8714081054204144e-05, "grad_norm": 17.275033950805664, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8639774322509766, "num_tokens": 441956573.0, "step": 11583 }, { "epoch": 1.4736038671924692, "ewc_loss": 0.02875461056828499, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.875460995710455e-05, "grad_norm": 17.34684944152832, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8824501037597656, "num_tokens": 442002064.0, "step": 11584 }, { "epoch": 1.4737310774710597, "ewc_loss": 0.028795506805181503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.879550629586447e-05, "grad_norm": 17.386180877685547, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.876190721988678, "num_tokens": 442040122.0, "step": 11585 }, { "epoch": 1.4738582877496502, "ewc_loss": 0.02876584231853485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8765842216671444e-05, "grad_norm": 17.356061935424805, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.870978057384491, "num_tokens": 442079698.0, "step": 11586 }, { "epoch": 1.4739854980282407, "ewc_loss": 0.028738748282194138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.873874836950563e-05, "grad_norm": 17.365345001220703, "learning_rate": 1e-06, "loss": 0.3637, "mean_token_accuracy": 0.8818390965461731, "num_tokens": 442118252.0, "step": 11587 }, { "epoch": 1.4741127083068313, "ewc_loss": 0.028734304010868073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8734304578392766e-05, "grad_norm": 17.317501068115234, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8719680309295654, "num_tokens": 442154675.0, "step": 11588 }, { "epoch": 1.4742399185854218, "ewc_loss": 0.028763551265001297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.876355210901238e-05, "grad_norm": 17.416423797607422, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8753362894058228, "num_tokens": 442189449.0, "step": 11589 }, { "epoch": 1.4743671288640123, "ewc_loss": 0.028745874762535095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.874587516998872e-05, "grad_norm": 17.452381134033203, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8703786134719849, "num_tokens": 442222844.0, "step": 11590 }, { "epoch": 1.4744943391426026, "ewc_loss": 0.028770385310053825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8770386052201502e-05, "grad_norm": 17.385875701904297, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8761325478553772, "num_tokens": 442258599.0, "step": 11591 }, { "epoch": 1.4746215494211932, "ewc_loss": 0.02867085672914982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8670856409007683e-05, "grad_norm": 17.35585594177246, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8756480813026428, "num_tokens": 442294775.0, "step": 11592 }, { "epoch": 1.4747487596997837, "ewc_loss": 0.028750281780958176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8750282581313513e-05, "grad_norm": 17.261703491210938, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.8799230456352234, "num_tokens": 442330313.0, "step": 11593 }, { "epoch": 1.4748759699783742, "ewc_loss": 0.028727050870656967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8727050448651426e-05, "grad_norm": 17.412593841552734, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.86796635389328, "num_tokens": 442364988.0, "step": 11594 }, { "epoch": 1.4750031802569648, "ewc_loss": 0.028750943019986153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8750942874467e-05, "grad_norm": 17.252113342285156, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.871473491191864, "num_tokens": 442406927.0, "step": 11595 }, { "epoch": 1.4751303905355553, "ewc_loss": 0.02866247668862343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8662476324825548e-05, "grad_norm": 17.335926055908203, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.872717559337616, "num_tokens": 442442092.0, "step": 11596 }, { "epoch": 1.4752576008141458, "ewc_loss": 0.028833573684096336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8833574106101878e-05, "grad_norm": 17.357080459594727, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8754761219024658, "num_tokens": 442483013.0, "step": 11597 }, { "epoch": 1.4753848110927363, "ewc_loss": 0.02880219556391239, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.880219471990131e-05, "grad_norm": 17.412261962890625, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8757530450820923, "num_tokens": 442525052.0, "step": 11598 }, { "epoch": 1.4755120213713269, "ewc_loss": 0.028801029548048973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8801028747693636e-05, "grad_norm": 17.36935806274414, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8612731695175171, "num_tokens": 442562543.0, "step": 11599 }, { "epoch": 1.4756392316499174, "ewc_loss": 0.028770310804247856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8770311473635957e-05, "grad_norm": 17.364166259765625, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8757555484771729, "num_tokens": 442604976.0, "step": 11600 }, { "epoch": 1.4757664419285077, "ewc_loss": 0.028773460537195206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8773460144293495e-05, "grad_norm": 17.346294403076172, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8801522850990295, "num_tokens": 442642271.0, "step": 11601 }, { "epoch": 1.4758936522070982, "ewc_loss": 0.028737636283040047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8737636966980062e-05, "grad_norm": 17.359556198120117, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8670328855514526, "num_tokens": 442680508.0, "step": 11602 }, { "epoch": 1.4760208624856888, "ewc_loss": 0.028776632621884346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.877663246181328e-05, "grad_norm": 17.327594757080078, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8574559688568115, "num_tokens": 442718391.0, "step": 11603 }, { "epoch": 1.4761480727642793, "ewc_loss": 0.028785977512598038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8785976610379294e-05, "grad_norm": 17.330129623413086, "learning_rate": 1e-06, "loss": 0.3711, "mean_token_accuracy": 0.8798066973686218, "num_tokens": 442757688.0, "step": 11604 }, { "epoch": 1.4762752830428698, "ewc_loss": 0.02880181558430195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.880181637010537e-05, "grad_norm": 17.327165603637695, "learning_rate": 1e-06, "loss": 0.3605, "mean_token_accuracy": 0.882668137550354, "num_tokens": 442796704.0, "step": 11605 }, { "epoch": 1.4764024933214603, "ewc_loss": 0.02882135845720768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8821357773267664e-05, "grad_norm": 17.40439796447754, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.8795924782752991, "num_tokens": 442838791.0, "step": 11606 }, { "epoch": 1.4765297036000509, "ewc_loss": 0.028819644823670387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8819644285249524e-05, "grad_norm": 17.348417282104492, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.870952844619751, "num_tokens": 442882368.0, "step": 11607 }, { "epoch": 1.4766569138786414, "ewc_loss": 0.028780387714505196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8780386855942197e-05, "grad_norm": 17.387361526489258, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8697640299797058, "num_tokens": 442923768.0, "step": 11608 }, { "epoch": 1.476784124157232, "ewc_loss": 0.028812134638428688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8812135496991687e-05, "grad_norm": 17.378355026245117, "learning_rate": 1e-06, "loss": 0.3511, "mean_token_accuracy": 0.8830020427703857, "num_tokens": 442955656.0, "step": 11609 }, { "epoch": 1.4769113344358225, "ewc_loss": 0.028720136731863022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8720136469928548e-05, "grad_norm": 17.309202194213867, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.860733151435852, "num_tokens": 442997323.0, "step": 11610 }, { "epoch": 1.477038544714413, "ewc_loss": 0.0287676639854908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8767664844053797e-05, "grad_norm": 17.365840911865234, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8551289439201355, "num_tokens": 443033454.0, "step": 11611 }, { "epoch": 1.4771657549930035, "ewc_loss": 0.028796223923563957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.879622479667887e-05, "grad_norm": 17.357013702392578, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.86962890625, "num_tokens": 443077902.0, "step": 11612 }, { "epoch": 1.477292965271594, "ewc_loss": 0.02878311462700367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8783115340047516e-05, "grad_norm": 17.321779251098633, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8602323532104492, "num_tokens": 443116515.0, "step": 11613 }, { "epoch": 1.4774201755501846, "ewc_loss": 0.028755351901054382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8755352104781196e-05, "grad_norm": 17.357206344604492, "learning_rate": 1e-06, "loss": 0.3229, "mean_token_accuracy": 0.8949582576751709, "num_tokens": 443154124.0, "step": 11614 }, { "epoch": 1.477547385828775, "ewc_loss": 0.02876843325793743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8768432457582094e-05, "grad_norm": 17.326213836669922, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.878204882144928, "num_tokens": 443195852.0, "step": 11615 }, { "epoch": 1.4776745961073654, "ewc_loss": 0.02877550758421421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8775508326361887e-05, "grad_norm": 17.427627563476562, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8632371425628662, "num_tokens": 443229850.0, "step": 11616 }, { "epoch": 1.477801806385956, "ewc_loss": 0.028773417696356773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8773418307537213e-05, "grad_norm": 17.367008209228516, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8612602353096008, "num_tokens": 443274481.0, "step": 11617 }, { "epoch": 1.4779290166645465, "ewc_loss": 0.02869555354118347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8695552828139625e-05, "grad_norm": 17.29121971130371, "learning_rate": 1e-06, "loss": 0.3795, "mean_token_accuracy": 0.8754253387451172, "num_tokens": 443311737.0, "step": 11618 }, { "epoch": 1.478056226943137, "ewc_loss": 0.028763270005583763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.876327016565483e-05, "grad_norm": 17.445981979370117, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8619742393493652, "num_tokens": 443345310.0, "step": 11619 }, { "epoch": 1.4781834372217275, "ewc_loss": 0.0287923701107502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8792370358132757e-05, "grad_norm": 17.32927703857422, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8680678606033325, "num_tokens": 443392869.0, "step": 11620 }, { "epoch": 1.478310647500318, "ewc_loss": 0.028747573494911194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8747574106091633e-05, "grad_norm": 17.3829402923584, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8684183955192566, "num_tokens": 443432842.0, "step": 11621 }, { "epoch": 1.4784378577789086, "ewc_loss": 0.02878354676067829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8783546440536156e-05, "grad_norm": 17.39605712890625, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8710368871688843, "num_tokens": 443465954.0, "step": 11622 }, { "epoch": 1.478565068057499, "ewc_loss": 0.028755538165569305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8755537641700357e-05, "grad_norm": 17.36835479736328, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8611719608306885, "num_tokens": 443508506.0, "step": 11623 }, { "epoch": 1.4786922783360896, "ewc_loss": 0.028776295483112335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8776295948773623e-05, "grad_norm": 17.418991088867188, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8684381246566772, "num_tokens": 443543575.0, "step": 11624 }, { "epoch": 1.47881948861468, "ewc_loss": 0.02876066416501999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.876066355383955e-05, "grad_norm": 17.35196876525879, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8727887272834778, "num_tokens": 443578504.0, "step": 11625 }, { "epoch": 1.4789466988932705, "ewc_loss": 0.02870062366127968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.870062417059671e-05, "grad_norm": 17.312633514404297, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8823113441467285, "num_tokens": 443614647.0, "step": 11626 }, { "epoch": 1.479073909171861, "ewc_loss": 0.028810348361730576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8810347430408e-05, "grad_norm": 17.454715728759766, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8653868436813354, "num_tokens": 443649244.0, "step": 11627 }, { "epoch": 1.4792011194504515, "ewc_loss": 0.02885282412171364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8852824470959604e-05, "grad_norm": 17.370134353637695, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8654983043670654, "num_tokens": 443688374.0, "step": 11628 }, { "epoch": 1.479328329729042, "ewc_loss": 0.02875441126525402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8754411687259562e-05, "grad_norm": 17.382511138916016, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8394807577133179, "num_tokens": 443730929.0, "step": 11629 }, { "epoch": 1.4794555400076326, "ewc_loss": 0.0288521870970726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8852187824668363e-05, "grad_norm": 17.443159103393555, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8658366203308105, "num_tokens": 443763067.0, "step": 11630 }, { "epoch": 1.479582750286223, "ewc_loss": 0.028824953362345695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8824953915318474e-05, "grad_norm": 17.3731746673584, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8577085137367249, "num_tokens": 443801273.0, "step": 11631 }, { "epoch": 1.4797099605648136, "ewc_loss": 0.028770485892891884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8770486096618697e-05, "grad_norm": 17.370527267456055, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8568604588508606, "num_tokens": 443840394.0, "step": 11632 }, { "epoch": 1.4798371708434042, "ewc_loss": 0.028888268396258354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.88882692984771e-05, "grad_norm": 17.372150421142578, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8650924563407898, "num_tokens": 443877486.0, "step": 11633 }, { "epoch": 1.4799643811219947, "ewc_loss": 0.028849322348833084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8849322916357778e-05, "grad_norm": 17.420337677001953, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8688629865646362, "num_tokens": 443913763.0, "step": 11634 }, { "epoch": 1.4800915914005852, "ewc_loss": 0.02889658696949482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8896587537019514e-05, "grad_norm": 17.3443546295166, "learning_rate": 1e-06, "loss": 0.3811, "mean_token_accuracy": 0.8765380382537842, "num_tokens": 443946840.0, "step": 11635 }, { "epoch": 1.4802188016791757, "ewc_loss": 0.028860092163085938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.886009133362677e-05, "grad_norm": 17.355140686035156, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8543055057525635, "num_tokens": 443992439.0, "step": 11636 }, { "epoch": 1.4803460119577663, "ewc_loss": 0.028915317729115486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8915317670907825e-05, "grad_norm": 17.383264541625977, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.879056990146637, "num_tokens": 444029348.0, "step": 11637 }, { "epoch": 1.4804732222363568, "ewc_loss": 0.02889345958828926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8893458875245415e-05, "grad_norm": 17.409278869628906, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.8763909339904785, "num_tokens": 444068386.0, "step": 11638 }, { "epoch": 1.4806004325149473, "ewc_loss": 0.028850499540567398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8850499802501872e-05, "grad_norm": 17.349138259887695, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8580200672149658, "num_tokens": 444110307.0, "step": 11639 }, { "epoch": 1.4807276427935376, "ewc_loss": 0.028861841186881065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.886184120143298e-05, "grad_norm": 17.436628341674805, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8696563839912415, "num_tokens": 444153919.0, "step": 11640 }, { "epoch": 1.4808548530721282, "ewc_loss": 0.02885923720896244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8859236408607103e-05, "grad_norm": 17.362606048583984, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8710668087005615, "num_tokens": 444190575.0, "step": 11641 }, { "epoch": 1.4809820633507187, "ewc_loss": 0.028835266828536987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.883526758523658e-05, "grad_norm": 17.403568267822266, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8631693124771118, "num_tokens": 444226108.0, "step": 11642 }, { "epoch": 1.4811092736293092, "ewc_loss": 0.02885722741484642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8857228244305588e-05, "grad_norm": 17.337697982788086, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8655489683151245, "num_tokens": 444268162.0, "step": 11643 }, { "epoch": 1.4812364839078997, "ewc_loss": 0.028844987973570824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8844988264609128e-05, "grad_norm": 17.37314224243164, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8551938533782959, "num_tokens": 444309273.0, "step": 11644 }, { "epoch": 1.4813636941864903, "ewc_loss": 0.028811035677790642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.881103500840254e-05, "grad_norm": 17.30719566345215, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.8732450008392334, "num_tokens": 444338697.0, "step": 11645 }, { "epoch": 1.4814909044650808, "ewc_loss": 0.0288715697824955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8871569156763144e-05, "grad_norm": 17.408353805541992, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8788718581199646, "num_tokens": 444377954.0, "step": 11646 }, { "epoch": 1.4816181147436713, "ewc_loss": 0.028864078223705292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8864078558399342e-05, "grad_norm": 17.340852737426758, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.869676411151886, "num_tokens": 444416996.0, "step": 11647 }, { "epoch": 1.4817453250222619, "ewc_loss": 0.028873974457383156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.887397386075463e-05, "grad_norm": 17.371288299560547, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8757809400558472, "num_tokens": 444455550.0, "step": 11648 }, { "epoch": 1.4818725353008524, "ewc_loss": 0.02886282280087471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8862823455710895e-05, "grad_norm": 17.41323471069336, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8649739027023315, "num_tokens": 444493979.0, "step": 11649 }, { "epoch": 1.4819997455794427, "ewc_loss": 0.028873251751065254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8873251721961424e-05, "grad_norm": 17.365711212158203, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8678027391433716, "num_tokens": 444540622.0, "step": 11650 }, { "epoch": 1.4821269558580332, "ewc_loss": 0.028855491429567337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8855491109425202e-05, "grad_norm": 17.475997924804688, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8661211729049683, "num_tokens": 444577772.0, "step": 11651 }, { "epoch": 1.4822541661366238, "ewc_loss": 0.028869135305285454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.88691353489412e-05, "grad_norm": 17.327539443969727, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8554261922836304, "num_tokens": 444615523.0, "step": 11652 }, { "epoch": 1.4823813764152143, "ewc_loss": 0.028753232210874557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8753232982126065e-05, "grad_norm": 17.36399269104004, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8683009147644043, "num_tokens": 444659785.0, "step": 11653 }, { "epoch": 1.4825085866938048, "ewc_loss": 0.028854811564087868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8854810807388276e-05, "grad_norm": 17.281005859375, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8651426434516907, "num_tokens": 444700806.0, "step": 11654 }, { "epoch": 1.4826357969723953, "ewc_loss": 0.028811683878302574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8811684387619607e-05, "grad_norm": 17.399608612060547, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8697141408920288, "num_tokens": 444734320.0, "step": 11655 }, { "epoch": 1.4827630072509859, "ewc_loss": 0.028894392773509026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8894392016809434e-05, "grad_norm": 17.383363723754883, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8629734516143799, "num_tokens": 444770215.0, "step": 11656 }, { "epoch": 1.4828902175295764, "ewc_loss": 0.02880892902612686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8808928618673235e-05, "grad_norm": 17.45220375061035, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8671343326568604, "num_tokens": 444805432.0, "step": 11657 }, { "epoch": 1.483017427808167, "ewc_loss": 0.028868291527032852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8868291337857954e-05, "grad_norm": 17.368135452270508, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.868583083152771, "num_tokens": 444832398.0, "step": 11658 }, { "epoch": 1.4831446380867574, "ewc_loss": 0.028882676735520363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8882675906061195e-05, "grad_norm": 17.38628387451172, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8722158670425415, "num_tokens": 444876128.0, "step": 11659 }, { "epoch": 1.483271848365348, "ewc_loss": 0.02895927242934704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8959271730855107e-05, "grad_norm": 17.38820457458496, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8636831045150757, "num_tokens": 444915188.0, "step": 11660 }, { "epoch": 1.4833990586439385, "ewc_loss": 0.028846489265561104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8846488930867054e-05, "grad_norm": 17.3265438079834, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8741538524627686, "num_tokens": 444953528.0, "step": 11661 }, { "epoch": 1.483526268922529, "ewc_loss": 0.0289307814091444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.893078089982737e-05, "grad_norm": 17.400245666503906, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.861927330493927, "num_tokens": 444996469.0, "step": 11662 }, { "epoch": 1.4836534792011196, "ewc_loss": 0.028885794803500175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8885795472888276e-05, "grad_norm": 17.34283447265625, "learning_rate": 1e-06, "loss": 0.3615, "mean_token_accuracy": 0.8850544095039368, "num_tokens": 445034298.0, "step": 11663 }, { "epoch": 1.48378068947971, "ewc_loss": 0.028871795162558556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8871794711449184e-05, "grad_norm": 17.40934944152832, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8672449588775635, "num_tokens": 445079010.0, "step": 11664 }, { "epoch": 1.4839078997583004, "ewc_loss": 0.028923526406288147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8923526770086028e-05, "grad_norm": 17.357885360717773, "learning_rate": 1e-06, "loss": 0.3658, "mean_token_accuracy": 0.8817639946937561, "num_tokens": 445116009.0, "step": 11665 }, { "epoch": 1.484035110036891, "ewc_loss": 0.028842994943261147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8842994652222842e-05, "grad_norm": 17.35001564025879, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.8815358877182007, "num_tokens": 445156313.0, "step": 11666 }, { "epoch": 1.4841623203154815, "ewc_loss": 0.028874855488538742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.887485607061535e-05, "grad_norm": 17.3736515045166, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8696266412734985, "num_tokens": 445187424.0, "step": 11667 }, { "epoch": 1.484289530594072, "ewc_loss": 0.02889060042798519, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8890601242892444e-05, "grad_norm": 17.450809478759766, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8702445030212402, "num_tokens": 445233448.0, "step": 11668 }, { "epoch": 1.4844167408726625, "ewc_loss": 0.028922908008098602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8922908313688822e-05, "grad_norm": 17.393096923828125, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.8797421455383301, "num_tokens": 445273663.0, "step": 11669 }, { "epoch": 1.484543951151253, "ewc_loss": 0.02882305346429348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.882305307139177e-05, "grad_norm": 17.392332077026367, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8749098777770996, "num_tokens": 445313790.0, "step": 11670 }, { "epoch": 1.4846711614298436, "ewc_loss": 0.028794167563319206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.879416751966346e-05, "grad_norm": 17.34906768798828, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8798558115959167, "num_tokens": 445351818.0, "step": 11671 }, { "epoch": 1.484798371708434, "ewc_loss": 0.02883993461728096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8839935112046078e-05, "grad_norm": 17.362178802490234, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.843826413154602, "num_tokens": 445391763.0, "step": 11672 }, { "epoch": 1.4849255819870246, "ewc_loss": 0.028828807175159454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8828806534875184e-05, "grad_norm": 17.389944076538086, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8624464273452759, "num_tokens": 445426723.0, "step": 11673 }, { "epoch": 1.485052792265615, "ewc_loss": 0.0288707222789526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.887072150770109e-05, "grad_norm": 17.32201385498047, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8744914531707764, "num_tokens": 445465635.0, "step": 11674 }, { "epoch": 1.4851800025442055, "ewc_loss": 0.02883584424853325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8835844204877503e-05, "grad_norm": 17.467409133911133, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8702684640884399, "num_tokens": 445502819.0, "step": 11675 }, { "epoch": 1.485307212822796, "ewc_loss": 0.02888987772166729, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8889877285109833e-05, "grad_norm": 17.301727294921875, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.8784212470054626, "num_tokens": 445541861.0, "step": 11676 }, { "epoch": 1.4854344231013865, "ewc_loss": 0.028796106576919556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.879610656236764e-05, "grad_norm": 17.433256149291992, "learning_rate": 1e-06, "loss": 0.3588, "mean_token_accuracy": 0.8848263621330261, "num_tokens": 445578058.0, "step": 11677 }, { "epoch": 1.485561633379977, "ewc_loss": 0.0289227943867445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.89227937173564e-05, "grad_norm": 17.402019500732422, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8537005186080933, "num_tokens": 445612896.0, "step": 11678 }, { "epoch": 1.4856888436585676, "ewc_loss": 0.028808917850255966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8808917704736814e-05, "grad_norm": 17.34837532043457, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8625137805938721, "num_tokens": 445647849.0, "step": 11679 }, { "epoch": 1.485816053937158, "ewc_loss": 0.028919238597154617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.891923941206187e-05, "grad_norm": 17.4514102935791, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8545658588409424, "num_tokens": 445689199.0, "step": 11680 }, { "epoch": 1.4859432642157486, "ewc_loss": 0.02890489064157009, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.89048912236467e-05, "grad_norm": 17.380781173706055, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8675289154052734, "num_tokens": 445727869.0, "step": 11681 }, { "epoch": 1.4860704744943392, "ewc_loss": 0.02882523648440838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8825235858676024e-05, "grad_norm": 17.347978591918945, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.872702956199646, "num_tokens": 445770105.0, "step": 11682 }, { "epoch": 1.4861976847729297, "ewc_loss": 0.028891174122691154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.889117422455456e-05, "grad_norm": 17.400554656982422, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8486177325248718, "num_tokens": 445812950.0, "step": 11683 }, { "epoch": 1.4863248950515202, "ewc_loss": 0.028844241052865982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.884424065996427e-05, "grad_norm": 17.334938049316406, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8669870495796204, "num_tokens": 445850811.0, "step": 11684 }, { "epoch": 1.4864521053301107, "ewc_loss": 0.028879540041089058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8879539968329482e-05, "grad_norm": 17.37755584716797, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8724794387817383, "num_tokens": 445891967.0, "step": 11685 }, { "epoch": 1.4865793156087013, "ewc_loss": 0.028877755627036095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8877755539724603e-05, "grad_norm": 17.3447265625, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8708794116973877, "num_tokens": 445927936.0, "step": 11686 }, { "epoch": 1.4867065258872918, "ewc_loss": 0.028843527659773827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.884352761611808e-05, "grad_norm": 17.394805908203125, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.8808274269104004, "num_tokens": 445962481.0, "step": 11687 }, { "epoch": 1.4868337361658823, "ewc_loss": 0.02888723835349083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8887237931485288e-05, "grad_norm": 17.32392692565918, "learning_rate": 1e-06, "loss": 0.3577, "mean_token_accuracy": 0.8824713826179504, "num_tokens": 446001445.0, "step": 11688 }, { "epoch": 1.4869609464444726, "ewc_loss": 0.028865450993180275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.886545189539902e-05, "grad_norm": 17.378803253173828, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8870731592178345, "num_tokens": 446037904.0, "step": 11689 }, { "epoch": 1.4870881567230632, "ewc_loss": 0.028866520151495934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.88665196421789e-05, "grad_norm": 17.37746810913086, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8782833814620972, "num_tokens": 446077583.0, "step": 11690 }, { "epoch": 1.4872153670016537, "ewc_loss": 0.028819944709539413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.881994441850111e-05, "grad_norm": 17.33970069885254, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.867827296257019, "num_tokens": 446110123.0, "step": 11691 }, { "epoch": 1.4873425772802442, "ewc_loss": 0.02887175790965557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8871758331661113e-05, "grad_norm": 17.370986938476562, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.866528332233429, "num_tokens": 446149832.0, "step": 11692 }, { "epoch": 1.4874697875588347, "ewc_loss": 0.02889821119606495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.889821189455688e-05, "grad_norm": 17.406415939331055, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8760212063789368, "num_tokens": 446184211.0, "step": 11693 }, { "epoch": 1.4875969978374253, "ewc_loss": 0.028893066570162773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.889306597353425e-05, "grad_norm": 17.32646369934082, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8632811307907104, "num_tokens": 446229620.0, "step": 11694 }, { "epoch": 1.4877242081160158, "ewc_loss": 0.028860952705144882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.886095353460405e-05, "grad_norm": 17.429224014282227, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8704416751861572, "num_tokens": 446267134.0, "step": 11695 }, { "epoch": 1.4878514183946063, "ewc_loss": 0.028927670791745186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8927670427947305e-05, "grad_norm": 17.324787139892578, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8581210374832153, "num_tokens": 446303001.0, "step": 11696 }, { "epoch": 1.4879786286731969, "ewc_loss": 0.02889874018728733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8898739401483908e-05, "grad_norm": 17.412694931030273, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8678960800170898, "num_tokens": 446334286.0, "step": 11697 }, { "epoch": 1.4881058389517874, "ewc_loss": 0.028918921947479248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.891892108891625e-05, "grad_norm": 17.373388290405273, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8862804174423218, "num_tokens": 446369547.0, "step": 11698 }, { "epoch": 1.4882330492303777, "ewc_loss": 0.02888648211956024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8886481231893413e-05, "grad_norm": 17.339265823364258, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8550411462783813, "num_tokens": 446408179.0, "step": 11699 }, { "epoch": 1.4883602595089682, "ewc_loss": 0.028934495523571968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.893449527618941e-05, "grad_norm": 17.39725685119629, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8714867830276489, "num_tokens": 446448533.0, "step": 11700 }, { "epoch": 1.4884874697875587, "ewc_loss": 0.028896935284137726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.889693496399559e-05, "grad_norm": 17.357730865478516, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8669805526733398, "num_tokens": 446489445.0, "step": 11701 }, { "epoch": 1.4886146800661493, "ewc_loss": 0.028877748176455498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.887774826376699e-05, "grad_norm": 17.368083953857422, "learning_rate": 1e-06, "loss": 0.3673, "mean_token_accuracy": 0.8828508257865906, "num_tokens": 446526530.0, "step": 11702 }, { "epoch": 1.4887418903447398, "ewc_loss": 0.02889489382505417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8894894057884812e-05, "grad_norm": 17.389904022216797, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8697067499160767, "num_tokens": 446558315.0, "step": 11703 }, { "epoch": 1.4888691006233303, "ewc_loss": 0.028920982033014297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8920982003910467e-05, "grad_norm": 17.362485885620117, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8727387189865112, "num_tokens": 446598548.0, "step": 11704 }, { "epoch": 1.4889963109019209, "ewc_loss": 0.02889629453420639, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8896294679725543e-05, "grad_norm": 17.424535751342773, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8649091720581055, "num_tokens": 446632111.0, "step": 11705 }, { "epoch": 1.4891235211805114, "ewc_loss": 0.02893700823187828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.893700911954511e-05, "grad_norm": 17.36258888244629, "learning_rate": 1e-06, "loss": 0.3781, "mean_token_accuracy": 0.8773548603057861, "num_tokens": 446669296.0, "step": 11706 }, { "epoch": 1.489250731459102, "ewc_loss": 0.028921468183398247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8921467674081214e-05, "grad_norm": 17.44259262084961, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8719435930252075, "num_tokens": 446707891.0, "step": 11707 }, { "epoch": 1.4893779417376924, "ewc_loss": 0.028940847143530846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.894084718718659e-05, "grad_norm": 17.37232208251953, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8581148982048035, "num_tokens": 446749455.0, "step": 11708 }, { "epoch": 1.489505152016283, "ewc_loss": 0.028866160660982132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8866161301266402e-05, "grad_norm": 17.426973342895508, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.8815464973449707, "num_tokens": 446787001.0, "step": 11709 }, { "epoch": 1.4896323622948735, "ewc_loss": 0.02895750291645527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.895750367315486e-05, "grad_norm": 17.413158416748047, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8695259094238281, "num_tokens": 446827262.0, "step": 11710 }, { "epoch": 1.489759572573464, "ewc_loss": 0.028894921764731407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8894921342725866e-05, "grad_norm": 17.42674446105957, "learning_rate": 1e-06, "loss": 0.3583, "mean_token_accuracy": 0.8851646780967712, "num_tokens": 446860514.0, "step": 11711 }, { "epoch": 1.4898867828520546, "ewc_loss": 0.02891385741531849, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8913857022416778e-05, "grad_norm": 17.4318790435791, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8606917262077332, "num_tokens": 446902605.0, "step": 11712 }, { "epoch": 1.490013993130645, "ewc_loss": 0.028918379917740822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8918379030073993e-05, "grad_norm": 17.4040584564209, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8757405877113342, "num_tokens": 446938638.0, "step": 11713 }, { "epoch": 1.4901412034092354, "ewc_loss": 0.02892150543630123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8921505872858688e-05, "grad_norm": 17.36482810974121, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8751441240310669, "num_tokens": 446979286.0, "step": 11714 }, { "epoch": 1.490268413687826, "ewc_loss": 0.028869932517409325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8869932066299953e-05, "grad_norm": 17.42479705810547, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8545179963111877, "num_tokens": 447017405.0, "step": 11715 }, { "epoch": 1.4903956239664164, "ewc_loss": 0.0288948193192482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8894819479319267e-05, "grad_norm": 17.360586166381836, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8718505501747131, "num_tokens": 447056733.0, "step": 11716 }, { "epoch": 1.490522834245007, "ewc_loss": 0.02886955626308918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.886955553549342e-05, "grad_norm": 17.34270477294922, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8655685186386108, "num_tokens": 447094493.0, "step": 11717 }, { "epoch": 1.4906500445235975, "ewc_loss": 0.02889278717339039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8892787668155506e-05, "grad_norm": 17.343158721923828, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.8843115568161011, "num_tokens": 447134039.0, "step": 11718 }, { "epoch": 1.490777254802188, "ewc_loss": 0.02890702895820141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8907028536195867e-05, "grad_norm": 17.329147338867188, "learning_rate": 1e-06, "loss": 0.3761, "mean_token_accuracy": 0.8789823055267334, "num_tokens": 447168320.0, "step": 11719 }, { "epoch": 1.4909044650807786, "ewc_loss": 0.02892608568072319, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8926086088176817e-05, "grad_norm": 17.37601661682129, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.854479193687439, "num_tokens": 447205640.0, "step": 11720 }, { "epoch": 1.491031675359369, "ewc_loss": 0.028944697231054306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8944697987753898e-05, "grad_norm": 17.35193634033203, "learning_rate": 1e-06, "loss": 0.3621, "mean_token_accuracy": 0.8831447958946228, "num_tokens": 447239831.0, "step": 11721 }, { "epoch": 1.4911588856379596, "ewc_loss": 0.02892863005399704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8928630854352377e-05, "grad_norm": 17.41200065612793, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8551321029663086, "num_tokens": 447281100.0, "step": 11722 }, { "epoch": 1.49128609591655, "ewc_loss": 0.02893471159040928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.893471173592843e-05, "grad_norm": 17.34112548828125, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8509486317634583, "num_tokens": 447316137.0, "step": 11723 }, { "epoch": 1.4914133061951405, "ewc_loss": 0.028942620381712914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.894262070185505e-05, "grad_norm": 17.37691307067871, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8620719909667969, "num_tokens": 447357922.0, "step": 11724 }, { "epoch": 1.491540516473731, "ewc_loss": 0.02897721529006958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8977216061321087e-05, "grad_norm": 17.421520233154297, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8662232160568237, "num_tokens": 447394905.0, "step": 11725 }, { "epoch": 1.4916677267523215, "ewc_loss": 0.028907226398587227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8907226806040853e-05, "grad_norm": 17.34926986694336, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8797227740287781, "num_tokens": 447435379.0, "step": 11726 }, { "epoch": 1.491794937030912, "ewc_loss": 0.028915245085954666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8915244911331683e-05, "grad_norm": 17.444202423095703, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8604394197463989, "num_tokens": 447473058.0, "step": 11727 }, { "epoch": 1.4919221473095026, "ewc_loss": 0.028932511806488037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8932512577739544e-05, "grad_norm": 17.36665153503418, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8790596723556519, "num_tokens": 447510966.0, "step": 11728 }, { "epoch": 1.492049357588093, "ewc_loss": 0.028914831578731537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.891483200073708e-05, "grad_norm": 17.437063217163086, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8715806603431702, "num_tokens": 447554747.0, "step": 11729 }, { "epoch": 1.4921765678666836, "ewc_loss": 0.02899821288883686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8998212656006217e-05, "grad_norm": 17.364791870117188, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8718607425689697, "num_tokens": 447594426.0, "step": 11730 }, { "epoch": 1.4923037781452742, "ewc_loss": 0.02892649546265602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8926495360792615e-05, "grad_norm": 17.37464714050293, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8619050979614258, "num_tokens": 447634786.0, "step": 11731 }, { "epoch": 1.4924309884238647, "ewc_loss": 0.028959207236766815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8959208066225983e-05, "grad_norm": 17.409387588500977, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8672606945037842, "num_tokens": 447668682.0, "step": 11732 }, { "epoch": 1.4925581987024552, "ewc_loss": 0.028956029564142227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.895603029173799e-05, "grad_norm": 17.42751693725586, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8612766861915588, "num_tokens": 447705566.0, "step": 11733 }, { "epoch": 1.4926854089810457, "ewc_loss": 0.02892555668950081, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8925556762260385e-05, "grad_norm": 17.37906265258789, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8665487170219421, "num_tokens": 447746385.0, "step": 11734 }, { "epoch": 1.4928126192596363, "ewc_loss": 0.028937673196196556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8937673050677404e-05, "grad_norm": 17.423065185546875, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8737404346466064, "num_tokens": 447790483.0, "step": 11735 }, { "epoch": 1.4929398295382268, "ewc_loss": 0.028918512165546417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8918511816300452e-05, "grad_norm": 17.45879364013672, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8671289682388306, "num_tokens": 447826872.0, "step": 11736 }, { "epoch": 1.4930670398168173, "ewc_loss": 0.02889092266559601, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8890923204016872e-05, "grad_norm": 17.369800567626953, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8747114539146423, "num_tokens": 447864071.0, "step": 11737 }, { "epoch": 1.4931942500954076, "ewc_loss": 0.028862008824944496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8862008548458107e-05, "grad_norm": 17.470186233520508, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8532191514968872, "num_tokens": 447903320.0, "step": 11738 }, { "epoch": 1.4933214603739982, "ewc_loss": 0.028934694826602936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.89346953650238e-05, "grad_norm": 17.432537078857422, "learning_rate": 1e-06, "loss": 0.3721, "mean_token_accuracy": 0.8833630681037903, "num_tokens": 447940253.0, "step": 11739 }, { "epoch": 1.4934486706525887, "ewc_loss": 0.028833016753196716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8833017495344393e-05, "grad_norm": 17.388540267944336, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8744985461235046, "num_tokens": 447978277.0, "step": 11740 }, { "epoch": 1.4935758809311792, "ewc_loss": 0.028864553198218346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8864553314633667e-05, "grad_norm": 17.39515495300293, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8583173155784607, "num_tokens": 448018798.0, "step": 11741 }, { "epoch": 1.4937030912097697, "ewc_loss": 0.028890440240502357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.889043935283553e-05, "grad_norm": 17.3553409576416, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8597817420959473, "num_tokens": 448057469.0, "step": 11742 }, { "epoch": 1.4938303014883603, "ewc_loss": 0.028814638033509254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8814638426410966e-05, "grad_norm": 17.350786209106445, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8569109439849854, "num_tokens": 448089032.0, "step": 11743 }, { "epoch": 1.4939575117669508, "ewc_loss": 0.028883039951324463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8883039703941904e-05, "grad_norm": 17.396854400634766, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8641785383224487, "num_tokens": 448122438.0, "step": 11744 }, { "epoch": 1.4940847220455413, "ewc_loss": 0.028936682268977165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.893668170145247e-05, "grad_norm": 17.39674949645996, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8681046962738037, "num_tokens": 448166844.0, "step": 11745 }, { "epoch": 1.4942119323241319, "ewc_loss": 0.028848085552453995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8848086003563367e-05, "grad_norm": 17.38118553161621, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8594791889190674, "num_tokens": 448197766.0, "step": 11746 }, { "epoch": 1.4943391426027224, "ewc_loss": 0.028947805985808372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8947806640644558e-05, "grad_norm": 17.46805763244629, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8559092283248901, "num_tokens": 448240850.0, "step": 11747 }, { "epoch": 1.4944663528813127, "ewc_loss": 0.02896481193602085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8964812372578308e-05, "grad_norm": 17.433679580688477, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8774043321609497, "num_tokens": 448274187.0, "step": 11748 }, { "epoch": 1.4945935631599032, "ewc_loss": 0.028872407972812653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.887240771087818e-05, "grad_norm": 17.37586784362793, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8750320076942444, "num_tokens": 448311069.0, "step": 11749 }, { "epoch": 1.4947207734384937, "ewc_loss": 0.02893134579062462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.893134660553187e-05, "grad_norm": 17.381729125976562, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8687782287597656, "num_tokens": 448351536.0, "step": 11750 }, { "epoch": 1.4948479837170843, "ewc_loss": 0.02892446704208851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.892446718760766e-05, "grad_norm": 17.357769012451172, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8659306764602661, "num_tokens": 448385108.0, "step": 11751 }, { "epoch": 1.4949751939956748, "ewc_loss": 0.028973259031772614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8973259759368375e-05, "grad_norm": 17.406770706176758, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8803327679634094, "num_tokens": 448423272.0, "step": 11752 }, { "epoch": 1.4951024042742653, "ewc_loss": 0.028975529596209526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8975529858144e-05, "grad_norm": 17.32924461364746, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8742333054542542, "num_tokens": 448462904.0, "step": 11753 }, { "epoch": 1.4952296145528559, "ewc_loss": 0.02896900847554207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8969008781132288e-05, "grad_norm": 17.311126708984375, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8655798435211182, "num_tokens": 448503257.0, "step": 11754 }, { "epoch": 1.4953568248314464, "ewc_loss": 0.028987672179937363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.898767161241267e-05, "grad_norm": 17.3245849609375, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8746649026870728, "num_tokens": 448535299.0, "step": 11755 }, { "epoch": 1.495484035110037, "ewc_loss": 0.029058191925287247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9058192012598738e-05, "grad_norm": 17.392751693725586, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8758819699287415, "num_tokens": 448568878.0, "step": 11756 }, { "epoch": 1.4956112453886274, "ewc_loss": 0.02903411164879799, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9034112230874598e-05, "grad_norm": 17.389680862426758, "learning_rate": 1e-06, "loss": 0.353, "mean_token_accuracy": 0.8869788646697998, "num_tokens": 448604959.0, "step": 11757 }, { "epoch": 1.495738455667218, "ewc_loss": 0.028995998203754425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8995998945902102e-05, "grad_norm": 17.325191497802734, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.880469799041748, "num_tokens": 448638487.0, "step": 11758 }, { "epoch": 1.4958656659458085, "ewc_loss": 0.029030224308371544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.903022505051922e-05, "grad_norm": 17.41411590576172, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.874751091003418, "num_tokens": 448677212.0, "step": 11759 }, { "epoch": 1.495992876224399, "ewc_loss": 0.028996387496590614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.899638820963446e-05, "grad_norm": 17.354963302612305, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8638968467712402, "num_tokens": 448719654.0, "step": 11760 }, { "epoch": 1.4961200865029896, "ewc_loss": 0.028960948809981346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8960948839085177e-05, "grad_norm": 17.376310348510742, "learning_rate": 1e-06, "loss": 0.3883, "mean_token_accuracy": 0.8724343180656433, "num_tokens": 448760783.0, "step": 11761 }, { "epoch": 1.49624729678158, "ewc_loss": 0.029001358896493912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.900135950767435e-05, "grad_norm": 17.359277725219727, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8591833710670471, "num_tokens": 448802843.0, "step": 11762 }, { "epoch": 1.4963745070601704, "ewc_loss": 0.028995810076594353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8995809771004133e-05, "grad_norm": 17.425378799438477, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8659563064575195, "num_tokens": 448838430.0, "step": 11763 }, { "epoch": 1.496501717338761, "ewc_loss": 0.029009586200118065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.900958679674659e-05, "grad_norm": 17.389205932617188, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8601728677749634, "num_tokens": 448880450.0, "step": 11764 }, { "epoch": 1.4966289276173514, "ewc_loss": 0.028948787599802017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.894878707593307e-05, "grad_norm": 17.3413028717041, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8721551895141602, "num_tokens": 448919352.0, "step": 11765 }, { "epoch": 1.496756137895942, "ewc_loss": 0.028987575322389603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.898757520597428e-05, "grad_norm": 17.348783493041992, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8439180254936218, "num_tokens": 448965402.0, "step": 11766 }, { "epoch": 1.4968833481745325, "ewc_loss": 0.028987564146518707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.898756429203786e-05, "grad_norm": 17.41152000427246, "learning_rate": 1e-06, "loss": 0.3642, "mean_token_accuracy": 0.880418062210083, "num_tokens": 448996983.0, "step": 11767 }, { "epoch": 1.497010558453123, "ewc_loss": 0.029015183448791504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.90151838271413e-05, "grad_norm": 17.407852172851562, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8562144041061401, "num_tokens": 449036555.0, "step": 11768 }, { "epoch": 1.4971377687317136, "ewc_loss": 0.028968283906579018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8968283004360273e-05, "grad_norm": 17.385425567626953, "learning_rate": 1e-06, "loss": 0.3653, "mean_token_accuracy": 0.8799021244049072, "num_tokens": 449070545.0, "step": 11769 }, { "epoch": 1.497264979010304, "ewc_loss": 0.028984878212213516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8984877644688822e-05, "grad_norm": 17.405838012695312, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8535430431365967, "num_tokens": 449108859.0, "step": 11770 }, { "epoch": 1.4973921892888946, "ewc_loss": 0.028983816504478455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8983817173866555e-05, "grad_norm": 17.43463134765625, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8656293749809265, "num_tokens": 449145333.0, "step": 11771 }, { "epoch": 1.497519399567485, "ewc_loss": 0.02901129424571991, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.901129482779652e-05, "grad_norm": 17.401994705200195, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8631793260574341, "num_tokens": 449181292.0, "step": 11772 }, { "epoch": 1.4976466098460754, "ewc_loss": 0.028963390737771988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8963389922864735e-05, "grad_norm": 17.412687301635742, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.8726552128791809, "num_tokens": 449213940.0, "step": 11773 }, { "epoch": 1.497773820124666, "ewc_loss": 0.028980687260627747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8980686693103053e-05, "grad_norm": 17.43300437927246, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8796554803848267, "num_tokens": 449252487.0, "step": 11774 }, { "epoch": 1.4979010304032565, "ewc_loss": 0.028971735388040543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8971735446248204e-05, "grad_norm": 17.41499137878418, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8623238205909729, "num_tokens": 449287268.0, "step": 11775 }, { "epoch": 1.498028240681847, "ewc_loss": 0.028936034068465233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.893603414122481e-05, "grad_norm": 17.40043067932129, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8663493990898132, "num_tokens": 449326975.0, "step": 11776 }, { "epoch": 1.4981554509604376, "ewc_loss": 0.029003510251641273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9003509553149343e-05, "grad_norm": 17.397144317626953, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8690122365951538, "num_tokens": 449366061.0, "step": 11777 }, { "epoch": 1.498282661239028, "ewc_loss": 0.028871068730950356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.887106893467717e-05, "grad_norm": 17.407087326049805, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8772673606872559, "num_tokens": 449401763.0, "step": 11778 }, { "epoch": 1.4984098715176186, "ewc_loss": 0.02896469086408615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.896469050028827e-05, "grad_norm": 17.44922637939453, "learning_rate": 1e-06, "loss": 0.3519, "mean_token_accuracy": 0.8881343603134155, "num_tokens": 449436605.0, "step": 11779 }, { "epoch": 1.4985370817962091, "ewc_loss": 0.028997881338000298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.899788159993477e-05, "grad_norm": 17.424930572509766, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8644089698791504, "num_tokens": 449474523.0, "step": 11780 }, { "epoch": 1.4986642920747997, "ewc_loss": 0.02888966165482998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.888966082537081e-05, "grad_norm": 17.404499053955078, "learning_rate": 1e-06, "loss": 0.3735, "mean_token_accuracy": 0.8811769485473633, "num_tokens": 449513301.0, "step": 11781 }, { "epoch": 1.4987915023533902, "ewc_loss": 0.028924858197569847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8924858270329423e-05, "grad_norm": 17.352548599243164, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8652408123016357, "num_tokens": 449547826.0, "step": 11782 }, { "epoch": 1.4989187126319807, "ewc_loss": 0.028936579823493958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8936579838045873e-05, "grad_norm": 17.45751953125, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8684459328651428, "num_tokens": 449584818.0, "step": 11783 }, { "epoch": 1.4990459229105713, "ewc_loss": 0.0289712306112051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8971229767194018e-05, "grad_norm": 17.412551879882812, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8511937260627747, "num_tokens": 449625368.0, "step": 11784 }, { "epoch": 1.4991731331891618, "ewc_loss": 0.028940878808498383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.894087811000645e-05, "grad_norm": 17.393739700317383, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8570103049278259, "num_tokens": 449666320.0, "step": 11785 }, { "epoch": 1.4993003434677523, "ewc_loss": 0.028966298326849937, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8966298486921005e-05, "grad_norm": 17.39813995361328, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8605670928955078, "num_tokens": 449705993.0, "step": 11786 }, { "epoch": 1.4994275537463426, "ewc_loss": 0.028933972120285034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8933971407241188e-05, "grad_norm": 17.425559997558594, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8643885850906372, "num_tokens": 449747088.0, "step": 11787 }, { "epoch": 1.4995547640249332, "ewc_loss": 0.028920436277985573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8920436307089403e-05, "grad_norm": 17.40618133544922, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8714651465415955, "num_tokens": 449789840.0, "step": 11788 }, { "epoch": 1.4996819743035237, "ewc_loss": 0.028910374268889427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8910373657708988e-05, "grad_norm": 17.409576416015625, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8682084679603577, "num_tokens": 449830599.0, "step": 11789 }, { "epoch": 1.4998091845821142, "ewc_loss": 0.028936291113495827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.893629061873071e-05, "grad_norm": 17.38959503173828, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8739653825759888, "num_tokens": 449864476.0, "step": 11790 }, { "epoch": 1.4999363948607047, "ewc_loss": 0.02891065552830696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8910655601066537e-05, "grad_norm": 17.391782760620117, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8534249663352966, "num_tokens": 449903791.0, "step": 11791 }, { "epoch": 1.5000636051392953, "ewc_loss": 0.028883421793580055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.888342169171665e-05, "grad_norm": 17.375688552856445, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.869056224822998, "num_tokens": 449945478.0, "step": 11792 }, { "epoch": 1.5001908154178858, "ewc_loss": 0.02889910526573658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.889910501835402e-05, "grad_norm": 17.3742618560791, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8663926124572754, "num_tokens": 449986175.0, "step": 11793 }, { "epoch": 1.5003180256964763, "ewc_loss": 0.02889826148748398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8898261007270776e-05, "grad_norm": 17.40234375, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8662165999412537, "num_tokens": 450025537.0, "step": 11794 }, { "epoch": 1.5004452359750666, "ewc_loss": 0.02892455831170082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8924558137077838e-05, "grad_norm": 17.385541915893555, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8548249006271362, "num_tokens": 450068269.0, "step": 11795 }, { "epoch": 1.5005724462536572, "ewc_loss": 0.028901007026433945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.890100768127013e-05, "grad_norm": 17.4064998626709, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.8846929669380188, "num_tokens": 450106234.0, "step": 11796 }, { "epoch": 1.5006996565322477, "ewc_loss": 0.02888050675392151, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8880505851702765e-05, "grad_norm": 17.40477180480957, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8779616951942444, "num_tokens": 450141916.0, "step": 11797 }, { "epoch": 1.5008268668108382, "ewc_loss": 0.028939472511410713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.893947203119751e-05, "grad_norm": 17.51471519470215, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8550845384597778, "num_tokens": 450175427.0, "step": 11798 }, { "epoch": 1.5009540770894287, "ewc_loss": 0.02895343117415905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.895343095588032e-05, "grad_norm": 17.384078979492188, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8709040880203247, "num_tokens": 450211104.0, "step": 11799 }, { "epoch": 1.5010812873680193, "ewc_loss": 0.028835101053118706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8835100238211453e-05, "grad_norm": 17.477458953857422, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8607091903686523, "num_tokens": 450244844.0, "step": 11800 }, { "epoch": 1.5012084976466098, "ewc_loss": 0.028928568586707115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8928569008712657e-05, "grad_norm": 17.359691619873047, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8682209253311157, "num_tokens": 450286094.0, "step": 11801 }, { "epoch": 1.5013357079252003, "ewc_loss": 0.028891298919916153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8891299734823406e-05, "grad_norm": 17.387487411499023, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8598290681838989, "num_tokens": 450324938.0, "step": 11802 }, { "epoch": 1.5014629182037909, "ewc_loss": 0.028968729078769684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8968728656764142e-05, "grad_norm": 17.355846405029297, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8691287636756897, "num_tokens": 450368572.0, "step": 11803 }, { "epoch": 1.5015901284823814, "ewc_loss": 0.02891474775969982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8914748327224515e-05, "grad_norm": 17.343873977661133, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.848630964756012, "num_tokens": 450410926.0, "step": 11804 }, { "epoch": 1.501717338760972, "ewc_loss": 0.028993774205446243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8993774321861565e-05, "grad_norm": 17.471229553222656, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8727025985717773, "num_tokens": 450450206.0, "step": 11805 }, { "epoch": 1.5018445490395624, "ewc_loss": 0.029003724455833435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.900372419389896e-05, "grad_norm": 17.349592208862305, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8488434553146362, "num_tokens": 450488998.0, "step": 11806 }, { "epoch": 1.501971759318153, "ewc_loss": 0.028964539989829063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8964539524167776e-05, "grad_norm": 17.413469314575195, "learning_rate": 1e-06, "loss": 0.3774, "mean_token_accuracy": 0.8792056441307068, "num_tokens": 450530476.0, "step": 11807 }, { "epoch": 1.5020989695967435, "ewc_loss": 0.029070880264043808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.907087946368847e-05, "grad_norm": 17.418132781982422, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8765600919723511, "num_tokens": 450560655.0, "step": 11808 }, { "epoch": 1.502226179875334, "ewc_loss": 0.028959112241864204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8959111659787595e-05, "grad_norm": 17.391952514648438, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8713244199752808, "num_tokens": 450601816.0, "step": 11809 }, { "epoch": 1.5023533901539246, "ewc_loss": 0.029065130278468132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9065129638183862e-05, "grad_norm": 17.411571502685547, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8728814125061035, "num_tokens": 450634182.0, "step": 11810 }, { "epoch": 1.502480600432515, "ewc_loss": 0.02900918573141098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.900918661907781e-05, "grad_norm": 17.39311981201172, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8641120195388794, "num_tokens": 450672705.0, "step": 11811 }, { "epoch": 1.5026078107111056, "ewc_loss": 0.02905629761517048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9056298444629647e-05, "grad_norm": 17.476139068603516, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8561719655990601, "num_tokens": 450705636.0, "step": 11812 }, { "epoch": 1.502735020989696, "ewc_loss": 0.029030118137598038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.903011773014441e-05, "grad_norm": 17.37090301513672, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8597345352172852, "num_tokens": 450736165.0, "step": 11813 }, { "epoch": 1.5028622312682864, "ewc_loss": 0.029043490067124367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.904349094023928e-05, "grad_norm": 17.79036521911621, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8704686760902405, "num_tokens": 450776939.0, "step": 11814 }, { "epoch": 1.502989441546877, "ewc_loss": 0.029190512374043465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9190512577770278e-05, "grad_norm": 17.44651222229004, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8731770515441895, "num_tokens": 450811994.0, "step": 11815 }, { "epoch": 1.5031166518254675, "ewc_loss": 0.028967738151550293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.896773730753921e-05, "grad_norm": 17.359704971313477, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8692454099655151, "num_tokens": 450853688.0, "step": 11816 }, { "epoch": 1.503243862104058, "ewc_loss": 0.02913011983036995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9130120310583152e-05, "grad_norm": 17.44279670715332, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8688757419586182, "num_tokens": 450892100.0, "step": 11817 }, { "epoch": 1.5033710723826486, "ewc_loss": 0.029138362035155296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9138362151570618e-05, "grad_norm": 17.460617065429688, "learning_rate": 1e-06, "loss": 0.3523, "mean_token_accuracy": 0.8850765228271484, "num_tokens": 450936383.0, "step": 11818 }, { "epoch": 1.503498282661239, "ewc_loss": 0.029094012454152107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9094013370922767e-05, "grad_norm": 17.405532836914062, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8766022324562073, "num_tokens": 450965525.0, "step": 11819 }, { "epoch": 1.5036254929398294, "ewc_loss": 0.029203221201896667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9203221856732853e-05, "grad_norm": 17.477523803710938, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8552839756011963, "num_tokens": 451005874.0, "step": 11820 }, { "epoch": 1.50375270321842, "ewc_loss": 0.029164612293243408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.916461198765319e-05, "grad_norm": 17.451107025146484, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8489892482757568, "num_tokens": 451040668.0, "step": 11821 }, { "epoch": 1.5038799134970104, "ewc_loss": 0.029152795672416687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9152795832487755e-05, "grad_norm": 17.446640014648438, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8663678169250488, "num_tokens": 451082747.0, "step": 11822 }, { "epoch": 1.504007123775601, "ewc_loss": 0.029161104932427406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9161104976083152e-05, "grad_norm": 17.438278198242188, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8758346438407898, "num_tokens": 451120344.0, "step": 11823 }, { "epoch": 1.5041343340541915, "ewc_loss": 0.029134703800082207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9134704163880087e-05, "grad_norm": 17.44614601135254, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8634404540061951, "num_tokens": 451156564.0, "step": 11824 }, { "epoch": 1.504261544332782, "ewc_loss": 0.029173679649829865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9173679649829865e-05, "grad_norm": 17.478845596313477, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8639234304428101, "num_tokens": 451197252.0, "step": 11825 }, { "epoch": 1.5043887546113726, "ewc_loss": 0.029141465201973915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9141465347493067e-05, "grad_norm": 17.428197860717773, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8568812012672424, "num_tokens": 451236150.0, "step": 11826 }, { "epoch": 1.504515964889963, "ewc_loss": 0.02904755249619484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.90475527435774e-05, "grad_norm": 17.415435791015625, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8651471138000488, "num_tokens": 451275751.0, "step": 11827 }, { "epoch": 1.5046431751685536, "ewc_loss": 0.029121939092874527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9121938496246003e-05, "grad_norm": 17.38201141357422, "learning_rate": 1e-06, "loss": 0.3424, "mean_token_accuracy": 0.8899390697479248, "num_tokens": 451316778.0, "step": 11828 }, { "epoch": 1.5047703854471441, "ewc_loss": 0.02911711484193802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.91171145363478e-05, "grad_norm": 17.483871459960938, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8676667809486389, "num_tokens": 451359921.0, "step": 11829 }, { "epoch": 1.5048975957257347, "ewc_loss": 0.02911330945789814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.911330921051558e-05, "grad_norm": 17.4776554107666, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8747566938400269, "num_tokens": 451395616.0, "step": 11830 }, { "epoch": 1.5050248060043252, "ewc_loss": 0.02905830554664135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9058304789941758e-05, "grad_norm": 17.476402282714844, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8680630922317505, "num_tokens": 451427766.0, "step": 11831 }, { "epoch": 1.5051520162829157, "ewc_loss": 0.02908151037991047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9081509637762792e-05, "grad_norm": 17.43214988708496, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8776118159294128, "num_tokens": 451463073.0, "step": 11832 }, { "epoch": 1.5052792265615063, "ewc_loss": 0.028985995799303055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8985996323172003e-05, "grad_norm": 17.42745590209961, "learning_rate": 1e-06, "loss": 0.3571, "mean_token_accuracy": 0.8874260187149048, "num_tokens": 451503039.0, "step": 11833 }, { "epoch": 1.5054064368400968, "ewc_loss": 0.029035931453108788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9035931220278144e-05, "grad_norm": 17.396892547607422, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8507198691368103, "num_tokens": 451541599.0, "step": 11834 }, { "epoch": 1.5055336471186873, "ewc_loss": 0.02900061383843422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9000613722018898e-05, "grad_norm": 17.4245548248291, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.853898286819458, "num_tokens": 451579278.0, "step": 11835 }, { "epoch": 1.5056608573972778, "ewc_loss": 0.029033532366156578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9033531973254867e-05, "grad_norm": 17.412086486816406, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8720629215240479, "num_tokens": 451618591.0, "step": 11836 }, { "epoch": 1.5057880676758684, "ewc_loss": 0.02901366725564003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9013666789978743e-05, "grad_norm": 17.440940856933594, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8698305487632751, "num_tokens": 451654453.0, "step": 11837 }, { "epoch": 1.5059152779544587, "ewc_loss": 0.029054831713438034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.905483233917039e-05, "grad_norm": 17.433250427246094, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.866572380065918, "num_tokens": 451688302.0, "step": 11838 }, { "epoch": 1.5060424882330492, "ewc_loss": 0.02901897206902504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9018972782068886e-05, "grad_norm": 17.450986862182617, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8743172287940979, "num_tokens": 451727301.0, "step": 11839 }, { "epoch": 1.5061696985116397, "ewc_loss": 0.029032310470938683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9032309612375684e-05, "grad_norm": 17.344741821289062, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8541060090065002, "num_tokens": 451770471.0, "step": 11840 }, { "epoch": 1.5062969087902303, "ewc_loss": 0.028964491561055183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8964492230443284e-05, "grad_norm": 17.461299896240234, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8574654459953308, "num_tokens": 451809413.0, "step": 11841 }, { "epoch": 1.5064241190688208, "ewc_loss": 0.029011284932494164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.90112857328495e-05, "grad_norm": 17.315948486328125, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.865876317024231, "num_tokens": 451850177.0, "step": 11842 }, { "epoch": 1.5065513293474113, "ewc_loss": 0.028997402638196945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.899740320572164e-05, "grad_norm": 17.501237869262695, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8564357757568359, "num_tokens": 451887977.0, "step": 11843 }, { "epoch": 1.5066785396260016, "ewc_loss": 0.029085882008075714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9085882488288917e-05, "grad_norm": 17.308561325073242, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.8791337013244629, "num_tokens": 451927403.0, "step": 11844 }, { "epoch": 1.5068057499045922, "ewc_loss": 0.028955895453691483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8955895686522126e-05, "grad_norm": 17.461076736450195, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8680195808410645, "num_tokens": 451967350.0, "step": 11845 }, { "epoch": 1.5069329601831827, "ewc_loss": 0.029070686548948288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9070686650811695e-05, "grad_norm": 17.362234115600586, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.867750346660614, "num_tokens": 452008664.0, "step": 11846 }, { "epoch": 1.5070601704617732, "ewc_loss": 0.02897387184202671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8973870939807966e-05, "grad_norm": 17.509784698486328, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8746523857116699, "num_tokens": 452047303.0, "step": 11847 }, { "epoch": 1.5071873807403637, "ewc_loss": 0.029066743329167366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9066743081784807e-05, "grad_norm": 17.42015266418457, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8695650696754456, "num_tokens": 452087359.0, "step": 11848 }, { "epoch": 1.5073145910189543, "ewc_loss": 0.028963211923837662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8963211661903188e-05, "grad_norm": 17.525657653808594, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8542342782020569, "num_tokens": 452129426.0, "step": 11849 }, { "epoch": 1.5074418012975448, "ewc_loss": 0.029036235064268112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9036234991508536e-05, "grad_norm": 17.39251136779785, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8730515837669373, "num_tokens": 452169134.0, "step": 11850 }, { "epoch": 1.5075690115761353, "ewc_loss": 0.02892787754535675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.892787779273931e-05, "grad_norm": 17.487520217895508, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8586406707763672, "num_tokens": 452209962.0, "step": 11851 }, { "epoch": 1.5076962218547258, "ewc_loss": 0.029073866084218025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9073866244289093e-05, "grad_norm": 17.49436378479004, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8714584112167358, "num_tokens": 452245978.0, "step": 11852 }, { "epoch": 1.5078234321333164, "ewc_loss": 0.028971100226044655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8971100618946366e-05, "grad_norm": 17.41766929626465, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.87811279296875, "num_tokens": 452281013.0, "step": 11853 }, { "epoch": 1.507950642411907, "ewc_loss": 0.02898014523088932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8980144634260796e-05, "grad_norm": 17.46630096435547, "learning_rate": 1e-06, "loss": 0.3629, "mean_token_accuracy": 0.8818155527114868, "num_tokens": 452317174.0, "step": 11854 }, { "epoch": 1.5080778526904974, "ewc_loss": 0.02897992543876171, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8979924536542967e-05, "grad_norm": 17.446199417114258, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.864832878112793, "num_tokens": 452362974.0, "step": 11855 }, { "epoch": 1.508205062969088, "ewc_loss": 0.028913181275129318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.891318035835866e-05, "grad_norm": 17.435890197753906, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.879798948764801, "num_tokens": 452400701.0, "step": 11856 }, { "epoch": 1.5083322732476785, "ewc_loss": 0.02894807979464531, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.894807948905509e-05, "grad_norm": 17.40727996826172, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.8772922158241272, "num_tokens": 452438541.0, "step": 11857 }, { "epoch": 1.508459483526269, "ewc_loss": 0.028973478823900223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.89734780380968e-05, "grad_norm": 17.553682327270508, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8623243570327759, "num_tokens": 452476918.0, "step": 11858 }, { "epoch": 1.5085866938048595, "ewc_loss": 0.02893940359354019, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8939402909600176e-05, "grad_norm": 17.376365661621094, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8727979063987732, "num_tokens": 452509231.0, "step": 11859 }, { "epoch": 1.50871390408345, "ewc_loss": 0.028937382623553276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8937382012372836e-05, "grad_norm": 17.439960479736328, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8634131550788879, "num_tokens": 452547079.0, "step": 11860 }, { "epoch": 1.5088411143620406, "ewc_loss": 0.029014617204666138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9014618121436797e-05, "grad_norm": 17.500946044921875, "learning_rate": 1e-06, "loss": 0.3799, "mean_token_accuracy": 0.8790011405944824, "num_tokens": 452580215.0, "step": 11861 }, { "epoch": 1.508968324640631, "ewc_loss": 0.02896205708384514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8962056603631936e-05, "grad_norm": 17.52574348449707, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8745378255844116, "num_tokens": 452615778.0, "step": 11862 }, { "epoch": 1.5090955349192214, "ewc_loss": 0.02895783632993698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.895783654821571e-05, "grad_norm": 17.476943969726562, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8701041340827942, "num_tokens": 452652184.0, "step": 11863 }, { "epoch": 1.509222745197812, "ewc_loss": 0.028973251581192017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8973250664421357e-05, "grad_norm": 17.52606773376465, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8544130325317383, "num_tokens": 452691345.0, "step": 11864 }, { "epoch": 1.5093499554764025, "ewc_loss": 0.028934050351381302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.893404962378554e-05, "grad_norm": 17.4154052734375, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8696298599243164, "num_tokens": 452729982.0, "step": 11865 }, { "epoch": 1.509477165754993, "ewc_loss": 0.02896992117166519, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8969921913812868e-05, "grad_norm": 17.492191314697266, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8677243590354919, "num_tokens": 452771272.0, "step": 11866 }, { "epoch": 1.5096043760335836, "ewc_loss": 0.02902183309197426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.902183223341126e-05, "grad_norm": 17.44712257385254, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8648556470870972, "num_tokens": 452812231.0, "step": 11867 }, { "epoch": 1.509731586312174, "ewc_loss": 0.028912469744682312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8912469133501872e-05, "grad_norm": 17.46739387512207, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8568295240402222, "num_tokens": 452858406.0, "step": 11868 }, { "epoch": 1.5098587965907644, "ewc_loss": 0.028983835130929947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.898383536376059e-05, "grad_norm": 17.465435028076172, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8599452972412109, "num_tokens": 452898793.0, "step": 11869 }, { "epoch": 1.509986006869355, "ewc_loss": 0.02898556925356388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.898556886066217e-05, "grad_norm": 17.503501892089844, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8738572597503662, "num_tokens": 452937929.0, "step": 11870 }, { "epoch": 1.5101132171479454, "ewc_loss": 0.02900248020887375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9002480005146936e-05, "grad_norm": 17.58504867553711, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8713328838348389, "num_tokens": 452975436.0, "step": 11871 }, { "epoch": 1.510240427426536, "ewc_loss": 0.02892059087753296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8920590921188705e-05, "grad_norm": 17.40750503540039, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8690087795257568, "num_tokens": 453016334.0, "step": 11872 }, { "epoch": 1.5103676377051265, "ewc_loss": 0.02892937883734703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8929378458997235e-05, "grad_norm": 17.498220443725586, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8724700808525085, "num_tokens": 453053630.0, "step": 11873 }, { "epoch": 1.510494847983717, "ewc_loss": 0.028927648440003395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8927648600074463e-05, "grad_norm": 17.417224884033203, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8505697250366211, "num_tokens": 453092341.0, "step": 11874 }, { "epoch": 1.5106220582623076, "ewc_loss": 0.02889879234135151, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.889879215217661e-05, "grad_norm": 17.442832946777344, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8765908479690552, "num_tokens": 453126859.0, "step": 11875 }, { "epoch": 1.510749268540898, "ewc_loss": 0.029021216556429863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.902121741499286e-05, "grad_norm": 17.483257293701172, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8656296730041504, "num_tokens": 453167327.0, "step": 11876 }, { "epoch": 1.5108764788194886, "ewc_loss": 0.028905026614665985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8905025828862563e-05, "grad_norm": 17.37860107421875, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8738861083984375, "num_tokens": 453205901.0, "step": 11877 }, { "epoch": 1.5110036890980791, "ewc_loss": 0.028903478756546974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8903477868880145e-05, "grad_norm": 17.421611785888672, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8671329021453857, "num_tokens": 453241856.0, "step": 11878 }, { "epoch": 1.5111308993766697, "ewc_loss": 0.028941284865140915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8941285563632846e-05, "grad_norm": 17.40212059020996, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8764861822128296, "num_tokens": 453282528.0, "step": 11879 }, { "epoch": 1.5112581096552602, "ewc_loss": 0.028962232172489166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.896223304560408e-05, "grad_norm": 17.49289894104004, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8596780896186829, "num_tokens": 453318762.0, "step": 11880 }, { "epoch": 1.5113853199338507, "ewc_loss": 0.02896925061941147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.896925070672296e-05, "grad_norm": 17.4412841796875, "learning_rate": 1e-06, "loss": 0.37, "mean_token_accuracy": 0.8835332989692688, "num_tokens": 453352482.0, "step": 11881 }, { "epoch": 1.5115125302124413, "ewc_loss": 0.028993552550673485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8993552405154333e-05, "grad_norm": 17.46265411376953, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8678038120269775, "num_tokens": 453393692.0, "step": 11882 }, { "epoch": 1.5116397404910318, "ewc_loss": 0.02901776134967804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9017761335126124e-05, "grad_norm": 17.479021072387695, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8719630837440491, "num_tokens": 453429217.0, "step": 11883 }, { "epoch": 1.5117669507696223, "ewc_loss": 0.02901928871870041, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9019289286225103e-05, "grad_norm": 17.434186935424805, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8537120819091797, "num_tokens": 453468190.0, "step": 11884 }, { "epoch": 1.5118941610482128, "ewc_loss": 0.028970085084438324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8970085622859187e-05, "grad_norm": 17.45884895324707, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8622614145278931, "num_tokens": 453509893.0, "step": 11885 }, { "epoch": 1.5120213713268034, "ewc_loss": 0.028974859043955803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8974858651054092e-05, "grad_norm": 17.38046646118164, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8722203373908997, "num_tokens": 453546972.0, "step": 11886 }, { "epoch": 1.5121485816053937, "ewc_loss": 0.028948895633220673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8948896215297282e-05, "grad_norm": 17.523908615112305, "learning_rate": 1e-06, "loss": 0.3753, "mean_token_accuracy": 0.8798457384109497, "num_tokens": 453578480.0, "step": 11887 }, { "epoch": 1.5122757918839842, "ewc_loss": 0.029056519269943237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9056518542347476e-05, "grad_norm": 17.423328399658203, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8692716360092163, "num_tokens": 453623248.0, "step": 11888 }, { "epoch": 1.5124030021625747, "ewc_loss": 0.02896970696747303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.896970727306325e-05, "grad_norm": 17.46449851989746, "learning_rate": 1e-06, "loss": 0.3452, "mean_token_accuracy": 0.8883472681045532, "num_tokens": 453660088.0, "step": 11889 }, { "epoch": 1.5125302124411653, "ewc_loss": 0.029055604711174965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9055605409666896e-05, "grad_norm": 17.416730880737305, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.869767963886261, "num_tokens": 453698931.0, "step": 11890 }, { "epoch": 1.5126574227197558, "ewc_loss": 0.029003508388996124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.900350773415994e-05, "grad_norm": 17.56056785583496, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8708289265632629, "num_tokens": 453734220.0, "step": 11891 }, { "epoch": 1.5127846329983463, "ewc_loss": 0.029037203639745712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9037204512860626e-05, "grad_norm": 17.42401885986328, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8675635457038879, "num_tokens": 453774056.0, "step": 11892 }, { "epoch": 1.5129118432769366, "ewc_loss": 0.028955886140465736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.895588659157511e-05, "grad_norm": 17.532529830932617, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8756979703903198, "num_tokens": 453806616.0, "step": 11893 }, { "epoch": 1.5130390535555271, "ewc_loss": 0.029118184000253677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9118184102117084e-05, "grad_norm": 17.485849380493164, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8771519064903259, "num_tokens": 453843838.0, "step": 11894 }, { "epoch": 1.5131662638341177, "ewc_loss": 0.02892221324145794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8922213459736668e-05, "grad_norm": 17.532047271728516, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8765085935592651, "num_tokens": 453876888.0, "step": 11895 }, { "epoch": 1.5132934741127082, "ewc_loss": 0.02902544103562832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9025441108387895e-05, "grad_norm": 17.403236389160156, "learning_rate": 1e-06, "loss": 0.3538, "mean_token_accuracy": 0.8840994238853455, "num_tokens": 453915334.0, "step": 11896 }, { "epoch": 1.5134206843912987, "ewc_loss": 0.028932739049196243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8932738132425584e-05, "grad_norm": 17.496307373046875, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8780101537704468, "num_tokens": 453950590.0, "step": 11897 }, { "epoch": 1.5135478946698893, "ewc_loss": 0.028960861265659332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8960861527593806e-05, "grad_norm": 17.4061336517334, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8728204965591431, "num_tokens": 453992590.0, "step": 11898 }, { "epoch": 1.5136751049484798, "ewc_loss": 0.02897229604423046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8972295694984496e-05, "grad_norm": 17.423612594604492, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8733997344970703, "num_tokens": 454038619.0, "step": 11899 }, { "epoch": 1.5138023152270703, "ewc_loss": 0.0290078017860651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.900780236814171e-05, "grad_norm": 17.44038200378418, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8606885671615601, "num_tokens": 454081564.0, "step": 11900 }, { "epoch": 1.5139295255056608, "ewc_loss": 0.028979916125535965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.897991544159595e-05, "grad_norm": 17.402875900268555, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8567801713943481, "num_tokens": 454125581.0, "step": 11901 }, { "epoch": 1.5140567357842514, "ewc_loss": 0.02896742895245552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8967428079340607e-05, "grad_norm": 17.455591201782227, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.877021312713623, "num_tokens": 454163961.0, "step": 11902 }, { "epoch": 1.514183946062842, "ewc_loss": 0.029018735513091087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9018736313446425e-05, "grad_norm": 17.462461471557617, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8632925748825073, "num_tokens": 454200410.0, "step": 11903 }, { "epoch": 1.5143111563414324, "ewc_loss": 0.028940392658114433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8940392439835705e-05, "grad_norm": 17.51666259765625, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8556739091873169, "num_tokens": 454237143.0, "step": 11904 }, { "epoch": 1.514438366620023, "ewc_loss": 0.029009483754634857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9009483114350587e-05, "grad_norm": 17.39508819580078, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8766695857048035, "num_tokens": 454274022.0, "step": 11905 }, { "epoch": 1.5145655768986135, "ewc_loss": 0.0289441104978323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8944110454176553e-05, "grad_norm": 17.527997970581055, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8650429248809814, "num_tokens": 454307955.0, "step": 11906 }, { "epoch": 1.514692787177204, "ewc_loss": 0.028983717784285545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.898371712944936e-05, "grad_norm": 17.395843505859375, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8767046332359314, "num_tokens": 454345449.0, "step": 11907 }, { "epoch": 1.5148199974557945, "ewc_loss": 0.02888387255370617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8883872801088728e-05, "grad_norm": 17.414247512817383, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8586969375610352, "num_tokens": 454383096.0, "step": 11908 }, { "epoch": 1.514947207734385, "ewc_loss": 0.029002612456679344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9002612791373394e-05, "grad_norm": 17.480539321899414, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8584490418434143, "num_tokens": 454420142.0, "step": 11909 }, { "epoch": 1.5150744180129756, "ewc_loss": 0.028954004868865013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8954005756531842e-05, "grad_norm": 17.39963722229004, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8677709102630615, "num_tokens": 454454991.0, "step": 11910 }, { "epoch": 1.515201628291566, "ewc_loss": 0.02897031605243683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.8970316634513438e-05, "grad_norm": 17.50641441345215, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8660097122192383, "num_tokens": 454490161.0, "step": 11911 }, { "epoch": 1.5153288385701564, "ewc_loss": 0.029004063457250595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.900406252592802e-05, "grad_norm": 17.351083755493164, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8604275584220886, "num_tokens": 454534767.0, "step": 11912 }, { "epoch": 1.515456048848747, "ewc_loss": 0.02900359220802784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9003591407672502e-05, "grad_norm": 17.49474334716797, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8774678707122803, "num_tokens": 454573246.0, "step": 11913 }, { "epoch": 1.5155832591273375, "ewc_loss": 0.029103869572281837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9103868655511178e-05, "grad_norm": 17.39824867248535, "learning_rate": 1e-06, "loss": 0.3614, "mean_token_accuracy": 0.8818868398666382, "num_tokens": 454607626.0, "step": 11914 }, { "epoch": 1.515710469405928, "ewc_loss": 0.02901376038789749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9013759558438323e-05, "grad_norm": 17.444339752197266, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8625950217247009, "num_tokens": 454639840.0, "step": 11915 }, { "epoch": 1.5158376796845185, "ewc_loss": 0.029041580855846405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9041581001365557e-05, "grad_norm": 17.428709030151367, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.860177218914032, "num_tokens": 454681445.0, "step": 11916 }, { "epoch": 1.515964889963109, "ewc_loss": 0.02905300445854664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9053004254819825e-05, "grad_norm": 17.416154861450195, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8620350360870361, "num_tokens": 454726191.0, "step": 11917 }, { "epoch": 1.5160921002416994, "ewc_loss": 0.029045825824141502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9045826522633433e-05, "grad_norm": 17.43018341064453, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8757561445236206, "num_tokens": 454765112.0, "step": 11918 }, { "epoch": 1.51621931052029, "ewc_loss": 0.029099835082888603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9099835956003517e-05, "grad_norm": 17.405393600463867, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8657572865486145, "num_tokens": 454805254.0, "step": 11919 }, { "epoch": 1.5163465207988804, "ewc_loss": 0.029066333547234535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.906633380916901e-05, "grad_norm": 17.52751922607422, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8618333339691162, "num_tokens": 454841344.0, "step": 11920 }, { "epoch": 1.516473731077471, "ewc_loss": 0.029096493497490883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.90964926534798e-05, "grad_norm": 17.473787307739258, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.8821395635604858, "num_tokens": 454879986.0, "step": 11921 }, { "epoch": 1.5166009413560615, "ewc_loss": 0.029026761651039124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.902676169469487e-05, "grad_norm": 17.483978271484375, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8607654571533203, "num_tokens": 454917731.0, "step": 11922 }, { "epoch": 1.516728151634652, "ewc_loss": 0.029045697301626205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.904569737438578e-05, "grad_norm": 17.44807243347168, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.8747497797012329, "num_tokens": 454952837.0, "step": 11923 }, { "epoch": 1.5168553619132426, "ewc_loss": 0.029017411172389984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9017412089160644e-05, "grad_norm": 17.485769271850586, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8715591430664062, "num_tokens": 454992751.0, "step": 11924 }, { "epoch": 1.516982572191833, "ewc_loss": 0.029109282419085503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.910928196797613e-05, "grad_norm": 17.492204666137695, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8670501112937927, "num_tokens": 455029829.0, "step": 11925 }, { "epoch": 1.5171097824704236, "ewc_loss": 0.028999963775277138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.899996434280183e-05, "grad_norm": 17.39777183532715, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8693691492080688, "num_tokens": 455072167.0, "step": 11926 }, { "epoch": 1.5172369927490141, "ewc_loss": 0.029027272015810013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9027272830717266e-05, "grad_norm": 17.555667877197266, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.8761545419692993, "num_tokens": 455108433.0, "step": 11927 }, { "epoch": 1.5173642030276047, "ewc_loss": 0.02907113917171955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9071139579173177e-05, "grad_norm": 17.378143310546875, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8717154264450073, "num_tokens": 455148943.0, "step": 11928 }, { "epoch": 1.5174914133061952, "ewc_loss": 0.02892809361219406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.892809425247833e-05, "grad_norm": 17.4423770904541, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8635088801383972, "num_tokens": 455185099.0, "step": 11929 }, { "epoch": 1.5176186235847857, "ewc_loss": 0.02910536527633667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9105365683790296e-05, "grad_norm": 17.460865020751953, "learning_rate": 1e-06, "loss": 0.3646, "mean_token_accuracy": 0.8830695152282715, "num_tokens": 455225322.0, "step": 11930 }, { "epoch": 1.5177458338633762, "ewc_loss": 0.029059037566184998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9059037842671387e-05, "grad_norm": 17.496360778808594, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8658992648124695, "num_tokens": 455262830.0, "step": 11931 }, { "epoch": 1.5178730441419668, "ewc_loss": 0.02904493920505047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9044938855804503e-05, "grad_norm": 17.395048141479492, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8755989074707031, "num_tokens": 455300825.0, "step": 11932 }, { "epoch": 1.5180002544205573, "ewc_loss": 0.029045268893241882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9045268092886545e-05, "grad_norm": 17.552753448486328, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8662759065628052, "num_tokens": 455336209.0, "step": 11933 }, { "epoch": 1.5181274646991478, "ewc_loss": 0.029097048565745354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9097049264237285e-05, "grad_norm": 17.492097854614258, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8686763048171997, "num_tokens": 455374909.0, "step": 11934 }, { "epoch": 1.5182546749777384, "ewc_loss": 0.028971707448363304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.897170816140715e-05, "grad_norm": 17.455835342407227, "learning_rate": 1e-06, "loss": 0.3654, "mean_token_accuracy": 0.8816759586334229, "num_tokens": 455417179.0, "step": 11935 }, { "epoch": 1.5183818852563287, "ewc_loss": 0.029023529961705208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9023529350524768e-05, "grad_norm": 17.434656143188477, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8557353019714355, "num_tokens": 455457923.0, "step": 11936 }, { "epoch": 1.5185090955349192, "ewc_loss": 0.029006054624915123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9006054319324903e-05, "grad_norm": 17.448734283447266, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8599137663841248, "num_tokens": 455492932.0, "step": 11937 }, { "epoch": 1.5186363058135097, "ewc_loss": 0.029066182672977448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9066182833048515e-05, "grad_norm": 17.406726837158203, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8601555824279785, "num_tokens": 455540392.0, "step": 11938 }, { "epoch": 1.5187635160921003, "ewc_loss": 0.02904588356614113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9045882911304943e-05, "grad_norm": 17.465789794921875, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8643777370452881, "num_tokens": 455574461.0, "step": 11939 }, { "epoch": 1.5188907263706908, "ewc_loss": 0.02906603366136551, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9066033675917424e-05, "grad_norm": 17.40900230407715, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8643972873687744, "num_tokens": 455608315.0, "step": 11940 }, { "epoch": 1.5190179366492813, "ewc_loss": 0.029018094763159752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9018094210186973e-05, "grad_norm": 17.37261962890625, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8717805743217468, "num_tokens": 455654927.0, "step": 11941 }, { "epoch": 1.5191451469278716, "ewc_loss": 0.029063232243061066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9063232432235964e-05, "grad_norm": 17.404489517211914, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8572201132774353, "num_tokens": 455695745.0, "step": 11942 }, { "epoch": 1.5192723572064621, "ewc_loss": 0.029118621721863747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.911862247856334e-05, "grad_norm": 17.403644561767578, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8697983026504517, "num_tokens": 455734652.0, "step": 11943 }, { "epoch": 1.5193995674850527, "ewc_loss": 0.029072122648358345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9072121833451092e-05, "grad_norm": 17.44586753845215, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8456655144691467, "num_tokens": 455776063.0, "step": 11944 }, { "epoch": 1.5195267777636432, "ewc_loss": 0.029065849259495735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9065849957987666e-05, "grad_norm": 17.4178524017334, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8631642460823059, "num_tokens": 455820595.0, "step": 11945 }, { "epoch": 1.5196539880422337, "ewc_loss": 0.02908855490386486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9088554583722726e-05, "grad_norm": 17.505979537963867, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8742787837982178, "num_tokens": 455859553.0, "step": 11946 }, { "epoch": 1.5197811983208243, "ewc_loss": 0.029103005304932594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9103004635544494e-05, "grad_norm": 17.475479125976562, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8689481019973755, "num_tokens": 455892676.0, "step": 11947 }, { "epoch": 1.5199084085994148, "ewc_loss": 0.02903790958225727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9037910280749202e-05, "grad_norm": 17.382017135620117, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8603518605232239, "num_tokens": 455932106.0, "step": 11948 }, { "epoch": 1.5200356188780053, "ewc_loss": 0.02914544753730297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.914544711529743e-05, "grad_norm": 17.56861114501953, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8735809326171875, "num_tokens": 455971555.0, "step": 11949 }, { "epoch": 1.5201628291565958, "ewc_loss": 0.029097793623805046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9097793230903335e-05, "grad_norm": 17.5036678314209, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8631974458694458, "num_tokens": 456014177.0, "step": 11950 }, { "epoch": 1.5202900394351864, "ewc_loss": 0.029014388099312782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.901438892877195e-05, "grad_norm": 17.53043556213379, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8737677335739136, "num_tokens": 456048229.0, "step": 11951 }, { "epoch": 1.520417249713777, "ewc_loss": 0.029059775173664093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9059774533379823e-05, "grad_norm": 17.42911148071289, "learning_rate": 1e-06, "loss": 0.3496, "mean_token_accuracy": 0.8874842524528503, "num_tokens": 456085815.0, "step": 11952 }, { "epoch": 1.5205444599923674, "ewc_loss": 0.029062552377581596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9062552130199037e-05, "grad_norm": 17.49383544921875, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8711324334144592, "num_tokens": 456128971.0, "step": 11953 }, { "epoch": 1.520671670270958, "ewc_loss": 0.029045499861240387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9045499104540795e-05, "grad_norm": 17.47539520263672, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8515750765800476, "num_tokens": 456159705.0, "step": 11954 }, { "epoch": 1.5207988805495485, "ewc_loss": 0.029046641662716866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9046641429886222e-05, "grad_norm": 17.519908905029297, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8593589663505554, "num_tokens": 456198251.0, "step": 11955 }, { "epoch": 1.520926090828139, "ewc_loss": 0.029073061421513557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9073062250972725e-05, "grad_norm": 17.429080963134766, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8616751432418823, "num_tokens": 456237612.0, "step": 11956 }, { "epoch": 1.5210533011067295, "ewc_loss": 0.029049929231405258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.904992834373843e-05, "grad_norm": 17.479700088500977, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8671774864196777, "num_tokens": 456273665.0, "step": 11957 }, { "epoch": 1.52118051138532, "ewc_loss": 0.02909100614488125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9091006581438705e-05, "grad_norm": 17.47134780883789, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8696385622024536, "num_tokens": 456305467.0, "step": 11958 }, { "epoch": 1.5213077216639106, "ewc_loss": 0.029122617095708847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9122616979293525e-05, "grad_norm": 17.5174503326416, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8749679327011108, "num_tokens": 456342951.0, "step": 11959 }, { "epoch": 1.521434931942501, "ewc_loss": 0.02913374826312065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9133749194443226e-05, "grad_norm": 17.445465087890625, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8680516481399536, "num_tokens": 456375846.0, "step": 11960 }, { "epoch": 1.5215621422210914, "ewc_loss": 0.029103053733706474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.910305374825839e-05, "grad_norm": 17.48578453063965, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.877803385257721, "num_tokens": 456407993.0, "step": 11961 }, { "epoch": 1.521689352499682, "ewc_loss": 0.029148928821086884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9148928661015816e-05, "grad_norm": 17.511106491088867, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8494710922241211, "num_tokens": 456444278.0, "step": 11962 }, { "epoch": 1.5218165627782725, "ewc_loss": 0.029116341844201088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9116341465851292e-05, "grad_norm": 17.47038459777832, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8772746324539185, "num_tokens": 456483950.0, "step": 11963 }, { "epoch": 1.521943773056863, "ewc_loss": 0.029146427288651466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.914642755058594e-05, "grad_norm": 17.49583625793457, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8692646622657776, "num_tokens": 456527438.0, "step": 11964 }, { "epoch": 1.5220709833354535, "ewc_loss": 0.02911292016506195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9112919946783222e-05, "grad_norm": 17.45853614807129, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8787646889686584, "num_tokens": 456560572.0, "step": 11965 }, { "epoch": 1.5221981936140438, "ewc_loss": 0.029114168137311935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9114167773514055e-05, "grad_norm": 17.435272216796875, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8599241375923157, "num_tokens": 456604781.0, "step": 11966 }, { "epoch": 1.5223254038926344, "ewc_loss": 0.02909930981695652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9099310268065892e-05, "grad_norm": 17.431116104125977, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.8818985819816589, "num_tokens": 456639415.0, "step": 11967 }, { "epoch": 1.522452614171225, "ewc_loss": 0.029132865369319916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9132865165593103e-05, "grad_norm": 17.482515335083008, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8805598020553589, "num_tokens": 456671183.0, "step": 11968 }, { "epoch": 1.5225798244498154, "ewc_loss": 0.029140446335077286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9140446713427082e-05, "grad_norm": 17.38990020751953, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8574773669242859, "num_tokens": 456712995.0, "step": 11969 }, { "epoch": 1.522707034728406, "ewc_loss": 0.0291446540504694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9144654035917483e-05, "grad_norm": 17.521772384643555, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8640567064285278, "num_tokens": 456750587.0, "step": 11970 }, { "epoch": 1.5228342450069965, "ewc_loss": 0.0291285403072834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.912853960879147e-05, "grad_norm": 17.42473793029785, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8763934373855591, "num_tokens": 456787503.0, "step": 11971 }, { "epoch": 1.522961455285587, "ewc_loss": 0.029072128236293793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9072129109408706e-05, "grad_norm": 17.450881958007812, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.847079336643219, "num_tokens": 456831645.0, "step": 11972 }, { "epoch": 1.5230886655641775, "ewc_loss": 0.029145509004592896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.914550896093715e-05, "grad_norm": 17.49982452392578, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8761450052261353, "num_tokens": 456869143.0, "step": 11973 }, { "epoch": 1.523215875842768, "ewc_loss": 0.029109517112374306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9109516617609188e-05, "grad_norm": 17.411922454833984, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.866852879524231, "num_tokens": 456906465.0, "step": 11974 }, { "epoch": 1.5233430861213586, "ewc_loss": 0.02916421741247177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.916421726695262e-05, "grad_norm": 17.592979431152344, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.867368757724762, "num_tokens": 456942048.0, "step": 11975 }, { "epoch": 1.5234702963999491, "ewc_loss": 0.029130972921848297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9130973416613415e-05, "grad_norm": 17.48617172241211, "learning_rate": 1e-06, "loss": 0.3511, "mean_token_accuracy": 0.886367917060852, "num_tokens": 456975710.0, "step": 11976 }, { "epoch": 1.5235975066785397, "ewc_loss": 0.029054157435894012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9054157494101673e-05, "grad_norm": 17.465614318847656, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8568916320800781, "num_tokens": 457022295.0, "step": 11977 }, { "epoch": 1.5237247169571302, "ewc_loss": 0.029100243002176285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.910024340962991e-05, "grad_norm": 17.47434425354004, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8614211678504944, "num_tokens": 457060433.0, "step": 11978 }, { "epoch": 1.5238519272357207, "ewc_loss": 0.029026873409748077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9026872653048486e-05, "grad_norm": 17.443614959716797, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8750505447387695, "num_tokens": 457101760.0, "step": 11979 }, { "epoch": 1.5239791375143112, "ewc_loss": 0.029076851904392242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.907685120590031e-05, "grad_norm": 17.479928970336914, "learning_rate": 1e-06, "loss": 0.3575, "mean_token_accuracy": 0.8835101127624512, "num_tokens": 457142676.0, "step": 11980 }, { "epoch": 1.5241063477929018, "ewc_loss": 0.029087070375680923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9087070288369432e-05, "grad_norm": 17.432355880737305, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8752923607826233, "num_tokens": 457179507.0, "step": 11981 }, { "epoch": 1.5242335580714923, "ewc_loss": 0.029068203642964363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9068203730275854e-05, "grad_norm": 17.487180709838867, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8697111010551453, "num_tokens": 457221563.0, "step": 11982 }, { "epoch": 1.5243607683500828, "ewc_loss": 0.029103729873895645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.910373041231651e-05, "grad_norm": 17.39306640625, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8662024140357971, "num_tokens": 457258634.0, "step": 11983 }, { "epoch": 1.5244879786286734, "ewc_loss": 0.029091913253068924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.909191243816167e-05, "grad_norm": 17.45549774169922, "learning_rate": 1e-06, "loss": 0.366, "mean_token_accuracy": 0.8833243250846863, "num_tokens": 457293097.0, "step": 11984 }, { "epoch": 1.5246151889072637, "ewc_loss": 0.029072044417262077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.907204361690674e-05, "grad_norm": 17.38313102722168, "learning_rate": 1e-06, "loss": 0.3639, "mean_token_accuracy": 0.8826196193695068, "num_tokens": 457333525.0, "step": 11985 }, { "epoch": 1.5247423991858542, "ewc_loss": 0.029140474274754524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9140473998268135e-05, "grad_norm": 17.4778995513916, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8547546863555908, "num_tokens": 457379772.0, "step": 11986 }, { "epoch": 1.5248696094644447, "ewc_loss": 0.029103774577379227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9103774068062194e-05, "grad_norm": 17.361265182495117, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8737624883651733, "num_tokens": 457415852.0, "step": 11987 }, { "epoch": 1.5249968197430352, "ewc_loss": 0.029102586209774017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9102586267981678e-05, "grad_norm": 17.495866775512695, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8737643957138062, "num_tokens": 457449771.0, "step": 11988 }, { "epoch": 1.5251240300216258, "ewc_loss": 0.029096024110913277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9096023354213685e-05, "grad_norm": 17.38727378845215, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8647289872169495, "num_tokens": 457489601.0, "step": 11989 }, { "epoch": 1.5252512403002163, "ewc_loss": 0.029078258201479912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9078257284709252e-05, "grad_norm": 17.43419647216797, "learning_rate": 1e-06, "loss": 0.3634, "mean_token_accuracy": 0.8830908536911011, "num_tokens": 457530516.0, "step": 11990 }, { "epoch": 1.5253784505788066, "ewc_loss": 0.02914191596210003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9141916456865147e-05, "grad_norm": 17.439340591430664, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.86538165807724, "num_tokens": 457568299.0, "step": 11991 }, { "epoch": 1.5255056608573971, "ewc_loss": 0.02906535379588604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9065353373880498e-05, "grad_norm": 17.430646896362305, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8681522607803345, "num_tokens": 457600476.0, "step": 11992 }, { "epoch": 1.5256328711359877, "ewc_loss": 0.029137954115867615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9137954697944224e-05, "grad_norm": 17.54793930053711, "learning_rate": 1e-06, "loss": 0.3567, "mean_token_accuracy": 0.88807213306427, "num_tokens": 457632024.0, "step": 11993 }, { "epoch": 1.5257600814145782, "ewc_loss": 0.029146632179617882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.914663309638854e-05, "grad_norm": 17.42713737487793, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8755004405975342, "num_tokens": 457670805.0, "step": 11994 }, { "epoch": 1.5258872916931687, "ewc_loss": 0.02909572422504425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9095725039951503e-05, "grad_norm": 17.478145599365234, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8703480958938599, "num_tokens": 457702468.0, "step": 11995 }, { "epoch": 1.5260145019717593, "ewc_loss": 0.029182400554418564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9182399885030463e-05, "grad_norm": 17.485897064208984, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8775873184204102, "num_tokens": 457740443.0, "step": 11996 }, { "epoch": 1.5261417122503498, "ewc_loss": 0.029099639505147934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9099639505147934e-05, "grad_norm": 17.425233840942383, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8717621564865112, "num_tokens": 457775833.0, "step": 11997 }, { "epoch": 1.5262689225289403, "ewc_loss": 0.02917524054646492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9175240342738107e-05, "grad_norm": 17.550729751586914, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8724279403686523, "num_tokens": 457815850.0, "step": 11998 }, { "epoch": 1.5263961328075308, "ewc_loss": 0.029236022382974625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.923602187365759e-05, "grad_norm": 17.497539520263672, "learning_rate": 1e-06, "loss": 0.3893, "mean_token_accuracy": 0.8774540424346924, "num_tokens": 457854946.0, "step": 11999 }, { "epoch": 1.5265233430861214, "ewc_loss": 0.029080169275403023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.908016904257238e-05, "grad_norm": 17.444917678833008, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8746220469474792, "num_tokens": 457890775.0, "step": 12000 }, { "epoch": 1.526650553364712, "ewc_loss": 0.029218293726444244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9218294002930634e-05, "grad_norm": 17.513681411743164, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.874832034111023, "num_tokens": 457928143.0, "step": 12001 }, { "epoch": 1.5267777636433024, "ewc_loss": 0.029166078194975853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9166078093112446e-05, "grad_norm": 17.440858840942383, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8759074211120605, "num_tokens": 457965999.0, "step": 12002 }, { "epoch": 1.526904973921893, "ewc_loss": 0.029166247695684433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9166247259126976e-05, "grad_norm": 17.5192928314209, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8695332407951355, "num_tokens": 458006638.0, "step": 12003 }, { "epoch": 1.5270321842004835, "ewc_loss": 0.029169483110308647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9169483241275884e-05, "grad_norm": 17.4710750579834, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8701364994049072, "num_tokens": 458047522.0, "step": 12004 }, { "epoch": 1.527159394479074, "ewc_loss": 0.029142457991838455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9142458515707403e-05, "grad_norm": 17.426115036010742, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8697572946548462, "num_tokens": 458091219.0, "step": 12005 }, { "epoch": 1.5272866047576645, "ewc_loss": 0.029162945225834846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.916294579335954e-05, "grad_norm": 17.45701789855957, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.858093798160553, "num_tokens": 458129208.0, "step": 12006 }, { "epoch": 1.527413815036255, "ewc_loss": 0.029118651524186134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9118651582393795e-05, "grad_norm": 17.475648880004883, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8652899861335754, "num_tokens": 458163632.0, "step": 12007 }, { "epoch": 1.5275410253148456, "ewc_loss": 0.029152197763323784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9152197384973988e-05, "grad_norm": 17.48399543762207, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8633219003677368, "num_tokens": 458202377.0, "step": 12008 }, { "epoch": 1.527668235593436, "ewc_loss": 0.02911200188100338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.911200135713443e-05, "grad_norm": 17.509403228759766, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8690721988677979, "num_tokens": 458239901.0, "step": 12009 }, { "epoch": 1.5277954458720264, "ewc_loss": 0.029097415506839752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9097414881107397e-05, "grad_norm": 17.44504165649414, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8643522262573242, "num_tokens": 458280291.0, "step": 12010 }, { "epoch": 1.527922656150617, "ewc_loss": 0.029126115143299103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9126114895916544e-05, "grad_norm": 17.51973533630371, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.8753235936164856, "num_tokens": 458316931.0, "step": 12011 }, { "epoch": 1.5280498664292075, "ewc_loss": 0.029103782027959824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9103781344019808e-05, "grad_norm": 17.459247589111328, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8633660674095154, "num_tokens": 458356477.0, "step": 12012 }, { "epoch": 1.528177076707798, "ewc_loss": 0.02910206839442253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9102067856001668e-05, "grad_norm": 17.49988555908203, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.855370283126831, "num_tokens": 458401340.0, "step": 12013 }, { "epoch": 1.5283042869863885, "ewc_loss": 0.029099207371473312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.909920658566989e-05, "grad_norm": 17.440776824951172, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8762537240982056, "num_tokens": 458436190.0, "step": 12014 }, { "epoch": 1.5284314972649788, "ewc_loss": 0.02910798229277134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9107983209542e-05, "grad_norm": 17.493701934814453, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8682523965835571, "num_tokens": 458465513.0, "step": 12015 }, { "epoch": 1.5285587075435694, "ewc_loss": 0.029158219695091248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9158220058889128e-05, "grad_norm": 17.50850486755371, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8668539524078369, "num_tokens": 458505465.0, "step": 12016 }, { "epoch": 1.52868591782216, "ewc_loss": 0.029164735227823257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.916473567893263e-05, "grad_norm": 17.518009185791016, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8675495982170105, "num_tokens": 458545087.0, "step": 12017 }, { "epoch": 1.5288131281007504, "ewc_loss": 0.029118575155735016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9118575184838846e-05, "grad_norm": 17.470491409301758, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8770369291305542, "num_tokens": 458583274.0, "step": 12018 }, { "epoch": 1.528940338379341, "ewc_loss": 0.02912493608891964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9124936190783046e-05, "grad_norm": 17.454294204711914, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8639786839485168, "num_tokens": 458626026.0, "step": 12019 }, { "epoch": 1.5290675486579315, "ewc_loss": 0.02912457473576069, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.912457421189174e-05, "grad_norm": 17.52877426147461, "learning_rate": 1e-06, "loss": 0.3707, "mean_token_accuracy": 0.8807621002197266, "num_tokens": 458668774.0, "step": 12020 }, { "epoch": 1.529194758936522, "ewc_loss": 0.029107699170708656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9107699447195046e-05, "grad_norm": 17.483789443969727, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8731957674026489, "num_tokens": 458702256.0, "step": 12021 }, { "epoch": 1.5293219692151125, "ewc_loss": 0.029028775170445442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9028775315964594e-05, "grad_norm": 17.43654441833496, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.842670202255249, "num_tokens": 458741429.0, "step": 12022 }, { "epoch": 1.529449179493703, "ewc_loss": 0.02911384031176567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9113840355421416e-05, "grad_norm": 17.455564498901367, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8743621110916138, "num_tokens": 458775608.0, "step": 12023 }, { "epoch": 1.5295763897722936, "ewc_loss": 0.029104672372341156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9104672648827545e-05, "grad_norm": 17.471797943115234, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8728895783424377, "num_tokens": 458811451.0, "step": 12024 }, { "epoch": 1.5297036000508841, "ewc_loss": 0.029122529551386833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9122529667802155e-05, "grad_norm": 17.455801010131836, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8736428022384644, "num_tokens": 458851761.0, "step": 12025 }, { "epoch": 1.5298308103294747, "ewc_loss": 0.029083337634801865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9083337722113356e-05, "grad_norm": 17.477378845214844, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8628718852996826, "num_tokens": 458888948.0, "step": 12026 }, { "epoch": 1.5299580206080652, "ewc_loss": 0.029162650927901268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9162651117076166e-05, "grad_norm": 17.490123748779297, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8676086664199829, "num_tokens": 458922466.0, "step": 12027 }, { "epoch": 1.5300852308866557, "ewc_loss": 0.02914350852370262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.914350807259325e-05, "grad_norm": 17.481027603149414, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8642185926437378, "num_tokens": 458956947.0, "step": 12028 }, { "epoch": 1.5302124411652462, "ewc_loss": 0.029097886756062508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9097885999362916e-05, "grad_norm": 17.485795974731445, "learning_rate": 1e-06, "loss": 0.3564, "mean_token_accuracy": 0.8871743679046631, "num_tokens": 458993634.0, "step": 12029 }, { "epoch": 1.5303396514438368, "ewc_loss": 0.029212985187768936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9212984372861683e-05, "grad_norm": 17.482526779174805, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.8781604766845703, "num_tokens": 459034314.0, "step": 12030 }, { "epoch": 1.5304668617224273, "ewc_loss": 0.0291028693318367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.910287003032863e-05, "grad_norm": 17.443405151367188, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8747678995132446, "num_tokens": 459072117.0, "step": 12031 }, { "epoch": 1.5305940720010178, "ewc_loss": 0.02915128879249096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9151287890272215e-05, "grad_norm": 17.503427505493164, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8640056848526001, "num_tokens": 459112382.0, "step": 12032 }, { "epoch": 1.5307212822796084, "ewc_loss": 0.029142143204808235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.914214383054059e-05, "grad_norm": 17.493112564086914, "learning_rate": 1e-06, "loss": 0.3724, "mean_token_accuracy": 0.8818415403366089, "num_tokens": 459148973.0, "step": 12033 }, { "epoch": 1.5308484925581987, "ewc_loss": 0.029202435165643692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9202434234321117e-05, "grad_norm": 17.44808578491211, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8671969175338745, "num_tokens": 459183242.0, "step": 12034 }, { "epoch": 1.5309757028367892, "ewc_loss": 0.029213638976216316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.921363920904696e-05, "grad_norm": 17.54941749572754, "learning_rate": 1e-06, "loss": 0.3508, "mean_token_accuracy": 0.8868809938430786, "num_tokens": 459219606.0, "step": 12035 }, { "epoch": 1.5311029131153797, "ewc_loss": 0.029165195301175117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9165195883251727e-05, "grad_norm": 17.494306564331055, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.8874908685684204, "num_tokens": 459253763.0, "step": 12036 }, { "epoch": 1.5312301233939702, "ewc_loss": 0.02912667766213417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.912667696364224e-05, "grad_norm": 17.524166107177734, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8725141286849976, "num_tokens": 459291187.0, "step": 12037 }, { "epoch": 1.5313573336725608, "ewc_loss": 0.029179561883211136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9179562261560932e-05, "grad_norm": 17.45223617553711, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8663771748542786, "num_tokens": 459327423.0, "step": 12038 }, { "epoch": 1.5314845439511513, "ewc_loss": 0.029155174270272255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9155175070627593e-05, "grad_norm": 17.593421936035156, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8656453490257263, "num_tokens": 459366207.0, "step": 12039 }, { "epoch": 1.5316117542297416, "ewc_loss": 0.029184600338339806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9184600862208754e-05, "grad_norm": 17.497066497802734, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8621160387992859, "num_tokens": 459408338.0, "step": 12040 }, { "epoch": 1.5317389645083321, "ewc_loss": 0.02913074381649494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9130744223948568e-05, "grad_norm": 17.465620040893555, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.8801311254501343, "num_tokens": 459443420.0, "step": 12041 }, { "epoch": 1.5318661747869227, "ewc_loss": 0.029174435883760452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.917443634942174e-05, "grad_norm": 17.531047821044922, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8545167446136475, "num_tokens": 459475835.0, "step": 12042 }, { "epoch": 1.5319933850655132, "ewc_loss": 0.029173482209444046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9173481379984878e-05, "grad_norm": 17.51401138305664, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.869446337223053, "num_tokens": 459511993.0, "step": 12043 }, { "epoch": 1.5321205953441037, "ewc_loss": 0.029166387394070625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.916638732131105e-05, "grad_norm": 17.488065719604492, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8773491978645325, "num_tokens": 459546221.0, "step": 12044 }, { "epoch": 1.5322478056226942, "ewc_loss": 0.029155749827623367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9155749871279113e-05, "grad_norm": 17.529687881469727, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8740614652633667, "num_tokens": 459587271.0, "step": 12045 }, { "epoch": 1.5323750159012848, "ewc_loss": 0.029187098145484924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9187098334659822e-05, "grad_norm": 17.53054428100586, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.858309268951416, "num_tokens": 459623388.0, "step": 12046 }, { "epoch": 1.5325022261798753, "ewc_loss": 0.02916570007801056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.916569974331651e-05, "grad_norm": 17.508195877075195, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8700608015060425, "num_tokens": 459666440.0, "step": 12047 }, { "epoch": 1.5326294364584658, "ewc_loss": 0.029144270345568657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9144270229153335e-05, "grad_norm": 17.489473342895508, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8572512269020081, "num_tokens": 459708373.0, "step": 12048 }, { "epoch": 1.5327566467370564, "ewc_loss": 0.02917211502790451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9172115318942815e-05, "grad_norm": 17.459943771362305, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8803437352180481, "num_tokens": 459746707.0, "step": 12049 }, { "epoch": 1.532883857015647, "ewc_loss": 0.029155051335692406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.915505137934815e-05, "grad_norm": 17.473421096801758, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8750780820846558, "num_tokens": 459792493.0, "step": 12050 }, { "epoch": 1.5330110672942374, "ewc_loss": 0.029194612056016922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.919461257988587e-05, "grad_norm": 17.52796173095703, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8787060976028442, "num_tokens": 459828897.0, "step": 12051 }, { "epoch": 1.533138277572828, "ewc_loss": 0.029137980192899704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9137980163795874e-05, "grad_norm": 17.42914581298828, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8610202074050903, "num_tokens": 459866274.0, "step": 12052 }, { "epoch": 1.5332654878514185, "ewc_loss": 0.02914859540760517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9148595785954967e-05, "grad_norm": 17.519447326660156, "learning_rate": 1e-06, "loss": 0.3534, "mean_token_accuracy": 0.8849070072174072, "num_tokens": 459902442.0, "step": 12053 }, { "epoch": 1.533392698130009, "ewc_loss": 0.029151059687137604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9151060516596772e-05, "grad_norm": 17.478025436401367, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8782271146774292, "num_tokens": 459943331.0, "step": 12054 }, { "epoch": 1.5335199084085995, "ewc_loss": 0.029115475714206696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9115475626895204e-05, "grad_norm": 17.47100067138672, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8759810924530029, "num_tokens": 459975349.0, "step": 12055 }, { "epoch": 1.53364711868719, "ewc_loss": 0.029175125062465668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9175125746405683e-05, "grad_norm": 17.54559898376465, "learning_rate": 1e-06, "loss": 0.3515, "mean_token_accuracy": 0.883152186870575, "num_tokens": 460008295.0, "step": 12056 }, { "epoch": 1.5337743289657806, "ewc_loss": 0.029116246849298477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9116246878402308e-05, "grad_norm": 17.46727180480957, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8746484518051147, "num_tokens": 460045892.0, "step": 12057 }, { "epoch": 1.533901539244371, "ewc_loss": 0.029124144464731216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9124144930392504e-05, "grad_norm": 17.52312469482422, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.870120644569397, "num_tokens": 460084255.0, "step": 12058 }, { "epoch": 1.5340287495229614, "ewc_loss": 0.02912074886262417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9120748877176084e-05, "grad_norm": 17.497634887695312, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8564264178276062, "num_tokens": 460123706.0, "step": 12059 }, { "epoch": 1.534155959801552, "ewc_loss": 0.02914637327194214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9146372980903834e-05, "grad_norm": 17.558147430419922, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8562501072883606, "num_tokens": 460162351.0, "step": 12060 }, { "epoch": 1.5342831700801425, "ewc_loss": 0.02912409044802189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9124090360710397e-05, "grad_norm": 17.483110427856445, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.86020427942276, "num_tokens": 460200135.0, "step": 12061 }, { "epoch": 1.534410380358733, "ewc_loss": 0.029088860377669334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.908886017394252e-05, "grad_norm": 17.530614852905273, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8638172745704651, "num_tokens": 460243585.0, "step": 12062 }, { "epoch": 1.5345375906373235, "ewc_loss": 0.029142990708351135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9142991479602642e-05, "grad_norm": 17.48324966430664, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.863762617111206, "num_tokens": 460283695.0, "step": 12063 }, { "epoch": 1.5346648009159138, "ewc_loss": 0.029089294373989105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.908929491240997e-05, "grad_norm": 17.490678787231445, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8773142099380493, "num_tokens": 460317241.0, "step": 12064 }, { "epoch": 1.5347920111945044, "ewc_loss": 0.029138559475541115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.91385586024262e-05, "grad_norm": 17.48924446105957, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8718887567520142, "num_tokens": 460351900.0, "step": 12065 }, { "epoch": 1.534919221473095, "ewc_loss": 0.0290797408670187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9079741580062546e-05, "grad_norm": 17.40839385986328, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8724079132080078, "num_tokens": 460391672.0, "step": 12066 }, { "epoch": 1.5350464317516854, "ewc_loss": 0.029129818081855774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9129818358342163e-05, "grad_norm": 17.56926155090332, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8628883957862854, "num_tokens": 460434120.0, "step": 12067 }, { "epoch": 1.535173642030276, "ewc_loss": 0.029181089252233505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9181088393670507e-05, "grad_norm": 17.477848052978516, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.871511697769165, "num_tokens": 460474214.0, "step": 12068 }, { "epoch": 1.5353008523088665, "ewc_loss": 0.02914542332291603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9145423468435183e-05, "grad_norm": 17.500993728637695, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8755079507827759, "num_tokens": 460507549.0, "step": 12069 }, { "epoch": 1.535428062587457, "ewc_loss": 0.029195252805948257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.919525286415592e-05, "grad_norm": 17.504308700561523, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8626555800437927, "num_tokens": 460546391.0, "step": 12070 }, { "epoch": 1.5355552728660475, "ewc_loss": 0.02919856272637844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9198563424870372e-05, "grad_norm": 17.499357223510742, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8591710329055786, "num_tokens": 460585232.0, "step": 12071 }, { "epoch": 1.535682483144638, "ewc_loss": 0.029150042682886124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9150041882530786e-05, "grad_norm": 17.520381927490234, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.878908097743988, "num_tokens": 460624398.0, "step": 12072 }, { "epoch": 1.5358096934232286, "ewc_loss": 0.029188955202698708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9188955522840843e-05, "grad_norm": 17.52665901184082, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8630168437957764, "num_tokens": 460667193.0, "step": 12073 }, { "epoch": 1.5359369037018191, "ewc_loss": 0.029129276052117348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9129276299499907e-05, "grad_norm": 17.517045974731445, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8469232320785522, "num_tokens": 460710614.0, "step": 12074 }, { "epoch": 1.5360641139804097, "ewc_loss": 0.029174892231822014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.917489291576203e-05, "grad_norm": 17.549205780029297, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8660762906074524, "num_tokens": 460751079.0, "step": 12075 }, { "epoch": 1.5361913242590002, "ewc_loss": 0.02917763590812683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9177635951782577e-05, "grad_norm": 17.41123390197754, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8693747520446777, "num_tokens": 460786768.0, "step": 12076 }, { "epoch": 1.5363185345375907, "ewc_loss": 0.029160866513848305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9160866688471287e-05, "grad_norm": 17.588214874267578, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8601377606391907, "num_tokens": 460824757.0, "step": 12077 }, { "epoch": 1.5364457448161812, "ewc_loss": 0.029216784983873367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.921678424172569e-05, "grad_norm": 17.450706481933594, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8629037141799927, "num_tokens": 460865024.0, "step": 12078 }, { "epoch": 1.5365729550947718, "ewc_loss": 0.02910630777478218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9106307920301333e-05, "grad_norm": 17.547849655151367, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8611900210380554, "num_tokens": 460900895.0, "step": 12079 }, { "epoch": 1.5367001653733623, "ewc_loss": 0.02921099215745926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.92109925794648e-05, "grad_norm": 17.454086303710938, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8750203251838684, "num_tokens": 460936876.0, "step": 12080 }, { "epoch": 1.5368273756519528, "ewc_loss": 0.02914200723171234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9142007406335324e-05, "grad_norm": 17.522567749023438, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.854361891746521, "num_tokens": 460975297.0, "step": 12081 }, { "epoch": 1.5369545859305433, "ewc_loss": 0.029214462265372276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9214461392257363e-05, "grad_norm": 17.572111129760742, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8571295738220215, "num_tokens": 461008603.0, "step": 12082 }, { "epoch": 1.5370817962091337, "ewc_loss": 0.029207227751612663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.920722727139946e-05, "grad_norm": 17.480546951293945, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8611042499542236, "num_tokens": 461047770.0, "step": 12083 }, { "epoch": 1.5372090064877242, "ewc_loss": 0.029154706746339798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.915470759035088e-05, "grad_norm": 17.54426383972168, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8568964600563049, "num_tokens": 461088554.0, "step": 12084 }, { "epoch": 1.5373362167663147, "ewc_loss": 0.029221395030617714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.922139537986368e-05, "grad_norm": 17.518564224243164, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8482324481010437, "num_tokens": 461126070.0, "step": 12085 }, { "epoch": 1.5374634270449052, "ewc_loss": 0.02916446514427662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.91644646495115e-05, "grad_norm": 17.478038787841797, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8665683269500732, "num_tokens": 461166394.0, "step": 12086 }, { "epoch": 1.5375906373234958, "ewc_loss": 0.02919982373714447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.919982398452703e-05, "grad_norm": 17.481735229492188, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8621533513069153, "num_tokens": 461208263.0, "step": 12087 }, { "epoch": 1.5377178476020863, "ewc_loss": 0.02921067178249359, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9210672437329777e-05, "grad_norm": 17.54302215576172, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8794677257537842, "num_tokens": 461245054.0, "step": 12088 }, { "epoch": 1.5378450578806766, "ewc_loss": 0.029202444478869438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.920244514825754e-05, "grad_norm": 17.534076690673828, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8580710887908936, "num_tokens": 461281584.0, "step": 12089 }, { "epoch": 1.5379722681592671, "ewc_loss": 0.029174750670790672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9174751034588553e-05, "grad_norm": 17.53952407836914, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.877441942691803, "num_tokens": 461322247.0, "step": 12090 }, { "epoch": 1.5380994784378577, "ewc_loss": 0.029196886345744133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9196886316640303e-05, "grad_norm": 17.589447021484375, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8570802807807922, "num_tokens": 461358371.0, "step": 12091 }, { "epoch": 1.5382266887164482, "ewc_loss": 0.029229726642370224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.922972635133192e-05, "grad_norm": 17.433115005493164, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8754880428314209, "num_tokens": 461394666.0, "step": 12092 }, { "epoch": 1.5383538989950387, "ewc_loss": 0.02913784608244896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.913784555858001e-05, "grad_norm": 17.528261184692383, "learning_rate": 1e-06, "loss": 0.3648, "mean_token_accuracy": 0.8813410997390747, "num_tokens": 461430875.0, "step": 12093 }, { "epoch": 1.5384811092736292, "ewc_loss": 0.029292676597833633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.929267611762043e-05, "grad_norm": 17.500537872314453, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.865400493144989, "num_tokens": 461467412.0, "step": 12094 }, { "epoch": 1.5386083195522198, "ewc_loss": 0.02912447415292263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9124474167474546e-05, "grad_norm": 17.513839721679688, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8471847176551819, "num_tokens": 461501970.0, "step": 12095 }, { "epoch": 1.5387355298308103, "ewc_loss": 0.02920462377369404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9204624297562987e-05, "grad_norm": 17.459976196289062, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8565667271614075, "num_tokens": 461542735.0, "step": 12096 }, { "epoch": 1.5388627401094008, "ewc_loss": 0.029205741360783577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9205741157056764e-05, "grad_norm": 17.55620765686035, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8715952634811401, "num_tokens": 461583883.0, "step": 12097 }, { "epoch": 1.5389899503879914, "ewc_loss": 0.029203221201896667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9203221856732853e-05, "grad_norm": 17.508981704711914, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8611810207366943, "num_tokens": 461625124.0, "step": 12098 }, { "epoch": 1.5391171606665819, "ewc_loss": 0.02917388081550598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9173881557653658e-05, "grad_norm": 17.464895248413086, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8587570190429688, "num_tokens": 461660273.0, "step": 12099 }, { "epoch": 1.5392443709451724, "ewc_loss": 0.02920518070459366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.920518090832047e-05, "grad_norm": 17.472000122070312, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8628939986228943, "num_tokens": 461698890.0, "step": 12100 }, { "epoch": 1.539371581223763, "ewc_loss": 0.02919292263686657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9192922738729976e-05, "grad_norm": 17.50498390197754, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8689020276069641, "num_tokens": 461736927.0, "step": 12101 }, { "epoch": 1.5394987915023535, "ewc_loss": 0.02919463813304901, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.919463804573752e-05, "grad_norm": 17.40411376953125, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8671747446060181, "num_tokens": 461777796.0, "step": 12102 }, { "epoch": 1.539626001780944, "ewc_loss": 0.029210427775979042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.92104286927497e-05, "grad_norm": 17.504249572753906, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.877723217010498, "num_tokens": 461814656.0, "step": 12103 }, { "epoch": 1.5397532120595345, "ewc_loss": 0.029243942350149155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.924394175352063e-05, "grad_norm": 17.50867462158203, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8514397144317627, "num_tokens": 461852186.0, "step": 12104 }, { "epoch": 1.539880422338125, "ewc_loss": 0.029245996847748756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9245997211546637e-05, "grad_norm": 17.52756690979004, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.882276713848114, "num_tokens": 461893724.0, "step": 12105 }, { "epoch": 1.5400076326167156, "ewc_loss": 0.029222233220934868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9222233933978714e-05, "grad_norm": 17.548297882080078, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8564110994338989, "num_tokens": 461935380.0, "step": 12106 }, { "epoch": 1.5401348428953059, "ewc_loss": 0.02918306551873684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.918306563515216e-05, "grad_norm": 17.5047607421875, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8754422664642334, "num_tokens": 461971463.0, "step": 12107 }, { "epoch": 1.5402620531738964, "ewc_loss": 0.02920786291360855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9207862098701298e-05, "grad_norm": 17.548656463623047, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8753690123558044, "num_tokens": 462007514.0, "step": 12108 }, { "epoch": 1.540389263452487, "ewc_loss": 0.02916344627737999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9163446015445516e-05, "grad_norm": 17.442642211914062, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8818107843399048, "num_tokens": 462046022.0, "step": 12109 }, { "epoch": 1.5405164737310775, "ewc_loss": 0.029161641374230385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9161641577957198e-05, "grad_norm": 17.518129348754883, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8723485469818115, "num_tokens": 462089367.0, "step": 12110 }, { "epoch": 1.540643684009668, "ewc_loss": 0.02918657660484314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9186576284701005e-05, "grad_norm": 17.491785049438477, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8739867806434631, "num_tokens": 462128558.0, "step": 12111 }, { "epoch": 1.5407708942882585, "ewc_loss": 0.02913808263838291, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9138082027202472e-05, "grad_norm": 17.510520935058594, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.876400887966156, "num_tokens": 462162639.0, "step": 12112 }, { "epoch": 1.5408981045668488, "ewc_loss": 0.02920084074139595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.920084079960361e-05, "grad_norm": 17.470277786254883, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.868804931640625, "num_tokens": 462199303.0, "step": 12113 }, { "epoch": 1.5410253148454394, "ewc_loss": 0.029209839180111885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9209839340182953e-05, "grad_norm": 17.59328842163086, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8666703104972839, "num_tokens": 462246095.0, "step": 12114 }, { "epoch": 1.54115252512403, "ewc_loss": 0.02917426824569702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9174269002396613e-05, "grad_norm": 17.40833282470703, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8661525249481201, "num_tokens": 462286653.0, "step": 12115 }, { "epoch": 1.5412797354026204, "ewc_loss": 0.029134493321180344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9134493161109276e-05, "grad_norm": 17.52939224243164, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8655560612678528, "num_tokens": 462328295.0, "step": 12116 }, { "epoch": 1.541406945681211, "ewc_loss": 0.02919692173600197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.919692087743897e-05, "grad_norm": 17.453386306762695, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8712963461875916, "num_tokens": 462365175.0, "step": 12117 }, { "epoch": 1.5415341559598015, "ewc_loss": 0.029144376516342163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.914437573053874e-05, "grad_norm": 17.5567569732666, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8618100881576538, "num_tokens": 462407044.0, "step": 12118 }, { "epoch": 1.541661366238392, "ewc_loss": 0.029165828600525856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.916582889156416e-05, "grad_norm": 17.451583862304688, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8622974157333374, "num_tokens": 462443819.0, "step": 12119 }, { "epoch": 1.5417885765169825, "ewc_loss": 0.02908143773674965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.908143687818665e-05, "grad_norm": 17.46999740600586, "learning_rate": 1e-06, "loss": 0.3789, "mean_token_accuracy": 0.8777806162834167, "num_tokens": 462487319.0, "step": 12120 }, { "epoch": 1.541915786795573, "ewc_loss": 0.029186200350522995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.918619975389447e-05, "grad_norm": 17.531585693359375, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8696798086166382, "num_tokens": 462527026.0, "step": 12121 }, { "epoch": 1.5420429970741636, "ewc_loss": 0.02909740060567856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.909740032919217e-05, "grad_norm": 17.437911987304688, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8599783182144165, "num_tokens": 462557552.0, "step": 12122 }, { "epoch": 1.5421702073527541, "ewc_loss": 0.029146218672394753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9146218366804533e-05, "grad_norm": 17.53497314453125, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8695536851882935, "num_tokens": 462595243.0, "step": 12123 }, { "epoch": 1.5422974176313446, "ewc_loss": 0.02916443906724453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.916443918365985e-05, "grad_norm": 17.472957611083984, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.871117115020752, "num_tokens": 462636077.0, "step": 12124 }, { "epoch": 1.5424246279099352, "ewc_loss": 0.02914954163134098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.914954166044481e-05, "grad_norm": 17.495975494384766, "learning_rate": 1e-06, "loss": 0.3692, "mean_token_accuracy": 0.8803247213363647, "num_tokens": 462668407.0, "step": 12125 }, { "epoch": 1.5425518381885257, "ewc_loss": 0.02919420786201954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9194208764238283e-05, "grad_norm": 17.52823257446289, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8687213659286499, "num_tokens": 462707678.0, "step": 12126 }, { "epoch": 1.5426790484671162, "ewc_loss": 0.029138043522834778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9138043828424998e-05, "grad_norm": 17.528587341308594, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.86123126745224, "num_tokens": 462739722.0, "step": 12127 }, { "epoch": 1.5428062587457068, "ewc_loss": 0.029152762144804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9152761271689087e-05, "grad_norm": 17.459993362426758, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8753348588943481, "num_tokens": 462770369.0, "step": 12128 }, { "epoch": 1.5429334690242973, "ewc_loss": 0.02919716015458107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9197160984040238e-05, "grad_norm": 17.607036590576172, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8751238584518433, "num_tokens": 462806126.0, "step": 12129 }, { "epoch": 1.5430606793028878, "ewc_loss": 0.029220588505268097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9220587748568505e-05, "grad_norm": 17.500213623046875, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8708653450012207, "num_tokens": 462846585.0, "step": 12130 }, { "epoch": 1.5431878895814783, "ewc_loss": 0.02913740649819374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9137407182133757e-05, "grad_norm": 17.62349510192871, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8654406070709229, "num_tokens": 462889892.0, "step": 12131 }, { "epoch": 1.5433150998600687, "ewc_loss": 0.02924603968858719, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.924603904830292e-05, "grad_norm": 17.487499237060547, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8694129586219788, "num_tokens": 462927612.0, "step": 12132 }, { "epoch": 1.5434423101386592, "ewc_loss": 0.02913905866444111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9139058824512176e-05, "grad_norm": 17.57384490966797, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8714120388031006, "num_tokens": 462971661.0, "step": 12133 }, { "epoch": 1.5435695204172497, "ewc_loss": 0.029209552332758904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9209551939857192e-05, "grad_norm": 17.549455642700195, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.873822033405304, "num_tokens": 463011260.0, "step": 12134 }, { "epoch": 1.5436967306958402, "ewc_loss": 0.029116354882717133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9116354198777117e-05, "grad_norm": 17.491186141967773, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8760226964950562, "num_tokens": 463045395.0, "step": 12135 }, { "epoch": 1.5438239409744308, "ewc_loss": 0.029162516817450523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9162516511860304e-05, "grad_norm": 17.59967803955078, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8589723706245422, "num_tokens": 463081015.0, "step": 12136 }, { "epoch": 1.5439511512530213, "ewc_loss": 0.0291936956346035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9193695809226483e-05, "grad_norm": 17.58922576904297, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8689305186271667, "num_tokens": 463115658.0, "step": 12137 }, { "epoch": 1.5440783615316116, "ewc_loss": 0.029099829494953156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9099828680045903e-05, "grad_norm": 17.55232048034668, "learning_rate": 1e-06, "loss": 0.362, "mean_token_accuracy": 0.8812787532806396, "num_tokens": 463152084.0, "step": 12138 }, { "epoch": 1.5442055718102021, "ewc_loss": 0.02913966029882431, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.913966091000475e-05, "grad_norm": 17.548660278320312, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.859237015247345, "num_tokens": 463190137.0, "step": 12139 }, { "epoch": 1.5443327820887927, "ewc_loss": 0.0291837640106678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9183764127083123e-05, "grad_norm": 17.52302360534668, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.868941068649292, "num_tokens": 463227626.0, "step": 12140 }, { "epoch": 1.5444599923673832, "ewc_loss": 0.029142744839191437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.914274409704376e-05, "grad_norm": 17.4608097076416, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8694196939468384, "num_tokens": 463266451.0, "step": 12141 }, { "epoch": 1.5445872026459737, "ewc_loss": 0.029168926179409027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.91689266305184e-05, "grad_norm": 17.571441650390625, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8577662110328674, "num_tokens": 463305327.0, "step": 12142 }, { "epoch": 1.5447144129245642, "ewc_loss": 0.029181132093071938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9181132049416192e-05, "grad_norm": 17.52960777282715, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8655293583869934, "num_tokens": 463345890.0, "step": 12143 }, { "epoch": 1.5448416232031548, "ewc_loss": 0.029106508940458298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9106508009135723e-05, "grad_norm": 17.490699768066406, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8731639385223389, "num_tokens": 463381583.0, "step": 12144 }, { "epoch": 1.5449688334817453, "ewc_loss": 0.029151134192943573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9151133276172914e-05, "grad_norm": 17.489177703857422, "learning_rate": 1e-06, "loss": 0.372, "mean_token_accuracy": 0.8774354457855225, "num_tokens": 463414442.0, "step": 12145 }, { "epoch": 1.5450960437603358, "ewc_loss": 0.02918017841875553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9180178898968734e-05, "grad_norm": 17.53162956237793, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8754466772079468, "num_tokens": 463449517.0, "step": 12146 }, { "epoch": 1.5452232540389264, "ewc_loss": 0.029210401698946953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9210401407908648e-05, "grad_norm": 17.511106491088867, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8758774995803833, "num_tokens": 463487952.0, "step": 12147 }, { "epoch": 1.5453504643175169, "ewc_loss": 0.029219115152955055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9219114367151633e-05, "grad_norm": 17.539825439453125, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8693814277648926, "num_tokens": 463520724.0, "step": 12148 }, { "epoch": 1.5454776745961074, "ewc_loss": 0.029155375435948372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9155375159461983e-05, "grad_norm": 17.462608337402344, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8664367198944092, "num_tokens": 463564691.0, "step": 12149 }, { "epoch": 1.545604884874698, "ewc_loss": 0.02918240614235401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9182405341998674e-05, "grad_norm": 17.6107234954834, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8661331534385681, "num_tokens": 463599037.0, "step": 12150 }, { "epoch": 1.5457320951532885, "ewc_loss": 0.02923085354268551, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9230854124762118e-05, "grad_norm": 17.58203125, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8571529388427734, "num_tokens": 463630744.0, "step": 12151 }, { "epoch": 1.545859305431879, "ewc_loss": 0.02915802411735058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9158023608033545e-05, "grad_norm": 17.519472122192383, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8656169176101685, "num_tokens": 463663551.0, "step": 12152 }, { "epoch": 1.5459865157104695, "ewc_loss": 0.029207522049546242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9207521947682835e-05, "grad_norm": 17.581262588500977, "learning_rate": 1e-06, "loss": 0.3723, "mean_token_accuracy": 0.8809283971786499, "num_tokens": 463702446.0, "step": 12153 }, { "epoch": 1.54611372598906, "ewc_loss": 0.02916562557220459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9165625164750963e-05, "grad_norm": 17.500547409057617, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8606049418449402, "num_tokens": 463739324.0, "step": 12154 }, { "epoch": 1.5462409362676506, "ewc_loss": 0.029234517365694046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.923451756942086e-05, "grad_norm": 17.59035873413086, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8497674465179443, "num_tokens": 463780765.0, "step": 12155 }, { "epoch": 1.5463681465462409, "ewc_loss": 0.029250049963593483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9250049919937737e-05, "grad_norm": 17.519775390625, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8473799228668213, "num_tokens": 463817727.0, "step": 12156 }, { "epoch": 1.5464953568248314, "ewc_loss": 0.029163233935832977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.91632331936853e-05, "grad_norm": 17.480920791625977, "learning_rate": 1e-06, "loss": 0.36, "mean_token_accuracy": 0.8845025300979614, "num_tokens": 463865793.0, "step": 12157 }, { "epoch": 1.546622567103422, "ewc_loss": 0.029259804636240005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9259805160108954e-05, "grad_norm": 17.523021697998047, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8697658777236938, "num_tokens": 463908299.0, "step": 12158 }, { "epoch": 1.5467497773820125, "ewc_loss": 0.02921457029879093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9214570531621575e-05, "grad_norm": 17.49051284790039, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.864965558052063, "num_tokens": 463944513.0, "step": 12159 }, { "epoch": 1.546876987660603, "ewc_loss": 0.029221268370747566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.922126805060543e-05, "grad_norm": 17.52033042907715, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8617111444473267, "num_tokens": 463982862.0, "step": 12160 }, { "epoch": 1.5470041979391935, "ewc_loss": 0.029218332841992378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9218332201708108e-05, "grad_norm": 17.413625717163086, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.873043954372406, "num_tokens": 464023357.0, "step": 12161 }, { "epoch": 1.5471314082177838, "ewc_loss": 0.029214197769761086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.921419763879385e-05, "grad_norm": 17.454200744628906, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8690245151519775, "num_tokens": 464071889.0, "step": 12162 }, { "epoch": 1.5472586184963744, "ewc_loss": 0.029265983030200005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9265982448123395e-05, "grad_norm": 17.536096572875977, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8557642698287964, "num_tokens": 464112334.0, "step": 12163 }, { "epoch": 1.5473858287749649, "ewc_loss": 0.029222458600997925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9222459488664754e-05, "grad_norm": 17.49993133544922, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8645646572113037, "num_tokens": 464156833.0, "step": 12164 }, { "epoch": 1.5475130390535554, "ewc_loss": 0.029172418639063835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9172419090173207e-05, "grad_norm": 17.50750732421875, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8723976612091064, "num_tokens": 464192989.0, "step": 12165 }, { "epoch": 1.547640249332146, "ewc_loss": 0.02922731824219227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9227318009361625e-05, "grad_norm": 17.53303337097168, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8751606345176697, "num_tokens": 464232855.0, "step": 12166 }, { "epoch": 1.5477674596107365, "ewc_loss": 0.029165131971240044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9165132218622603e-05, "grad_norm": 17.523160934448242, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.85274338722229, "num_tokens": 464271860.0, "step": 12167 }, { "epoch": 1.547894669889327, "ewc_loss": 0.029212379828095436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9212380468379706e-05, "grad_norm": 17.502859115600586, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8814113140106201, "num_tokens": 464312925.0, "step": 12168 }, { "epoch": 1.5480218801679175, "ewc_loss": 0.02917720563709736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9177204851293936e-05, "grad_norm": 17.59732437133789, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8640344142913818, "num_tokens": 464349190.0, "step": 12169 }, { "epoch": 1.548149090446508, "ewc_loss": 0.029154080897569656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9154080039006658e-05, "grad_norm": 17.467525482177734, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8643643260002136, "num_tokens": 464392031.0, "step": 12170 }, { "epoch": 1.5482763007250986, "ewc_loss": 0.02911597676575184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9115977667970583e-05, "grad_norm": 17.522886276245117, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8648194670677185, "num_tokens": 464424884.0, "step": 12171 }, { "epoch": 1.5484035110036891, "ewc_loss": 0.029187219217419624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9187218387960456e-05, "grad_norm": 17.52970314025879, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8628784418106079, "num_tokens": 464466047.0, "step": 12172 }, { "epoch": 1.5485307212822796, "ewc_loss": 0.02914881892502308, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9148819521651603e-05, "grad_norm": 17.554134368896484, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8678460121154785, "num_tokens": 464497293.0, "step": 12173 }, { "epoch": 1.5486579315608702, "ewc_loss": 0.029203088954091072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9203089070506394e-05, "grad_norm": 17.500261306762695, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8552467823028564, "num_tokens": 464541873.0, "step": 12174 }, { "epoch": 1.5487851418394607, "ewc_loss": 0.029159152880311012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9159153200453147e-05, "grad_norm": 17.4258975982666, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8687331080436707, "num_tokens": 464579712.0, "step": 12175 }, { "epoch": 1.5489123521180512, "ewc_loss": 0.029232466593384743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.923246574937366e-05, "grad_norm": 17.540307998657227, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8669993877410889, "num_tokens": 464612277.0, "step": 12176 }, { "epoch": 1.5490395623966418, "ewc_loss": 0.02926291711628437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.926291745097842e-05, "grad_norm": 17.471572875976562, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8650966882705688, "num_tokens": 464647062.0, "step": 12177 }, { "epoch": 1.5491667726752323, "ewc_loss": 0.029188508167862892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.918850805144757e-05, "grad_norm": 17.487573623657227, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8708206415176392, "num_tokens": 464677998.0, "step": 12178 }, { "epoch": 1.5492939829538228, "ewc_loss": 0.02924404852092266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.924404907389544e-05, "grad_norm": 17.477075576782227, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8621837496757507, "num_tokens": 464719205.0, "step": 12179 }, { "epoch": 1.5494211932324133, "ewc_loss": 0.029261427000164986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9261427698656917e-05, "grad_norm": 17.57960319519043, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8728240728378296, "num_tokens": 464760758.0, "step": 12180 }, { "epoch": 1.5495484035110036, "ewc_loss": 0.02926652505993843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9266524506965652e-05, "grad_norm": 17.481996536254883, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8715962171554565, "num_tokens": 464797422.0, "step": 12181 }, { "epoch": 1.5496756137895942, "ewc_loss": 0.029248053207993507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9248052669572644e-05, "grad_norm": 17.5845947265625, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8700571656227112, "num_tokens": 464832889.0, "step": 12182 }, { "epoch": 1.5498028240681847, "ewc_loss": 0.02927367202937603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9273671316332184e-05, "grad_norm": 17.53171730041504, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8616681694984436, "num_tokens": 464867743.0, "step": 12183 }, { "epoch": 1.5499300343467752, "ewc_loss": 0.029215218499302864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9215218091849238e-05, "grad_norm": 17.534154891967773, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8559336066246033, "num_tokens": 464904666.0, "step": 12184 }, { "epoch": 1.5500572446253658, "ewc_loss": 0.029299478977918625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9299479137989692e-05, "grad_norm": 17.535282135009766, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8720629215240479, "num_tokens": 464942886.0, "step": 12185 }, { "epoch": 1.5501844549039563, "ewc_loss": 0.029264181852340698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9264181648613885e-05, "grad_norm": 17.517000198364258, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8683144450187683, "num_tokens": 464977463.0, "step": 12186 }, { "epoch": 1.5503116651825466, "ewc_loss": 0.029303988441824913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9303988412721083e-05, "grad_norm": 17.52463722229004, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.877585232257843, "num_tokens": 465015732.0, "step": 12187 }, { "epoch": 1.5504388754611371, "ewc_loss": 0.02927083894610405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9270839149830863e-05, "grad_norm": 17.442947387695312, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8770929574966431, "num_tokens": 465060789.0, "step": 12188 }, { "epoch": 1.5505660857397277, "ewc_loss": 0.029287083074450493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9287082725204527e-05, "grad_norm": 17.523677825927734, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8661224842071533, "num_tokens": 465097907.0, "step": 12189 }, { "epoch": 1.5506932960183182, "ewc_loss": 0.029386349022388458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9386348614934832e-05, "grad_norm": 17.544910430908203, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8768266439437866, "num_tokens": 465139401.0, "step": 12190 }, { "epoch": 1.5508205062969087, "ewc_loss": 0.029332896694540977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9332895792322233e-05, "grad_norm": 17.522279739379883, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8485088348388672, "num_tokens": 465176531.0, "step": 12191 }, { "epoch": 1.5509477165754992, "ewc_loss": 0.02927425689995289, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9274257030920126e-05, "grad_norm": 17.481090545654297, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.8779992461204529, "num_tokens": 465216074.0, "step": 12192 }, { "epoch": 1.5510749268540898, "ewc_loss": 0.029310422018170357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9310422178241424e-05, "grad_norm": 17.523927688598633, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.881682276725769, "num_tokens": 465257986.0, "step": 12193 }, { "epoch": 1.5512021371326803, "ewc_loss": 0.029273366555571556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9273365726112388e-05, "grad_norm": 17.44227409362793, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8729792237281799, "num_tokens": 465296038.0, "step": 12194 }, { "epoch": 1.5513293474112708, "ewc_loss": 0.02931702509522438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9317025109776296e-05, "grad_norm": 17.517995834350586, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8455196619033813, "num_tokens": 465334215.0, "step": 12195 }, { "epoch": 1.5514565576898613, "ewc_loss": 0.029348056763410568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9348057069000788e-05, "grad_norm": 17.532604217529297, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8679038286209106, "num_tokens": 465373362.0, "step": 12196 }, { "epoch": 1.5515837679684519, "ewc_loss": 0.02927461452782154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9274615371832624e-05, "grad_norm": 17.532480239868164, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8628440499305725, "num_tokens": 465410181.0, "step": 12197 }, { "epoch": 1.5517109782470424, "ewc_loss": 0.02927563153207302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9275632186909206e-05, "grad_norm": 17.49921989440918, "learning_rate": 1e-06, "loss": 0.3844, "mean_token_accuracy": 0.8760765790939331, "num_tokens": 465443056.0, "step": 12198 }, { "epoch": 1.551838188525633, "ewc_loss": 0.02930736355483532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.930736263806466e-05, "grad_norm": 17.544404983520508, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8720556497573853, "num_tokens": 465484509.0, "step": 12199 }, { "epoch": 1.5519653988042235, "ewc_loss": 0.029291437938809395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9291437385836616e-05, "grad_norm": 17.526731491088867, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8666602969169617, "num_tokens": 465520017.0, "step": 12200 }, { "epoch": 1.552092609082814, "ewc_loss": 0.02927916869521141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.92791683023097e-05, "grad_norm": 17.587360382080078, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.8856447339057922, "num_tokens": 465557221.0, "step": 12201 }, { "epoch": 1.5522198193614045, "ewc_loss": 0.029306717216968536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.93067168968264e-05, "grad_norm": 17.544904708862305, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8781574368476868, "num_tokens": 465595455.0, "step": 12202 }, { "epoch": 1.552347029639995, "ewc_loss": 0.029224753379821777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9224753234302625e-05, "grad_norm": 17.536157608032227, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8701075911521912, "num_tokens": 465636436.0, "step": 12203 }, { "epoch": 1.5524742399185856, "ewc_loss": 0.0292658731341362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9265873308759183e-05, "grad_norm": 17.541704177856445, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8669873476028442, "num_tokens": 465668956.0, "step": 12204 }, { "epoch": 1.5526014501971759, "ewc_loss": 0.029294051229953766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.929405127360951e-05, "grad_norm": 17.592012405395508, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8497960567474365, "num_tokens": 465708812.0, "step": 12205 }, { "epoch": 1.5527286604757664, "ewc_loss": 0.029272984713315964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9272985557327047e-05, "grad_norm": 17.517778396606445, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8559741973876953, "num_tokens": 465742610.0, "step": 12206 }, { "epoch": 1.552855870754357, "ewc_loss": 0.02928953617811203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.928953654190991e-05, "grad_norm": 17.644113540649414, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8676538467407227, "num_tokens": 465776468.0, "step": 12207 }, { "epoch": 1.5529830810329475, "ewc_loss": 0.029308771714568138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9308772354852408e-05, "grad_norm": 17.544567108154297, "learning_rate": 1e-06, "loss": 0.3535, "mean_token_accuracy": 0.8866639137268066, "num_tokens": 465809300.0, "step": 12208 }, { "epoch": 1.553110291311538, "ewc_loss": 0.029224054887890816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9224054742371663e-05, "grad_norm": 17.629804611206055, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8711681365966797, "num_tokens": 465849847.0, "step": 12209 }, { "epoch": 1.5532375015901285, "ewc_loss": 0.029267074540257454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9267073841765523e-05, "grad_norm": 17.445018768310547, "learning_rate": 1e-06, "loss": 0.3568, "mean_token_accuracy": 0.8803410530090332, "num_tokens": 465884986.0, "step": 12210 }, { "epoch": 1.5533647118687188, "ewc_loss": 0.029201103374361992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9201102734077722e-05, "grad_norm": 17.604991912841797, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8761621117591858, "num_tokens": 465919710.0, "step": 12211 }, { "epoch": 1.5534919221473094, "ewc_loss": 0.0293356291949749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9335629733395763e-05, "grad_norm": 17.512836456298828, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8610831499099731, "num_tokens": 465956041.0, "step": 12212 }, { "epoch": 1.5536191324258999, "ewc_loss": 0.029253516346216202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9253516913740896e-05, "grad_norm": 17.59212303161621, "learning_rate": 1e-06, "loss": 0.3679, "mean_token_accuracy": 0.8806862235069275, "num_tokens": 465994352.0, "step": 12213 }, { "epoch": 1.5537463427044904, "ewc_loss": 0.029336687177419662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9336686566239223e-05, "grad_norm": 17.630056381225586, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8693931102752686, "num_tokens": 466032493.0, "step": 12214 }, { "epoch": 1.553873552983081, "ewc_loss": 0.029218068346381187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9218068448244594e-05, "grad_norm": 17.5267333984375, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8725042343139648, "num_tokens": 466067048.0, "step": 12215 }, { "epoch": 1.5540007632616715, "ewc_loss": 0.029249664396047592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9249664294184186e-05, "grad_norm": 17.501657485961914, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8472318649291992, "num_tokens": 466106895.0, "step": 12216 }, { "epoch": 1.554127973540262, "ewc_loss": 0.029299747198820114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9299746529432014e-05, "grad_norm": 17.588497161865234, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8546744585037231, "num_tokens": 466146740.0, "step": 12217 }, { "epoch": 1.5542551838188525, "ewc_loss": 0.02926284819841385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9262848329381086e-05, "grad_norm": 17.551177978515625, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8752117156982422, "num_tokens": 466187097.0, "step": 12218 }, { "epoch": 1.554382394097443, "ewc_loss": 0.02924320101737976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9243201424833387e-05, "grad_norm": 17.494701385498047, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8595417737960815, "num_tokens": 466223969.0, "step": 12219 }, { "epoch": 1.5545096043760336, "ewc_loss": 0.029287047684192657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.928704816440586e-05, "grad_norm": 17.543731689453125, "learning_rate": 1e-06, "loss": 0.3763, "mean_token_accuracy": 0.879848062992096, "num_tokens": 466260127.0, "step": 12220 }, { "epoch": 1.554636814654624, "ewc_loss": 0.029316402971744537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9316403015400283e-05, "grad_norm": 17.557384490966797, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8751344680786133, "num_tokens": 466297219.0, "step": 12221 }, { "epoch": 1.5547640249332146, "ewc_loss": 0.029267488047480583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.926748857134953e-05, "grad_norm": 17.52588653564453, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8497408032417297, "num_tokens": 466329474.0, "step": 12222 }, { "epoch": 1.5548912352118052, "ewc_loss": 0.029296742752194405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.929674337792676e-05, "grad_norm": 17.51215171813965, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8676434755325317, "num_tokens": 466372801.0, "step": 12223 }, { "epoch": 1.5550184454903957, "ewc_loss": 0.029301710426807404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9301711037987843e-05, "grad_norm": 17.52211570739746, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8754475712776184, "num_tokens": 466412950.0, "step": 12224 }, { "epoch": 1.5551456557689862, "ewc_loss": 0.02928985096514225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9289851227076724e-05, "grad_norm": 17.499610900878906, "learning_rate": 1e-06, "loss": 0.3743, "mean_token_accuracy": 0.8755191564559937, "num_tokens": 466448830.0, "step": 12225 }, { "epoch": 1.5552728660475768, "ewc_loss": 0.029326390475034714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9326391086215153e-05, "grad_norm": 17.501150131225586, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8590871691703796, "num_tokens": 466489749.0, "step": 12226 }, { "epoch": 1.5554000763261673, "ewc_loss": 0.02926858700811863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9268587240949273e-05, "grad_norm": 17.492971420288086, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8515418767929077, "num_tokens": 466531320.0, "step": 12227 }, { "epoch": 1.5555272866047578, "ewc_loss": 0.029293563216924667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.929356378444936e-05, "grad_norm": 17.505207061767578, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8610775470733643, "num_tokens": 466570001.0, "step": 12228 }, { "epoch": 1.5556544968833483, "ewc_loss": 0.02934967540204525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9349675969569944e-05, "grad_norm": 17.57097625732422, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.871243417263031, "num_tokens": 466607186.0, "step": 12229 }, { "epoch": 1.5557817071619386, "ewc_loss": 0.029317215085029602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9317214284674264e-05, "grad_norm": 17.486095428466797, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8597114086151123, "num_tokens": 466649480.0, "step": 12230 }, { "epoch": 1.5559089174405292, "ewc_loss": 0.029315436258912086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9315437132027e-05, "grad_norm": 17.520069122314453, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8754943013191223, "num_tokens": 466688861.0, "step": 12231 }, { "epoch": 1.5560361277191197, "ewc_loss": 0.029280874878168106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9280874514370225e-05, "grad_norm": 17.593730926513672, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8501632213592529, "num_tokens": 466723200.0, "step": 12232 }, { "epoch": 1.5561633379977102, "ewc_loss": 0.029332082718610764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.933208270405885e-05, "grad_norm": 17.476367950439453, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8552426099777222, "num_tokens": 466759639.0, "step": 12233 }, { "epoch": 1.5562905482763008, "ewc_loss": 0.02932165004312992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9321650799829513e-05, "grad_norm": 17.573925018310547, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8627128005027771, "num_tokens": 466795748.0, "step": 12234 }, { "epoch": 1.5564177585548913, "ewc_loss": 0.029345525428652763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9345525035751052e-05, "grad_norm": 17.535364151000977, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8784868121147156, "num_tokens": 466832938.0, "step": 12235 }, { "epoch": 1.5565449688334816, "ewc_loss": 0.029278364032506943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9278364308993332e-05, "grad_norm": 17.517433166503906, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.862112820148468, "num_tokens": 466868710.0, "step": 12236 }, { "epoch": 1.5566721791120721, "ewc_loss": 0.029326757416129112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9326756703085266e-05, "grad_norm": 17.62001609802246, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8663486838340759, "num_tokens": 466903558.0, "step": 12237 }, { "epoch": 1.5567993893906626, "ewc_loss": 0.029299313202500343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.929931360995397e-05, "grad_norm": 17.52614974975586, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8591333627700806, "num_tokens": 466946785.0, "step": 12238 }, { "epoch": 1.5569265996692532, "ewc_loss": 0.02925802953541279, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9258029826451093e-05, "grad_norm": 17.51729393005371, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8605796098709106, "num_tokens": 466988563.0, "step": 12239 }, { "epoch": 1.5570538099478437, "ewc_loss": 0.029266132041811943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9266131605254486e-05, "grad_norm": 17.442258834838867, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8623167276382446, "num_tokens": 467025447.0, "step": 12240 }, { "epoch": 1.5571810202264342, "ewc_loss": 0.029336851090192795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9336850275285542e-05, "grad_norm": 17.538766860961914, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8761436343193054, "num_tokens": 467060751.0, "step": 12241 }, { "epoch": 1.5573082305050248, "ewc_loss": 0.029408421367406845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.940842205134686e-05, "grad_norm": 17.608009338378906, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8673673868179321, "num_tokens": 467101889.0, "step": 12242 }, { "epoch": 1.5574354407836153, "ewc_loss": 0.029321979731321335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9321980036911555e-05, "grad_norm": 17.500701904296875, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8707726001739502, "num_tokens": 467136782.0, "step": 12243 }, { "epoch": 1.5575626510622058, "ewc_loss": 0.029353290796279907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.935329030151479e-05, "grad_norm": 17.53639793395996, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8774322271347046, "num_tokens": 467174137.0, "step": 12244 }, { "epoch": 1.5576898613407963, "ewc_loss": 0.029351048171520233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9351047487580217e-05, "grad_norm": 17.557254791259766, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8648401498794556, "num_tokens": 467213919.0, "step": 12245 }, { "epoch": 1.5578170716193869, "ewc_loss": 0.029312631115317345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.931263043137733e-05, "grad_norm": 17.572643280029297, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8752400279045105, "num_tokens": 467254860.0, "step": 12246 }, { "epoch": 1.5579442818979774, "ewc_loss": 0.02932809293270111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9328093660296872e-05, "grad_norm": 17.477571487426758, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8788449764251709, "num_tokens": 467296806.0, "step": 12247 }, { "epoch": 1.558071492176568, "ewc_loss": 0.02932962030172348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9329619792406447e-05, "grad_norm": 17.615093231201172, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8757227659225464, "num_tokens": 467332882.0, "step": 12248 }, { "epoch": 1.5581987024551585, "ewc_loss": 0.029325837269425392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9325838113436475e-05, "grad_norm": 17.42824935913086, "learning_rate": 1e-06, "loss": 0.3595, "mean_token_accuracy": 0.8829512596130371, "num_tokens": 467375088.0, "step": 12249 }, { "epoch": 1.558325912733749, "ewc_loss": 0.029289502650499344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9289501981111243e-05, "grad_norm": 17.622800827026367, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8686833381652832, "num_tokens": 467413000.0, "step": 12250 }, { "epoch": 1.5584531230123395, "ewc_loss": 0.029378743842244148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9378743420238607e-05, "grad_norm": 17.555200576782227, "learning_rate": 1e-06, "loss": 0.3628, "mean_token_accuracy": 0.8823492527008057, "num_tokens": 467450947.0, "step": 12251 }, { "epoch": 1.55858033329093, "ewc_loss": 0.029274258762598038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.927425884990953e-05, "grad_norm": 17.5321102142334, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8591124415397644, "num_tokens": 467493770.0, "step": 12252 }, { "epoch": 1.5587075435695206, "ewc_loss": 0.029350722208619118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9350721888476983e-05, "grad_norm": 17.535320281982422, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8653228282928467, "num_tokens": 467531882.0, "step": 12253 }, { "epoch": 1.5588347538481109, "ewc_loss": 0.029274368658661842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9274367989273742e-05, "grad_norm": 17.533302307128906, "learning_rate": 1e-06, "loss": 0.3189, "mean_token_accuracy": 0.8939656615257263, "num_tokens": 467566059.0, "step": 12254 }, { "epoch": 1.5589619641267014, "ewc_loss": 0.029307814314961433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.930781374743674e-05, "grad_norm": 17.539262771606445, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.883397102355957, "num_tokens": 467603734.0, "step": 12255 }, { "epoch": 1.559089174405292, "ewc_loss": 0.029251696541905403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9251696105347946e-05, "grad_norm": 17.51798439025879, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8756232261657715, "num_tokens": 467641224.0, "step": 12256 }, { "epoch": 1.5592163846838825, "ewc_loss": 0.029281441122293472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9281440220074728e-05, "grad_norm": 17.565357208251953, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.867129385471344, "num_tokens": 467677576.0, "step": 12257 }, { "epoch": 1.559343594962473, "ewc_loss": 0.029305078089237213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9305077987373807e-05, "grad_norm": 17.477941513061523, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8756338357925415, "num_tokens": 467717526.0, "step": 12258 }, { "epoch": 1.5594708052410635, "ewc_loss": 0.02922804467380047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.922804378613364e-05, "grad_norm": 17.534744262695312, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8648568391799927, "num_tokens": 467757591.0, "step": 12259 }, { "epoch": 1.5595980155196538, "ewc_loss": 0.029282111674547195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9282111427164637e-05, "grad_norm": 17.453786849975586, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8552315831184387, "num_tokens": 467792794.0, "step": 12260 }, { "epoch": 1.5597252257982444, "ewc_loss": 0.029202383011579514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9202383302617818e-05, "grad_norm": 17.503032684326172, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8636146783828735, "num_tokens": 467829554.0, "step": 12261 }, { "epoch": 1.5598524360768349, "ewc_loss": 0.029298216104507446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.929821675934363e-05, "grad_norm": 17.494836807250977, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8530077934265137, "num_tokens": 467871403.0, "step": 12262 }, { "epoch": 1.5599796463554254, "ewc_loss": 0.029297687113285065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.92976874334272e-05, "grad_norm": 17.661069869995117, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8491084575653076, "num_tokens": 467906498.0, "step": 12263 }, { "epoch": 1.560106856634016, "ewc_loss": 0.029289675876498222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9289676604093984e-05, "grad_norm": 17.524538040161133, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8672395348548889, "num_tokens": 467946426.0, "step": 12264 }, { "epoch": 1.5602340669126065, "ewc_loss": 0.0292668417096138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.926684101112187e-05, "grad_norm": 17.611543655395508, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8713152408599854, "num_tokens": 467979930.0, "step": 12265 }, { "epoch": 1.560361277191197, "ewc_loss": 0.02933311089873314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9333110433071852e-05, "grad_norm": 17.485309600830078, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.862461268901825, "num_tokens": 468016347.0, "step": 12266 }, { "epoch": 1.5604884874697875, "ewc_loss": 0.029259538277983665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9259537768666632e-05, "grad_norm": 17.560401916503906, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8750205636024475, "num_tokens": 468051446.0, "step": 12267 }, { "epoch": 1.560615697748378, "ewc_loss": 0.029349319636821747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.934931944764685e-05, "grad_norm": 17.529464721679688, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.88453209400177, "num_tokens": 468085713.0, "step": 12268 }, { "epoch": 1.5607429080269686, "ewc_loss": 0.029278632253408432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9278631700435653e-05, "grad_norm": 17.53516960144043, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.881425678730011, "num_tokens": 468117863.0, "step": 12269 }, { "epoch": 1.560870118305559, "ewc_loss": 0.02940361760556698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9403618100332096e-05, "grad_norm": 17.600337982177734, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8649607300758362, "num_tokens": 468153041.0, "step": 12270 }, { "epoch": 1.5609973285841496, "ewc_loss": 0.029351109638810158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9351109333219938e-05, "grad_norm": 17.517004013061523, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8668756484985352, "num_tokens": 468193624.0, "step": 12271 }, { "epoch": 1.5611245388627402, "ewc_loss": 0.029386017471551895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9386017558863387e-05, "grad_norm": 17.57632064819336, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.883554995059967, "num_tokens": 468227091.0, "step": 12272 }, { "epoch": 1.5612517491413307, "ewc_loss": 0.029391638934612274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9391638236120343e-05, "grad_norm": 17.54750633239746, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8637995719909668, "num_tokens": 468265079.0, "step": 12273 }, { "epoch": 1.5613789594199212, "ewc_loss": 0.029353387653827667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.935338852694258e-05, "grad_norm": 17.522939682006836, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8657337427139282, "num_tokens": 468307055.0, "step": 12274 }, { "epoch": 1.5615061696985117, "ewc_loss": 0.029389137402176857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9389137125690468e-05, "grad_norm": 17.561676025390625, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8634873628616333, "num_tokens": 468344429.0, "step": 12275 }, { "epoch": 1.5616333799771023, "ewc_loss": 0.02937452867627144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.937452882179059e-05, "grad_norm": 17.487468719482422, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8670915961265564, "num_tokens": 468379448.0, "step": 12276 }, { "epoch": 1.5617605902556928, "ewc_loss": 0.02933412604033947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.933412542915903e-05, "grad_norm": 17.598079681396484, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8713653087615967, "num_tokens": 468423344.0, "step": 12277 }, { "epoch": 1.5618878005342833, "ewc_loss": 0.029422322288155556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9422322768368758e-05, "grad_norm": 17.515031814575195, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8741518259048462, "num_tokens": 468461657.0, "step": 12278 }, { "epoch": 1.5620150108128736, "ewc_loss": 0.029406698420643806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.94066976493923e-05, "grad_norm": 17.572187423706055, "learning_rate": 1e-06, "loss": 0.3655, "mean_token_accuracy": 0.8824239373207092, "num_tokens": 468500428.0, "step": 12279 }, { "epoch": 1.5621422210914642, "ewc_loss": 0.029412679374217987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.941268030554056e-05, "grad_norm": 17.590587615966797, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8572759032249451, "num_tokens": 468536310.0, "step": 12280 }, { "epoch": 1.5622694313700547, "ewc_loss": 0.02936403825879097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9364038709900342e-05, "grad_norm": 17.565956115722656, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8663473129272461, "num_tokens": 468577954.0, "step": 12281 }, { "epoch": 1.5623966416486452, "ewc_loss": 0.029344404116272926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9344404538278468e-05, "grad_norm": 17.620595932006836, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.860244631767273, "num_tokens": 468612131.0, "step": 12282 }, { "epoch": 1.5625238519272358, "ewc_loss": 0.0293810423463583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.938104262284469e-05, "grad_norm": 17.584495544433594, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8643704056739807, "num_tokens": 468654022.0, "step": 12283 }, { "epoch": 1.5626510622058263, "ewc_loss": 0.029334671795368195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9334671125980094e-05, "grad_norm": 17.523906707763672, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8598136901855469, "num_tokens": 468693120.0, "step": 12284 }, { "epoch": 1.5627782724844166, "ewc_loss": 0.029333095997571945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9333095881156623e-05, "grad_norm": 17.572940826416016, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8676822781562805, "num_tokens": 468727695.0, "step": 12285 }, { "epoch": 1.5629054827630071, "ewc_loss": 0.029331067577004433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.933106770797167e-05, "grad_norm": 17.542469024658203, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8651716709136963, "num_tokens": 468766705.0, "step": 12286 }, { "epoch": 1.5630326930415976, "ewc_loss": 0.029310358688235283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.93103585136123e-05, "grad_norm": 17.508913040161133, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.859529435634613, "num_tokens": 468804555.0, "step": 12287 }, { "epoch": 1.5631599033201882, "ewc_loss": 0.02939297817647457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9392978831310757e-05, "grad_norm": 17.607011795043945, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8651105761528015, "num_tokens": 468842495.0, "step": 12288 }, { "epoch": 1.5632871135987787, "ewc_loss": 0.02940589375793934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9405893656075932e-05, "grad_norm": 17.536373138427734, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8680046200752258, "num_tokens": 468877574.0, "step": 12289 }, { "epoch": 1.5634143238773692, "ewc_loss": 0.02931186929345131, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9311870093806647e-05, "grad_norm": 17.593481063842773, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.8794777393341064, "num_tokens": 468914573.0, "step": 12290 }, { "epoch": 1.5635415341559598, "ewc_loss": 0.0293701384216547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.937013778137043e-05, "grad_norm": 17.538341522216797, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8784909844398499, "num_tokens": 468950943.0, "step": 12291 }, { "epoch": 1.5636687444345503, "ewc_loss": 0.029320714995265007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.932071583927609e-05, "grad_norm": 17.51399040222168, "learning_rate": 1e-06, "loss": 0.372, "mean_token_accuracy": 0.8829426765441895, "num_tokens": 468990007.0, "step": 12292 }, { "epoch": 1.5637959547131408, "ewc_loss": 0.029332701116800308, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9332701160456054e-05, "grad_norm": 17.55440902709961, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8721797466278076, "num_tokens": 469022461.0, "step": 12293 }, { "epoch": 1.5639231649917313, "ewc_loss": 0.029314810410141945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9314809580682777e-05, "grad_norm": 17.546972274780273, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8745682835578918, "num_tokens": 469056396.0, "step": 12294 }, { "epoch": 1.5640503752703219, "ewc_loss": 0.029352407902479172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.935240809165407e-05, "grad_norm": 17.503068923950195, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8775381445884705, "num_tokens": 469099464.0, "step": 12295 }, { "epoch": 1.5641775855489124, "ewc_loss": 0.029373852536082268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9373852157732472e-05, "grad_norm": 17.544374465942383, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.8764147758483887, "num_tokens": 469139830.0, "step": 12296 }, { "epoch": 1.564304795827503, "ewc_loss": 0.02937021665275097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9370215997914784e-05, "grad_norm": 17.553617477416992, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8670452237129211, "num_tokens": 469177838.0, "step": 12297 }, { "epoch": 1.5644320061060935, "ewc_loss": 0.029320169240236282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9320170142455027e-05, "grad_norm": 17.509349822998047, "learning_rate": 1e-06, "loss": 0.3334, "mean_token_accuracy": 0.8933353424072266, "num_tokens": 469218727.0, "step": 12298 }, { "epoch": 1.564559216384684, "ewc_loss": 0.02935943566262722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9359434847719967e-05, "grad_norm": 17.592628479003906, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8458757400512695, "num_tokens": 469261186.0, "step": 12299 }, { "epoch": 1.5646864266632745, "ewc_loss": 0.029375199228525162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.93752000288805e-05, "grad_norm": 17.549800872802734, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8769373893737793, "num_tokens": 469301191.0, "step": 12300 }, { "epoch": 1.564813636941865, "ewc_loss": 0.029321152716875076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.932115239673294e-05, "grad_norm": 17.616493225097656, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8736366033554077, "num_tokens": 469340132.0, "step": 12301 }, { "epoch": 1.5649408472204556, "ewc_loss": 0.029331889003515244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.933188807219267e-05, "grad_norm": 17.539445877075195, "learning_rate": 1e-06, "loss": 0.3555, "mean_token_accuracy": 0.8871439695358276, "num_tokens": 469376211.0, "step": 12302 }, { "epoch": 1.5650680574990459, "ewc_loss": 0.029355410486459732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9355411243159324e-05, "grad_norm": 17.59941864013672, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8586999177932739, "num_tokens": 469413961.0, "step": 12303 }, { "epoch": 1.5651952677776364, "ewc_loss": 0.029358094558119774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9358094252529554e-05, "grad_norm": 17.565263748168945, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8685862421989441, "num_tokens": 469449507.0, "step": 12304 }, { "epoch": 1.565322478056227, "ewc_loss": 0.029327068477869034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9327067750273272e-05, "grad_norm": 17.58667755126953, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8566295504570007, "num_tokens": 469486324.0, "step": 12305 }, { "epoch": 1.5654496883348175, "ewc_loss": 0.02936982549726963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.936982491519302e-05, "grad_norm": 17.568510055541992, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8643231391906738, "num_tokens": 469523799.0, "step": 12306 }, { "epoch": 1.565576898613408, "ewc_loss": 0.029319502413272858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9319502573343925e-05, "grad_norm": 17.53751564025879, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8547897338867188, "num_tokens": 469558661.0, "step": 12307 }, { "epoch": 1.5657041088919985, "ewc_loss": 0.029345614835619926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9345614166231826e-05, "grad_norm": 17.559703826904297, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.867843508720398, "num_tokens": 469603087.0, "step": 12308 }, { "epoch": 1.5658313191705888, "ewc_loss": 0.029303651303052902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9303651899681427e-05, "grad_norm": 17.561246871948242, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8693621754646301, "num_tokens": 469637344.0, "step": 12309 }, { "epoch": 1.5659585294491793, "ewc_loss": 0.029370944947004318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9370945412665606e-05, "grad_norm": 17.5601749420166, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.878577470779419, "num_tokens": 469681127.0, "step": 12310 }, { "epoch": 1.5660857397277699, "ewc_loss": 0.02930799312889576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.930799382738769e-05, "grad_norm": 17.479393005371094, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8672788739204407, "num_tokens": 469724238.0, "step": 12311 }, { "epoch": 1.5662129500063604, "ewc_loss": 0.029323894530534744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9323893613764085e-05, "grad_norm": 17.538293838500977, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8616890907287598, "num_tokens": 469765967.0, "step": 12312 }, { "epoch": 1.566340160284951, "ewc_loss": 0.029382478445768356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9382477805484086e-05, "grad_norm": 17.532495498657227, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8691918849945068, "num_tokens": 469801387.0, "step": 12313 }, { "epoch": 1.5664673705635415, "ewc_loss": 0.029388515278697014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9388515031314455e-05, "grad_norm": 17.576904296875, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8694891929626465, "num_tokens": 469835448.0, "step": 12314 }, { "epoch": 1.566594580842132, "ewc_loss": 0.029347719624638557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.934771873697173e-05, "grad_norm": 17.571121215820312, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8598105907440186, "num_tokens": 469876647.0, "step": 12315 }, { "epoch": 1.5667217911207225, "ewc_loss": 0.029364794492721558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9364793590502813e-05, "grad_norm": 17.551259994506836, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8833472728729248, "num_tokens": 469918158.0, "step": 12316 }, { "epoch": 1.566849001399313, "ewc_loss": 0.02936316281557083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9363161957007833e-05, "grad_norm": 17.620878219604492, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.855394184589386, "num_tokens": 469953691.0, "step": 12317 }, { "epoch": 1.5669762116779036, "ewc_loss": 0.029364336282014847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9364337024162523e-05, "grad_norm": 17.524555206298828, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8711315393447876, "num_tokens": 469996174.0, "step": 12318 }, { "epoch": 1.567103421956494, "ewc_loss": 0.02931695245206356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9316952350200154e-05, "grad_norm": 17.548521041870117, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8565319776535034, "num_tokens": 470031371.0, "step": 12319 }, { "epoch": 1.5672306322350846, "ewc_loss": 0.029362371191382408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.936237069661729e-05, "grad_norm": 17.593482971191406, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8719891309738159, "num_tokens": 470068783.0, "step": 12320 }, { "epoch": 1.5673578425136752, "ewc_loss": 0.029373042285442352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9373042707447894e-05, "grad_norm": 17.59638786315918, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8520329594612122, "num_tokens": 470103490.0, "step": 12321 }, { "epoch": 1.5674850527922657, "ewc_loss": 0.029331477358937263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9331476980587468e-05, "grad_norm": 17.488155364990234, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.865199863910675, "num_tokens": 470137213.0, "step": 12322 }, { "epoch": 1.5676122630708562, "ewc_loss": 0.02934647910296917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9346480005187914e-05, "grad_norm": 17.58623695373535, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8733863234519958, "num_tokens": 470171816.0, "step": 12323 }, { "epoch": 1.5677394733494467, "ewc_loss": 0.0294492244720459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9449223802657798e-05, "grad_norm": 17.545394897460938, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8460675477981567, "num_tokens": 470210517.0, "step": 12324 }, { "epoch": 1.5678666836280373, "ewc_loss": 0.029351940378546715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.935194061137736e-05, "grad_norm": 17.53247833251953, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8565828800201416, "num_tokens": 470247085.0, "step": 12325 }, { "epoch": 1.5679938939066278, "ewc_loss": 0.02942579612135887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.942579703812953e-05, "grad_norm": 17.64086151123047, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8691409230232239, "num_tokens": 470288483.0, "step": 12326 }, { "epoch": 1.5681211041852183, "ewc_loss": 0.02942260168492794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.94226010737475e-05, "grad_norm": 17.550546646118164, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8679473400115967, "num_tokens": 470322173.0, "step": 12327 }, { "epoch": 1.5682483144638086, "ewc_loss": 0.029366882517933846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9366881790338084e-05, "grad_norm": 17.548858642578125, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8698402643203735, "num_tokens": 470356617.0, "step": 12328 }, { "epoch": 1.5683755247423992, "ewc_loss": 0.029469413682818413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9469412766047753e-05, "grad_norm": 17.579599380493164, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8795371651649475, "num_tokens": 470393095.0, "step": 12329 }, { "epoch": 1.5685027350209897, "ewc_loss": 0.029511431232094765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9511431421269663e-05, "grad_norm": 17.706758499145508, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8797029256820679, "num_tokens": 470427785.0, "step": 12330 }, { "epoch": 1.5686299452995802, "ewc_loss": 0.029422080144286156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9422080842778087e-05, "grad_norm": 17.527347564697266, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.863599419593811, "num_tokens": 470469971.0, "step": 12331 }, { "epoch": 1.5687571555781707, "ewc_loss": 0.02941158413887024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9411583454930224e-05, "grad_norm": 17.64986801147461, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8598241806030273, "num_tokens": 470503935.0, "step": 12332 }, { "epoch": 1.5688843658567613, "ewc_loss": 0.02944372221827507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9443721359712072e-05, "grad_norm": 17.55339813232422, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.871285617351532, "num_tokens": 470536014.0, "step": 12333 }, { "epoch": 1.5690115761353516, "ewc_loss": 0.029358811676502228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.935881093435455e-05, "grad_norm": 17.640485763549805, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.88165283203125, "num_tokens": 470575632.0, "step": 12334 }, { "epoch": 1.569138786413942, "ewc_loss": 0.02949478104710579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9494780392269604e-05, "grad_norm": 17.573970794677734, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8794136047363281, "num_tokens": 470611295.0, "step": 12335 }, { "epoch": 1.5692659966925326, "ewc_loss": 0.029435215517878532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.943521576526109e-05, "grad_norm": 17.62627601623535, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8671393394470215, "num_tokens": 470646993.0, "step": 12336 }, { "epoch": 1.5693932069711232, "ewc_loss": 0.029473336413502693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9473336326191202e-05, "grad_norm": 17.59391975402832, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8724429607391357, "num_tokens": 470680253.0, "step": 12337 }, { "epoch": 1.5695204172497137, "ewc_loss": 0.029382117092609406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9382117645582184e-05, "grad_norm": 17.516300201416016, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8665997385978699, "num_tokens": 470716169.0, "step": 12338 }, { "epoch": 1.5696476275283042, "ewc_loss": 0.02945834957063198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9458349672495387e-05, "grad_norm": 17.62433624267578, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.8821876049041748, "num_tokens": 470754208.0, "step": 12339 }, { "epoch": 1.5697748378068948, "ewc_loss": 0.029469693079590797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.94696928904159e-05, "grad_norm": 17.574172973632812, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8703697323799133, "num_tokens": 470801329.0, "step": 12340 }, { "epoch": 1.5699020480854853, "ewc_loss": 0.02941782958805561, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9417829864542e-05, "grad_norm": 17.55413818359375, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8636277914047241, "num_tokens": 470838408.0, "step": 12341 }, { "epoch": 1.5700292583640758, "ewc_loss": 0.029399925842881203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9399925551842898e-05, "grad_norm": 17.564373016357422, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8688126802444458, "num_tokens": 470874001.0, "step": 12342 }, { "epoch": 1.5701564686426663, "ewc_loss": 0.029467066749930382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.946706626971718e-05, "grad_norm": 17.583633422851562, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8635185956954956, "num_tokens": 470916870.0, "step": 12343 }, { "epoch": 1.5702836789212569, "ewc_loss": 0.0294030774384737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9403077860479243e-05, "grad_norm": 17.504337310791016, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8770737051963806, "num_tokens": 470953862.0, "step": 12344 }, { "epoch": 1.5704108891998474, "ewc_loss": 0.029384618625044823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.938461875601206e-05, "grad_norm": 17.50467300415039, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8789157271385193, "num_tokens": 470991507.0, "step": 12345 }, { "epoch": 1.570538099478438, "ewc_loss": 0.02943321317434311, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9433213057927787e-05, "grad_norm": 17.640539169311523, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8725908994674683, "num_tokens": 471030109.0, "step": 12346 }, { "epoch": 1.5706653097570284, "ewc_loss": 0.02942744828760624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.942744868050795e-05, "grad_norm": 17.593706130981445, "learning_rate": 1e-06, "loss": 0.3602, "mean_token_accuracy": 0.8824986219406128, "num_tokens": 471063829.0, "step": 12347 }, { "epoch": 1.570792520035619, "ewc_loss": 0.029337309300899506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9337308660615236e-05, "grad_norm": 17.53144073486328, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.859504759311676, "num_tokens": 471107259.0, "step": 12348 }, { "epoch": 1.5709197303142095, "ewc_loss": 0.029348617419600487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.934861731773708e-05, "grad_norm": 17.59784507751465, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8714540004730225, "num_tokens": 471141516.0, "step": 12349 }, { "epoch": 1.5710469405928, "ewc_loss": 0.029399430379271507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9399430786725134e-05, "grad_norm": 17.60928726196289, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8692829608917236, "num_tokens": 471178655.0, "step": 12350 }, { "epoch": 1.5711741508713906, "ewc_loss": 0.02940702810883522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9407028705463745e-05, "grad_norm": 17.550661087036133, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8797875046730042, "num_tokens": 471209437.0, "step": 12351 }, { "epoch": 1.5713013611499809, "ewc_loss": 0.0294132512062788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9413251468213275e-05, "grad_norm": 17.591136932373047, "learning_rate": 1e-06, "loss": 0.3552, "mean_token_accuracy": 0.8843831419944763, "num_tokens": 471249559.0, "step": 12352 }, { "epoch": 1.5714285714285714, "ewc_loss": 0.02937634475529194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.937634417321533e-05, "grad_norm": 17.533103942871094, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8737847208976746, "num_tokens": 471287179.0, "step": 12353 }, { "epoch": 1.571555781707162, "ewc_loss": 0.02936272695660591, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9362727218540385e-05, "grad_norm": 17.581777572631836, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8729606866836548, "num_tokens": 471328052.0, "step": 12354 }, { "epoch": 1.5716829919857525, "ewc_loss": 0.029408792033791542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.940879130619578e-05, "grad_norm": 17.606420516967773, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.867415189743042, "num_tokens": 471362414.0, "step": 12355 }, { "epoch": 1.571810202264343, "ewc_loss": 0.029348475858569145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9348475436563604e-05, "grad_norm": 17.531755447387695, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8644042015075684, "num_tokens": 471397910.0, "step": 12356 }, { "epoch": 1.5719374125429335, "ewc_loss": 0.029435960575938225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.943595973192714e-05, "grad_norm": 17.71221351623535, "learning_rate": 1e-06, "loss": 0.36, "mean_token_accuracy": 0.8834177255630493, "num_tokens": 471436951.0, "step": 12357 }, { "epoch": 1.5720646228215238, "ewc_loss": 0.029395153746008873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9395154342637397e-05, "grad_norm": 17.445344924926758, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8715530633926392, "num_tokens": 471470139.0, "step": 12358 }, { "epoch": 1.5721918331001143, "ewc_loss": 0.02933584898710251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9335849831113592e-05, "grad_norm": 17.663606643676758, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.876924991607666, "num_tokens": 471508591.0, "step": 12359 }, { "epoch": 1.5723190433787049, "ewc_loss": 0.029479773715138435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.947977372969035e-05, "grad_norm": 17.44937515258789, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8697196841239929, "num_tokens": 471549165.0, "step": 12360 }, { "epoch": 1.5724462536572954, "ewc_loss": 0.029332159087061882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9332159101613797e-05, "grad_norm": 17.617172241210938, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8647157549858093, "num_tokens": 471589021.0, "step": 12361 }, { "epoch": 1.572573463935886, "ewc_loss": 0.029550738632678986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9550737963290885e-05, "grad_norm": 17.605220794677734, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8627881407737732, "num_tokens": 471624263.0, "step": 12362 }, { "epoch": 1.5727006742144765, "ewc_loss": 0.029337231069803238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9337230444070883e-05, "grad_norm": 17.569847106933594, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8639860153198242, "num_tokens": 471663678.0, "step": 12363 }, { "epoch": 1.572827884493067, "ewc_loss": 0.029412560164928436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9412560252239928e-05, "grad_norm": 17.56144905090332, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8674132227897644, "num_tokens": 471705887.0, "step": 12364 }, { "epoch": 1.5729550947716575, "ewc_loss": 0.029435470700263977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9435470423777588e-05, "grad_norm": 17.655969619750977, "learning_rate": 1e-06, "loss": 0.3781, "mean_token_accuracy": 0.8771026730537415, "num_tokens": 471739796.0, "step": 12365 }, { "epoch": 1.573082305050248, "ewc_loss": 0.02955363504588604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9553635613410734e-05, "grad_norm": 17.66914939880371, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.865967333316803, "num_tokens": 471774723.0, "step": 12366 }, { "epoch": 1.5732095153288386, "ewc_loss": 0.029485387727618217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9485387130989693e-05, "grad_norm": 17.555007934570312, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.862729549407959, "num_tokens": 471814169.0, "step": 12367 }, { "epoch": 1.573336725607429, "ewc_loss": 0.029523177072405815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9523176635848358e-05, "grad_norm": 17.61406135559082, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8706424236297607, "num_tokens": 471852367.0, "step": 12368 }, { "epoch": 1.5734639358860196, "ewc_loss": 0.0295685064047575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.956850585178472e-05, "grad_norm": 17.485280990600586, "learning_rate": 1e-06, "loss": 0.3846, "mean_token_accuracy": 0.8753631114959717, "num_tokens": 471883438.0, "step": 12369 }, { "epoch": 1.5735911461646102, "ewc_loss": 0.02949899435043335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.949899499071762e-05, "grad_norm": 17.577659606933594, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8646012544631958, "num_tokens": 471921051.0, "step": 12370 }, { "epoch": 1.5737183564432007, "ewc_loss": 0.029640771448612213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.964077066280879e-05, "grad_norm": 17.54649543762207, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.873518705368042, "num_tokens": 471959685.0, "step": 12371 }, { "epoch": 1.5738455667217912, "ewc_loss": 0.029613783583045006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9613784136017784e-05, "grad_norm": 17.659954071044922, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8702504634857178, "num_tokens": 471997345.0, "step": 12372 }, { "epoch": 1.5739727770003817, "ewc_loss": 0.029661312699317932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9661312510143034e-05, "grad_norm": 17.54792022705078, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8757256865501404, "num_tokens": 472034455.0, "step": 12373 }, { "epoch": 1.5740999872789723, "ewc_loss": 0.029542984440922737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.954298361146357e-05, "grad_norm": 17.523677825927734, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8689248561859131, "num_tokens": 472073630.0, "step": 12374 }, { "epoch": 1.5742271975575628, "ewc_loss": 0.029667822644114494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9667822673218325e-05, "grad_norm": 17.546344757080078, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8678103685379028, "num_tokens": 472111556.0, "step": 12375 }, { "epoch": 1.5743544078361533, "ewc_loss": 0.029653441160917282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.965344174299389e-05, "grad_norm": 17.597585678100586, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8770171403884888, "num_tokens": 472148276.0, "step": 12376 }, { "epoch": 1.5744816181147436, "ewc_loss": 0.029636086896061897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.963608676509466e-05, "grad_norm": 17.57616424560547, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8692473769187927, "num_tokens": 472181665.0, "step": 12377 }, { "epoch": 1.5746088283933342, "ewc_loss": 0.029615120962262154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.961512109322939e-05, "grad_norm": 17.60590362548828, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8639816045761108, "num_tokens": 472222759.0, "step": 12378 }, { "epoch": 1.5747360386719247, "ewc_loss": 0.02956872433423996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9568724130513147e-05, "grad_norm": 17.622774124145508, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8639006614685059, "num_tokens": 472260867.0, "step": 12379 }, { "epoch": 1.5748632489505152, "ewc_loss": 0.029539331793785095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.953933108074125e-05, "grad_norm": 17.572359085083008, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8584461808204651, "num_tokens": 472300068.0, "step": 12380 }, { "epoch": 1.5749904592291057, "ewc_loss": 0.029584690928459167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9584691219497472e-05, "grad_norm": 17.66089630126953, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8807409405708313, "num_tokens": 472336459.0, "step": 12381 }, { "epoch": 1.5751176695076963, "ewc_loss": 0.029612580314278603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9612579965032637e-05, "grad_norm": 17.584325790405273, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.8738957047462463, "num_tokens": 472376696.0, "step": 12382 }, { "epoch": 1.5752448797862866, "ewc_loss": 0.02953677624464035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9536775400629267e-05, "grad_norm": 17.709197998046875, "learning_rate": 1e-06, "loss": 0.3578, "mean_token_accuracy": 0.8818342685699463, "num_tokens": 472414491.0, "step": 12383 }, { "epoch": 1.575372090064877, "ewc_loss": 0.0295290295034647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.952903014374897e-05, "grad_norm": 17.54951286315918, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8687284588813782, "num_tokens": 472452188.0, "step": 12384 }, { "epoch": 1.5754993003434676, "ewc_loss": 0.029494188725948334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.949418922071345e-05, "grad_norm": 17.652326583862305, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8671935796737671, "num_tokens": 472489690.0, "step": 12385 }, { "epoch": 1.5756265106220582, "ewc_loss": 0.02953059785068035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9530598112614825e-05, "grad_norm": 17.552230834960938, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8682116270065308, "num_tokens": 472529912.0, "step": 12386 }, { "epoch": 1.5757537209006487, "ewc_loss": 0.029498813673853874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9498813091777265e-05, "grad_norm": 17.647008895874023, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8679494857788086, "num_tokens": 472562534.0, "step": 12387 }, { "epoch": 1.5758809311792392, "ewc_loss": 0.029615813866257668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.961581412819214e-05, "grad_norm": 17.580841064453125, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.867905855178833, "num_tokens": 472598955.0, "step": 12388 }, { "epoch": 1.5760081414578297, "ewc_loss": 0.02946312353014946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9463122700690292e-05, "grad_norm": 17.58897590637207, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8729305863380432, "num_tokens": 472638064.0, "step": 12389 }, { "epoch": 1.5761353517364203, "ewc_loss": 0.02950897254049778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.950897214759607e-05, "grad_norm": 17.630107879638672, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8732950687408447, "num_tokens": 472676823.0, "step": 12390 }, { "epoch": 1.5762625620150108, "ewc_loss": 0.029445331543684006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.944533116533421e-05, "grad_norm": 17.554773330688477, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8695662617683411, "num_tokens": 472718528.0, "step": 12391 }, { "epoch": 1.5763897722936013, "ewc_loss": 0.029403340071439743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9403339794953354e-05, "grad_norm": 17.59577178955078, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8751240968704224, "num_tokens": 472755634.0, "step": 12392 }, { "epoch": 1.5765169825721919, "ewc_loss": 0.029446948319673538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9446948246913962e-05, "grad_norm": 17.554061889648438, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8813136219978333, "num_tokens": 472798060.0, "step": 12393 }, { "epoch": 1.5766441928507824, "ewc_loss": 0.029445268213748932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9445267500705086e-05, "grad_norm": 17.642499923706055, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8743641972541809, "num_tokens": 472837628.0, "step": 12394 }, { "epoch": 1.576771403129373, "ewc_loss": 0.02951745316386223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9517454095184803e-05, "grad_norm": 17.589693069458008, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8705698251724243, "num_tokens": 472877446.0, "step": 12395 }, { "epoch": 1.5768986134079634, "ewc_loss": 0.029371043667197227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9371043638093397e-05, "grad_norm": 17.572568893432617, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8636094927787781, "num_tokens": 472918437.0, "step": 12396 }, { "epoch": 1.577025823686554, "ewc_loss": 0.029407182708382607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9407183319563046e-05, "grad_norm": 17.56313705444336, "learning_rate": 1e-06, "loss": 0.378, "mean_token_accuracy": 0.8784447312355042, "num_tokens": 472956242.0, "step": 12397 }, { "epoch": 1.5771530339651445, "ewc_loss": 0.02942010760307312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.942010723927524e-05, "grad_norm": 17.59779930114746, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8794337511062622, "num_tokens": 472985620.0, "step": 12398 }, { "epoch": 1.577280244243735, "ewc_loss": 0.029375461861491203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.937546196335461e-05, "grad_norm": 17.57279396057129, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.87446129322052, "num_tokens": 473019517.0, "step": 12399 }, { "epoch": 1.5774074545223256, "ewc_loss": 0.029344135895371437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9344135327846743e-05, "grad_norm": 17.535865783691406, "learning_rate": 1e-06, "loss": 0.3408, "mean_token_accuracy": 0.8892065286636353, "num_tokens": 473054624.0, "step": 12400 }, { "epoch": 1.5775346648009159, "ewc_loss": 0.029429476708173752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9429476853692904e-05, "grad_norm": 17.620859146118164, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8596816062927246, "num_tokens": 473094140.0, "step": 12401 }, { "epoch": 1.5776618750795064, "ewc_loss": 0.029401324689388275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9401324354694225e-05, "grad_norm": 17.58037567138672, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8516217470169067, "num_tokens": 473136886.0, "step": 12402 }, { "epoch": 1.577789085358097, "ewc_loss": 0.0293738953769207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9373895813478157e-05, "grad_norm": 17.581878662109375, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8577369451522827, "num_tokens": 473173267.0, "step": 12403 }, { "epoch": 1.5779162956366874, "ewc_loss": 0.029397668316960335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9397668185993098e-05, "grad_norm": 17.63422203063965, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8510830402374268, "num_tokens": 473216477.0, "step": 12404 }, { "epoch": 1.578043505915278, "ewc_loss": 0.029403161257505417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9403161533991806e-05, "grad_norm": 17.588193893432617, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8624820113182068, "num_tokens": 473256986.0, "step": 12405 }, { "epoch": 1.5781707161938685, "ewc_loss": 0.029407113790512085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.940711419796571e-05, "grad_norm": 17.568815231323242, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8775269985198975, "num_tokens": 473297967.0, "step": 12406 }, { "epoch": 1.5782979264724588, "ewc_loss": 0.029383504763245583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.938350553449709e-05, "grad_norm": 17.566455841064453, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8626717329025269, "num_tokens": 473333580.0, "step": 12407 }, { "epoch": 1.5784251367510493, "ewc_loss": 0.029370471835136414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9370472475420684e-05, "grad_norm": 17.57407569885254, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8741759061813354, "num_tokens": 473370553.0, "step": 12408 }, { "epoch": 1.5785523470296399, "ewc_loss": 0.029439471662044525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.943947220046539e-05, "grad_norm": 17.615999221801758, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8521852493286133, "num_tokens": 473410705.0, "step": 12409 }, { "epoch": 1.5786795573082304, "ewc_loss": 0.02935776859521866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.935776865342632e-05, "grad_norm": 17.556169509887695, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8751155138015747, "num_tokens": 473443315.0, "step": 12410 }, { "epoch": 1.578806767586821, "ewc_loss": 0.02941281720995903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9412816729745828e-05, "grad_norm": 17.581321716308594, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8616518378257751, "num_tokens": 473488767.0, "step": 12411 }, { "epoch": 1.5789339778654115, "ewc_loss": 0.029433703050017357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.943370236607734e-05, "grad_norm": 17.612632751464844, "learning_rate": 1e-06, "loss": 0.3619, "mean_token_accuracy": 0.8850096464157104, "num_tokens": 473525360.0, "step": 12412 }, { "epoch": 1.579061188144002, "ewc_loss": 0.029364751651883125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9364751753746532e-05, "grad_norm": 17.57196807861328, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8618343472480774, "num_tokens": 473560625.0, "step": 12413 }, { "epoch": 1.5791883984225925, "ewc_loss": 0.029444025829434395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9444025130942464e-05, "grad_norm": 17.612140655517578, "learning_rate": 1e-06, "loss": 0.3521, "mean_token_accuracy": 0.8877471685409546, "num_tokens": 473595484.0, "step": 12414 }, { "epoch": 1.579315608701183, "ewc_loss": 0.029468074440956116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9468073989846744e-05, "grad_norm": 17.656370162963867, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8668144345283508, "num_tokens": 473627269.0, "step": 12415 }, { "epoch": 1.5794428189797736, "ewc_loss": 0.029439803212881088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9439803256536834e-05, "grad_norm": 17.593875885009766, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8546499609947205, "num_tokens": 473669159.0, "step": 12416 }, { "epoch": 1.579570029258364, "ewc_loss": 0.02940594218671322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9405942768789828e-05, "grad_norm": 17.54163932800293, "learning_rate": 1e-06, "loss": 0.378, "mean_token_accuracy": 0.8793084621429443, "num_tokens": 473707739.0, "step": 12417 }, { "epoch": 1.5796972395369546, "ewc_loss": 0.02945037931203842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.945037886092905e-05, "grad_norm": 17.553739547729492, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8708785772323608, "num_tokens": 473746908.0, "step": 12418 }, { "epoch": 1.5798244498155452, "ewc_loss": 0.029464351013302803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9464350518537685e-05, "grad_norm": 17.595073699951172, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8679230213165283, "num_tokens": 473781480.0, "step": 12419 }, { "epoch": 1.5799516600941357, "ewc_loss": 0.02950604259967804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9506041755666956e-05, "grad_norm": 17.59682273864746, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8692191243171692, "num_tokens": 473814676.0, "step": 12420 }, { "epoch": 1.5800788703727262, "ewc_loss": 0.02944525144994259, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9445251129800454e-05, "grad_norm": 17.549434661865234, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8649562001228333, "num_tokens": 473855487.0, "step": 12421 }, { "epoch": 1.5802060806513167, "ewc_loss": 0.02948862873017788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9488628570106812e-05, "grad_norm": 17.61460304260254, "learning_rate": 1e-06, "loss": 0.3602, "mean_token_accuracy": 0.881758987903595, "num_tokens": 473885648.0, "step": 12422 }, { "epoch": 1.5803332909299073, "ewc_loss": 0.02950325980782509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9503260520868935e-05, "grad_norm": 17.5721378326416, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8655090928077698, "num_tokens": 473923862.0, "step": 12423 }, { "epoch": 1.5804605012084978, "ewc_loss": 0.02947837859392166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.947837856481783e-05, "grad_norm": 17.598163604736328, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.873570442199707, "num_tokens": 473957725.0, "step": 12424 }, { "epoch": 1.5805877114870883, "ewc_loss": 0.02951240912079811, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.951241003756877e-05, "grad_norm": 17.546428680419922, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8710750341415405, "num_tokens": 473994939.0, "step": 12425 }, { "epoch": 1.5807149217656786, "ewc_loss": 0.02946912869811058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.94691290037008e-05, "grad_norm": 17.609188079833984, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8654193878173828, "num_tokens": 474035996.0, "step": 12426 }, { "epoch": 1.5808421320442692, "ewc_loss": 0.029465192928910255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9465192710631527e-05, "grad_norm": 17.535837173461914, "learning_rate": 1e-06, "loss": 0.3873, "mean_token_accuracy": 0.8786272406578064, "num_tokens": 474071693.0, "step": 12427 }, { "epoch": 1.5809693423228597, "ewc_loss": 0.02950570546090603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.95057052426273e-05, "grad_norm": 17.62914276123047, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8762930035591125, "num_tokens": 474114415.0, "step": 12428 }, { "epoch": 1.5810965526014502, "ewc_loss": 0.029534509405493736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.953450893983245e-05, "grad_norm": 17.618711471557617, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8637701272964478, "num_tokens": 474152060.0, "step": 12429 }, { "epoch": 1.5812237628800407, "ewc_loss": 0.02944902889430523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.944902917079162e-05, "grad_norm": 17.53853416442871, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8734837174415588, "num_tokens": 474190605.0, "step": 12430 }, { "epoch": 1.5813509731586313, "ewc_loss": 0.02950245328247547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.950245288957376e-05, "grad_norm": 17.646772384643555, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8646191358566284, "num_tokens": 474229447.0, "step": 12431 }, { "epoch": 1.5814781834372216, "ewc_loss": 0.029514938592910767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.95149384328397e-05, "grad_norm": 17.5655574798584, "learning_rate": 1e-06, "loss": 0.3852, "mean_token_accuracy": 0.8770167827606201, "num_tokens": 474268088.0, "step": 12432 }, { "epoch": 1.581605393715812, "ewc_loss": 0.02944055013358593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.944055086118169e-05, "grad_norm": 17.596906661987305, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8653161525726318, "num_tokens": 474304856.0, "step": 12433 }, { "epoch": 1.5817326039944026, "ewc_loss": 0.02946888655424118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.946888707811013e-05, "grad_norm": 17.580387115478516, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8729464411735535, "num_tokens": 474349248.0, "step": 12434 }, { "epoch": 1.5818598142729932, "ewc_loss": 0.029430633410811424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.943063373095356e-05, "grad_norm": 17.570816040039062, "learning_rate": 1e-06, "loss": 0.366, "mean_token_accuracy": 0.8804343938827515, "num_tokens": 474384432.0, "step": 12435 }, { "epoch": 1.5819870245515837, "ewc_loss": 0.02945845201611519, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9458451535901986e-05, "grad_norm": 17.650169372558594, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8669081926345825, "num_tokens": 474415881.0, "step": 12436 }, { "epoch": 1.5821142348301742, "ewc_loss": 0.029459591954946518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.945959204225801e-05, "grad_norm": 17.557235717773438, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8679250478744507, "num_tokens": 474452755.0, "step": 12437 }, { "epoch": 1.5822414451087647, "ewc_loss": 0.02941022254526615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.941022285085637e-05, "grad_norm": 17.636520385742188, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8650516867637634, "num_tokens": 474493937.0, "step": 12438 }, { "epoch": 1.5823686553873553, "ewc_loss": 0.029483191668987274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9483191610779613e-05, "grad_norm": 17.56756591796875, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8757150173187256, "num_tokens": 474534072.0, "step": 12439 }, { "epoch": 1.5824958656659458, "ewc_loss": 0.029409250244498253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9409249691525474e-05, "grad_norm": 17.651649475097656, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.861526370048523, "num_tokens": 474569343.0, "step": 12440 }, { "epoch": 1.5826230759445363, "ewc_loss": 0.029494047164916992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9494047339539975e-05, "grad_norm": 17.606494903564453, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8591053485870361, "num_tokens": 474606610.0, "step": 12441 }, { "epoch": 1.5827502862231269, "ewc_loss": 0.02939978614449501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9399785489658825e-05, "grad_norm": 17.636062622070312, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8615100383758545, "num_tokens": 474645071.0, "step": 12442 }, { "epoch": 1.5828774965017174, "ewc_loss": 0.029466992244124413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9466991691151634e-05, "grad_norm": 17.610885620117188, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8563669323921204, "num_tokens": 474683985.0, "step": 12443 }, { "epoch": 1.583004706780308, "ewc_loss": 0.029401779174804688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.940177910204511e-05, "grad_norm": 17.57691192626953, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.8758940100669861, "num_tokens": 474726102.0, "step": 12444 }, { "epoch": 1.5831319170588984, "ewc_loss": 0.029445260763168335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.944526022474747e-05, "grad_norm": 17.61829376220703, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8718864321708679, "num_tokens": 474766430.0, "step": 12445 }, { "epoch": 1.583259127337489, "ewc_loss": 0.02948164753615856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9481647288776003e-05, "grad_norm": 17.666215896606445, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8685466051101685, "num_tokens": 474800648.0, "step": 12446 }, { "epoch": 1.5833863376160795, "ewc_loss": 0.02944226935505867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9442269806168042e-05, "grad_norm": 17.608455657958984, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8726654052734375, "num_tokens": 474845540.0, "step": 12447 }, { "epoch": 1.58351354789467, "ewc_loss": 0.029437564313411713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.943756408058107e-05, "grad_norm": 17.666234970092773, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.86711585521698, "num_tokens": 474884296.0, "step": 12448 }, { "epoch": 1.5836407581732606, "ewc_loss": 0.029458994045853615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9458993594744243e-05, "grad_norm": 17.616989135742188, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8742098808288574, "num_tokens": 474929344.0, "step": 12449 }, { "epoch": 1.5837679684518509, "ewc_loss": 0.029420334845781326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9420334612950683e-05, "grad_norm": 17.62001609802246, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8745908737182617, "num_tokens": 474968035.0, "step": 12450 }, { "epoch": 1.5838951787304414, "ewc_loss": 0.029458628967404366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9458629796863534e-05, "grad_norm": 17.62360191345215, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8642421364784241, "num_tokens": 475006189.0, "step": 12451 }, { "epoch": 1.584022389009032, "ewc_loss": 0.029391467571258545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.939146725111641e-05, "grad_norm": 17.554553985595703, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8652305603027344, "num_tokens": 475039768.0, "step": 12452 }, { "epoch": 1.5841495992876224, "ewc_loss": 0.029405424371361732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9405424356809817e-05, "grad_norm": 17.56963539123535, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8752238750457764, "num_tokens": 475079938.0, "step": 12453 }, { "epoch": 1.584276809566213, "ewc_loss": 0.029460279271006584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.946027962025255e-05, "grad_norm": 17.55449867248535, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8572907447814941, "num_tokens": 475122405.0, "step": 12454 }, { "epoch": 1.5844040198448035, "ewc_loss": 0.029453212395310402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9453212846419774e-05, "grad_norm": 17.64399528503418, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8619583249092102, "num_tokens": 475158237.0, "step": 12455 }, { "epoch": 1.5845312301233938, "ewc_loss": 0.02949090488255024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9490904125850648e-05, "grad_norm": 17.61083221435547, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8706123232841492, "num_tokens": 475194966.0, "step": 12456 }, { "epoch": 1.5846584404019843, "ewc_loss": 0.02938687801361084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9386877940851264e-05, "grad_norm": 17.617643356323242, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8679250478744507, "num_tokens": 475234483.0, "step": 12457 }, { "epoch": 1.5847856506805749, "ewc_loss": 0.029466547071933746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.946654785773717e-05, "grad_norm": 17.603559494018555, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.864456057548523, "num_tokens": 475275172.0, "step": 12458 }, { "epoch": 1.5849128609591654, "ewc_loss": 0.029390161857008934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9390161216724664e-05, "grad_norm": 17.640581130981445, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8741115927696228, "num_tokens": 475313550.0, "step": 12459 }, { "epoch": 1.585040071237756, "ewc_loss": 0.02944227121770382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9442271625157446e-05, "grad_norm": 17.60383415222168, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8769851922988892, "num_tokens": 475357019.0, "step": 12460 }, { "epoch": 1.5851672815163464, "ewc_loss": 0.02937517873942852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9375178201007657e-05, "grad_norm": 17.60788345336914, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8741037845611572, "num_tokens": 475394788.0, "step": 12461 }, { "epoch": 1.585294491794937, "ewc_loss": 0.029390433803200722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9390434065135196e-05, "grad_norm": 17.58814239501953, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8679016828536987, "num_tokens": 475432354.0, "step": 12462 }, { "epoch": 1.5854217020735275, "ewc_loss": 0.029384439811110497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9384440495050512e-05, "grad_norm": 17.628602981567383, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.873088002204895, "num_tokens": 475467284.0, "step": 12463 }, { "epoch": 1.585548912352118, "ewc_loss": 0.02942284755408764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.942284663731698e-05, "grad_norm": 17.5458927154541, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8626155257225037, "num_tokens": 475505946.0, "step": 12464 }, { "epoch": 1.5856761226307086, "ewc_loss": 0.02938343770802021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.938343823188916e-05, "grad_norm": 17.6547794342041, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8560782670974731, "num_tokens": 475542799.0, "step": 12465 }, { "epoch": 1.585803332909299, "ewc_loss": 0.029440902173519135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9440901926136576e-05, "grad_norm": 17.65637969970703, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8703529238700867, "num_tokens": 475582064.0, "step": 12466 }, { "epoch": 1.5859305431878896, "ewc_loss": 0.029386993497610092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.938699435617309e-05, "grad_norm": 17.61485481262207, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8638769388198853, "num_tokens": 475615521.0, "step": 12467 }, { "epoch": 1.5860577534664801, "ewc_loss": 0.02935030311346054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9350303520914167e-05, "grad_norm": 17.534881591796875, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8776665925979614, "num_tokens": 475650500.0, "step": 12468 }, { "epoch": 1.5861849637450707, "ewc_loss": 0.029438165947794914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9438166166073643e-05, "grad_norm": 17.686166763305664, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8710583448410034, "num_tokens": 475695824.0, "step": 12469 }, { "epoch": 1.5863121740236612, "ewc_loss": 0.02946610562503338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9466105843312107e-05, "grad_norm": 17.57354736328125, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8472533822059631, "num_tokens": 475734015.0, "step": 12470 }, { "epoch": 1.5864393843022517, "ewc_loss": 0.02941453643143177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.941453567473218e-05, "grad_norm": 17.698150634765625, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8675720691680908, "num_tokens": 475768961.0, "step": 12471 }, { "epoch": 1.5865665945808423, "ewc_loss": 0.029495859518647194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9495859052985907e-05, "grad_norm": 17.630638122558594, "learning_rate": 1e-06, "loss": 0.3592, "mean_token_accuracy": 0.8834589719772339, "num_tokens": 475806551.0, "step": 12472 }, { "epoch": 1.5866938048594328, "ewc_loss": 0.029428524896502495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.942852552223485e-05, "grad_norm": 17.694190979003906, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8624458312988281, "num_tokens": 475844066.0, "step": 12473 }, { "epoch": 1.5868210151380233, "ewc_loss": 0.029482189565896988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.948218934761826e-05, "grad_norm": 17.702693939208984, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8744884133338928, "num_tokens": 475874554.0, "step": 12474 }, { "epoch": 1.5869482254166136, "ewc_loss": 0.02938249334692955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9382494176388718e-05, "grad_norm": 17.580202102661133, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8761186599731445, "num_tokens": 475915840.0, "step": 12475 }, { "epoch": 1.5870754356952042, "ewc_loss": 0.029459262266755104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9459262805175968e-05, "grad_norm": 17.666519165039062, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8675737977027893, "num_tokens": 475951516.0, "step": 12476 }, { "epoch": 1.5872026459737947, "ewc_loss": 0.02946881763637066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9468817956512794e-05, "grad_norm": 17.66156005859375, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8713876605033875, "num_tokens": 475984224.0, "step": 12477 }, { "epoch": 1.5873298562523852, "ewc_loss": 0.029441136866807938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9441136575769633e-05, "grad_norm": 17.653844833374023, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8470832705497742, "num_tokens": 476015758.0, "step": 12478 }, { "epoch": 1.5874570665309757, "ewc_loss": 0.029490254819393158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9490254746633582e-05, "grad_norm": 17.674514770507812, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8665072321891785, "num_tokens": 476051835.0, "step": 12479 }, { "epoch": 1.5875842768095663, "ewc_loss": 0.029471972957253456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9471972084138542e-05, "grad_norm": 17.65053939819336, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8657828569412231, "num_tokens": 476090667.0, "step": 12480 }, { "epoch": 1.5877114870881566, "ewc_loss": 0.029480036348104477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.948003566416446e-05, "grad_norm": 17.70745277404785, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.866635799407959, "num_tokens": 476130448.0, "step": 12481 }, { "epoch": 1.587838697366747, "ewc_loss": 0.029520126059651375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9520126190618612e-05, "grad_norm": 17.58034324645996, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8600564002990723, "num_tokens": 476174495.0, "step": 12482 }, { "epoch": 1.5879659076453376, "ewc_loss": 0.029375195503234863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9375196390901692e-05, "grad_norm": 17.604917526245117, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8779451847076416, "num_tokens": 476213412.0, "step": 12483 }, { "epoch": 1.5880931179239282, "ewc_loss": 0.02950720675289631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9507205908885226e-05, "grad_norm": 17.58759307861328, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.8823527097702026, "num_tokens": 476250441.0, "step": 12484 }, { "epoch": 1.5882203282025187, "ewc_loss": 0.029450993984937668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9450993679347448e-05, "grad_norm": 17.596282958984375, "learning_rate": 1e-06, "loss": 0.3631, "mean_token_accuracy": 0.8831347823143005, "num_tokens": 476284466.0, "step": 12485 }, { "epoch": 1.5883475384811092, "ewc_loss": 0.029495462775230408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9495462513295934e-05, "grad_norm": 17.59674644470215, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8686219453811646, "num_tokens": 476327760.0, "step": 12486 }, { "epoch": 1.5884747487596997, "ewc_loss": 0.029490554705262184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9490554879885167e-05, "grad_norm": 17.591739654541016, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8647007942199707, "num_tokens": 476369259.0, "step": 12487 }, { "epoch": 1.5886019590382903, "ewc_loss": 0.029493950307369232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9493950933101587e-05, "grad_norm": 17.651060104370117, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8703254461288452, "num_tokens": 476403915.0, "step": 12488 }, { "epoch": 1.5887291693168808, "ewc_loss": 0.02949555031955242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9495549824787304e-05, "grad_norm": 17.624448776245117, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8675035238265991, "num_tokens": 476440837.0, "step": 12489 }, { "epoch": 1.5888563795954713, "ewc_loss": 0.029497873038053513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9497872674255632e-05, "grad_norm": 17.63931655883789, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8668349981307983, "num_tokens": 476478478.0, "step": 12490 }, { "epoch": 1.5889835898740619, "ewc_loss": 0.02946864813566208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9468648790498264e-05, "grad_norm": 17.686513900756836, "learning_rate": 1e-06, "loss": 0.3716, "mean_token_accuracy": 0.8782960176467896, "num_tokens": 476514675.0, "step": 12491 }, { "epoch": 1.5891108001526524, "ewc_loss": 0.029414206743240356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9414206437650137e-05, "grad_norm": 17.62628173828125, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8671331405639648, "num_tokens": 476559015.0, "step": 12492 }, { "epoch": 1.589238010431243, "ewc_loss": 0.029418418183922768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9418417398119345e-05, "grad_norm": 17.644872665405273, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8539460301399231, "num_tokens": 476601848.0, "step": 12493 }, { "epoch": 1.5893652207098334, "ewc_loss": 0.029507141560316086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.95071422442561e-05, "grad_norm": 17.723878860473633, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8778491020202637, "num_tokens": 476636794.0, "step": 12494 }, { "epoch": 1.589492430988424, "ewc_loss": 0.02938084676861763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9380846171989106e-05, "grad_norm": 17.572031021118164, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8640047311782837, "num_tokens": 476671593.0, "step": 12495 }, { "epoch": 1.5896196412670145, "ewc_loss": 0.029428983107209206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9428983907564543e-05, "grad_norm": 17.633075714111328, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8781735301017761, "num_tokens": 476706776.0, "step": 12496 }, { "epoch": 1.589746851545605, "ewc_loss": 0.029448244720697403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.944824518635869e-05, "grad_norm": 17.5645809173584, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.868179202079773, "num_tokens": 476746700.0, "step": 12497 }, { "epoch": 1.5898740618241956, "ewc_loss": 0.02942870929837227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9428709240164608e-05, "grad_norm": 17.598304748535156, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8741028308868408, "num_tokens": 476785837.0, "step": 12498 }, { "epoch": 1.5900012721027859, "ewc_loss": 0.02948850579559803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.948850487882737e-05, "grad_norm": 17.676105499267578, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8746582269668579, "num_tokens": 476818894.0, "step": 12499 }, { "epoch": 1.5901284823813764, "ewc_loss": 0.029445916414260864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.944591687992215e-05, "grad_norm": 17.591211318969727, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8516178727149963, "num_tokens": 476861773.0, "step": 12500 }, { "epoch": 1.590255692659967, "ewc_loss": 0.02943299151957035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9432991141220555e-05, "grad_norm": 17.63628387451172, "learning_rate": 1e-06, "loss": 0.3883, "mean_token_accuracy": 0.87516188621521, "num_tokens": 476899522.0, "step": 12501 }, { "epoch": 1.5903829029385574, "ewc_loss": 0.029441677033901215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9441676815622486e-05, "grad_norm": 17.54564666748047, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8630552291870117, "num_tokens": 476939923.0, "step": 12502 }, { "epoch": 1.590510113217148, "ewc_loss": 0.029444077983498573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9444077881635167e-05, "grad_norm": 17.639223098754883, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8681731224060059, "num_tokens": 476979867.0, "step": 12503 }, { "epoch": 1.5906373234957385, "ewc_loss": 0.029489275068044662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.948927431134507e-05, "grad_norm": 17.601184844970703, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8772379159927368, "num_tokens": 477016068.0, "step": 12504 }, { "epoch": 1.5907645337743288, "ewc_loss": 0.029452301561832428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9452301532728598e-05, "grad_norm": 17.57615852355957, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8658272624015808, "num_tokens": 477054526.0, "step": 12505 }, { "epoch": 1.5908917440529193, "ewc_loss": 0.02947457879781723, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9474578695953824e-05, "grad_norm": 17.580183029174805, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.864115834236145, "num_tokens": 477098029.0, "step": 12506 }, { "epoch": 1.5910189543315099, "ewc_loss": 0.029479555785655975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9479555450961925e-05, "grad_norm": 17.674055099487305, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8756231069564819, "num_tokens": 477137070.0, "step": 12507 }, { "epoch": 1.5911461646101004, "ewc_loss": 0.029477572068572044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.947757275251206e-05, "grad_norm": 17.635726928710938, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.8677952289581299, "num_tokens": 477175640.0, "step": 12508 }, { "epoch": 1.591273374888691, "ewc_loss": 0.029476985335350037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9476985218934715e-05, "grad_norm": 17.674692153930664, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.872228741645813, "num_tokens": 477210935.0, "step": 12509 }, { "epoch": 1.5914005851672814, "ewc_loss": 0.02948208525776863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9482085665222257e-05, "grad_norm": 17.67845916748047, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8756169676780701, "num_tokens": 477251567.0, "step": 12510 }, { "epoch": 1.591527795445872, "ewc_loss": 0.02943722903728485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9437229386530817e-05, "grad_norm": 17.623550415039062, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8714172840118408, "num_tokens": 477287964.0, "step": 12511 }, { "epoch": 1.5916550057244625, "ewc_loss": 0.029480567201972008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9480566809070297e-05, "grad_norm": 17.689743041992188, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8438887000083923, "num_tokens": 477328660.0, "step": 12512 }, { "epoch": 1.591782216003053, "ewc_loss": 0.029462894424796104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.946289532701485e-05, "grad_norm": 17.589502334594727, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8741186857223511, "num_tokens": 477364111.0, "step": 12513 }, { "epoch": 1.5919094262816436, "ewc_loss": 0.02938205376267433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.938205398095306e-05, "grad_norm": 17.616836547851562, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8757666349411011, "num_tokens": 477402691.0, "step": 12514 }, { "epoch": 1.592036636560234, "ewc_loss": 0.029455799609422684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9455799449351616e-05, "grad_norm": 17.640050888061523, "learning_rate": 1e-06, "loss": 0.3659, "mean_token_accuracy": 0.8822343945503235, "num_tokens": 477437526.0, "step": 12515 }, { "epoch": 1.5921638468388246, "ewc_loss": 0.029463358223438263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.946335735032335e-05, "grad_norm": 17.600547790527344, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8672996163368225, "num_tokens": 477479175.0, "step": 12516 }, { "epoch": 1.5922910571174151, "ewc_loss": 0.029425907880067825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9425907996483147e-05, "grad_norm": 17.60597038269043, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.871580183506012, "num_tokens": 477516896.0, "step": 12517 }, { "epoch": 1.5924182673960057, "ewc_loss": 0.029427729547023773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9427728804876097e-05, "grad_norm": 17.59840965270996, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8598812222480774, "num_tokens": 477555543.0, "step": 12518 }, { "epoch": 1.5925454776745962, "ewc_loss": 0.029432173818349838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9432174414978363e-05, "grad_norm": 17.638996124267578, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8808298110961914, "num_tokens": 477592251.0, "step": 12519 }, { "epoch": 1.5926726879531867, "ewc_loss": 0.029408331960439682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9408331101876684e-05, "grad_norm": 17.606576919555664, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8563438653945923, "num_tokens": 477630039.0, "step": 12520 }, { "epoch": 1.5927998982317773, "ewc_loss": 0.029499471187591553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.949947156594135e-05, "grad_norm": 17.62226104736328, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8680773377418518, "num_tokens": 477661028.0, "step": 12521 }, { "epoch": 1.5929271085103678, "ewc_loss": 0.029459312558174133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9459311917889863e-05, "grad_norm": 17.672361373901367, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8822447061538696, "num_tokens": 477699612.0, "step": 12522 }, { "epoch": 1.5930543187889583, "ewc_loss": 0.029470372945070267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9470373192452826e-05, "grad_norm": 17.585264205932617, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8645111322402954, "num_tokens": 477737300.0, "step": 12523 }, { "epoch": 1.5931815290675486, "ewc_loss": 0.029421648010611534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9421647923300043e-05, "grad_norm": 17.618148803710938, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8573133945465088, "num_tokens": 477776031.0, "step": 12524 }, { "epoch": 1.5933087393461391, "ewc_loss": 0.02951565384864807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9515653295675293e-05, "grad_norm": 17.64365005493164, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.870722234249115, "num_tokens": 477813640.0, "step": 12525 }, { "epoch": 1.5934359496247297, "ewc_loss": 0.02937564253807068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.937564204330556e-05, "grad_norm": 17.521800994873047, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.864540696144104, "num_tokens": 477848652.0, "step": 12526 }, { "epoch": 1.5935631599033202, "ewc_loss": 0.029513906687498093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.951390706584789e-05, "grad_norm": 17.66729736328125, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8647356033325195, "num_tokens": 477883710.0, "step": 12527 }, { "epoch": 1.5936903701819107, "ewc_loss": 0.02952643856406212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9526438083848916e-05, "grad_norm": 17.59325408935547, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8629195094108582, "num_tokens": 477920316.0, "step": 12528 }, { "epoch": 1.5938175804605013, "ewc_loss": 0.02949327975511551, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.949327972601168e-05, "grad_norm": 17.589595794677734, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8785848021507263, "num_tokens": 477955523.0, "step": 12529 }, { "epoch": 1.5939447907390916, "ewc_loss": 0.02956426702439785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.956426760647446e-05, "grad_norm": 17.699851989746094, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8616466522216797, "num_tokens": 477995768.0, "step": 12530 }, { "epoch": 1.594072001017682, "ewc_loss": 0.0295853391289711, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9585338779725134e-05, "grad_norm": 17.6500186920166, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8687175512313843, "num_tokens": 478037718.0, "step": 12531 }, { "epoch": 1.5941992112962726, "ewc_loss": 0.029473116621375084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9473116228473373e-05, "grad_norm": 17.594640731811523, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8570760488510132, "num_tokens": 478077388.0, "step": 12532 }, { "epoch": 1.5943264215748632, "ewc_loss": 0.02950577437877655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9505774364224635e-05, "grad_norm": 17.586650848388672, "learning_rate": 1e-06, "loss": 0.3601, "mean_token_accuracy": 0.88388991355896, "num_tokens": 478109494.0, "step": 12533 }, { "epoch": 1.5944536318534537, "ewc_loss": 0.029504286125302315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9504286430892535e-05, "grad_norm": 17.591533660888672, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8643322587013245, "num_tokens": 478142420.0, "step": 12534 }, { "epoch": 1.5945808421320442, "ewc_loss": 0.029558856040239334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.955885611299891e-05, "grad_norm": 17.64719009399414, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8745760917663574, "num_tokens": 478183848.0, "step": 12535 }, { "epoch": 1.5947080524106347, "ewc_loss": 0.029527917504310608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9527916922234e-05, "grad_norm": 17.587261199951172, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8782473802566528, "num_tokens": 478225242.0, "step": 12536 }, { "epoch": 1.5948352626892253, "ewc_loss": 0.029507433995604515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.950743328256067e-05, "grad_norm": 17.615989685058594, "learning_rate": 1e-06, "loss": 0.3852, "mean_token_accuracy": 0.8758655190467834, "num_tokens": 478263167.0, "step": 12537 }, { "epoch": 1.5949624729678158, "ewc_loss": 0.02952355332672596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9523553166654892e-05, "grad_norm": 17.57806968688965, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8694164156913757, "num_tokens": 478297752.0, "step": 12538 }, { "epoch": 1.5950896832464063, "ewc_loss": 0.02948012948036194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9480130251613446e-05, "grad_norm": 17.60210418701172, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8697038888931274, "num_tokens": 478332955.0, "step": 12539 }, { "epoch": 1.5952168935249968, "ewc_loss": 0.029607880860567093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9607881515403278e-05, "grad_norm": 17.614259719848633, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8494995832443237, "num_tokens": 478367298.0, "step": 12540 }, { "epoch": 1.5953441038035874, "ewc_loss": 0.02957143820822239, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9571438062703237e-05, "grad_norm": 17.682207107543945, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8738670349121094, "num_tokens": 478411183.0, "step": 12541 }, { "epoch": 1.595471314082178, "ewc_loss": 0.029545951634645462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9545952202170156e-05, "grad_norm": 17.501523971557617, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8675685524940491, "num_tokens": 478451184.0, "step": 12542 }, { "epoch": 1.5955985243607684, "ewc_loss": 0.029476234689354897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.947623397631105e-05, "grad_norm": 17.675003051757812, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8648783564567566, "num_tokens": 478490913.0, "step": 12543 }, { "epoch": 1.595725734639359, "ewc_loss": 0.029593318700790405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.959331868623849e-05, "grad_norm": 17.594322204589844, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8704122304916382, "num_tokens": 478529769.0, "step": 12544 }, { "epoch": 1.5958529449179495, "ewc_loss": 0.029471738263964653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9471737434505485e-05, "grad_norm": 17.683528900146484, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8655474781990051, "num_tokens": 478566665.0, "step": 12545 }, { "epoch": 1.59598015519654, "ewc_loss": 0.02958383411169052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9583834475488402e-05, "grad_norm": 17.5913028717041, "learning_rate": 1e-06, "loss": 0.3262, "mean_token_accuracy": 0.8951776027679443, "num_tokens": 478600814.0, "step": 12546 }, { "epoch": 1.5961073654751305, "ewc_loss": 0.029484236612915993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9484237529686652e-05, "grad_norm": 17.695980072021484, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8724346160888672, "num_tokens": 478641610.0, "step": 12547 }, { "epoch": 1.5962345757537209, "ewc_loss": 0.029523581266403198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9523580451495945e-05, "grad_norm": 17.58560562133789, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8775075674057007, "num_tokens": 478678183.0, "step": 12548 }, { "epoch": 1.5963617860323114, "ewc_loss": 0.029469603672623634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9469603759935126e-05, "grad_norm": 17.697546005249023, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.870445728302002, "num_tokens": 478710881.0, "step": 12549 }, { "epoch": 1.596488996310902, "ewc_loss": 0.029539063572883606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9539063689298928e-05, "grad_norm": 17.650550842285156, "learning_rate": 1e-06, "loss": 0.368, "mean_token_accuracy": 0.8834831714630127, "num_tokens": 478748550.0, "step": 12550 }, { "epoch": 1.5966162065894924, "ewc_loss": 0.02947033755481243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9470336812664755e-05, "grad_norm": 17.674558639526367, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8574545979499817, "num_tokens": 478790417.0, "step": 12551 }, { "epoch": 1.596743416868083, "ewc_loss": 0.029493920505046844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9493920010281727e-05, "grad_norm": 17.66183090209961, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8748981952667236, "num_tokens": 478828856.0, "step": 12552 }, { "epoch": 1.5968706271466735, "ewc_loss": 0.029401270672678947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.940126978501212e-05, "grad_norm": 17.64331817626953, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8658433556556702, "num_tokens": 478870240.0, "step": 12553 }, { "epoch": 1.5969978374252638, "ewc_loss": 0.029469536617398262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9469536457327195e-05, "grad_norm": 17.664169311523438, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8616170287132263, "num_tokens": 478911920.0, "step": 12554 }, { "epoch": 1.5971250477038543, "ewc_loss": 0.029509836807847023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9509836167562753e-05, "grad_norm": 17.697059631347656, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8633295893669128, "num_tokens": 478949083.0, "step": 12555 }, { "epoch": 1.5972522579824449, "ewc_loss": 0.029448790475726128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9448790883179754e-05, "grad_norm": 17.647708892822266, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8683409690856934, "num_tokens": 478984087.0, "step": 12556 }, { "epoch": 1.5973794682610354, "ewc_loss": 0.02942565828561783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.942565879493486e-05, "grad_norm": 17.620990753173828, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8514553904533386, "num_tokens": 479019364.0, "step": 12557 }, { "epoch": 1.597506678539626, "ewc_loss": 0.029504958540201187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9504959456971847e-05, "grad_norm": 17.673742294311523, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8725453615188599, "num_tokens": 479059002.0, "step": 12558 }, { "epoch": 1.5976338888182164, "ewc_loss": 0.029452620074152946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9452619855874218e-05, "grad_norm": 17.681367874145508, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8689823150634766, "num_tokens": 479092676.0, "step": 12559 }, { "epoch": 1.597761099096807, "ewc_loss": 0.029422886669635773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9422886655083857e-05, "grad_norm": 17.64451026916504, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8730785250663757, "num_tokens": 479125951.0, "step": 12560 }, { "epoch": 1.5978883093753975, "ewc_loss": 0.02941972389817238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.941972343251109e-05, "grad_norm": 17.617074966430664, "learning_rate": 1e-06, "loss": 0.3558, "mean_token_accuracy": 0.8850869536399841, "num_tokens": 479166230.0, "step": 12561 }, { "epoch": 1.598015519653988, "ewc_loss": 0.029455525800585747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9455526600941084e-05, "grad_norm": 17.563623428344727, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8664758205413818, "num_tokens": 479205813.0, "step": 12562 }, { "epoch": 1.5981427299325786, "ewc_loss": 0.02947353757917881, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9473538234014995e-05, "grad_norm": 17.680509567260742, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8659342527389526, "num_tokens": 479244960.0, "step": 12563 }, { "epoch": 1.598269940211169, "ewc_loss": 0.029543697834014893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9543698474299163e-05, "grad_norm": 17.5888671875, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8755679130554199, "num_tokens": 479289617.0, "step": 12564 }, { "epoch": 1.5983971504897596, "ewc_loss": 0.02943606488406658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9436065233312547e-05, "grad_norm": 17.63386344909668, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8704975247383118, "num_tokens": 479326377.0, "step": 12565 }, { "epoch": 1.5985243607683501, "ewc_loss": 0.029506858438253403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.950685848190915e-05, "grad_norm": 17.64327621459961, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8686274290084839, "num_tokens": 479360563.0, "step": 12566 }, { "epoch": 1.5986515710469407, "ewc_loss": 0.02949066460132599, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.949066401924938e-05, "grad_norm": 17.659130096435547, "learning_rate": 1e-06, "loss": 0.372, "mean_token_accuracy": 0.8812389969825745, "num_tokens": 479403042.0, "step": 12567 }, { "epoch": 1.5987787813255312, "ewc_loss": 0.029429754242300987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9429755159071647e-05, "grad_norm": 17.58922004699707, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8757995367050171, "num_tokens": 479437099.0, "step": 12568 }, { "epoch": 1.5989059916041217, "ewc_loss": 0.0294986292719841, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9498629373847507e-05, "grad_norm": 17.65233612060547, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8679993152618408, "num_tokens": 479479326.0, "step": 12569 }, { "epoch": 1.5990332018827123, "ewc_loss": 0.029448702931404114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9448703571688384e-05, "grad_norm": 17.56528091430664, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8524305820465088, "num_tokens": 479515320.0, "step": 12570 }, { "epoch": 1.5991604121613028, "ewc_loss": 0.029469303786754608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.946930362668354e-05, "grad_norm": 17.614017486572266, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8581889867782593, "num_tokens": 479559960.0, "step": 12571 }, { "epoch": 1.5992876224398933, "ewc_loss": 0.029530012980103493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9530012398026884e-05, "grad_norm": 17.652477264404297, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8693148493766785, "num_tokens": 479599115.0, "step": 12572 }, { "epoch": 1.5994148327184836, "ewc_loss": 0.029460644349455833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9460645237122662e-05, "grad_norm": 17.584814071655273, "learning_rate": 1e-06, "loss": 0.3737, "mean_token_accuracy": 0.8777033090591431, "num_tokens": 479636116.0, "step": 12573 }, { "epoch": 1.5995420429970741, "ewc_loss": 0.029462676495313644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9462677048286423e-05, "grad_norm": 17.637699127197266, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8611887693405151, "num_tokens": 479676880.0, "step": 12574 }, { "epoch": 1.5996692532756647, "ewc_loss": 0.02951807714998722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.951807800855022e-05, "grad_norm": 17.60260772705078, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8714837431907654, "num_tokens": 479717424.0, "step": 12575 }, { "epoch": 1.5997964635542552, "ewc_loss": 0.0294625423848629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.946254244307056e-05, "grad_norm": 17.588865280151367, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8676592111587524, "num_tokens": 479756240.0, "step": 12576 }, { "epoch": 1.5999236738328457, "ewc_loss": 0.02952488884329796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9524888304877095e-05, "grad_norm": 17.58466339111328, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8544086813926697, "num_tokens": 479791863.0, "step": 12577 }, { "epoch": 1.6000508841114363, "ewc_loss": 0.029489871114492416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9489870939869434e-05, "grad_norm": 17.64940071105957, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.880362868309021, "num_tokens": 479827252.0, "step": 12578 }, { "epoch": 1.6001780943900266, "ewc_loss": 0.02955527976155281, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.955527997983154e-05, "grad_norm": 17.614355087280273, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.865715503692627, "num_tokens": 479863100.0, "step": 12579 }, { "epoch": 1.600305304668617, "ewc_loss": 0.02948407083749771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9484070182661526e-05, "grad_norm": 17.666048049926758, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8587659597396851, "num_tokens": 479899684.0, "step": 12580 }, { "epoch": 1.6004325149472076, "ewc_loss": 0.029537450522184372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9537450245697983e-05, "grad_norm": 17.66633415222168, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8695487976074219, "num_tokens": 479937656.0, "step": 12581 }, { "epoch": 1.6005597252257981, "ewc_loss": 0.02950304187834263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.950304224214051e-05, "grad_norm": 17.630107879638672, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8655693531036377, "num_tokens": 479973357.0, "step": 12582 }, { "epoch": 1.6006869355043887, "ewc_loss": 0.029494229704141617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.949422923848033e-05, "grad_norm": 17.707910537719727, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8703818321228027, "num_tokens": 480017862.0, "step": 12583 }, { "epoch": 1.6008141457829792, "ewc_loss": 0.029525380581617355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9525381251005456e-05, "grad_norm": 17.600341796875, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8751567602157593, "num_tokens": 480057039.0, "step": 12584 }, { "epoch": 1.6009413560615697, "ewc_loss": 0.02951076626777649, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9510765671147965e-05, "grad_norm": 17.629396438598633, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8724801540374756, "num_tokens": 480093396.0, "step": 12585 }, { "epoch": 1.6010685663401603, "ewc_loss": 0.02955107018351555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9551070838351734e-05, "grad_norm": 17.653045654296875, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8652369976043701, "num_tokens": 480125200.0, "step": 12586 }, { "epoch": 1.6011957766187508, "ewc_loss": 0.029546555131673813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.954655428766273e-05, "grad_norm": 17.705862045288086, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8672678470611572, "num_tokens": 480158085.0, "step": 12587 }, { "epoch": 1.6013229868973413, "ewc_loss": 0.029540451243519783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9540451578213833e-05, "grad_norm": 17.64493751525879, "learning_rate": 1e-06, "loss": 0.3675, "mean_token_accuracy": 0.8843656778335571, "num_tokens": 480192979.0, "step": 12588 }, { "epoch": 1.6014501971759318, "ewc_loss": 0.029530443251132965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9530443498515524e-05, "grad_norm": 17.653963088989258, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.87886643409729, "num_tokens": 480227984.0, "step": 12589 }, { "epoch": 1.6015774074545224, "ewc_loss": 0.02955925278365612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9559252652688883e-05, "grad_norm": 17.69237518310547, "learning_rate": 1e-06, "loss": 0.3519, "mean_token_accuracy": 0.8866080045700073, "num_tokens": 480264280.0, "step": 12590 }, { "epoch": 1.601704617733113, "ewc_loss": 0.029547173529863358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9547172744059935e-05, "grad_norm": 17.69718360900879, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.855933666229248, "num_tokens": 480300670.0, "step": 12591 }, { "epoch": 1.6018318280117034, "ewc_loss": 0.02952047623693943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9520475436584093e-05, "grad_norm": 17.650543212890625, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8682535886764526, "num_tokens": 480334460.0, "step": 12592 }, { "epoch": 1.601959038290294, "ewc_loss": 0.02953645959496498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.953645889647305e-05, "grad_norm": 17.645593643188477, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8791515827178955, "num_tokens": 480376066.0, "step": 12593 }, { "epoch": 1.6020862485688845, "ewc_loss": 0.029475919902324677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9475919291144237e-05, "grad_norm": 17.65035629272461, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8752496242523193, "num_tokens": 480417575.0, "step": 12594 }, { "epoch": 1.602213458847475, "ewc_loss": 0.029565349221229553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.956534990516957e-05, "grad_norm": 17.62717628479004, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8592550754547119, "num_tokens": 480454106.0, "step": 12595 }, { "epoch": 1.6023406691260655, "ewc_loss": 0.029504263773560524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9504264603019692e-05, "grad_norm": 17.586820602416992, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.857248067855835, "num_tokens": 480493031.0, "step": 12596 }, { "epoch": 1.6024678794046558, "ewc_loss": 0.029558001086115837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9558001187979244e-05, "grad_norm": 17.644634246826172, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8710380792617798, "num_tokens": 480528326.0, "step": 12597 }, { "epoch": 1.6025950896832464, "ewc_loss": 0.029587125405669212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9587125027319416e-05, "grad_norm": 17.704524993896484, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8778183460235596, "num_tokens": 480565399.0, "step": 12598 }, { "epoch": 1.602722299961837, "ewc_loss": 0.029541004449129105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.954100455099251e-05, "grad_norm": 17.60695457458496, "learning_rate": 1e-06, "loss": 0.3543, "mean_token_accuracy": 0.8855553865432739, "num_tokens": 480600028.0, "step": 12599 }, { "epoch": 1.6028495102404274, "ewc_loss": 0.029516827315092087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.951682654384058e-05, "grad_norm": 17.645071029663086, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8699455261230469, "num_tokens": 480635759.0, "step": 12600 }, { "epoch": 1.602976720519018, "ewc_loss": 0.0296118576079607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.961185782623943e-05, "grad_norm": 17.69589614868164, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8752259016036987, "num_tokens": 480669528.0, "step": 12601 }, { "epoch": 1.6031039307976085, "ewc_loss": 0.029587846249341965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.958784534712322e-05, "grad_norm": 17.617000579833984, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.867804229259491, "num_tokens": 480709425.0, "step": 12602 }, { "epoch": 1.6032311410761988, "ewc_loss": 0.02953135222196579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9531352993217297e-05, "grad_norm": 17.627443313598633, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.85603266954422, "num_tokens": 480745010.0, "step": 12603 }, { "epoch": 1.6033583513547893, "ewc_loss": 0.029579464346170425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9579465262941085e-05, "grad_norm": 17.701889038085938, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8761700987815857, "num_tokens": 480780320.0, "step": 12604 }, { "epoch": 1.6034855616333799, "ewc_loss": 0.029582858085632324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9582857678178698e-05, "grad_norm": 17.596406936645508, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8763277530670166, "num_tokens": 480819335.0, "step": 12605 }, { "epoch": 1.6036127719119704, "ewc_loss": 0.02952050231397152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9520502721425146e-05, "grad_norm": 17.679155349731445, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8826615810394287, "num_tokens": 480852702.0, "step": 12606 }, { "epoch": 1.603739982190561, "ewc_loss": 0.02958914078772068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9589140467578545e-05, "grad_norm": 17.6042423248291, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8637282848358154, "num_tokens": 480890258.0, "step": 12607 }, { "epoch": 1.6038671924691514, "ewc_loss": 0.029585015028715134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9585014999611303e-05, "grad_norm": 17.617483139038086, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8695621490478516, "num_tokens": 480929555.0, "step": 12608 }, { "epoch": 1.603994402747742, "ewc_loss": 0.02959480695426464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.959480661957059e-05, "grad_norm": 17.561708450317383, "learning_rate": 1e-06, "loss": 0.3416, "mean_token_accuracy": 0.8919493556022644, "num_tokens": 480970295.0, "step": 12609 }, { "epoch": 1.6041216130263325, "ewc_loss": 0.029578611254692078, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9578612156910822e-05, "grad_norm": 17.65657615661621, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8712285757064819, "num_tokens": 481011213.0, "step": 12610 }, { "epoch": 1.604248823304923, "ewc_loss": 0.029589742422103882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.958974255307112e-05, "grad_norm": 17.65683937072754, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8645583391189575, "num_tokens": 481045273.0, "step": 12611 }, { "epoch": 1.6043760335835135, "ewc_loss": 0.029584992676973343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.958499317173846e-05, "grad_norm": 17.644498825073242, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8520609140396118, "num_tokens": 481084567.0, "step": 12612 }, { "epoch": 1.604503243862104, "ewc_loss": 0.029593707993626595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.959370794997085e-05, "grad_norm": 17.673994064331055, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.870856761932373, "num_tokens": 481119455.0, "step": 12613 }, { "epoch": 1.6046304541406946, "ewc_loss": 0.02955646999180317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9556469598901458e-05, "grad_norm": 17.725765228271484, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8557546734809875, "num_tokens": 481154718.0, "step": 12614 }, { "epoch": 1.6047576644192851, "ewc_loss": 0.02957386150956154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.957386095658876e-05, "grad_norm": 17.665691375732422, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8644371032714844, "num_tokens": 481193716.0, "step": 12615 }, { "epoch": 1.6048848746978757, "ewc_loss": 0.02955019474029541, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9550194085459225e-05, "grad_norm": 17.705551147460938, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8456763625144958, "num_tokens": 481231378.0, "step": 12616 }, { "epoch": 1.6050120849764662, "ewc_loss": 0.029611261561512947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9611261197715066e-05, "grad_norm": 17.587970733642578, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8676446080207825, "num_tokens": 481267907.0, "step": 12617 }, { "epoch": 1.6051392952550567, "ewc_loss": 0.02950618602335453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9506185455829836e-05, "grad_norm": 17.64415168762207, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8708834052085876, "num_tokens": 481309899.0, "step": 12618 }, { "epoch": 1.6052665055336472, "ewc_loss": 0.029601095244288445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.960109486593865e-05, "grad_norm": 17.676528930664062, "learning_rate": 1e-06, "loss": 0.3852, "mean_token_accuracy": 0.8786642551422119, "num_tokens": 481348845.0, "step": 12619 }, { "epoch": 1.6053937158122378, "ewc_loss": 0.029540663585066795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9540664399974048e-05, "grad_norm": 17.633333206176758, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8597569465637207, "num_tokens": 481384999.0, "step": 12620 }, { "epoch": 1.6055209260908283, "ewc_loss": 0.02952195331454277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9521952455979772e-05, "grad_norm": 17.60529899597168, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8713322877883911, "num_tokens": 481417287.0, "step": 12621 }, { "epoch": 1.6056481363694186, "ewc_loss": 0.029563244432210922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9563243515440263e-05, "grad_norm": 17.636154174804688, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8730510473251343, "num_tokens": 481455943.0, "step": 12622 }, { "epoch": 1.6057753466480091, "ewc_loss": 0.029535366222262383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.953536568384152e-05, "grad_norm": 17.591650009155273, "learning_rate": 1e-06, "loss": 0.3649, "mean_token_accuracy": 0.8845459222793579, "num_tokens": 481492017.0, "step": 12623 }, { "epoch": 1.6059025569265997, "ewc_loss": 0.029568033292889595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.95680329145398e-05, "grad_norm": 17.614824295043945, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8633027672767639, "num_tokens": 481535115.0, "step": 12624 }, { "epoch": 1.6060297672051902, "ewc_loss": 0.029551347717642784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9551347324741073e-05, "grad_norm": 17.609134674072266, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8546459674835205, "num_tokens": 481580048.0, "step": 12625 }, { "epoch": 1.6061569774837807, "ewc_loss": 0.029573772102594376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9573771826107986e-05, "grad_norm": 17.570322036743164, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8796908855438232, "num_tokens": 481616748.0, "step": 12626 }, { "epoch": 1.6062841877623713, "ewc_loss": 0.029492860659956932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9492861358448863e-05, "grad_norm": 17.580320358276367, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8415412306785583, "num_tokens": 481659123.0, "step": 12627 }, { "epoch": 1.6064113980409616, "ewc_loss": 0.029598036780953407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9598037144751288e-05, "grad_norm": 17.613540649414062, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8606119155883789, "num_tokens": 481704791.0, "step": 12628 }, { "epoch": 1.606538608319552, "ewc_loss": 0.029588423669338226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9588423785753548e-05, "grad_norm": 17.65082359313965, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8565462827682495, "num_tokens": 481742913.0, "step": 12629 }, { "epoch": 1.6066658185981426, "ewc_loss": 0.029533272609114647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9533272027038038e-05, "grad_norm": 17.61587142944336, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8564225435256958, "num_tokens": 481783549.0, "step": 12630 }, { "epoch": 1.6067930288767331, "ewc_loss": 0.02956622652709484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9566226658062078e-05, "grad_norm": 17.665889739990234, "learning_rate": 1e-06, "loss": 0.3893, "mean_token_accuracy": 0.8748313188552856, "num_tokens": 481818486.0, "step": 12631 }, { "epoch": 1.6069202391553237, "ewc_loss": 0.029535990208387375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9535989597206935e-05, "grad_norm": 17.689367294311523, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8696954846382141, "num_tokens": 481849969.0, "step": 12632 }, { "epoch": 1.6070474494339142, "ewc_loss": 0.02947787195444107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.947787106677424e-05, "grad_norm": 17.584030151367188, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8530223369598389, "num_tokens": 481886108.0, "step": 12633 }, { "epoch": 1.6071746597125047, "ewc_loss": 0.029532117769122124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9532116968766786e-05, "grad_norm": 17.670345306396484, "learning_rate": 1e-06, "loss": 0.366, "mean_token_accuracy": 0.8811412453651428, "num_tokens": 481919389.0, "step": 12634 }, { "epoch": 1.6073018699910953, "ewc_loss": 0.02959166280925274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9591663405881263e-05, "grad_norm": 17.698728561401367, "learning_rate": 1e-06, "loss": 0.3625, "mean_token_accuracy": 0.8817124366760254, "num_tokens": 481958531.0, "step": 12635 }, { "epoch": 1.6074290802696858, "ewc_loss": 0.02950473688542843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9504737540264614e-05, "grad_norm": 17.58499526977539, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8655508756637573, "num_tokens": 481999864.0, "step": 12636 }, { "epoch": 1.6075562905482763, "ewc_loss": 0.029559556394815445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9559556423919275e-05, "grad_norm": 17.602209091186523, "learning_rate": 1e-06, "loss": 0.3469, "mean_token_accuracy": 0.8925011157989502, "num_tokens": 482033890.0, "step": 12637 }, { "epoch": 1.6076835008268668, "ewc_loss": 0.029609179124236107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9609178454848006e-05, "grad_norm": 17.684919357299805, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8744206428527832, "num_tokens": 482071186.0, "step": 12638 }, { "epoch": 1.6078107111054574, "ewc_loss": 0.029603980481624603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9603979783132672e-05, "grad_norm": 17.60783576965332, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8676433563232422, "num_tokens": 482107218.0, "step": 12639 }, { "epoch": 1.607937921384048, "ewc_loss": 0.029619254171848297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9619253837154247e-05, "grad_norm": 17.692508697509766, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8658747673034668, "num_tokens": 482146101.0, "step": 12640 }, { "epoch": 1.6080651316626384, "ewc_loss": 0.02958827279508114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9588272809633054e-05, "grad_norm": 17.61686134338379, "learning_rate": 1e-06, "loss": 0.3622, "mean_token_accuracy": 0.8825698494911194, "num_tokens": 482178531.0, "step": 12641 }, { "epoch": 1.608192341941229, "ewc_loss": 0.02957046590745449, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9570466722361743e-05, "grad_norm": 17.583528518676758, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8788591623306274, "num_tokens": 482217019.0, "step": 12642 }, { "epoch": 1.6083195522198195, "ewc_loss": 0.029648233205080032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.964823397633154e-05, "grad_norm": 17.6960391998291, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8699581623077393, "num_tokens": 482250547.0, "step": 12643 }, { "epoch": 1.60844676249841, "ewc_loss": 0.029643092304468155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9643091693287715e-05, "grad_norm": 17.640167236328125, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8643090724945068, "num_tokens": 482287009.0, "step": 12644 }, { "epoch": 1.6085739727770005, "ewc_loss": 0.029625985771417618, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.962598591693677e-05, "grad_norm": 17.63130760192871, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8563230037689209, "num_tokens": 482327373.0, "step": 12645 }, { "epoch": 1.6087011830555908, "ewc_loss": 0.029571838676929474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9571838240372017e-05, "grad_norm": 17.653240203857422, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8582535982131958, "num_tokens": 482363409.0, "step": 12646 }, { "epoch": 1.6088283933341814, "ewc_loss": 0.02965865284204483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.965865314763505e-05, "grad_norm": 17.732807159423828, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8605427742004395, "num_tokens": 482396499.0, "step": 12647 }, { "epoch": 1.608955603612772, "ewc_loss": 0.02960200235247612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9602002541651018e-05, "grad_norm": 17.557973861694336, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8705034852027893, "num_tokens": 482435059.0, "step": 12648 }, { "epoch": 1.6090828138913624, "ewc_loss": 0.02956300787627697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9563007046817802e-05, "grad_norm": 17.678863525390625, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8626427054405212, "num_tokens": 482472793.0, "step": 12649 }, { "epoch": 1.609210024169953, "ewc_loss": 0.029679307714104652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9679307772312313e-05, "grad_norm": 17.59881591796875, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8715437650680542, "num_tokens": 482508857.0, "step": 12650 }, { "epoch": 1.6093372344485435, "ewc_loss": 0.029553553089499474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9553553758887574e-05, "grad_norm": 17.606637954711914, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.852783203125, "num_tokens": 482548526.0, "step": 12651 }, { "epoch": 1.6094644447271338, "ewc_loss": 0.029683060944080353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.968306034745183e-05, "grad_norm": 17.687475204467773, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8611354827880859, "num_tokens": 482586461.0, "step": 12652 }, { "epoch": 1.6095916550057243, "ewc_loss": 0.02960069105029106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.960069105029106e-05, "grad_norm": 17.644472122192383, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8618849515914917, "num_tokens": 482624642.0, "step": 12653 }, { "epoch": 1.6097188652843148, "ewc_loss": 0.029685640707612038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.968564149341546e-05, "grad_norm": 17.703083038330078, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.868527889251709, "num_tokens": 482664096.0, "step": 12654 }, { "epoch": 1.6098460755629054, "ewc_loss": 0.029690386727452278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.969038723676931e-05, "grad_norm": 17.702804565429688, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8799325823783875, "num_tokens": 482701230.0, "step": 12655 }, { "epoch": 1.609973285841496, "ewc_loss": 0.029606442898511887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9606442694785073e-05, "grad_norm": 17.68292999267578, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.869464635848999, "num_tokens": 482741611.0, "step": 12656 }, { "epoch": 1.6101004961200864, "ewc_loss": 0.029582202434539795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.958220284199342e-05, "grad_norm": 17.62098503112793, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8789176344871521, "num_tokens": 482779482.0, "step": 12657 }, { "epoch": 1.610227706398677, "ewc_loss": 0.029527023434638977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9527023798436858e-05, "grad_norm": 17.65241813659668, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8769669532775879, "num_tokens": 482816279.0, "step": 12658 }, { "epoch": 1.6103549166772675, "ewc_loss": 0.029607703909277916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.960770325444173e-05, "grad_norm": 17.65512466430664, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.860534131526947, "num_tokens": 482859941.0, "step": 12659 }, { "epoch": 1.610482126955858, "ewc_loss": 0.02957621030509472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.957621109089814e-05, "grad_norm": 17.68759536743164, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8742598295211792, "num_tokens": 482894348.0, "step": 12660 }, { "epoch": 1.6106093372344485, "ewc_loss": 0.02956457994878292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.956458047265187e-05, "grad_norm": 17.627756118774414, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8632209897041321, "num_tokens": 482930882.0, "step": 12661 }, { "epoch": 1.610736547513039, "ewc_loss": 0.02961677312850952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.961677273560781e-05, "grad_norm": 17.630659103393555, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.851996660232544, "num_tokens": 482969515.0, "step": 12662 }, { "epoch": 1.6108637577916296, "ewc_loss": 0.02960231900215149, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9602319045807235e-05, "grad_norm": 17.679298400878906, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8660291433334351, "num_tokens": 483008074.0, "step": 12663 }, { "epoch": 1.6109909680702201, "ewc_loss": 0.02952205017209053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9522050681407563e-05, "grad_norm": 17.60184669494629, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8622074127197266, "num_tokens": 483038920.0, "step": 12664 }, { "epoch": 1.6111181783488107, "ewc_loss": 0.0295566413551569, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.955664058390539e-05, "grad_norm": 17.679685592651367, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.8802920579910278, "num_tokens": 483075559.0, "step": 12665 }, { "epoch": 1.6112453886274012, "ewc_loss": 0.02960130386054516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9601304049720056e-05, "grad_norm": 17.67984962463379, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8727492690086365, "num_tokens": 483114378.0, "step": 12666 }, { "epoch": 1.6113725989059917, "ewc_loss": 0.029554452747106552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9554452339652926e-05, "grad_norm": 17.632728576660156, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8727599382400513, "num_tokens": 483147911.0, "step": 12667 }, { "epoch": 1.6114998091845822, "ewc_loss": 0.029582571238279343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.958257209684234e-05, "grad_norm": 17.71558380126953, "learning_rate": 1e-06, "loss": 0.357, "mean_token_accuracy": 0.8840125799179077, "num_tokens": 483188275.0, "step": 12668 }, { "epoch": 1.6116270194631728, "ewc_loss": 0.02962447889149189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.962447797472123e-05, "grad_norm": 17.60601806640625, "learning_rate": 1e-06, "loss": 0.362, "mean_token_accuracy": 0.8799837231636047, "num_tokens": 483230804.0, "step": 12669 }, { "epoch": 1.6117542297417633, "ewc_loss": 0.02957901544868946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.957901597255841e-05, "grad_norm": 17.793071746826172, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.872390866279602, "num_tokens": 483262368.0, "step": 12670 }, { "epoch": 1.6118814400203536, "ewc_loss": 0.029663674533367157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.966367537737824e-05, "grad_norm": 17.762466430664062, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8578905463218689, "num_tokens": 483300894.0, "step": 12671 }, { "epoch": 1.6120086502989441, "ewc_loss": 0.02949414774775505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.949414738395717e-05, "grad_norm": 17.61675453186035, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8759015798568726, "num_tokens": 483337377.0, "step": 12672 }, { "epoch": 1.6121358605775347, "ewc_loss": 0.029529264196753502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9529264793382026e-05, "grad_norm": 17.70627212524414, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8691916465759277, "num_tokens": 483376572.0, "step": 12673 }, { "epoch": 1.6122630708561252, "ewc_loss": 0.029595959931612015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.959595985885244e-05, "grad_norm": 17.70370101928711, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8573907613754272, "num_tokens": 483415584.0, "step": 12674 }, { "epoch": 1.6123902811347157, "ewc_loss": 0.029547372832894325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9547372832894325e-05, "grad_norm": 17.697736740112305, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8697656989097595, "num_tokens": 483459584.0, "step": 12675 }, { "epoch": 1.6125174914133062, "ewc_loss": 0.02950550802052021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9505508791771717e-05, "grad_norm": 17.61309051513672, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8636653423309326, "num_tokens": 483504260.0, "step": 12676 }, { "epoch": 1.6126447016918966, "ewc_loss": 0.029529284685850143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9529284802265465e-05, "grad_norm": 17.702192306518555, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.872299313545227, "num_tokens": 483547271.0, "step": 12677 }, { "epoch": 1.612771911970487, "ewc_loss": 0.029561324045062065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9561324481619522e-05, "grad_norm": 17.709373474121094, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.870638370513916, "num_tokens": 483582255.0, "step": 12678 }, { "epoch": 1.6128991222490776, "ewc_loss": 0.02952643111348152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9526430807891302e-05, "grad_norm": 17.68535614013672, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.864915668964386, "num_tokens": 483622290.0, "step": 12679 }, { "epoch": 1.6130263325276681, "ewc_loss": 0.029512258246541023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9512259061448276e-05, "grad_norm": 17.70789909362793, "learning_rate": 1e-06, "loss": 0.3727, "mean_token_accuracy": 0.8816906213760376, "num_tokens": 483657253.0, "step": 12680 }, { "epoch": 1.6131535428062587, "ewc_loss": 0.029571393504738808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9571394406957552e-05, "grad_norm": 17.710887908935547, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8739756345748901, "num_tokens": 483693062.0, "step": 12681 }, { "epoch": 1.6132807530848492, "ewc_loss": 0.029527811333537102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9527811420848593e-05, "grad_norm": 17.64212989807129, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8638573288917542, "num_tokens": 483735845.0, "step": 12682 }, { "epoch": 1.6134079633634397, "ewc_loss": 0.029516659677028656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9516659196815453e-05, "grad_norm": 17.659473419189453, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8732273578643799, "num_tokens": 483783833.0, "step": 12683 }, { "epoch": 1.6135351736420303, "ewc_loss": 0.029533307999372482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.953330840682611e-05, "grad_norm": 17.692691802978516, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8686349987983704, "num_tokens": 483820175.0, "step": 12684 }, { "epoch": 1.6136623839206208, "ewc_loss": 0.02950146049261093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9501459721359424e-05, "grad_norm": 17.64883804321289, "learning_rate": 1e-06, "loss": 0.3763, "mean_token_accuracy": 0.8806036710739136, "num_tokens": 483863454.0, "step": 12685 }, { "epoch": 1.6137895941992113, "ewc_loss": 0.029484935104846954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.948493420262821e-05, "grad_norm": 17.63195037841797, "learning_rate": 1e-06, "loss": 0.3437, "mean_token_accuracy": 0.8891199827194214, "num_tokens": 483900657.0, "step": 12686 }, { "epoch": 1.6139168044778018, "ewc_loss": 0.029529858380556107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9529857783927582e-05, "grad_norm": 17.83461570739746, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.862831711769104, "num_tokens": 483935912.0, "step": 12687 }, { "epoch": 1.6140440147563924, "ewc_loss": 0.029532356187701225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9532357075368054e-05, "grad_norm": 17.725801467895508, "learning_rate": 1e-06, "loss": 0.3995, "mean_token_accuracy": 0.8698912858963013, "num_tokens": 483968594.0, "step": 12688 }, { "epoch": 1.614171225034983, "ewc_loss": 0.02940245531499386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.940245576610323e-05, "grad_norm": 17.720539093017578, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8792269825935364, "num_tokens": 484010866.0, "step": 12689 }, { "epoch": 1.6142984353135734, "ewc_loss": 0.029473913833498955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9473912945832126e-05, "grad_norm": 17.649600982666016, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8639117479324341, "num_tokens": 484051355.0, "step": 12690 }, { "epoch": 1.614425645592164, "ewc_loss": 0.02941351942718029, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9413518859655596e-05, "grad_norm": 17.74920654296875, "learning_rate": 1e-06, "loss": 0.3629, "mean_token_accuracy": 0.8804336786270142, "num_tokens": 484087049.0, "step": 12691 }, { "epoch": 1.6145528558707545, "ewc_loss": 0.029475359246134758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9475359042407945e-05, "grad_norm": 17.6307373046875, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8589884042739868, "num_tokens": 484124914.0, "step": 12692 }, { "epoch": 1.614680066149345, "ewc_loss": 0.02941327914595604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9413278753054328e-05, "grad_norm": 17.67763328552246, "learning_rate": 1e-06, "loss": 0.3549, "mean_token_accuracy": 0.88763427734375, "num_tokens": 484154524.0, "step": 12693 }, { "epoch": 1.6148072764279355, "ewc_loss": 0.02947021648287773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.947021675936412e-05, "grad_norm": 17.6727294921875, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8688479065895081, "num_tokens": 484195091.0, "step": 12694 }, { "epoch": 1.6149344867065258, "ewc_loss": 0.02945874258875847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9458742574206553e-05, "grad_norm": 17.670265197753906, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8720252513885498, "num_tokens": 484236787.0, "step": 12695 }, { "epoch": 1.6150616969851164, "ewc_loss": 0.029519280418753624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9519280360545963e-05, "grad_norm": 17.771570205688477, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8651695251464844, "num_tokens": 484281748.0, "step": 12696 }, { "epoch": 1.615188907263707, "ewc_loss": 0.02948729135096073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9487291612895206e-05, "grad_norm": 17.595849990844727, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8757054805755615, "num_tokens": 484321181.0, "step": 12697 }, { "epoch": 1.6153161175422974, "ewc_loss": 0.029402248561382294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9402248401311226e-05, "grad_norm": 17.603992462158203, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8671538829803467, "num_tokens": 484360476.0, "step": 12698 }, { "epoch": 1.615443327820888, "ewc_loss": 0.02947104349732399, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.947104258055333e-05, "grad_norm": 17.717512130737305, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.871737539768219, "num_tokens": 484395065.0, "step": 12699 }, { "epoch": 1.6155705380994785, "ewc_loss": 0.029497377574443817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9497377909137867e-05, "grad_norm": 17.668970108032227, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8717035055160522, "num_tokens": 484439309.0, "step": 12700 }, { "epoch": 1.6156977483780688, "ewc_loss": 0.02949809469282627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9498094590962864e-05, "grad_norm": 17.691768646240234, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8704475164413452, "num_tokens": 484473018.0, "step": 12701 }, { "epoch": 1.6158249586566593, "ewc_loss": 0.029464146122336388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9464146791724488e-05, "grad_norm": 17.64130210876465, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8763999938964844, "num_tokens": 484518306.0, "step": 12702 }, { "epoch": 1.6159521689352498, "ewc_loss": 0.029451053589582443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9451053705997765e-05, "grad_norm": 17.640239715576172, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8565106391906738, "num_tokens": 484556348.0, "step": 12703 }, { "epoch": 1.6160793792138404, "ewc_loss": 0.029492946341633797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.949294685095083e-05, "grad_norm": 17.6501407623291, "learning_rate": 1e-06, "loss": 0.3723, "mean_token_accuracy": 0.8813226222991943, "num_tokens": 484593136.0, "step": 12704 }, { "epoch": 1.616206589492431, "ewc_loss": 0.029505161568522453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.950516136479564e-05, "grad_norm": 17.639535903930664, "learning_rate": 1e-06, "loss": 0.3694, "mean_token_accuracy": 0.8795034289360046, "num_tokens": 484628025.0, "step": 12705 }, { "epoch": 1.6163337997710214, "ewc_loss": 0.029558060690760612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.955806121462956e-05, "grad_norm": 17.668561935424805, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8755428791046143, "num_tokens": 484666703.0, "step": 12706 }, { "epoch": 1.616461010049612, "ewc_loss": 0.029531722888350487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9531722248066217e-05, "grad_norm": 17.585668563842773, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8793864250183105, "num_tokens": 484703417.0, "step": 12707 }, { "epoch": 1.6165882203282025, "ewc_loss": 0.02952938713133335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9529386665672064e-05, "grad_norm": 17.685970306396484, "learning_rate": 1e-06, "loss": 0.3663, "mean_token_accuracy": 0.8841962814331055, "num_tokens": 484741830.0, "step": 12708 }, { "epoch": 1.616715430606793, "ewc_loss": 0.02957715094089508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9577151508419774e-05, "grad_norm": 17.61343002319336, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8676640391349792, "num_tokens": 484780140.0, "step": 12709 }, { "epoch": 1.6168426408853835, "ewc_loss": 0.029531311243772507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9531311156461015e-05, "grad_norm": 17.673582077026367, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8682965040206909, "num_tokens": 484813856.0, "step": 12710 }, { "epoch": 1.616969851163974, "ewc_loss": 0.02960376814007759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.960376878036186e-05, "grad_norm": 17.665836334228516, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8794611096382141, "num_tokens": 484851383.0, "step": 12711 }, { "epoch": 1.6170970614425646, "ewc_loss": 0.029551932588219643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9551933039329015e-05, "grad_norm": 17.659873962402344, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.878614068031311, "num_tokens": 484884361.0, "step": 12712 }, { "epoch": 1.6172242717211551, "ewc_loss": 0.02957981266081333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9579812689917162e-05, "grad_norm": 17.57893943786621, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.849123477935791, "num_tokens": 484922106.0, "step": 12713 }, { "epoch": 1.6173514819997457, "ewc_loss": 0.02963987924158573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9639879358001053e-05, "grad_norm": 17.769742965698242, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8707926869392395, "num_tokens": 484963100.0, "step": 12714 }, { "epoch": 1.6174786922783362, "ewc_loss": 0.029639145359396935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.963914448628202e-05, "grad_norm": 17.538881301879883, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8811630010604858, "num_tokens": 485006896.0, "step": 12715 }, { "epoch": 1.6176059025569267, "ewc_loss": 0.029568837955594063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.956883872684557e-05, "grad_norm": 17.892929077148438, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.870042085647583, "num_tokens": 485045968.0, "step": 12716 }, { "epoch": 1.6177331128355172, "ewc_loss": 0.029755787923932076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.97557871817844e-05, "grad_norm": 17.60049819946289, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8675902485847473, "num_tokens": 485084736.0, "step": 12717 }, { "epoch": 1.6178603231141078, "ewc_loss": 0.029513489454984665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9513488698285073e-05, "grad_norm": 17.573415756225586, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8782774209976196, "num_tokens": 485123130.0, "step": 12718 }, { "epoch": 1.6179875333926983, "ewc_loss": 0.029796889051795006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.979688906634692e-05, "grad_norm": 17.62530517578125, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8669456243515015, "num_tokens": 485158072.0, "step": 12719 }, { "epoch": 1.6181147436712886, "ewc_loss": 0.029692623764276505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9692624593735673e-05, "grad_norm": 17.695341110229492, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.8787202835083008, "num_tokens": 485196783.0, "step": 12720 }, { "epoch": 1.6182419539498791, "ewc_loss": 0.029720153659582138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9720153179368936e-05, "grad_norm": 17.60704231262207, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8769010901451111, "num_tokens": 485231433.0, "step": 12721 }, { "epoch": 1.6183691642284697, "ewc_loss": 0.02973051182925701, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.973051232402213e-05, "grad_norm": 17.659223556518555, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8687477111816406, "num_tokens": 485269016.0, "step": 12722 }, { "epoch": 1.6184963745070602, "ewc_loss": 0.029769470915198326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9769471439067274e-05, "grad_norm": 17.60272979736328, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.859356164932251, "num_tokens": 485308293.0, "step": 12723 }, { "epoch": 1.6186235847856507, "ewc_loss": 0.02975534275174141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9755343348369934e-05, "grad_norm": 17.616619110107422, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8793476223945618, "num_tokens": 485340758.0, "step": 12724 }, { "epoch": 1.6187507950642412, "ewc_loss": 0.029776928946375847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9776929295621812e-05, "grad_norm": 17.611469268798828, "learning_rate": 1e-06, "loss": 0.3499, "mean_token_accuracy": 0.8850221633911133, "num_tokens": 485384047.0, "step": 12725 }, { "epoch": 1.6188780053428315, "ewc_loss": 0.02975773625075817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9757737138425e-05, "grad_norm": 17.60503387451172, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8742271065711975, "num_tokens": 485416909.0, "step": 12726 }, { "epoch": 1.619005215621422, "ewc_loss": 0.029832595959305763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9832595828338526e-05, "grad_norm": 17.74431610107422, "learning_rate": 1e-06, "loss": 0.3914, "mean_token_accuracy": 0.8713055849075317, "num_tokens": 485449532.0, "step": 12727 }, { "epoch": 1.6191324259000126, "ewc_loss": 0.029794560745358467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9794560759910382e-05, "grad_norm": 17.545942306518555, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.880363941192627, "num_tokens": 485491171.0, "step": 12728 }, { "epoch": 1.6192596361786031, "ewc_loss": 0.029695359990000725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9695360353798606e-05, "grad_norm": 17.652559280395508, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8656145334243774, "num_tokens": 485525511.0, "step": 12729 }, { "epoch": 1.6193868464571937, "ewc_loss": 0.02984730340540409, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9847304176655598e-05, "grad_norm": 17.67778778076172, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8670002818107605, "num_tokens": 485562517.0, "step": 12730 }, { "epoch": 1.6195140567357842, "ewc_loss": 0.029694192111492157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.969419256260153e-05, "grad_norm": 17.634170532226562, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8714699149131775, "num_tokens": 485601058.0, "step": 12731 }, { "epoch": 1.6196412670143747, "ewc_loss": 0.029742561280727386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9742561309831217e-05, "grad_norm": 17.68557357788086, "learning_rate": 1e-06, "loss": 0.3621, "mean_token_accuracy": 0.8845279216766357, "num_tokens": 485634737.0, "step": 12732 }, { "epoch": 1.6197684772929652, "ewc_loss": 0.029743194580078125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.974319431814365e-05, "grad_norm": 17.6566162109375, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8748397827148438, "num_tokens": 485671300.0, "step": 12733 }, { "epoch": 1.6198956875715558, "ewc_loss": 0.02965453453361988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9654534955625422e-05, "grad_norm": 17.607852935791016, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.879248857498169, "num_tokens": 485708712.0, "step": 12734 }, { "epoch": 1.6200228978501463, "ewc_loss": 0.029748529195785522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.974852941406425e-05, "grad_norm": 17.65565299987793, "learning_rate": 1e-06, "loss": 0.359, "mean_token_accuracy": 0.8833339214324951, "num_tokens": 485746820.0, "step": 12735 }, { "epoch": 1.6201501081287368, "ewc_loss": 0.029678069055080414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.96780690405285e-05, "grad_norm": 17.578296661376953, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8709864020347595, "num_tokens": 485785921.0, "step": 12736 }, { "epoch": 1.6202773184073274, "ewc_loss": 0.029707107692956924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9707107387366705e-05, "grad_norm": 17.626874923706055, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8716553449630737, "num_tokens": 485825766.0, "step": 12737 }, { "epoch": 1.6204045286859179, "ewc_loss": 0.029776114970445633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9776114388369024e-05, "grad_norm": 17.672773361206055, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8652587532997131, "num_tokens": 485862288.0, "step": 12738 }, { "epoch": 1.6205317389645084, "ewc_loss": 0.029681218788027763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.968121953017544e-05, "grad_norm": 17.600826263427734, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8647488355636597, "num_tokens": 485903582.0, "step": 12739 }, { "epoch": 1.620658949243099, "ewc_loss": 0.029714537784457207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.971453795908019e-05, "grad_norm": 17.671812057495117, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8675495386123657, "num_tokens": 485950931.0, "step": 12740 }, { "epoch": 1.6207861595216895, "ewc_loss": 0.02970840223133564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.970840250782203e-05, "grad_norm": 17.582509994506836, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8728541731834412, "num_tokens": 485984185.0, "step": 12741 }, { "epoch": 1.62091336980028, "ewc_loss": 0.02965531125664711, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9655311664100736e-05, "grad_norm": 17.681440353393555, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8759361505508423, "num_tokens": 486018796.0, "step": 12742 }, { "epoch": 1.6210405800788705, "ewc_loss": 0.02975361794233322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.975361712742597e-05, "grad_norm": 17.642322540283203, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.856934666633606, "num_tokens": 486055328.0, "step": 12743 }, { "epoch": 1.6211677903574608, "ewc_loss": 0.0297063197940588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.970631976495497e-05, "grad_norm": 17.67436408996582, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8658167123794556, "num_tokens": 486092340.0, "step": 12744 }, { "epoch": 1.6212950006360514, "ewc_loss": 0.02975674346089363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9756743970210664e-05, "grad_norm": 17.730852127075195, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.858612596988678, "num_tokens": 486123195.0, "step": 12745 }, { "epoch": 1.621422210914642, "ewc_loss": 0.029704071581363678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9704071494052187e-05, "grad_norm": 17.612892150878906, "learning_rate": 1e-06, "loss": 0.3679, "mean_token_accuracy": 0.8802024126052856, "num_tokens": 486167145.0, "step": 12746 }, { "epoch": 1.6215494211932324, "ewc_loss": 0.02965369261801243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.965369276353158e-05, "grad_norm": 17.654375076293945, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8624138832092285, "num_tokens": 486207925.0, "step": 12747 }, { "epoch": 1.621676631471823, "ewc_loss": 0.0297466479241848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9746648579020984e-05, "grad_norm": 17.65519142150879, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8744415044784546, "num_tokens": 486245446.0, "step": 12748 }, { "epoch": 1.6218038417504135, "ewc_loss": 0.02965257503092289, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9652575904037803e-05, "grad_norm": 17.612974166870117, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.871514618396759, "num_tokens": 486280461.0, "step": 12749 }, { "epoch": 1.6219310520290038, "ewc_loss": 0.029657021164894104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.965702151414007e-05, "grad_norm": 17.66975212097168, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8593767881393433, "num_tokens": 486323926.0, "step": 12750 }, { "epoch": 1.6220582623075943, "ewc_loss": 0.029711659997701645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.971166031784378e-05, "grad_norm": 17.68231964111328, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8644636869430542, "num_tokens": 486360016.0, "step": 12751 }, { "epoch": 1.6221854725861848, "ewc_loss": 0.029632477089762688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9632477890118025e-05, "grad_norm": 17.64582061767578, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8738783001899719, "num_tokens": 486394994.0, "step": 12752 }, { "epoch": 1.6223126828647754, "ewc_loss": 0.02969546429812908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9695464036194608e-05, "grad_norm": 17.68510627746582, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8696627020835876, "num_tokens": 486436909.0, "step": 12753 }, { "epoch": 1.622439893143366, "ewc_loss": 0.029654918238520622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.965491876238957e-05, "grad_norm": 17.606847763061523, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8767569065093994, "num_tokens": 486473201.0, "step": 12754 }, { "epoch": 1.6225671034219564, "ewc_loss": 0.029661420732736588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9661421649507247e-05, "grad_norm": 17.634750366210938, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8518259525299072, "num_tokens": 486510733.0, "step": 12755 }, { "epoch": 1.622694313700547, "ewc_loss": 0.02972540073096752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9725400963798165e-05, "grad_norm": 17.64767074584961, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8864370584487915, "num_tokens": 486546666.0, "step": 12756 }, { "epoch": 1.6228215239791375, "ewc_loss": 0.029679883271455765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9679882572963834e-05, "grad_norm": 17.689685821533203, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8512935042381287, "num_tokens": 486585639.0, "step": 12757 }, { "epoch": 1.622948734257728, "ewc_loss": 0.02970466949045658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9704669941565953e-05, "grad_norm": 17.65218734741211, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8754235506057739, "num_tokens": 486623028.0, "step": 12758 }, { "epoch": 1.6230759445363185, "ewc_loss": 0.029661964625120163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9661965527338907e-05, "grad_norm": 17.6340274810791, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8657433390617371, "num_tokens": 486655906.0, "step": 12759 }, { "epoch": 1.623203154814909, "ewc_loss": 0.029650088399648666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9650087526533753e-05, "grad_norm": 17.666004180908203, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8670679330825806, "num_tokens": 486689461.0, "step": 12760 }, { "epoch": 1.6233303650934996, "ewc_loss": 0.029748452827334404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9748453016509302e-05, "grad_norm": 17.724952697753906, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8557899594306946, "num_tokens": 486728373.0, "step": 12761 }, { "epoch": 1.6234575753720901, "ewc_loss": 0.029694823548197746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.969482375192456e-05, "grad_norm": 17.708139419555664, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8799304962158203, "num_tokens": 486769955.0, "step": 12762 }, { "epoch": 1.6235847856506807, "ewc_loss": 0.02970266528427601, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9702665415243246e-05, "grad_norm": 17.669477462768555, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.87450110912323, "num_tokens": 486806468.0, "step": 12763 }, { "epoch": 1.6237119959292712, "ewc_loss": 0.029721515253186226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.972151560243219e-05, "grad_norm": 17.691781997680664, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.857591986656189, "num_tokens": 486846526.0, "step": 12764 }, { "epoch": 1.6238392062078617, "ewc_loss": 0.02964138239622116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9641381843248382e-05, "grad_norm": 17.607677459716797, "learning_rate": 1e-06, "loss": 0.3519, "mean_token_accuracy": 0.888015866279602, "num_tokens": 486886323.0, "step": 12765 }, { "epoch": 1.6239664164864522, "ewc_loss": 0.029671674594283104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9671675292775035e-05, "grad_norm": 17.74285316467285, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8526817560195923, "num_tokens": 486918099.0, "step": 12766 }, { "epoch": 1.6240936267650428, "ewc_loss": 0.02973306179046631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.97330625471659e-05, "grad_norm": 17.70346450805664, "learning_rate": 1e-06, "loss": 0.3586, "mean_token_accuracy": 0.8850495219230652, "num_tokens": 486953789.0, "step": 12767 }, { "epoch": 1.6242208370436333, "ewc_loss": 0.029655834659934044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9655835533048958e-05, "grad_norm": 17.73306655883789, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8695192337036133, "num_tokens": 486996165.0, "step": 12768 }, { "epoch": 1.6243480473222236, "ewc_loss": 0.029714351519942284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9714350603171624e-05, "grad_norm": 17.705427169799805, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.8810221552848816, "num_tokens": 487028249.0, "step": 12769 }, { "epoch": 1.6244752576008141, "ewc_loss": 0.02959620952606201, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9596209060400724e-05, "grad_norm": 17.61810302734375, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8709293007850647, "num_tokens": 487070099.0, "step": 12770 }, { "epoch": 1.6246024678794047, "ewc_loss": 0.02963970974087715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9639710191986524e-05, "grad_norm": 17.73845863342285, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8742037415504456, "num_tokens": 487107117.0, "step": 12771 }, { "epoch": 1.6247296781579952, "ewc_loss": 0.029663117602467537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.966311694763135e-05, "grad_norm": 17.677505493164062, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8588136434555054, "num_tokens": 487146024.0, "step": 12772 }, { "epoch": 1.6248568884365857, "ewc_loss": 0.029607441276311874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.960744131996762e-05, "grad_norm": 17.63585662841797, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8580211400985718, "num_tokens": 487189864.0, "step": 12773 }, { "epoch": 1.6249840987151762, "ewc_loss": 0.029673457145690918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.967345790239051e-05, "grad_norm": 17.71970558166504, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.873614490032196, "num_tokens": 487230307.0, "step": 12774 }, { "epoch": 1.6251113089937665, "ewc_loss": 0.029656410217285156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9656410333700478e-05, "grad_norm": 17.663818359375, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8736872673034668, "num_tokens": 487261387.0, "step": 12775 }, { "epoch": 1.625238519272357, "ewc_loss": 0.02964417263865471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.964417217299342e-05, "grad_norm": 17.71725845336914, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.859324038028717, "num_tokens": 487298611.0, "step": 12776 }, { "epoch": 1.6253657295509476, "ewc_loss": 0.02965562604367733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.965562634926755e-05, "grad_norm": 17.60459327697754, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8699663877487183, "num_tokens": 487338172.0, "step": 12777 }, { "epoch": 1.6254929398295381, "ewc_loss": 0.029621142894029617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.962114376714453e-05, "grad_norm": 17.713298797607422, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8724342584609985, "num_tokens": 487375126.0, "step": 12778 }, { "epoch": 1.6256201501081287, "ewc_loss": 0.029651420190930367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9651420845766552e-05, "grad_norm": 17.569019317626953, "learning_rate": 1e-06, "loss": 0.3789, "mean_token_accuracy": 0.8815725445747375, "num_tokens": 487416539.0, "step": 12779 }, { "epoch": 1.6257473603867192, "ewc_loss": 0.029593322426080704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9593322324217297e-05, "grad_norm": 17.636518478393555, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8733171224594116, "num_tokens": 487448429.0, "step": 12780 }, { "epoch": 1.6258745706653097, "ewc_loss": 0.029721492901444435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.972149377455935e-05, "grad_norm": 17.659555435180664, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8703038692474365, "num_tokens": 487486054.0, "step": 12781 }, { "epoch": 1.6260017809439002, "ewc_loss": 0.029666267335414886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9666267437278293e-05, "grad_norm": 17.592857360839844, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8766647577285767, "num_tokens": 487523771.0, "step": 12782 }, { "epoch": 1.6261289912224908, "ewc_loss": 0.02967156283557415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9671562515432015e-05, "grad_norm": 17.657100677490234, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8758996725082397, "num_tokens": 487561334.0, "step": 12783 }, { "epoch": 1.6262562015010813, "ewc_loss": 0.029734423384070396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9734423151239753e-05, "grad_norm": 17.571697235107422, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8712683916091919, "num_tokens": 487606382.0, "step": 12784 }, { "epoch": 1.6263834117796718, "ewc_loss": 0.02972666546702385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.972666516143363e-05, "grad_norm": 17.632028579711914, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8664976954460144, "num_tokens": 487647640.0, "step": 12785 }, { "epoch": 1.6265106220582624, "ewc_loss": 0.029758330434560776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9758330128970556e-05, "grad_norm": 17.69801139831543, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8683550357818604, "num_tokens": 487682604.0, "step": 12786 }, { "epoch": 1.6266378323368529, "ewc_loss": 0.029741164296865463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9741164325969294e-05, "grad_norm": 17.7045841217041, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8735721111297607, "num_tokens": 487718271.0, "step": 12787 }, { "epoch": 1.6267650426154434, "ewc_loss": 0.029685404151678085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9685405024793e-05, "grad_norm": 17.64859390258789, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8580968379974365, "num_tokens": 487752872.0, "step": 12788 }, { "epoch": 1.626892252894034, "ewc_loss": 0.029744070023298264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9744069252046756e-05, "grad_norm": 17.709367752075195, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8615926504135132, "num_tokens": 487798309.0, "step": 12789 }, { "epoch": 1.6270194631726245, "ewc_loss": 0.02972024865448475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.972024776681792e-05, "grad_norm": 17.645830154418945, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8600829839706421, "num_tokens": 487834052.0, "step": 12790 }, { "epoch": 1.627146673451215, "ewc_loss": 0.029681820422410965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.968181979667861e-05, "grad_norm": 17.71174430847168, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8586189150810242, "num_tokens": 487868662.0, "step": 12791 }, { "epoch": 1.6272738837298055, "ewc_loss": 0.029725557193160057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.972555739688687e-05, "grad_norm": 17.707984924316406, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.868035078048706, "num_tokens": 487907425.0, "step": 12792 }, { "epoch": 1.6274010940083958, "ewc_loss": 0.029636120423674583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9636121325893328e-05, "grad_norm": 17.632938385009766, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8790020942687988, "num_tokens": 487942979.0, "step": 12793 }, { "epoch": 1.6275283042869864, "ewc_loss": 0.029705436900258064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.970543755509425e-05, "grad_norm": 17.74991226196289, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8749129772186279, "num_tokens": 487982402.0, "step": 12794 }, { "epoch": 1.6276555145655769, "ewc_loss": 0.02972055971622467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9720558814005926e-05, "grad_norm": 17.692720413208008, "learning_rate": 1e-06, "loss": 0.3776, "mean_token_accuracy": 0.8780696392059326, "num_tokens": 488020071.0, "step": 12795 }, { "epoch": 1.6277827248441674, "ewc_loss": 0.029660988599061966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9660988730029203e-05, "grad_norm": 17.695144653320312, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8573490381240845, "num_tokens": 488057457.0, "step": 12796 }, { "epoch": 1.627909935122758, "ewc_loss": 0.02971620298922062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9716202334384434e-05, "grad_norm": 17.717086791992188, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.866868793964386, "num_tokens": 488094002.0, "step": 12797 }, { "epoch": 1.6280371454013485, "ewc_loss": 0.029669184237718582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.966918509628158e-05, "grad_norm": 17.67332649230957, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8561393022537231, "num_tokens": 488134995.0, "step": 12798 }, { "epoch": 1.6281643556799388, "ewc_loss": 0.029651539400219917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9651539080077782e-05, "grad_norm": 17.66541862487793, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.874098002910614, "num_tokens": 488169942.0, "step": 12799 }, { "epoch": 1.6282915659585293, "ewc_loss": 0.029667237773537636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9667236958630383e-05, "grad_norm": 17.641042709350586, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8729479312896729, "num_tokens": 488210852.0, "step": 12800 }, { "epoch": 1.6284187762371198, "ewc_loss": 0.029662786051630974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9662785891559906e-05, "grad_norm": 17.68535804748535, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8739931583404541, "num_tokens": 488253067.0, "step": 12801 }, { "epoch": 1.6285459865157104, "ewc_loss": 0.029742863029241562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9742863262072206e-05, "grad_norm": 17.73160171508789, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8784571886062622, "num_tokens": 488293498.0, "step": 12802 }, { "epoch": 1.628673196794301, "ewc_loss": 0.029688429087400436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9688430004171096e-05, "grad_norm": 17.714696884155273, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8721340894699097, "num_tokens": 488333931.0, "step": 12803 }, { "epoch": 1.6288004070728914, "ewc_loss": 0.029604211449623108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9604210794786923e-05, "grad_norm": 17.661087036132812, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8747966885566711, "num_tokens": 488371724.0, "step": 12804 }, { "epoch": 1.628927617351482, "ewc_loss": 0.029661521315574646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9661521693924442e-05, "grad_norm": 17.656702041625977, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8656291365623474, "num_tokens": 488415171.0, "step": 12805 }, { "epoch": 1.6290548276300725, "ewc_loss": 0.029614919796586037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9614919185405597e-05, "grad_norm": 17.699926376342773, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8647388815879822, "num_tokens": 488451672.0, "step": 12806 }, { "epoch": 1.629182037908663, "ewc_loss": 0.029684875160455704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9684875698876567e-05, "grad_norm": 17.667011260986328, "learning_rate": 1e-06, "loss": 0.3638, "mean_token_accuracy": 0.8842765092849731, "num_tokens": 488492657.0, "step": 12807 }, { "epoch": 1.6293092481872535, "ewc_loss": 0.029643980786204338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.964398117910605e-05, "grad_norm": 17.694055557250977, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8780162930488586, "num_tokens": 488530681.0, "step": 12808 }, { "epoch": 1.629436458465844, "ewc_loss": 0.029697228223085403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9697228455916047e-05, "grad_norm": 17.781944274902344, "learning_rate": 1e-06, "loss": 0.3655, "mean_token_accuracy": 0.8811473250389099, "num_tokens": 488560542.0, "step": 12809 }, { "epoch": 1.6295636687444346, "ewc_loss": 0.02967446856200695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.967446926049888e-05, "grad_norm": 17.786996841430664, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8715549111366272, "num_tokens": 488597491.0, "step": 12810 }, { "epoch": 1.6296908790230251, "ewc_loss": 0.02957765944302082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9577659006463364e-05, "grad_norm": 17.615680694580078, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8782861232757568, "num_tokens": 488634556.0, "step": 12811 }, { "epoch": 1.6298180893016156, "ewc_loss": 0.029586540535092354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9586541131720878e-05, "grad_norm": 17.680377960205078, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.867303729057312, "num_tokens": 488668174.0, "step": 12812 }, { "epoch": 1.6299452995802062, "ewc_loss": 0.029687177389860153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9687176720472053e-05, "grad_norm": 17.715782165527344, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.872724175453186, "num_tokens": 488705994.0, "step": 12813 }, { "epoch": 1.6300725098587967, "ewc_loss": 0.029594412073493004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.959441189887002e-05, "grad_norm": 17.58414649963379, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8599004745483398, "num_tokens": 488746908.0, "step": 12814 }, { "epoch": 1.6301997201373872, "ewc_loss": 0.02964247576892376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9642475055879913e-05, "grad_norm": 17.71788215637207, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8717087507247925, "num_tokens": 488785607.0, "step": 12815 }, { "epoch": 1.6303269304159778, "ewc_loss": 0.029693886637687683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9693886972381733e-05, "grad_norm": 17.697031021118164, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8689776659011841, "num_tokens": 488823831.0, "step": 12816 }, { "epoch": 1.630454140694568, "ewc_loss": 0.029608454555273056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9608454497065395e-05, "grad_norm": 17.686054229736328, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8781833648681641, "num_tokens": 488859944.0, "step": 12817 }, { "epoch": 1.6305813509731586, "ewc_loss": 0.02964787557721138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.964787563541904e-05, "grad_norm": 17.684913635253906, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8561266660690308, "num_tokens": 488903764.0, "step": 12818 }, { "epoch": 1.6307085612517491, "ewc_loss": 0.029680458828806877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9680459192604758e-05, "grad_norm": 17.755226135253906, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8606671094894409, "num_tokens": 488946890.0, "step": 12819 }, { "epoch": 1.6308357715303397, "ewc_loss": 0.02970064990222454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9700649974984117e-05, "grad_norm": 17.761539459228516, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8675209283828735, "num_tokens": 488981901.0, "step": 12820 }, { "epoch": 1.6309629818089302, "ewc_loss": 0.02964162267744541, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.964162194984965e-05, "grad_norm": 17.741497039794922, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.873171329498291, "num_tokens": 489024180.0, "step": 12821 }, { "epoch": 1.6310901920875207, "ewc_loss": 0.029595062136650085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9595061278087087e-05, "grad_norm": 17.657875061035156, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.8738255500793457, "num_tokens": 489070971.0, "step": 12822 }, { "epoch": 1.6312174023661112, "ewc_loss": 0.029585836455225945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9585837182821706e-05, "grad_norm": 17.713167190551758, "learning_rate": 1e-06, "loss": 0.3756, "mean_token_accuracy": 0.8798984289169312, "num_tokens": 489111636.0, "step": 12823 }, { "epoch": 1.6313446126447015, "ewc_loss": 0.029648924246430397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9648923373315483e-05, "grad_norm": 17.68077278137207, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8683873414993286, "num_tokens": 489148677.0, "step": 12824 }, { "epoch": 1.631471822923292, "ewc_loss": 0.02961580455303192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9615805033245124e-05, "grad_norm": 17.75687599182129, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8590778708457947, "num_tokens": 489186222.0, "step": 12825 }, { "epoch": 1.6315990332018826, "ewc_loss": 0.029588377103209496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9588376492029056e-05, "grad_norm": 17.63953971862793, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8442819118499756, "num_tokens": 489223173.0, "step": 12826 }, { "epoch": 1.6317262434804731, "ewc_loss": 0.029542839154601097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.954283991130069e-05, "grad_norm": 17.71076011657715, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8713687658309937, "num_tokens": 489259236.0, "step": 12827 }, { "epoch": 1.6318534537590637, "ewc_loss": 0.029653018340468407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9653017918462865e-05, "grad_norm": 17.68574333190918, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8635114431381226, "num_tokens": 489298449.0, "step": 12828 }, { "epoch": 1.6319806640376542, "ewc_loss": 0.029570650309324265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.95706504402915e-05, "grad_norm": 17.678421020507812, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.85020911693573, "num_tokens": 489334800.0, "step": 12829 }, { "epoch": 1.6321078743162447, "ewc_loss": 0.029670624062418938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9670623916899785e-05, "grad_norm": 17.78395652770996, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8766607046127319, "num_tokens": 489378510.0, "step": 12830 }, { "epoch": 1.6322350845948352, "ewc_loss": 0.02965090610086918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.965090607176535e-05, "grad_norm": 17.658994674682617, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8587905168533325, "num_tokens": 489419267.0, "step": 12831 }, { "epoch": 1.6323622948734258, "ewc_loss": 0.029599519446492195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9599519621115178e-05, "grad_norm": 17.743640899658203, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8735787868499756, "num_tokens": 489454772.0, "step": 12832 }, { "epoch": 1.6324895051520163, "ewc_loss": 0.029641680419445038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9641680157510564e-05, "grad_norm": 17.699071884155273, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8655778169631958, "num_tokens": 489490146.0, "step": 12833 }, { "epoch": 1.6326167154306068, "ewc_loss": 0.029599355533719063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.959935591206886e-05, "grad_norm": 17.717750549316406, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8409422039985657, "num_tokens": 489532754.0, "step": 12834 }, { "epoch": 1.6327439257091974, "ewc_loss": 0.029640089720487595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9640090360771865e-05, "grad_norm": 17.73314666748047, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8671671152114868, "num_tokens": 489570378.0, "step": 12835 }, { "epoch": 1.6328711359877879, "ewc_loss": 0.02959313616156578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9593136787298135e-05, "grad_norm": 17.644775390625, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8627898693084717, "num_tokens": 489604562.0, "step": 12836 }, { "epoch": 1.6329983462663784, "ewc_loss": 0.029608994722366333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9608994736918248e-05, "grad_norm": 17.773155212402344, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.8780463933944702, "num_tokens": 489644060.0, "step": 12837 }, { "epoch": 1.633125556544969, "ewc_loss": 0.029637038707733154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9637038096552715e-05, "grad_norm": 17.696388244628906, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8719387054443359, "num_tokens": 489682693.0, "step": 12838 }, { "epoch": 1.6332527668235595, "ewc_loss": 0.029605215415358543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.960521487693768e-05, "grad_norm": 17.740550994873047, "learning_rate": 1e-06, "loss": 0.379, "mean_token_accuracy": 0.8787399530410767, "num_tokens": 489720786.0, "step": 12839 }, { "epoch": 1.63337997710215, "ewc_loss": 0.029696593061089516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.969659362861421e-05, "grad_norm": 17.73013687133789, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8769258260726929, "num_tokens": 489758773.0, "step": 12840 }, { "epoch": 1.6335071873807405, "ewc_loss": 0.029604926705360413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.960492747661192e-05, "grad_norm": 17.70038414001465, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8702435493469238, "num_tokens": 489793436.0, "step": 12841 }, { "epoch": 1.6336343976593308, "ewc_loss": 0.02960662916302681, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.960663005069364e-05, "grad_norm": 17.68355369567871, "learning_rate": 1e-06, "loss": 0.3431, "mean_token_accuracy": 0.8891137838363647, "num_tokens": 489828634.0, "step": 12842 }, { "epoch": 1.6337616079379214, "ewc_loss": 0.02964506298303604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.964506347780116e-05, "grad_norm": 17.712919235229492, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8745275735855103, "num_tokens": 489866335.0, "step": 12843 }, { "epoch": 1.6338888182165119, "ewc_loss": 0.029676031321287155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9676031772396527e-05, "grad_norm": 17.78789710998535, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8793576955795288, "num_tokens": 489903059.0, "step": 12844 }, { "epoch": 1.6340160284951024, "ewc_loss": 0.029687311500310898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9687311325687915e-05, "grad_norm": 17.715112686157227, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8775475025177002, "num_tokens": 489940186.0, "step": 12845 }, { "epoch": 1.634143238773693, "ewc_loss": 0.029568757861852646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9568758691311814e-05, "grad_norm": 17.675148010253906, "learning_rate": 1e-06, "loss": 0.3639, "mean_token_accuracy": 0.8823325634002686, "num_tokens": 489984732.0, "step": 12846 }, { "epoch": 1.6342704490522835, "ewc_loss": 0.029660116881132126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9660117434104905e-05, "grad_norm": 17.707685470581055, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.8717033863067627, "num_tokens": 490016699.0, "step": 12847 }, { "epoch": 1.6343976593308738, "ewc_loss": 0.029659898951649666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.965989915537648e-05, "grad_norm": 17.715063095092773, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.862366795539856, "num_tokens": 490051351.0, "step": 12848 }, { "epoch": 1.6345248696094643, "ewc_loss": 0.02962310239672661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.962310281873215e-05, "grad_norm": 17.674421310424805, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8787058591842651, "num_tokens": 490089668.0, "step": 12849 }, { "epoch": 1.6346520798880548, "ewc_loss": 0.0296886395663023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9688639187952504e-05, "grad_norm": 17.75865936279297, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8713250160217285, "num_tokens": 490129651.0, "step": 12850 }, { "epoch": 1.6347792901666454, "ewc_loss": 0.02969963848590851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9699638616875745e-05, "grad_norm": 17.625961303710938, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8642077445983887, "num_tokens": 490168441.0, "step": 12851 }, { "epoch": 1.6349065004452359, "ewc_loss": 0.02971283718943596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9712837203987874e-05, "grad_norm": 17.713943481445312, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8802423477172852, "num_tokens": 490204754.0, "step": 12852 }, { "epoch": 1.6350337107238264, "ewc_loss": 0.029762841761112213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.976284122269135e-05, "grad_norm": 17.779373168945312, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.8854260444641113, "num_tokens": 490244178.0, "step": 12853 }, { "epoch": 1.635160921002417, "ewc_loss": 0.029629122465848923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9629121854668483e-05, "grad_norm": 17.645048141479492, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8667306303977966, "num_tokens": 490281161.0, "step": 12854 }, { "epoch": 1.6352881312810075, "ewc_loss": 0.029670003801584244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9670003641513176e-05, "grad_norm": 17.714479446411133, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8719375729560852, "num_tokens": 490314892.0, "step": 12855 }, { "epoch": 1.635415341559598, "ewc_loss": 0.029721800237894058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9721799364779145e-05, "grad_norm": 17.654815673828125, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8596135377883911, "num_tokens": 490352600.0, "step": 12856 }, { "epoch": 1.6355425518381885, "ewc_loss": 0.029675893485546112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9675893529201858e-05, "grad_norm": 17.733795166015625, "learning_rate": 1e-06, "loss": 0.3636, "mean_token_accuracy": 0.8826544284820557, "num_tokens": 490384487.0, "step": 12857 }, { "epoch": 1.635669762116779, "ewc_loss": 0.029762890189886093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9762890335405245e-05, "grad_norm": 17.737443923950195, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8504408001899719, "num_tokens": 490422199.0, "step": 12858 }, { "epoch": 1.6357969723953696, "ewc_loss": 0.029711322858929634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.971132198581472e-05, "grad_norm": 17.686548233032227, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8635902404785156, "num_tokens": 490458058.0, "step": 12859 }, { "epoch": 1.6359241826739601, "ewc_loss": 0.029742706567049026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.97427068289835e-05, "grad_norm": 17.72260093688965, "learning_rate": 1e-06, "loss": 0.3711, "mean_token_accuracy": 0.881477952003479, "num_tokens": 490498447.0, "step": 12860 }, { "epoch": 1.6360513929525506, "ewc_loss": 0.029743481427431107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.974348171846941e-05, "grad_norm": 17.706764221191406, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8574193120002747, "num_tokens": 490538502.0, "step": 12861 }, { "epoch": 1.6361786032311412, "ewc_loss": 0.02975480444729328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9754804927506484e-05, "grad_norm": 17.698793411254883, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8745649456977844, "num_tokens": 490575752.0, "step": 12862 }, { "epoch": 1.6363058135097317, "ewc_loss": 0.029736733064055443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9736733267782256e-05, "grad_norm": 17.694900512695312, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8719795942306519, "num_tokens": 490613502.0, "step": 12863 }, { "epoch": 1.6364330237883222, "ewc_loss": 0.029689611867070198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.96896123472834e-05, "grad_norm": 17.675302505493164, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.864085853099823, "num_tokens": 490653820.0, "step": 12864 }, { "epoch": 1.6365602340669128, "ewc_loss": 0.029742104932665825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9742104743490927e-05, "grad_norm": 17.697364807128906, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8557525873184204, "num_tokens": 490693332.0, "step": 12865 }, { "epoch": 1.636687444345503, "ewc_loss": 0.029731936752796173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9731936592725106e-05, "grad_norm": 17.634544372558594, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8687773942947388, "num_tokens": 490733135.0, "step": 12866 }, { "epoch": 1.6368146546240936, "ewc_loss": 0.029736915603280067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.973691516672261e-05, "grad_norm": 17.690654754638672, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8739352226257324, "num_tokens": 490775505.0, "step": 12867 }, { "epoch": 1.6369418649026841, "ewc_loss": 0.02976464480161667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9764645660179667e-05, "grad_norm": 17.631790161132812, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8703368902206421, "num_tokens": 490817783.0, "step": 12868 }, { "epoch": 1.6370690751812746, "ewc_loss": 0.029786808416247368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.978680822707247e-05, "grad_norm": 17.79298973083496, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8648778796195984, "num_tokens": 490849464.0, "step": 12869 }, { "epoch": 1.6371962854598652, "ewc_loss": 0.029764829203486443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9764829378109425e-05, "grad_norm": 17.630016326904297, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.863974392414093, "num_tokens": 490891925.0, "step": 12870 }, { "epoch": 1.6373234957384557, "ewc_loss": 0.029719006270170212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9719005397055298e-05, "grad_norm": 17.758670806884766, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8638118505477905, "num_tokens": 490926221.0, "step": 12871 }, { "epoch": 1.6374507060170462, "ewc_loss": 0.029807530343532562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9807530154357664e-05, "grad_norm": 17.728267669677734, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8610193729400635, "num_tokens": 490964141.0, "step": 12872 }, { "epoch": 1.6375779162956365, "ewc_loss": 0.02974550612270832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9745506253675558e-05, "grad_norm": 17.790983200073242, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8665422201156616, "num_tokens": 490997917.0, "step": 12873 }, { "epoch": 1.637705126574227, "ewc_loss": 0.02976650930941105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.97665101243183e-05, "grad_norm": 17.68667984008789, "learning_rate": 1e-06, "loss": 0.3582, "mean_token_accuracy": 0.8841281533241272, "num_tokens": 491029244.0, "step": 12874 }, { "epoch": 1.6378323368528176, "ewc_loss": 0.029667643830180168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9667644412256777e-05, "grad_norm": 17.682100296020508, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8695085048675537, "num_tokens": 491066591.0, "step": 12875 }, { "epoch": 1.6379595471314081, "ewc_loss": 0.0297558531165123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9755852665402927e-05, "grad_norm": 17.713287353515625, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8692799806594849, "num_tokens": 491106546.0, "step": 12876 }, { "epoch": 1.6380867574099987, "ewc_loss": 0.029761062934994698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9761062251054682e-05, "grad_norm": 17.696535110473633, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8636981844902039, "num_tokens": 491145512.0, "step": 12877 }, { "epoch": 1.6382139676885892, "ewc_loss": 0.02978367544710636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9783675927319564e-05, "grad_norm": 17.74061393737793, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8685659170150757, "num_tokens": 491185624.0, "step": 12878 }, { "epoch": 1.6383411779671797, "ewc_loss": 0.029796786606311798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9796787202940322e-05, "grad_norm": 17.670896530151367, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8638806939125061, "num_tokens": 491220982.0, "step": 12879 }, { "epoch": 1.6384683882457702, "ewc_loss": 0.02970121055841446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.970121022372041e-05, "grad_norm": 17.697162628173828, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8725706338882446, "num_tokens": 491262501.0, "step": 12880 }, { "epoch": 1.6385955985243608, "ewc_loss": 0.029821189120411873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.982118894578889e-05, "grad_norm": 17.68438720703125, "learning_rate": 1e-06, "loss": 0.3613, "mean_token_accuracy": 0.8809101581573486, "num_tokens": 491298078.0, "step": 12881 }, { "epoch": 1.6387228088029513, "ewc_loss": 0.02979801595211029, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9798015020787716e-05, "grad_norm": 17.763935089111328, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.876926064491272, "num_tokens": 491336203.0, "step": 12882 }, { "epoch": 1.6388500190815418, "ewc_loss": 0.02979971282184124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9799712137901224e-05, "grad_norm": 17.702903747558594, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8591636419296265, "num_tokens": 491374723.0, "step": 12883 }, { "epoch": 1.6389772293601323, "ewc_loss": 0.029747596010565758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9747596272500232e-05, "grad_norm": 17.63919448852539, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8715702295303345, "num_tokens": 491410546.0, "step": 12884 }, { "epoch": 1.6391044396387229, "ewc_loss": 0.029762407764792442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9762408303213306e-05, "grad_norm": 17.7546443939209, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.866213858127594, "num_tokens": 491450642.0, "step": 12885 }, { "epoch": 1.6392316499173134, "ewc_loss": 0.029812302440404892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.981230318255257e-05, "grad_norm": 17.646038055419922, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8670889139175415, "num_tokens": 491489893.0, "step": 12886 }, { "epoch": 1.639358860195904, "ewc_loss": 0.029719745740294456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.971974572574254e-05, "grad_norm": 17.723358154296875, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8611999750137329, "num_tokens": 491535870.0, "step": 12887 }, { "epoch": 1.6394860704744945, "ewc_loss": 0.02978820912539959, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.97882088489132e-05, "grad_norm": 17.66802215576172, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8682374954223633, "num_tokens": 491575988.0, "step": 12888 }, { "epoch": 1.639613280753085, "ewc_loss": 0.02971538156270981, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9715381970163435e-05, "grad_norm": 17.724353790283203, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8715394139289856, "num_tokens": 491616822.0, "step": 12889 }, { "epoch": 1.6397404910316755, "ewc_loss": 0.02974957972764969, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9749578970950097e-05, "grad_norm": 17.650712966918945, "learning_rate": 1e-06, "loss": 0.3453, "mean_token_accuracy": 0.8895491361618042, "num_tokens": 491654132.0, "step": 12890 }, { "epoch": 1.6398677013102658, "ewc_loss": 0.029721345752477646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.972134643641766e-05, "grad_norm": 17.728425979614258, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.857337474822998, "num_tokens": 491697215.0, "step": 12891 }, { "epoch": 1.6399949115888564, "ewc_loss": 0.02974771149456501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9747710868832655e-05, "grad_norm": 17.608131408691406, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8488672971725464, "num_tokens": 491731338.0, "step": 12892 }, { "epoch": 1.6401221218674469, "ewc_loss": 0.029654860496520996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9654860554728657e-05, "grad_norm": 17.715486526489258, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8685389757156372, "num_tokens": 491770808.0, "step": 12893 }, { "epoch": 1.6402493321460374, "ewc_loss": 0.029774636030197144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.977463554998394e-05, "grad_norm": 17.73124885559082, "learning_rate": 1e-06, "loss": 0.3723, "mean_token_accuracy": 0.8771252632141113, "num_tokens": 491804167.0, "step": 12894 }, { "epoch": 1.640376542424628, "ewc_loss": 0.029682260006666183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9682259992114268e-05, "grad_norm": 17.716611862182617, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8705982565879822, "num_tokens": 491840094.0, "step": 12895 }, { "epoch": 1.6405037527032185, "ewc_loss": 0.029743105173110962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9743105187662877e-05, "grad_norm": 17.694908142089844, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.855643093585968, "num_tokens": 491881643.0, "step": 12896 }, { "epoch": 1.6406309629818088, "ewc_loss": 0.029653199017047882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.965319981740322e-05, "grad_norm": 17.69466781616211, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8579049110412598, "num_tokens": 491920951.0, "step": 12897 }, { "epoch": 1.6407581732603993, "ewc_loss": 0.02967863529920578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9678634746233e-05, "grad_norm": 17.73639678955078, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8663299083709717, "num_tokens": 491954599.0, "step": 12898 }, { "epoch": 1.6408853835389898, "ewc_loss": 0.029665600508451462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9665599868167192e-05, "grad_norm": 17.642141342163086, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8703576922416687, "num_tokens": 491995375.0, "step": 12899 }, { "epoch": 1.6410125938175804, "ewc_loss": 0.029678799211978912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.967879845527932e-05, "grad_norm": 17.713481903076172, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.865013837814331, "num_tokens": 492035297.0, "step": 12900 }, { "epoch": 1.6411398040961709, "ewc_loss": 0.029691828414797783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.969182787637692e-05, "grad_norm": 17.697805404663086, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8702546954154968, "num_tokens": 492071003.0, "step": 12901 }, { "epoch": 1.6412670143747614, "ewc_loss": 0.029718700796365738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9718701625824906e-05, "grad_norm": 17.673236846923828, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.869848370552063, "num_tokens": 492106046.0, "step": 12902 }, { "epoch": 1.641394224653352, "ewc_loss": 0.029737601056694984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9737600925727747e-05, "grad_norm": 17.70233917236328, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8810560703277588, "num_tokens": 492147252.0, "step": 12903 }, { "epoch": 1.6415214349319425, "ewc_loss": 0.029708925634622574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.970892637677025e-05, "grad_norm": 17.669858932495117, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8613744974136353, "num_tokens": 492188185.0, "step": 12904 }, { "epoch": 1.641648645210533, "ewc_loss": 0.029770638793706894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.977063923026435e-05, "grad_norm": 17.718521118164062, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8759756088256836, "num_tokens": 492219630.0, "step": 12905 }, { "epoch": 1.6417758554891235, "ewc_loss": 0.029706250876188278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9706250643357635e-05, "grad_norm": 17.621505737304688, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8721374273300171, "num_tokens": 492251955.0, "step": 12906 }, { "epoch": 1.641903065767714, "ewc_loss": 0.029790004715323448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.97900041914545e-05, "grad_norm": 17.68708038330078, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8726009130477905, "num_tokens": 492293484.0, "step": 12907 }, { "epoch": 1.6420302760463046, "ewc_loss": 0.02975345402956009, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.975345341837965e-05, "grad_norm": 17.622779846191406, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8550747632980347, "num_tokens": 492331169.0, "step": 12908 }, { "epoch": 1.642157486324895, "ewc_loss": 0.029825251549482346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9825250749127008e-05, "grad_norm": 17.760007858276367, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.877232551574707, "num_tokens": 492366071.0, "step": 12909 }, { "epoch": 1.6422846966034856, "ewc_loss": 0.02977898344397545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9778982934658416e-05, "grad_norm": 17.671201705932617, "learning_rate": 1e-06, "loss": 0.3419, "mean_token_accuracy": 0.8894754648208618, "num_tokens": 492396820.0, "step": 12910 }, { "epoch": 1.6424119068820762, "ewc_loss": 0.029777968302369118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9777967938571237e-05, "grad_norm": 17.729185104370117, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.876685380935669, "num_tokens": 492435282.0, "step": 12911 }, { "epoch": 1.6425391171606667, "ewc_loss": 0.02980811521410942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9808115868945606e-05, "grad_norm": 17.626184463500977, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8784891366958618, "num_tokens": 492470195.0, "step": 12912 }, { "epoch": 1.6426663274392572, "ewc_loss": 0.02977115474641323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9771154004265554e-05, "grad_norm": 17.679689407348633, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8643156290054321, "num_tokens": 492514265.0, "step": 12913 }, { "epoch": 1.6427935377178478, "ewc_loss": 0.02988624945282936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9886248739785515e-05, "grad_norm": 17.689311981201172, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8721973896026611, "num_tokens": 492550348.0, "step": 12914 }, { "epoch": 1.642920747996438, "ewc_loss": 0.02979482337832451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9794822694384493e-05, "grad_norm": 17.695480346679688, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.872290313243866, "num_tokens": 492590648.0, "step": 12915 }, { "epoch": 1.6430479582750286, "ewc_loss": 0.029818588867783546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9818589609931223e-05, "grad_norm": 17.766748428344727, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8716424703598022, "num_tokens": 492629523.0, "step": 12916 }, { "epoch": 1.6431751685536191, "ewc_loss": 0.02983628399670124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9836284738848917e-05, "grad_norm": 17.695960998535156, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8648692965507507, "num_tokens": 492668392.0, "step": 12917 }, { "epoch": 1.6433023788322096, "ewc_loss": 0.029795661568641663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9795661248499528e-05, "grad_norm": 17.76112174987793, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8597092032432556, "num_tokens": 492696073.0, "step": 12918 }, { "epoch": 1.6434295891108002, "ewc_loss": 0.029799990355968475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9799990443279967e-05, "grad_norm": 17.72233772277832, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8727545142173767, "num_tokens": 492728590.0, "step": 12919 }, { "epoch": 1.6435567993893907, "ewc_loss": 0.029804809018969536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.980480894620996e-05, "grad_norm": 17.718114852905273, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8621717691421509, "num_tokens": 492767290.0, "step": 12920 }, { "epoch": 1.6436840096679812, "ewc_loss": 0.029864398762583733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9864399039070122e-05, "grad_norm": 17.72121238708496, "learning_rate": 1e-06, "loss": 0.3642, "mean_token_accuracy": 0.8813509941101074, "num_tokens": 492801019.0, "step": 12921 }, { "epoch": 1.6438112199465715, "ewc_loss": 0.02982178144156933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9821781936334446e-05, "grad_norm": 17.701759338378906, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8695801496505737, "num_tokens": 492841220.0, "step": 12922 }, { "epoch": 1.643938430225162, "ewc_loss": 0.029778456315398216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9778455427731387e-05, "grad_norm": 17.676132202148438, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8602370023727417, "num_tokens": 492885076.0, "step": 12923 }, { "epoch": 1.6440656405037526, "ewc_loss": 0.02985459566116333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9854596505174413e-05, "grad_norm": 17.76793098449707, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.866020917892456, "num_tokens": 492918554.0, "step": 12924 }, { "epoch": 1.6441928507823431, "ewc_loss": 0.02979695051908493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.979695091198664e-05, "grad_norm": 17.649307250976562, "learning_rate": 1e-06, "loss": 0.3556, "mean_token_accuracy": 0.8864328861236572, "num_tokens": 492961890.0, "step": 12925 }, { "epoch": 1.6443200610609336, "ewc_loss": 0.029796428978443146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9796428862027824e-05, "grad_norm": 17.68402099609375, "learning_rate": 1e-06, "loss": 0.3723, "mean_token_accuracy": 0.8802343606948853, "num_tokens": 492997768.0, "step": 12926 }, { "epoch": 1.6444472713395242, "ewc_loss": 0.029804261401295662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9804261430399492e-05, "grad_norm": 17.65668487548828, "learning_rate": 1e-06, "loss": 0.3329, "mean_token_accuracy": 0.8940852284431458, "num_tokens": 493037017.0, "step": 12927 }, { "epoch": 1.6445744816181147, "ewc_loss": 0.02989031746983528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9890317819081247e-05, "grad_norm": 17.703243255615234, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8800528645515442, "num_tokens": 493074579.0, "step": 12928 }, { "epoch": 1.6447016918967052, "ewc_loss": 0.02987530082464218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9875300242565572e-05, "grad_norm": 17.777690887451172, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8710917830467224, "num_tokens": 493111249.0, "step": 12929 }, { "epoch": 1.6448289021752958, "ewc_loss": 0.029809292405843735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.98092927550897e-05, "grad_norm": 17.688087463378906, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8544638156890869, "num_tokens": 493150723.0, "step": 12930 }, { "epoch": 1.6449561124538863, "ewc_loss": 0.029768789187073708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9768789318040945e-05, "grad_norm": 17.802125930786133, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.868266224861145, "num_tokens": 493189530.0, "step": 12931 }, { "epoch": 1.6450833227324768, "ewc_loss": 0.029831457883119583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9831457140971906e-05, "grad_norm": 17.63088607788086, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.8778139352798462, "num_tokens": 493231369.0, "step": 12932 }, { "epoch": 1.6452105330110673, "ewc_loss": 0.02972325123846531, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9723250918323174e-05, "grad_norm": 17.739418029785156, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8561779260635376, "num_tokens": 493278430.0, "step": 12933 }, { "epoch": 1.6453377432896579, "ewc_loss": 0.029809292405843735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.98092927550897e-05, "grad_norm": 17.590925216674805, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8756887912750244, "num_tokens": 493312258.0, "step": 12934 }, { "epoch": 1.6454649535682484, "ewc_loss": 0.029814312234520912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9814311346854083e-05, "grad_norm": 17.762174606323242, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8695124983787537, "num_tokens": 493347725.0, "step": 12935 }, { "epoch": 1.645592163846839, "ewc_loss": 0.029854129999876022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9854130843887106e-05, "grad_norm": 17.58254051208496, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8730652332305908, "num_tokens": 493388119.0, "step": 12936 }, { "epoch": 1.6457193741254295, "ewc_loss": 0.029770858585834503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.977085932798218e-05, "grad_norm": 17.776132583618164, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8712136149406433, "num_tokens": 493424607.0, "step": 12937 }, { "epoch": 1.64584658440402, "ewc_loss": 0.029896488413214684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9896487831138074e-05, "grad_norm": 17.712112426757812, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8469905257225037, "num_tokens": 493459546.0, "step": 12938 }, { "epoch": 1.6459737946826105, "ewc_loss": 0.0297657772898674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9765777071588673e-05, "grad_norm": 17.78691864013672, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8734341859817505, "num_tokens": 493494162.0, "step": 12939 }, { "epoch": 1.6461010049612008, "ewc_loss": 0.029830535873770714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9830536732333712e-05, "grad_norm": 17.739215850830078, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8676500916481018, "num_tokens": 493537523.0, "step": 12940 }, { "epoch": 1.6462282152397913, "ewc_loss": 0.029760541394352913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9760542020085268e-05, "grad_norm": 17.65780258178711, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8686255216598511, "num_tokens": 493580065.0, "step": 12941 }, { "epoch": 1.6463554255183819, "ewc_loss": 0.029761573299765587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9761573387077078e-05, "grad_norm": 17.73493194580078, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8768773674964905, "num_tokens": 493626717.0, "step": 12942 }, { "epoch": 1.6464826357969724, "ewc_loss": 0.02977321855723858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.977321855723858e-05, "grad_norm": 17.6743106842041, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8760156035423279, "num_tokens": 493666266.0, "step": 12943 }, { "epoch": 1.646609846075563, "ewc_loss": 0.029747983440756798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9747983717243187e-05, "grad_norm": 17.7679443359375, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8603509068489075, "num_tokens": 493703184.0, "step": 12944 }, { "epoch": 1.6467370563541535, "ewc_loss": 0.029843781143426895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.984378079418093e-05, "grad_norm": 17.76313018798828, "learning_rate": 1e-06, "loss": 0.354, "mean_token_accuracy": 0.8850224614143372, "num_tokens": 493744061.0, "step": 12945 }, { "epoch": 1.6468642666327438, "ewc_loss": 0.02972712367773056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9727123546763323e-05, "grad_norm": 17.72270393371582, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8708685040473938, "num_tokens": 493783585.0, "step": 12946 }, { "epoch": 1.6469914769113343, "ewc_loss": 0.029747363179922104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9747363441856578e-05, "grad_norm": 17.7803955078125, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8767256140708923, "num_tokens": 493827781.0, "step": 12947 }, { "epoch": 1.6471186871899248, "ewc_loss": 0.029726047068834305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9726046705036424e-05, "grad_norm": 17.7579402923584, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8747955560684204, "num_tokens": 493864418.0, "step": 12948 }, { "epoch": 1.6472458974685154, "ewc_loss": 0.02967149019241333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9671489755855873e-05, "grad_norm": 17.669193267822266, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8707701563835144, "num_tokens": 493895226.0, "step": 12949 }, { "epoch": 1.6473731077471059, "ewc_loss": 0.02969822660088539, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9698227081098594e-05, "grad_norm": 17.768754959106445, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8617141246795654, "num_tokens": 493935210.0, "step": 12950 }, { "epoch": 1.6475003180256964, "ewc_loss": 0.029745742678642273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.974574272229802e-05, "grad_norm": 17.730573654174805, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8688149452209473, "num_tokens": 493966368.0, "step": 12951 }, { "epoch": 1.647627528304287, "ewc_loss": 0.02970093861222267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.970093919429928e-05, "grad_norm": 17.724000930786133, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8645464181900024, "num_tokens": 494004304.0, "step": 12952 }, { "epoch": 1.6477547385828775, "ewc_loss": 0.029760941863059998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9760942197754048e-05, "grad_norm": 17.69735336303711, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8767717480659485, "num_tokens": 494044758.0, "step": 12953 }, { "epoch": 1.647881948861468, "ewc_loss": 0.029695700854063034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.969570050481707e-05, "grad_norm": 17.773393630981445, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8554084897041321, "num_tokens": 494080624.0, "step": 12954 }, { "epoch": 1.6480091591400585, "ewc_loss": 0.029731445014476776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9731445465586148e-05, "grad_norm": 17.71772003173828, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8763518333435059, "num_tokens": 494116215.0, "step": 12955 }, { "epoch": 1.648136369418649, "ewc_loss": 0.029697496443986893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.969749584735837e-05, "grad_norm": 17.699459075927734, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8686049580574036, "num_tokens": 494150317.0, "step": 12956 }, { "epoch": 1.6482635796972396, "ewc_loss": 0.029706034809350967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9706034183618613e-05, "grad_norm": 17.656349182128906, "learning_rate": 1e-06, "loss": 0.3552, "mean_token_accuracy": 0.8842772841453552, "num_tokens": 494187273.0, "step": 12957 }, { "epoch": 1.64839078997583, "ewc_loss": 0.029726466163992882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9726466891588643e-05, "grad_norm": 17.7562198638916, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.866809070110321, "num_tokens": 494226921.0, "step": 12958 }, { "epoch": 1.6485180002544206, "ewc_loss": 0.029773499816656113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.977350050059613e-05, "grad_norm": 17.662429809570312, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8774338960647583, "num_tokens": 494263141.0, "step": 12959 }, { "epoch": 1.6486452105330112, "ewc_loss": 0.02971091866493225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9710918170167133e-05, "grad_norm": 17.745162963867188, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.86701500415802, "num_tokens": 494307925.0, "step": 12960 }, { "epoch": 1.6487724208116017, "ewc_loss": 0.029781892895698547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.978189331770409e-05, "grad_norm": 17.714584350585938, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8647509813308716, "num_tokens": 494353428.0, "step": 12961 }, { "epoch": 1.6488996310901922, "ewc_loss": 0.02971479296684265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9714792617596686e-05, "grad_norm": 17.66438865661621, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.858483076095581, "num_tokens": 494383559.0, "step": 12962 }, { "epoch": 1.6490268413687827, "ewc_loss": 0.029799340292811394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.97993410640629e-05, "grad_norm": 17.7418212890625, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8730865716934204, "num_tokens": 494419564.0, "step": 12963 }, { "epoch": 1.649154051647373, "ewc_loss": 0.02978372573852539, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.978372504003346e-05, "grad_norm": 17.686460494995117, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8725804686546326, "num_tokens": 494466886.0, "step": 12964 }, { "epoch": 1.6492812619259636, "ewc_loss": 0.029767636209726334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9767636078759097e-05, "grad_norm": 17.658227920532227, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8532512784004211, "num_tokens": 494506672.0, "step": 12965 }, { "epoch": 1.649408472204554, "ewc_loss": 0.029821086674928665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.982108708238229e-05, "grad_norm": 17.75783920288086, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8702651262283325, "num_tokens": 494541881.0, "step": 12966 }, { "epoch": 1.6495356824831446, "ewc_loss": 0.029788345098495483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9788345273118466e-05, "grad_norm": 17.692012786865234, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8753117322921753, "num_tokens": 494583546.0, "step": 12967 }, { "epoch": 1.6496628927617352, "ewc_loss": 0.029748188331723213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9748187444056384e-05, "grad_norm": 17.67247772216797, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8587912321090698, "num_tokens": 494616625.0, "step": 12968 }, { "epoch": 1.6497901030403257, "ewc_loss": 0.0298349279910326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9834927772753872e-05, "grad_norm": 17.7807674407959, "learning_rate": 1e-06, "loss": 0.3638, "mean_token_accuracy": 0.8832324147224426, "num_tokens": 494654590.0, "step": 12969 }, { "epoch": 1.6499173133189162, "ewc_loss": 0.02981669455766678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.981669422297273e-05, "grad_norm": 17.76914405822754, "learning_rate": 1e-06, "loss": 0.3573, "mean_token_accuracy": 0.8839327692985535, "num_tokens": 494692471.0, "step": 12970 }, { "epoch": 1.6500445235975065, "ewc_loss": 0.02980070374906063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9800703487126157e-05, "grad_norm": 17.762529373168945, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8630268573760986, "num_tokens": 494730347.0, "step": 12971 }, { "epoch": 1.650171733876097, "ewc_loss": 0.02984258346259594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9842583899153396e-05, "grad_norm": 17.787914276123047, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8589502573013306, "num_tokens": 494763753.0, "step": 12972 }, { "epoch": 1.6502989441546876, "ewc_loss": 0.029812810942530632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9812810680596158e-05, "grad_norm": 17.827619552612305, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8653466105461121, "num_tokens": 494800416.0, "step": 12973 }, { "epoch": 1.6504261544332781, "ewc_loss": 0.029792338609695435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.979233795485925e-05, "grad_norm": 17.79189682006836, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8683751225471497, "num_tokens": 494842733.0, "step": 12974 }, { "epoch": 1.6505533647118686, "ewc_loss": 0.029733356088399887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.973335540445987e-05, "grad_norm": 17.768827438354492, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8631709218025208, "num_tokens": 494878661.0, "step": 12975 }, { "epoch": 1.6506805749904592, "ewc_loss": 0.029755355790257454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.975535608129576e-05, "grad_norm": 17.704694747924805, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8630260825157166, "num_tokens": 494918142.0, "step": 12976 }, { "epoch": 1.6508077852690497, "ewc_loss": 0.029769154265522957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9769154934911057e-05, "grad_norm": 17.82370376586914, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8694075345993042, "num_tokens": 494956947.0, "step": 12977 }, { "epoch": 1.6509349955476402, "ewc_loss": 0.029781406745314598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9781407647533342e-05, "grad_norm": 17.80379867553711, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8650544881820679, "num_tokens": 494989850.0, "step": 12978 }, { "epoch": 1.6510622058262308, "ewc_loss": 0.029720958322286606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9720958991674706e-05, "grad_norm": 17.707914352416992, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8617757558822632, "num_tokens": 495026770.0, "step": 12979 }, { "epoch": 1.6511894161048213, "ewc_loss": 0.029763156548142433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9763155907858163e-05, "grad_norm": 17.79258918762207, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8808082938194275, "num_tokens": 495065009.0, "step": 12980 }, { "epoch": 1.6513166263834118, "ewc_loss": 0.02976912073791027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.976912037411239e-05, "grad_norm": 17.807241439819336, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8712876439094543, "num_tokens": 495096914.0, "step": 12981 }, { "epoch": 1.6514438366620023, "ewc_loss": 0.02971087396144867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9710874514421448e-05, "grad_norm": 17.782901763916016, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8738301992416382, "num_tokens": 495138348.0, "step": 12982 }, { "epoch": 1.6515710469405929, "ewc_loss": 0.029724350199103355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9724349587922916e-05, "grad_norm": 17.700014114379883, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8632797002792358, "num_tokens": 495177026.0, "step": 12983 }, { "epoch": 1.6516982572191834, "ewc_loss": 0.029727403074502945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.972740367113147e-05, "grad_norm": 17.759233474731445, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8722472190856934, "num_tokens": 495213850.0, "step": 12984 }, { "epoch": 1.651825467497774, "ewc_loss": 0.0297494288533926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9749427994829603e-05, "grad_norm": 17.698259353637695, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.878445029258728, "num_tokens": 495257213.0, "step": 12985 }, { "epoch": 1.6519526777763645, "ewc_loss": 0.029752766713500023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.975276584038511e-05, "grad_norm": 17.726543426513672, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8607786893844604, "num_tokens": 495300018.0, "step": 12986 }, { "epoch": 1.652079888054955, "ewc_loss": 0.029791824519634247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.979182499984745e-05, "grad_norm": 17.816102981567383, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8684841394424438, "num_tokens": 495336471.0, "step": 12987 }, { "epoch": 1.6522070983335455, "ewc_loss": 0.029699673876166344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9699673177674413e-05, "grad_norm": 17.578819274902344, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8729689121246338, "num_tokens": 495374680.0, "step": 12988 }, { "epoch": 1.6523343086121358, "ewc_loss": 0.029732711613178253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9732711482211016e-05, "grad_norm": 17.81745147705078, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.873071014881134, "num_tokens": 495414096.0, "step": 12989 }, { "epoch": 1.6524615188907263, "ewc_loss": 0.029816854745149612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.981685429404024e-05, "grad_norm": 17.661767959594727, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8576985597610474, "num_tokens": 495452955.0, "step": 12990 }, { "epoch": 1.6525887291693169, "ewc_loss": 0.02967832237482071, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.967832188005559e-05, "grad_norm": 17.690074920654297, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8668755888938904, "num_tokens": 495490091.0, "step": 12991 }, { "epoch": 1.6527159394479074, "ewc_loss": 0.029832692816853523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9832692234776914e-05, "grad_norm": 17.697509765625, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8776720762252808, "num_tokens": 495524543.0, "step": 12992 }, { "epoch": 1.652843149726498, "ewc_loss": 0.029774146154522896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9774146241834387e-05, "grad_norm": 17.749496459960938, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8608114123344421, "num_tokens": 495562068.0, "step": 12993 }, { "epoch": 1.6529703600050885, "ewc_loss": 0.029782433062791824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9782433557556942e-05, "grad_norm": 17.714252471923828, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.876955509185791, "num_tokens": 495598355.0, "step": 12994 }, { "epoch": 1.6530975702836788, "ewc_loss": 0.029736416414380074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.973641676362604e-05, "grad_norm": 17.779653549194336, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8561475276947021, "num_tokens": 495634735.0, "step": 12995 }, { "epoch": 1.6532247805622693, "ewc_loss": 0.02974189631640911, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.974189555970952e-05, "grad_norm": 17.628131866455078, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8694603443145752, "num_tokens": 495668804.0, "step": 12996 }, { "epoch": 1.6533519908408598, "ewc_loss": 0.029751714318990707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.975171446450986e-05, "grad_norm": 17.741559982299805, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8750724792480469, "num_tokens": 495707129.0, "step": 12997 }, { "epoch": 1.6534792011194503, "ewc_loss": 0.029772523790597916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9772523703286424e-05, "grad_norm": 17.66881561279297, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.849270224571228, "num_tokens": 495746501.0, "step": 12998 }, { "epoch": 1.6536064113980409, "ewc_loss": 0.029760390520095825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9760391043964773e-05, "grad_norm": 17.75189208984375, "learning_rate": 1e-06, "loss": 0.3619, "mean_token_accuracy": 0.8856625556945801, "num_tokens": 495789723.0, "step": 12999 }, { "epoch": 1.6537336216766314, "ewc_loss": 0.029804179444909096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9804179575876333e-05, "grad_norm": 17.72374153137207, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.860923171043396, "num_tokens": 495827073.0, "step": 13000 }, { "epoch": 1.653860831955222, "ewc_loss": 0.029811756685376167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9811757485731505e-05, "grad_norm": 17.757726669311523, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8594440221786499, "num_tokens": 495862362.0, "step": 13001 }, { "epoch": 1.6539880422338125, "ewc_loss": 0.02985966205596924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.985966239066329e-05, "grad_norm": 17.75358772277832, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8725346922874451, "num_tokens": 495905546.0, "step": 13002 }, { "epoch": 1.654115252512403, "ewc_loss": 0.029857397079467773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9857397748855874e-05, "grad_norm": 17.769893646240234, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8682180643081665, "num_tokens": 495942291.0, "step": 13003 }, { "epoch": 1.6542424627909935, "ewc_loss": 0.029819142073392868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.98191425827099e-05, "grad_norm": 17.7403507232666, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.859507143497467, "num_tokens": 495978631.0, "step": 13004 }, { "epoch": 1.654369673069584, "ewc_loss": 0.029810823500156403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9810824344167486e-05, "grad_norm": 17.722700119018555, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8639557361602783, "num_tokens": 496016626.0, "step": 13005 }, { "epoch": 1.6544968833481746, "ewc_loss": 0.029851973056793213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.98519735224545e-05, "grad_norm": 17.724987030029297, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8622957468032837, "num_tokens": 496051949.0, "step": 13006 }, { "epoch": 1.654624093626765, "ewc_loss": 0.02983386628329754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.98338654829422e-05, "grad_norm": 17.738117218017578, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8751875758171082, "num_tokens": 496092693.0, "step": 13007 }, { "epoch": 1.6547513039053556, "ewc_loss": 0.02986830100417137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9868300771340728e-05, "grad_norm": 17.658315658569336, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.863511323928833, "num_tokens": 496132853.0, "step": 13008 }, { "epoch": 1.6548785141839462, "ewc_loss": 0.029860127717256546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9860128051950596e-05, "grad_norm": 17.697940826416016, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8545100688934326, "num_tokens": 496168303.0, "step": 13009 }, { "epoch": 1.6550057244625367, "ewc_loss": 0.029908910393714905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.990890970977489e-05, "grad_norm": 17.725996017456055, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8579090237617493, "num_tokens": 496206132.0, "step": 13010 }, { "epoch": 1.6551329347411272, "ewc_loss": 0.029865004122257233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9865004762541503e-05, "grad_norm": 17.741352081298828, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8631991744041443, "num_tokens": 496247542.0, "step": 13011 }, { "epoch": 1.6552601450197177, "ewc_loss": 0.029940949752926826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9940949389128946e-05, "grad_norm": 17.738204956054688, "learning_rate": 1e-06, "loss": 0.3738, "mean_token_accuracy": 0.8793087005615234, "num_tokens": 496285698.0, "step": 13012 }, { "epoch": 1.655387355298308, "ewc_loss": 0.029869718477129936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.986971776408609e-05, "grad_norm": 17.769004821777344, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8716679811477661, "num_tokens": 496322216.0, "step": 13013 }, { "epoch": 1.6555145655768986, "ewc_loss": 0.029908621683716774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.990862230944913e-05, "grad_norm": 17.632932662963867, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8674940466880798, "num_tokens": 496357687.0, "step": 13014 }, { "epoch": 1.655641775855489, "ewc_loss": 0.029844123870134354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.98441245831782e-05, "grad_norm": 17.715675354003906, "learning_rate": 1e-06, "loss": 0.3686, "mean_token_accuracy": 0.8811759948730469, "num_tokens": 496400875.0, "step": 13015 }, { "epoch": 1.6557689861340796, "ewc_loss": 0.02995164692401886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.99516468658112e-05, "grad_norm": 17.738618850708008, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8721067905426025, "num_tokens": 496432076.0, "step": 13016 }, { "epoch": 1.6558961964126702, "ewc_loss": 0.029830394312739372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9830394851160236e-05, "grad_norm": 17.65700340270996, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8657374978065491, "num_tokens": 496466852.0, "step": 13017 }, { "epoch": 1.6560234066912607, "ewc_loss": 0.029930030927062035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9930031814728864e-05, "grad_norm": 17.802112579345703, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8675218224525452, "num_tokens": 496508890.0, "step": 13018 }, { "epoch": 1.6561506169698512, "ewc_loss": 0.029913710430264473, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9913710022810847e-05, "grad_norm": 17.68529510498047, "learning_rate": 1e-06, "loss": 0.3611, "mean_token_accuracy": 0.8848199844360352, "num_tokens": 496537964.0, "step": 13019 }, { "epoch": 1.6562778272484415, "ewc_loss": 0.029898717999458313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.989871791214682e-05, "grad_norm": 17.72462272644043, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8532450199127197, "num_tokens": 496571263.0, "step": 13020 }, { "epoch": 1.656405037527032, "ewc_loss": 0.02993268147110939, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.993268208228983e-05, "grad_norm": 17.717052459716797, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8782466650009155, "num_tokens": 496610058.0, "step": 13021 }, { "epoch": 1.6565322478056226, "ewc_loss": 0.029908165335655212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.990816574310884e-05, "grad_norm": 17.722274780273438, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8622928261756897, "num_tokens": 496640542.0, "step": 13022 }, { "epoch": 1.656659458084213, "ewc_loss": 0.02991623990237713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.991624023707118e-05, "grad_norm": 17.69851303100586, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8686996698379517, "num_tokens": 496678030.0, "step": 13023 }, { "epoch": 1.6567866683628036, "ewc_loss": 0.029961759224534035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.996175862790551e-05, "grad_norm": 17.757022857666016, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8653440475463867, "num_tokens": 496714057.0, "step": 13024 }, { "epoch": 1.6569138786413942, "ewc_loss": 0.02997756563127041, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9977565645822324e-05, "grad_norm": 17.73944854736328, "learning_rate": 1e-06, "loss": 0.3615, "mean_token_accuracy": 0.8839864134788513, "num_tokens": 496744501.0, "step": 13025 }, { "epoch": 1.6570410889199847, "ewc_loss": 0.029919682070612907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9919681765022688e-05, "grad_norm": 17.740333557128906, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8742923736572266, "num_tokens": 496780233.0, "step": 13026 }, { "epoch": 1.6571682991985752, "ewc_loss": 0.02993427962064743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9934279154986143e-05, "grad_norm": 17.736740112304688, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8703119158744812, "num_tokens": 496809645.0, "step": 13027 }, { "epoch": 1.6572955094771658, "ewc_loss": 0.029896395280957222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9896395062678494e-05, "grad_norm": 17.671354293823242, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8701140284538269, "num_tokens": 496854871.0, "step": 13028 }, { "epoch": 1.6574227197557563, "ewc_loss": 0.029973940923810005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9973940399941057e-05, "grad_norm": 17.754945755004883, "learning_rate": 1e-06, "loss": 0.355, "mean_token_accuracy": 0.8851979970932007, "num_tokens": 496891958.0, "step": 13029 }, { "epoch": 1.6575499300343468, "ewc_loss": 0.02995891682803631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9958917366457172e-05, "grad_norm": 17.648225784301758, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.862786054611206, "num_tokens": 496928550.0, "step": 13030 }, { "epoch": 1.6576771403129373, "ewc_loss": 0.0299659613519907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.99659604934277e-05, "grad_norm": 17.780162811279297, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8682488203048706, "num_tokens": 496972009.0, "step": 13031 }, { "epoch": 1.6578043505915279, "ewc_loss": 0.03001973405480385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0019733458175324e-05, "grad_norm": 17.738292694091797, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.868032693862915, "num_tokens": 497013605.0, "step": 13032 }, { "epoch": 1.6579315608701184, "ewc_loss": 0.029932867735624313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.993286761920899e-05, "grad_norm": 17.833498001098633, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8705385327339172, "num_tokens": 497048712.0, "step": 13033 }, { "epoch": 1.658058771148709, "ewc_loss": 0.029985956847667694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.998595664394088e-05, "grad_norm": 17.708269119262695, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8828325271606445, "num_tokens": 497089161.0, "step": 13034 }, { "epoch": 1.6581859814272994, "ewc_loss": 0.029891086742281914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9891087251598947e-05, "grad_norm": 17.825244903564453, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8418365716934204, "num_tokens": 497127253.0, "step": 13035 }, { "epoch": 1.65831319170589, "ewc_loss": 0.029964743182063103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.996474358951673e-05, "grad_norm": 17.68507194519043, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.866716742515564, "num_tokens": 497167637.0, "step": 13036 }, { "epoch": 1.6584404019844805, "ewc_loss": 0.02981376461684704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9813763831043616e-05, "grad_norm": 17.745847702026367, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8694514036178589, "num_tokens": 497205663.0, "step": 13037 }, { "epoch": 1.6585676122630708, "ewc_loss": 0.029934624210000038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9934624762972817e-05, "grad_norm": 17.71219825744629, "learning_rate": 1e-06, "loss": 0.3521, "mean_token_accuracy": 0.8864865899085999, "num_tokens": 497242146.0, "step": 13038 }, { "epoch": 1.6586948225416613, "ewc_loss": 0.02980482205748558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9804821679135785e-05, "grad_norm": 17.664520263671875, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8741724491119385, "num_tokens": 497279339.0, "step": 13039 }, { "epoch": 1.6588220328202519, "ewc_loss": 0.029856059700250626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9856058972654864e-05, "grad_norm": 17.777294158935547, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8609984517097473, "num_tokens": 497318530.0, "step": 13040 }, { "epoch": 1.6589492430988424, "ewc_loss": 0.029882771894335747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.988277265103534e-05, "grad_norm": 17.788476943969727, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8637319207191467, "num_tokens": 497352943.0, "step": 13041 }, { "epoch": 1.659076453377433, "ewc_loss": 0.029843401163816452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.984340062539559e-05, "grad_norm": 17.718402862548828, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8632158041000366, "num_tokens": 497387729.0, "step": 13042 }, { "epoch": 1.6592036636560235, "ewc_loss": 0.02986140176653862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.986140134453308e-05, "grad_norm": 17.720834732055664, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8712787628173828, "num_tokens": 497427875.0, "step": 13043 }, { "epoch": 1.6593308739346138, "ewc_loss": 0.029824940487742424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9824939701939e-05, "grad_norm": 17.699316024780273, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8625779747962952, "num_tokens": 497469076.0, "step": 13044 }, { "epoch": 1.6594580842132043, "ewc_loss": 0.029888615012168884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9888615244999528e-05, "grad_norm": 17.73457908630371, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8709636926651001, "num_tokens": 497509556.0, "step": 13045 }, { "epoch": 1.6595852944917948, "ewc_loss": 0.0298562403768301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.985624087159522e-05, "grad_norm": 17.690013885498047, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8634465932846069, "num_tokens": 497548151.0, "step": 13046 }, { "epoch": 1.6597125047703853, "ewc_loss": 0.02991420030593872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9914201149949804e-05, "grad_norm": 17.726747512817383, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8436864614486694, "num_tokens": 497589088.0, "step": 13047 }, { "epoch": 1.6598397150489759, "ewc_loss": 0.02991633675992489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9916336643509567e-05, "grad_norm": 17.796995162963867, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8785508871078491, "num_tokens": 497632095.0, "step": 13048 }, { "epoch": 1.6599669253275664, "ewc_loss": 0.029904797673225403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9904796974733472e-05, "grad_norm": 17.739398956298828, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8810861110687256, "num_tokens": 497665148.0, "step": 13049 }, { "epoch": 1.660094135606157, "ewc_loss": 0.02984645776450634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.984645834658295e-05, "grad_norm": 17.683134078979492, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8642563819885254, "num_tokens": 497700134.0, "step": 13050 }, { "epoch": 1.6602213458847475, "ewc_loss": 0.02991388738155365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.991388646478299e-05, "grad_norm": 17.744670867919922, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8743360042572021, "num_tokens": 497743101.0, "step": 13051 }, { "epoch": 1.660348556163338, "ewc_loss": 0.02991369739174843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9913697289885022e-05, "grad_norm": 17.724411010742188, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8626387119293213, "num_tokens": 497775112.0, "step": 13052 }, { "epoch": 1.6604757664419285, "ewc_loss": 0.029921121895313263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9921122404630296e-05, "grad_norm": 17.735008239746094, "learning_rate": 1e-06, "loss": 0.3673, "mean_token_accuracy": 0.8805153369903564, "num_tokens": 497813320.0, "step": 13053 }, { "epoch": 1.660602976720519, "ewc_loss": 0.029852543026208878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.985254286613781e-05, "grad_norm": 17.664939880371094, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8650893568992615, "num_tokens": 497852780.0, "step": 13054 }, { "epoch": 1.6607301869991096, "ewc_loss": 0.029882581904530525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9882581657147966e-05, "grad_norm": 17.73781967163086, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8709658980369568, "num_tokens": 497886313.0, "step": 13055 }, { "epoch": 1.6608573972777, "ewc_loss": 0.02993415854871273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.993415910168551e-05, "grad_norm": 17.686378479003906, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8670404553413391, "num_tokens": 497924108.0, "step": 13056 }, { "epoch": 1.6609846075562906, "ewc_loss": 0.02988518960773945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.988519008795265e-05, "grad_norm": 17.68949317932129, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8589684963226318, "num_tokens": 497964656.0, "step": 13057 }, { "epoch": 1.6611118178348812, "ewc_loss": 0.029924515634775162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.992451481986791e-05, "grad_norm": 17.691390991210938, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8561344146728516, "num_tokens": 498005436.0, "step": 13058 }, { "epoch": 1.6612390281134717, "ewc_loss": 0.029921647161245346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9921646273578517e-05, "grad_norm": 17.714683532714844, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.865443229675293, "num_tokens": 498042237.0, "step": 13059 }, { "epoch": 1.6613662383920622, "ewc_loss": 0.029939254745841026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.993925409100484e-05, "grad_norm": 17.69515037536621, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8687593340873718, "num_tokens": 498083008.0, "step": 13060 }, { "epoch": 1.6614934486706527, "ewc_loss": 0.029956426471471786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9956427169963717e-05, "grad_norm": 17.73665428161621, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8463798761367798, "num_tokens": 498120550.0, "step": 13061 }, { "epoch": 1.661620658949243, "ewc_loss": 0.029972849413752556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.997284900629893e-05, "grad_norm": 17.729928970336914, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8669635057449341, "num_tokens": 498153033.0, "step": 13062 }, { "epoch": 1.6617478692278336, "ewc_loss": 0.02995205484330654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9952054319437593e-05, "grad_norm": 17.70408058166504, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8710215091705322, "num_tokens": 498190491.0, "step": 13063 }, { "epoch": 1.661875079506424, "ewc_loss": 0.0299203060567379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9920305678388104e-05, "grad_norm": 17.72966194152832, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8629190921783447, "num_tokens": 498235889.0, "step": 13064 }, { "epoch": 1.6620022897850146, "ewc_loss": 0.029937928542494774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9937928047729656e-05, "grad_norm": 17.686782836914062, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.8660545349121094, "num_tokens": 498277191.0, "step": 13065 }, { "epoch": 1.6621295000636052, "ewc_loss": 0.029905201867222786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9905202609370463e-05, "grad_norm": 17.76286506652832, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8585042953491211, "num_tokens": 498318443.0, "step": 13066 }, { "epoch": 1.6622567103421957, "ewc_loss": 0.029903491958975792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.990349275933113e-05, "grad_norm": 17.701045989990234, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8575794696807861, "num_tokens": 498355955.0, "step": 13067 }, { "epoch": 1.662383920620786, "ewc_loss": 0.029878174886107445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9878174245823175e-05, "grad_norm": 17.80030632019043, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8701450824737549, "num_tokens": 498388588.0, "step": 13068 }, { "epoch": 1.6625111308993765, "ewc_loss": 0.02998543344438076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.998543277499266e-05, "grad_norm": 17.861129760742188, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8678402900695801, "num_tokens": 498430661.0, "step": 13069 }, { "epoch": 1.662638341177967, "ewc_loss": 0.029865195974707603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9865195756428875e-05, "grad_norm": 17.720746994018555, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8572295904159546, "num_tokens": 498470428.0, "step": 13070 }, { "epoch": 1.6627655514565576, "ewc_loss": 0.029827069491147995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9827069738530554e-05, "grad_norm": 17.776201248168945, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8686150312423706, "num_tokens": 498512031.0, "step": 13071 }, { "epoch": 1.662892761735148, "ewc_loss": 0.029861915856599808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9861916118534282e-05, "grad_norm": 17.741994857788086, "learning_rate": 1e-06, "loss": 0.3464, "mean_token_accuracy": 0.8912230730056763, "num_tokens": 498548344.0, "step": 13072 }, { "epoch": 1.6630199720137386, "ewc_loss": 0.029805932193994522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.980593308166135e-05, "grad_norm": 17.768354415893555, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8824701905250549, "num_tokens": 498590115.0, "step": 13073 }, { "epoch": 1.6631471822923292, "ewc_loss": 0.029869986698031425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9869986974517815e-05, "grad_norm": 17.852203369140625, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8695345520973206, "num_tokens": 498634034.0, "step": 13074 }, { "epoch": 1.6632743925709197, "ewc_loss": 0.02978481538593769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9784814614686184e-05, "grad_norm": 17.774478912353516, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8620947599411011, "num_tokens": 498675086.0, "step": 13075 }, { "epoch": 1.6634016028495102, "ewc_loss": 0.02970471978187561, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.970471905427985e-05, "grad_norm": 17.734024047851562, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8752133846282959, "num_tokens": 498711374.0, "step": 13076 }, { "epoch": 1.6635288131281007, "ewc_loss": 0.029775165021419525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9775164875900373e-05, "grad_norm": 17.797826766967773, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8740078806877136, "num_tokens": 498753050.0, "step": 13077 }, { "epoch": 1.6636560234066913, "ewc_loss": 0.02981419675052166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.981419675052166e-05, "grad_norm": 17.797937393188477, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8702071905136108, "num_tokens": 498790406.0, "step": 13078 }, { "epoch": 1.6637832336852818, "ewc_loss": 0.02970801666378975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9708016882068478e-05, "grad_norm": 17.728437423706055, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8682440519332886, "num_tokens": 498827712.0, "step": 13079 }, { "epoch": 1.6639104439638723, "ewc_loss": 0.0297527015209198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9752702175755985e-05, "grad_norm": 17.835466384887695, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8735687732696533, "num_tokens": 498862283.0, "step": 13080 }, { "epoch": 1.6640376542424629, "ewc_loss": 0.029724199324846268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.972419861180242e-05, "grad_norm": 17.710119247436523, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8702594041824341, "num_tokens": 498902686.0, "step": 13081 }, { "epoch": 1.6641648645210534, "ewc_loss": 0.02973072975873947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9730730602750555e-05, "grad_norm": 17.76671028137207, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8582837581634521, "num_tokens": 498942785.0, "step": 13082 }, { "epoch": 1.664292074799644, "ewc_loss": 0.029766542837023735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9766542866127566e-05, "grad_norm": 17.715984344482422, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.873803436756134, "num_tokens": 498982761.0, "step": 13083 }, { "epoch": 1.6644192850782344, "ewc_loss": 0.029760047793388367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9760047254967503e-05, "grad_norm": 17.80175018310547, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8513036966323853, "num_tokens": 499025762.0, "step": 13084 }, { "epoch": 1.664546495356825, "ewc_loss": 0.02977449633181095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9774495487799868e-05, "grad_norm": 17.69965934753418, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8602668642997742, "num_tokens": 499063394.0, "step": 13085 }, { "epoch": 1.6646737056354155, "ewc_loss": 0.029745718464255333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9745719075435773e-05, "grad_norm": 17.71381378173828, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.879297137260437, "num_tokens": 499105824.0, "step": 13086 }, { "epoch": 1.6648009159140058, "ewc_loss": 0.02975377067923546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9753769922535866e-05, "grad_norm": 17.820789337158203, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.877449631690979, "num_tokens": 499140935.0, "step": 13087 }, { "epoch": 1.6649281261925963, "ewc_loss": 0.029763134196400642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.976313407998532e-05, "grad_norm": 17.73926544189453, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8682374954223633, "num_tokens": 499179492.0, "step": 13088 }, { "epoch": 1.6650553364711869, "ewc_loss": 0.029711319133639336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9711318347835913e-05, "grad_norm": 17.722774505615234, "learning_rate": 1e-06, "loss": 0.3789, "mean_token_accuracy": 0.8798395395278931, "num_tokens": 499214716.0, "step": 13089 }, { "epoch": 1.6651825467497774, "ewc_loss": 0.029751814901828766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9751814508927055e-05, "grad_norm": 17.752004623413086, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8758586645126343, "num_tokens": 499250337.0, "step": 13090 }, { "epoch": 1.665309757028368, "ewc_loss": 0.029762977734208107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9762977646896616e-05, "grad_norm": 17.69278907775879, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8739938735961914, "num_tokens": 499283675.0, "step": 13091 }, { "epoch": 1.6654369673069584, "ewc_loss": 0.029820706695318222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.982070691359695e-05, "grad_norm": 17.811750411987305, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8738415241241455, "num_tokens": 499319153.0, "step": 13092 }, { "epoch": 1.6655641775855488, "ewc_loss": 0.029808850958943367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.980885074066464e-05, "grad_norm": 17.722007751464844, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.866824746131897, "num_tokens": 499356675.0, "step": 13093 }, { "epoch": 1.6656913878641393, "ewc_loss": 0.029784630984067917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9784630896756425e-05, "grad_norm": 17.691858291625977, "learning_rate": 1e-06, "loss": 0.3582, "mean_token_accuracy": 0.8840399384498596, "num_tokens": 499392129.0, "step": 13094 }, { "epoch": 1.6658185981427298, "ewc_loss": 0.029855847358703613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9855847969884053e-05, "grad_norm": 17.801280975341797, "learning_rate": 1e-06, "loss": 0.3682, "mean_token_accuracy": 0.8811554908752441, "num_tokens": 499424958.0, "step": 13095 }, { "epoch": 1.6659458084213203, "ewc_loss": 0.029830021783709526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.983002195833251e-05, "grad_norm": 17.634328842163086, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.8780689239501953, "num_tokens": 499467769.0, "step": 13096 }, { "epoch": 1.6660730186999109, "ewc_loss": 0.029833683744072914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9833683584001847e-05, "grad_norm": 17.831939697265625, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8602626323699951, "num_tokens": 499513828.0, "step": 13097 }, { "epoch": 1.6662002289785014, "ewc_loss": 0.029856037348508835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.985603714478202e-05, "grad_norm": 17.661970138549805, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8748536705970764, "num_tokens": 499548219.0, "step": 13098 }, { "epoch": 1.666327439257092, "ewc_loss": 0.029837345704436302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9837345209671184e-05, "grad_norm": 17.763965606689453, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.852570652961731, "num_tokens": 499587158.0, "step": 13099 }, { "epoch": 1.6664546495356825, "ewc_loss": 0.02997610531747341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9976104997331277e-05, "grad_norm": 17.80190658569336, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8723949193954468, "num_tokens": 499625235.0, "step": 13100 }, { "epoch": 1.666581859814273, "ewc_loss": 0.029845604673027992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9845605240552686e-05, "grad_norm": 17.791898727416992, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8505406975746155, "num_tokens": 499662842.0, "step": 13101 }, { "epoch": 1.6667090700928635, "ewc_loss": 0.02986898645758629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9868986530345865e-05, "grad_norm": 17.714069366455078, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8463660478591919, "num_tokens": 499703464.0, "step": 13102 }, { "epoch": 1.666836280371454, "ewc_loss": 0.029849134385585785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9849134079995565e-05, "grad_norm": 17.846981048583984, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.868247389793396, "num_tokens": 499739160.0, "step": 13103 }, { "epoch": 1.6669634906500446, "ewc_loss": 0.02989603579044342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9896036721765995e-05, "grad_norm": 17.745853424072266, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8554793000221252, "num_tokens": 499775024.0, "step": 13104 }, { "epoch": 1.667090700928635, "ewc_loss": 0.02985771931707859, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.98577197099803e-05, "grad_norm": 17.79044532775879, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8730359077453613, "num_tokens": 499816479.0, "step": 13105 }, { "epoch": 1.6672179112072256, "ewc_loss": 0.02990332804620266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9903327231295407e-05, "grad_norm": 17.720726013183594, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8705105185508728, "num_tokens": 499848790.0, "step": 13106 }, { "epoch": 1.6673451214858162, "ewc_loss": 0.029810907319188118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.981090801768005e-05, "grad_norm": 17.750295639038086, "learning_rate": 1e-06, "loss": 0.351, "mean_token_accuracy": 0.8888952136039734, "num_tokens": 499890955.0, "step": 13107 }, { "epoch": 1.6674723317644067, "ewc_loss": 0.029909227043390274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9909226213931106e-05, "grad_norm": 17.736875534057617, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.8786486387252808, "num_tokens": 499924813.0, "step": 13108 }, { "epoch": 1.6675995420429972, "ewc_loss": 0.029836803674697876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9836803150828928e-05, "grad_norm": 17.830900192260742, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8665950298309326, "num_tokens": 499960032.0, "step": 13109 }, { "epoch": 1.6677267523215877, "ewc_loss": 0.029927903786301613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9927903597126715e-05, "grad_norm": 17.714834213256836, "learning_rate": 1e-06, "loss": 0.3711, "mean_token_accuracy": 0.8842004537582397, "num_tokens": 499995868.0, "step": 13110 }, { "epoch": 1.667853962600178, "ewc_loss": 0.029797976836562157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.979797682201024e-05, "grad_norm": 17.770841598510742, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8742574453353882, "num_tokens": 500031662.0, "step": 13111 }, { "epoch": 1.6679811728787686, "ewc_loss": 0.029901592060923576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9901591915404424e-05, "grad_norm": 17.676424026489258, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8607710599899292, "num_tokens": 500070189.0, "step": 13112 }, { "epoch": 1.668108383157359, "ewc_loss": 0.02986619435250759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9866194381611422e-05, "grad_norm": 17.742843627929688, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8658843040466309, "num_tokens": 500109999.0, "step": 13113 }, { "epoch": 1.6682355934359496, "ewc_loss": 0.029932590201497078, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.993258931383025e-05, "grad_norm": 17.755216598510742, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8535706996917725, "num_tokens": 500151473.0, "step": 13114 }, { "epoch": 1.6683628037145402, "ewc_loss": 0.029846496880054474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9846496545360424e-05, "grad_norm": 17.719343185424805, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8686506152153015, "num_tokens": 500187436.0, "step": 13115 }, { "epoch": 1.6684900139931307, "ewc_loss": 0.02991129644215107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.991129622387234e-05, "grad_norm": 17.760910034179688, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8695290684700012, "num_tokens": 500222433.0, "step": 13116 }, { "epoch": 1.668617224271721, "ewc_loss": 0.029958529397845268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9958529921714216e-05, "grad_norm": 17.7440128326416, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8689426779747009, "num_tokens": 500257390.0, "step": 13117 }, { "epoch": 1.6687444345503115, "ewc_loss": 0.02990785427391529, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9907854695920832e-05, "grad_norm": 17.727325439453125, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8689108490943909, "num_tokens": 500301260.0, "step": 13118 }, { "epoch": 1.668871644828902, "ewc_loss": 0.02989468351006508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9894683393649757e-05, "grad_norm": 17.710962295532227, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8749494552612305, "num_tokens": 500343453.0, "step": 13119 }, { "epoch": 1.6689988551074926, "ewc_loss": 0.02994249388575554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9942493711132556e-05, "grad_norm": 17.84394073486328, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8764983415603638, "num_tokens": 500375133.0, "step": 13120 }, { "epoch": 1.669126065386083, "ewc_loss": 0.02993675321340561, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9936752980574965e-05, "grad_norm": 17.72951889038086, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8652102947235107, "num_tokens": 500408540.0, "step": 13121 }, { "epoch": 1.6692532756646736, "ewc_loss": 0.029874403029680252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9874403480789624e-05, "grad_norm": 17.764541625976562, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8820188045501709, "num_tokens": 500446243.0, "step": 13122 }, { "epoch": 1.6693804859432642, "ewc_loss": 0.029923098161816597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9923097827122547e-05, "grad_norm": 17.750749588012695, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8443951606750488, "num_tokens": 500484313.0, "step": 13123 }, { "epoch": 1.6695076962218547, "ewc_loss": 0.02987891063094139, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.987891093653161e-05, "grad_norm": 17.753345489501953, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.881077766418457, "num_tokens": 500525566.0, "step": 13124 }, { "epoch": 1.6696349065004452, "ewc_loss": 0.02987922728061676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9879227440687828e-05, "grad_norm": 17.680036544799805, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8751634955406189, "num_tokens": 500566274.0, "step": 13125 }, { "epoch": 1.6697621167790357, "ewc_loss": 0.02987150475382805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9871504011680372e-05, "grad_norm": 17.757354736328125, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8630777597427368, "num_tokens": 500602769.0, "step": 13126 }, { "epoch": 1.6698893270576263, "ewc_loss": 0.029904041439294815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9904042094131e-05, "grad_norm": 17.744096755981445, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8535881042480469, "num_tokens": 500642895.0, "step": 13127 }, { "epoch": 1.6700165373362168, "ewc_loss": 0.02992246113717556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9922461180831306e-05, "grad_norm": 17.752260208129883, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.868216872215271, "num_tokens": 500681461.0, "step": 13128 }, { "epoch": 1.6701437476148073, "ewc_loss": 0.029925305396318436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.992530608025845e-05, "grad_norm": 17.796234130859375, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.852478563785553, "num_tokens": 500723395.0, "step": 13129 }, { "epoch": 1.6702709578933979, "ewc_loss": 0.02993236482143402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9932365578133613e-05, "grad_norm": 17.729381561279297, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.870652437210083, "num_tokens": 500760571.0, "step": 13130 }, { "epoch": 1.6703981681719884, "ewc_loss": 0.029860597103834152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.986059735121671e-05, "grad_norm": 17.81342887878418, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8610959053039551, "num_tokens": 500797858.0, "step": 13131 }, { "epoch": 1.670525378450579, "ewc_loss": 0.02996690571308136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9966906367917545e-05, "grad_norm": 17.731653213500977, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8751204013824463, "num_tokens": 500840199.0, "step": 13132 }, { "epoch": 1.6706525887291694, "ewc_loss": 0.02983028255403042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9830282073817216e-05, "grad_norm": 17.84912109375, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8641905784606934, "num_tokens": 500883290.0, "step": 13133 }, { "epoch": 1.67077979900776, "ewc_loss": 0.029954101890325546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9954102501505986e-05, "grad_norm": 17.737293243408203, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8726978302001953, "num_tokens": 500923072.0, "step": 13134 }, { "epoch": 1.6709070092863505, "ewc_loss": 0.029833434149622917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.983343438245356e-05, "grad_norm": 17.795564651489258, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8743653297424316, "num_tokens": 500958726.0, "step": 13135 }, { "epoch": 1.6710342195649408, "ewc_loss": 0.029873833060264587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.987383231811691e-05, "grad_norm": 17.758188247680664, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8666538000106812, "num_tokens": 500996746.0, "step": 13136 }, { "epoch": 1.6711614298435313, "ewc_loss": 0.029847102239727974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9847102268831804e-05, "grad_norm": 17.82564926147461, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8623335361480713, "num_tokens": 501034135.0, "step": 13137 }, { "epoch": 1.6712886401221219, "ewc_loss": 0.029887152835726738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9887152777519077e-05, "grad_norm": 17.69229507446289, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8720912933349609, "num_tokens": 501069810.0, "step": 13138 }, { "epoch": 1.6714158504007124, "ewc_loss": 0.02982364036142826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.982364094350487e-05, "grad_norm": 17.78312873840332, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8628326654434204, "num_tokens": 501110597.0, "step": 13139 }, { "epoch": 1.671543060679303, "ewc_loss": 0.029923805966973305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9923805414000526e-05, "grad_norm": 17.82433319091797, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8586794137954712, "num_tokens": 501145927.0, "step": 13140 }, { "epoch": 1.6716702709578934, "ewc_loss": 0.029880326241254807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.988032611028757e-05, "grad_norm": 17.792421340942383, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8501132130622864, "num_tokens": 501188473.0, "step": 13141 }, { "epoch": 1.6717974812364838, "ewc_loss": 0.029851242899894714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9851242288714275e-05, "grad_norm": 17.795621871948242, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8806711435317993, "num_tokens": 501226143.0, "step": 13142 }, { "epoch": 1.6719246915150743, "ewc_loss": 0.029808996245265007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9808996259816922e-05, "grad_norm": 17.76561164855957, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8599681854248047, "num_tokens": 501266980.0, "step": 13143 }, { "epoch": 1.6720519017936648, "ewc_loss": 0.029877353459596634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9877353881602176e-05, "grad_norm": 17.823606491088867, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8659438490867615, "num_tokens": 501305414.0, "step": 13144 }, { "epoch": 1.6721791120722553, "ewc_loss": 0.029801709577441216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9801709388266318e-05, "grad_norm": 17.816965103149414, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8611710667610168, "num_tokens": 501343009.0, "step": 13145 }, { "epoch": 1.6723063223508459, "ewc_loss": 0.02983768843114376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9837688998668455e-05, "grad_norm": 17.784175872802734, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8754684925079346, "num_tokens": 501383571.0, "step": 13146 }, { "epoch": 1.6724335326294364, "ewc_loss": 0.02981073595583439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9810735213686712e-05, "grad_norm": 17.782886505126953, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8698404431343079, "num_tokens": 501425444.0, "step": 13147 }, { "epoch": 1.672560742908027, "ewc_loss": 0.029916934669017792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9916935091023333e-05, "grad_norm": 17.787437438964844, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.8816800713539124, "num_tokens": 501466536.0, "step": 13148 }, { "epoch": 1.6726879531866174, "ewc_loss": 0.02985742688179016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.985742685268633e-05, "grad_norm": 17.81790542602539, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8720855712890625, "num_tokens": 501507346.0, "step": 13149 }, { "epoch": 1.672815163465208, "ewc_loss": 0.029858222231268883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.985822175105568e-05, "grad_norm": 17.737186431884766, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8755311369895935, "num_tokens": 501545515.0, "step": 13150 }, { "epoch": 1.6729423737437985, "ewc_loss": 0.029901621863245964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.990162101923488e-05, "grad_norm": 17.83303451538086, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8631475567817688, "num_tokens": 501588793.0, "step": 13151 }, { "epoch": 1.673069584022389, "ewc_loss": 0.029936110600829124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9936110877315514e-05, "grad_norm": 17.72995376586914, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8667000532150269, "num_tokens": 501624384.0, "step": 13152 }, { "epoch": 1.6731967943009796, "ewc_loss": 0.02988496795296669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.988496817124542e-05, "grad_norm": 17.77587127685547, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8682836294174194, "num_tokens": 501663150.0, "step": 13153 }, { "epoch": 1.67332400457957, "ewc_loss": 0.02988983318209648, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9889833967899904e-05, "grad_norm": 17.667638778686523, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8626705408096313, "num_tokens": 501706177.0, "step": 13154 }, { "epoch": 1.6734512148581606, "ewc_loss": 0.029865799471735954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9865799660910852e-05, "grad_norm": 17.82770538330078, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.8813165426254272, "num_tokens": 501745267.0, "step": 13155 }, { "epoch": 1.6735784251367511, "ewc_loss": 0.02991608902812004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9916089260950685e-05, "grad_norm": 17.676706314086914, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.864042341709137, "num_tokens": 501783591.0, "step": 13156 }, { "epoch": 1.6737056354153417, "ewc_loss": 0.029859337955713272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9859338610549457e-05, "grad_norm": 17.737218856811523, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8591845035552979, "num_tokens": 501825190.0, "step": 13157 }, { "epoch": 1.6738328456939322, "ewc_loss": 0.029964793473482132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9964792702230625e-05, "grad_norm": 17.824068069458008, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8698276281356812, "num_tokens": 501855813.0, "step": 13158 }, { "epoch": 1.6739600559725227, "ewc_loss": 0.029907338321208954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9907338102930225e-05, "grad_norm": 17.774885177612305, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8646528124809265, "num_tokens": 501890106.0, "step": 13159 }, { "epoch": 1.674087266251113, "ewc_loss": 0.029910052195191383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9910052035120316e-05, "grad_norm": 17.794723510742188, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8554056882858276, "num_tokens": 501935437.0, "step": 13160 }, { "epoch": 1.6742144765297036, "ewc_loss": 0.02989233285188675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9892333259340376e-05, "grad_norm": 17.70388412475586, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8562414050102234, "num_tokens": 501980134.0, "step": 13161 }, { "epoch": 1.674341686808294, "ewc_loss": 0.029887478798627853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9887478376622312e-05, "grad_norm": 17.870697021484375, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8627840280532837, "num_tokens": 502015977.0, "step": 13162 }, { "epoch": 1.6744688970868846, "ewc_loss": 0.029874423518776894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9874423489673063e-05, "grad_norm": 17.72109603881836, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8720013499259949, "num_tokens": 502061717.0, "step": 13163 }, { "epoch": 1.6745961073654752, "ewc_loss": 0.029859814792871475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9859815185773186e-05, "grad_norm": 17.824195861816406, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.866712749004364, "num_tokens": 502101594.0, "step": 13164 }, { "epoch": 1.6747233176440657, "ewc_loss": 0.02993256412446499, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.99325638479786e-05, "grad_norm": 17.77046775817871, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8682441115379333, "num_tokens": 502136521.0, "step": 13165 }, { "epoch": 1.674850527922656, "ewc_loss": 0.02988814003765583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9888140488765202e-05, "grad_norm": 17.803220748901367, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8489285707473755, "num_tokens": 502174821.0, "step": 13166 }, { "epoch": 1.6749777382012465, "ewc_loss": 0.029894929379224777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9894928957219236e-05, "grad_norm": 17.752338409423828, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8623514175415039, "num_tokens": 502210615.0, "step": 13167 }, { "epoch": 1.675104948479837, "ewc_loss": 0.029942044988274574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.994204442074988e-05, "grad_norm": 17.884740829467773, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8688799142837524, "num_tokens": 502248220.0, "step": 13168 }, { "epoch": 1.6752321587584276, "ewc_loss": 0.029921898618340492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.992189911310561e-05, "grad_norm": 17.69500732421875, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8663439750671387, "num_tokens": 502290686.0, "step": 13169 }, { "epoch": 1.675359369037018, "ewc_loss": 0.02984701655805111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9847016776329838e-05, "grad_norm": 17.811561584472656, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8572560548782349, "num_tokens": 502331789.0, "step": 13170 }, { "epoch": 1.6754865793156086, "ewc_loss": 0.029909338802099228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9909338991274126e-05, "grad_norm": 17.6507625579834, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8759061098098755, "num_tokens": 502371490.0, "step": 13171 }, { "epoch": 1.6756137895941992, "ewc_loss": 0.029820919036865234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9820919735357165e-05, "grad_norm": 17.78838348388672, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8673490881919861, "num_tokens": 502415606.0, "step": 13172 }, { "epoch": 1.6757409998727897, "ewc_loss": 0.029941478744149208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9941478715045378e-05, "grad_norm": 17.718290328979492, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8674224615097046, "num_tokens": 502456203.0, "step": 13173 }, { "epoch": 1.6758682101513802, "ewc_loss": 0.029846753925085068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9846754841855727e-05, "grad_norm": 17.723785400390625, "learning_rate": 1e-06, "loss": 0.3579, "mean_token_accuracy": 0.8854065537452698, "num_tokens": 502491508.0, "step": 13174 }, { "epoch": 1.6759954204299707, "ewc_loss": 0.029952554032206535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.995255454152357e-05, "grad_norm": 17.74443244934082, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8569194078445435, "num_tokens": 502533408.0, "step": 13175 }, { "epoch": 1.6761226307085613, "ewc_loss": 0.029907289892435074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9907290809205733e-05, "grad_norm": 17.776592254638672, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8612250089645386, "num_tokens": 502569824.0, "step": 13176 }, { "epoch": 1.6762498409871518, "ewc_loss": 0.029895566403865814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9895565603510477e-05, "grad_norm": 17.680788040161133, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8769457936286926, "num_tokens": 502604776.0, "step": 13177 }, { "epoch": 1.6763770512657423, "ewc_loss": 0.029928164556622505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9928163712611422e-05, "grad_norm": 17.76238441467285, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8511974215507507, "num_tokens": 502645732.0, "step": 13178 }, { "epoch": 1.6765042615443329, "ewc_loss": 0.02998291701078415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9982917112647556e-05, "grad_norm": 17.75795555114746, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8723417520523071, "num_tokens": 502686044.0, "step": 13179 }, { "epoch": 1.6766314718229234, "ewc_loss": 0.029950084164738655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9950084353913553e-05, "grad_norm": 17.766921997070312, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8586623668670654, "num_tokens": 502728288.0, "step": 13180 }, { "epoch": 1.676758682101514, "ewc_loss": 0.0299138892441988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9913890102761798e-05, "grad_norm": 17.724218368530273, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8608263731002808, "num_tokens": 502760772.0, "step": 13181 }, { "epoch": 1.6768858923801044, "ewc_loss": 0.02993684820830822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.993684756802395e-05, "grad_norm": 17.79206085205078, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8676323890686035, "num_tokens": 502791585.0, "step": 13182 }, { "epoch": 1.677013102658695, "ewc_loss": 0.029940290376544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9940290914964862e-05, "grad_norm": 17.735322952270508, "learning_rate": 1e-06, "loss": 0.3589, "mean_token_accuracy": 0.8855833411216736, "num_tokens": 502826339.0, "step": 13183 }, { "epoch": 1.6771403129372855, "ewc_loss": 0.029861750081181526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.986175059049856e-05, "grad_norm": 17.761810302734375, "learning_rate": 1e-06, "loss": 0.3584, "mean_token_accuracy": 0.8848152756690979, "num_tokens": 502868716.0, "step": 13184 }, { "epoch": 1.6772675232158758, "ewc_loss": 0.029989056289196014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9989056201884523e-05, "grad_norm": 17.77419662475586, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8686289191246033, "num_tokens": 502907992.0, "step": 13185 }, { "epoch": 1.6773947334944663, "ewc_loss": 0.029919015243649483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.991901601490099e-05, "grad_norm": 17.782535552978516, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8629013299942017, "num_tokens": 502947197.0, "step": 13186 }, { "epoch": 1.6775219437730569, "ewc_loss": 0.02988390251994133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.988390224345494e-05, "grad_norm": 17.661306381225586, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8776354789733887, "num_tokens": 502985890.0, "step": 13187 }, { "epoch": 1.6776491540516474, "ewc_loss": 0.029937412589788437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9937413273728453e-05, "grad_norm": 17.77060317993164, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.863582968711853, "num_tokens": 503033476.0, "step": 13188 }, { "epoch": 1.677776364330238, "ewc_loss": 0.029994098469614983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9994098440511152e-05, "grad_norm": 17.777664184570312, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8765084743499756, "num_tokens": 503068002.0, "step": 13189 }, { "epoch": 1.6779035746088284, "ewc_loss": 0.029916934669017792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9916935091023333e-05, "grad_norm": 17.718351364135742, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8611503839492798, "num_tokens": 503110059.0, "step": 13190 }, { "epoch": 1.6780307848874187, "ewc_loss": 0.02999715693295002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9997156161698513e-05, "grad_norm": 17.808095932006836, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.86225825548172, "num_tokens": 503146617.0, "step": 13191 }, { "epoch": 1.6781579951660093, "ewc_loss": 0.029969055205583572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9969054594403133e-05, "grad_norm": 17.756927490234375, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8662149906158447, "num_tokens": 503186436.0, "step": 13192 }, { "epoch": 1.6782852054445998, "ewc_loss": 0.02995976433157921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9959765015519224e-05, "grad_norm": 17.799030303955078, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8740379214286804, "num_tokens": 503227983.0, "step": 13193 }, { "epoch": 1.6784124157231903, "ewc_loss": 0.02992965467274189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.992965528392233e-05, "grad_norm": 17.726215362548828, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8549144268035889, "num_tokens": 503269301.0, "step": 13194 }, { "epoch": 1.6785396260017809, "ewc_loss": 0.029936833307147026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.993683301610872e-05, "grad_norm": 17.81903648376465, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8510587811470032, "num_tokens": 503308919.0, "step": 13195 }, { "epoch": 1.6786668362803714, "ewc_loss": 0.029962731525301933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9962731787236407e-05, "grad_norm": 17.742103576660156, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.8776237964630127, "num_tokens": 503345030.0, "step": 13196 }, { "epoch": 1.678794046558962, "ewc_loss": 0.029946157708764076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9946157155791298e-05, "grad_norm": 17.880165100097656, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8597221970558167, "num_tokens": 503379142.0, "step": 13197 }, { "epoch": 1.6789212568375524, "ewc_loss": 0.030051887035369873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0051887733861804e-05, "grad_norm": 17.820388793945312, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.86905837059021, "num_tokens": 503411810.0, "step": 13198 }, { "epoch": 1.679048467116143, "ewc_loss": 0.02991979941725731, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9919799999333918e-05, "grad_norm": 17.770681381225586, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.865755558013916, "num_tokens": 503450266.0, "step": 13199 }, { "epoch": 1.6791756773947335, "ewc_loss": 0.02994806505739689, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9948065275675617e-05, "grad_norm": 17.814834594726562, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8764132261276245, "num_tokens": 503491015.0, "step": 13200 }, { "epoch": 1.679302887673324, "ewc_loss": 0.029979068785905838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9979068131069653e-05, "grad_norm": 17.830232620239258, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8761860728263855, "num_tokens": 503531964.0, "step": 13201 }, { "epoch": 1.6794300979519146, "ewc_loss": 0.029919099062681198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9919099688413553e-05, "grad_norm": 17.771766662597656, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8724473714828491, "num_tokens": 503576069.0, "step": 13202 }, { "epoch": 1.679557308230505, "ewc_loss": 0.02995322272181511, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.995322211063467e-05, "grad_norm": 17.840023040771484, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8702585697174072, "num_tokens": 503607895.0, "step": 13203 }, { "epoch": 1.6796845185090956, "ewc_loss": 0.02998778037726879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9987781090312637e-05, "grad_norm": 17.817047119140625, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8638846278190613, "num_tokens": 503636891.0, "step": 13204 }, { "epoch": 1.6798117287876861, "ewc_loss": 0.02990484982728958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9904849725426175e-05, "grad_norm": 17.84401512145996, "learning_rate": 1e-06, "loss": 0.3686, "mean_token_accuracy": 0.8828431367874146, "num_tokens": 503682037.0, "step": 13205 }, { "epoch": 1.6799389390662767, "ewc_loss": 0.02993135154247284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9931352401035838e-05, "grad_norm": 17.67174530029297, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8709620833396912, "num_tokens": 503724008.0, "step": 13206 }, { "epoch": 1.6800661493448672, "ewc_loss": 0.02992640621960163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9926406568847597e-05, "grad_norm": 17.800687789916992, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8757712244987488, "num_tokens": 503759008.0, "step": 13207 }, { "epoch": 1.6801933596234577, "ewc_loss": 0.03001343458890915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0013434297870845e-05, "grad_norm": 17.771381378173828, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8644334673881531, "num_tokens": 503796599.0, "step": 13208 }, { "epoch": 1.680320569902048, "ewc_loss": 0.0300099216401577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0009921829332598e-05, "grad_norm": 17.783790588378906, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.865851640701294, "num_tokens": 503835539.0, "step": 13209 }, { "epoch": 1.6804477801806386, "ewc_loss": 0.029988691210746765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.998869058501441e-05, "grad_norm": 17.845726013183594, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8648121356964111, "num_tokens": 503875933.0, "step": 13210 }, { "epoch": 1.680574990459229, "ewc_loss": 0.030011462047696114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.00114625133574e-05, "grad_norm": 17.857223510742188, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8660886287689209, "num_tokens": 503913740.0, "step": 13211 }, { "epoch": 1.6807022007378196, "ewc_loss": 0.02993260696530342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9932607503724284e-05, "grad_norm": 17.822124481201172, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.871645450592041, "num_tokens": 503955105.0, "step": 13212 }, { "epoch": 1.6808294110164101, "ewc_loss": 0.029927803203463554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.992780355270952e-05, "grad_norm": 17.831924438476562, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8680207133293152, "num_tokens": 503987641.0, "step": 13213 }, { "epoch": 1.6809566212950007, "ewc_loss": 0.029921703040599823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9921702662250027e-05, "grad_norm": 17.789104461669922, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8604026436805725, "num_tokens": 504028699.0, "step": 13214 }, { "epoch": 1.681083831573591, "ewc_loss": 0.02992415800690651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9924158297944814e-05, "grad_norm": 17.785633087158203, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8627831339836121, "num_tokens": 504065597.0, "step": 13215 }, { "epoch": 1.6812110418521815, "ewc_loss": 0.029947105795145035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.994710666825995e-05, "grad_norm": 17.787946701049805, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8545558452606201, "num_tokens": 504105968.0, "step": 13216 }, { "epoch": 1.681338252130772, "ewc_loss": 0.029936153441667557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9936152714071795e-05, "grad_norm": 17.824697494506836, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8711641430854797, "num_tokens": 504143743.0, "step": 13217 }, { "epoch": 1.6814654624093626, "ewc_loss": 0.029910556972026825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.99105577141745e-05, "grad_norm": 17.7470760345459, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8704565167427063, "num_tokens": 504177023.0, "step": 13218 }, { "epoch": 1.681592672687953, "ewc_loss": 0.02993009425699711, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9930093660368584e-05, "grad_norm": 17.867219924926758, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8657296299934387, "num_tokens": 504210492.0, "step": 13219 }, { "epoch": 1.6817198829665436, "ewc_loss": 0.029957741498947144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.995774229930248e-05, "grad_norm": 17.72130584716797, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8682842254638672, "num_tokens": 504248322.0, "step": 13220 }, { "epoch": 1.6818470932451342, "ewc_loss": 0.02990597113966942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.990597022289876e-05, "grad_norm": 17.819393157958984, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8545880913734436, "num_tokens": 504285915.0, "step": 13221 }, { "epoch": 1.6819743035237247, "ewc_loss": 0.03004600666463375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.004600694112014e-05, "grad_norm": 17.753637313842773, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8670327067375183, "num_tokens": 504325578.0, "step": 13222 }, { "epoch": 1.6821015138023152, "ewc_loss": 0.029964258894324303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9964259738335386e-05, "grad_norm": 17.80661392211914, "learning_rate": 1e-06, "loss": 0.3376, "mean_token_accuracy": 0.888187050819397, "num_tokens": 504362876.0, "step": 13223 }, { "epoch": 1.6822287240809057, "ewc_loss": 0.030037838965654373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.003783967869822e-05, "grad_norm": 17.779794692993164, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8667005300521851, "num_tokens": 504402303.0, "step": 13224 }, { "epoch": 1.6823559343594963, "ewc_loss": 0.029984069988131523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9984070351929404e-05, "grad_norm": 17.76515769958496, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8701788187026978, "num_tokens": 504440039.0, "step": 13225 }, { "epoch": 1.6824831446380868, "ewc_loss": 0.02996627427637577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9966275178594515e-05, "grad_norm": 17.759458541870117, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8603708744049072, "num_tokens": 504485102.0, "step": 13226 }, { "epoch": 1.6826103549166773, "ewc_loss": 0.0299739558249712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9973954951856285e-05, "grad_norm": 17.7618350982666, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.866475522518158, "num_tokens": 504522762.0, "step": 13227 }, { "epoch": 1.6827375651952678, "ewc_loss": 0.029972577467560768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.99725779768778e-05, "grad_norm": 17.794307708740234, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8728686571121216, "num_tokens": 504562818.0, "step": 13228 }, { "epoch": 1.6828647754738584, "ewc_loss": 0.029979070648550987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9979069950059056e-05, "grad_norm": 17.74197769165039, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8586961030960083, "num_tokens": 504602914.0, "step": 13229 }, { "epoch": 1.682991985752449, "ewc_loss": 0.029990335926413536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.999033677042462e-05, "grad_norm": 17.80133819580078, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8754029273986816, "num_tokens": 504641376.0, "step": 13230 }, { "epoch": 1.6831191960310394, "ewc_loss": 0.029895519837737083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9895520128775388e-05, "grad_norm": 17.675321578979492, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8758401870727539, "num_tokens": 504675031.0, "step": 13231 }, { "epoch": 1.68324640630963, "ewc_loss": 0.029933853074908257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9933853511465713e-05, "grad_norm": 17.760860443115234, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8741499781608582, "num_tokens": 504709624.0, "step": 13232 }, { "epoch": 1.6833736165882205, "ewc_loss": 0.029975689947605133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9975690267747268e-05, "grad_norm": 17.727888107299805, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.8720252513885498, "num_tokens": 504749310.0, "step": 13233 }, { "epoch": 1.6835008268668108, "ewc_loss": 0.030038494616746902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0038494514883496e-05, "grad_norm": 17.803478240966797, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8686615228652954, "num_tokens": 504791700.0, "step": 13234 }, { "epoch": 1.6836280371454013, "ewc_loss": 0.0299729835242033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9972983611514792e-05, "grad_norm": 17.76409339904785, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.858291745185852, "num_tokens": 504833246.0, "step": 13235 }, { "epoch": 1.6837552474239919, "ewc_loss": 0.030010847374796867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0010847694939002e-05, "grad_norm": 17.799217224121094, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8731164336204529, "num_tokens": 504872901.0, "step": 13236 }, { "epoch": 1.6838824577025824, "ewc_loss": 0.02995658665895462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.995658724103123e-05, "grad_norm": 17.754756927490234, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8634121417999268, "num_tokens": 504906138.0, "step": 13237 }, { "epoch": 1.684009667981173, "ewc_loss": 0.03000900149345398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0009001420694403e-05, "grad_norm": 17.762022018432617, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8485777378082275, "num_tokens": 504947147.0, "step": 13238 }, { "epoch": 1.6841368782597634, "ewc_loss": 0.03002467192709446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.002467201440595e-05, "grad_norm": 17.752199172973633, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8707541227340698, "num_tokens": 504985332.0, "step": 13239 }, { "epoch": 1.6842640885383537, "ewc_loss": 0.030018534511327744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0018534744158387e-05, "grad_norm": 17.839004516601562, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8601294755935669, "num_tokens": 505027995.0, "step": 13240 }, { "epoch": 1.6843912988169443, "ewc_loss": 0.030014224350452423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0014223739271984e-05, "grad_norm": 17.70387077331543, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8718985319137573, "num_tokens": 505065311.0, "step": 13241 }, { "epoch": 1.6845185090955348, "ewc_loss": 0.02991591952741146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9915920094936155e-05, "grad_norm": 17.785390853881836, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8642073273658752, "num_tokens": 505105810.0, "step": 13242 }, { "epoch": 1.6846457193741253, "ewc_loss": 0.03008068911731243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.008068961207755e-05, "grad_norm": 17.78293228149414, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8743377923965454, "num_tokens": 505141003.0, "step": 13243 }, { "epoch": 1.6847729296527159, "ewc_loss": 0.029922176152467728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.992217559949495e-05, "grad_norm": 17.753894805908203, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8583176136016846, "num_tokens": 505188025.0, "step": 13244 }, { "epoch": 1.6849001399313064, "ewc_loss": 0.029959190636873245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9959190214867704e-05, "grad_norm": 17.723400115966797, "learning_rate": 1e-06, "loss": 0.3652, "mean_token_accuracy": 0.8825781345367432, "num_tokens": 505226101.0, "step": 13245 }, { "epoch": 1.685027350209897, "ewc_loss": 0.029972216114401817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9972215997986495e-05, "grad_norm": 17.81332015991211, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8590348958969116, "num_tokens": 505264880.0, "step": 13246 }, { "epoch": 1.6851545604884874, "ewc_loss": 0.029985597357153893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.998559648403898e-05, "grad_norm": 17.675724029541016, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8617072701454163, "num_tokens": 505306208.0, "step": 13247 }, { "epoch": 1.685281770767078, "ewc_loss": 0.03000280261039734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.000280230480712e-05, "grad_norm": 17.86008644104004, "learning_rate": 1e-06, "loss": 0.3448, "mean_token_accuracy": 0.8897605538368225, "num_tokens": 505346024.0, "step": 13248 }, { "epoch": 1.6854089810456685, "ewc_loss": 0.03001069463789463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0010694899829105e-05, "grad_norm": 17.696680068969727, "learning_rate": 1e-06, "loss": 0.3402, "mean_token_accuracy": 0.8909628391265869, "num_tokens": 505383644.0, "step": 13249 }, { "epoch": 1.685536191324259, "ewc_loss": 0.029903750866651535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9903751055826433e-05, "grad_norm": 17.818422317504883, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8723688125610352, "num_tokens": 505420365.0, "step": 13250 }, { "epoch": 1.6856634016028496, "ewc_loss": 0.029992224648594856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.99922248814255e-05, "grad_norm": 17.704233169555664, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8705156445503235, "num_tokens": 505458211.0, "step": 13251 }, { "epoch": 1.68579061188144, "ewc_loss": 0.02990908920764923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.990908978972584e-05, "grad_norm": 17.753686904907227, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8634270429611206, "num_tokens": 505501649.0, "step": 13252 }, { "epoch": 1.6859178221600306, "ewc_loss": 0.029980426654219627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.99804269161541e-05, "grad_norm": 17.78021240234375, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8710446357727051, "num_tokens": 505538763.0, "step": 13253 }, { "epoch": 1.6860450324386211, "ewc_loss": 0.02995477244257927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.995477188960649e-05, "grad_norm": 17.80584716796875, "learning_rate": 1e-06, "loss": 0.3432, "mean_token_accuracy": 0.8906615972518921, "num_tokens": 505574562.0, "step": 13254 }, { "epoch": 1.6861722427172117, "ewc_loss": 0.029950570315122604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.99505700240843e-05, "grad_norm": 17.773635864257812, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8815044164657593, "num_tokens": 505609718.0, "step": 13255 }, { "epoch": 1.6862994529958022, "ewc_loss": 0.029912561178207397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9912560421507806e-05, "grad_norm": 17.779123306274414, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8812726736068726, "num_tokens": 505644337.0, "step": 13256 }, { "epoch": 1.6864266632743927, "ewc_loss": 0.029981207102537155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9981207262608223e-05, "grad_norm": 17.848182678222656, "learning_rate": 1e-06, "loss": 0.3774, "mean_token_accuracy": 0.8789597749710083, "num_tokens": 505681929.0, "step": 13257 }, { "epoch": 1.686553873552983, "ewc_loss": 0.02990451455116272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9904515031375922e-05, "grad_norm": 17.815887451171875, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8466414213180542, "num_tokens": 505719373.0, "step": 13258 }, { "epoch": 1.6866810838315736, "ewc_loss": 0.029897764325141907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9897764761699364e-05, "grad_norm": 17.796955108642578, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8558831214904785, "num_tokens": 505757767.0, "step": 13259 }, { "epoch": 1.686808294110164, "ewc_loss": 0.029972562566399574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9972563424962573e-05, "grad_norm": 17.899272918701172, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8384755849838257, "num_tokens": 505794777.0, "step": 13260 }, { "epoch": 1.6869355043887546, "ewc_loss": 0.029940498992800713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9940498279756866e-05, "grad_norm": 17.75777816772461, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8756380081176758, "num_tokens": 505832672.0, "step": 13261 }, { "epoch": 1.6870627146673451, "ewc_loss": 0.02988867275416851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.988867345266044e-05, "grad_norm": 17.806726455688477, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.857509970664978, "num_tokens": 505873142.0, "step": 13262 }, { "epoch": 1.6871899249459357, "ewc_loss": 0.02998090721666813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9980907129356638e-05, "grad_norm": 17.788360595703125, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8712772727012634, "num_tokens": 505911994.0, "step": 13263 }, { "epoch": 1.687317135224526, "ewc_loss": 0.029878297820687294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9878297937102616e-05, "grad_norm": 17.74542999267578, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8726560473442078, "num_tokens": 505948874.0, "step": 13264 }, { "epoch": 1.6874443455031165, "ewc_loss": 0.029920607805252075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9920607630629092e-05, "grad_norm": 17.70784568786621, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8754246830940247, "num_tokens": 505989679.0, "step": 13265 }, { "epoch": 1.687571555781707, "ewc_loss": 0.029904302209615707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9904302209615707e-05, "grad_norm": 17.752696990966797, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8773126602172852, "num_tokens": 506033209.0, "step": 13266 }, { "epoch": 1.6876987660602976, "ewc_loss": 0.029970599338412285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9970598916406743e-05, "grad_norm": 17.772104263305664, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8780133724212646, "num_tokens": 506065379.0, "step": 13267 }, { "epoch": 1.687825976338888, "ewc_loss": 0.029947031289339066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9947032089694403e-05, "grad_norm": 17.7228946685791, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8601641058921814, "num_tokens": 506099484.0, "step": 13268 }, { "epoch": 1.6879531866174786, "ewc_loss": 0.02996860258281231, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9968603485031053e-05, "grad_norm": 17.79377555847168, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8518221974372864, "num_tokens": 506136559.0, "step": 13269 }, { "epoch": 1.6880803968960691, "ewc_loss": 0.030031368136405945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0031367714400403e-05, "grad_norm": 17.824771881103516, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8676955699920654, "num_tokens": 506172997.0, "step": 13270 }, { "epoch": 1.6882076071746597, "ewc_loss": 0.03001321665942669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.001321601914242e-05, "grad_norm": 17.802961349487305, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.8830814361572266, "num_tokens": 506209858.0, "step": 13271 }, { "epoch": 1.6883348174532502, "ewc_loss": 0.030019447207450867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0019447876838967e-05, "grad_norm": 17.768186569213867, "learning_rate": 1e-06, "loss": 0.3638, "mean_token_accuracy": 0.881257176399231, "num_tokens": 506242260.0, "step": 13272 }, { "epoch": 1.6884620277318407, "ewc_loss": 0.029988516122102737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.998851596203167e-05, "grad_norm": 17.82895278930664, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8565474152565002, "num_tokens": 506276856.0, "step": 13273 }, { "epoch": 1.6885892380104313, "ewc_loss": 0.030020566657185555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0020566555322148e-05, "grad_norm": 17.77218246459961, "learning_rate": 1e-06, "loss": 0.3466, "mean_token_accuracy": 0.8876560926437378, "num_tokens": 506311469.0, "step": 13274 }, { "epoch": 1.6887164482890218, "ewc_loss": 0.02997501753270626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9975017241667956e-05, "grad_norm": 17.78124237060547, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8801748752593994, "num_tokens": 506348240.0, "step": 13275 }, { "epoch": 1.6888436585676123, "ewc_loss": 0.030057864263653755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0057864933041856e-05, "grad_norm": 17.778261184692383, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8708822727203369, "num_tokens": 506387203.0, "step": 13276 }, { "epoch": 1.6889708688462028, "ewc_loss": 0.030000068247318268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.000006836373359e-05, "grad_norm": 17.75757598876953, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8477283120155334, "num_tokens": 506421869.0, "step": 13277 }, { "epoch": 1.6890980791247934, "ewc_loss": 0.030065855011343956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0065855753491633e-05, "grad_norm": 17.724836349487305, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.869512677192688, "num_tokens": 506458873.0, "step": 13278 }, { "epoch": 1.689225289403384, "ewc_loss": 0.030063580721616745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0063580197747797e-05, "grad_norm": 17.80499267578125, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8560954332351685, "num_tokens": 506495947.0, "step": 13279 }, { "epoch": 1.6893524996819744, "ewc_loss": 0.030111176893115044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.011117769347038e-05, "grad_norm": 17.78928565979004, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8506072759628296, "num_tokens": 506533098.0, "step": 13280 }, { "epoch": 1.689479709960565, "ewc_loss": 0.030050436034798622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0050436180317774e-05, "grad_norm": 17.793907165527344, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8796068429946899, "num_tokens": 506571346.0, "step": 13281 }, { "epoch": 1.6896069202391555, "ewc_loss": 0.03005800023674965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0057999538257718e-05, "grad_norm": 17.763757705688477, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8703685402870178, "num_tokens": 506607912.0, "step": 13282 }, { "epoch": 1.6897341305177458, "ewc_loss": 0.030059998854994774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0059998607612215e-05, "grad_norm": 17.77446937561035, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8580330610275269, "num_tokens": 506654258.0, "step": 13283 }, { "epoch": 1.6898613407963363, "ewc_loss": 0.030142618343234062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.014261892531067e-05, "grad_norm": 17.835622787475586, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8763578534126282, "num_tokens": 506693436.0, "step": 13284 }, { "epoch": 1.6899885510749268, "ewc_loss": 0.030022453516721725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0022452847333625e-05, "grad_norm": 17.752328872680664, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8714057207107544, "num_tokens": 506737383.0, "step": 13285 }, { "epoch": 1.6901157613535174, "ewc_loss": 0.0300352331250906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0035233066882938e-05, "grad_norm": 17.80845832824707, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8791215419769287, "num_tokens": 506772282.0, "step": 13286 }, { "epoch": 1.690242971632108, "ewc_loss": 0.030078109353780746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0078108466113918e-05, "grad_norm": 17.790931701660156, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8810853958129883, "num_tokens": 506807208.0, "step": 13287 }, { "epoch": 1.6903701819106984, "ewc_loss": 0.03004119172692299, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0041192076168954e-05, "grad_norm": 17.767873764038086, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8750307559967041, "num_tokens": 506847952.0, "step": 13288 }, { "epoch": 1.6904973921892887, "ewc_loss": 0.030042776837944984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0042776415939443e-05, "grad_norm": 17.852659225463867, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8730102777481079, "num_tokens": 506888265.0, "step": 13289 }, { "epoch": 1.6906246024678793, "ewc_loss": 0.029995661228895187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9995660952408798e-05, "grad_norm": 17.832761764526367, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8729044198989868, "num_tokens": 506926096.0, "step": 13290 }, { "epoch": 1.6907518127464698, "ewc_loss": 0.030009586364030838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0009587135282345e-05, "grad_norm": 17.8705997467041, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8636771440505981, "num_tokens": 506962252.0, "step": 13291 }, { "epoch": 1.6908790230250603, "ewc_loss": 0.030023690313100815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0023689760128036e-05, "grad_norm": 17.787250518798828, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8750919103622437, "num_tokens": 507004709.0, "step": 13292 }, { "epoch": 1.6910062333036509, "ewc_loss": 0.029959628358483315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9959628591313958e-05, "grad_norm": 17.865562438964844, "learning_rate": 1e-06, "loss": 0.365, "mean_token_accuracy": 0.8855042457580566, "num_tokens": 507041041.0, "step": 13293 }, { "epoch": 1.6911334435822414, "ewc_loss": 0.030011054128408432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0011055059731007e-05, "grad_norm": 17.775060653686523, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8635027408599854, "num_tokens": 507080251.0, "step": 13294 }, { "epoch": 1.691260653860832, "ewc_loss": 0.029943101108074188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.994310125359334e-05, "grad_norm": 17.9759521484375, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8680648803710938, "num_tokens": 507120813.0, "step": 13295 }, { "epoch": 1.6913878641394224, "ewc_loss": 0.03003021702170372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.003021629410796e-05, "grad_norm": 17.76827621459961, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8558145761489868, "num_tokens": 507160284.0, "step": 13296 }, { "epoch": 1.691515074418013, "ewc_loss": 0.029823850840330124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9823850127286278e-05, "grad_norm": 17.840791702270508, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.862369179725647, "num_tokens": 507205069.0, "step": 13297 }, { "epoch": 1.6916422846966035, "ewc_loss": 0.02999570220708847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.999570278916508e-05, "grad_norm": 17.820066452026367, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8773350119590759, "num_tokens": 507244689.0, "step": 13298 }, { "epoch": 1.691769494975194, "ewc_loss": 0.029933327808976173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.993332782352809e-05, "grad_norm": 17.81853675842285, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8716620206832886, "num_tokens": 507283917.0, "step": 13299 }, { "epoch": 1.6918967052537845, "ewc_loss": 0.02997679077088833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9976790756336413e-05, "grad_norm": 17.87769317626953, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8717650175094604, "num_tokens": 507327937.0, "step": 13300 }, { "epoch": 1.692023915532375, "ewc_loss": 0.029851866886019707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.985186620207969e-05, "grad_norm": 17.777559280395508, "learning_rate": 1e-06, "loss": 0.3665, "mean_token_accuracy": 0.8836212158203125, "num_tokens": 507369521.0, "step": 13301 }, { "epoch": 1.6921511258109656, "ewc_loss": 0.029856689274311066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.985668834298849e-05, "grad_norm": 17.873966217041016, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.8794610500335693, "num_tokens": 507408157.0, "step": 13302 }, { "epoch": 1.6922783360895561, "ewc_loss": 0.029944458976387978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9944458219688386e-05, "grad_norm": 17.767072677612305, "learning_rate": 1e-06, "loss": 0.3776, "mean_token_accuracy": 0.8759258985519409, "num_tokens": 507448003.0, "step": 13303 }, { "epoch": 1.6924055463681467, "ewc_loss": 0.0297932717949152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.979327109642327e-05, "grad_norm": 17.753631591796875, "learning_rate": 1e-06, "loss": 0.3798, "mean_token_accuracy": 0.8767826557159424, "num_tokens": 507489970.0, "step": 13304 }, { "epoch": 1.6925327566467372, "ewc_loss": 0.029949160292744637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9949160307296552e-05, "grad_norm": 17.85530662536621, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8711435794830322, "num_tokens": 507524536.0, "step": 13305 }, { "epoch": 1.6926599669253277, "ewc_loss": 0.029871337115764618, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9871336664655246e-05, "grad_norm": 17.772069931030273, "learning_rate": 1e-06, "loss": 0.3314, "mean_token_accuracy": 0.8893989324569702, "num_tokens": 507561575.0, "step": 13306 }, { "epoch": 1.692787177203918, "ewc_loss": 0.029860803857445717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9860804716008715e-05, "grad_norm": 17.84023094177246, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8720653057098389, "num_tokens": 507597212.0, "step": 13307 }, { "epoch": 1.6929143874825086, "ewc_loss": 0.029941590502858162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9941589673398994e-05, "grad_norm": 17.87824058532715, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8724826574325562, "num_tokens": 507634251.0, "step": 13308 }, { "epoch": 1.693041597761099, "ewc_loss": 0.02984670177102089, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9846702091163024e-05, "grad_norm": 17.798343658447266, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8756045699119568, "num_tokens": 507671345.0, "step": 13309 }, { "epoch": 1.6931688080396896, "ewc_loss": 0.02987714484333992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9877144697820768e-05, "grad_norm": 17.898765563964844, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8592650890350342, "num_tokens": 507709797.0, "step": 13310 }, { "epoch": 1.6932960183182801, "ewc_loss": 0.02990017458796501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9900174922659062e-05, "grad_norm": 17.771522521972656, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8697898983955383, "num_tokens": 507747047.0, "step": 13311 }, { "epoch": 1.6934232285968707, "ewc_loss": 0.029844239354133606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9844239179510623e-05, "grad_norm": 17.889982223510742, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8699089288711548, "num_tokens": 507782187.0, "step": 13312 }, { "epoch": 1.693550438875461, "ewc_loss": 0.029912011697888374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9912011086707935e-05, "grad_norm": 17.656084060668945, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8568504452705383, "num_tokens": 507826714.0, "step": 13313 }, { "epoch": 1.6936776491540515, "ewc_loss": 0.02986469306051731, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9864693715353496e-05, "grad_norm": 17.827438354492188, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8617787957191467, "num_tokens": 507869423.0, "step": 13314 }, { "epoch": 1.693804859432642, "ewc_loss": 0.029995225369930267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.999522621394135e-05, "grad_norm": 17.808317184448242, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8787134289741516, "num_tokens": 507904711.0, "step": 13315 }, { "epoch": 1.6939320697112326, "ewc_loss": 0.029883384704589844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.988338383147493e-05, "grad_norm": 17.814485549926758, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8645579814910889, "num_tokens": 507947760.0, "step": 13316 }, { "epoch": 1.694059279989823, "ewc_loss": 0.029981521889567375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9981521947775036e-05, "grad_norm": 17.835195541381836, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8671137094497681, "num_tokens": 507984249.0, "step": 13317 }, { "epoch": 1.6941864902684136, "ewc_loss": 0.029940789565443993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9940789318061434e-05, "grad_norm": 17.852270126342773, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8660296201705933, "num_tokens": 508017206.0, "step": 13318 }, { "epoch": 1.6943137005470041, "ewc_loss": 0.029956243932247162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.995624345203396e-05, "grad_norm": 17.80811309814453, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8645621538162231, "num_tokens": 508056357.0, "step": 13319 }, { "epoch": 1.6944409108255947, "ewc_loss": 0.029978958889842033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.997895899170544e-05, "grad_norm": 17.806163787841797, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8610719442367554, "num_tokens": 508094870.0, "step": 13320 }, { "epoch": 1.6945681211041852, "ewc_loss": 0.03000309318304062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0003093343111686e-05, "grad_norm": 17.891258239746094, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.868378758430481, "num_tokens": 508120323.0, "step": 13321 }, { "epoch": 1.6946953313827757, "ewc_loss": 0.029944974929094315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9944974812678993e-05, "grad_norm": 17.73310661315918, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.861506462097168, "num_tokens": 508160018.0, "step": 13322 }, { "epoch": 1.6948225416613663, "ewc_loss": 0.029981106519699097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9981107218191028e-05, "grad_norm": 17.76409149169922, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8870202898979187, "num_tokens": 508200947.0, "step": 13323 }, { "epoch": 1.6949497519399568, "ewc_loss": 0.030043838545680046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0043838705751114e-05, "grad_norm": 17.80946922302246, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8561966419219971, "num_tokens": 508246457.0, "step": 13324 }, { "epoch": 1.6950769622185473, "ewc_loss": 0.030048226937651634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0048226108192466e-05, "grad_norm": 17.89218521118164, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8648731708526611, "num_tokens": 508281951.0, "step": 13325 }, { "epoch": 1.6952041724971378, "ewc_loss": 0.03003559075295925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0035591407795437e-05, "grad_norm": 17.758581161499023, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8521418571472168, "num_tokens": 508322571.0, "step": 13326 }, { "epoch": 1.6953313827757284, "ewc_loss": 0.0300382599234581, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.003825986525044e-05, "grad_norm": 17.85153579711914, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8758713603019714, "num_tokens": 508357347.0, "step": 13327 }, { "epoch": 1.695458593054319, "ewc_loss": 0.030067846179008484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0067845727899112e-05, "grad_norm": 17.745092391967773, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.871300995349884, "num_tokens": 508401813.0, "step": 13328 }, { "epoch": 1.6955858033329094, "ewc_loss": 0.029997292906045914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.999729258590378e-05, "grad_norm": 17.81267738342285, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.858832597732544, "num_tokens": 508440457.0, "step": 13329 }, { "epoch": 1.6957130136115, "ewc_loss": 0.030111510306596756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.011151056853123e-05, "grad_norm": 17.870134353637695, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8602606654167175, "num_tokens": 508472272.0, "step": 13330 }, { "epoch": 1.6958402238900905, "ewc_loss": 0.030069973319768906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.006997394550126e-05, "grad_norm": 17.75598907470703, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.881632924079895, "num_tokens": 508509352.0, "step": 13331 }, { "epoch": 1.6959674341686808, "ewc_loss": 0.030010204762220383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.001020559167955e-05, "grad_norm": 17.890256881713867, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8783494234085083, "num_tokens": 508552515.0, "step": 13332 }, { "epoch": 1.6960946444472713, "ewc_loss": 0.030087510123848915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0087510822340846e-05, "grad_norm": 17.74423599243164, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.8825817108154297, "num_tokens": 508591399.0, "step": 13333 }, { "epoch": 1.6962218547258618, "ewc_loss": 0.030001316219568253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0001316190464422e-05, "grad_norm": 17.887956619262695, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8743947744369507, "num_tokens": 508630427.0, "step": 13334 }, { "epoch": 1.6963490650044524, "ewc_loss": 0.0300816148519516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0081615477683954e-05, "grad_norm": 17.782917022705078, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8772907257080078, "num_tokens": 508667469.0, "step": 13335 }, { "epoch": 1.696476275283043, "ewc_loss": 0.030000755563378334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.000075594172813e-05, "grad_norm": 17.7854061126709, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8725699186325073, "num_tokens": 508706308.0, "step": 13336 }, { "epoch": 1.6966034855616334, "ewc_loss": 0.030129142105579376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.01291420328198e-05, "grad_norm": 17.88545799255371, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8572891354560852, "num_tokens": 508738968.0, "step": 13337 }, { "epoch": 1.6967306958402237, "ewc_loss": 0.030051132664084435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0051132853259332e-05, "grad_norm": 17.77296257019043, "learning_rate": 1e-06, "loss": 0.384, "mean_token_accuracy": 0.875670313835144, "num_tokens": 508769879.0, "step": 13338 }, { "epoch": 1.6968579061188143, "ewc_loss": 0.030070127919316292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0070128559600562e-05, "grad_norm": 17.91669464111328, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8532727956771851, "num_tokens": 508812277.0, "step": 13339 }, { "epoch": 1.6969851163974048, "ewc_loss": 0.030125796794891357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.012579691130668e-05, "grad_norm": 17.843595504760742, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8598541021347046, "num_tokens": 508849293.0, "step": 13340 }, { "epoch": 1.6971123266759953, "ewc_loss": 0.0299663282930851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9966327929287218e-05, "grad_norm": 17.80110740661621, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8704493045806885, "num_tokens": 508885798.0, "step": 13341 }, { "epoch": 1.6972395369545858, "ewc_loss": 0.030072014778852463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.007201485161204e-05, "grad_norm": 17.758468627929688, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8673343658447266, "num_tokens": 508920491.0, "step": 13342 }, { "epoch": 1.6973667472331764, "ewc_loss": 0.03003069758415222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.00306983262999e-05, "grad_norm": 17.835338592529297, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8693439960479736, "num_tokens": 508956161.0, "step": 13343 }, { "epoch": 1.697493957511767, "ewc_loss": 0.030095506459474564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0095507099758834e-05, "grad_norm": 17.730846405029297, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8661664724349976, "num_tokens": 508995696.0, "step": 13344 }, { "epoch": 1.6976211677903574, "ewc_loss": 0.03008187562227249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.008187559316866e-05, "grad_norm": 17.858901977539062, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8741273880004883, "num_tokens": 509029505.0, "step": 13345 }, { "epoch": 1.697748378068948, "ewc_loss": 0.03006962686777115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0069626518525183e-05, "grad_norm": 17.78845977783203, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8772444725036621, "num_tokens": 509062815.0, "step": 13346 }, { "epoch": 1.6978755883475385, "ewc_loss": 0.030049730092287064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.00497304124292e-05, "grad_norm": 17.787233352661133, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8753867745399475, "num_tokens": 509103376.0, "step": 13347 }, { "epoch": 1.698002798626129, "ewc_loss": 0.03006145916879177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0061459256103262e-05, "grad_norm": 17.855606079101562, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.8801532983779907, "num_tokens": 509142797.0, "step": 13348 }, { "epoch": 1.6981300089047195, "ewc_loss": 0.030043458566069603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0043458536965773e-05, "grad_norm": 17.78229522705078, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8585739135742188, "num_tokens": 509182724.0, "step": 13349 }, { "epoch": 1.69825721918331, "ewc_loss": 0.030029281973838806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0029281333554536e-05, "grad_norm": 17.765838623046875, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8788982033729553, "num_tokens": 509217856.0, "step": 13350 }, { "epoch": 1.6983844294619006, "ewc_loss": 0.030067820101976395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0067820262047462e-05, "grad_norm": 17.790559768676758, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8702216744422913, "num_tokens": 509251581.0, "step": 13351 }, { "epoch": 1.6985116397404911, "ewc_loss": 0.030102605000138283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.010260479641147e-05, "grad_norm": 17.801584243774414, "learning_rate": 1e-06, "loss": 0.3543, "mean_token_accuracy": 0.8857619166374207, "num_tokens": 509286687.0, "step": 13352 }, { "epoch": 1.6986388500190817, "ewc_loss": 0.030063370242714882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.006337101396639e-05, "grad_norm": 17.70787811279297, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8645293116569519, "num_tokens": 509323372.0, "step": 13353 }, { "epoch": 1.6987660602976722, "ewc_loss": 0.03010903298854828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.01090331049636e-05, "grad_norm": 17.825170516967773, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.860939085483551, "num_tokens": 509362472.0, "step": 13354 }, { "epoch": 1.6988932705762627, "ewc_loss": 0.03014875389635563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.014875437656883e-05, "grad_norm": 17.797470092773438, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8701043128967285, "num_tokens": 509397676.0, "step": 13355 }, { "epoch": 1.699020480854853, "ewc_loss": 0.03002025932073593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.002025914611295e-05, "grad_norm": 17.752967834472656, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8730270266532898, "num_tokens": 509441100.0, "step": 13356 }, { "epoch": 1.6991476911334435, "ewc_loss": 0.030117092654109, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0117093047010712e-05, "grad_norm": 17.78802490234375, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8650476932525635, "num_tokens": 509485430.0, "step": 13357 }, { "epoch": 1.699274901412034, "ewc_loss": 0.030065171420574188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.00651718134759e-05, "grad_norm": 17.758018493652344, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8401328325271606, "num_tokens": 509520102.0, "step": 13358 }, { "epoch": 1.6994021116906246, "ewc_loss": 0.030112624168395996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.01126237900462e-05, "grad_norm": 17.822641372680664, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8706941604614258, "num_tokens": 509556525.0, "step": 13359 }, { "epoch": 1.6995293219692151, "ewc_loss": 0.03005826473236084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0058265110710636e-05, "grad_norm": 17.751829147338867, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8632194399833679, "num_tokens": 509596899.0, "step": 13360 }, { "epoch": 1.6996565322478057, "ewc_loss": 0.03009062074124813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.009062129422091e-05, "grad_norm": 17.911401748657227, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8698400855064392, "num_tokens": 509632125.0, "step": 13361 }, { "epoch": 1.699783742526396, "ewc_loss": 0.030120521783828735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0120521842036396e-05, "grad_norm": 17.732280731201172, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8657508492469788, "num_tokens": 509668131.0, "step": 13362 }, { "epoch": 1.6999109528049865, "ewc_loss": 0.03010215424001217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.010215368703939e-05, "grad_norm": 17.90500831604004, "learning_rate": 1e-06, "loss": 0.3655, "mean_token_accuracy": 0.8825027346611023, "num_tokens": 509704078.0, "step": 13363 }, { "epoch": 1.700038163083577, "ewc_loss": 0.03010603040456772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0106029953458346e-05, "grad_norm": 17.814516067504883, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.8824633955955505, "num_tokens": 509739464.0, "step": 13364 }, { "epoch": 1.7001653733621676, "ewc_loss": 0.030016906559467316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0016906748642214e-05, "grad_norm": 17.808679580688477, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8686037659645081, "num_tokens": 509777970.0, "step": 13365 }, { "epoch": 1.700292583640758, "ewc_loss": 0.030126187950372696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.012618799402844e-05, "grad_norm": 17.852191925048828, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8558362126350403, "num_tokens": 509817010.0, "step": 13366 }, { "epoch": 1.7004197939193486, "ewc_loss": 0.0300232395529747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0023238650755957e-05, "grad_norm": 17.78206443786621, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8749555349349976, "num_tokens": 509854192.0, "step": 13367 }, { "epoch": 1.7005470041979391, "ewc_loss": 0.030083848163485527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0083847377682105e-05, "grad_norm": 17.857410430908203, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8832760453224182, "num_tokens": 509888633.0, "step": 13368 }, { "epoch": 1.7006742144765297, "ewc_loss": 0.030034616589546204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0034616429475136e-05, "grad_norm": 17.806554794311523, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8710229396820068, "num_tokens": 509928474.0, "step": 13369 }, { "epoch": 1.7008014247551202, "ewc_loss": 0.03002956509590149, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.002956509590149e-05, "grad_norm": 17.809335708618164, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8616914749145508, "num_tokens": 509968875.0, "step": 13370 }, { "epoch": 1.7009286350337107, "ewc_loss": 0.030071696266531944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.007169652846642e-05, "grad_norm": 17.80883026123047, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8700931668281555, "num_tokens": 510006021.0, "step": 13371 }, { "epoch": 1.7010558453123013, "ewc_loss": 0.030058082193136215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0058081392780878e-05, "grad_norm": 17.891754150390625, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8747832775115967, "num_tokens": 510042111.0, "step": 13372 }, { "epoch": 1.7011830555908918, "ewc_loss": 0.030052345246076584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0052346119191498e-05, "grad_norm": 17.877798080444336, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8756914734840393, "num_tokens": 510074894.0, "step": 13373 }, { "epoch": 1.7013102658694823, "ewc_loss": 0.030037565156817436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0037565011298284e-05, "grad_norm": 17.82733154296875, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8617830276489258, "num_tokens": 510111516.0, "step": 13374 }, { "epoch": 1.7014374761480728, "ewc_loss": 0.02997271902859211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9972719858051278e-05, "grad_norm": 17.798398971557617, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8722133636474609, "num_tokens": 510151549.0, "step": 13375 }, { "epoch": 1.7015646864266634, "ewc_loss": 0.030000261962413788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0000261176610366e-05, "grad_norm": 17.7781925201416, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8697910904884338, "num_tokens": 510189215.0, "step": 13376 }, { "epoch": 1.701691896705254, "ewc_loss": 0.030077755451202393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.007775558216963e-05, "grad_norm": 17.84703254699707, "learning_rate": 1e-06, "loss": 0.3465, "mean_token_accuracy": 0.8893466591835022, "num_tokens": 510231189.0, "step": 13377 }, { "epoch": 1.7018191069838444, "ewc_loss": 0.030058372765779495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0058372431085445e-05, "grad_norm": 17.786527633666992, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8742191195487976, "num_tokens": 510273956.0, "step": 13378 }, { "epoch": 1.701946317262435, "ewc_loss": 0.029928918927907944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9928918593213893e-05, "grad_norm": 17.79300308227539, "learning_rate": 1e-06, "loss": 0.3553, "mean_token_accuracy": 0.8857100009918213, "num_tokens": 510315095.0, "step": 13379 }, { "epoch": 1.7020735275410255, "ewc_loss": 0.030056755989789963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0056755349505693e-05, "grad_norm": 17.92069435119629, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8488702774047852, "num_tokens": 510350753.0, "step": 13380 }, { "epoch": 1.7022007378196158, "ewc_loss": 0.0300019308924675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.000193100888282e-05, "grad_norm": 17.78101348876953, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8756948113441467, "num_tokens": 510389257.0, "step": 13381 }, { "epoch": 1.7023279480982063, "ewc_loss": 0.029917534440755844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9917535357526504e-05, "grad_norm": 17.805421829223633, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8712162971496582, "num_tokens": 510428879.0, "step": 13382 }, { "epoch": 1.7024551583767968, "ewc_loss": 0.030042672529816628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.004267273354344e-05, "grad_norm": 17.850685119628906, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8734092712402344, "num_tokens": 510468409.0, "step": 13383 }, { "epoch": 1.7025823686553874, "ewc_loss": 0.029988568276166916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9988568712724373e-05, "grad_norm": 17.772457122802734, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.8844144940376282, "num_tokens": 510503988.0, "step": 13384 }, { "epoch": 1.702709578933978, "ewc_loss": 0.02996300905942917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9963008273625746e-05, "grad_norm": 17.8303165435791, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.868409276008606, "num_tokens": 510539009.0, "step": 13385 }, { "epoch": 1.7028367892125684, "ewc_loss": 0.030002323910593987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0002323910593987e-05, "grad_norm": 17.876829147338867, "learning_rate": 1e-06, "loss": 0.3596, "mean_token_accuracy": 0.8847146034240723, "num_tokens": 510576177.0, "step": 13386 }, { "epoch": 1.7029639994911587, "ewc_loss": 0.029996756464242935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9996755984029733e-05, "grad_norm": 17.823774337768555, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8695770502090454, "num_tokens": 510622194.0, "step": 13387 }, { "epoch": 1.7030912097697493, "ewc_loss": 0.029964566230773926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9964567147544585e-05, "grad_norm": 17.82390594482422, "learning_rate": 1e-06, "loss": 0.3534, "mean_token_accuracy": 0.8861732482910156, "num_tokens": 510663564.0, "step": 13388 }, { "epoch": 1.7032184200483398, "ewc_loss": 0.029939355328679085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.993935595441144e-05, "grad_norm": 17.804536819458008, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.8807660937309265, "num_tokens": 510702153.0, "step": 13389 }, { "epoch": 1.7033456303269303, "ewc_loss": 0.029979348182678223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.99793482554378e-05, "grad_norm": 17.792118072509766, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8666679859161377, "num_tokens": 510742812.0, "step": 13390 }, { "epoch": 1.7034728406055208, "ewc_loss": 0.02992570772767067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9925708076916635e-05, "grad_norm": 17.847793579101562, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8654792904853821, "num_tokens": 510783363.0, "step": 13391 }, { "epoch": 1.7036000508841114, "ewc_loss": 0.030006717890501022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0006718588992953e-05, "grad_norm": 17.885364532470703, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8562138080596924, "num_tokens": 510818362.0, "step": 13392 }, { "epoch": 1.703727261162702, "ewc_loss": 0.029958199709653854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.995819886564277e-05, "grad_norm": 17.86423683166504, "learning_rate": 1e-06, "loss": 0.3642, "mean_token_accuracy": 0.8821045756340027, "num_tokens": 510854732.0, "step": 13393 }, { "epoch": 1.7038544714412924, "ewc_loss": 0.029952721670269966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9952721888548695e-05, "grad_norm": 17.91092300415039, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8634294271469116, "num_tokens": 510895107.0, "step": 13394 }, { "epoch": 1.703981681719883, "ewc_loss": 0.029964132234454155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9964132409077138e-05, "grad_norm": 17.839153289794922, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8674225807189941, "num_tokens": 510930737.0, "step": 13395 }, { "epoch": 1.7041088919984735, "ewc_loss": 0.02990776114165783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.990776192746125e-05, "grad_norm": 17.84218406677246, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.8777298927307129, "num_tokens": 510964646.0, "step": 13396 }, { "epoch": 1.704236102277064, "ewc_loss": 0.029932942241430283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9932942197774537e-05, "grad_norm": 17.869150161743164, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8739116191864014, "num_tokens": 511000583.0, "step": 13397 }, { "epoch": 1.7043633125556545, "ewc_loss": 0.029928674921393394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.992867484863382e-05, "grad_norm": 17.874486923217773, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8724789023399353, "num_tokens": 511033729.0, "step": 13398 }, { "epoch": 1.704490522834245, "ewc_loss": 0.029955662786960602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9955663194414228e-05, "grad_norm": 17.827999114990234, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8752617835998535, "num_tokens": 511074099.0, "step": 13399 }, { "epoch": 1.7046177331128356, "ewc_loss": 0.029923975467681885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9923974580015056e-05, "grad_norm": 17.83673667907715, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8574250340461731, "num_tokens": 511112473.0, "step": 13400 }, { "epoch": 1.7047449433914261, "ewc_loss": 0.02998216450214386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.9982164051034488e-05, "grad_norm": 17.78148078918457, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8703990578651428, "num_tokens": 511149839.0, "step": 13401 }, { "epoch": 1.7048721536700167, "ewc_loss": 0.029992636293172836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 2.99926359730307e-05, "grad_norm": 17.85297203063965, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8598743081092834, "num_tokens": 511183054.0, "step": 13402 }, { "epoch": 1.7049993639486072, "ewc_loss": 0.030077526345849037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0077526389504783e-05, "grad_norm": 17.887348175048828, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8740358948707581, "num_tokens": 511218498.0, "step": 13403 }, { "epoch": 1.7051265742271977, "ewc_loss": 0.03000197559595108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.000197648361791e-05, "grad_norm": 17.761045455932617, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8661395907402039, "num_tokens": 511259776.0, "step": 13404 }, { "epoch": 1.705253784505788, "ewc_loss": 0.030062489211559296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.006248880410567e-05, "grad_norm": 17.81415557861328, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8770124912261963, "num_tokens": 511297331.0, "step": 13405 }, { "epoch": 1.7053809947843785, "ewc_loss": 0.03005712851881981, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.005712824233342e-05, "grad_norm": 17.78146743774414, "learning_rate": 1e-06, "loss": 0.3652, "mean_token_accuracy": 0.8821383714675903, "num_tokens": 511331055.0, "step": 13406 }, { "epoch": 1.705508205062969, "ewc_loss": 0.030041517689824104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.004151767527219e-05, "grad_norm": 17.781938552856445, "learning_rate": 1e-06, "loss": 0.3516, "mean_token_accuracy": 0.8881679773330688, "num_tokens": 511368651.0, "step": 13407 }, { "epoch": 1.7056354153415596, "ewc_loss": 0.03014386259019375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0143863114062697e-05, "grad_norm": 17.874187469482422, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.8750859498977661, "num_tokens": 511402163.0, "step": 13408 }, { "epoch": 1.7057626256201501, "ewc_loss": 0.030092738568782806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0092738597886637e-05, "grad_norm": 17.803979873657227, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.863949716091156, "num_tokens": 511443769.0, "step": 13409 }, { "epoch": 1.7058898358987407, "ewc_loss": 0.03008333034813404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0083330784691498e-05, "grad_norm": 17.85508918762207, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8681641221046448, "num_tokens": 511481038.0, "step": 13410 }, { "epoch": 1.706017046177331, "ewc_loss": 0.030089667066931725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0089666324784048e-05, "grad_norm": 17.78896141052246, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8723258972167969, "num_tokens": 511522940.0, "step": 13411 }, { "epoch": 1.7061442564559215, "ewc_loss": 0.03007267601788044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0072675144765526e-05, "grad_norm": 17.861194610595703, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8670520782470703, "num_tokens": 511562630.0, "step": 13412 }, { "epoch": 1.706271466734512, "ewc_loss": 0.030097756534814835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.009775718965102e-05, "grad_norm": 17.85748291015625, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8561109900474548, "num_tokens": 511598986.0, "step": 13413 }, { "epoch": 1.7063986770131025, "ewc_loss": 0.030095573514699936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0095574402366765e-05, "grad_norm": 17.889440536499023, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8721061944961548, "num_tokens": 511633582.0, "step": 13414 }, { "epoch": 1.706525887291693, "ewc_loss": 0.030028726905584335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0028726541786455e-05, "grad_norm": 17.76215934753418, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8703112006187439, "num_tokens": 511673296.0, "step": 13415 }, { "epoch": 1.7066530975702836, "ewc_loss": 0.030054017901420593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0054017770453356e-05, "grad_norm": 17.85194206237793, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8622633218765259, "num_tokens": 511707890.0, "step": 13416 }, { "epoch": 1.7067803078488741, "ewc_loss": 0.030151227489113808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0151228202157654e-05, "grad_norm": 17.853229522705078, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8759828209877014, "num_tokens": 511741597.0, "step": 13417 }, { "epoch": 1.7069075181274647, "ewc_loss": 0.0300542451441288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.00542451441288e-05, "grad_norm": 17.794557571411133, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8741012811660767, "num_tokens": 511776816.0, "step": 13418 }, { "epoch": 1.7070347284060552, "ewc_loss": 0.03010368160903454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.010368163813837e-05, "grad_norm": 17.85332679748535, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8639638423919678, "num_tokens": 511814866.0, "step": 13419 }, { "epoch": 1.7071619386846457, "ewc_loss": 0.0300829466432333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.008294697792735e-05, "grad_norm": 17.736204147338867, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8719015121459961, "num_tokens": 511852458.0, "step": 13420 }, { "epoch": 1.7072891489632362, "ewc_loss": 0.030096638947725296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.009663851116784e-05, "grad_norm": 17.85054588317871, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8648034334182739, "num_tokens": 511891201.0, "step": 13421 }, { "epoch": 1.7074163592418268, "ewc_loss": 0.030214514583349228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.021451448148582e-05, "grad_norm": 17.791337966918945, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.869805097579956, "num_tokens": 511930862.0, "step": 13422 }, { "epoch": 1.7075435695204173, "ewc_loss": 0.030103519558906555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0103519748081453e-05, "grad_norm": 17.786149978637695, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8775382041931152, "num_tokens": 511967695.0, "step": 13423 }, { "epoch": 1.7076707797990078, "ewc_loss": 0.030141299590468407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.01413001579931e-05, "grad_norm": 17.877716064453125, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8840416669845581, "num_tokens": 512005976.0, "step": 13424 }, { "epoch": 1.7077979900775984, "ewc_loss": 0.030131936073303223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0131936000543647e-05, "grad_norm": 17.83905601501465, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.871484637260437, "num_tokens": 512041575.0, "step": 13425 }, { "epoch": 1.7079252003561889, "ewc_loss": 0.03014662116765976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0146620701998472e-05, "grad_norm": 17.80583953857422, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8666173219680786, "num_tokens": 512076171.0, "step": 13426 }, { "epoch": 1.7080524106347794, "ewc_loss": 0.030124925076961517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.012492561538238e-05, "grad_norm": 17.844858169555664, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8668296933174133, "num_tokens": 512112082.0, "step": 13427 }, { "epoch": 1.70817962091337, "ewc_loss": 0.03015569970011711, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.015569927811157e-05, "grad_norm": 17.844514846801758, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8562989830970764, "num_tokens": 512149126.0, "step": 13428 }, { "epoch": 1.7083068311919605, "ewc_loss": 0.03012513369321823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0125132980174385e-05, "grad_norm": 17.877920150756836, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8721740245819092, "num_tokens": 512182368.0, "step": 13429 }, { "epoch": 1.7084340414705508, "ewc_loss": 0.03014829009771347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0148290534270927e-05, "grad_norm": 17.95512580871582, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8509288430213928, "num_tokens": 512218224.0, "step": 13430 }, { "epoch": 1.7085612517491413, "ewc_loss": 0.030110472813248634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0110471925581805e-05, "grad_norm": 17.880022048950195, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8694406747817993, "num_tokens": 512255523.0, "step": 13431 }, { "epoch": 1.7086884620277318, "ewc_loss": 0.030100859701633453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.010086038557347e-05, "grad_norm": 17.81397819519043, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.8797954320907593, "num_tokens": 512290514.0, "step": 13432 }, { "epoch": 1.7088156723063224, "ewc_loss": 0.03006841614842415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0068416890571825e-05, "grad_norm": 17.85210609436035, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8799474239349365, "num_tokens": 512328468.0, "step": 13433 }, { "epoch": 1.708942882584913, "ewc_loss": 0.0301212165504694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.012121669598855e-05, "grad_norm": 17.918630599975586, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8703770637512207, "num_tokens": 512359786.0, "step": 13434 }, { "epoch": 1.7090700928635034, "ewc_loss": 0.03009357489645481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0093575333012268e-05, "grad_norm": 17.900724411010742, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8492908477783203, "num_tokens": 512395679.0, "step": 13435 }, { "epoch": 1.7091973031420937, "ewc_loss": 0.030046597123146057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.004659629368689e-05, "grad_norm": 17.823719024658203, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8693413734436035, "num_tokens": 512433491.0, "step": 13436 }, { "epoch": 1.7093245134206843, "ewc_loss": 0.03011191263794899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0111912565189414e-05, "grad_norm": 17.8670597076416, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8644976019859314, "num_tokens": 512470215.0, "step": 13437 }, { "epoch": 1.7094517236992748, "ewc_loss": 0.030120188370347023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0120188966975547e-05, "grad_norm": 17.85914421081543, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.865200936794281, "num_tokens": 512507055.0, "step": 13438 }, { "epoch": 1.7095789339778653, "ewc_loss": 0.030092209577560425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0092209271970205e-05, "grad_norm": 17.805675506591797, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8524062037467957, "num_tokens": 512547877.0, "step": 13439 }, { "epoch": 1.7097061442564558, "ewc_loss": 0.03011324256658554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0113242246443406e-05, "grad_norm": 17.877710342407227, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8560590744018555, "num_tokens": 512586558.0, "step": 13440 }, { "epoch": 1.7098333545350464, "ewc_loss": 0.030164822936058044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0164823328959756e-05, "grad_norm": 17.91236114501953, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8671994805335999, "num_tokens": 512631181.0, "step": 13441 }, { "epoch": 1.709960564813637, "ewc_loss": 0.030074497684836388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.007449777214788e-05, "grad_norm": 17.78504180908203, "learning_rate": 1e-06, "loss": 0.3694, "mean_token_accuracy": 0.8796148300170898, "num_tokens": 512667925.0, "step": 13442 }, { "epoch": 1.7100877750922274, "ewc_loss": 0.030069366097450256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0069366403040476e-05, "grad_norm": 17.866838455200195, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8517731428146362, "num_tokens": 512703136.0, "step": 13443 }, { "epoch": 1.710214985370818, "ewc_loss": 0.030183250084519386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0183249691617675e-05, "grad_norm": 17.789762496948242, "learning_rate": 1e-06, "loss": 0.3995, "mean_token_accuracy": 0.872289776802063, "num_tokens": 512739001.0, "step": 13444 }, { "epoch": 1.7103421956494085, "ewc_loss": 0.03011024370789528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0110244551906362e-05, "grad_norm": 17.880352020263672, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.873702883720398, "num_tokens": 512780517.0, "step": 13445 }, { "epoch": 1.710469405927999, "ewc_loss": 0.03015521541237831, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0155215426930226e-05, "grad_norm": 17.869291305541992, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8762390613555908, "num_tokens": 512815964.0, "step": 13446 }, { "epoch": 1.7105966162065895, "ewc_loss": 0.030084149911999702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0084149329923093e-05, "grad_norm": 17.794639587402344, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.8729466199874878, "num_tokens": 512855451.0, "step": 13447 }, { "epoch": 1.71072382648518, "ewc_loss": 0.03007930889725685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0079308999120258e-05, "grad_norm": 17.875324249267578, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8729879856109619, "num_tokens": 512895809.0, "step": 13448 }, { "epoch": 1.7108510367637706, "ewc_loss": 0.030140282586216927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.014028334291652e-05, "grad_norm": 17.79378318786621, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.874835729598999, "num_tokens": 512936424.0, "step": 13449 }, { "epoch": 1.7109782470423611, "ewc_loss": 0.030059780925512314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.005978032888379e-05, "grad_norm": 17.80698585510254, "learning_rate": 1e-06, "loss": 0.3589, "mean_token_accuracy": 0.8837454319000244, "num_tokens": 512976486.0, "step": 13450 }, { "epoch": 1.7111054573209517, "ewc_loss": 0.030116276815533638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.011627632076852e-05, "grad_norm": 17.802064895629883, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8613278865814209, "num_tokens": 513014131.0, "step": 13451 }, { "epoch": 1.7112326675995422, "ewc_loss": 0.03014703094959259, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0147031793603674e-05, "grad_norm": 17.90223503112793, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8423867225646973, "num_tokens": 513054706.0, "step": 13452 }, { "epoch": 1.7113598778781327, "ewc_loss": 0.03005296178162098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0052960937609896e-05, "grad_norm": 17.80068588256836, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8747310638427734, "num_tokens": 513094089.0, "step": 13453 }, { "epoch": 1.711487088156723, "ewc_loss": 0.03008771874010563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.008771818713285e-05, "grad_norm": 17.824525833129883, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8678325414657593, "num_tokens": 513136306.0, "step": 13454 }, { "epoch": 1.7116142984353135, "ewc_loss": 0.03019256703555584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0192566555342637e-05, "grad_norm": 17.892358779907227, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8662729859352112, "num_tokens": 513174376.0, "step": 13455 }, { "epoch": 1.711741508713904, "ewc_loss": 0.03000963106751442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.000963079102803e-05, "grad_norm": 17.79554557800293, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8729325532913208, "num_tokens": 513211834.0, "step": 13456 }, { "epoch": 1.7118687189924946, "ewc_loss": 0.03012489341199398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0124892873573117e-05, "grad_norm": 17.92182731628418, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.860768735408783, "num_tokens": 513241765.0, "step": 13457 }, { "epoch": 1.7119959292710851, "ewc_loss": 0.030064668506383896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0064667953411117e-05, "grad_norm": 17.81796646118164, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8568494915962219, "num_tokens": 513283510.0, "step": 13458 }, { "epoch": 1.7121231395496757, "ewc_loss": 0.030077438801527023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0077439078013413e-05, "grad_norm": 17.89820671081543, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8662291765213013, "num_tokens": 513320746.0, "step": 13459 }, { "epoch": 1.712250349828266, "ewc_loss": 0.030073704198002815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0073704692767933e-05, "grad_norm": 17.82866668701172, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8665947318077087, "num_tokens": 513358036.0, "step": 13460 }, { "epoch": 1.7123775601068565, "ewc_loss": 0.0300950538367033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.009505417139735e-05, "grad_norm": 17.867023468017578, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8747468590736389, "num_tokens": 513395156.0, "step": 13461 }, { "epoch": 1.712504770385447, "ewc_loss": 0.030092885717749596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0092885936028324e-05, "grad_norm": 17.78012466430664, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8638455271720886, "num_tokens": 513438631.0, "step": 13462 }, { "epoch": 1.7126319806640375, "ewc_loss": 0.03008325956761837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.008325984410476e-05, "grad_norm": 17.930322647094727, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8610833287239075, "num_tokens": 513471225.0, "step": 13463 }, { "epoch": 1.712759190942628, "ewc_loss": 0.030137455090880394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0137454814394005e-05, "grad_norm": 17.873414993286133, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8618054389953613, "num_tokens": 513513824.0, "step": 13464 }, { "epoch": 1.7128864012212186, "ewc_loss": 0.030072126537561417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0072125809965655e-05, "grad_norm": 17.881948471069336, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8631296157836914, "num_tokens": 513556263.0, "step": 13465 }, { "epoch": 1.7130136114998091, "ewc_loss": 0.030147040262818336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.014704088855069e-05, "grad_norm": 17.79891586303711, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8728629350662231, "num_tokens": 513595093.0, "step": 13466 }, { "epoch": 1.7131408217783997, "ewc_loss": 0.030082140117883682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0082139346632175e-05, "grad_norm": 17.910480499267578, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8620970845222473, "num_tokens": 513633761.0, "step": 13467 }, { "epoch": 1.7132680320569902, "ewc_loss": 0.030174970626831055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0174971470842138e-05, "grad_norm": 17.774843215942383, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8659482598304749, "num_tokens": 513671404.0, "step": 13468 }, { "epoch": 1.7133952423355807, "ewc_loss": 0.030108245089650154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0108245482551865e-05, "grad_norm": 17.897029876708984, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8684245347976685, "num_tokens": 513713752.0, "step": 13469 }, { "epoch": 1.7135224526141712, "ewc_loss": 0.03020520694553852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0205206712707877e-05, "grad_norm": 17.88848304748535, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8658133149147034, "num_tokens": 513755261.0, "step": 13470 }, { "epoch": 1.7136496628927618, "ewc_loss": 0.03009987249970436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0099872674327344e-05, "grad_norm": 17.844146728515625, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8740684390068054, "num_tokens": 513799042.0, "step": 13471 }, { "epoch": 1.7137768731713523, "ewc_loss": 0.030140789225697517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0140789021970704e-05, "grad_norm": 17.896709442138672, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8597751259803772, "num_tokens": 513838148.0, "step": 13472 }, { "epoch": 1.7139040834499428, "ewc_loss": 0.030154110863804817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0154111300362274e-05, "grad_norm": 17.8424072265625, "learning_rate": 1e-06, "loss": 0.3893, "mean_token_accuracy": 0.8749163150787354, "num_tokens": 513878960.0, "step": 13473 }, { "epoch": 1.7140312937285334, "ewc_loss": 0.030086098238825798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.008609746757429e-05, "grad_norm": 17.896793365478516, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8704841732978821, "num_tokens": 513914083.0, "step": 13474 }, { "epoch": 1.7141585040071239, "ewc_loss": 0.030128872022032738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0128872822388075e-05, "grad_norm": 17.803205490112305, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8586004972457886, "num_tokens": 513951591.0, "step": 13475 }, { "epoch": 1.7142857142857144, "ewc_loss": 0.03008442372083664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.008442399732303e-05, "grad_norm": 17.872182846069336, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8563187122344971, "num_tokens": 513986127.0, "step": 13476 }, { "epoch": 1.714412924564305, "ewc_loss": 0.03015209175646305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0152092222124338e-05, "grad_norm": 17.80824089050293, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8648039102554321, "num_tokens": 514028248.0, "step": 13477 }, { "epoch": 1.7145401348428955, "ewc_loss": 0.0301045048981905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0104505640338175e-05, "grad_norm": 17.807329177856445, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8729202151298523, "num_tokens": 514062757.0, "step": 13478 }, { "epoch": 1.7146673451214858, "ewc_loss": 0.030157428234815598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.015742913703434e-05, "grad_norm": 17.872739791870117, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8561617136001587, "num_tokens": 514102635.0, "step": 13479 }, { "epoch": 1.7147945554000763, "ewc_loss": 0.030167562887072563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0167562727001496e-05, "grad_norm": 17.87880516052246, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8684073090553284, "num_tokens": 514140883.0, "step": 13480 }, { "epoch": 1.7149217656786668, "ewc_loss": 0.0300651453435421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0065144528634846e-05, "grad_norm": 17.859373092651367, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8774114847183228, "num_tokens": 514176536.0, "step": 13481 }, { "epoch": 1.7150489759572574, "ewc_loss": 0.03012312762439251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0123128453851677e-05, "grad_norm": 17.920488357543945, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.863451361656189, "num_tokens": 514209969.0, "step": 13482 }, { "epoch": 1.7151761862358479, "ewc_loss": 0.030104197561740875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0104198231128976e-05, "grad_norm": 17.8758544921875, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8683508634567261, "num_tokens": 514249679.0, "step": 13483 }, { "epoch": 1.7153033965144384, "ewc_loss": 0.030101165175437927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0101165975793265e-05, "grad_norm": 17.822282791137695, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8629906177520752, "num_tokens": 514292719.0, "step": 13484 }, { "epoch": 1.7154306067930287, "ewc_loss": 0.030085870996117592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.008587191288825e-05, "grad_norm": 17.88112449645996, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8633987903594971, "num_tokens": 514323396.0, "step": 13485 }, { "epoch": 1.7155578170716193, "ewc_loss": 0.03016713634133339, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0167137083481066e-05, "grad_norm": 17.810935974121094, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.874366044998169, "num_tokens": 514360951.0, "step": 13486 }, { "epoch": 1.7156850273502098, "ewc_loss": 0.030048362910747528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0048362532397732e-05, "grad_norm": 17.874591827392578, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8528722524642944, "num_tokens": 514403879.0, "step": 13487 }, { "epoch": 1.7158122376288003, "ewc_loss": 0.030159585177898407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0159584639477544e-05, "grad_norm": 17.816801071166992, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8767869472503662, "num_tokens": 514446628.0, "step": 13488 }, { "epoch": 1.7159394479073908, "ewc_loss": 0.03003830462694168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0038305339985527e-05, "grad_norm": 17.79258155822754, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8698756694793701, "num_tokens": 514486132.0, "step": 13489 }, { "epoch": 1.7160666581859814, "ewc_loss": 0.03014489822089672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0144898119033314e-05, "grad_norm": 17.830514907836914, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8583473563194275, "num_tokens": 514526384.0, "step": 13490 }, { "epoch": 1.716193868464572, "ewc_loss": 0.030107056722044945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0107055863481946e-05, "grad_norm": 17.855260848999023, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8669205904006958, "num_tokens": 514565828.0, "step": 13491 }, { "epoch": 1.7163210787431624, "ewc_loss": 0.03012813813984394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0128137950669043e-05, "grad_norm": 17.78778648376465, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.863868236541748, "num_tokens": 514605260.0, "step": 13492 }, { "epoch": 1.716448289021753, "ewc_loss": 0.030199283733963966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0199284083209932e-05, "grad_norm": 17.917776107788086, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8702270984649658, "num_tokens": 514643433.0, "step": 13493 }, { "epoch": 1.7165754993003435, "ewc_loss": 0.03021063841879368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0210638215066865e-05, "grad_norm": 17.81094741821289, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8717302680015564, "num_tokens": 514683264.0, "step": 13494 }, { "epoch": 1.716702709578934, "ewc_loss": 0.030107764527201653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.010776526934933e-05, "grad_norm": 17.907100677490234, "learning_rate": 1e-06, "loss": 0.3771, "mean_token_accuracy": 0.8756704330444336, "num_tokens": 514719775.0, "step": 13495 }, { "epoch": 1.7168299198575245, "ewc_loss": 0.030095158144831657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0095157853793353e-05, "grad_norm": 17.715667724609375, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.862634003162384, "num_tokens": 514766287.0, "step": 13496 }, { "epoch": 1.716957130136115, "ewc_loss": 0.030123401433229446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.012340130226221e-05, "grad_norm": 17.835323333740234, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8780797123908997, "num_tokens": 514806725.0, "step": 13497 }, { "epoch": 1.7170843404147056, "ewc_loss": 0.0301731675863266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.017316703335382e-05, "grad_norm": 17.806875228881836, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8713414072990417, "num_tokens": 514844008.0, "step": 13498 }, { "epoch": 1.7172115506932961, "ewc_loss": 0.0301644466817379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.016444679815322e-05, "grad_norm": 17.861732482910156, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8575809001922607, "num_tokens": 514880356.0, "step": 13499 }, { "epoch": 1.7173387609718866, "ewc_loss": 0.030068736523389816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.006873703270685e-05, "grad_norm": 17.773555755615234, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.8796489238739014, "num_tokens": 514915309.0, "step": 13500 }, { "epoch": 1.7174659712504772, "ewc_loss": 0.030118428170681, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0118428185232915e-05, "grad_norm": 17.845447540283203, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8612271547317505, "num_tokens": 514956496.0, "step": 13501 }, { "epoch": 1.7175931815290677, "ewc_loss": 0.03016449511051178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0164495910867117e-05, "grad_norm": 17.83610725402832, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8694016933441162, "num_tokens": 514992247.0, "step": 13502 }, { "epoch": 1.717720391807658, "ewc_loss": 0.030140284448862076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0140285161905922e-05, "grad_norm": 17.840862274169922, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8607446551322937, "num_tokens": 515035443.0, "step": 13503 }, { "epoch": 1.7178476020862485, "ewc_loss": 0.030169671401381493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0169670935720205e-05, "grad_norm": 17.80299186706543, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8729530572891235, "num_tokens": 515072279.0, "step": 13504 }, { "epoch": 1.717974812364839, "ewc_loss": 0.030108828097581863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0108827559161e-05, "grad_norm": 17.816205978393555, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8681200742721558, "num_tokens": 515112547.0, "step": 13505 }, { "epoch": 1.7181020226434296, "ewc_loss": 0.030160963535308838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.016096343344543e-05, "grad_norm": 17.886510848999023, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8568722009658813, "num_tokens": 515146814.0, "step": 13506 }, { "epoch": 1.7182292329220201, "ewc_loss": 0.030125943943858147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0125944249448366e-05, "grad_norm": 17.79176139831543, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8710649013519287, "num_tokens": 515184393.0, "step": 13507 }, { "epoch": 1.7183564432006107, "ewc_loss": 0.030179675668478012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0179675377439708e-05, "grad_norm": 17.858116149902344, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8609708547592163, "num_tokens": 515225312.0, "step": 13508 }, { "epoch": 1.718483653479201, "ewc_loss": 0.03015221282839775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0152212275424972e-05, "grad_norm": 17.75548553466797, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8462589383125305, "num_tokens": 515261330.0, "step": 13509 }, { "epoch": 1.7186108637577915, "ewc_loss": 0.030175622552633286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0175622669048607e-05, "grad_norm": 17.944639205932617, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8740876913070679, "num_tokens": 515299325.0, "step": 13510 }, { "epoch": 1.718738074036382, "ewc_loss": 0.030193200334906578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.019319956365507e-05, "grad_norm": 17.803674697875977, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8693472146987915, "num_tokens": 515335213.0, "step": 13511 }, { "epoch": 1.7188652843149725, "ewc_loss": 0.030119305476546288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0119304938125424e-05, "grad_norm": 17.920549392700195, "learning_rate": 1e-06, "loss": 0.3682, "mean_token_accuracy": 0.8808804750442505, "num_tokens": 515365699.0, "step": 13512 }, { "epoch": 1.718992494593563, "ewc_loss": 0.030227208510041237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0227209208533168e-05, "grad_norm": 17.83416748046875, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.877877950668335, "num_tokens": 515396025.0, "step": 13513 }, { "epoch": 1.7191197048721536, "ewc_loss": 0.03006919100880623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0069191780057736e-05, "grad_norm": 17.954227447509766, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8685857057571411, "num_tokens": 515429724.0, "step": 13514 }, { "epoch": 1.7192469151507441, "ewc_loss": 0.030220095068216324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.02200951409759e-05, "grad_norm": 17.835208892822266, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8657602071762085, "num_tokens": 515465894.0, "step": 13515 }, { "epoch": 1.7193741254293347, "ewc_loss": 0.03007172793149948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.007172745128628e-05, "grad_norm": 17.8770694732666, "learning_rate": 1e-06, "loss": 0.3504, "mean_token_accuracy": 0.8872442841529846, "num_tokens": 515498894.0, "step": 13516 }, { "epoch": 1.7195013357079252, "ewc_loss": 0.03021148219704628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.021148222615011e-05, "grad_norm": 17.90042495727539, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8684388399124146, "num_tokens": 515538783.0, "step": 13517 }, { "epoch": 1.7196285459865157, "ewc_loss": 0.03010849840939045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0108498322078958e-05, "grad_norm": 17.846555709838867, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8670264482498169, "num_tokens": 515573107.0, "step": 13518 }, { "epoch": 1.7197557562651062, "ewc_loss": 0.03015732765197754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0157327273627743e-05, "grad_norm": 17.909160614013672, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8702634572982788, "num_tokens": 515614901.0, "step": 13519 }, { "epoch": 1.7198829665436968, "ewc_loss": 0.030210111290216446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0210110708139837e-05, "grad_norm": 17.894540786743164, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8722718954086304, "num_tokens": 515654197.0, "step": 13520 }, { "epoch": 1.7200101768222873, "ewc_loss": 0.030118068680167198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0118068025331013e-05, "grad_norm": 17.813121795654297, "learning_rate": 1e-06, "loss": 0.3519, "mean_token_accuracy": 0.887438178062439, "num_tokens": 515697003.0, "step": 13521 }, { "epoch": 1.7201373871008778, "ewc_loss": 0.030135169625282288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.013517016370315e-05, "grad_norm": 17.89767837524414, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.865188717842102, "num_tokens": 515737801.0, "step": 13522 }, { "epoch": 1.7202645973794684, "ewc_loss": 0.030161388218402863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.016138907696586e-05, "grad_norm": 17.931060791015625, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8610793352127075, "num_tokens": 515774711.0, "step": 13523 }, { "epoch": 1.7203918076580589, "ewc_loss": 0.03019017167389393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0190170946298167e-05, "grad_norm": 17.900026321411133, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.861184298992157, "num_tokens": 515811728.0, "step": 13524 }, { "epoch": 1.7205190179366494, "ewc_loss": 0.030097514390945435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0097515264060348e-05, "grad_norm": 17.8448543548584, "learning_rate": 1e-06, "loss": 0.3743, "mean_token_accuracy": 0.8750567436218262, "num_tokens": 515844425.0, "step": 13525 }, { "epoch": 1.72064622821524, "ewc_loss": 0.03018558956682682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0185588911990635e-05, "grad_norm": 17.920503616333008, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8710727095603943, "num_tokens": 515882439.0, "step": 13526 }, { "epoch": 1.7207734384938305, "ewc_loss": 0.03009744919836521, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.009744978044182e-05, "grad_norm": 17.83698272705078, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8464978933334351, "num_tokens": 515922168.0, "step": 13527 }, { "epoch": 1.7209006487724208, "ewc_loss": 0.030144739896059036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0144739866955206e-05, "grad_norm": 17.868356704711914, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.8757035732269287, "num_tokens": 515961840.0, "step": 13528 }, { "epoch": 1.7210278590510113, "ewc_loss": 0.03016519919037819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.016519985976629e-05, "grad_norm": 17.93798065185547, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8697792887687683, "num_tokens": 515997586.0, "step": 13529 }, { "epoch": 1.7211550693296018, "ewc_loss": 0.030128872022032738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0128872822388075e-05, "grad_norm": 17.850313186645508, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.875144898891449, "num_tokens": 516036660.0, "step": 13530 }, { "epoch": 1.7212822796081924, "ewc_loss": 0.03007698804140091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0076987968641333e-05, "grad_norm": 17.917125701904297, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8618409037590027, "num_tokens": 516070846.0, "step": 13531 }, { "epoch": 1.7214094898867829, "ewc_loss": 0.030219754204154015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0219754989957437e-05, "grad_norm": 17.91526985168457, "learning_rate": 1e-06, "loss": 0.3522, "mean_token_accuracy": 0.8904749155044556, "num_tokens": 516107027.0, "step": 13532 }, { "epoch": 1.7215367001653734, "ewc_loss": 0.030127832666039467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0127832360449247e-05, "grad_norm": 17.878252029418945, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8704921007156372, "num_tokens": 516143971.0, "step": 13533 }, { "epoch": 1.7216639104439637, "ewc_loss": 0.03010348230600357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.010348154930398e-05, "grad_norm": 17.814414978027344, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8714540004730225, "num_tokens": 516180617.0, "step": 13534 }, { "epoch": 1.7217911207225542, "ewc_loss": 0.030122794210910797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0122793759801425e-05, "grad_norm": 17.8817138671875, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8698135614395142, "num_tokens": 516220371.0, "step": 13535 }, { "epoch": 1.7219183310011448, "ewc_loss": 0.03014058619737625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.014058711414691e-05, "grad_norm": 17.81568145751953, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.872462272644043, "num_tokens": 516258642.0, "step": 13536 }, { "epoch": 1.7220455412797353, "ewc_loss": 0.030138656497001648, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.013865716638975e-05, "grad_norm": 17.83776092529297, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8624303340911865, "num_tokens": 516298971.0, "step": 13537 }, { "epoch": 1.7221727515583258, "ewc_loss": 0.03022579662501812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0225795853766613e-05, "grad_norm": 17.807544708251953, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8809183835983276, "num_tokens": 516334457.0, "step": 13538 }, { "epoch": 1.7222999618369164, "ewc_loss": 0.03015250898897648, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.015250877069775e-05, "grad_norm": 17.839021682739258, "learning_rate": 1e-06, "loss": 0.3456, "mean_token_accuracy": 0.894349217414856, "num_tokens": 516374585.0, "step": 13539 }, { "epoch": 1.7224271721155069, "ewc_loss": 0.030206559225916862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0206560040824115e-05, "grad_norm": 17.86194610595703, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8552871942520142, "num_tokens": 516414180.0, "step": 13540 }, { "epoch": 1.7225543823940974, "ewc_loss": 0.030194755643606186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0194754799595103e-05, "grad_norm": 17.91313934326172, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8779329061508179, "num_tokens": 516450189.0, "step": 13541 }, { "epoch": 1.722681592672688, "ewc_loss": 0.030186446383595467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0186445655999705e-05, "grad_norm": 17.831693649291992, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8674285411834717, "num_tokens": 516487549.0, "step": 13542 }, { "epoch": 1.7228088029512785, "ewc_loss": 0.03011980652809143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0119806979200803e-05, "grad_norm": 17.82121467590332, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8563829064369202, "num_tokens": 516526649.0, "step": 13543 }, { "epoch": 1.722936013229869, "ewc_loss": 0.030196959152817726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.01969594147522e-05, "grad_norm": 17.83234405517578, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8741977214813232, "num_tokens": 516563829.0, "step": 13544 }, { "epoch": 1.7230632235084595, "ewc_loss": 0.030184512957930565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0184512070263736e-05, "grad_norm": 17.82069969177246, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.853508472442627, "num_tokens": 516605179.0, "step": 13545 }, { "epoch": 1.72319043378705, "ewc_loss": 0.030191268771886826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.019126961589791e-05, "grad_norm": 17.851472854614258, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8677172660827637, "num_tokens": 516641576.0, "step": 13546 }, { "epoch": 1.7233176440656406, "ewc_loss": 0.030190270394086838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0190270990715362e-05, "grad_norm": 17.794649124145508, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8662950396537781, "num_tokens": 516678687.0, "step": 13547 }, { "epoch": 1.7234448543442311, "ewc_loss": 0.030248157680034637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0248158509493805e-05, "grad_norm": 17.95719337463379, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8638405203819275, "num_tokens": 516711484.0, "step": 13548 }, { "epoch": 1.7235720646228216, "ewc_loss": 0.030222615227103233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.022261444129981e-05, "grad_norm": 17.86861228942871, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8651495575904846, "num_tokens": 516750598.0, "step": 13549 }, { "epoch": 1.7236992749014122, "ewc_loss": 0.03015373833477497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0153738407534547e-05, "grad_norm": 17.765018463134766, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8757815957069397, "num_tokens": 516793438.0, "step": 13550 }, { "epoch": 1.7238264851800027, "ewc_loss": 0.0302780382335186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0278039048425853e-05, "grad_norm": 17.92813491821289, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.8837422132492065, "num_tokens": 516828692.0, "step": 13551 }, { "epoch": 1.723953695458593, "ewc_loss": 0.030202168971300125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0202169000403956e-05, "grad_norm": 17.836257934570312, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8705599904060364, "num_tokens": 516867592.0, "step": 13552 }, { "epoch": 1.7240809057371835, "ewc_loss": 0.030179103836417198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0179104214766994e-05, "grad_norm": 17.895240783691406, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8572514057159424, "num_tokens": 516907668.0, "step": 13553 }, { "epoch": 1.724208116015774, "ewc_loss": 0.030241413041949272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0241413696785457e-05, "grad_norm": 17.805025100708008, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8667092323303223, "num_tokens": 516944628.0, "step": 13554 }, { "epoch": 1.7243353262943646, "ewc_loss": 0.030183687806129456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.018368806806393e-05, "grad_norm": 17.828458786010742, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8641543388366699, "num_tokens": 516989027.0, "step": 13555 }, { "epoch": 1.7244625365729551, "ewc_loss": 0.03025941550731659, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.025941623491235e-05, "grad_norm": 17.864654541015625, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8648217916488647, "num_tokens": 517024464.0, "step": 13556 }, { "epoch": 1.7245897468515456, "ewc_loss": 0.030214035883545876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.021403608727269e-05, "grad_norm": 17.86490821838379, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8732814788818359, "num_tokens": 517067714.0, "step": 13557 }, { "epoch": 1.724716957130136, "ewc_loss": 0.03019256517291069, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0192564736353233e-05, "grad_norm": 17.831504821777344, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8729395270347595, "num_tokens": 517107564.0, "step": 13558 }, { "epoch": 1.7248441674087265, "ewc_loss": 0.03020099177956581, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.020099211425986e-05, "grad_norm": 17.85782241821289, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8744831085205078, "num_tokens": 517148240.0, "step": 13559 }, { "epoch": 1.724971377687317, "ewc_loss": 0.03018968179821968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0189681638148613e-05, "grad_norm": 17.86899757385254, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8619989156723022, "num_tokens": 517187841.0, "step": 13560 }, { "epoch": 1.7250985879659075, "ewc_loss": 0.030162805691361427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0162806069711223e-05, "grad_norm": 17.87687873840332, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8541790246963501, "num_tokens": 517225732.0, "step": 13561 }, { "epoch": 1.725225798244498, "ewc_loss": 0.03015941008925438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0159410016494803e-05, "grad_norm": 17.934133529663086, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8520540595054626, "num_tokens": 517260423.0, "step": 13562 }, { "epoch": 1.7253530085230886, "ewc_loss": 0.030113790184259415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0113789762253873e-05, "grad_norm": 17.835058212280273, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8566077947616577, "num_tokens": 517300862.0, "step": 13563 }, { "epoch": 1.7254802188016791, "ewc_loss": 0.030131451785564423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0131452149362303e-05, "grad_norm": 17.872556686401367, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.87303626537323, "num_tokens": 517341010.0, "step": 13564 }, { "epoch": 1.7256074290802697, "ewc_loss": 0.030170166864991188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0170167519827373e-05, "grad_norm": 17.84440040588379, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8520780801773071, "num_tokens": 517376853.0, "step": 13565 }, { "epoch": 1.7257346393588602, "ewc_loss": 0.030147051438689232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0147051802487113e-05, "grad_norm": 17.847949981689453, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8631691932678223, "num_tokens": 517421854.0, "step": 13566 }, { "epoch": 1.7258618496374507, "ewc_loss": 0.03011874295771122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.011874287039973e-05, "grad_norm": 17.831926345825195, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.862957775592804, "num_tokens": 517461711.0, "step": 13567 }, { "epoch": 1.7259890599160412, "ewc_loss": 0.03013344667851925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0133447580737993e-05, "grad_norm": 17.838987350463867, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8754056096076965, "num_tokens": 517494755.0, "step": 13568 }, { "epoch": 1.7261162701946318, "ewc_loss": 0.030147334560751915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0147333745844662e-05, "grad_norm": 17.917158126831055, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8594566583633423, "num_tokens": 517539061.0, "step": 13569 }, { "epoch": 1.7262434804732223, "ewc_loss": 0.030226172879338264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0226172384573147e-05, "grad_norm": 17.9025936126709, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8643205761909485, "num_tokens": 517579768.0, "step": 13570 }, { "epoch": 1.7263706907518128, "ewc_loss": 0.030130630359053612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.01306299661519e-05, "grad_norm": 17.954376220703125, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.879636824131012, "num_tokens": 517612006.0, "step": 13571 }, { "epoch": 1.7264979010304033, "ewc_loss": 0.03017369657754898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0173696359270252e-05, "grad_norm": 17.911727905273438, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8681702017784119, "num_tokens": 517647547.0, "step": 13572 }, { "epoch": 1.7266251113089939, "ewc_loss": 0.030128199607133865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0128199796308763e-05, "grad_norm": 17.847742080688477, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8588342666625977, "num_tokens": 517689067.0, "step": 13573 }, { "epoch": 1.7267523215875844, "ewc_loss": 0.030132154002785683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0132154279272072e-05, "grad_norm": 17.88205337524414, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8745797276496887, "num_tokens": 517729967.0, "step": 13574 }, { "epoch": 1.726879531866175, "ewc_loss": 0.03019157610833645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0191577025107108e-05, "grad_norm": 17.90862274169922, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8680904507637024, "num_tokens": 517770427.0, "step": 13575 }, { "epoch": 1.7270067421447655, "ewc_loss": 0.030157949775457382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0157949368003756e-05, "grad_norm": 17.847434997558594, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8666412234306335, "num_tokens": 517804202.0, "step": 13576 }, { "epoch": 1.7271339524233558, "ewc_loss": 0.03013226017355919, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0132259780657478e-05, "grad_norm": 17.904335021972656, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8599033951759338, "num_tokens": 517834825.0, "step": 13577 }, { "epoch": 1.7272611627019463, "ewc_loss": 0.030187461525201797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0187460652086884e-05, "grad_norm": 17.943849563598633, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8607617616653442, "num_tokens": 517874600.0, "step": 13578 }, { "epoch": 1.7273883729805368, "ewc_loss": 0.030160386115312576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0160386813804507e-05, "grad_norm": 17.892553329467773, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8589915633201599, "num_tokens": 517913488.0, "step": 13579 }, { "epoch": 1.7275155832591274, "ewc_loss": 0.030195077881217003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0195078579708934e-05, "grad_norm": 17.899333953857422, "learning_rate": 1e-06, "loss": 0.3741, "mean_token_accuracy": 0.8821749687194824, "num_tokens": 517951094.0, "step": 13580 }, { "epoch": 1.7276427935377179, "ewc_loss": 0.030141517519950867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0141518436721526e-05, "grad_norm": 17.837976455688477, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.858246386051178, "num_tokens": 517993811.0, "step": 13581 }, { "epoch": 1.7277700038163084, "ewc_loss": 0.030179476365447044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.017947710759472e-05, "grad_norm": 17.893268585205078, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8713904023170471, "num_tokens": 518032860.0, "step": 13582 }, { "epoch": 1.7278972140948987, "ewc_loss": 0.030191369354724884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0191369660315104e-05, "grad_norm": 17.86919593811035, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8640775680541992, "num_tokens": 518070256.0, "step": 13583 }, { "epoch": 1.7280244243734892, "ewc_loss": 0.030130719766020775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0130719096632674e-05, "grad_norm": 17.921541213989258, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8748466968536377, "num_tokens": 518102828.0, "step": 13584 }, { "epoch": 1.7281516346520798, "ewc_loss": 0.03015591949224472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.01559193758294e-05, "grad_norm": 17.83544921875, "learning_rate": 1e-06, "loss": 0.3692, "mean_token_accuracy": 0.8819008469581604, "num_tokens": 518146859.0, "step": 13585 }, { "epoch": 1.7282788449306703, "ewc_loss": 0.030187532305717468, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0187531592673622e-05, "grad_norm": 17.992830276489258, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8756781816482544, "num_tokens": 518186173.0, "step": 13586 }, { "epoch": 1.7284060552092608, "ewc_loss": 0.030214909464120865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.021490920218639e-05, "grad_norm": 17.859819412231445, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8526118397712708, "num_tokens": 518221903.0, "step": 13587 }, { "epoch": 1.7285332654878514, "ewc_loss": 0.030137233436107635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0137232897686772e-05, "grad_norm": 17.91009521484375, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8594496846199036, "num_tokens": 518258048.0, "step": 13588 }, { "epoch": 1.7286604757664419, "ewc_loss": 0.03025149740278721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0251498174038716e-05, "grad_norm": 17.919893264770508, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8599655032157898, "num_tokens": 518299127.0, "step": 13589 }, { "epoch": 1.7287876860450324, "ewc_loss": 0.030125973746180534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0125973353278823e-05, "grad_norm": 17.860870361328125, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.879187285900116, "num_tokens": 518336520.0, "step": 13590 }, { "epoch": 1.728914896323623, "ewc_loss": 0.030200229957699776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0200229957699776e-05, "grad_norm": 17.967390060424805, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8649044036865234, "num_tokens": 518368484.0, "step": 13591 }, { "epoch": 1.7290421066022135, "ewc_loss": 0.030167069286108017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0167069780873135e-05, "grad_norm": 17.843656539916992, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8666613101959229, "num_tokens": 518408439.0, "step": 13592 }, { "epoch": 1.729169316880804, "ewc_loss": 0.030111847445368767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0111847081570886e-05, "grad_norm": 17.887514114379883, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8486380577087402, "num_tokens": 518447414.0, "step": 13593 }, { "epoch": 1.7292965271593945, "ewc_loss": 0.030236637219786644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0236637030611746e-05, "grad_norm": 17.949995040893555, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8645457625389099, "num_tokens": 518486455.0, "step": 13594 }, { "epoch": 1.729423737437985, "ewc_loss": 0.030138012021780014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.013801142515149e-05, "grad_norm": 17.829307556152344, "learning_rate": 1e-06, "loss": 0.3645, "mean_token_accuracy": 0.8851277828216553, "num_tokens": 518527023.0, "step": 13595 }, { "epoch": 1.7295509477165756, "ewc_loss": 0.03020087257027626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0200872060959227e-05, "grad_norm": 17.951377868652344, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8619006872177124, "num_tokens": 518563285.0, "step": 13596 }, { "epoch": 1.729678157995166, "ewc_loss": 0.030229339376091957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.022933924512472e-05, "grad_norm": 17.88448715209961, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8694074153900146, "num_tokens": 518607610.0, "step": 13597 }, { "epoch": 1.7298053682737566, "ewc_loss": 0.03011961840093136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0119617804302834e-05, "grad_norm": 17.890600204467773, "learning_rate": 1e-06, "loss": 0.3846, "mean_token_accuracy": 0.8770309686660767, "num_tokens": 518649434.0, "step": 13598 }, { "epoch": 1.7299325785523472, "ewc_loss": 0.030254455283284187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.025445585080888e-05, "grad_norm": 17.926197052001953, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8754938244819641, "num_tokens": 518684508.0, "step": 13599 }, { "epoch": 1.7300597888309377, "ewc_loss": 0.030115995556116104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0115996196400374e-05, "grad_norm": 17.90059471130371, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8746730089187622, "num_tokens": 518717883.0, "step": 13600 }, { "epoch": 1.730186999109528, "ewc_loss": 0.030134525150060654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0134524422464892e-05, "grad_norm": 17.896448135375977, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8750657439231873, "num_tokens": 518756158.0, "step": 13601 }, { "epoch": 1.7303142093881185, "ewc_loss": 0.030108334496617317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.010833461303264e-05, "grad_norm": 17.894283294677734, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8681690692901611, "num_tokens": 518793003.0, "step": 13602 }, { "epoch": 1.730441419666709, "ewc_loss": 0.030140364542603493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0140365197439678e-05, "grad_norm": 17.951122283935547, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8624331951141357, "num_tokens": 518838485.0, "step": 13603 }, { "epoch": 1.7305686299452996, "ewc_loss": 0.03014819324016571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.014819412783254e-05, "grad_norm": 17.808242797851562, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8627262115478516, "num_tokens": 518885936.0, "step": 13604 }, { "epoch": 1.7306958402238901, "ewc_loss": 0.030106071382761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0106071790214628e-05, "grad_norm": 17.85365104675293, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8560610413551331, "num_tokens": 518925684.0, "step": 13605 }, { "epoch": 1.7308230505024806, "ewc_loss": 0.03016510047018528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0165099815349095e-05, "grad_norm": 17.954931259155273, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8714162111282349, "num_tokens": 518966438.0, "step": 13606 }, { "epoch": 1.730950260781071, "ewc_loss": 0.03013804368674755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0138044166960754e-05, "grad_norm": 17.845735549926758, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.8710633516311646, "num_tokens": 519013576.0, "step": 13607 }, { "epoch": 1.7310774710596615, "ewc_loss": 0.030094709247350693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0094708563410677e-05, "grad_norm": 17.83066749572754, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8656913042068481, "num_tokens": 519045342.0, "step": 13608 }, { "epoch": 1.731204681338252, "ewc_loss": 0.030152034014463425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0152034014463425e-05, "grad_norm": 17.904987335205078, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8538522124290466, "num_tokens": 519088566.0, "step": 13609 }, { "epoch": 1.7313318916168425, "ewc_loss": 0.030171146616339684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.017114613612648e-05, "grad_norm": 17.906810760498047, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8721578121185303, "num_tokens": 519126516.0, "step": 13610 }, { "epoch": 1.731459101895433, "ewc_loss": 0.03012670949101448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.012671004398726e-05, "grad_norm": 17.819618225097656, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8771778345108032, "num_tokens": 519165431.0, "step": 13611 }, { "epoch": 1.7315863121740236, "ewc_loss": 0.030082644894719124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.008264502568636e-05, "grad_norm": 17.85692596435547, "learning_rate": 1e-06, "loss": 0.3724, "mean_token_accuracy": 0.8827574253082275, "num_tokens": 519200991.0, "step": 13612 }, { "epoch": 1.7317135224526141, "ewc_loss": 0.030183322727680206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0183322451193817e-05, "grad_norm": 17.844282150268555, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8747055530548096, "num_tokens": 519245988.0, "step": 13613 }, { "epoch": 1.7318407327312046, "ewc_loss": 0.030115678906440735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0115679692244157e-05, "grad_norm": 17.892597198486328, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8735371828079224, "num_tokens": 519282595.0, "step": 13614 }, { "epoch": 1.7319679430097952, "ewc_loss": 0.03015298955142498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0152988983900286e-05, "grad_norm": 17.830585479736328, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8628290891647339, "num_tokens": 519319061.0, "step": 13615 }, { "epoch": 1.7320951532883857, "ewc_loss": 0.03012140840291977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0121407689875923e-05, "grad_norm": 17.85045051574707, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8666856288909912, "num_tokens": 519358617.0, "step": 13616 }, { "epoch": 1.7322223635669762, "ewc_loss": 0.03026636131107807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.026636113645509e-05, "grad_norm": 17.86716651916504, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8674725294113159, "num_tokens": 519392461.0, "step": 13617 }, { "epoch": 1.7323495738455668, "ewc_loss": 0.03018266335129738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0182663977029733e-05, "grad_norm": 17.860315322875977, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8761141896247864, "num_tokens": 519432893.0, "step": 13618 }, { "epoch": 1.7324767841241573, "ewc_loss": 0.030181583017110825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0181583497324027e-05, "grad_norm": 17.881683349609375, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8685293197631836, "num_tokens": 519473445.0, "step": 13619 }, { "epoch": 1.7326039944027478, "ewc_loss": 0.030254147946834564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0254148441599682e-05, "grad_norm": 17.833683013916016, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8687061071395874, "num_tokens": 519518883.0, "step": 13620 }, { "epoch": 1.7327312046813383, "ewc_loss": 0.03016047365963459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0160474125295877e-05, "grad_norm": 17.880456924438477, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8732165098190308, "num_tokens": 519550123.0, "step": 13621 }, { "epoch": 1.7328584149599289, "ewc_loss": 0.030255824327468872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0255823730840348e-05, "grad_norm": 17.87334632873535, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8590519428253174, "num_tokens": 519592538.0, "step": 13622 }, { "epoch": 1.7329856252385194, "ewc_loss": 0.030210403725504875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0210403565433808e-05, "grad_norm": 17.838956832885742, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8689876794815063, "num_tokens": 519633466.0, "step": 13623 }, { "epoch": 1.73311283551711, "ewc_loss": 0.030182821676135063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0182822229107842e-05, "grad_norm": 17.87920570373535, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8651911020278931, "num_tokens": 519672569.0, "step": 13624 }, { "epoch": 1.7332400457957005, "ewc_loss": 0.030296821147203445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0296821933006868e-05, "grad_norm": 17.933448791503906, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8745322823524475, "num_tokens": 519707383.0, "step": 13625 }, { "epoch": 1.7333672560742908, "ewc_loss": 0.03021508827805519, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0215087463147938e-05, "grad_norm": 17.852079391479492, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8508978486061096, "num_tokens": 519748187.0, "step": 13626 }, { "epoch": 1.7334944663528813, "ewc_loss": 0.030189601704478264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0189601602614857e-05, "grad_norm": 17.93448257446289, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8652499318122864, "num_tokens": 519790215.0, "step": 13627 }, { "epoch": 1.7336216766314718, "ewc_loss": 0.03022809512913227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0228095056372695e-05, "grad_norm": 17.840560913085938, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8600488901138306, "num_tokens": 519827278.0, "step": 13628 }, { "epoch": 1.7337488869100623, "ewc_loss": 0.030159002169966698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.015900256286841e-05, "grad_norm": 17.881534576416016, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8745001554489136, "num_tokens": 519860216.0, "step": 13629 }, { "epoch": 1.7338760971886529, "ewc_loss": 0.03019823133945465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.019823088834528e-05, "grad_norm": 17.88498306274414, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8685212731361389, "num_tokens": 519894442.0, "step": 13630 }, { "epoch": 1.7340033074672434, "ewc_loss": 0.030186498537659645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0186498406692408e-05, "grad_norm": 17.840360641479492, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8676085472106934, "num_tokens": 519932524.0, "step": 13631 }, { "epoch": 1.7341305177458337, "ewc_loss": 0.03021986596286297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0219865948311053e-05, "grad_norm": 17.90428924560547, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8677763938903809, "num_tokens": 519974366.0, "step": 13632 }, { "epoch": 1.7342577280244242, "ewc_loss": 0.030221108347177505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0221108318073675e-05, "grad_norm": 17.811904907226562, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8780673146247864, "num_tokens": 520013983.0, "step": 13633 }, { "epoch": 1.7343849383030148, "ewc_loss": 0.030154336243867874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0154336855048314e-05, "grad_norm": 17.88351821899414, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8702583909034729, "num_tokens": 520047671.0, "step": 13634 }, { "epoch": 1.7345121485816053, "ewc_loss": 0.03022032231092453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0220322514651343e-05, "grad_norm": 17.846651077270508, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8647150993347168, "num_tokens": 520086574.0, "step": 13635 }, { "epoch": 1.7346393588601958, "ewc_loss": 0.030233794823288918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0233793950174004e-05, "grad_norm": 17.834810256958008, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8543409705162048, "num_tokens": 520130613.0, "step": 13636 }, { "epoch": 1.7347665691387864, "ewc_loss": 0.030245047062635422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.024504621862434e-05, "grad_norm": 17.84620475769043, "learning_rate": 1e-06, "loss": 0.3476, "mean_token_accuracy": 0.8870511054992676, "num_tokens": 520163763.0, "step": 13637 }, { "epoch": 1.7348937794173769, "ewc_loss": 0.030264895409345627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.026489503099583e-05, "grad_norm": 17.928255081176758, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8658396005630493, "num_tokens": 520198663.0, "step": 13638 }, { "epoch": 1.7350209896959674, "ewc_loss": 0.030182993039488792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0182993214111775e-05, "grad_norm": 17.852806091308594, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8707524538040161, "num_tokens": 520231737.0, "step": 13639 }, { "epoch": 1.735148199974558, "ewc_loss": 0.030233031138777733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0233031793613918e-05, "grad_norm": 17.92920684814453, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8734517693519592, "num_tokens": 520275664.0, "step": 13640 }, { "epoch": 1.7352754102531485, "ewc_loss": 0.030232403427362442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0232404242269695e-05, "grad_norm": 17.891162872314453, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8570530414581299, "num_tokens": 520316100.0, "step": 13641 }, { "epoch": 1.735402620531739, "ewc_loss": 0.030204739421606064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0204739232431166e-05, "grad_norm": 17.880998611450195, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8657518625259399, "num_tokens": 520348167.0, "step": 13642 }, { "epoch": 1.7355298308103295, "ewc_loss": 0.030261117964982986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.026111880899407e-05, "grad_norm": 17.934030532836914, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8543813228607178, "num_tokens": 520385234.0, "step": 13643 }, { "epoch": 1.73565704108892, "ewc_loss": 0.030174780637025833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0174780476954766e-05, "grad_norm": 17.80108642578125, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8518385887145996, "num_tokens": 520423051.0, "step": 13644 }, { "epoch": 1.7357842513675106, "ewc_loss": 0.030217265710234642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0217266612453386e-05, "grad_norm": 17.917797088623047, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8592638969421387, "num_tokens": 520467202.0, "step": 13645 }, { "epoch": 1.735911461646101, "ewc_loss": 0.03029884211719036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0298842830234207e-05, "grad_norm": 17.87660026550293, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8657280206680298, "num_tokens": 520501434.0, "step": 13646 }, { "epoch": 1.7360386719246916, "ewc_loss": 0.03024514950811863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.024514990102034e-05, "grad_norm": 17.946044921875, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8655003309249878, "num_tokens": 520542757.0, "step": 13647 }, { "epoch": 1.7361658822032822, "ewc_loss": 0.030216647312045097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.021664815605618e-05, "grad_norm": 17.889986038208008, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8583635687828064, "num_tokens": 520585216.0, "step": 13648 }, { "epoch": 1.7362930924818727, "ewc_loss": 0.03022075444459915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0220753615139984e-05, "grad_norm": 17.921751022338867, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8693907260894775, "num_tokens": 520620066.0, "step": 13649 }, { "epoch": 1.736420302760463, "ewc_loss": 0.030272798612713814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0272798539954238e-05, "grad_norm": 17.90605354309082, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8754788637161255, "num_tokens": 520660917.0, "step": 13650 }, { "epoch": 1.7365475130390535, "ewc_loss": 0.030235642567276955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0235642043408006e-05, "grad_norm": 17.94707679748535, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8608429431915283, "num_tokens": 520704255.0, "step": 13651 }, { "epoch": 1.736674723317644, "ewc_loss": 0.030203677713871002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0203676942619495e-05, "grad_norm": 17.889720916748047, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8558374643325806, "num_tokens": 520743036.0, "step": 13652 }, { "epoch": 1.7368019335962346, "ewc_loss": 0.03021007962524891, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0210079785319977e-05, "grad_norm": 18.025310516357422, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8614882230758667, "num_tokens": 520778325.0, "step": 13653 }, { "epoch": 1.736929143874825, "ewc_loss": 0.030199602246284485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0199602406355552e-05, "grad_norm": 17.806293487548828, "learning_rate": 1e-06, "loss": 0.3679, "mean_token_accuracy": 0.8803558349609375, "num_tokens": 520815226.0, "step": 13654 }, { "epoch": 1.7370563541534156, "ewc_loss": 0.030171964317560196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0171964681358077e-05, "grad_norm": 17.98676109313965, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8493819236755371, "num_tokens": 520856606.0, "step": 13655 }, { "epoch": 1.737183564432006, "ewc_loss": 0.030246203765273094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0246203095884994e-05, "grad_norm": 17.890884399414062, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8587684035301208, "num_tokens": 520893493.0, "step": 13656 }, { "epoch": 1.7373107747105965, "ewc_loss": 0.030130917206406593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.013091736647766e-05, "grad_norm": 17.916484832763672, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8769608736038208, "num_tokens": 520930391.0, "step": 13657 }, { "epoch": 1.737437984989187, "ewc_loss": 0.03026566468179226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.026566446351353e-05, "grad_norm": 17.941051483154297, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8678030967712402, "num_tokens": 520970512.0, "step": 13658 }, { "epoch": 1.7375651952677775, "ewc_loss": 0.030205050483345985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0205050279619172e-05, "grad_norm": 17.92792320251465, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8615093231201172, "num_tokens": 521011787.0, "step": 13659 }, { "epoch": 1.737692405546368, "ewc_loss": 0.03023010492324829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0230105039663613e-05, "grad_norm": 17.87557029724121, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8691321611404419, "num_tokens": 521053494.0, "step": 13660 }, { "epoch": 1.7378196158249586, "ewc_loss": 0.03018495999276638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0184959541657008e-05, "grad_norm": 17.983444213867188, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8570286631584167, "num_tokens": 521089291.0, "step": 13661 }, { "epoch": 1.7379468261035491, "ewc_loss": 0.030238337814807892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.023833778570406e-05, "grad_norm": 17.842391967773438, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8799495697021484, "num_tokens": 521128910.0, "step": 13662 }, { "epoch": 1.7380740363821396, "ewc_loss": 0.030147969722747803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0147970392135903e-05, "grad_norm": 17.87498664855957, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8652207851409912, "num_tokens": 521168175.0, "step": 13663 }, { "epoch": 1.7382012466607302, "ewc_loss": 0.030251028016209602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.02510288747726e-05, "grad_norm": 17.863264083862305, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8643102645874023, "num_tokens": 521208020.0, "step": 13664 }, { "epoch": 1.7383284569393207, "ewc_loss": 0.030157754197716713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0157754736137576e-05, "grad_norm": 17.903467178344727, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8799302577972412, "num_tokens": 521249828.0, "step": 13665 }, { "epoch": 1.7384556672179112, "ewc_loss": 0.030222298577427864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0222297937143594e-05, "grad_norm": 17.93309783935547, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.872821033000946, "num_tokens": 521289940.0, "step": 13666 }, { "epoch": 1.7385828774965018, "ewc_loss": 0.030128082260489464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0128081561997533e-05, "grad_norm": 17.886533737182617, "learning_rate": 1e-06, "loss": 0.3555, "mean_token_accuracy": 0.8853914737701416, "num_tokens": 521327426.0, "step": 13667 }, { "epoch": 1.7387100877750923, "ewc_loss": 0.030124465003609657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0124465411063284e-05, "grad_norm": 17.812458038330078, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8580669164657593, "num_tokens": 521372112.0, "step": 13668 }, { "epoch": 1.7388372980536828, "ewc_loss": 0.030133334919810295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0133334803394973e-05, "grad_norm": 17.880552291870117, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8663840293884277, "num_tokens": 521410842.0, "step": 13669 }, { "epoch": 1.7389645083322733, "ewc_loss": 0.03021181747317314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0211816920200363e-05, "grad_norm": 17.909969329833984, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8780840635299683, "num_tokens": 521446854.0, "step": 13670 }, { "epoch": 1.7390917186108639, "ewc_loss": 0.030215520411729813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0215520382625982e-05, "grad_norm": 17.96120834350586, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8578472137451172, "num_tokens": 521486284.0, "step": 13671 }, { "epoch": 1.7392189288894544, "ewc_loss": 0.030128251761198044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0128252547001466e-05, "grad_norm": 17.864973068237305, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8551011085510254, "num_tokens": 521522716.0, "step": 13672 }, { "epoch": 1.739346139168045, "ewc_loss": 0.030123628675937653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0123628675937653e-05, "grad_norm": 18.012224197387695, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.862060010433197, "num_tokens": 521551549.0, "step": 13673 }, { "epoch": 1.7394733494466355, "ewc_loss": 0.030203919857740402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.020392068719957e-05, "grad_norm": 17.81365203857422, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8712814450263977, "num_tokens": 521589552.0, "step": 13674 }, { "epoch": 1.7396005597252258, "ewc_loss": 0.03015095740556717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0150957172736526e-05, "grad_norm": 17.94761848449707, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8791449069976807, "num_tokens": 521627906.0, "step": 13675 }, { "epoch": 1.7397277700038163, "ewc_loss": 0.030299214646220207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.029921390407253e-05, "grad_norm": 17.948318481445312, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8516910076141357, "num_tokens": 521666615.0, "step": 13676 }, { "epoch": 1.7398549802824068, "ewc_loss": 0.030136587098240852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0136587156448513e-05, "grad_norm": 17.86849594116211, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8639941215515137, "num_tokens": 521707742.0, "step": 13677 }, { "epoch": 1.7399821905609973, "ewc_loss": 0.03017895482480526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0178955057635903e-05, "grad_norm": 17.889219284057617, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8591119050979614, "num_tokens": 521746848.0, "step": 13678 }, { "epoch": 1.7401094008395879, "ewc_loss": 0.030220426619052887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0220426197047345e-05, "grad_norm": 17.850770950317383, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8703405857086182, "num_tokens": 521782917.0, "step": 13679 }, { "epoch": 1.7402366111181784, "ewc_loss": 0.030186716467142105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0186716685420834e-05, "grad_norm": 17.836101531982422, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8654054999351501, "num_tokens": 521825609.0, "step": 13680 }, { "epoch": 1.7403638213967687, "ewc_loss": 0.030224008485674858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0224007787182927e-05, "grad_norm": 17.90777015686035, "learning_rate": 1e-06, "loss": 0.3665, "mean_token_accuracy": 0.8806203603744507, "num_tokens": 521862277.0, "step": 13681 }, { "epoch": 1.7404910316753592, "ewc_loss": 0.030226323753595352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.022632336069364e-05, "grad_norm": 17.834457397460938, "learning_rate": 1e-06, "loss": 0.3493, "mean_token_accuracy": 0.8864002227783203, "num_tokens": 521894951.0, "step": 13682 }, { "epoch": 1.7406182419539498, "ewc_loss": 0.030215857550501823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0215856895665638e-05, "grad_norm": 17.965999603271484, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8683793544769287, "num_tokens": 521932158.0, "step": 13683 }, { "epoch": 1.7407454522325403, "ewc_loss": 0.03034646064043045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.034646033484023e-05, "grad_norm": 17.984508514404297, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8745495080947876, "num_tokens": 521971050.0, "step": 13684 }, { "epoch": 1.7408726625111308, "ewc_loss": 0.03021954372525215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0219543987186626e-05, "grad_norm": 17.953296661376953, "learning_rate": 1e-06, "loss": 0.3761, "mean_token_accuracy": 0.8789341449737549, "num_tokens": 522007617.0, "step": 13685 }, { "epoch": 1.7409998727897213, "ewc_loss": 0.030221765860915184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0221764973248355e-05, "grad_norm": 17.855436325073242, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8566362857818604, "num_tokens": 522053484.0, "step": 13686 }, { "epoch": 1.7411270830683119, "ewc_loss": 0.030205685645341873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.020568510692101e-05, "grad_norm": 17.963424682617188, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8651053309440613, "num_tokens": 522088047.0, "step": 13687 }, { "epoch": 1.7412542933469024, "ewc_loss": 0.030262770131230354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.026277045137249e-05, "grad_norm": 17.908306121826172, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8663154244422913, "num_tokens": 522129865.0, "step": 13688 }, { "epoch": 1.741381503625493, "ewc_loss": 0.03015602007508278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0156019420246594e-05, "grad_norm": 17.90448570251465, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8639683723449707, "num_tokens": 522170154.0, "step": 13689 }, { "epoch": 1.7415087139040835, "ewc_loss": 0.030129792168736458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0129791412036866e-05, "grad_norm": 17.789691925048828, "learning_rate": 1e-06, "loss": 0.3429, "mean_token_accuracy": 0.891343355178833, "num_tokens": 522204586.0, "step": 13690 }, { "epoch": 1.741635924182674, "ewc_loss": 0.030172118917107582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0172119295457378e-05, "grad_norm": 17.94362449645996, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8734453916549683, "num_tokens": 522241942.0, "step": 13691 }, { "epoch": 1.7417631344612645, "ewc_loss": 0.030249109491705894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.024910984095186e-05, "grad_norm": 17.880661010742188, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8645526170730591, "num_tokens": 522275800.0, "step": 13692 }, { "epoch": 1.741890344739855, "ewc_loss": 0.030189603567123413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.018960342160426e-05, "grad_norm": 17.873144149780273, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8618223071098328, "num_tokens": 522317024.0, "step": 13693 }, { "epoch": 1.7420175550184456, "ewc_loss": 0.0302385576069355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.023855788342189e-05, "grad_norm": 17.913368225097656, "learning_rate": 1e-06, "loss": 0.3873, "mean_token_accuracy": 0.8780206441879272, "num_tokens": 522358510.0, "step": 13694 }, { "epoch": 1.742144765297036, "ewc_loss": 0.030204733833670616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0204733775462955e-05, "grad_norm": 17.863510131835938, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.86296147108078, "num_tokens": 522395960.0, "step": 13695 }, { "epoch": 1.7422719755756266, "ewc_loss": 0.03022877313196659, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0228773539420217e-05, "grad_norm": 18.024356842041016, "learning_rate": 1e-06, "loss": 0.3694, "mean_token_accuracy": 0.876590371131897, "num_tokens": 522428807.0, "step": 13696 }, { "epoch": 1.7423991858542172, "ewc_loss": 0.030292721465229988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0292721930891275e-05, "grad_norm": 17.938135147094727, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8696362972259521, "num_tokens": 522464472.0, "step": 13697 }, { "epoch": 1.7425263961328077, "ewc_loss": 0.030177291482686996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.017729068233166e-05, "grad_norm": 17.965669631958008, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.881906270980835, "num_tokens": 522509522.0, "step": 13698 }, { "epoch": 1.742653606411398, "ewc_loss": 0.030238507315516472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.023850695171859e-05, "grad_norm": 18.026090621948242, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8530001044273376, "num_tokens": 522545581.0, "step": 13699 }, { "epoch": 1.7427808166899885, "ewc_loss": 0.030230382457375526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0230383345042355e-05, "grad_norm": 18.007869720458984, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8725700378417969, "num_tokens": 522582072.0, "step": 13700 }, { "epoch": 1.742908026968579, "ewc_loss": 0.030145693570375443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0145693017402664e-05, "grad_norm": 18.012630462646484, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8712334036827087, "num_tokens": 522618183.0, "step": 13701 }, { "epoch": 1.7430352372471696, "ewc_loss": 0.03014204651117325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0142045943648554e-05, "grad_norm": 17.86180305480957, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8569155931472778, "num_tokens": 522660578.0, "step": 13702 }, { "epoch": 1.74316244752576, "ewc_loss": 0.030113790184259415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0113789762253873e-05, "grad_norm": 18.05933952331543, "learning_rate": 1e-06, "loss": 0.3795, "mean_token_accuracy": 0.8797329664230347, "num_tokens": 522701934.0, "step": 13703 }, { "epoch": 1.7432896578043506, "ewc_loss": 0.030209602788090706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0209603210096247e-05, "grad_norm": 17.913101196289062, "learning_rate": 1e-06, "loss": 0.3571, "mean_token_accuracy": 0.8863328099250793, "num_tokens": 522737404.0, "step": 13704 }, { "epoch": 1.743416868082941, "ewc_loss": 0.030050266534090042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0050267014303245e-05, "grad_norm": 17.958484649658203, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8652975559234619, "num_tokens": 522774188.0, "step": 13705 }, { "epoch": 1.7435440783615315, "ewc_loss": 0.030155761167407036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.015576112375129e-05, "grad_norm": 17.904521942138672, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8763816356658936, "num_tokens": 522810467.0, "step": 13706 }, { "epoch": 1.743671288640122, "ewc_loss": 0.030077975243330002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.007797567988746e-05, "grad_norm": 17.949542999267578, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8579715490341187, "num_tokens": 522845246.0, "step": 13707 }, { "epoch": 1.7437984989187125, "ewc_loss": 0.03018105961382389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0181059628375806e-05, "grad_norm": 17.988435745239258, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8777941465377808, "num_tokens": 522877150.0, "step": 13708 }, { "epoch": 1.743925709197303, "ewc_loss": 0.0301302932202816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0130293453112245e-05, "grad_norm": 17.91785430908203, "learning_rate": 1e-06, "loss": 0.3624, "mean_token_accuracy": 0.8830938339233398, "num_tokens": 522913161.0, "step": 13709 }, { "epoch": 1.7440529194758936, "ewc_loss": 0.030133984982967377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.013398418261204e-05, "grad_norm": 17.942655563354492, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8549441695213318, "num_tokens": 522953464.0, "step": 13710 }, { "epoch": 1.744180129754484, "ewc_loss": 0.03014424629509449, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0144246920826845e-05, "grad_norm": 17.87586784362793, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8726754188537598, "num_tokens": 522996874.0, "step": 13711 }, { "epoch": 1.7443073400330746, "ewc_loss": 0.030121589079499245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0121589588816278e-05, "grad_norm": 17.925722122192383, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8720626831054688, "num_tokens": 523031349.0, "step": 13712 }, { "epoch": 1.7444345503116652, "ewc_loss": 0.030201567336916924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0201566914911382e-05, "grad_norm": 17.90100860595703, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8614370822906494, "num_tokens": 523068910.0, "step": 13713 }, { "epoch": 1.7445617605902557, "ewc_loss": 0.030127665027976036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.012766501342412e-05, "grad_norm": 17.980140686035156, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.8817316293716431, "num_tokens": 523106179.0, "step": 13714 }, { "epoch": 1.7446889708688462, "ewc_loss": 0.0302719809114933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0271979994722642e-05, "grad_norm": 18.005430221557617, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8702391386032104, "num_tokens": 523140827.0, "step": 13715 }, { "epoch": 1.7448161811474368, "ewc_loss": 0.030163636431097984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0163637347868644e-05, "grad_norm": 17.971885681152344, "learning_rate": 1e-06, "loss": 0.3628, "mean_token_accuracy": 0.8831748366355896, "num_tokens": 523182880.0, "step": 13716 }, { "epoch": 1.7449433914260273, "ewc_loss": 0.030157022178173065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0157021683407947e-05, "grad_norm": 17.930927276611328, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8718613386154175, "num_tokens": 523213059.0, "step": 13717 }, { "epoch": 1.7450706017046178, "ewc_loss": 0.030205633491277695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.020563417521771e-05, "grad_norm": 17.961669921875, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8675175905227661, "num_tokens": 523254398.0, "step": 13718 }, { "epoch": 1.7451978119832083, "ewc_loss": 0.0301784910261631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0178491215338e-05, "grad_norm": 17.904680252075195, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.8818537592887878, "num_tokens": 523293325.0, "step": 13719 }, { "epoch": 1.7453250222617989, "ewc_loss": 0.03020353615283966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0203536880435422e-05, "grad_norm": 17.95005226135254, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8587908744812012, "num_tokens": 523334402.0, "step": 13720 }, { "epoch": 1.7454522325403894, "ewc_loss": 0.030246438458561897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.024643774551805e-05, "grad_norm": 17.908533096313477, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8725506067276001, "num_tokens": 523376238.0, "step": 13721 }, { "epoch": 1.74557944281898, "ewc_loss": 0.030129052698612213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0129052902339026e-05, "grad_norm": 17.909664154052734, "learning_rate": 1e-06, "loss": 0.3412, "mean_token_accuracy": 0.8908399343490601, "num_tokens": 523417074.0, "step": 13722 }, { "epoch": 1.7457066530975704, "ewc_loss": 0.030261624604463577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0261624488048255e-05, "grad_norm": 17.953662872314453, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8759326338768005, "num_tokens": 523457984.0, "step": 13723 }, { "epoch": 1.7458338633761608, "ewc_loss": 0.03018330968916416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0183309718267992e-05, "grad_norm": 17.948030471801758, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.863097608089447, "num_tokens": 523490071.0, "step": 13724 }, { "epoch": 1.7459610736547513, "ewc_loss": 0.030197998508810997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0197998057701625e-05, "grad_norm": 17.964292526245117, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.863075852394104, "num_tokens": 523530105.0, "step": 13725 }, { "epoch": 1.7460882839333418, "ewc_loss": 0.030183564871549606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.018356437678449e-05, "grad_norm": 17.9201717376709, "learning_rate": 1e-06, "loss": 0.3638, "mean_token_accuracy": 0.8833321928977966, "num_tokens": 523576698.0, "step": 13726 }, { "epoch": 1.7462154942119323, "ewc_loss": 0.030206387862563133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.020638723683078e-05, "grad_norm": 17.962587356567383, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8699628114700317, "num_tokens": 523614387.0, "step": 13727 }, { "epoch": 1.7463427044905229, "ewc_loss": 0.03017154335975647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0171542675816454e-05, "grad_norm": 17.871726989746094, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.864962100982666, "num_tokens": 523651149.0, "step": 13728 }, { "epoch": 1.7464699147691134, "ewc_loss": 0.030198203399777412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0198203603504226e-05, "grad_norm": 17.92262077331543, "learning_rate": 1e-06, "loss": 0.3664, "mean_token_accuracy": 0.8815587759017944, "num_tokens": 523688448.0, "step": 13729 }, { "epoch": 1.7465971250477037, "ewc_loss": 0.030200300738215446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0200300898286514e-05, "grad_norm": 17.882034301757812, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8730270862579346, "num_tokens": 523725805.0, "step": 13730 }, { "epoch": 1.7467243353262942, "ewc_loss": 0.03019139915704727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.019139876414556e-05, "grad_norm": 17.894989013671875, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8705361485481262, "num_tokens": 523763564.0, "step": 13731 }, { "epoch": 1.7468515456048848, "ewc_loss": 0.030253663659095764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0253662771428935e-05, "grad_norm": 17.96820068359375, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8643006086349487, "num_tokens": 523804422.0, "step": 13732 }, { "epoch": 1.7469787558834753, "ewc_loss": 0.03018273040652275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0182731279637665e-05, "grad_norm": 17.916770935058594, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8785145878791809, "num_tokens": 523845889.0, "step": 13733 }, { "epoch": 1.7471059661620658, "ewc_loss": 0.03020094521343708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.020094482053537e-05, "grad_norm": 17.9818058013916, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8795357942581177, "num_tokens": 523883623.0, "step": 13734 }, { "epoch": 1.7472331764406563, "ewc_loss": 0.030221708118915558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0221708584576845e-05, "grad_norm": 17.9158935546875, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8763340711593628, "num_tokens": 523917273.0, "step": 13735 }, { "epoch": 1.7473603867192469, "ewc_loss": 0.030137179419398308, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.013718014699407e-05, "grad_norm": 17.88092041015625, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8666282892227173, "num_tokens": 523956149.0, "step": 13736 }, { "epoch": 1.7474875969978374, "ewc_loss": 0.030197735875844955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0197736123227514e-05, "grad_norm": 17.932342529296875, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8673450350761414, "num_tokens": 523997229.0, "step": 13737 }, { "epoch": 1.747614807276428, "ewc_loss": 0.030209777876734734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0209777833078988e-05, "grad_norm": 17.876728057861328, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8731505870819092, "num_tokens": 524032921.0, "step": 13738 }, { "epoch": 1.7477420175550185, "ewc_loss": 0.0301626268774271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0162625989760272e-05, "grad_norm": 17.875280380249023, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8716942667961121, "num_tokens": 524065016.0, "step": 13739 }, { "epoch": 1.747869227833609, "ewc_loss": 0.030188599601387978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0188599339453503e-05, "grad_norm": 17.900299072265625, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8580737113952637, "num_tokens": 524107001.0, "step": 13740 }, { "epoch": 1.7479964381121995, "ewc_loss": 0.030205393210053444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0205394068616442e-05, "grad_norm": 17.883838653564453, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8631629943847656, "num_tokens": 524149283.0, "step": 13741 }, { "epoch": 1.74812364839079, "ewc_loss": 0.030206892639398575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0206892915884964e-05, "grad_norm": 17.95517349243164, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8693972826004028, "num_tokens": 524187240.0, "step": 13742 }, { "epoch": 1.7482508586693806, "ewc_loss": 0.030256761237978935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0256760510383174e-05, "grad_norm": 17.97554588317871, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8785350322723389, "num_tokens": 524216793.0, "step": 13743 }, { "epoch": 1.748378068947971, "ewc_loss": 0.03016798198223114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.016798109456431e-05, "grad_norm": 17.900222778320312, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8758453726768494, "num_tokens": 524257629.0, "step": 13744 }, { "epoch": 1.7485052792265616, "ewc_loss": 0.030219590291380882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0219589461921714e-05, "grad_norm": 17.98598861694336, "learning_rate": 1e-06, "loss": 0.3676, "mean_token_accuracy": 0.8798952698707581, "num_tokens": 524291090.0, "step": 13745 }, { "epoch": 1.7486324895051522, "ewc_loss": 0.030239492654800415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0239492843975313e-05, "grad_norm": 17.96216583251953, "learning_rate": 1e-06, "loss": 0.3589, "mean_token_accuracy": 0.8820706605911255, "num_tokens": 524324882.0, "step": 13746 }, { "epoch": 1.7487596997837427, "ewc_loss": 0.030199171975255013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0199171305866912e-05, "grad_norm": 17.952178955078125, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8540636301040649, "num_tokens": 524359422.0, "step": 13747 }, { "epoch": 1.748886910062333, "ewc_loss": 0.03022262640297413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0222627174225636e-05, "grad_norm": 17.940732955932617, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8754342794418335, "num_tokens": 524398383.0, "step": 13748 }, { "epoch": 1.7490141203409235, "ewc_loss": 0.030205246061086655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0205246730474755e-05, "grad_norm": 17.986875534057617, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8655702471733093, "num_tokens": 524437984.0, "step": 13749 }, { "epoch": 1.749141330619514, "ewc_loss": 0.030260175466537476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.026017475349363e-05, "grad_norm": 17.949951171875, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.8756595849990845, "num_tokens": 524471324.0, "step": 13750 }, { "epoch": 1.7492685408981046, "ewc_loss": 0.030175771564245224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0175771826179698e-05, "grad_norm": 17.983028411865234, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8608474731445312, "num_tokens": 524511471.0, "step": 13751 }, { "epoch": 1.749395751176695, "ewc_loss": 0.030221592634916306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0221592169255018e-05, "grad_norm": 17.87417984008789, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.87279212474823, "num_tokens": 524541743.0, "step": 13752 }, { "epoch": 1.7495229614552856, "ewc_loss": 0.030182765796780586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0182765840436332e-05, "grad_norm": 17.971235275268555, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8692907094955444, "num_tokens": 524581470.0, "step": 13753 }, { "epoch": 1.749650171733876, "ewc_loss": 0.030266134068369865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0266133762779646e-05, "grad_norm": 17.976932525634766, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8725773096084595, "num_tokens": 524612722.0, "step": 13754 }, { "epoch": 1.7497773820124665, "ewc_loss": 0.030199622735381126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.019962241523899e-05, "grad_norm": 17.928043365478516, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8784835934638977, "num_tokens": 524647640.0, "step": 13755 }, { "epoch": 1.749904592291057, "ewc_loss": 0.030237790197134018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0237790269893594e-05, "grad_norm": 17.991323471069336, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8631187677383423, "num_tokens": 524682366.0, "step": 13756 }, { "epoch": 1.7500318025696475, "ewc_loss": 0.030224865302443504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0224864531191997e-05, "grad_norm": 17.946014404296875, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8696487545967102, "num_tokens": 524724242.0, "step": 13757 }, { "epoch": 1.750159012848238, "ewc_loss": 0.03021516464650631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0215163860702887e-05, "grad_norm": 17.960681915283203, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8565998673439026, "num_tokens": 524757373.0, "step": 13758 }, { "epoch": 1.7502862231268286, "ewc_loss": 0.0302733201533556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0273320589913055e-05, "grad_norm": 17.9250431060791, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8592259883880615, "num_tokens": 524797704.0, "step": 13759 }, { "epoch": 1.750413433405419, "ewc_loss": 0.03025249019265175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0252489523263648e-05, "grad_norm": 17.93256950378418, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8715817928314209, "num_tokens": 524836710.0, "step": 13760 }, { "epoch": 1.7505406436840096, "ewc_loss": 0.03031396120786667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0313960451167077e-05, "grad_norm": 17.93848991394043, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8676247000694275, "num_tokens": 524870224.0, "step": 13761 }, { "epoch": 1.7506678539626002, "ewc_loss": 0.030287932604551315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.028793253179174e-05, "grad_norm": 17.995967864990234, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8716709613800049, "num_tokens": 524905345.0, "step": 13762 }, { "epoch": 1.7507950642411907, "ewc_loss": 0.03030698373913765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0306982807815075e-05, "grad_norm": 17.904069900512695, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8539413809776306, "num_tokens": 524943893.0, "step": 13763 }, { "epoch": 1.7509222745197812, "ewc_loss": 0.03030768223106861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.030768311873544e-05, "grad_norm": 17.973209381103516, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.8688206076622009, "num_tokens": 524982924.0, "step": 13764 }, { "epoch": 1.7510494847983717, "ewc_loss": 0.030390698462724686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0390698157134466e-05, "grad_norm": 17.942445755004883, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8684216737747192, "num_tokens": 525016749.0, "step": 13765 }, { "epoch": 1.7511766950769623, "ewc_loss": 0.030280010774731636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0280010832939297e-05, "grad_norm": 17.957487106323242, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8711522817611694, "num_tokens": 525049178.0, "step": 13766 }, { "epoch": 1.7513039053555528, "ewc_loss": 0.030300280079245567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.030027983186301e-05, "grad_norm": 17.85900115966797, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8571453094482422, "num_tokens": 525083456.0, "step": 13767 }, { "epoch": 1.7514311156341433, "ewc_loss": 0.030315706506371498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.031570668099448e-05, "grad_norm": 17.988893508911133, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8713364601135254, "num_tokens": 525120159.0, "step": 13768 }, { "epoch": 1.7515583259127339, "ewc_loss": 0.030403289943933487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.040328920178581e-05, "grad_norm": 17.85284423828125, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8749862313270569, "num_tokens": 525159174.0, "step": 13769 }, { "epoch": 1.7516855361913244, "ewc_loss": 0.030288461595773697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.028846185770817e-05, "grad_norm": 17.918445587158203, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8657985329627991, "num_tokens": 525198373.0, "step": 13770 }, { "epoch": 1.751812746469915, "ewc_loss": 0.030382564291357994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.038256363652181e-05, "grad_norm": 17.864551544189453, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8756972551345825, "num_tokens": 525240618.0, "step": 13771 }, { "epoch": 1.7519399567485054, "ewc_loss": 0.030361980199813843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0361979952431284e-05, "grad_norm": 17.94358253479004, "learning_rate": 1e-06, "loss": 0.3597, "mean_token_accuracy": 0.8837338089942932, "num_tokens": 525274899.0, "step": 13772 }, { "epoch": 1.7520671670270958, "ewc_loss": 0.030355066061019897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0355065973708406e-05, "grad_norm": 17.872751235961914, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8497436046600342, "num_tokens": 525310144.0, "step": 13773 }, { "epoch": 1.7521943773056863, "ewc_loss": 0.030376696959137917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0376697395695373e-05, "grad_norm": 17.957611083984375, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.879657506942749, "num_tokens": 525348565.0, "step": 13774 }, { "epoch": 1.7523215875842768, "ewc_loss": 0.030419953167438507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0419952963711694e-05, "grad_norm": 17.952075958251953, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8750773668289185, "num_tokens": 525383709.0, "step": 13775 }, { "epoch": 1.7524487978628673, "ewc_loss": 0.030346747487783432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.034674773516599e-05, "grad_norm": 17.886184692382812, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8636274337768555, "num_tokens": 525423983.0, "step": 13776 }, { "epoch": 1.7525760081414579, "ewc_loss": 0.030420752242207527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.042075150005985e-05, "grad_norm": 17.903295516967773, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8766967058181763, "num_tokens": 525457357.0, "step": 13777 }, { "epoch": 1.7527032184200484, "ewc_loss": 0.03039390593767166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.039390685444232e-05, "grad_norm": 17.931568145751953, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8686513304710388, "num_tokens": 525497021.0, "step": 13778 }, { "epoch": 1.7528304286986387, "ewc_loss": 0.030428996309638023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0428996979026124e-05, "grad_norm": 17.925111770629883, "learning_rate": 1e-06, "loss": 0.3581, "mean_token_accuracy": 0.8856262564659119, "num_tokens": 525533592.0, "step": 13779 }, { "epoch": 1.7529576389772292, "ewc_loss": 0.03035716898739338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0357168725458905e-05, "grad_norm": 17.9261417388916, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8611974120140076, "num_tokens": 525572747.0, "step": 13780 }, { "epoch": 1.7530848492558198, "ewc_loss": 0.030432946979999542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0432947824010625e-05, "grad_norm": 17.917160034179688, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8755815625190735, "num_tokens": 525613578.0, "step": 13781 }, { "epoch": 1.7532120595344103, "ewc_loss": 0.030382052063941956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0382052500499412e-05, "grad_norm": 17.967182159423828, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8760275840759277, "num_tokens": 525655047.0, "step": 13782 }, { "epoch": 1.7533392698130008, "ewc_loss": 0.030382556840777397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0382556360564195e-05, "grad_norm": 17.9248046875, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8667649030685425, "num_tokens": 525689561.0, "step": 13783 }, { "epoch": 1.7534664800915913, "ewc_loss": 0.030317258089780807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0317258278955705e-05, "grad_norm": 17.949539184570312, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8658497333526611, "num_tokens": 525726946.0, "step": 13784 }, { "epoch": 1.7535936903701819, "ewc_loss": 0.030369833111763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.036983252968639e-05, "grad_norm": 17.876893997192383, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8629140853881836, "num_tokens": 525762089.0, "step": 13785 }, { "epoch": 1.7537209006487724, "ewc_loss": 0.030318938195705414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0318939025164582e-05, "grad_norm": 17.986431121826172, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8756042718887329, "num_tokens": 525796081.0, "step": 13786 }, { "epoch": 1.753848110927363, "ewc_loss": 0.03041813336312771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0418133974308148e-05, "grad_norm": 17.92049217224121, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8591606020927429, "num_tokens": 525829945.0, "step": 13787 }, { "epoch": 1.7539753212059535, "ewc_loss": 0.030349375680088997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0349376174854115e-05, "grad_norm": 17.88884162902832, "learning_rate": 1e-06, "loss": 0.38, "mean_token_accuracy": 0.877517580986023, "num_tokens": 525871289.0, "step": 13788 }, { "epoch": 1.754102531484544, "ewc_loss": 0.03038211353123188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0382114346139133e-05, "grad_norm": 17.90021514892578, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8705565929412842, "num_tokens": 525908455.0, "step": 13789 }, { "epoch": 1.7542297417631345, "ewc_loss": 0.030357226729393005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.035722693311982e-05, "grad_norm": 17.997581481933594, "learning_rate": 1e-06, "loss": 0.3392, "mean_token_accuracy": 0.8905397653579712, "num_tokens": 525943734.0, "step": 13790 }, { "epoch": 1.754356952041725, "ewc_loss": 0.030384600162506104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.038460090465378e-05, "grad_norm": 17.937538146972656, "learning_rate": 1e-06, "loss": 0.3379, "mean_token_accuracy": 0.8921701908111572, "num_tokens": 525988070.0, "step": 13791 }, { "epoch": 1.7544841623203156, "ewc_loss": 0.03030933067202568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0309331123135053e-05, "grad_norm": 17.927209854125977, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.884844183921814, "num_tokens": 526027940.0, "step": 13792 }, { "epoch": 1.754611372598906, "ewc_loss": 0.030316514894366264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0316514312289655e-05, "grad_norm": 18.048110961914062, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.879529595375061, "num_tokens": 526061357.0, "step": 13793 }, { "epoch": 1.7547385828774966, "ewc_loss": 0.0302807055413723, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.028070568689145e-05, "grad_norm": 17.816905975341797, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8729069232940674, "num_tokens": 526102157.0, "step": 13794 }, { "epoch": 1.7548657931560872, "ewc_loss": 0.03027147613465786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.027147613465786e-05, "grad_norm": 18.071117401123047, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.861980676651001, "num_tokens": 526137241.0, "step": 13795 }, { "epoch": 1.7549930034346777, "ewc_loss": 0.030306892469525337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0306891858344898e-05, "grad_norm": 17.82632827758789, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8619608879089355, "num_tokens": 526169079.0, "step": 13796 }, { "epoch": 1.755120213713268, "ewc_loss": 0.030311690643429756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.031169035239145e-05, "grad_norm": 18.113388061523438, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8745505213737488, "num_tokens": 526212212.0, "step": 13797 }, { "epoch": 1.7552474239918585, "ewc_loss": 0.030303793027997017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0303792300401255e-05, "grad_norm": 17.857511520385742, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.877890408039093, "num_tokens": 526250380.0, "step": 13798 }, { "epoch": 1.755374634270449, "ewc_loss": 0.030182935297489166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.018293500645086e-05, "grad_norm": 17.98514175415039, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8771870136260986, "num_tokens": 526292525.0, "step": 13799 }, { "epoch": 1.7555018445490396, "ewc_loss": 0.030317630618810654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0317631171783432e-05, "grad_norm": 17.9843692779541, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8711086511611938, "num_tokens": 526330427.0, "step": 13800 }, { "epoch": 1.75562905482763, "ewc_loss": 0.03015543706715107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.015543734363746e-05, "grad_norm": 17.914230346679688, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8716188669204712, "num_tokens": 526362954.0, "step": 13801 }, { "epoch": 1.7557562651062206, "ewc_loss": 0.030227525159716606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0227525712689385e-05, "grad_norm": 18.036344528198242, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8652777075767517, "num_tokens": 526394310.0, "step": 13802 }, { "epoch": 1.755883475384811, "ewc_loss": 0.03022233583033085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.022233613592107e-05, "grad_norm": 17.90508270263672, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8509511947631836, "num_tokens": 526438507.0, "step": 13803 }, { "epoch": 1.7560106856634015, "ewc_loss": 0.030204812064766884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0204811992007308e-05, "grad_norm": 17.980079650878906, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8667421936988831, "num_tokens": 526475124.0, "step": 13804 }, { "epoch": 1.756137895941992, "ewc_loss": 0.0302879735827446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.028797436854802e-05, "grad_norm": 17.904417037963867, "learning_rate": 1e-06, "loss": 0.3795, "mean_token_accuracy": 0.8784471154212952, "num_tokens": 526514936.0, "step": 13805 }, { "epoch": 1.7562651062205825, "ewc_loss": 0.03018467128276825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0184672141331248e-05, "grad_norm": 17.842824935913086, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8648473024368286, "num_tokens": 526548649.0, "step": 13806 }, { "epoch": 1.756392316499173, "ewc_loss": 0.0302249975502491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0224997317418456e-05, "grad_norm": 17.96331214904785, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8544301986694336, "num_tokens": 526589028.0, "step": 13807 }, { "epoch": 1.7565195267777636, "ewc_loss": 0.030302923172712326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.030292282346636e-05, "grad_norm": 17.963134765625, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.870288610458374, "num_tokens": 526626327.0, "step": 13808 }, { "epoch": 1.756646737056354, "ewc_loss": 0.03024703823029995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.024703801202122e-05, "grad_norm": 17.87986946105957, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8667222261428833, "num_tokens": 526672264.0, "step": 13809 }, { "epoch": 1.7567739473349446, "ewc_loss": 0.03033396787941456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0333967515616678e-05, "grad_norm": 17.995922088623047, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8638081550598145, "num_tokens": 526709979.0, "step": 13810 }, { "epoch": 1.7569011576135352, "ewc_loss": 0.0302635096013546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.026350896107033e-05, "grad_norm": 17.914522171020508, "learning_rate": 1e-06, "loss": 0.3607, "mean_token_accuracy": 0.8818240165710449, "num_tokens": 526745002.0, "step": 13811 }, { "epoch": 1.7570283678921257, "ewc_loss": 0.03029307909309864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.029307845281437e-05, "grad_norm": 17.878612518310547, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.8853379487991333, "num_tokens": 526785684.0, "step": 13812 }, { "epoch": 1.7571555781707162, "ewc_loss": 0.030292466282844543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0292465453385375e-05, "grad_norm": 17.9465389251709, "learning_rate": 1e-06, "loss": 0.3476, "mean_token_accuracy": 0.8874557018280029, "num_tokens": 526820246.0, "step": 13813 }, { "epoch": 1.7572827884493067, "ewc_loss": 0.030331319198012352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0331319067045115e-05, "grad_norm": 17.90119171142578, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8747868537902832, "num_tokens": 526854815.0, "step": 13814 }, { "epoch": 1.7574099987278973, "ewc_loss": 0.03025062009692192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0250619602156803e-05, "grad_norm": 17.86295509338379, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.8768883943557739, "num_tokens": 526896336.0, "step": 13815 }, { "epoch": 1.7575372090064878, "ewc_loss": 0.03033996932208538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.033997018064838e-05, "grad_norm": 17.897947311401367, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8814048767089844, "num_tokens": 526939586.0, "step": 13816 }, { "epoch": 1.7576644192850783, "ewc_loss": 0.03031441755592823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0314417017507367e-05, "grad_norm": 17.928388595581055, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.875480055809021, "num_tokens": 526975027.0, "step": 13817 }, { "epoch": 1.7577916295636689, "ewc_loss": 0.030305976048111916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0305976906674914e-05, "grad_norm": 17.922435760498047, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8721717000007629, "num_tokens": 527017820.0, "step": 13818 }, { "epoch": 1.7579188398422594, "ewc_loss": 0.030290920287370682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0290921131381765e-05, "grad_norm": 17.94829750061035, "learning_rate": 1e-06, "loss": 0.378, "mean_token_accuracy": 0.8812992572784424, "num_tokens": 527051647.0, "step": 13819 }, { "epoch": 1.75804605012085, "ewc_loss": 0.030372558161616325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0372557375812903e-05, "grad_norm": 17.913869857788086, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8749328851699829, "num_tokens": 527089590.0, "step": 13820 }, { "epoch": 1.7581732603994404, "ewc_loss": 0.030241359025239944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.024135912710335e-05, "grad_norm": 17.903125762939453, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8675214648246765, "num_tokens": 527126696.0, "step": 13821 }, { "epoch": 1.7583004706780307, "ewc_loss": 0.03033478558063507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0334786060848273e-05, "grad_norm": 17.943601608276367, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8626865148544312, "num_tokens": 527164823.0, "step": 13822 }, { "epoch": 1.7584276809566213, "ewc_loss": 0.030279431492090225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.027943239430897e-05, "grad_norm": 17.923601150512695, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8532012701034546, "num_tokens": 527208032.0, "step": 13823 }, { "epoch": 1.7585548912352118, "ewc_loss": 0.03029322810471058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.029322760994546e-05, "grad_norm": 18.01601791381836, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8715777397155762, "num_tokens": 527242773.0, "step": 13824 }, { "epoch": 1.7586821015138023, "ewc_loss": 0.03034164197742939, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0341641831910238e-05, "grad_norm": 17.89005470275879, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8733856678009033, "num_tokens": 527277968.0, "step": 13825 }, { "epoch": 1.7588093117923929, "ewc_loss": 0.030241845175623894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0241844797274098e-05, "grad_norm": 17.91208839416504, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.884428858757019, "num_tokens": 527311077.0, "step": 13826 }, { "epoch": 1.7589365220709834, "ewc_loss": 0.030339907854795456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.033990833500866e-05, "grad_norm": 17.968788146972656, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.876708984375, "num_tokens": 527340543.0, "step": 13827 }, { "epoch": 1.7590637323495737, "ewc_loss": 0.030319463461637497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0319462894112803e-05, "grad_norm": 17.96846580505371, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8683569431304932, "num_tokens": 527379596.0, "step": 13828 }, { "epoch": 1.7591909426281642, "ewc_loss": 0.030348462983965874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0348463042173535e-05, "grad_norm": 17.994930267333984, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8704807162284851, "num_tokens": 527417873.0, "step": 13829 }, { "epoch": 1.7593181529067548, "ewc_loss": 0.030295655131340027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0295655960799195e-05, "grad_norm": 17.89436149597168, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8726327419281006, "num_tokens": 527458607.0, "step": 13830 }, { "epoch": 1.7594453631853453, "ewc_loss": 0.030257541686296463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0257540856837295e-05, "grad_norm": 17.942611694335938, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8634629249572754, "num_tokens": 527493299.0, "step": 13831 }, { "epoch": 1.7595725734639358, "ewc_loss": 0.03033345751464367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0333458198583685e-05, "grad_norm": 18.029876708984375, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8586926460266113, "num_tokens": 527536065.0, "step": 13832 }, { "epoch": 1.7596997837425263, "ewc_loss": 0.030305778607726097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0305778636829928e-05, "grad_norm": 17.978046417236328, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8559865951538086, "num_tokens": 527576477.0, "step": 13833 }, { "epoch": 1.7598269940211169, "ewc_loss": 0.03027997724711895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0279978091130033e-05, "grad_norm": 17.99878692626953, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8689441680908203, "num_tokens": 527611777.0, "step": 13834 }, { "epoch": 1.7599542042997074, "ewc_loss": 0.030259277671575546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.025927799171768e-05, "grad_norm": 17.982690811157227, "learning_rate": 1e-06, "loss": 0.3453, "mean_token_accuracy": 0.8880069255828857, "num_tokens": 527650236.0, "step": 13835 }, { "epoch": 1.760081414578298, "ewc_loss": 0.03025059960782528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0250599593273364e-05, "grad_norm": 17.916751861572266, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8767001628875732, "num_tokens": 527685486.0, "step": 13836 }, { "epoch": 1.7602086248568884, "ewc_loss": 0.030286258086562157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0286257242551073e-05, "grad_norm": 17.929384231567383, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8586087226867676, "num_tokens": 527725386.0, "step": 13837 }, { "epoch": 1.760335835135479, "ewc_loss": 0.030272923409938812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0272924050223082e-05, "grad_norm": 17.956491470336914, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8577519059181213, "num_tokens": 527756213.0, "step": 13838 }, { "epoch": 1.7604630454140695, "ewc_loss": 0.030321625992655754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.032162567251362e-05, "grad_norm": 17.94970703125, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8752740025520325, "num_tokens": 527793884.0, "step": 13839 }, { "epoch": 1.76059025569266, "ewc_loss": 0.030291611328721046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.029161052836571e-05, "grad_norm": 17.943309783935547, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8681629300117493, "num_tokens": 527826428.0, "step": 13840 }, { "epoch": 1.7607174659712506, "ewc_loss": 0.030336881056427956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0336881536641158e-05, "grad_norm": 17.941333770751953, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8632274866104126, "num_tokens": 527866691.0, "step": 13841 }, { "epoch": 1.760844676249841, "ewc_loss": 0.030304433777928352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0304434403660707e-05, "grad_norm": 17.920873641967773, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8790138959884644, "num_tokens": 527909482.0, "step": 13842 }, { "epoch": 1.7609718865284316, "ewc_loss": 0.030350927263498306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.035092777281534e-05, "grad_norm": 17.9771671295166, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8587924242019653, "num_tokens": 527947818.0, "step": 13843 }, { "epoch": 1.7610990968070221, "ewc_loss": 0.030378984287381172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.037898386537563e-05, "grad_norm": 17.963546752929688, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.874459445476532, "num_tokens": 527986760.0, "step": 13844 }, { "epoch": 1.7612263070856127, "ewc_loss": 0.030327463522553444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0327462809509598e-05, "grad_norm": 17.934906005859375, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8640357255935669, "num_tokens": 528024742.0, "step": 13845 }, { "epoch": 1.761353517364203, "ewc_loss": 0.030380409210920334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.038040995306801e-05, "grad_norm": 17.98935890197754, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8569886684417725, "num_tokens": 528060781.0, "step": 13846 }, { "epoch": 1.7614807276427935, "ewc_loss": 0.03035050444304943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0350503948284313e-05, "grad_norm": 17.87860870361328, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8720360994338989, "num_tokens": 528099051.0, "step": 13847 }, { "epoch": 1.761607937921384, "ewc_loss": 0.030377328395843506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0377328585018404e-05, "grad_norm": 17.991357803344727, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8649395108222961, "num_tokens": 528138325.0, "step": 13848 }, { "epoch": 1.7617351481999746, "ewc_loss": 0.030405480414628983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0405481084017083e-05, "grad_norm": 17.985544204711914, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8620747923851013, "num_tokens": 528177997.0, "step": 13849 }, { "epoch": 1.761862358478565, "ewc_loss": 0.030377568677067757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0377568691619672e-05, "grad_norm": 17.945810317993164, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8665976524353027, "num_tokens": 528216360.0, "step": 13850 }, { "epoch": 1.7619895687571556, "ewc_loss": 0.030342163518071175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0342163881869055e-05, "grad_norm": 18.02152442932129, "learning_rate": 1e-06, "loss": 0.3521, "mean_token_accuracy": 0.8848152756690979, "num_tokens": 528251100.0, "step": 13851 }, { "epoch": 1.762116779035746, "ewc_loss": 0.030433068051934242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.043306787731126e-05, "grad_norm": 18.075653076171875, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.874809205532074, "num_tokens": 528286298.0, "step": 13852 }, { "epoch": 1.7622439893143365, "ewc_loss": 0.030315153300762177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0315153708215803e-05, "grad_norm": 17.953840255737305, "learning_rate": 1e-06, "loss": 0.3533, "mean_token_accuracy": 0.8858906626701355, "num_tokens": 528322970.0, "step": 13853 }, { "epoch": 1.762371199592927, "ewc_loss": 0.030367007479071617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0367007639142685e-05, "grad_norm": 18.068262100219727, "learning_rate": 1e-06, "loss": 0.3529, "mean_token_accuracy": 0.8869363069534302, "num_tokens": 528355191.0, "step": 13854 }, { "epoch": 1.7624984098715175, "ewc_loss": 0.030361732468008995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.03617325698724e-05, "grad_norm": 18.000272750854492, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8701304197311401, "num_tokens": 528388975.0, "step": 13855 }, { "epoch": 1.762625620150108, "ewc_loss": 0.030251268297433853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.025126898137387e-05, "grad_norm": 17.909626007080078, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8761172294616699, "num_tokens": 528432139.0, "step": 13856 }, { "epoch": 1.7627528304286986, "ewc_loss": 0.030325615778565407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0325616535265e-05, "grad_norm": 18.017885208129883, "learning_rate": 1e-06, "loss": 0.3743, "mean_token_accuracy": 0.8808900713920593, "num_tokens": 528468315.0, "step": 13857 }, { "epoch": 1.762880040707289, "ewc_loss": 0.030307745561003685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.030774496437516e-05, "grad_norm": 17.891845703125, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8605033159255981, "num_tokens": 528510368.0, "step": 13858 }, { "epoch": 1.7630072509858796, "ewc_loss": 0.030334804207086563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.033480425074231e-05, "grad_norm": 17.95560073852539, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8742417693138123, "num_tokens": 528545637.0, "step": 13859 }, { "epoch": 1.7631344612644702, "ewc_loss": 0.030369410291314125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.036941052414477e-05, "grad_norm": 18.010944366455078, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8750625848770142, "num_tokens": 528586267.0, "step": 13860 }, { "epoch": 1.7632616715430607, "ewc_loss": 0.03030269965529442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0302699087769724e-05, "grad_norm": 17.88317108154297, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8746959567070007, "num_tokens": 528629640.0, "step": 13861 }, { "epoch": 1.7633888818216512, "ewc_loss": 0.030291227623820305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.029122672160156e-05, "grad_norm": 17.99199676513672, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8570079207420349, "num_tokens": 528665373.0, "step": 13862 }, { "epoch": 1.7635160921002417, "ewc_loss": 0.0303135197609663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.031352025573142e-05, "grad_norm": 17.913631439208984, "learning_rate": 1e-06, "loss": 0.3627, "mean_token_accuracy": 0.8836922645568848, "num_tokens": 528703426.0, "step": 13863 }, { "epoch": 1.7636433023788323, "ewc_loss": 0.030294720083475113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0294721000245772e-05, "grad_norm": 17.987098693847656, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8531368970870972, "num_tokens": 528744886.0, "step": 13864 }, { "epoch": 1.7637705126574228, "ewc_loss": 0.0303350780159235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.033507709915284e-05, "grad_norm": 17.984100341796875, "learning_rate": 1e-06, "loss": 0.3623, "mean_token_accuracy": 0.883094310760498, "num_tokens": 528779773.0, "step": 13865 }, { "epoch": 1.7638977229360133, "ewc_loss": 0.030319062992930412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0319062716444023e-05, "grad_norm": 17.98802375793457, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.873221755027771, "num_tokens": 528821878.0, "step": 13866 }, { "epoch": 1.7640249332146039, "ewc_loss": 0.03029962070286274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0299621357698925e-05, "grad_norm": 18.0103816986084, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8722466230392456, "num_tokens": 528858056.0, "step": 13867 }, { "epoch": 1.7641521434931944, "ewc_loss": 0.03030562959611416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0305629479698837e-05, "grad_norm": 17.966726303100586, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8682446479797363, "num_tokens": 528903116.0, "step": 13868 }, { "epoch": 1.764279353771785, "ewc_loss": 0.030256163328886032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.025616388185881e-05, "grad_norm": 17.95099449157715, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8653630614280701, "num_tokens": 528941561.0, "step": 13869 }, { "epoch": 1.7644065640503754, "ewc_loss": 0.03031979873776436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.031979940715246e-05, "grad_norm": 17.979690551757812, "learning_rate": 1e-06, "loss": 0.3503, "mean_token_accuracy": 0.8876575231552124, "num_tokens": 528976215.0, "step": 13870 }, { "epoch": 1.7645337743289657, "ewc_loss": 0.030294574797153473, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.029457548109349e-05, "grad_norm": 17.94964599609375, "learning_rate": 1e-06, "loss": 0.3763, "mean_token_accuracy": 0.8769582509994507, "num_tokens": 529019280.0, "step": 13871 }, { "epoch": 1.7646609846075563, "ewc_loss": 0.030241509899497032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0241510103223845e-05, "grad_norm": 17.86473846435547, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8744467496871948, "num_tokens": 529060733.0, "step": 13872 }, { "epoch": 1.7647881948861468, "ewc_loss": 0.030307859182357788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0307859560707584e-05, "grad_norm": 18.041994094848633, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.873211681842804, "num_tokens": 529094948.0, "step": 13873 }, { "epoch": 1.7649154051647373, "ewc_loss": 0.030296823009848595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.029682375199627e-05, "grad_norm": 17.807722091674805, "learning_rate": 1e-06, "loss": 0.3662, "mean_token_accuracy": 0.8812631368637085, "num_tokens": 529131374.0, "step": 13874 }, { "epoch": 1.7650426154433279, "ewc_loss": 0.030322369188070297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.032236963917967e-05, "grad_norm": 18.032855987548828, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8677947521209717, "num_tokens": 529169133.0, "step": 13875 }, { "epoch": 1.7651698257219184, "ewc_loss": 0.030410686507821083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.041068703169003e-05, "grad_norm": 17.956132888793945, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8684114217758179, "num_tokens": 529206467.0, "step": 13876 }, { "epoch": 1.7652970360005087, "ewc_loss": 0.030254855751991272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.025485602847766e-05, "grad_norm": 17.946208953857422, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.861655056476593, "num_tokens": 529244891.0, "step": 13877 }, { "epoch": 1.7654242462790992, "ewc_loss": 0.030431339517235756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.043133983737789e-05, "grad_norm": 18.00103759765625, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8730514049530029, "num_tokens": 529277384.0, "step": 13878 }, { "epoch": 1.7655514565576897, "ewc_loss": 0.030335422605276108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0335422707139514e-05, "grad_norm": 17.945758819580078, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.874661922454834, "num_tokens": 529322530.0, "step": 13879 }, { "epoch": 1.7656786668362803, "ewc_loss": 0.030308935791254044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0308936402434483e-05, "grad_norm": 18.02916145324707, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8657952547073364, "num_tokens": 529356345.0, "step": 13880 }, { "epoch": 1.7658058771148708, "ewc_loss": 0.030338753014802933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0338753276737407e-05, "grad_norm": 17.97068977355957, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8726340532302856, "num_tokens": 529391877.0, "step": 13881 }, { "epoch": 1.7659330873934613, "ewc_loss": 0.030317887663841248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0317887649289332e-05, "grad_norm": 18.020639419555664, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8779039978981018, "num_tokens": 529425460.0, "step": 13882 }, { "epoch": 1.7660602976720519, "ewc_loss": 0.0303430687636137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0343067919602618e-05, "grad_norm": 17.967329025268555, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8619706630706787, "num_tokens": 529459660.0, "step": 13883 }, { "epoch": 1.7661875079506424, "ewc_loss": 0.03034905157983303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0349052394740283e-05, "grad_norm": 17.962299346923828, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8634994626045227, "num_tokens": 529501839.0, "step": 13884 }, { "epoch": 1.766314718229233, "ewc_loss": 0.03035295382142067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.035295412701089e-05, "grad_norm": 18.023849487304688, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8687165975570679, "num_tokens": 529540828.0, "step": 13885 }, { "epoch": 1.7664419285078234, "ewc_loss": 0.030291354283690453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.029135405085981e-05, "grad_norm": 17.92896270751953, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8707266449928284, "num_tokens": 529581578.0, "step": 13886 }, { "epoch": 1.766569138786414, "ewc_loss": 0.030347749590873718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0347749998327345e-05, "grad_norm": 18.017894744873047, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8706096410751343, "num_tokens": 529624363.0, "step": 13887 }, { "epoch": 1.7666963490650045, "ewc_loss": 0.030351370573043823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.03513697872404e-05, "grad_norm": 17.97576332092285, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8775606155395508, "num_tokens": 529665286.0, "step": 13888 }, { "epoch": 1.766823559343595, "ewc_loss": 0.030302030965685844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0302031518658623e-05, "grad_norm": 17.95720672607422, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8665375709533691, "num_tokens": 529704150.0, "step": 13889 }, { "epoch": 1.7669507696221856, "ewc_loss": 0.030375856906175613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0375857022590935e-05, "grad_norm": 17.975584030151367, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8727203607559204, "num_tokens": 529742110.0, "step": 13890 }, { "epoch": 1.767077979900776, "ewc_loss": 0.03033628687262535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.03362867271062e-05, "grad_norm": 18.02246856689453, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8701655864715576, "num_tokens": 529776158.0, "step": 13891 }, { "epoch": 1.7672051901793666, "ewc_loss": 0.03028975985944271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0289760616142303e-05, "grad_norm": 17.978822708129883, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8488407135009766, "num_tokens": 529814002.0, "step": 13892 }, { "epoch": 1.7673324004579571, "ewc_loss": 0.03029007464647293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0290075301309116e-05, "grad_norm": 18.004121780395508, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8741838335990906, "num_tokens": 529846696.0, "step": 13893 }, { "epoch": 1.7674596107365477, "ewc_loss": 0.030368715524673462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0368715670192614e-05, "grad_norm": 18.094051361083984, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8682541847229004, "num_tokens": 529881557.0, "step": 13894 }, { "epoch": 1.767586821015138, "ewc_loss": 0.03031209483742714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.031209416803904e-05, "grad_norm": 18.019163131713867, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8584114909172058, "num_tokens": 529924337.0, "step": 13895 }, { "epoch": 1.7677140312937285, "ewc_loss": 0.030227232724428177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0227232855395414e-05, "grad_norm": 17.972333908081055, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.869767427444458, "num_tokens": 529967195.0, "step": 13896 }, { "epoch": 1.767841241572319, "ewc_loss": 0.030264247208833694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.026424747076817e-05, "grad_norm": 17.990402221679688, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8516696095466614, "num_tokens": 530003454.0, "step": 13897 }, { "epoch": 1.7679684518509096, "ewc_loss": 0.03027508594095707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.02750868286239e-05, "grad_norm": 17.93198013305664, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8452991843223572, "num_tokens": 530042577.0, "step": 13898 }, { "epoch": 1.7680956621295, "ewc_loss": 0.03029724583029747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0297245757537894e-05, "grad_norm": 18.0145320892334, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8555623888969421, "num_tokens": 530084903.0, "step": 13899 }, { "epoch": 1.7682228724080906, "ewc_loss": 0.030374594032764435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0374594643944874e-05, "grad_norm": 17.978755950927734, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8662437200546265, "num_tokens": 530117598.0, "step": 13900 }, { "epoch": 1.768350082686681, "ewc_loss": 0.030301226302981377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0301225706352852e-05, "grad_norm": 18.035572052001953, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8726842403411865, "num_tokens": 530151891.0, "step": 13901 }, { "epoch": 1.7684772929652715, "ewc_loss": 0.030391039326786995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0391040127142332e-05, "grad_norm": 18.021923065185547, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.861785352230072, "num_tokens": 530191263.0, "step": 13902 }, { "epoch": 1.768604503243862, "ewc_loss": 0.03032657317817211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0326573323691264e-05, "grad_norm": 17.97504997253418, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8655402660369873, "num_tokens": 530231156.0, "step": 13903 }, { "epoch": 1.7687317135224525, "ewc_loss": 0.03030189871788025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0301898732432164e-05, "grad_norm": 17.976106643676758, "learning_rate": 1e-06, "loss": 0.3639, "mean_token_accuracy": 0.8831180334091187, "num_tokens": 530270089.0, "step": 13904 }, { "epoch": 1.768858923801043, "ewc_loss": 0.030324455350637436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0324456020025536e-05, "grad_norm": 17.99362564086914, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8735585808753967, "num_tokens": 530309808.0, "step": 13905 }, { "epoch": 1.7689861340796336, "ewc_loss": 0.030352255329489708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0352255635079928e-05, "grad_norm": 17.956388473510742, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8591493964195251, "num_tokens": 530357723.0, "step": 13906 }, { "epoch": 1.769113344358224, "ewc_loss": 0.03028569184243679, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.028569153684657e-05, "grad_norm": 18.063701629638672, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.857049822807312, "num_tokens": 530389586.0, "step": 13907 }, { "epoch": 1.7692405546368146, "ewc_loss": 0.030337046831846237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.033704706467688e-05, "grad_norm": 17.95256996154785, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8602755069732666, "num_tokens": 530430356.0, "step": 13908 }, { "epoch": 1.7693677649154052, "ewc_loss": 0.030226336792111397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0226336093619466e-05, "grad_norm": 17.91252326965332, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8744218349456787, "num_tokens": 530472859.0, "step": 13909 }, { "epoch": 1.7694949751939957, "ewc_loss": 0.030364887788891792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0364888516487554e-05, "grad_norm": 17.994827270507812, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8572933673858643, "num_tokens": 530515422.0, "step": 13910 }, { "epoch": 1.7696221854725862, "ewc_loss": 0.030282311141490936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0282311854534782e-05, "grad_norm": 17.922574996948242, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8679434657096863, "num_tokens": 530553842.0, "step": 13911 }, { "epoch": 1.7697493957511767, "ewc_loss": 0.030321922153234482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0321922167786397e-05, "grad_norm": 17.97637367248535, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8685933947563171, "num_tokens": 530592593.0, "step": 13912 }, { "epoch": 1.7698766060297673, "ewc_loss": 0.030315525829792023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.031552660104353e-05, "grad_norm": 17.8666934967041, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8638620972633362, "num_tokens": 530632309.0, "step": 13913 }, { "epoch": 1.7700038163083578, "ewc_loss": 0.030368415638804436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.036841553694103e-05, "grad_norm": 18.048667907714844, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8647770881652832, "num_tokens": 530667415.0, "step": 13914 }, { "epoch": 1.7701310265869483, "ewc_loss": 0.03040940687060356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0409406463149935e-05, "grad_norm": 17.911088943481445, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8667510747909546, "num_tokens": 530705736.0, "step": 13915 }, { "epoch": 1.7702582368655388, "ewc_loss": 0.030272550880908966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0272551157395355e-05, "grad_norm": 18.017183303833008, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8781842589378357, "num_tokens": 530744931.0, "step": 13916 }, { "epoch": 1.7703854471441294, "ewc_loss": 0.030443649739027023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0443648938671686e-05, "grad_norm": 17.973031997680664, "learning_rate": 1e-06, "loss": 0.3619, "mean_token_accuracy": 0.8816735744476318, "num_tokens": 530784700.0, "step": 13917 }, { "epoch": 1.77051265742272, "ewc_loss": 0.030329100787639618, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.032909989997279e-05, "grad_norm": 17.966890335083008, "learning_rate": 1e-06, "loss": 0.3555, "mean_token_accuracy": 0.8860977292060852, "num_tokens": 530821097.0, "step": 13918 }, { "epoch": 1.7706398677013102, "ewc_loss": 0.030351711437106133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0351711757248268e-05, "grad_norm": 17.99320411682129, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8633589744567871, "num_tokens": 530860444.0, "step": 13919 }, { "epoch": 1.7707670779799007, "ewc_loss": 0.03030773065984249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0307730412459932e-05, "grad_norm": 17.958934783935547, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8665031790733337, "num_tokens": 530907304.0, "step": 13920 }, { "epoch": 1.7708942882584913, "ewc_loss": 0.030351225286722183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.035122608707752e-05, "grad_norm": 18.01859474182129, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8621463775634766, "num_tokens": 530945692.0, "step": 13921 }, { "epoch": 1.7710214985370818, "ewc_loss": 0.03030821867287159, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0308217901620083e-05, "grad_norm": 17.873781204223633, "learning_rate": 1e-06, "loss": 0.3686, "mean_token_accuracy": 0.8796345591545105, "num_tokens": 530986695.0, "step": 13922 }, { "epoch": 1.7711487088156723, "ewc_loss": 0.03036348521709442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.036348607565742e-05, "grad_norm": 17.980287551879883, "learning_rate": 1e-06, "loss": 0.3637, "mean_token_accuracy": 0.8806120753288269, "num_tokens": 531022656.0, "step": 13923 }, { "epoch": 1.7712759190942629, "ewc_loss": 0.030362319201231003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0362318284460343e-05, "grad_norm": 18.01514434814453, "learning_rate": 1e-06, "loss": 0.379, "mean_token_accuracy": 0.8784916400909424, "num_tokens": 531056896.0, "step": 13924 }, { "epoch": 1.7714031293728534, "ewc_loss": 0.030303535982966423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0303535822895356e-05, "grad_norm": 17.91014862060547, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8423842787742615, "num_tokens": 531097286.0, "step": 13925 }, { "epoch": 1.7715303396514437, "ewc_loss": 0.03031674027442932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0316739866975695e-05, "grad_norm": 17.961130142211914, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8775100708007812, "num_tokens": 531136430.0, "step": 13926 }, { "epoch": 1.7716575499300342, "ewc_loss": 0.030381793156266212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0381792385014705e-05, "grad_norm": 17.960166931152344, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8621668815612793, "num_tokens": 531176698.0, "step": 13927 }, { "epoch": 1.7717847602086247, "ewc_loss": 0.030341962352395058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0341961974045262e-05, "grad_norm": 17.950735092163086, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8685315251350403, "num_tokens": 531214343.0, "step": 13928 }, { "epoch": 1.7719119704872153, "ewc_loss": 0.030326588079333305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0326587875606492e-05, "grad_norm": 18.028478622436523, "learning_rate": 1e-06, "loss": 0.3731, "mean_token_accuracy": 0.88148432970047, "num_tokens": 531252325.0, "step": 13929 }, { "epoch": 1.7720391807658058, "ewc_loss": 0.030318761244416237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0318760764203034e-05, "grad_norm": 17.965105056762695, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.857780396938324, "num_tokens": 531290378.0, "step": 13930 }, { "epoch": 1.7721663910443963, "ewc_loss": 0.03032684326171875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.032684253412299e-05, "grad_norm": 17.956514358520508, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8613265752792358, "num_tokens": 531330406.0, "step": 13931 }, { "epoch": 1.7722936013229869, "ewc_loss": 0.030312566086649895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0312565286294557e-05, "grad_norm": 18.011322021484375, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8695994019508362, "num_tokens": 531367425.0, "step": 13932 }, { "epoch": 1.7724208116015774, "ewc_loss": 0.030340412631630898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.034041219507344e-05, "grad_norm": 17.962974548339844, "learning_rate": 1e-06, "loss": 0.3481, "mean_token_accuracy": 0.8904241323471069, "num_tokens": 531406465.0, "step": 13933 }, { "epoch": 1.772548021880168, "ewc_loss": 0.030299197882413864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.02991975331679e-05, "grad_norm": 17.97548484802246, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.858808159828186, "num_tokens": 531445356.0, "step": 13934 }, { "epoch": 1.7726752321587584, "ewc_loss": 0.030336571857333183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0336572308442555e-05, "grad_norm": 17.94960594177246, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8672589063644409, "num_tokens": 531484737.0, "step": 13935 }, { "epoch": 1.772802442437349, "ewc_loss": 0.03034038096666336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.034038127225358e-05, "grad_norm": 18.047931671142578, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.865814208984375, "num_tokens": 531523453.0, "step": 13936 }, { "epoch": 1.7729296527159395, "ewc_loss": 0.030265824869275093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0265824534581043e-05, "grad_norm": 17.913148880004883, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8753626346588135, "num_tokens": 531559627.0, "step": 13937 }, { "epoch": 1.77305686299453, "ewc_loss": 0.030266284942626953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.026628473890014e-05, "grad_norm": 17.940433502197266, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8744563460350037, "num_tokens": 531597677.0, "step": 13938 }, { "epoch": 1.7731840732731206, "ewc_loss": 0.030319245532155037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0319244615384378e-05, "grad_norm": 17.90762710571289, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8777045607566833, "num_tokens": 531642753.0, "step": 13939 }, { "epoch": 1.773311283551711, "ewc_loss": 0.03029291331768036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0292912924778648e-05, "grad_norm": 17.98291778564453, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8618658781051636, "num_tokens": 531688249.0, "step": 13940 }, { "epoch": 1.7734384938303016, "ewc_loss": 0.030339648947119713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.033964821952395e-05, "grad_norm": 17.882156372070312, "learning_rate": 1e-06, "loss": 0.3672, "mean_token_accuracy": 0.8835242986679077, "num_tokens": 531727994.0, "step": 13941 }, { "epoch": 1.7735657041088921, "ewc_loss": 0.03030036762356758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.030036714335438e-05, "grad_norm": 18.055774688720703, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.873249888420105, "num_tokens": 531764428.0, "step": 13942 }, { "epoch": 1.7736929143874827, "ewc_loss": 0.030348503962159157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0348503059940413e-05, "grad_norm": 17.887313842773438, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8651509284973145, "num_tokens": 531806604.0, "step": 13943 }, { "epoch": 1.773820124666073, "ewc_loss": 0.030248768627643585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0248767870943993e-05, "grad_norm": 18.078125, "learning_rate": 1e-06, "loss": 0.3479, "mean_token_accuracy": 0.8875240087509155, "num_tokens": 531845504.0, "step": 13944 }, { "epoch": 1.7739473349446635, "ewc_loss": 0.03036855161190033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0368551961146295e-05, "grad_norm": 17.97078514099121, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8794747591018677, "num_tokens": 531880962.0, "step": 13945 }, { "epoch": 1.774074545223254, "ewc_loss": 0.030276766046881676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.027676575584337e-05, "grad_norm": 18.02950668334961, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8671706318855286, "num_tokens": 531918182.0, "step": 13946 }, { "epoch": 1.7742017555018446, "ewc_loss": 0.030289100483059883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0289100322988816e-05, "grad_norm": 17.923416137695312, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8704506158828735, "num_tokens": 531958340.0, "step": 13947 }, { "epoch": 1.774328965780435, "ewc_loss": 0.030253808945417404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.025380829058122e-05, "grad_norm": 18.093873977661133, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8581568002700806, "num_tokens": 531996951.0, "step": 13948 }, { "epoch": 1.7744561760590256, "ewc_loss": 0.030367525294423103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0367526051122695e-05, "grad_norm": 17.90738296508789, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8736087083816528, "num_tokens": 532036963.0, "step": 13949 }, { "epoch": 1.774583386337616, "ewc_loss": 0.030201619490981102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0201619665604085e-05, "grad_norm": 18.0374813079834, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8639611005783081, "num_tokens": 532070937.0, "step": 13950 }, { "epoch": 1.7747105966162064, "ewc_loss": 0.030307820066809654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0307819542940706e-05, "grad_norm": 17.928050994873047, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.856212854385376, "num_tokens": 532107866.0, "step": 13951 }, { "epoch": 1.774837806894797, "ewc_loss": 0.030220746994018555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.022074633918237e-05, "grad_norm": 18.017131805419922, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8627567887306213, "num_tokens": 532151485.0, "step": 13952 }, { "epoch": 1.7749650171733875, "ewc_loss": 0.030338328331708908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0338327633216977e-05, "grad_norm": 18.038074493408203, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8757364749908447, "num_tokens": 532191209.0, "step": 13953 }, { "epoch": 1.775092227451978, "ewc_loss": 0.03025571070611477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0255710953497328e-05, "grad_norm": 17.975711822509766, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8655872344970703, "num_tokens": 532232219.0, "step": 13954 }, { "epoch": 1.7752194377305686, "ewc_loss": 0.030268469825387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.02684693451738e-05, "grad_norm": 18.024877548217773, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8578407764434814, "num_tokens": 532265330.0, "step": 13955 }, { "epoch": 1.775346648009159, "ewc_loss": 0.030272619798779488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.027262027899269e-05, "grad_norm": 17.976177215576172, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8682554364204407, "num_tokens": 532295964.0, "step": 13956 }, { "epoch": 1.7754738582877496, "ewc_loss": 0.030329514294862747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0329514629556797e-05, "grad_norm": 18.011642456054688, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8727394938468933, "num_tokens": 532333804.0, "step": 13957 }, { "epoch": 1.7756010685663401, "ewc_loss": 0.030311284586787224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.031128471775446e-05, "grad_norm": 18.009857177734375, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8733139634132385, "num_tokens": 532373088.0, "step": 13958 }, { "epoch": 1.7757282788449307, "ewc_loss": 0.030352042987942696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0352042813319713e-05, "grad_norm": 17.912633895874023, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8671000003814697, "num_tokens": 532409473.0, "step": 13959 }, { "epoch": 1.7758554891235212, "ewc_loss": 0.030346132814884186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0346132916747592e-05, "grad_norm": 17.978240966796875, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.880815863609314, "num_tokens": 532447883.0, "step": 13960 }, { "epoch": 1.7759826994021117, "ewc_loss": 0.03040139377117157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0401393814827316e-05, "grad_norm": 17.934511184692383, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8741816282272339, "num_tokens": 532485783.0, "step": 13961 }, { "epoch": 1.7761099096807023, "ewc_loss": 0.030349761247634888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0349761800607666e-05, "grad_norm": 17.95838737487793, "learning_rate": 1e-06, "loss": 0.3846, "mean_token_accuracy": 0.8754324316978455, "num_tokens": 532511082.0, "step": 13962 }, { "epoch": 1.7762371199592928, "ewc_loss": 0.03037162311375141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.037162241525948e-05, "grad_norm": 17.875064849853516, "learning_rate": 1e-06, "loss": 0.3615, "mean_token_accuracy": 0.8857871890068054, "num_tokens": 532555516.0, "step": 13963 }, { "epoch": 1.7763643302378833, "ewc_loss": 0.03041122667491436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0411227271542884e-05, "grad_norm": 17.92957878112793, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8545252084732056, "num_tokens": 532599200.0, "step": 13964 }, { "epoch": 1.7764915405164738, "ewc_loss": 0.03040541708469391, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.040541741938796e-05, "grad_norm": 17.920787811279297, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8806910514831543, "num_tokens": 532636335.0, "step": 13965 }, { "epoch": 1.7766187507950644, "ewc_loss": 0.03049272671341896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0492727091768757e-05, "grad_norm": 17.97369384765625, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8815670609474182, "num_tokens": 532678337.0, "step": 13966 }, { "epoch": 1.776745961073655, "ewc_loss": 0.030437346547842026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.04373461403884e-05, "grad_norm": 17.892492294311523, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8657311201095581, "num_tokens": 532715051.0, "step": 13967 }, { "epoch": 1.7768731713522452, "ewc_loss": 0.03046063706278801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.04606364807114e-05, "grad_norm": 17.982730865478516, "learning_rate": 1e-06, "loss": 0.3561, "mean_token_accuracy": 0.8806577324867249, "num_tokens": 532750836.0, "step": 13968 }, { "epoch": 1.7770003816308357, "ewc_loss": 0.030537467449903488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053746695513837e-05, "grad_norm": 17.964685440063477, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8762264251708984, "num_tokens": 532790331.0, "step": 13969 }, { "epoch": 1.7771275919094263, "ewc_loss": 0.030432892963290215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.043289325432852e-05, "grad_norm": 17.97594451904297, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8665470480918884, "num_tokens": 532830169.0, "step": 13970 }, { "epoch": 1.7772548021880168, "ewc_loss": 0.030467119067907333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0467119358945638e-05, "grad_norm": 17.91163444519043, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8623250722885132, "num_tokens": 532865957.0, "step": 13971 }, { "epoch": 1.7773820124666073, "ewc_loss": 0.030478017404675484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.047801692446228e-05, "grad_norm": 17.996789932250977, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8661254048347473, "num_tokens": 532906112.0, "step": 13972 }, { "epoch": 1.7775092227451978, "ewc_loss": 0.030519120395183563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.051912062801421e-05, "grad_norm": 17.955095291137695, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8758485317230225, "num_tokens": 532939676.0, "step": 13973 }, { "epoch": 1.7776364330237884, "ewc_loss": 0.030417121946811676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0417122616199777e-05, "grad_norm": 18.005287170410156, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8521820902824402, "num_tokens": 532977668.0, "step": 13974 }, { "epoch": 1.7777636433023787, "ewc_loss": 0.030509518459439278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.050951818295289e-05, "grad_norm": 17.907291412353516, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8670864701271057, "num_tokens": 533013862.0, "step": 13975 }, { "epoch": 1.7778908535809692, "ewc_loss": 0.030468877404928207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0468878321698867e-05, "grad_norm": 17.947296142578125, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8588923215866089, "num_tokens": 533051222.0, "step": 13976 }, { "epoch": 1.7780180638595597, "ewc_loss": 0.030517684295773506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0517683626385406e-05, "grad_norm": 17.965866088867188, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8672217726707458, "num_tokens": 533090182.0, "step": 13977 }, { "epoch": 1.7781452741381503, "ewc_loss": 0.030422115698456764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.042211574211251e-05, "grad_norm": 17.943103790283203, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.8833334445953369, "num_tokens": 533129600.0, "step": 13978 }, { "epoch": 1.7782724844167408, "ewc_loss": 0.03045138344168663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0451383281615563e-05, "grad_norm": 17.925434112548828, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8695858120918274, "num_tokens": 533172351.0, "step": 13979 }, { "epoch": 1.7783996946953313, "ewc_loss": 0.030450474470853806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.045047378691379e-05, "grad_norm": 17.90920066833496, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8671625852584839, "num_tokens": 533211583.0, "step": 13980 }, { "epoch": 1.7785269049739219, "ewc_loss": 0.030405160039663315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.040516094188206e-05, "grad_norm": 17.903369903564453, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8759521245956421, "num_tokens": 533249960.0, "step": 13981 }, { "epoch": 1.7786541152525124, "ewc_loss": 0.030492672696709633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.049267252208665e-05, "grad_norm": 17.973243713378906, "learning_rate": 1e-06, "loss": 0.3716, "mean_token_accuracy": 0.8772680759429932, "num_tokens": 533287328.0, "step": 13982 }, { "epoch": 1.778781325531103, "ewc_loss": 0.03045107237994671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0451072234427556e-05, "grad_norm": 17.945310592651367, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8706548810005188, "num_tokens": 533327772.0, "step": 13983 }, { "epoch": 1.7789085358096934, "ewc_loss": 0.03046237863600254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0462379072559997e-05, "grad_norm": 17.972917556762695, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8636299967765808, "num_tokens": 533372004.0, "step": 13984 }, { "epoch": 1.779035746088284, "ewc_loss": 0.03043985180556774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0439852707786486e-05, "grad_norm": 17.95325469970703, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8673651218414307, "num_tokens": 533409502.0, "step": 13985 }, { "epoch": 1.7791629563668745, "ewc_loss": 0.030383968725800514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0383967896341346e-05, "grad_norm": 17.923444747924805, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8578770160675049, "num_tokens": 533451160.0, "step": 13986 }, { "epoch": 1.779290166645465, "ewc_loss": 0.030400563031435013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0400562536669895e-05, "grad_norm": 17.983524322509766, "learning_rate": 1e-06, "loss": 0.3799, "mean_token_accuracy": 0.8782162666320801, "num_tokens": 533480413.0, "step": 13987 }, { "epoch": 1.7794173769240555, "ewc_loss": 0.030431924387812614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.043192373297643e-05, "grad_norm": 17.904516220092773, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8587085604667664, "num_tokens": 533518826.0, "step": 13988 }, { "epoch": 1.779544587202646, "ewc_loss": 0.03043871745467186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0438717658398673e-05, "grad_norm": 17.941001892089844, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8582321405410767, "num_tokens": 533563760.0, "step": 13989 }, { "epoch": 1.7796717974812366, "ewc_loss": 0.030466953292489052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0466953830909915e-05, "grad_norm": 17.953649520874023, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8697292804718018, "num_tokens": 533601499.0, "step": 13990 }, { "epoch": 1.7797990077598271, "ewc_loss": 0.03049500286579132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0495002647512592e-05, "grad_norm": 18.050254821777344, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8732810020446777, "num_tokens": 533640757.0, "step": 13991 }, { "epoch": 1.7799262180384177, "ewc_loss": 0.030454715713858604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.045471567020286e-05, "grad_norm": 17.940185546875, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.871943473815918, "num_tokens": 533678047.0, "step": 13992 }, { "epoch": 1.780053428317008, "ewc_loss": 0.030361535027623177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0361534300027415e-05, "grad_norm": 17.989238739013672, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8703232407569885, "num_tokens": 533720903.0, "step": 13993 }, { "epoch": 1.7801806385955985, "ewc_loss": 0.030506812036037445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0506811526720412e-05, "grad_norm": 17.970067977905273, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8712373375892639, "num_tokens": 533759720.0, "step": 13994 }, { "epoch": 1.780307848874189, "ewc_loss": 0.030393503606319427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0393503038794734e-05, "grad_norm": 17.95442771911621, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.858045220375061, "num_tokens": 533796122.0, "step": 13995 }, { "epoch": 1.7804350591527796, "ewc_loss": 0.030467720702290535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.046772144443821e-05, "grad_norm": 18.006229400634766, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8761543035507202, "num_tokens": 533836326.0, "step": 13996 }, { "epoch": 1.78056226943137, "ewc_loss": 0.030466396361589432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.046639722015243e-05, "grad_norm": 17.9127254486084, "learning_rate": 1e-06, "loss": 0.3589, "mean_token_accuracy": 0.8818404674530029, "num_tokens": 533867874.0, "step": 13997 }, { "epoch": 1.7806894797099606, "ewc_loss": 0.03048234060406685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.048234066227451e-05, "grad_norm": 18.10164451599121, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.8826406598091125, "num_tokens": 533902877.0, "step": 13998 }, { "epoch": 1.780816689988551, "ewc_loss": 0.030458949506282806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0458950277534313e-05, "grad_norm": 17.964872360229492, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8729144334793091, "num_tokens": 533937440.0, "step": 13999 }, { "epoch": 1.7809439002671414, "ewc_loss": 0.03040197677910328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.040197589143645e-05, "grad_norm": 17.987262725830078, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8796713948249817, "num_tokens": 533972308.0, "step": 14000 }, { "epoch": 1.781071110545732, "ewc_loss": 0.030481137335300446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0481136491289362e-05, "grad_norm": 18.089637756347656, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.857973575592041, "num_tokens": 534011913.0, "step": 14001 }, { "epoch": 1.7811983208243225, "ewc_loss": 0.030441582202911377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0441582566709258e-05, "grad_norm": 18.001649856567383, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8810346126556396, "num_tokens": 534057235.0, "step": 14002 }, { "epoch": 1.781325531102913, "ewc_loss": 0.030436841771006584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0436842280323617e-05, "grad_norm": 18.0219783782959, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8694660663604736, "num_tokens": 534092850.0, "step": 14003 }, { "epoch": 1.7814527413815036, "ewc_loss": 0.030455617234110832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0455617888947017e-05, "grad_norm": 18.02627182006836, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8747901916503906, "num_tokens": 534135310.0, "step": 14004 }, { "epoch": 1.781579951660094, "ewc_loss": 0.030406296253204346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.040629599126987e-05, "grad_norm": 18.006298065185547, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8681116700172424, "num_tokens": 534176008.0, "step": 14005 }, { "epoch": 1.7817071619386846, "ewc_loss": 0.030442388728260994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.044238837901503e-05, "grad_norm": 18.061328887939453, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8702636957168579, "num_tokens": 534208217.0, "step": 14006 }, { "epoch": 1.7818343722172751, "ewc_loss": 0.03040895238518715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.040895171579905e-05, "grad_norm": 18.05966567993164, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8737231492996216, "num_tokens": 534247564.0, "step": 14007 }, { "epoch": 1.7819615824958657, "ewc_loss": 0.030405985191464424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0405984944081865e-05, "grad_norm": 17.978490829467773, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8512790203094482, "num_tokens": 534281144.0, "step": 14008 }, { "epoch": 1.7820887927744562, "ewc_loss": 0.030375180765986443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0375180358532816e-05, "grad_norm": 18.022863388061523, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8604037761688232, "num_tokens": 534317893.0, "step": 14009 }, { "epoch": 1.7822160030530467, "ewc_loss": 0.030475104227662086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0475104722427204e-05, "grad_norm": 18.074583053588867, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8682703971862793, "num_tokens": 534356116.0, "step": 14010 }, { "epoch": 1.7823432133316373, "ewc_loss": 0.03037237375974655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0372373657883145e-05, "grad_norm": 17.926469802856445, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8450524806976318, "num_tokens": 534390499.0, "step": 14011 }, { "epoch": 1.7824704236102278, "ewc_loss": 0.030442334711551666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0442333809332922e-05, "grad_norm": 18.012083053588867, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8640708327293396, "num_tokens": 534424242.0, "step": 14012 }, { "epoch": 1.7825976338888183, "ewc_loss": 0.03046659380197525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0466593671008013e-05, "grad_norm": 17.985275268554688, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.861190140247345, "num_tokens": 534467465.0, "step": 14013 }, { "epoch": 1.7827248441674088, "ewc_loss": 0.030482858419418335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.048285907425452e-05, "grad_norm": 17.965641021728516, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8648198843002319, "num_tokens": 534502959.0, "step": 14014 }, { "epoch": 1.7828520544459994, "ewc_loss": 0.030457166954874992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.045716766791884e-05, "grad_norm": 17.87327766418457, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8702980279922485, "num_tokens": 534549888.0, "step": 14015 }, { "epoch": 1.78297926472459, "ewc_loss": 0.030421238392591476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.042123898922e-05, "grad_norm": 17.9406795501709, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8722343444824219, "num_tokens": 534591686.0, "step": 14016 }, { "epoch": 1.7831064750031802, "ewc_loss": 0.030564449727535248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056444984395057e-05, "grad_norm": 18.058176040649414, "learning_rate": 1e-06, "loss": 0.3563, "mean_token_accuracy": 0.8854826092720032, "num_tokens": 534628584.0, "step": 14017 }, { "epoch": 1.7832336852817707, "ewc_loss": 0.03051452711224556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0514527679770254e-05, "grad_norm": 17.94249153137207, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8600215911865234, "num_tokens": 534665684.0, "step": 14018 }, { "epoch": 1.7833608955603613, "ewc_loss": 0.030469276010990143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0469276680378243e-05, "grad_norm": 18.00232696533203, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8718703985214233, "num_tokens": 534701604.0, "step": 14019 }, { "epoch": 1.7834881058389518, "ewc_loss": 0.03054192289710045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.054192347917706e-05, "grad_norm": 17.99679946899414, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8540405035018921, "num_tokens": 534741765.0, "step": 14020 }, { "epoch": 1.7836153161175423, "ewc_loss": 0.03043416514992714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0434164727921598e-05, "grad_norm": 17.87852668762207, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8787841796875, "num_tokens": 534780331.0, "step": 14021 }, { "epoch": 1.7837425263961328, "ewc_loss": 0.030480455607175827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0480456189252436e-05, "grad_norm": 18.080059051513672, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8582710027694702, "num_tokens": 534816254.0, "step": 14022 }, { "epoch": 1.7838697366747234, "ewc_loss": 0.030542071908712387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.054207263630815e-05, "grad_norm": 17.937864303588867, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8685563802719116, "num_tokens": 534851367.0, "step": 14023 }, { "epoch": 1.7839969469533137, "ewc_loss": 0.030421435832977295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0421435440075584e-05, "grad_norm": 17.97772789001465, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8668785691261292, "num_tokens": 534889077.0, "step": 14024 }, { "epoch": 1.7841241572319042, "ewc_loss": 0.030557598918676376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0557599529856816e-05, "grad_norm": 17.938167572021484, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8588999509811401, "num_tokens": 534924249.0, "step": 14025 }, { "epoch": 1.7842513675104947, "ewc_loss": 0.03052457794547081, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0524577596224844e-05, "grad_norm": 18.044958114624023, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8544548749923706, "num_tokens": 534966370.0, "step": 14026 }, { "epoch": 1.7843785777890853, "ewc_loss": 0.030543994158506393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.054399348911829e-05, "grad_norm": 18.02433204650879, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.87636798620224, "num_tokens": 535005869.0, "step": 14027 }, { "epoch": 1.7845057880676758, "ewc_loss": 0.030503932386636734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.05039320664946e-05, "grad_norm": 17.97989273071289, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8666239976882935, "num_tokens": 535042057.0, "step": 14028 }, { "epoch": 1.7846329983462663, "ewc_loss": 0.03056061454117298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0560615414287895e-05, "grad_norm": 18.029674530029297, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8640287518501282, "num_tokens": 535074410.0, "step": 14029 }, { "epoch": 1.7847602086248568, "ewc_loss": 0.03056752309203148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056752393604256e-05, "grad_norm": 17.976886749267578, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8601938486099243, "num_tokens": 535120356.0, "step": 14030 }, { "epoch": 1.7848874189034474, "ewc_loss": 0.030492201447486877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0492201403831132e-05, "grad_norm": 18.07288932800293, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8597421646118164, "num_tokens": 535156939.0, "step": 14031 }, { "epoch": 1.785014629182038, "ewc_loss": 0.030537724494934082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0537725251633674e-05, "grad_norm": 17.91143035888672, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8719601631164551, "num_tokens": 535195553.0, "step": 14032 }, { "epoch": 1.7851418394606284, "ewc_loss": 0.03047027438879013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.047027530556079e-05, "grad_norm": 17.997034072875977, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.87348473072052, "num_tokens": 535235692.0, "step": 14033 }, { "epoch": 1.785269049739219, "ewc_loss": 0.030535854399204254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053585533052683e-05, "grad_norm": 17.9661922454834, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.8679916858673096, "num_tokens": 535269265.0, "step": 14034 }, { "epoch": 1.7853962600178095, "ewc_loss": 0.030521364882588387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.052136526093818e-05, "grad_norm": 17.997936248779297, "learning_rate": 1e-06, "loss": 0.3518, "mean_token_accuracy": 0.8847230076789856, "num_tokens": 535303825.0, "step": 14035 }, { "epoch": 1.7855234702964, "ewc_loss": 0.03048679232597351, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0486791729344986e-05, "grad_norm": 17.949220657348633, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8637773394584656, "num_tokens": 535338802.0, "step": 14036 }, { "epoch": 1.7856506805749905, "ewc_loss": 0.030491415411233902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.04914156004088e-05, "grad_norm": 17.935022354125977, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8761144876480103, "num_tokens": 535376592.0, "step": 14037 }, { "epoch": 1.785777890853581, "ewc_loss": 0.0305945947766304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059459413634613e-05, "grad_norm": 17.965600967407227, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8658814430236816, "num_tokens": 535416528.0, "step": 14038 }, { "epoch": 1.7859051011321716, "ewc_loss": 0.030499354004859924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0499353670165874e-05, "grad_norm": 17.967641830444336, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8663263916969299, "num_tokens": 535450892.0, "step": 14039 }, { "epoch": 1.7860323114107621, "ewc_loss": 0.03053806722164154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053806722164154e-05, "grad_norm": 17.9937744140625, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8710302114486694, "num_tokens": 535488198.0, "step": 14040 }, { "epoch": 1.7861595216893527, "ewc_loss": 0.030580323189496994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.058032234548591e-05, "grad_norm": 18.028432846069336, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.867645263671875, "num_tokens": 535526020.0, "step": 14041 }, { "epoch": 1.786286731967943, "ewc_loss": 0.03051064722239971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.051064777537249e-05, "grad_norm": 17.923336029052734, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8753399848937988, "num_tokens": 535560400.0, "step": 14042 }, { "epoch": 1.7864139422465335, "ewc_loss": 0.030524101108312607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0524101021001115e-05, "grad_norm": 17.996265411376953, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8634233474731445, "num_tokens": 535594699.0, "step": 14043 }, { "epoch": 1.786541152525124, "ewc_loss": 0.030561154708266258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0561153835151345e-05, "grad_norm": 17.89630126953125, "learning_rate": 1e-06, "loss": 0.3844, "mean_token_accuracy": 0.8745154142379761, "num_tokens": 535627019.0, "step": 14044 }, { "epoch": 1.7866683628037145, "ewc_loss": 0.030564183369278908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056418427149765e-05, "grad_norm": 17.99510383605957, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8789365291595459, "num_tokens": 535667479.0, "step": 14045 }, { "epoch": 1.786795573082305, "ewc_loss": 0.03064708597958088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064708653255366e-05, "grad_norm": 17.951723098754883, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8639634847640991, "num_tokens": 535705211.0, "step": 14046 }, { "epoch": 1.7869227833608956, "ewc_loss": 0.0305919386446476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0591938411816955e-05, "grad_norm": 17.9748592376709, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8504811525344849, "num_tokens": 535744907.0, "step": 14047 }, { "epoch": 1.787049993639486, "ewc_loss": 0.0306437686085701, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064376869588159e-05, "grad_norm": 18.0485897064209, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8678547143936157, "num_tokens": 535782526.0, "step": 14048 }, { "epoch": 1.7871772039180764, "ewc_loss": 0.03060038574039936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060038579860702e-05, "grad_norm": 17.940874099731445, "learning_rate": 1e-06, "loss": 0.3642, "mean_token_accuracy": 0.8848304748535156, "num_tokens": 535824153.0, "step": 14049 }, { "epoch": 1.787304414196667, "ewc_loss": 0.03057142347097397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0571423849323764e-05, "grad_norm": 18.021631240844727, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.881171703338623, "num_tokens": 535859019.0, "step": 14050 }, { "epoch": 1.7874316244752575, "ewc_loss": 0.030623771250247955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062377072637901e-05, "grad_norm": 17.9505672454834, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8808733820915222, "num_tokens": 535896866.0, "step": 14051 }, { "epoch": 1.787558834753848, "ewc_loss": 0.030540620908141136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.054062108276412e-05, "grad_norm": 18.028610229492188, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8709094524383545, "num_tokens": 535937128.0, "step": 14052 }, { "epoch": 1.7876860450324386, "ewc_loss": 0.030644718557596207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064471820835024e-05, "grad_norm": 18.030723571777344, "learning_rate": 1e-06, "loss": 0.3599, "mean_token_accuracy": 0.8855158686637878, "num_tokens": 535972727.0, "step": 14053 }, { "epoch": 1.787813255311029, "ewc_loss": 0.030571481212973595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057148205698468e-05, "grad_norm": 17.990575790405273, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8748096227645874, "num_tokens": 536011491.0, "step": 14054 }, { "epoch": 1.7879404655896196, "ewc_loss": 0.030554506927728653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.055450724787079e-05, "grad_norm": 18.053977966308594, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8729021549224854, "num_tokens": 536048074.0, "step": 14055 }, { "epoch": 1.7880676758682101, "ewc_loss": 0.030538445338606834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053844557143748e-05, "grad_norm": 17.969120025634766, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8762249946594238, "num_tokens": 536089554.0, "step": 14056 }, { "epoch": 1.7881948861468007, "ewc_loss": 0.030443653464317322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0443654395639896e-05, "grad_norm": 17.912141799926758, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8791015148162842, "num_tokens": 536127672.0, "step": 14057 }, { "epoch": 1.7883220964253912, "ewc_loss": 0.030578412115573883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057841240661219e-05, "grad_norm": 18.045700073242188, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8710306882858276, "num_tokens": 536157517.0, "step": 14058 }, { "epoch": 1.7884493067039817, "ewc_loss": 0.030524814501404762, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0524814064847305e-05, "grad_norm": 17.997777938842773, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8824481964111328, "num_tokens": 536192855.0, "step": 14059 }, { "epoch": 1.7885765169825723, "ewc_loss": 0.030436119064688683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0436118322541006e-05, "grad_norm": 17.8817138671875, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8714970946311951, "num_tokens": 536231727.0, "step": 14060 }, { "epoch": 1.7887037272611628, "ewc_loss": 0.030513659119606018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0513660021824762e-05, "grad_norm": 18.01555633544922, "learning_rate": 1e-06, "loss": 0.3738, "mean_token_accuracy": 0.8832899332046509, "num_tokens": 536270738.0, "step": 14061 }, { "epoch": 1.7888309375397533, "ewc_loss": 0.03054967150092125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.054967237403616e-05, "grad_norm": 18.00527000427246, "learning_rate": 1e-06, "loss": 0.3557, "mean_token_accuracy": 0.8884446620941162, "num_tokens": 536313006.0, "step": 14062 }, { "epoch": 1.7889581478183438, "ewc_loss": 0.03050893358886242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.050893428735435e-05, "grad_norm": 18.019620895385742, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8639159798622131, "num_tokens": 536350697.0, "step": 14063 }, { "epoch": 1.7890853580969344, "ewc_loss": 0.030470460653305054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.047046084247995e-05, "grad_norm": 17.94247055053711, "learning_rate": 1e-06, "loss": 0.379, "mean_token_accuracy": 0.8788163661956787, "num_tokens": 536394252.0, "step": 14064 }, { "epoch": 1.789212568375525, "ewc_loss": 0.030477669090032578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0477669497486204e-05, "grad_norm": 18.051515579223633, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8687292337417603, "num_tokens": 536426591.0, "step": 14065 }, { "epoch": 1.7893397786541152, "ewc_loss": 0.030501116067171097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.050111627089791e-05, "grad_norm": 18.019596099853516, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8583832383155823, "num_tokens": 536468567.0, "step": 14066 }, { "epoch": 1.7894669889327057, "ewc_loss": 0.030476709827780724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0476709071081132e-05, "grad_norm": 18.007802963256836, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8611151576042175, "num_tokens": 536506352.0, "step": 14067 }, { "epoch": 1.7895941992112963, "ewc_loss": 0.030482113361358643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0482113288599066e-05, "grad_norm": 17.996246337890625, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8633245229721069, "num_tokens": 536538122.0, "step": 14068 }, { "epoch": 1.7897214094898868, "ewc_loss": 0.030496669933199883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0496670660795644e-05, "grad_norm": 17.947978973388672, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8603023290634155, "num_tokens": 536574617.0, "step": 14069 }, { "epoch": 1.7898486197684773, "ewc_loss": 0.030495535582304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.049553561140783e-05, "grad_norm": 18.006168365478516, "learning_rate": 1e-06, "loss": 0.3692, "mean_token_accuracy": 0.8815034627914429, "num_tokens": 536611895.0, "step": 14070 }, { "epoch": 1.7899758300470678, "ewc_loss": 0.030487101525068283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.048710095754359e-05, "grad_norm": 17.973283767700195, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8626409769058228, "num_tokens": 536651336.0, "step": 14071 }, { "epoch": 1.7901030403256584, "ewc_loss": 0.030512714758515358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.051271414733492e-05, "grad_norm": 17.966848373413086, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.866034746170044, "num_tokens": 536692566.0, "step": 14072 }, { "epoch": 1.7902302506042487, "ewc_loss": 0.03047015890479088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0470158890238963e-05, "grad_norm": 17.98784828186035, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8759860396385193, "num_tokens": 536730855.0, "step": 14073 }, { "epoch": 1.7903574608828392, "ewc_loss": 0.030559614300727844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0559614970115945e-05, "grad_norm": 17.99397850036621, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8684342503547668, "num_tokens": 536766478.0, "step": 14074 }, { "epoch": 1.7904846711614297, "ewc_loss": 0.030462447553873062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0462448194157332e-05, "grad_norm": 17.939966201782227, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.86601322889328, "num_tokens": 536799900.0, "step": 14075 }, { "epoch": 1.7906118814400203, "ewc_loss": 0.030523493885993958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.052349347854033e-05, "grad_norm": 18.021421432495117, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8764976859092712, "num_tokens": 536843597.0, "step": 14076 }, { "epoch": 1.7907390917186108, "ewc_loss": 0.030506674200296402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0506673283525743e-05, "grad_norm": 17.926166534423828, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8818081021308899, "num_tokens": 536879189.0, "step": 14077 }, { "epoch": 1.7908663019972013, "ewc_loss": 0.03046858124434948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.046858182642609e-05, "grad_norm": 17.971050262451172, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8741788864135742, "num_tokens": 536918099.0, "step": 14078 }, { "epoch": 1.7909935122757918, "ewc_loss": 0.030578240752220154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0578241421608254e-05, "grad_norm": 18.016691207885742, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8612634539604187, "num_tokens": 536954410.0, "step": 14079 }, { "epoch": 1.7911207225543824, "ewc_loss": 0.030495572835206985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0495571991195902e-05, "grad_norm": 17.925565719604492, "learning_rate": 1e-06, "loss": 0.3376, "mean_token_accuracy": 0.8921799659729004, "num_tokens": 536986688.0, "step": 14080 }, { "epoch": 1.791247932832973, "ewc_loss": 0.030525054782629013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0525054171448573e-05, "grad_norm": 17.97136688232422, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8707340955734253, "num_tokens": 537024181.0, "step": 14081 }, { "epoch": 1.7913751431115634, "ewc_loss": 0.03057267889380455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057267895201221e-05, "grad_norm": 18.01018524169922, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8731546401977539, "num_tokens": 537064174.0, "step": 14082 }, { "epoch": 1.791502353390154, "ewc_loss": 0.030551861971616745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.055186243727803e-05, "grad_norm": 18.02281951904297, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8630971908569336, "num_tokens": 537100279.0, "step": 14083 }, { "epoch": 1.7916295636687445, "ewc_loss": 0.030488155782222748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0488155971397646e-05, "grad_norm": 17.984590530395508, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8450736403465271, "num_tokens": 537135339.0, "step": 14084 }, { "epoch": 1.791756773947335, "ewc_loss": 0.030519796535372734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0519797292072326e-05, "grad_norm": 17.9829044342041, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8652317523956299, "num_tokens": 537169608.0, "step": 14085 }, { "epoch": 1.7918839842259255, "ewc_loss": 0.0305422805249691, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.054228000110015e-05, "grad_norm": 17.98258399963379, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.8791950941085815, "num_tokens": 537205560.0, "step": 14086 }, { "epoch": 1.792011194504516, "ewc_loss": 0.030498052015900612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0498051273752935e-05, "grad_norm": 17.900436401367188, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8594421744346619, "num_tokens": 537238697.0, "step": 14087 }, { "epoch": 1.7921384047831066, "ewc_loss": 0.030567742884159088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056774221477099e-05, "grad_norm": 18.007701873779297, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8654582500457764, "num_tokens": 537282365.0, "step": 14088 }, { "epoch": 1.7922656150616971, "ewc_loss": 0.030600881204009056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0600880563724786e-05, "grad_norm": 17.922012329101562, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8789799213409424, "num_tokens": 537320805.0, "step": 14089 }, { "epoch": 1.7923928253402877, "ewc_loss": 0.030511323362588882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0511322620441206e-05, "grad_norm": 18.000823974609375, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8733630180358887, "num_tokens": 537354607.0, "step": 14090 }, { "epoch": 1.792520035618878, "ewc_loss": 0.030612526461482048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.061252573388629e-05, "grad_norm": 17.996170043945312, "learning_rate": 1e-06, "loss": 0.3716, "mean_token_accuracy": 0.8801471590995789, "num_tokens": 537386672.0, "step": 14091 }, { "epoch": 1.7926472458974685, "ewc_loss": 0.030592892318964005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059289156226441e-05, "grad_norm": 17.98415184020996, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8728615045547485, "num_tokens": 537430882.0, "step": 14092 }, { "epoch": 1.792774456176059, "ewc_loss": 0.030620452016592026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062045288970694e-05, "grad_norm": 17.993423461914062, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8735201358795166, "num_tokens": 537465114.0, "step": 14093 }, { "epoch": 1.7929016664546495, "ewc_loss": 0.030628295615315437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062829637201503e-05, "grad_norm": 17.970163345336914, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8646838665008545, "num_tokens": 537502820.0, "step": 14094 }, { "epoch": 1.79302887673324, "ewc_loss": 0.030624454841017723, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062445466639474e-05, "grad_norm": 18.01117706298828, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.878990113735199, "num_tokens": 537537422.0, "step": 14095 }, { "epoch": 1.7931560870118306, "ewc_loss": 0.030600305646657944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0600305763073266e-05, "grad_norm": 17.98098373413086, "learning_rate": 1e-06, "loss": 0.3914, "mean_token_accuracy": 0.8749328255653381, "num_tokens": 537566828.0, "step": 14096 }, { "epoch": 1.793283297290421, "ewc_loss": 0.030611470341682434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.061147072003223e-05, "grad_norm": 17.978683471679688, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8678211569786072, "num_tokens": 537606102.0, "step": 14097 }, { "epoch": 1.7934105075690114, "ewc_loss": 0.030609218403697014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060921881115064e-05, "grad_norm": 17.90838050842285, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.876759946346283, "num_tokens": 537638527.0, "step": 14098 }, { "epoch": 1.793537717847602, "ewc_loss": 0.0306592658162117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06592664855998e-05, "grad_norm": 18.025802612304688, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8696589469909668, "num_tokens": 537683375.0, "step": 14099 }, { "epoch": 1.7936649281261925, "ewc_loss": 0.0306659284979105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0665927624795586e-05, "grad_norm": 17.930015563964844, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8645491600036621, "num_tokens": 537731354.0, "step": 14100 }, { "epoch": 1.793792138404783, "ewc_loss": 0.030563442036509514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0563442123821005e-05, "grad_norm": 18.005727767944336, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.865484356880188, "num_tokens": 537770055.0, "step": 14101 }, { "epoch": 1.7939193486833735, "ewc_loss": 0.03061911091208458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.061911047552712e-05, "grad_norm": 18.009519577026367, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8504694700241089, "num_tokens": 537807977.0, "step": 14102 }, { "epoch": 1.794046558961964, "ewc_loss": 0.03059781715273857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0597817385569215e-05, "grad_norm": 18.047685623168945, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8653781414031982, "num_tokens": 537847809.0, "step": 14103 }, { "epoch": 1.7941737692405546, "ewc_loss": 0.030603045597672462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0603045161115006e-05, "grad_norm": 17.97364616394043, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8512255549430847, "num_tokens": 537885506.0, "step": 14104 }, { "epoch": 1.7943009795191451, "ewc_loss": 0.03057599626481533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057599678868428e-05, "grad_norm": 18.055091857910156, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8661041259765625, "num_tokens": 537918936.0, "step": 14105 }, { "epoch": 1.7944281897977357, "ewc_loss": 0.030592434108257294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059243317693472e-05, "grad_norm": 17.988140106201172, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8742068409919739, "num_tokens": 537954205.0, "step": 14106 }, { "epoch": 1.7945554000763262, "ewc_loss": 0.03053644858300686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0536448321072385e-05, "grad_norm": 18.0273494720459, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8715423941612244, "num_tokens": 537989190.0, "step": 14107 }, { "epoch": 1.7946826103549167, "ewc_loss": 0.03064747154712677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064747215830721e-05, "grad_norm": 18.051189422607422, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8612880110740662, "num_tokens": 538028852.0, "step": 14108 }, { "epoch": 1.7948098206335072, "ewc_loss": 0.03055424988269806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0554248951375484e-05, "grad_norm": 18.009193420410156, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.875947892665863, "num_tokens": 538061245.0, "step": 14109 }, { "epoch": 1.7949370309120978, "ewc_loss": 0.030532479286193848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053247928619385e-05, "grad_norm": 17.967491149902344, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.88138747215271, "num_tokens": 538103238.0, "step": 14110 }, { "epoch": 1.7950642411906883, "ewc_loss": 0.0305837020277977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.05837020277977e-05, "grad_norm": 18.111549377441406, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.856460690498352, "num_tokens": 538142692.0, "step": 14111 }, { "epoch": 1.7951914514692788, "ewc_loss": 0.030586939305067062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.058693982893601e-05, "grad_norm": 17.977935791015625, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8680530190467834, "num_tokens": 538183155.0, "step": 14112 }, { "epoch": 1.7953186617478694, "ewc_loss": 0.030486367642879486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.048636790481396e-05, "grad_norm": 18.02882957458496, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8687155842781067, "num_tokens": 538222063.0, "step": 14113 }, { "epoch": 1.7954458720264599, "ewc_loss": 0.03057014010846615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057013964280486e-05, "grad_norm": 18.01844596862793, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8721029758453369, "num_tokens": 538259847.0, "step": 14114 }, { "epoch": 1.7955730823050502, "ewc_loss": 0.03049241192638874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0492412406601943e-05, "grad_norm": 17.942655563354492, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8759166598320007, "num_tokens": 538298557.0, "step": 14115 }, { "epoch": 1.7957002925836407, "ewc_loss": 0.030519181862473488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.051918247365393e-05, "grad_norm": 18.06377410888672, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.866807222366333, "num_tokens": 538337926.0, "step": 14116 }, { "epoch": 1.7958275028622313, "ewc_loss": 0.030566086992621422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056608693441376e-05, "grad_norm": 17.937150955200195, "learning_rate": 1e-06, "loss": 0.3873, "mean_token_accuracy": 0.873638927936554, "num_tokens": 538382755.0, "step": 14117 }, { "epoch": 1.7959547131408218, "ewc_loss": 0.0305231474339962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.052314787055366e-05, "grad_norm": 18.08565902709961, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8525583744049072, "num_tokens": 538417639.0, "step": 14118 }, { "epoch": 1.7960819234194123, "ewc_loss": 0.030653826892375946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06538277072832e-05, "grad_norm": 18.041467666625977, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8638838529586792, "num_tokens": 538458444.0, "step": 14119 }, { "epoch": 1.7962091336980028, "ewc_loss": 0.030459171161055565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0459170375252143e-05, "grad_norm": 17.999420166015625, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8712543249130249, "num_tokens": 538496580.0, "step": 14120 }, { "epoch": 1.7963363439765934, "ewc_loss": 0.030564552173018456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056455170735717e-05, "grad_norm": 17.983747482299805, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.878792405128479, "num_tokens": 538538474.0, "step": 14121 }, { "epoch": 1.7964635542551837, "ewc_loss": 0.03054460883140564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.054460830753669e-05, "grad_norm": 18.053264617919922, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8722953796386719, "num_tokens": 538576828.0, "step": 14122 }, { "epoch": 1.7965907645337742, "ewc_loss": 0.030568910762667656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0568910005968064e-05, "grad_norm": 18.014869689941406, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8775434494018555, "num_tokens": 538611630.0, "step": 14123 }, { "epoch": 1.7967179748123647, "ewc_loss": 0.030515212565660477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.051521343877539e-05, "grad_norm": 18.00884246826172, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8461899757385254, "num_tokens": 538659537.0, "step": 14124 }, { "epoch": 1.7968451850909553, "ewc_loss": 0.030573777854442596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057377762161195e-05, "grad_norm": 18.064016342163086, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8774000406265259, "num_tokens": 538700216.0, "step": 14125 }, { "epoch": 1.7969723953695458, "ewc_loss": 0.030539827421307564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053982800338417e-05, "grad_norm": 18.020261764526367, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8665916323661804, "num_tokens": 538737719.0, "step": 14126 }, { "epoch": 1.7970996056481363, "ewc_loss": 0.030447328463196754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.044732875423506e-05, "grad_norm": 17.954612731933594, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8784873485565186, "num_tokens": 538774723.0, "step": 14127 }, { "epoch": 1.7972268159267268, "ewc_loss": 0.030509231612086296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0509230782627128e-05, "grad_norm": 18.079740524291992, "learning_rate": 1e-06, "loss": 0.367, "mean_token_accuracy": 0.879896879196167, "num_tokens": 538815288.0, "step": 14128 }, { "epoch": 1.7973540262053174, "ewc_loss": 0.03049672767519951, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0496727049467154e-05, "grad_norm": 17.949289321899414, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.868860125541687, "num_tokens": 538858239.0, "step": 14129 }, { "epoch": 1.797481236483908, "ewc_loss": 0.030441196635365486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0441196940955706e-05, "grad_norm": 17.99970245361328, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8734755516052246, "num_tokens": 538896947.0, "step": 14130 }, { "epoch": 1.7976084467624984, "ewc_loss": 0.03046623431146145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.046623351110611e-05, "grad_norm": 18.025209426879883, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8680856227874756, "num_tokens": 538936511.0, "step": 14131 }, { "epoch": 1.797735657041089, "ewc_loss": 0.030453935265541077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0453935323748738e-05, "grad_norm": 18.02425765991211, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8646615147590637, "num_tokens": 538969198.0, "step": 14132 }, { "epoch": 1.7978628673196795, "ewc_loss": 0.030485061928629875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0485061870422214e-05, "grad_norm": 18.0659122467041, "learning_rate": 1e-06, "loss": 0.372, "mean_token_accuracy": 0.8816394805908203, "num_tokens": 539002258.0, "step": 14133 }, { "epoch": 1.79799007759827, "ewc_loss": 0.03051971271634102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.051971361855976e-05, "grad_norm": 18.0351619720459, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8565600514411926, "num_tokens": 539039993.0, "step": 14134 }, { "epoch": 1.7981172878768605, "ewc_loss": 0.030461207032203674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0461207643384114e-05, "grad_norm": 17.993013381958008, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8662445545196533, "num_tokens": 539080713.0, "step": 14135 }, { "epoch": 1.798244498155451, "ewc_loss": 0.030474061146378517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.047406062250957e-05, "grad_norm": 17.9903507232666, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8806971311569214, "num_tokens": 539121115.0, "step": 14136 }, { "epoch": 1.7983717084340416, "ewc_loss": 0.030495639890432358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0495639293803833e-05, "grad_norm": 17.902612686157227, "learning_rate": 1e-06, "loss": 0.3543, "mean_token_accuracy": 0.8887730240821838, "num_tokens": 539159889.0, "step": 14137 }, { "epoch": 1.7984989187126321, "ewc_loss": 0.030479321256279945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0479321139864624e-05, "grad_norm": 18.038925170898438, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8692582845687866, "num_tokens": 539201508.0, "step": 14138 }, { "epoch": 1.7986261289912227, "ewc_loss": 0.030534757301211357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053475666092709e-05, "grad_norm": 17.993606567382812, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8517624735832214, "num_tokens": 539238640.0, "step": 14139 }, { "epoch": 1.798753339269813, "ewc_loss": 0.030499083921313286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.049908445973415e-05, "grad_norm": 18.053571701049805, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8665003776550293, "num_tokens": 539274797.0, "step": 14140 }, { "epoch": 1.7988805495484035, "ewc_loss": 0.03052264265716076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.052264219149947e-05, "grad_norm": 18.019062042236328, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8592110276222229, "num_tokens": 539308410.0, "step": 14141 }, { "epoch": 1.799007759826994, "ewc_loss": 0.030497530475258827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.049753104278352e-05, "grad_norm": 18.05244255065918, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.862308919429779, "num_tokens": 539345578.0, "step": 14142 }, { "epoch": 1.7991349701055845, "ewc_loss": 0.03050152026116848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0501520086545497e-05, "grad_norm": 17.982738494873047, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8537647724151611, "num_tokens": 539378946.0, "step": 14143 }, { "epoch": 1.799262180384175, "ewc_loss": 0.030501816421747208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0501816581818275e-05, "grad_norm": 18.024433135986328, "learning_rate": 1e-06, "loss": 0.3798, "mean_token_accuracy": 0.8812885284423828, "num_tokens": 539417663.0, "step": 14144 }, { "epoch": 1.7993893906627656, "ewc_loss": 0.030581563711166382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.058156289625913e-05, "grad_norm": 17.98080062866211, "learning_rate": 1e-06, "loss": 0.3576, "mean_token_accuracy": 0.8852794766426086, "num_tokens": 539459597.0, "step": 14145 }, { "epoch": 1.799516600941356, "ewc_loss": 0.03047008439898491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0470084311673418e-05, "grad_norm": 17.972829818725586, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8725593090057373, "num_tokens": 539499579.0, "step": 14146 }, { "epoch": 1.7996438112199464, "ewc_loss": 0.030591890215873718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059189111809246e-05, "grad_norm": 18.052776336669922, "learning_rate": 1e-06, "loss": 0.3852, "mean_token_accuracy": 0.8792035579681396, "num_tokens": 539538505.0, "step": 14147 }, { "epoch": 1.799771021498537, "ewc_loss": 0.03051202930510044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.051202838832978e-05, "grad_norm": 18.02480125427246, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8644382953643799, "num_tokens": 539574824.0, "step": 14148 }, { "epoch": 1.7998982317771275, "ewc_loss": 0.03054320439696312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0543204047717154e-05, "grad_norm": 17.99681854248047, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8564916849136353, "num_tokens": 539614181.0, "step": 14149 }, { "epoch": 1.800025442055718, "ewc_loss": 0.030569560825824738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0569561204174533e-05, "grad_norm": 18.00023651123047, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8577972054481506, "num_tokens": 539656713.0, "step": 14150 }, { "epoch": 1.8001526523343085, "ewc_loss": 0.030546335503458977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.054633634747006e-05, "grad_norm": 18.030841827392578, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8683144450187683, "num_tokens": 539695436.0, "step": 14151 }, { "epoch": 1.800279862612899, "ewc_loss": 0.030530322343111038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053032196476124e-05, "grad_norm": 17.992645263671875, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8716274499893188, "num_tokens": 539736549.0, "step": 14152 }, { "epoch": 1.8004070728914896, "ewc_loss": 0.030536044389009476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.05360445054248e-05, "grad_norm": 18.041959762573242, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8747092485427856, "num_tokens": 539776326.0, "step": 14153 }, { "epoch": 1.8005342831700801, "ewc_loss": 0.030538223683834076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0538223654730245e-05, "grad_norm": 17.957763671875, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.8761488795280457, "num_tokens": 539819665.0, "step": 14154 }, { "epoch": 1.8006614934486707, "ewc_loss": 0.030519234016537666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.051923340535723e-05, "grad_norm": 18.1021728515625, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8671966791152954, "num_tokens": 539858175.0, "step": 14155 }, { "epoch": 1.8007887037272612, "ewc_loss": 0.030536865815520287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.05368666886352e-05, "grad_norm": 17.989601135253906, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.868087887763977, "num_tokens": 539895152.0, "step": 14156 }, { "epoch": 1.8009159140058517, "ewc_loss": 0.030517196282744408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0517196137225255e-05, "grad_norm": 18.06715202331543, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8758198022842407, "num_tokens": 539941033.0, "step": 14157 }, { "epoch": 1.8010431242844422, "ewc_loss": 0.030545631423592567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0545630579581484e-05, "grad_norm": 18.007104873657227, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8719519376754761, "num_tokens": 539977959.0, "step": 14158 }, { "epoch": 1.8011703345630328, "ewc_loss": 0.030450204387307167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0450204576482065e-05, "grad_norm": 17.9999942779541, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.86799156665802, "num_tokens": 540014857.0, "step": 14159 }, { "epoch": 1.8012975448416233, "ewc_loss": 0.03053085319697857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053085310966708e-05, "grad_norm": 18.004186630249023, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8690633773803711, "num_tokens": 540055775.0, "step": 14160 }, { "epoch": 1.8014247551202138, "ewc_loss": 0.030495062470436096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.049506267416291e-05, "grad_norm": 18.036758422851562, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8797479867935181, "num_tokens": 540090287.0, "step": 14161 }, { "epoch": 1.8015519653988044, "ewc_loss": 0.030494200065732002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.049420047318563e-05, "grad_norm": 17.992042541503906, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8774531483650208, "num_tokens": 540129139.0, "step": 14162 }, { "epoch": 1.8016791756773949, "ewc_loss": 0.030521700158715248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0521699954988435e-05, "grad_norm": 18.093154907226562, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.869167685508728, "num_tokens": 540165187.0, "step": 14163 }, { "epoch": 1.8018063859559852, "ewc_loss": 0.030537186190485954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0537186830770224e-05, "grad_norm": 18.014856338500977, "learning_rate": 1e-06, "loss": 0.3652, "mean_token_accuracy": 0.8822740912437439, "num_tokens": 540201113.0, "step": 14164 }, { "epoch": 1.8019335962345757, "ewc_loss": 0.030472759157419205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.047275822609663e-05, "grad_norm": 17.9886531829834, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8723083138465881, "num_tokens": 540237448.0, "step": 14165 }, { "epoch": 1.8020608065131662, "ewc_loss": 0.03050588257610798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.05058820231352e-05, "grad_norm": 18.003847122192383, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8639302253723145, "num_tokens": 540279703.0, "step": 14166 }, { "epoch": 1.8021880167917568, "ewc_loss": 0.03048052079975605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0480521672870964e-05, "grad_norm": 18.066810607910156, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8521385788917542, "num_tokens": 540313071.0, "step": 14167 }, { "epoch": 1.8023152270703473, "ewc_loss": 0.03056594915688038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056594869121909e-05, "grad_norm": 17.99530029296875, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8738417625427246, "num_tokens": 540353940.0, "step": 14168 }, { "epoch": 1.8024424373489378, "ewc_loss": 0.030524414032697678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0524413887178525e-05, "grad_norm": 18.060367584228516, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8671583533287048, "num_tokens": 540391241.0, "step": 14169 }, { "epoch": 1.8025696476275284, "ewc_loss": 0.03054455667734146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.054455737583339e-05, "grad_norm": 18.01019859313965, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.8816605806350708, "num_tokens": 540420205.0, "step": 14170 }, { "epoch": 1.8026968579061187, "ewc_loss": 0.03051232360303402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0512323064613156e-05, "grad_norm": 18.09830093383789, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8738581538200378, "num_tokens": 540452121.0, "step": 14171 }, { "epoch": 1.8028240681847092, "ewc_loss": 0.030539533123373985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.05395333271008e-05, "grad_norm": 17.921890258789062, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8697692155838013, "num_tokens": 540487415.0, "step": 14172 }, { "epoch": 1.8029512784632997, "ewc_loss": 0.030476301908493042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0476301617454737e-05, "grad_norm": 18.02163314819336, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.859248161315918, "num_tokens": 540522293.0, "step": 14173 }, { "epoch": 1.8030784887418903, "ewc_loss": 0.030644692480564117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064469274249859e-05, "grad_norm": 17.99808692932129, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8811735510826111, "num_tokens": 540560345.0, "step": 14174 }, { "epoch": 1.8032056990204808, "ewc_loss": 0.030559448525309563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.055944762309082e-05, "grad_norm": 17.96862030029297, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.878251850605011, "num_tokens": 540598299.0, "step": 14175 }, { "epoch": 1.8033329092990713, "ewc_loss": 0.03060414083302021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060414019273594e-05, "grad_norm": 18.054079055786133, "learning_rate": 1e-06, "loss": 0.3682, "mean_token_accuracy": 0.8819609880447388, "num_tokens": 540634480.0, "step": 14176 }, { "epoch": 1.8034601195776618, "ewc_loss": 0.030595477670431137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0595478165196255e-05, "grad_norm": 17.972070693969727, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8619016408920288, "num_tokens": 540679473.0, "step": 14177 }, { "epoch": 1.8035873298562524, "ewc_loss": 0.03062095120549202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062095129280351e-05, "grad_norm": 18.0887393951416, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8637389540672302, "num_tokens": 540722259.0, "step": 14178 }, { "epoch": 1.803714540134843, "ewc_loss": 0.03058004565536976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.058004585909657e-05, "grad_norm": 18.037891387939453, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.8735501170158386, "num_tokens": 540758058.0, "step": 14179 }, { "epoch": 1.8038417504134334, "ewc_loss": 0.030583815649151802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.058381480514072e-05, "grad_norm": 18.008230209350586, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8657673597335815, "num_tokens": 540798535.0, "step": 14180 }, { "epoch": 1.803968960692024, "ewc_loss": 0.030594900250434875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059489972656593e-05, "grad_norm": 18.027080535888672, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8704426288604736, "num_tokens": 540839423.0, "step": 14181 }, { "epoch": 1.8040961709706145, "ewc_loss": 0.030604522675275803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0604522180510685e-05, "grad_norm": 18.103174209594727, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8755055665969849, "num_tokens": 540887512.0, "step": 14182 }, { "epoch": 1.804223381249205, "ewc_loss": 0.0305569376796484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0556937417713925e-05, "grad_norm": 18.06995391845703, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.871930718421936, "num_tokens": 540922292.0, "step": 14183 }, { "epoch": 1.8043505915277955, "ewc_loss": 0.030497178435325623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0497178158839233e-05, "grad_norm": 18.067575454711914, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.8771114945411682, "num_tokens": 540964105.0, "step": 14184 }, { "epoch": 1.804477801806386, "ewc_loss": 0.03054649569094181, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.054649641853757e-05, "grad_norm": 18.019718170166016, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8696705102920532, "num_tokens": 541001793.0, "step": 14185 }, { "epoch": 1.8046050120849766, "ewc_loss": 0.030465982854366302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0465982490568422e-05, "grad_norm": 18.035062789916992, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8772790431976318, "num_tokens": 541041104.0, "step": 14186 }, { "epoch": 1.8047322223635671, "ewc_loss": 0.030515514314174652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0515513572026975e-05, "grad_norm": 18.100805282592773, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8633164763450623, "num_tokens": 541079643.0, "step": 14187 }, { "epoch": 1.8048594326421576, "ewc_loss": 0.030541978776454926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0541978048859164e-05, "grad_norm": 18.080608367919922, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8748400807380676, "num_tokens": 541118554.0, "step": 14188 }, { "epoch": 1.804986642920748, "ewc_loss": 0.03049723617732525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0497236366500147e-05, "grad_norm": 18.109580993652344, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8593137860298157, "num_tokens": 541156893.0, "step": 14189 }, { "epoch": 1.8051138531993385, "ewc_loss": 0.030462533235549927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.04625336866593e-05, "grad_norm": 18.002365112304688, "learning_rate": 1e-06, "loss": 0.3512, "mean_token_accuracy": 0.8857896327972412, "num_tokens": 541197283.0, "step": 14190 }, { "epoch": 1.805241063477929, "ewc_loss": 0.030377764254808426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0377765142475255e-05, "grad_norm": 17.980472564697266, "learning_rate": 1e-06, "loss": 0.3603, "mean_token_accuracy": 0.8842545747756958, "num_tokens": 541232343.0, "step": 14191 }, { "epoch": 1.8053682737565195, "ewc_loss": 0.030497727915644646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0497727493639104e-05, "grad_norm": 18.114255905151367, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8617429733276367, "num_tokens": 541270274.0, "step": 14192 }, { "epoch": 1.80549548403511, "ewc_loss": 0.03047630377113819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.047630343644414e-05, "grad_norm": 17.997690200805664, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8653440475463867, "num_tokens": 541306018.0, "step": 14193 }, { "epoch": 1.8056226943137006, "ewc_loss": 0.03043528087437153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0435281587415375e-05, "grad_norm": 18.01628875732422, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8709635138511658, "num_tokens": 541344974.0, "step": 14194 }, { "epoch": 1.805749904592291, "ewc_loss": 0.030496643856167793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.049664337595459e-05, "grad_norm": 18.005064010620117, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.8797551989555359, "num_tokens": 541378590.0, "step": 14195 }, { "epoch": 1.8058771148708814, "ewc_loss": 0.03043779730796814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.043779724976048e-05, "grad_norm": 18.015560150146484, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8724812269210815, "num_tokens": 541415945.0, "step": 14196 }, { "epoch": 1.806004325149472, "ewc_loss": 0.030531009659171104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053100954275578e-05, "grad_norm": 17.990215301513672, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8674308657646179, "num_tokens": 541459577.0, "step": 14197 }, { "epoch": 1.8061315354280625, "ewc_loss": 0.03049454092979431, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0494540624204092e-05, "grad_norm": 17.97340202331543, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8707143068313599, "num_tokens": 541501041.0, "step": 14198 }, { "epoch": 1.806258745706653, "ewc_loss": 0.030461322516202927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0461322239716537e-05, "grad_norm": 17.974599838256836, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.862916111946106, "num_tokens": 541538587.0, "step": 14199 }, { "epoch": 1.8063859559852435, "ewc_loss": 0.0305239986628294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.052399915759452e-05, "grad_norm": 18.001577377319336, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.857171893119812, "num_tokens": 541582225.0, "step": 14200 }, { "epoch": 1.806513166263834, "ewc_loss": 0.030542781576514244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.054278204217553e-05, "grad_norm": 18.01939582824707, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8805181384086609, "num_tokens": 541616956.0, "step": 14201 }, { "epoch": 1.8066403765424246, "ewc_loss": 0.030530165880918503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053016553167254e-05, "grad_norm": 18.026405334472656, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8588310480117798, "num_tokens": 541650814.0, "step": 14202 }, { "epoch": 1.8067675868210151, "ewc_loss": 0.03053312376141548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.05331232084427e-05, "grad_norm": 18.02304458618164, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8790752291679382, "num_tokens": 541692167.0, "step": 14203 }, { "epoch": 1.8068947970996057, "ewc_loss": 0.030512725934386253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.051272506127134e-05, "grad_norm": 18.00384521484375, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8612077236175537, "num_tokens": 541732427.0, "step": 14204 }, { "epoch": 1.8070220073781962, "ewc_loss": 0.030483104288578033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0483104637824e-05, "grad_norm": 17.983442306518555, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8703945875167847, "num_tokens": 541773256.0, "step": 14205 }, { "epoch": 1.8071492176567867, "ewc_loss": 0.030523188412189484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0523187888320535e-05, "grad_norm": 18.03534698486328, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8761116862297058, "num_tokens": 541815008.0, "step": 14206 }, { "epoch": 1.8072764279353772, "ewc_loss": 0.03053264319896698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053264299524017e-05, "grad_norm": 17.969980239868164, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8731237649917603, "num_tokens": 541853462.0, "step": 14207 }, { "epoch": 1.8074036382139678, "ewc_loss": 0.030514033511281013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.051403291465249e-05, "grad_norm": 18.03158187866211, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8651657700538635, "num_tokens": 541890487.0, "step": 14208 }, { "epoch": 1.8075308484925583, "ewc_loss": 0.030584989115595818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.058498987229541e-05, "grad_norm": 17.974918365478516, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.874541163444519, "num_tokens": 541929857.0, "step": 14209 }, { "epoch": 1.8076580587711488, "ewc_loss": 0.0305128525942564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0512852390529588e-05, "grad_norm": 18.055774688720703, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8640595078468323, "num_tokens": 541969923.0, "step": 14210 }, { "epoch": 1.8077852690497394, "ewc_loss": 0.030632469803094864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063246913370676e-05, "grad_norm": 18.0507755279541, "learning_rate": 1e-06, "loss": 0.3743, "mean_token_accuracy": 0.8789178133010864, "num_tokens": 542008802.0, "step": 14211 }, { "epoch": 1.8079124793283299, "ewc_loss": 0.030512921512126923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0512921512126923e-05, "grad_norm": 17.933027267456055, "learning_rate": 1e-06, "loss": 0.3682, "mean_token_accuracy": 0.8836853504180908, "num_tokens": 542049276.0, "step": 14212 }, { "epoch": 1.8080396896069202, "ewc_loss": 0.030520088970661163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.052008833037689e-05, "grad_norm": 18.033580780029297, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8642949461936951, "num_tokens": 542086921.0, "step": 14213 }, { "epoch": 1.8081668998855107, "ewc_loss": 0.03059423714876175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0594237614423037e-05, "grad_norm": 18.024322509765625, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8707917928695679, "num_tokens": 542124291.0, "step": 14214 }, { "epoch": 1.8082941101641012, "ewc_loss": 0.030498629435896873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0498629712383263e-05, "grad_norm": 17.892534255981445, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8595309853553772, "num_tokens": 542167600.0, "step": 14215 }, { "epoch": 1.8084213204426918, "ewc_loss": 0.030570464208722115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057046342291869e-05, "grad_norm": 18.04850196838379, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8660506010055542, "num_tokens": 542201134.0, "step": 14216 }, { "epoch": 1.8085485307212823, "ewc_loss": 0.030590424314141273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0590425012633204e-05, "grad_norm": 17.994823455810547, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8572204113006592, "num_tokens": 542242590.0, "step": 14217 }, { "epoch": 1.8086757409998728, "ewc_loss": 0.030495965853333473, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.049596671189647e-05, "grad_norm": 17.933475494384766, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.870269775390625, "num_tokens": 542281945.0, "step": 14218 }, { "epoch": 1.8088029512784631, "ewc_loss": 0.030627794563770294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062779433093965e-05, "grad_norm": 18.02655792236328, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8778690099716187, "num_tokens": 542319389.0, "step": 14219 }, { "epoch": 1.8089301615570537, "ewc_loss": 0.030596492812037468, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0596493161283433e-05, "grad_norm": 18.01921844482422, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8580328822135925, "num_tokens": 542356912.0, "step": 14220 }, { "epoch": 1.8090573718356442, "ewc_loss": 0.030599724501371384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059972368646413e-05, "grad_norm": 18.080368041992188, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8644046187400818, "num_tokens": 542394054.0, "step": 14221 }, { "epoch": 1.8091845821142347, "ewc_loss": 0.03053146041929722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053146065212786e-05, "grad_norm": 17.929845809936523, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8745986819267273, "num_tokens": 542431063.0, "step": 14222 }, { "epoch": 1.8093117923928252, "ewc_loss": 0.030569102615118027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056910281884484e-05, "grad_norm": 18.05133628845215, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8591625690460205, "num_tokens": 542471398.0, "step": 14223 }, { "epoch": 1.8094390026714158, "ewc_loss": 0.030574258416891098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057425783481449e-05, "grad_norm": 18.054058074951172, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8560174107551575, "num_tokens": 542516288.0, "step": 14224 }, { "epoch": 1.8095662129500063, "ewc_loss": 0.030560331419110298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056033165194094e-05, "grad_norm": 18.038572311401367, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8686370849609375, "num_tokens": 542552292.0, "step": 14225 }, { "epoch": 1.8096934232285968, "ewc_loss": 0.030557727441191673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0557726859115064e-05, "grad_norm": 17.982351303100586, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8726266622543335, "num_tokens": 542592380.0, "step": 14226 }, { "epoch": 1.8098206335071874, "ewc_loss": 0.030561521649360657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056152127101086e-05, "grad_norm": 18.027467727661133, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8629496693611145, "num_tokens": 542632698.0, "step": 14227 }, { "epoch": 1.8099478437857779, "ewc_loss": 0.030592160299420357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059216032852419e-05, "grad_norm": 17.994943618774414, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8590429425239563, "num_tokens": 542668234.0, "step": 14228 }, { "epoch": 1.8100750540643684, "ewc_loss": 0.03056509420275688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0565093766199425e-05, "grad_norm": 18.0328369140625, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8688820600509644, "num_tokens": 542706205.0, "step": 14229 }, { "epoch": 1.810202264342959, "ewc_loss": 0.030603351071476936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06033507513348e-05, "grad_norm": 18.025175094604492, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8685206174850464, "num_tokens": 542737068.0, "step": 14230 }, { "epoch": 1.8103294746215495, "ewc_loss": 0.030543223023414612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.054322223761119e-05, "grad_norm": 18.02498435974121, "learning_rate": 1e-06, "loss": 0.3692, "mean_token_accuracy": 0.8817409873008728, "num_tokens": 542769776.0, "step": 14231 }, { "epoch": 1.81045668490014, "ewc_loss": 0.030613195151090622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.061319512198679e-05, "grad_norm": 18.01318359375, "learning_rate": 1e-06, "loss": 0.3721, "mean_token_accuracy": 0.8823525905609131, "num_tokens": 542802440.0, "step": 14232 }, { "epoch": 1.8105838951787305, "ewc_loss": 0.030557040125131607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0557039281120524e-05, "grad_norm": 17.97224998474121, "learning_rate": 1e-06, "loss": 0.367, "mean_token_accuracy": 0.8821670413017273, "num_tokens": 542841158.0, "step": 14233 }, { "epoch": 1.810711105457321, "ewc_loss": 0.030636997893452644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063699841732159e-05, "grad_norm": 18.041704177856445, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8658033609390259, "num_tokens": 542876637.0, "step": 14234 }, { "epoch": 1.8108383157359116, "ewc_loss": 0.030588503926992416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.058850415982306e-05, "grad_norm": 17.94571304321289, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.877292811870575, "num_tokens": 542915087.0, "step": 14235 }, { "epoch": 1.8109655260145021, "ewc_loss": 0.030666084960103035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.066608405788429e-05, "grad_norm": 18.05432891845703, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8621050119400024, "num_tokens": 542950635.0, "step": 14236 }, { "epoch": 1.8110927362930926, "ewc_loss": 0.030647551640868187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0647552193840966e-05, "grad_norm": 17.997676849365234, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.865109384059906, "num_tokens": 542990078.0, "step": 14237 }, { "epoch": 1.811219946571683, "ewc_loss": 0.030644234269857407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06442343571689e-05, "grad_norm": 18.04119300842285, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8450917601585388, "num_tokens": 543030333.0, "step": 14238 }, { "epoch": 1.8113471568502735, "ewc_loss": 0.030694929882884026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069492959184572e-05, "grad_norm": 17.98893165588379, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8628384470939636, "num_tokens": 543070537.0, "step": 14239 }, { "epoch": 1.811474367128864, "ewc_loss": 0.030673345550894737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0673345463583246e-05, "grad_norm": 18.053077697753906, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8677111864089966, "num_tokens": 543109137.0, "step": 14240 }, { "epoch": 1.8116015774074545, "ewc_loss": 0.03069772757589817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0697727197548375e-05, "grad_norm": 18.05508804321289, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8635089993476868, "num_tokens": 543146616.0, "step": 14241 }, { "epoch": 1.811728787686045, "ewc_loss": 0.030620066449046135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062006726395339e-05, "grad_norm": 17.962173461914062, "learning_rate": 1e-06, "loss": 0.36, "mean_token_accuracy": 0.8834151029586792, "num_tokens": 543178433.0, "step": 14242 }, { "epoch": 1.8118559979646356, "ewc_loss": 0.030639952048659325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063995245611295e-05, "grad_norm": 18.042036056518555, "learning_rate": 1e-06, "loss": 0.3643, "mean_token_accuracy": 0.881693959236145, "num_tokens": 543215086.0, "step": 14243 }, { "epoch": 1.811983208243226, "ewc_loss": 0.0306955948472023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069559534196742e-05, "grad_norm": 18.021772384643555, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8544080853462219, "num_tokens": 543252111.0, "step": 14244 }, { "epoch": 1.8121104185218164, "ewc_loss": 0.03062502294778824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062502219108865e-05, "grad_norm": 18.012617111206055, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8771952986717224, "num_tokens": 543291938.0, "step": 14245 }, { "epoch": 1.812237628800407, "ewc_loss": 0.03063884563744068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0638846510555595e-05, "grad_norm": 18.060606002807617, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8736779689788818, "num_tokens": 543335141.0, "step": 14246 }, { "epoch": 1.8123648390789975, "ewc_loss": 0.030606770887970924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060677045141347e-05, "grad_norm": 18.085411071777344, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8712215423583984, "num_tokens": 543371305.0, "step": 14247 }, { "epoch": 1.812492049357588, "ewc_loss": 0.030653664842247963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065366399823688e-05, "grad_norm": 18.052814483642578, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8797622919082642, "num_tokens": 543407431.0, "step": 14248 }, { "epoch": 1.8126192596361785, "ewc_loss": 0.03058464825153351, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.058464790228754e-05, "grad_norm": 18.139028549194336, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8703402280807495, "num_tokens": 543450287.0, "step": 14249 }, { "epoch": 1.812746469914769, "ewc_loss": 0.030626509338617325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062651012442075e-05, "grad_norm": 18.0528564453125, "learning_rate": 1e-06, "loss": 0.3737, "mean_token_accuracy": 0.8817883133888245, "num_tokens": 543486521.0, "step": 14250 }, { "epoch": 1.8128736801933596, "ewc_loss": 0.030546093359589577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0546092602889985e-05, "grad_norm": 18.02633285522461, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8692306876182556, "num_tokens": 543522157.0, "step": 14251 }, { "epoch": 1.8130008904719501, "ewc_loss": 0.030585942789912224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.058594302274287e-05, "grad_norm": 17.998287200927734, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8732772469520569, "num_tokens": 543560503.0, "step": 14252 }, { "epoch": 1.8131281007505406, "ewc_loss": 0.030605806037783623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060580638702959e-05, "grad_norm": 18.06930923461914, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8590320944786072, "num_tokens": 543598943.0, "step": 14253 }, { "epoch": 1.8132553110291312, "ewc_loss": 0.030565643683075905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0565643100999296e-05, "grad_norm": 18.020845413208008, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8728146553039551, "num_tokens": 543640466.0, "step": 14254 }, { "epoch": 1.8133825213077217, "ewc_loss": 0.030562765896320343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0562765459762886e-05, "grad_norm": 17.96915054321289, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8720368146896362, "num_tokens": 543677581.0, "step": 14255 }, { "epoch": 1.8135097315863122, "ewc_loss": 0.030588163062930107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0588162189815193e-05, "grad_norm": 18.062358856201172, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8627526760101318, "num_tokens": 543711220.0, "step": 14256 }, { "epoch": 1.8136369418649028, "ewc_loss": 0.030637985095381737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0637984309578314e-05, "grad_norm": 18.011693954467773, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8641837239265442, "num_tokens": 543750109.0, "step": 14257 }, { "epoch": 1.8137641521434933, "ewc_loss": 0.030601274222135544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060127346543595e-05, "grad_norm": 18.100486755371094, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8743185997009277, "num_tokens": 543788040.0, "step": 14258 }, { "epoch": 1.8138913624220838, "ewc_loss": 0.030635397881269455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063539770664647e-05, "grad_norm": 18.043472290039062, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8697521686553955, "num_tokens": 543821037.0, "step": 14259 }, { "epoch": 1.8140185727006743, "ewc_loss": 0.03054165095090866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0541650630766526e-05, "grad_norm": 18.087820053100586, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.883154571056366, "num_tokens": 543856331.0, "step": 14260 }, { "epoch": 1.8141457829792649, "ewc_loss": 0.030644427984952927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0644427170045674e-05, "grad_norm": 18.051794052124023, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8735886812210083, "num_tokens": 543892898.0, "step": 14261 }, { "epoch": 1.8142729932578552, "ewc_loss": 0.030560921877622604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056092100450769e-05, "grad_norm": 18.059877395629883, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8657568693161011, "num_tokens": 543932858.0, "step": 14262 }, { "epoch": 1.8144002035364457, "ewc_loss": 0.030612217262387276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0612216505687684e-05, "grad_norm": 18.022968292236328, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8789792656898499, "num_tokens": 543968145.0, "step": 14263 }, { "epoch": 1.8145274138150362, "ewc_loss": 0.030567610636353493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056761124753393e-05, "grad_norm": 18.066518783569336, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8685872554779053, "num_tokens": 544000495.0, "step": 14264 }, { "epoch": 1.8146546240936268, "ewc_loss": 0.030655376613140106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065537748625502e-05, "grad_norm": 18.052249908447266, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8654943704605103, "num_tokens": 544041345.0, "step": 14265 }, { "epoch": 1.8147818343722173, "ewc_loss": 0.03056534193456173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056534114875831e-05, "grad_norm": 17.98177146911621, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8743565082550049, "num_tokens": 544080084.0, "step": 14266 }, { "epoch": 1.8149090446508078, "ewc_loss": 0.030651379376649857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0651379347546026e-05, "grad_norm": 18.038732528686523, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8621871471405029, "num_tokens": 544115828.0, "step": 14267 }, { "epoch": 1.8150362549293981, "ewc_loss": 0.03068455122411251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.068455043830909e-05, "grad_norm": 18.1378231048584, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8663251996040344, "num_tokens": 544153213.0, "step": 14268 }, { "epoch": 1.8151634652079887, "ewc_loss": 0.030712362378835678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0712362786289304e-05, "grad_norm": 18.034711837768555, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8733236789703369, "num_tokens": 544192383.0, "step": 14269 }, { "epoch": 1.8152906754865792, "ewc_loss": 0.03060721419751644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060721428482793e-05, "grad_norm": 18.010143280029297, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8722428679466248, "num_tokens": 544232034.0, "step": 14270 }, { "epoch": 1.8154178857651697, "ewc_loss": 0.03064584545791149, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064584598178044e-05, "grad_norm": 18.04407501220703, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8544337153434753, "num_tokens": 544266242.0, "step": 14271 }, { "epoch": 1.8155450960437602, "ewc_loss": 0.030630100518465042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063010080950335e-05, "grad_norm": 18.09027671813965, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8594530820846558, "num_tokens": 544304995.0, "step": 14272 }, { "epoch": 1.8156723063223508, "ewc_loss": 0.030701709911227226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.070171078434214e-05, "grad_norm": 18.074132919311523, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8690431118011475, "num_tokens": 544341950.0, "step": 14273 }, { "epoch": 1.8157995166009413, "ewc_loss": 0.030644020065665245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064401971641928e-05, "grad_norm": 18.125473022460938, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8661109209060669, "num_tokens": 544379535.0, "step": 14274 }, { "epoch": 1.8159267268795318, "ewc_loss": 0.0306057408452034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060574090341106e-05, "grad_norm": 18.04017448425293, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8695970177650452, "num_tokens": 544416939.0, "step": 14275 }, { "epoch": 1.8160539371581224, "ewc_loss": 0.030561160296201706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056116111110896e-05, "grad_norm": 18.042173385620117, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8809986114501953, "num_tokens": 544461476.0, "step": 14276 }, { "epoch": 1.8161811474367129, "ewc_loss": 0.030678654089570045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.067865327466279e-05, "grad_norm": 18.06153678894043, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.870357871055603, "num_tokens": 544505051.0, "step": 14277 }, { "epoch": 1.8163083577153034, "ewc_loss": 0.030680224299430847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0680224881507456e-05, "grad_norm": 18.155309677124023, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8700853586196899, "num_tokens": 544550408.0, "step": 14278 }, { "epoch": 1.816435567993894, "ewc_loss": 0.030650990083813667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065099008381367e-05, "grad_norm": 18.107690811157227, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8696928024291992, "num_tokens": 544586645.0, "step": 14279 }, { "epoch": 1.8165627782724845, "ewc_loss": 0.03057028166949749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057028152397834e-05, "grad_norm": 17.99418830871582, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8630995750427246, "num_tokens": 544628087.0, "step": 14280 }, { "epoch": 1.816689988551075, "ewc_loss": 0.030603554099798203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0603554478148e-05, "grad_norm": 18.06241798400879, "learning_rate": 1e-06, "loss": 0.3576, "mean_token_accuracy": 0.8850489854812622, "num_tokens": 544667961.0, "step": 14281 }, { "epoch": 1.8168171988296655, "ewc_loss": 0.030635524541139603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063552503590472e-05, "grad_norm": 18.169891357421875, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8667639493942261, "num_tokens": 544704919.0, "step": 14282 }, { "epoch": 1.816944409108256, "ewc_loss": 0.03061923384666443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0619234166806564e-05, "grad_norm": 18.09702491760254, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.874649167060852, "num_tokens": 544741421.0, "step": 14283 }, { "epoch": 1.8170716193868466, "ewc_loss": 0.030502840876579285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.050284067285247e-05, "grad_norm": 18.100509643554688, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8740297555923462, "num_tokens": 544776414.0, "step": 14284 }, { "epoch": 1.817198829665437, "ewc_loss": 0.030553918331861496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.055391789530404e-05, "grad_norm": 18.07504653930664, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8632419109344482, "num_tokens": 544814964.0, "step": 14285 }, { "epoch": 1.8173260399440276, "ewc_loss": 0.030487261712551117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.04872610286111e-05, "grad_norm": 18.023523330688477, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8559759855270386, "num_tokens": 544853381.0, "step": 14286 }, { "epoch": 1.817453250222618, "ewc_loss": 0.030509067699313164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.050906707358081e-05, "grad_norm": 18.10035514831543, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8723883628845215, "num_tokens": 544892474.0, "step": 14287 }, { "epoch": 1.8175804605012085, "ewc_loss": 0.0304840337485075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.048403414140921e-05, "grad_norm": 17.991321563720703, "learning_rate": 1e-06, "loss": 0.3767, "mean_token_accuracy": 0.877311110496521, "num_tokens": 544927667.0, "step": 14288 }, { "epoch": 1.817707670779799, "ewc_loss": 0.0305175743997097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0517574487021193e-05, "grad_norm": 18.029930114746094, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8756176233291626, "num_tokens": 544959780.0, "step": 14289 }, { "epoch": 1.8178348810583895, "ewc_loss": 0.030528495088219643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.052849569940008e-05, "grad_norm": 18.09002685546875, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8628703355789185, "num_tokens": 545001832.0, "step": 14290 }, { "epoch": 1.81796209133698, "ewc_loss": 0.030560847371816635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056084824493155e-05, "grad_norm": 18.033533096313477, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8780410289764404, "num_tokens": 545038489.0, "step": 14291 }, { "epoch": 1.8180893016155706, "ewc_loss": 0.03058626689016819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.05862668028567e-05, "grad_norm": 18.17479705810547, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8496658205986023, "num_tokens": 545069086.0, "step": 14292 }, { "epoch": 1.818216511894161, "ewc_loss": 0.030603427439928055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060342714888975e-05, "grad_norm": 18.102733612060547, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8727427124977112, "num_tokens": 545104564.0, "step": 14293 }, { "epoch": 1.8183437221727514, "ewc_loss": 0.030553976073861122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.055397610296495e-05, "grad_norm": 18.014694213867188, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8743442296981812, "num_tokens": 545149036.0, "step": 14294 }, { "epoch": 1.818470932451342, "ewc_loss": 0.030523231253027916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.052323154406622e-05, "grad_norm": 18.078750610351562, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.8812428116798401, "num_tokens": 545183825.0, "step": 14295 }, { "epoch": 1.8185981427299325, "ewc_loss": 0.030651478096842766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065147757297382e-05, "grad_norm": 18.0327091217041, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8735880255699158, "num_tokens": 545221274.0, "step": 14296 }, { "epoch": 1.818725353008523, "ewc_loss": 0.03063783422112465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0637835152447224e-05, "grad_norm": 18.112979888916016, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.8780947327613831, "num_tokens": 545259242.0, "step": 14297 }, { "epoch": 1.8188525632871135, "ewc_loss": 0.030601266771554947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060126618947834e-05, "grad_norm": 18.1140079498291, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.857340931892395, "num_tokens": 545289380.0, "step": 14298 }, { "epoch": 1.818979773565704, "ewc_loss": 0.030647464096546173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0647464882349595e-05, "grad_norm": 18.106586456298828, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8734694123268127, "num_tokens": 545321935.0, "step": 14299 }, { "epoch": 1.8191069838442946, "ewc_loss": 0.030617734417319298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.061773531953804e-05, "grad_norm": 18.077983856201172, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.8770255446434021, "num_tokens": 545365714.0, "step": 14300 }, { "epoch": 1.8192341941228851, "ewc_loss": 0.03057202138006687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057202047784813e-05, "grad_norm": 17.99341583251953, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8604747653007507, "num_tokens": 545406838.0, "step": 14301 }, { "epoch": 1.8193614044014756, "ewc_loss": 0.030652830377221107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0652830901090056e-05, "grad_norm": 18.11626434326172, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8678628206253052, "num_tokens": 545443265.0, "step": 14302 }, { "epoch": 1.8194886146800662, "ewc_loss": 0.030625443905591965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062544419663027e-05, "grad_norm": 18.05593490600586, "learning_rate": 1e-06, "loss": 0.3782, "mean_token_accuracy": 0.880807638168335, "num_tokens": 545479903.0, "step": 14303 }, { "epoch": 1.8196158249586567, "ewc_loss": 0.030547304078936577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0547304049832746e-05, "grad_norm": 18.067914962768555, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8766747713088989, "num_tokens": 545513767.0, "step": 14304 }, { "epoch": 1.8197430352372472, "ewc_loss": 0.03065602108836174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0656021408503875e-05, "grad_norm": 18.11985969543457, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8544754981994629, "num_tokens": 545550541.0, "step": 14305 }, { "epoch": 1.8198702455158378, "ewc_loss": 0.03063993714749813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063993790419772e-05, "grad_norm": 18.02741050720215, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8712252378463745, "num_tokens": 545590739.0, "step": 14306 }, { "epoch": 1.8199974557944283, "ewc_loss": 0.03060712292790413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0607123335357755e-05, "grad_norm": 17.976665496826172, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8564999103546143, "num_tokens": 545627127.0, "step": 14307 }, { "epoch": 1.8201246660730188, "ewc_loss": 0.030629904940724373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0629904358647764e-05, "grad_norm": 18.09136199951172, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8748737573623657, "num_tokens": 545665473.0, "step": 14308 }, { "epoch": 1.8202518763516093, "ewc_loss": 0.030732909217476845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.073291009059176e-05, "grad_norm": 18.01654624938965, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.8860766291618347, "num_tokens": 545708018.0, "step": 14309 }, { "epoch": 1.8203790866301999, "ewc_loss": 0.030616851523518562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.061685129068792e-05, "grad_norm": 18.113439559936523, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8757647275924683, "num_tokens": 545745635.0, "step": 14310 }, { "epoch": 1.8205062969087902, "ewc_loss": 0.030731724575161934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0731724109500647e-05, "grad_norm": 18.118724822998047, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8589240312576294, "num_tokens": 545785260.0, "step": 14311 }, { "epoch": 1.8206335071873807, "ewc_loss": 0.03055923618376255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.055923662032001e-05, "grad_norm": 18.04960823059082, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.867668867111206, "num_tokens": 545821696.0, "step": 14312 }, { "epoch": 1.8207607174659712, "ewc_loss": 0.030629562214016914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06295623886399e-05, "grad_norm": 18.04982566833496, "learning_rate": 1e-06, "loss": 0.3596, "mean_token_accuracy": 0.8845180869102478, "num_tokens": 545857971.0, "step": 14313 }, { "epoch": 1.8208879277445618, "ewc_loss": 0.030639123171567917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0639122996944934e-05, "grad_norm": 18.07001495361328, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8599075675010681, "num_tokens": 545891000.0, "step": 14314 }, { "epoch": 1.8210151380231523, "ewc_loss": 0.03069758228957653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069758167839609e-05, "grad_norm": 18.008132934570312, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.857438325881958, "num_tokens": 545932825.0, "step": 14315 }, { "epoch": 1.8211423483017428, "ewc_loss": 0.030611226335167885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0611226975452155e-05, "grad_norm": 18.032419204711914, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8599721789360046, "num_tokens": 545972849.0, "step": 14316 }, { "epoch": 1.8212695585803331, "ewc_loss": 0.030710995197296143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.071099490625784e-05, "grad_norm": 18.04396629333496, "learning_rate": 1e-06, "loss": 0.368, "mean_token_accuracy": 0.882824718952179, "num_tokens": 546011492.0, "step": 14317 }, { "epoch": 1.8213967688589237, "ewc_loss": 0.030656753107905388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06567526422441e-05, "grad_norm": 18.120893478393555, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8645970821380615, "num_tokens": 546048461.0, "step": 14318 }, { "epoch": 1.8215239791375142, "ewc_loss": 0.030641691759228706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064169140998274e-05, "grad_norm": 17.964143753051758, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8795689344406128, "num_tokens": 546087113.0, "step": 14319 }, { "epoch": 1.8216511894161047, "ewc_loss": 0.03060491569340229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060491508222185e-05, "grad_norm": 18.122758865356445, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8725671768188477, "num_tokens": 546124413.0, "step": 14320 }, { "epoch": 1.8217783996946952, "ewc_loss": 0.030649371445178986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064937118324451e-05, "grad_norm": 17.980640411376953, "learning_rate": 1e-06, "loss": 0.364, "mean_token_accuracy": 0.881081223487854, "num_tokens": 546165763.0, "step": 14321 }, { "epoch": 1.8219056099732858, "ewc_loss": 0.030636055395007133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0636056180810556e-05, "grad_norm": 18.07103157043457, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8711644411087036, "num_tokens": 546203218.0, "step": 14322 }, { "epoch": 1.8220328202518763, "ewc_loss": 0.030627746134996414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062774703721516e-05, "grad_norm": 17.9539794921875, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8604512214660645, "num_tokens": 546247610.0, "step": 14323 }, { "epoch": 1.8221600305304668, "ewc_loss": 0.030616436153650284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.061643656110391e-05, "grad_norm": 18.12406349182129, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8630765676498413, "num_tokens": 546288952.0, "step": 14324 }, { "epoch": 1.8222872408090574, "ewc_loss": 0.03070572018623352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0705719836987555e-05, "grad_norm": 18.076017379760742, "learning_rate": 1e-06, "loss": 0.359, "mean_token_accuracy": 0.8813588619232178, "num_tokens": 546322449.0, "step": 14325 }, { "epoch": 1.8224144510876479, "ewc_loss": 0.03051409311592579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0514092941302806e-05, "grad_norm": 18.03619384765625, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8766969442367554, "num_tokens": 546362692.0, "step": 14326 }, { "epoch": 1.8225416613662384, "ewc_loss": 0.030628787353634834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0628787499153987e-05, "grad_norm": 18.06633949279785, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8534106016159058, "num_tokens": 546398856.0, "step": 14327 }, { "epoch": 1.822668871644829, "ewc_loss": 0.03061395138502121, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0613951821578667e-05, "grad_norm": 18.120872497558594, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8625522255897522, "num_tokens": 546439633.0, "step": 14328 }, { "epoch": 1.8227960819234195, "ewc_loss": 0.030647996813058853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064799602725543e-05, "grad_norm": 18.033559799194336, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8747496008872986, "num_tokens": 546477282.0, "step": 14329 }, { "epoch": 1.82292329220201, "ewc_loss": 0.03058675490319729, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.058675429201685e-05, "grad_norm": 18.11842918395996, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8813967704772949, "num_tokens": 546514584.0, "step": 14330 }, { "epoch": 1.8230505024806005, "ewc_loss": 0.030557556077837944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.055755587411113e-05, "grad_norm": 18.08677101135254, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8741841316223145, "num_tokens": 546550761.0, "step": 14331 }, { "epoch": 1.823177712759191, "ewc_loss": 0.030540205538272858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.054020635318011e-05, "grad_norm": 18.023860931396484, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8564976453781128, "num_tokens": 546594157.0, "step": 14332 }, { "epoch": 1.8233049230377816, "ewc_loss": 0.030573923140764236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0573923140764236e-05, "grad_norm": 18.19081687927246, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8773818612098694, "num_tokens": 546634671.0, "step": 14333 }, { "epoch": 1.823432133316372, "ewc_loss": 0.030572332441806793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057233334402554e-05, "grad_norm": 18.027429580688477, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8709620237350464, "num_tokens": 546672084.0, "step": 14334 }, { "epoch": 1.8235593435949626, "ewc_loss": 0.030476724728941917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0476725441985764e-05, "grad_norm": 18.18380355834961, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.864950954914093, "num_tokens": 546701920.0, "step": 14335 }, { "epoch": 1.823686553873553, "ewc_loss": 0.03058001585304737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0580016755266115e-05, "grad_norm": 18.020565032958984, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8752607107162476, "num_tokens": 546735776.0, "step": 14336 }, { "epoch": 1.8238137641521435, "ewc_loss": 0.0304989293217659, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0498929845634848e-05, "grad_norm": 18.0985107421875, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8758363723754883, "num_tokens": 546772991.0, "step": 14337 }, { "epoch": 1.823940974430734, "ewc_loss": 0.030653132125735283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0653132853331044e-05, "grad_norm": 18.059432983398438, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8646706342697144, "num_tokens": 546805310.0, "step": 14338 }, { "epoch": 1.8240681847093245, "ewc_loss": 0.030535487458109856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053548789466731e-05, "grad_norm": 18.069869995117188, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8771988153457642, "num_tokens": 546838240.0, "step": 14339 }, { "epoch": 1.824195394987915, "ewc_loss": 0.03065778873860836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065778946620412e-05, "grad_norm": 18.106557846069336, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8656619191169739, "num_tokens": 546878484.0, "step": 14340 }, { "epoch": 1.8243226052665056, "ewc_loss": 0.030598239973187447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059823939111084e-05, "grad_norm": 18.05160903930664, "learning_rate": 1e-06, "loss": 0.3624, "mean_token_accuracy": 0.8853304386138916, "num_tokens": 546919747.0, "step": 14341 }, { "epoch": 1.8244498155450959, "ewc_loss": 0.030604911968111992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0604911444243044e-05, "grad_norm": 18.101308822631836, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8697633743286133, "num_tokens": 546962614.0, "step": 14342 }, { "epoch": 1.8245770258236864, "ewc_loss": 0.03065205179154873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065205237362534e-05, "grad_norm": 18.082233428955078, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8638197183609009, "num_tokens": 547000059.0, "step": 14343 }, { "epoch": 1.824704236102277, "ewc_loss": 0.030633248388767242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063324766117148e-05, "grad_norm": 18.12677001953125, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.871436357498169, "num_tokens": 547037372.0, "step": 14344 }, { "epoch": 1.8248314463808675, "ewc_loss": 0.030626917257905006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062691757804714e-05, "grad_norm": 18.018569946289062, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8558684587478638, "num_tokens": 547071840.0, "step": 14345 }, { "epoch": 1.824958656659458, "ewc_loss": 0.030638424679636955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063842450501397e-05, "grad_norm": 18.13140106201172, "learning_rate": 1e-06, "loss": 0.486, "mean_token_accuracy": 0.8456351161003113, "num_tokens": 547108916.0, "step": 14346 }, { "epoch": 1.8250858669380485, "ewc_loss": 0.030647600069642067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064759948756546e-05, "grad_norm": 18.008262634277344, "learning_rate": 1e-06, "loss": 0.3568, "mean_token_accuracy": 0.8890784382820129, "num_tokens": 547146991.0, "step": 14347 }, { "epoch": 1.825213077216639, "ewc_loss": 0.030634498223662376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063449912588112e-05, "grad_norm": 18.096071243286133, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.8808720111846924, "num_tokens": 547184093.0, "step": 14348 }, { "epoch": 1.8253402874952296, "ewc_loss": 0.030649084597826004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064908378291875e-05, "grad_norm": 18.050859451293945, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.875758171081543, "num_tokens": 547217570.0, "step": 14349 }, { "epoch": 1.8254674977738201, "ewc_loss": 0.030635038390755653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063503754674457e-05, "grad_norm": 18.06759262084961, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8702813386917114, "num_tokens": 547259498.0, "step": 14350 }, { "epoch": 1.8255947080524106, "ewc_loss": 0.03066002018749714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.066001954721287e-05, "grad_norm": 18.047407150268555, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8690499663352966, "num_tokens": 547298419.0, "step": 14351 }, { "epoch": 1.8257219183310012, "ewc_loss": 0.030667658895254135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.066765930270776e-05, "grad_norm": 18.078359603881836, "learning_rate": 1e-06, "loss": 0.3852, "mean_token_accuracy": 0.8777703046798706, "num_tokens": 547342984.0, "step": 14352 }, { "epoch": 1.8258491286095917, "ewc_loss": 0.030695704743266106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069570448133163e-05, "grad_norm": 18.047100067138672, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8722532987594604, "num_tokens": 547383162.0, "step": 14353 }, { "epoch": 1.8259763388881822, "ewc_loss": 0.030670903623104095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.067090437980369e-05, "grad_norm": 18.1468448638916, "learning_rate": 1e-06, "loss": 0.3384, "mean_token_accuracy": 0.8905894756317139, "num_tokens": 547416967.0, "step": 14354 }, { "epoch": 1.8261035491667728, "ewc_loss": 0.030685914680361748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.068591468036175e-05, "grad_norm": 18.04047966003418, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8740097284317017, "num_tokens": 547454695.0, "step": 14355 }, { "epoch": 1.8262307594453633, "ewc_loss": 0.03055662475526333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0556624551536515e-05, "grad_norm": 18.057022094726562, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.881048858165741, "num_tokens": 547494258.0, "step": 14356 }, { "epoch": 1.8263579697239538, "ewc_loss": 0.030712613835930824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0712613806826994e-05, "grad_norm": 18.052248001098633, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.865455687046051, "num_tokens": 547531404.0, "step": 14357 }, { "epoch": 1.8264851800025443, "ewc_loss": 0.03059503808617592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.05950379697606e-05, "grad_norm": 18.042577743530273, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8748716115951538, "num_tokens": 547566553.0, "step": 14358 }, { "epoch": 1.8266123902811349, "ewc_loss": 0.030623406171798706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06234069284983e-05, "grad_norm": 18.078004837036133, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8633717894554138, "num_tokens": 547605505.0, "step": 14359 }, { "epoch": 1.8267396005597252, "ewc_loss": 0.030681701377034187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0681701900903136e-05, "grad_norm": 18.049039840698242, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8784369230270386, "num_tokens": 547643146.0, "step": 14360 }, { "epoch": 1.8268668108383157, "ewc_loss": 0.030626706779003143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062670657527633e-05, "grad_norm": 18.084636688232422, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8674747943878174, "num_tokens": 547683373.0, "step": 14361 }, { "epoch": 1.8269940211169062, "ewc_loss": 0.030663829296827316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0663828511023894e-05, "grad_norm": 18.14719009399414, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8650122284889221, "num_tokens": 547716762.0, "step": 14362 }, { "epoch": 1.8271212313954968, "ewc_loss": 0.03062276355922222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062276300624944e-05, "grad_norm": 18.03913688659668, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8785659670829773, "num_tokens": 547756907.0, "step": 14363 }, { "epoch": 1.8272484416740873, "ewc_loss": 0.0305153951048851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0515395337715745e-05, "grad_norm": 18.074983596801758, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8671932816505432, "num_tokens": 547793545.0, "step": 14364 }, { "epoch": 1.8273756519526778, "ewc_loss": 0.03069152496755123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069152444368228e-05, "grad_norm": 18.073862075805664, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8615907430648804, "num_tokens": 547836845.0, "step": 14365 }, { "epoch": 1.8275028622312681, "ewc_loss": 0.03061460703611374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.061460665776394e-05, "grad_norm": 18.0637149810791, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8766015768051147, "num_tokens": 547875849.0, "step": 14366 }, { "epoch": 1.8276300725098586, "ewc_loss": 0.030634822323918343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063482290599495e-05, "grad_norm": 18.043243408203125, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8640960454940796, "num_tokens": 547918672.0, "step": 14367 }, { "epoch": 1.8277572827884492, "ewc_loss": 0.03068181872367859, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.068181831622496e-05, "grad_norm": 18.124069213867188, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8573645949363708, "num_tokens": 547958681.0, "step": 14368 }, { "epoch": 1.8278844930670397, "ewc_loss": 0.030591312795877457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0591312679462135e-05, "grad_norm": 18.032564163208008, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8646214008331299, "num_tokens": 547996940.0, "step": 14369 }, { "epoch": 1.8280117033456302, "ewc_loss": 0.030606243759393692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060624294448644e-05, "grad_norm": 18.18305778503418, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8784527778625488, "num_tokens": 548030188.0, "step": 14370 }, { "epoch": 1.8281389136242208, "ewc_loss": 0.030686136335134506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.068613659706898e-05, "grad_norm": 18.106868743896484, "learning_rate": 1e-06, "loss": 0.3582, "mean_token_accuracy": 0.8857482075691223, "num_tokens": 548066891.0, "step": 14371 }, { "epoch": 1.8282661239028113, "ewc_loss": 0.030529292300343513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0529292416758835e-05, "grad_norm": 18.153011322021484, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8761127591133118, "num_tokens": 548104664.0, "step": 14372 }, { "epoch": 1.8283933341814018, "ewc_loss": 0.030619943514466286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.061994357267395e-05, "grad_norm": 18.085012435913086, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8664624691009521, "num_tokens": 548143938.0, "step": 14373 }, { "epoch": 1.8285205444599923, "ewc_loss": 0.030560575425624847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056057539652102e-05, "grad_norm": 18.14481544494629, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8691307306289673, "num_tokens": 548183213.0, "step": 14374 }, { "epoch": 1.8286477547385829, "ewc_loss": 0.030642688274383545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0642688216175884e-05, "grad_norm": 18.110143661499023, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8549280166625977, "num_tokens": 548221304.0, "step": 14375 }, { "epoch": 1.8287749650171734, "ewc_loss": 0.03052397444844246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.052397369174287e-05, "grad_norm": 18.067272186279297, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.867824912071228, "num_tokens": 548261123.0, "step": 14376 }, { "epoch": 1.828902175295764, "ewc_loss": 0.03061527945101261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0615279683843255e-05, "grad_norm": 18.095378875732422, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8515735268592834, "num_tokens": 548305040.0, "step": 14377 }, { "epoch": 1.8290293855743545, "ewc_loss": 0.030546780675649643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0546780180884525e-05, "grad_norm": 18.09879493713379, "learning_rate": 1e-06, "loss": 0.3741, "mean_token_accuracy": 0.8778764605522156, "num_tokens": 548338538.0, "step": 14378 }, { "epoch": 1.829156595852945, "ewc_loss": 0.030635325238108635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063532494707033e-05, "grad_norm": 18.037260055541992, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8738054037094116, "num_tokens": 548378510.0, "step": 14379 }, { "epoch": 1.8292838061315355, "ewc_loss": 0.030525464564561844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0525465263053775e-05, "grad_norm": 18.054773330688477, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8643032312393188, "num_tokens": 548415790.0, "step": 14380 }, { "epoch": 1.829411016410126, "ewc_loss": 0.030672669410705566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.067266879952513e-05, "grad_norm": 18.054887771606445, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8605976700782776, "num_tokens": 548447294.0, "step": 14381 }, { "epoch": 1.8295382266887166, "ewc_loss": 0.03060656227171421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060656308662146e-05, "grad_norm": 18.18189811706543, "learning_rate": 1e-06, "loss": 0.3822, "mean_token_accuracy": 0.876807451248169, "num_tokens": 548484617.0, "step": 14382 }, { "epoch": 1.829665436967307, "ewc_loss": 0.03066444769501686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06644469674211e-05, "grad_norm": 18.036191940307617, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8666356801986694, "num_tokens": 548527552.0, "step": 14383 }, { "epoch": 1.8297926472458976, "ewc_loss": 0.03060128726065159, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060128801735118e-05, "grad_norm": 18.10141944885254, "learning_rate": 1e-06, "loss": 0.3448, "mean_token_accuracy": 0.8889608383178711, "num_tokens": 548561322.0, "step": 14384 }, { "epoch": 1.829919857524488, "ewc_loss": 0.030690377578139305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069037848035805e-05, "grad_norm": 18.1270751953125, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.8801994323730469, "num_tokens": 548594810.0, "step": 14385 }, { "epoch": 1.8300470678030785, "ewc_loss": 0.03056567907333374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056567948078737e-05, "grad_norm": 18.14345932006836, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8640095591545105, "num_tokens": 548631094.0, "step": 14386 }, { "epoch": 1.830174278081669, "ewc_loss": 0.030659157782793045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065915734623559e-05, "grad_norm": 18.076793670654297, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8716408610343933, "num_tokens": 548669162.0, "step": 14387 }, { "epoch": 1.8303014883602595, "ewc_loss": 0.030629603192210197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0629602406406775e-05, "grad_norm": 18.15032386779785, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8694608211517334, "num_tokens": 548704555.0, "step": 14388 }, { "epoch": 1.83042869863885, "ewc_loss": 0.030658898875117302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0658899049740285e-05, "grad_norm": 18.032360076904297, "learning_rate": 1e-06, "loss": 0.3609, "mean_token_accuracy": 0.8837651014328003, "num_tokens": 548741852.0, "step": 14389 }, { "epoch": 1.8305559089174406, "ewc_loss": 0.030592866241931915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059286609641276e-05, "grad_norm": 18.155790328979492, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8589349389076233, "num_tokens": 548788455.0, "step": 14390 }, { "epoch": 1.8306831191960309, "ewc_loss": 0.030709896236658096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0709896236658096e-05, "grad_norm": 18.056428909301758, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8774736523628235, "num_tokens": 548826347.0, "step": 14391 }, { "epoch": 1.8308103294746214, "ewc_loss": 0.03059357963502407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059357914025895e-05, "grad_norm": 18.0794677734375, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8636835813522339, "num_tokens": 548865428.0, "step": 14392 }, { "epoch": 1.830937539753212, "ewc_loss": 0.030665211379528046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.066521094297059e-05, "grad_norm": 18.085290908813477, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8584975600242615, "num_tokens": 548901671.0, "step": 14393 }, { "epoch": 1.8310647500318025, "ewc_loss": 0.03061584010720253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.061583993257955e-05, "grad_norm": 18.130321502685547, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8672200441360474, "num_tokens": 548939234.0, "step": 14394 }, { "epoch": 1.831191960310393, "ewc_loss": 0.03069535456597805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069535523536615e-05, "grad_norm": 18.101638793945312, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.875612735748291, "num_tokens": 548975334.0, "step": 14395 }, { "epoch": 1.8313191705889835, "ewc_loss": 0.030573071911931038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057307185372338e-05, "grad_norm": 18.05135726928711, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8639022707939148, "num_tokens": 549016372.0, "step": 14396 }, { "epoch": 1.831446380867574, "ewc_loss": 0.030639806762337685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063980693696067e-05, "grad_norm": 18.089319229125977, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.877356231212616, "num_tokens": 549058049.0, "step": 14397 }, { "epoch": 1.8315735911461646, "ewc_loss": 0.03068559058010578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0685590900247917e-05, "grad_norm": 18.137784957885742, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8580959439277649, "num_tokens": 549097855.0, "step": 14398 }, { "epoch": 1.831700801424755, "ewc_loss": 0.030617978423833847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.061797906411812e-05, "grad_norm": 18.09035873413086, "learning_rate": 1e-06, "loss": 0.3395, "mean_token_accuracy": 0.8909211158752441, "num_tokens": 549130906.0, "step": 14399 }, { "epoch": 1.8318280117033456, "ewc_loss": 0.030651317909359932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0651317501906306e-05, "grad_norm": 18.13153839111328, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8508310317993164, "num_tokens": 549176574.0, "step": 14400 }, { "epoch": 1.8319552219819362, "ewc_loss": 0.030673112720251083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.067311263293959e-05, "grad_norm": 18.146923065185547, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8691553473472595, "num_tokens": 549213945.0, "step": 14401 }, { "epoch": 1.8320824322605267, "ewc_loss": 0.030593831092119217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059383016079664e-05, "grad_norm": 18.09767723083496, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8613989949226379, "num_tokens": 549255843.0, "step": 14402 }, { "epoch": 1.8322096425391172, "ewc_loss": 0.03062487579882145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062487667193636e-05, "grad_norm": 18.094646453857422, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8753023147583008, "num_tokens": 549292573.0, "step": 14403 }, { "epoch": 1.8323368528177078, "ewc_loss": 0.03064013086259365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06401307170745e-05, "grad_norm": 18.127792358398438, "learning_rate": 1e-06, "loss": 0.3574, "mean_token_accuracy": 0.8833580017089844, "num_tokens": 549335291.0, "step": 14404 }, { "epoch": 1.8324640630962983, "ewc_loss": 0.0306386761367321, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063867552555166e-05, "grad_norm": 18.100627899169922, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.857085108757019, "num_tokens": 549381337.0, "step": 14405 }, { "epoch": 1.8325912733748888, "ewc_loss": 0.03063119202852249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0631192203145474e-05, "grad_norm": 18.112483978271484, "learning_rate": 1e-06, "loss": 0.3737, "mean_token_accuracy": 0.881255030632019, "num_tokens": 549418180.0, "step": 14406 }, { "epoch": 1.8327184836534793, "ewc_loss": 0.030599359422922134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059935988858342e-05, "grad_norm": 18.074094772338867, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8781805038452148, "num_tokens": 549461623.0, "step": 14407 }, { "epoch": 1.8328456939320699, "ewc_loss": 0.0306528490036726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065284909098409e-05, "grad_norm": 18.196958541870117, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8739891052246094, "num_tokens": 549500791.0, "step": 14408 }, { "epoch": 1.8329729042106602, "ewc_loss": 0.03061565011739731, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.061565075768158e-05, "grad_norm": 18.139476776123047, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8770778179168701, "num_tokens": 549533343.0, "step": 14409 }, { "epoch": 1.8331001144892507, "ewc_loss": 0.030522704124450684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.052270403713919e-05, "grad_norm": 18.08934783935547, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.861606240272522, "num_tokens": 549572707.0, "step": 14410 }, { "epoch": 1.8332273247678412, "ewc_loss": 0.030626870691776276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062687028432265e-05, "grad_norm": 18.052528381347656, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8773934841156006, "num_tokens": 549609772.0, "step": 14411 }, { "epoch": 1.8333545350464318, "ewc_loss": 0.030651826411485672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06518268189393e-05, "grad_norm": 18.178953170776367, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8595199584960938, "num_tokens": 549648626.0, "step": 14412 }, { "epoch": 1.8334817453250223, "ewc_loss": 0.030615366995334625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0615366995334625e-05, "grad_norm": 18.114032745361328, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8588467836380005, "num_tokens": 549693211.0, "step": 14413 }, { "epoch": 1.8336089556036128, "ewc_loss": 0.030639130622148514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063913027290255e-05, "grad_norm": 18.118331909179688, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8647534251213074, "num_tokens": 549729463.0, "step": 14414 }, { "epoch": 1.8337361658822031, "ewc_loss": 0.03063781000673771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0637809686595574e-05, "grad_norm": 18.16866111755371, "learning_rate": 1e-06, "loss": 0.3873, "mean_token_accuracy": 0.8756737112998962, "num_tokens": 549769139.0, "step": 14415 }, { "epoch": 1.8338633761607936, "ewc_loss": 0.03064829856157303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064829797949642e-05, "grad_norm": 18.06669044494629, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8740500211715698, "num_tokens": 549807067.0, "step": 14416 }, { "epoch": 1.8339905864393842, "ewc_loss": 0.030555499717593193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0555500416085124e-05, "grad_norm": 18.071712493896484, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8702005743980408, "num_tokens": 549842689.0, "step": 14417 }, { "epoch": 1.8341177967179747, "ewc_loss": 0.030694708228111267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069470767513849e-05, "grad_norm": 18.172225952148438, "learning_rate": 1e-06, "loss": 0.3389, "mean_token_accuracy": 0.8895953297615051, "num_tokens": 549876108.0, "step": 14418 }, { "epoch": 1.8342450069965652, "ewc_loss": 0.03063136339187622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063136318814941e-05, "grad_norm": 18.127534866333008, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.8841776251792908, "num_tokens": 549909656.0, "step": 14419 }, { "epoch": 1.8343722172751558, "ewc_loss": 0.030605915933847427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06059155263938e-05, "grad_norm": 18.114688873291016, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8616559505462646, "num_tokens": 549949156.0, "step": 14420 }, { "epoch": 1.8344994275537463, "ewc_loss": 0.030585065484046936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.058506626985036e-05, "grad_norm": 18.133493423461914, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8774734735488892, "num_tokens": 549990256.0, "step": 14421 }, { "epoch": 1.8346266378323368, "ewc_loss": 0.03065325692296028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0653256544610485e-05, "grad_norm": 18.14238166809082, "learning_rate": 1e-06, "loss": 0.372, "mean_token_accuracy": 0.8789859414100647, "num_tokens": 550023557.0, "step": 14422 }, { "epoch": 1.8347538481109273, "ewc_loss": 0.030612535774707794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.061253664782271e-05, "grad_norm": 18.144699096679688, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8664367198944092, "num_tokens": 550069402.0, "step": 14423 }, { "epoch": 1.8348810583895179, "ewc_loss": 0.03059178963303566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0591789254685864e-05, "grad_norm": 18.0403995513916, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8774843811988831, "num_tokens": 550110860.0, "step": 14424 }, { "epoch": 1.8350082686681084, "ewc_loss": 0.03059297613799572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0592975235776976e-05, "grad_norm": 18.10478401184082, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8549642562866211, "num_tokens": 550151736.0, "step": 14425 }, { "epoch": 1.835135478946699, "ewc_loss": 0.03059392049908638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059392111026682e-05, "grad_norm": 18.122011184692383, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8733482360839844, "num_tokens": 550187731.0, "step": 14426 }, { "epoch": 1.8352626892252895, "ewc_loss": 0.03062077984213829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062078030779958e-05, "grad_norm": 18.087692260742188, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8730019330978394, "num_tokens": 550220545.0, "step": 14427 }, { "epoch": 1.83538989950388, "ewc_loss": 0.030601950362324715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060195012949407e-05, "grad_norm": 18.173316955566406, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8746383190155029, "num_tokens": 550254755.0, "step": 14428 }, { "epoch": 1.8355171097824705, "ewc_loss": 0.030594510957598686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059451046283357e-05, "grad_norm": 18.054540634155273, "learning_rate": 1e-06, "loss": 0.3761, "mean_token_accuracy": 0.878204882144928, "num_tokens": 550295191.0, "step": 14429 }, { "epoch": 1.835644320061061, "ewc_loss": 0.030615726485848427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.061572715523653e-05, "grad_norm": 18.180253982543945, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8664242029190063, "num_tokens": 550332984.0, "step": 14430 }, { "epoch": 1.8357715303396516, "ewc_loss": 0.03067040629684925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0670405976707116e-05, "grad_norm": 18.133834838867188, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8727993965148926, "num_tokens": 550377470.0, "step": 14431 }, { "epoch": 1.835898740618242, "ewc_loss": 0.030571773648262024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0571773095289245e-05, "grad_norm": 18.0773868560791, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8748840093612671, "num_tokens": 550416123.0, "step": 14432 }, { "epoch": 1.8360259508968326, "ewc_loss": 0.030577385798096657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057738649658859e-05, "grad_norm": 18.097518920898438, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.8752151131629944, "num_tokens": 550458062.0, "step": 14433 }, { "epoch": 1.836153161175423, "ewc_loss": 0.030639924108982086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0639923352282494e-05, "grad_norm": 18.11646842956543, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8644670844078064, "num_tokens": 550496485.0, "step": 14434 }, { "epoch": 1.8362803714540135, "ewc_loss": 0.030577724799513817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057772482861765e-05, "grad_norm": 18.043787002563477, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8737156391143799, "num_tokens": 550536500.0, "step": 14435 }, { "epoch": 1.836407581732604, "ewc_loss": 0.030568502843379974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056850255234167e-05, "grad_norm": 18.064815521240234, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8648849725723267, "num_tokens": 550575611.0, "step": 14436 }, { "epoch": 1.8365347920111945, "ewc_loss": 0.030643237754702568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0643237550975755e-05, "grad_norm": 18.133785247802734, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.862528920173645, "num_tokens": 550616431.0, "step": 14437 }, { "epoch": 1.836662002289785, "ewc_loss": 0.030654460191726685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065446071559563e-05, "grad_norm": 18.089540481567383, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.8809316158294678, "num_tokens": 550653626.0, "step": 14438 }, { "epoch": 1.8367892125683756, "ewc_loss": 0.03060242347419262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060242306673899e-05, "grad_norm": 18.119802474975586, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.866978108882904, "num_tokens": 550690899.0, "step": 14439 }, { "epoch": 1.8369164228469659, "ewc_loss": 0.030606500804424286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060650124098174e-05, "grad_norm": 18.10513687133789, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8714723587036133, "num_tokens": 550734408.0, "step": 14440 }, { "epoch": 1.8370436331255564, "ewc_loss": 0.03063301555812359, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063301483052783e-05, "grad_norm": 18.09880256652832, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8584238290786743, "num_tokens": 550767888.0, "step": 14441 }, { "epoch": 1.837170843404147, "ewc_loss": 0.03062872588634491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0628725653514266e-05, "grad_norm": 18.09024429321289, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.87184077501297, "num_tokens": 550807192.0, "step": 14442 }, { "epoch": 1.8372980536827375, "ewc_loss": 0.03064330294728279, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064330303459428e-05, "grad_norm": 18.157190322875977, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8795291185379028, "num_tokens": 550848006.0, "step": 14443 }, { "epoch": 1.837425263961328, "ewc_loss": 0.03063727170228958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0637271265732124e-05, "grad_norm": 18.02471923828125, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8712064027786255, "num_tokens": 550888350.0, "step": 14444 }, { "epoch": 1.8375524742399185, "ewc_loss": 0.030564097687602043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056409696000628e-05, "grad_norm": 18.064258575439453, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8773958683013916, "num_tokens": 550925389.0, "step": 14445 }, { "epoch": 1.837679684518509, "ewc_loss": 0.030650723725557327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065072451136075e-05, "grad_norm": 18.111007690429688, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8758479356765747, "num_tokens": 550962610.0, "step": 14446 }, { "epoch": 1.8378068947970996, "ewc_loss": 0.030571987852454185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0571987736038864e-05, "grad_norm": 18.12386131286621, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8671163320541382, "num_tokens": 551001018.0, "step": 14447 }, { "epoch": 1.83793410507569, "ewc_loss": 0.030629001557826996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0629002139903605e-05, "grad_norm": 18.070541381835938, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8722749948501587, "num_tokens": 551038505.0, "step": 14448 }, { "epoch": 1.8380613153542806, "ewc_loss": 0.03059786558151245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059786467929371e-05, "grad_norm": 18.137283325195312, "learning_rate": 1e-06, "loss": 0.3647, "mean_token_accuracy": 0.88239586353302, "num_tokens": 551072235.0, "step": 14449 }, { "epoch": 1.8381885256328712, "ewc_loss": 0.030692284926772118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0692284781252965e-05, "grad_norm": 18.06052017211914, "learning_rate": 1e-06, "loss": 0.3518, "mean_token_accuracy": 0.8835243582725525, "num_tokens": 551110129.0, "step": 14450 }, { "epoch": 1.8383157359114617, "ewc_loss": 0.03061586618423462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06158653984312e-05, "grad_norm": 18.100893020629883, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8718765377998352, "num_tokens": 551152380.0, "step": 14451 }, { "epoch": 1.8384429461900522, "ewc_loss": 0.030651161447167397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06511610688176e-05, "grad_norm": 18.098926544189453, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.8819969892501831, "num_tokens": 551195942.0, "step": 14452 }, { "epoch": 1.8385701564686427, "ewc_loss": 0.03058297000825405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0582970794057474e-05, "grad_norm": 18.093982696533203, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8632636666297913, "num_tokens": 551238502.0, "step": 14453 }, { "epoch": 1.8386973667472333, "ewc_loss": 0.030638061463832855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063806070713326e-05, "grad_norm": 18.065671920776367, "learning_rate": 1e-06, "loss": 0.3653, "mean_token_accuracy": 0.8812882304191589, "num_tokens": 551278091.0, "step": 14454 }, { "epoch": 1.8388245770258238, "ewc_loss": 0.03055397979915142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.055397974094376e-05, "grad_norm": 18.17374038696289, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8520745635032654, "num_tokens": 551314940.0, "step": 14455 }, { "epoch": 1.8389517873044143, "ewc_loss": 0.03062378242611885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062378164031543e-05, "grad_norm": 18.0052490234375, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8577451705932617, "num_tokens": 551356427.0, "step": 14456 }, { "epoch": 1.8390789975830049, "ewc_loss": 0.030598504468798637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0598504963563755e-05, "grad_norm": 18.124269485473633, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8754361271858215, "num_tokens": 551400436.0, "step": 14457 }, { "epoch": 1.8392062078615952, "ewc_loss": 0.030690712854266167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06907131744083e-05, "grad_norm": 18.097944259643555, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8695707321166992, "num_tokens": 551438335.0, "step": 14458 }, { "epoch": 1.8393334181401857, "ewc_loss": 0.030556628480553627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.055662818951532e-05, "grad_norm": 18.037874221801758, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.8738762140274048, "num_tokens": 551471214.0, "step": 14459 }, { "epoch": 1.8394606284187762, "ewc_loss": 0.0306237880140543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062378891627304e-05, "grad_norm": 18.173442840576172, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8584585189819336, "num_tokens": 551504071.0, "step": 14460 }, { "epoch": 1.8395878386973668, "ewc_loss": 0.030651332810521126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0651332053821534e-05, "grad_norm": 18.117979049682617, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8653404116630554, "num_tokens": 551538456.0, "step": 14461 }, { "epoch": 1.8397150489759573, "ewc_loss": 0.030523795634508133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.052379543078132e-05, "grad_norm": 18.119369506835938, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8747228384017944, "num_tokens": 551575334.0, "step": 14462 }, { "epoch": 1.8398422592545478, "ewc_loss": 0.030608808621764183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060880771954544e-05, "grad_norm": 18.074438095092773, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8787218332290649, "num_tokens": 551617127.0, "step": 14463 }, { "epoch": 1.8399694695331381, "ewc_loss": 0.030600519850850105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0600520403822884e-05, "grad_norm": 18.111650466918945, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8753639459609985, "num_tokens": 551655548.0, "step": 14464 }, { "epoch": 1.8400966798117286, "ewc_loss": 0.03062283992767334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062283940380439e-05, "grad_norm": 18.101476669311523, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8679879903793335, "num_tokens": 551691807.0, "step": 14465 }, { "epoch": 1.8402238900903192, "ewc_loss": 0.030629824846982956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062982432311401e-05, "grad_norm": 18.174604415893555, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8787292242050171, "num_tokens": 551729805.0, "step": 14466 }, { "epoch": 1.8403511003689097, "ewc_loss": 0.030572835355997086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0572835385100916e-05, "grad_norm": 18.055694580078125, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8482903838157654, "num_tokens": 551775220.0, "step": 14467 }, { "epoch": 1.8404783106475002, "ewc_loss": 0.030624112114310265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0624112696386874e-05, "grad_norm": 18.186296463012695, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8629316091537476, "num_tokens": 551816242.0, "step": 14468 }, { "epoch": 1.8406055209260908, "ewc_loss": 0.030675742775201797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.067574289161712e-05, "grad_norm": 18.09991455078125, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.859061062335968, "num_tokens": 551854929.0, "step": 14469 }, { "epoch": 1.8407327312046813, "ewc_loss": 0.03060213476419449, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060213566641323e-05, "grad_norm": 18.099584579467773, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8684391379356384, "num_tokens": 551898877.0, "step": 14470 }, { "epoch": 1.8408599414832718, "ewc_loss": 0.030655406415462494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065540659008548e-05, "grad_norm": 18.138580322265625, "learning_rate": 1e-06, "loss": 0.3519, "mean_token_accuracy": 0.8878452181816101, "num_tokens": 551934252.0, "step": 14471 }, { "epoch": 1.8409871517618623, "ewc_loss": 0.03068050555884838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06805050058756e-05, "grad_norm": 18.0941162109375, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.8823882341384888, "num_tokens": 551974779.0, "step": 14472 }, { "epoch": 1.8411143620404529, "ewc_loss": 0.030654985457658768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0654984584543854e-05, "grad_norm": 23.037616729736328, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8659206628799438, "num_tokens": 552011627.0, "step": 14473 }, { "epoch": 1.8412415723190434, "ewc_loss": 0.03371492773294449, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.371492857695557e-05, "grad_norm": 18.473485946655273, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8499586582183838, "num_tokens": 552045333.0, "step": 14474 }, { "epoch": 1.841368782597634, "ewc_loss": 0.030279582366347313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.027958155144006e-05, "grad_norm": 17.4534969329834, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8650719523429871, "num_tokens": 552079974.0, "step": 14475 }, { "epoch": 1.8414959928762245, "ewc_loss": 0.032903775572776794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290377571829595e-05, "grad_norm": 18.338104248046875, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8783706426620483, "num_tokens": 552117507.0, "step": 14476 }, { "epoch": 1.841623203154815, "ewc_loss": 0.032717011868953705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.271701280027628e-05, "grad_norm": 17.98615264892578, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8701587319374084, "num_tokens": 552156009.0, "step": 14477 }, { "epoch": 1.8417504134334055, "ewc_loss": 0.03281492739915848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.281492899986915e-05, "grad_norm": 18.098697662353516, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8791582584381104, "num_tokens": 552189092.0, "step": 14478 }, { "epoch": 1.841877623711996, "ewc_loss": 0.03327561169862747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.327561353216879e-05, "grad_norm": 18.07318878173828, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8687957525253296, "num_tokens": 552228305.0, "step": 14479 }, { "epoch": 1.8420048339905866, "ewc_loss": 0.03331035375595093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.331035259179771e-05, "grad_norm": 18.137670516967773, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8726710081100464, "num_tokens": 552269402.0, "step": 14480 }, { "epoch": 1.842132044269177, "ewc_loss": 0.033535249531269073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353524880367331e-05, "grad_norm": 18.206506729125977, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8682923316955566, "num_tokens": 552304870.0, "step": 14481 }, { "epoch": 1.8422592545477676, "ewc_loss": 0.033523477613925934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.352347630425356e-05, "grad_norm": 18.291000366210938, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8664587736129761, "num_tokens": 552336802.0, "step": 14482 }, { "epoch": 1.842386464826358, "ewc_loss": 0.0334884412586689, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.348844256834127e-05, "grad_norm": 18.167373657226562, "learning_rate": 1e-06, "loss": 0.3622, "mean_token_accuracy": 0.8841010332107544, "num_tokens": 552376124.0, "step": 14483 }, { "epoch": 1.8425136751049485, "ewc_loss": 0.03331765532493591, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.331765401526354e-05, "grad_norm": 18.248029708862305, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8619582653045654, "num_tokens": 552412285.0, "step": 14484 }, { "epoch": 1.842640885383539, "ewc_loss": 0.0333072803914547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3307282137684524e-05, "grad_norm": 18.248367309570312, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.872847855091095, "num_tokens": 552447527.0, "step": 14485 }, { "epoch": 1.8427680956621295, "ewc_loss": 0.033103905618190765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.310390457045287e-05, "grad_norm": 18.24250030517578, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8740783929824829, "num_tokens": 552486569.0, "step": 14486 }, { "epoch": 1.84289530594072, "ewc_loss": 0.03297571465373039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.297571311122738e-05, "grad_norm": 18.21903419494629, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8358291387557983, "num_tokens": 552529158.0, "step": 14487 }, { "epoch": 1.8430225162193106, "ewc_loss": 0.0328151173889637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.281511817476712e-05, "grad_norm": 18.27081298828125, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.869491457939148, "num_tokens": 552572183.0, "step": 14488 }, { "epoch": 1.8431497264979009, "ewc_loss": 0.03271474689245224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.271474633947946e-05, "grad_norm": 18.18350601196289, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8682290315628052, "num_tokens": 552609785.0, "step": 14489 }, { "epoch": 1.8432769367764914, "ewc_loss": 0.03250698000192642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2506981369806454e-05, "grad_norm": 18.284297943115234, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8770673274993896, "num_tokens": 552641243.0, "step": 14490 }, { "epoch": 1.843404147055082, "ewc_loss": 0.0323922224342823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2392221328336746e-05, "grad_norm": 18.239810943603516, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8623005151748657, "num_tokens": 552677595.0, "step": 14491 }, { "epoch": 1.8435313573336725, "ewc_loss": 0.03219872713088989, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219872814952396e-05, "grad_norm": 18.172103881835938, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8708323240280151, "num_tokens": 552715492.0, "step": 14492 }, { "epoch": 1.843658567612263, "ewc_loss": 0.03207305073738098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.207305053365417e-05, "grad_norm": 18.216705322265625, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.870198130607605, "num_tokens": 552757410.0, "step": 14493 }, { "epoch": 1.8437857778908535, "ewc_loss": 0.03200451284646988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.200451101292856e-05, "grad_norm": 18.18305778503418, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8859161138534546, "num_tokens": 552793794.0, "step": 14494 }, { "epoch": 1.843912988169444, "ewc_loss": 0.03180655837059021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.180655767209828e-05, "grad_norm": 18.203922271728516, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8563568592071533, "num_tokens": 552836010.0, "step": 14495 }, { "epoch": 1.8440401984480346, "ewc_loss": 0.03168823570013046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.168823604937643e-05, "grad_norm": 18.126554489135742, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8725084662437439, "num_tokens": 552874136.0, "step": 14496 }, { "epoch": 1.844167408726625, "ewc_loss": 0.031590551137924194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.159055268042721e-05, "grad_norm": 18.20411491394043, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8675599098205566, "num_tokens": 552912314.0, "step": 14497 }, { "epoch": 1.8442946190052156, "ewc_loss": 0.03154909238219261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1549090635962784e-05, "grad_norm": 18.179401397705078, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8586738109588623, "num_tokens": 552953998.0, "step": 14498 }, { "epoch": 1.8444218292838062, "ewc_loss": 0.03139733523130417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.139733598800376e-05, "grad_norm": 18.133377075195312, "learning_rate": 1e-06, "loss": 0.3502, "mean_token_accuracy": 0.8885653614997864, "num_tokens": 552987310.0, "step": 14499 }, { "epoch": 1.8445490395623967, "ewc_loss": 0.03135678544640541, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1356787076219916e-05, "grad_norm": 18.166109085083008, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8683739900588989, "num_tokens": 553019830.0, "step": 14500 }, { "epoch": 1.8446762498409872, "ewc_loss": 0.03134709596633911, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1347095500677824e-05, "grad_norm": 18.105974197387695, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8696707487106323, "num_tokens": 553066695.0, "step": 14501 }, { "epoch": 1.8448034601195777, "ewc_loss": 0.031226664781570435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.122666385024786e-05, "grad_norm": 18.24551773071289, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8624740839004517, "num_tokens": 553098943.0, "step": 14502 }, { "epoch": 1.8449306703981683, "ewc_loss": 0.03114868700504303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1148687412496656e-05, "grad_norm": 18.06887435913086, "learning_rate": 1e-06, "loss": 0.3914, "mean_token_accuracy": 0.8751142024993896, "num_tokens": 553136106.0, "step": 14503 }, { "epoch": 1.8450578806767588, "ewc_loss": 0.0310207586735487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1020757887745276e-05, "grad_norm": 18.202909469604492, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8775180578231812, "num_tokens": 553170014.0, "step": 14504 }, { "epoch": 1.8451850909553493, "ewc_loss": 0.031091636046767235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1091636628843844e-05, "grad_norm": 18.137741088867188, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8650197386741638, "num_tokens": 553213297.0, "step": 14505 }, { "epoch": 1.8453123012339399, "ewc_loss": 0.030999109148979187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.099910827586427e-05, "grad_norm": 18.167415618896484, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8719267249107361, "num_tokens": 553248559.0, "step": 14506 }, { "epoch": 1.8454395115125302, "ewc_loss": 0.030980536714196205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098053639405407e-05, "grad_norm": 18.13213348388672, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8729859590530396, "num_tokens": 553289140.0, "step": 14507 }, { "epoch": 1.8455667217911207, "ewc_loss": 0.03091682866215706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.091682810918428e-05, "grad_norm": 18.096309661865234, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8633702993392944, "num_tokens": 553330619.0, "step": 14508 }, { "epoch": 1.8456939320697112, "ewc_loss": 0.030894778668880463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.08947783196345e-05, "grad_norm": 18.151445388793945, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.881775438785553, "num_tokens": 553371295.0, "step": 14509 }, { "epoch": 1.8458211423483017, "ewc_loss": 0.030905961990356445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.09059614664875e-05, "grad_norm": 18.11244010925293, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.867438554763794, "num_tokens": 553410237.0, "step": 14510 }, { "epoch": 1.8459483526268923, "ewc_loss": 0.03086911514401436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.086911601712927e-05, "grad_norm": 18.21745491027832, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8776373267173767, "num_tokens": 553446727.0, "step": 14511 }, { "epoch": 1.8460755629054828, "ewc_loss": 0.030850796028971672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.085079515585676e-05, "grad_norm": 18.096120834350586, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8772429823875427, "num_tokens": 553488626.0, "step": 14512 }, { "epoch": 1.846202773184073, "ewc_loss": 0.03080965019762516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.080964961554855e-05, "grad_norm": 18.178709030151367, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8636561632156372, "num_tokens": 553528294.0, "step": 14513 }, { "epoch": 1.8463299834626636, "ewc_loss": 0.030803147703409195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.080314854742028e-05, "grad_norm": 18.19344139099121, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.874466061592102, "num_tokens": 553563738.0, "step": 14514 }, { "epoch": 1.8464571937412542, "ewc_loss": 0.030763596296310425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.076359644182958e-05, "grad_norm": 18.129384994506836, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.8390097618103027, "num_tokens": 553603351.0, "step": 14515 }, { "epoch": 1.8465844040198447, "ewc_loss": 0.03071136213839054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0711362342117354e-05, "grad_norm": 18.157363891601562, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8695103526115417, "num_tokens": 553643655.0, "step": 14516 }, { "epoch": 1.8467116142984352, "ewc_loss": 0.030840612947940826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.084061245317571e-05, "grad_norm": 18.158889770507812, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.883010983467102, "num_tokens": 553677944.0, "step": 14517 }, { "epoch": 1.8468388245770258, "ewc_loss": 0.030707350000739098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.070734965149313e-05, "grad_norm": 18.147279739379883, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8665359616279602, "num_tokens": 553712120.0, "step": 14518 }, { "epoch": 1.8469660348556163, "ewc_loss": 0.03074479103088379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.074479172937572e-05, "grad_norm": 18.121837615966797, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.8740293383598328, "num_tokens": 553751442.0, "step": 14519 }, { "epoch": 1.8470932451342068, "ewc_loss": 0.030708597972989082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0708597478223965e-05, "grad_norm": 18.091136932373047, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8770473599433899, "num_tokens": 553782924.0, "step": 14520 }, { "epoch": 1.8472204554127973, "ewc_loss": 0.03077678382396698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0776784115005285e-05, "grad_norm": 18.145387649536133, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8620346784591675, "num_tokens": 553818861.0, "step": 14521 }, { "epoch": 1.8473476656913879, "ewc_loss": 0.030699342489242554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069934246013872e-05, "grad_norm": 18.096242904663086, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8701756000518799, "num_tokens": 553855593.0, "step": 14522 }, { "epoch": 1.8474748759699784, "ewc_loss": 0.030707458034157753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0707458790857345e-05, "grad_norm": 18.190677642822266, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8693152666091919, "num_tokens": 553896028.0, "step": 14523 }, { "epoch": 1.847602086248569, "ewc_loss": 0.030746763572096825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0746763513889164e-05, "grad_norm": 18.13433074951172, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8752559423446655, "num_tokens": 553935212.0, "step": 14524 }, { "epoch": 1.8477292965271594, "ewc_loss": 0.03068789653480053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.068789737881161e-05, "grad_norm": 18.100830078125, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8697009086608887, "num_tokens": 553971231.0, "step": 14525 }, { "epoch": 1.84785650680575, "ewc_loss": 0.030718034133315086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.071803439524956e-05, "grad_norm": 18.19316864013672, "learning_rate": 1e-06, "loss": 0.3756, "mean_token_accuracy": 0.8785754442214966, "num_tokens": 554006782.0, "step": 14526 }, { "epoch": 1.8479837170843405, "ewc_loss": 0.030738720670342445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0738719942746684e-05, "grad_norm": 18.210756301879883, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8639988899230957, "num_tokens": 554043427.0, "step": 14527 }, { "epoch": 1.848110927362931, "ewc_loss": 0.03067176602780819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.067176658078097e-05, "grad_norm": 18.11017417907715, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8599432706832886, "num_tokens": 554084187.0, "step": 14528 }, { "epoch": 1.8482381376415216, "ewc_loss": 0.030677197501063347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0677198083139956e-05, "grad_norm": 18.21504020690918, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8706092834472656, "num_tokens": 554122186.0, "step": 14529 }, { "epoch": 1.848365347920112, "ewc_loss": 0.030681414529681206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0681414500577375e-05, "grad_norm": 18.167163848876953, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8700653910636902, "num_tokens": 554161388.0, "step": 14530 }, { "epoch": 1.8484925581987026, "ewc_loss": 0.03063351660966873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0633516871603206e-05, "grad_norm": 18.238840103149414, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8626901507377625, "num_tokens": 554199837.0, "step": 14531 }, { "epoch": 1.848619768477293, "ewc_loss": 0.030634000897407532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063400072278455e-05, "grad_norm": 18.113481521606445, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8613370656967163, "num_tokens": 554239567.0, "step": 14532 }, { "epoch": 1.8487469787558835, "ewc_loss": 0.030578630045056343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.057863068534061e-05, "grad_norm": 18.174787521362305, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8740462064743042, "num_tokens": 554277040.0, "step": 14533 }, { "epoch": 1.848874189034474, "ewc_loss": 0.030685536563396454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.068553633056581e-05, "grad_norm": 18.14055061340332, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8746495246887207, "num_tokens": 554318994.0, "step": 14534 }, { "epoch": 1.8490013993130645, "ewc_loss": 0.03062717616558075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0627175874542445e-05, "grad_norm": 18.108491897583008, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8645370006561279, "num_tokens": 554355334.0, "step": 14535 }, { "epoch": 1.849128609591655, "ewc_loss": 0.03063281811773777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0632818379672244e-05, "grad_norm": 18.121065139770508, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8599748015403748, "num_tokens": 554396504.0, "step": 14536 }, { "epoch": 1.8492558198702456, "ewc_loss": 0.03068932332098484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.068932346650399e-05, "grad_norm": 18.16253662109375, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8772352337837219, "num_tokens": 554429772.0, "step": 14537 }, { "epoch": 1.8493830301488359, "ewc_loss": 0.03069516271352768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0695162422489375e-05, "grad_norm": 18.164697647094727, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.872130274772644, "num_tokens": 554470102.0, "step": 14538 }, { "epoch": 1.8495102404274264, "ewc_loss": 0.030628714710474014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0628714739577845e-05, "grad_norm": 18.054861068725586, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8621011972427368, "num_tokens": 554511632.0, "step": 14539 }, { "epoch": 1.849637450706017, "ewc_loss": 0.03062286414206028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062286486965604e-05, "grad_norm": 18.15218734741211, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.866793692111969, "num_tokens": 554553028.0, "step": 14540 }, { "epoch": 1.8497646609846075, "ewc_loss": 0.030730709433555603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.073070911341347e-05, "grad_norm": 18.251277923583984, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8646928071975708, "num_tokens": 554585109.0, "step": 14541 }, { "epoch": 1.849891871263198, "ewc_loss": 0.03065032884478569, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0650327971670777e-05, "grad_norm": 18.085865020751953, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8716025352478027, "num_tokens": 554621141.0, "step": 14542 }, { "epoch": 1.8500190815417885, "ewc_loss": 0.03063896670937538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063896656385623e-05, "grad_norm": 18.188936233520508, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8698880076408386, "num_tokens": 554660838.0, "step": 14543 }, { "epoch": 1.850146291820379, "ewc_loss": 0.03074028342962265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0740284273633733e-05, "grad_norm": 18.142066955566406, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8724032640457153, "num_tokens": 554696338.0, "step": 14544 }, { "epoch": 1.8502735020989696, "ewc_loss": 0.030611710622906685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06117108266335e-05, "grad_norm": 18.107696533203125, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.8824644088745117, "num_tokens": 554729782.0, "step": 14545 }, { "epoch": 1.85040071237756, "ewc_loss": 0.03060496598482132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060496601392515e-05, "grad_norm": 18.026182174682617, "learning_rate": 1e-06, "loss": 0.3569, "mean_token_accuracy": 0.8861461877822876, "num_tokens": 554767566.0, "step": 14546 }, { "epoch": 1.8505279226561506, "ewc_loss": 0.030613653361797333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0613653507316485e-05, "grad_norm": 18.130170822143555, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8755844235420227, "num_tokens": 554806063.0, "step": 14547 }, { "epoch": 1.8506551329347412, "ewc_loss": 0.030731339007616043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0731338483747095e-05, "grad_norm": 18.11665916442871, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8670446872711182, "num_tokens": 554839418.0, "step": 14548 }, { "epoch": 1.8507823432133317, "ewc_loss": 0.03066055104136467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0660550692118704e-05, "grad_norm": 18.152748107910156, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8661966323852539, "num_tokens": 554881888.0, "step": 14549 }, { "epoch": 1.8509095534919222, "ewc_loss": 0.030667174607515335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.066717545152642e-05, "grad_norm": 18.06900978088379, "learning_rate": 1e-06, "loss": 0.3517, "mean_token_accuracy": 0.8888951539993286, "num_tokens": 554921561.0, "step": 14550 }, { "epoch": 1.8510367637705127, "ewc_loss": 0.030606042593717575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060604285565205e-05, "grad_norm": 18.128955841064453, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8464662432670593, "num_tokens": 554954380.0, "step": 14551 }, { "epoch": 1.8511639740491033, "ewc_loss": 0.030671624466776848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.067162469960749e-05, "grad_norm": 18.134096145629883, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8710978031158447, "num_tokens": 554994781.0, "step": 14552 }, { "epoch": 1.8512911843276938, "ewc_loss": 0.03068011999130249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.068011938012205e-05, "grad_norm": 18.19499397277832, "learning_rate": 1e-06, "loss": 0.3782, "mean_token_accuracy": 0.8793456554412842, "num_tokens": 555024938.0, "step": 14553 }, { "epoch": 1.8514183946062843, "ewc_loss": 0.030682062730193138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.068206206080504e-05, "grad_norm": 18.11672019958496, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8588569164276123, "num_tokens": 555059785.0, "step": 14554 }, { "epoch": 1.8515456048848749, "ewc_loss": 0.030592134222388268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.059213486267254e-05, "grad_norm": 18.1491756439209, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.845306932926178, "num_tokens": 555097502.0, "step": 14555 }, { "epoch": 1.8516728151634652, "ewc_loss": 0.03071298450231552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.071298488066532e-05, "grad_norm": 18.098257064819336, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8687570691108704, "num_tokens": 555139976.0, "step": 14556 }, { "epoch": 1.8518000254420557, "ewc_loss": 0.030638398602604866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063839903916232e-05, "grad_norm": 18.097843170166016, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8732016682624817, "num_tokens": 555181250.0, "step": 14557 }, { "epoch": 1.8519272357206462, "ewc_loss": 0.03070923686027527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.070923776249401e-05, "grad_norm": 18.156387329101562, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8510043025016785, "num_tokens": 555214927.0, "step": 14558 }, { "epoch": 1.8520544459992367, "ewc_loss": 0.030664829537272453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0664828955195844e-05, "grad_norm": 18.11684799194336, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8669924736022949, "num_tokens": 555255795.0, "step": 14559 }, { "epoch": 1.8521816562778273, "ewc_loss": 0.03070083074271679, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0700830393470824e-05, "grad_norm": 18.16274642944336, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8731116056442261, "num_tokens": 555294177.0, "step": 14560 }, { "epoch": 1.8523088665564178, "ewc_loss": 0.03071887046098709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.071887113037519e-05, "grad_norm": 18.162248611450195, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.8828974962234497, "num_tokens": 555331548.0, "step": 14561 }, { "epoch": 1.852436076835008, "ewc_loss": 0.03067120909690857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.067120997002348e-05, "grad_norm": 18.138656616210938, "learning_rate": 1e-06, "loss": 0.3672, "mean_token_accuracy": 0.8831257820129395, "num_tokens": 555368426.0, "step": 14562 }, { "epoch": 1.8525632871135986, "ewc_loss": 0.030655017122626305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065501732635312e-05, "grad_norm": 18.183969497680664, "learning_rate": 1e-06, "loss": 0.3597, "mean_token_accuracy": 0.8875507116317749, "num_tokens": 555411594.0, "step": 14563 }, { "epoch": 1.8526904973921892, "ewc_loss": 0.03067035786807537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0670358682982624e-05, "grad_norm": 18.061424255371094, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8619891405105591, "num_tokens": 555450412.0, "step": 14564 }, { "epoch": 1.8528177076707797, "ewc_loss": 0.03061961941421032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0619619792560115e-05, "grad_norm": 18.18219757080078, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8789589405059814, "num_tokens": 555489067.0, "step": 14565 }, { "epoch": 1.8529449179493702, "ewc_loss": 0.0307207889854908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.072078834520653e-05, "grad_norm": 18.14468765258789, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8761630654335022, "num_tokens": 555524552.0, "step": 14566 }, { "epoch": 1.8530721282279607, "ewc_loss": 0.030663738027215004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.066373756155372e-05, "grad_norm": 18.258602142333984, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8439853191375732, "num_tokens": 555563766.0, "step": 14567 }, { "epoch": 1.8531993385065513, "ewc_loss": 0.03062429092824459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062429095734842e-05, "grad_norm": 18.156110763549805, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8772925138473511, "num_tokens": 555601410.0, "step": 14568 }, { "epoch": 1.8533265487851418, "ewc_loss": 0.03056965209543705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.056965215364471e-05, "grad_norm": 18.12923240661621, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8701429963111877, "num_tokens": 555641346.0, "step": 14569 }, { "epoch": 1.8534537590637323, "ewc_loss": 0.030624618753790855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062461837544106e-05, "grad_norm": 18.14100456237793, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8802375793457031, "num_tokens": 555676537.0, "step": 14570 }, { "epoch": 1.8535809693423229, "ewc_loss": 0.030643295496702194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064329575863667e-05, "grad_norm": 18.13901710510254, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8555596470832825, "num_tokens": 555721634.0, "step": 14571 }, { "epoch": 1.8537081796209134, "ewc_loss": 0.03062046319246292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062046380364336e-05, "grad_norm": 18.13637351989746, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.870236873626709, "num_tokens": 555765447.0, "step": 14572 }, { "epoch": 1.853835389899504, "ewc_loss": 0.03063192032277584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063191979890689e-05, "grad_norm": 18.234834671020508, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8697945475578308, "num_tokens": 555798598.0, "step": 14573 }, { "epoch": 1.8539626001780944, "ewc_loss": 0.03063712641596794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063712574657984e-05, "grad_norm": 18.167543411254883, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8792580366134644, "num_tokens": 555829465.0, "step": 14574 }, { "epoch": 1.854089810456685, "ewc_loss": 0.030613340437412262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0613340641139075e-05, "grad_norm": 18.23151969909668, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8718903660774231, "num_tokens": 555866577.0, "step": 14575 }, { "epoch": 1.8542170207352755, "ewc_loss": 0.03063783422112465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0637835152447224e-05, "grad_norm": 18.1740665435791, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8745868802070618, "num_tokens": 555900472.0, "step": 14576 }, { "epoch": 1.854344231013866, "ewc_loss": 0.030606480315327644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06064794131089e-05, "grad_norm": 18.26365852355957, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.866066575050354, "num_tokens": 555936356.0, "step": 14577 }, { "epoch": 1.8544714412924566, "ewc_loss": 0.030671024695038795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.067102443310432e-05, "grad_norm": 18.181140899658203, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8643754720687866, "num_tokens": 555975338.0, "step": 14578 }, { "epoch": 1.854598651571047, "ewc_loss": 0.030636295676231384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0636296287411824e-05, "grad_norm": 18.319011688232422, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8607168793678284, "num_tokens": 556019651.0, "step": 14579 }, { "epoch": 1.8547258618496376, "ewc_loss": 0.03063984028995037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.063983967876993e-05, "grad_norm": 18.104991912841797, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8658477663993835, "num_tokens": 556054530.0, "step": 14580 }, { "epoch": 1.854853072128228, "ewc_loss": 0.03053911402821541, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.053911495953798e-05, "grad_norm": 18.195537567138672, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8661549091339111, "num_tokens": 556091985.0, "step": 14581 }, { "epoch": 1.8549802824068184, "ewc_loss": 0.0306948721408844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069487138418481e-05, "grad_norm": 18.153493881225586, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8734898567199707, "num_tokens": 556128777.0, "step": 14582 }, { "epoch": 1.855107492685409, "ewc_loss": 0.03060724027454853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.060723975067958e-05, "grad_norm": 18.088821411132812, "learning_rate": 1e-06, "loss": 0.3446, "mean_token_accuracy": 0.8893750905990601, "num_tokens": 556164286.0, "step": 14583 }, { "epoch": 1.8552347029639995, "ewc_loss": 0.03065444901585579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065444980165921e-05, "grad_norm": 18.12266731262207, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.8777347803115845, "num_tokens": 556199141.0, "step": 14584 }, { "epoch": 1.85536191324259, "ewc_loss": 0.030670085921883583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.067008583457209e-05, "grad_norm": 18.136302947998047, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8758636713027954, "num_tokens": 556237104.0, "step": 14585 }, { "epoch": 1.8554891235211806, "ewc_loss": 0.030698277056217194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0698276532348245e-05, "grad_norm": 18.138837814331055, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8596369028091431, "num_tokens": 556275343.0, "step": 14586 }, { "epoch": 1.8556163337997709, "ewc_loss": 0.030745821073651314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.074582127737813e-05, "grad_norm": 18.192991256713867, "learning_rate": 1e-06, "loss": 0.3523, "mean_token_accuracy": 0.8855514526367188, "num_tokens": 556316539.0, "step": 14587 }, { "epoch": 1.8557435440783614, "ewc_loss": 0.030708758160471916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0708757549291477e-05, "grad_norm": 18.120471954345703, "learning_rate": 1e-06, "loss": 0.3646, "mean_token_accuracy": 0.8837409019470215, "num_tokens": 556351101.0, "step": 14588 }, { "epoch": 1.855870754356952, "ewc_loss": 0.030746955424547195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.074695632676594e-05, "grad_norm": 18.209108352661133, "learning_rate": 1e-06, "loss": 0.3613, "mean_token_accuracy": 0.8826490640640259, "num_tokens": 556382458.0, "step": 14589 }, { "epoch": 1.8559979646355425, "ewc_loss": 0.0307475533336401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.07475529552903e-05, "grad_norm": 18.121728897094727, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8726902008056641, "num_tokens": 556416980.0, "step": 14590 }, { "epoch": 1.856125174914133, "ewc_loss": 0.030746014788746834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.07460140902549e-05, "grad_norm": 18.204618453979492, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8774930834770203, "num_tokens": 556451868.0, "step": 14591 }, { "epoch": 1.8562523851927235, "ewc_loss": 0.030746890231966972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.074689084314741e-05, "grad_norm": 18.175397872924805, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8587418794631958, "num_tokens": 556485682.0, "step": 14592 }, { "epoch": 1.856379595471314, "ewc_loss": 0.030706170946359634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0706170946359634e-05, "grad_norm": 18.120389938354492, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8817226886749268, "num_tokens": 556524857.0, "step": 14593 }, { "epoch": 1.8565068057499046, "ewc_loss": 0.03074980154633522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0749801226193085e-05, "grad_norm": 18.165281295776367, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8754960298538208, "num_tokens": 556565763.0, "step": 14594 }, { "epoch": 1.856634016028495, "ewc_loss": 0.03077162615954876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.077162546105683e-05, "grad_norm": 18.1402587890625, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8591036200523376, "num_tokens": 556606202.0, "step": 14595 }, { "epoch": 1.8567612263070856, "ewc_loss": 0.030795734375715256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0795734346611425e-05, "grad_norm": 18.170188903808594, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8798927068710327, "num_tokens": 556646906.0, "step": 14596 }, { "epoch": 1.8568884365856761, "ewc_loss": 0.030772943049669266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0772942409384996e-05, "grad_norm": 18.143653869628906, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8732144832611084, "num_tokens": 556684249.0, "step": 14597 }, { "epoch": 1.8570156468642667, "ewc_loss": 0.030722323805093765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.072232357226312e-05, "grad_norm": 18.13233184814453, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8598713874816895, "num_tokens": 556726840.0, "step": 14598 }, { "epoch": 1.8571428571428572, "ewc_loss": 0.03075542114675045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.075542190345004e-05, "grad_norm": 18.084638595581055, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8748730421066284, "num_tokens": 556764822.0, "step": 14599 }, { "epoch": 1.8572700674214477, "ewc_loss": 0.030719202011823654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.071920218644664e-05, "grad_norm": 18.18210792541504, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8688968420028687, "num_tokens": 556800884.0, "step": 14600 }, { "epoch": 1.8573972777000383, "ewc_loss": 0.030778301879763603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.077830115216784e-05, "grad_norm": 18.104190826416016, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8768259286880493, "num_tokens": 556841233.0, "step": 14601 }, { "epoch": 1.8575244879786288, "ewc_loss": 0.03072645701467991, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.072645631618798e-05, "grad_norm": 18.234134674072266, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8648826479911804, "num_tokens": 556880995.0, "step": 14602 }, { "epoch": 1.8576516982572193, "ewc_loss": 0.030747640877962112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.074764026678167e-05, "grad_norm": 18.13498306274414, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.876319944858551, "num_tokens": 556914367.0, "step": 14603 }, { "epoch": 1.8577789085358098, "ewc_loss": 0.030618194490671158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0618193704867736e-05, "grad_norm": 18.224802017211914, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.851887583732605, "num_tokens": 556955944.0, "step": 14604 }, { "epoch": 1.8579061188144002, "ewc_loss": 0.030736088752746582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.073608968406916e-05, "grad_norm": 18.182458877563477, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8543041944503784, "num_tokens": 556989851.0, "step": 14605 }, { "epoch": 1.8580333290929907, "ewc_loss": 0.030587511137127876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0587510991608724e-05, "grad_norm": 18.07550811767578, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8706878423690796, "num_tokens": 557033662.0, "step": 14606 }, { "epoch": 1.8581605393715812, "ewc_loss": 0.03067701682448387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06770161841996e-05, "grad_norm": 18.169933319091797, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8639068007469177, "num_tokens": 557074869.0, "step": 14607 }, { "epoch": 1.8582877496501717, "ewc_loss": 0.03064459003508091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064459087909199e-05, "grad_norm": 18.1704044342041, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8601876497268677, "num_tokens": 557110164.0, "step": 14608 }, { "epoch": 1.8584149599287623, "ewc_loss": 0.030684055760502815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0684055673191324e-05, "grad_norm": 18.14495086669922, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8710222244262695, "num_tokens": 557149309.0, "step": 14609 }, { "epoch": 1.8585421702073528, "ewc_loss": 0.030626626685261726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0626626539742574e-05, "grad_norm": 18.1882266998291, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8658448457717896, "num_tokens": 557180510.0, "step": 14610 }, { "epoch": 1.858669380485943, "ewc_loss": 0.03067978098988533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.067978104809299e-05, "grad_norm": 18.217002868652344, "learning_rate": 1e-06, "loss": 0.3822, "mean_token_accuracy": 0.8770918846130371, "num_tokens": 557218062.0, "step": 14611 }, { "epoch": 1.8587965907645336, "ewc_loss": 0.03071398101747036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.071398168685846e-05, "grad_norm": 18.169240951538086, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8632588982582092, "num_tokens": 557258963.0, "step": 14612 }, { "epoch": 1.8589238010431242, "ewc_loss": 0.030650975182652473, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065097553189844e-05, "grad_norm": 18.18825912475586, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8710383176803589, "num_tokens": 557296046.0, "step": 14613 }, { "epoch": 1.8590510113217147, "ewc_loss": 0.030638696625828743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0638697353424504e-05, "grad_norm": 18.14485740661621, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8688286542892456, "num_tokens": 557328395.0, "step": 14614 }, { "epoch": 1.8591782216003052, "ewc_loss": 0.030654773116111755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065477358177304e-05, "grad_norm": 18.155092239379883, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8740551471710205, "num_tokens": 557363888.0, "step": 14615 }, { "epoch": 1.8593054318788957, "ewc_loss": 0.03067832998931408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.067832949454896e-05, "grad_norm": 18.083202362060547, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8635427951812744, "num_tokens": 557402944.0, "step": 14616 }, { "epoch": 1.8594326421574863, "ewc_loss": 0.030713075771927834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0713075830135494e-05, "grad_norm": 18.265209197998047, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8767911791801453, "num_tokens": 557442247.0, "step": 14617 }, { "epoch": 1.8595598524360768, "ewc_loss": 0.030794352293014526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.079435191466473e-05, "grad_norm": 18.101242065429688, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.86519455909729, "num_tokens": 557483580.0, "step": 14618 }, { "epoch": 1.8596870627146673, "ewc_loss": 0.030624505132436752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.062450559809804e-05, "grad_norm": 18.20743751525879, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8754516839981079, "num_tokens": 557520134.0, "step": 14619 }, { "epoch": 1.8598142729932579, "ewc_loss": 0.03074190393090248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.074190317420289e-05, "grad_norm": 18.135766983032227, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8653465509414673, "num_tokens": 557565726.0, "step": 14620 }, { "epoch": 1.8599414832718484, "ewc_loss": 0.030708689242601395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.070868842769414e-05, "grad_norm": 18.15455436706543, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8694589138031006, "num_tokens": 557602871.0, "step": 14621 }, { "epoch": 1.860068693550439, "ewc_loss": 0.030723923817276955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.072392428293824e-05, "grad_norm": 18.274444580078125, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8616425395011902, "num_tokens": 557640873.0, "step": 14622 }, { "epoch": 1.8601959038290294, "ewc_loss": 0.030740756541490555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0740757210878655e-05, "grad_norm": 18.151874542236328, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8637775182723999, "num_tokens": 557684233.0, "step": 14623 }, { "epoch": 1.86032311410762, "ewc_loss": 0.030648458749055862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064845805056393e-05, "grad_norm": 18.221580505371094, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8747725486755371, "num_tokens": 557716895.0, "step": 14624 }, { "epoch": 1.8604503243862105, "ewc_loss": 0.0307167898863554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0716790206497535e-05, "grad_norm": 18.105772018432617, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8663662075996399, "num_tokens": 557758197.0, "step": 14625 }, { "epoch": 1.860577534664801, "ewc_loss": 0.030659455806016922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.065945566049777e-05, "grad_norm": 18.198217391967773, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8431645035743713, "num_tokens": 557794196.0, "step": 14626 }, { "epoch": 1.8607047449433916, "ewc_loss": 0.03069349192082882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069349259021692e-05, "grad_norm": 18.16583824157715, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8540993928909302, "num_tokens": 557833856.0, "step": 14627 }, { "epoch": 1.860831955221982, "ewc_loss": 0.030661998316645622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.066199860768393e-05, "grad_norm": 18.10425567626953, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8591614365577698, "num_tokens": 557866304.0, "step": 14628 }, { "epoch": 1.8609591655005726, "ewc_loss": 0.03068849816918373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.068849764531478e-05, "grad_norm": 18.147777557373047, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8829421401023865, "num_tokens": 557907065.0, "step": 14629 }, { "epoch": 1.861086375779163, "ewc_loss": 0.030743518844246864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.074351843679324e-05, "grad_norm": 18.123355865478516, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8667661547660828, "num_tokens": 557947326.0, "step": 14630 }, { "epoch": 1.8612135860577534, "ewc_loss": 0.030707528814673424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.070752791245468e-05, "grad_norm": 18.139612197875977, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8691977262496948, "num_tokens": 557990059.0, "step": 14631 }, { "epoch": 1.861340796336344, "ewc_loss": 0.030711820349097252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.071182072744705e-05, "grad_norm": 18.136648178100586, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8720592856407166, "num_tokens": 558028406.0, "step": 14632 }, { "epoch": 1.8614680066149345, "ewc_loss": 0.030694518238306046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069451850024052e-05, "grad_norm": 18.225597381591797, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8654042482376099, "num_tokens": 558070239.0, "step": 14633 }, { "epoch": 1.861595216893525, "ewc_loss": 0.030718468129634857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0718467314727604e-05, "grad_norm": 18.061670303344727, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8739860653877258, "num_tokens": 558107685.0, "step": 14634 }, { "epoch": 1.8617224271721156, "ewc_loss": 0.03073355369269848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0733554012840614e-05, "grad_norm": 18.313383102416992, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8646858930587769, "num_tokens": 558143185.0, "step": 14635 }, { "epoch": 1.8618496374507059, "ewc_loss": 0.03073922172188759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.073922198382206e-05, "grad_norm": 18.130672454833984, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8783066272735596, "num_tokens": 558176207.0, "step": 14636 }, { "epoch": 1.8619768477292964, "ewc_loss": 0.03061172552406788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0611725378548726e-05, "grad_norm": 18.184350967407227, "learning_rate": 1e-06, "loss": 0.3519, "mean_token_accuracy": 0.8862877488136292, "num_tokens": 558212215.0, "step": 14637 }, { "epoch": 1.862104058007887, "ewc_loss": 0.030693136155605316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0693136068293825e-05, "grad_norm": 18.140548706054688, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.8801159262657166, "num_tokens": 558248471.0, "step": 14638 }, { "epoch": 1.8622312682864774, "ewc_loss": 0.030623149126768112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0623148632002994e-05, "grad_norm": 18.200519561767578, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8572331666946411, "num_tokens": 558285211.0, "step": 14639 }, { "epoch": 1.862358478565068, "ewc_loss": 0.030733183026313782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.073318293900229e-05, "grad_norm": 18.214616775512695, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8642270565032959, "num_tokens": 558314434.0, "step": 14640 }, { "epoch": 1.8624856888436585, "ewc_loss": 0.03065687231719494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0656872695544735e-05, "grad_norm": 18.17509651184082, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8777997493743896, "num_tokens": 558353710.0, "step": 14641 }, { "epoch": 1.862612899122249, "ewc_loss": 0.0307359229773283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.073592233704403e-05, "grad_norm": 18.18003273010254, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8576425909996033, "num_tokens": 558390476.0, "step": 14642 }, { "epoch": 1.8627401094008396, "ewc_loss": 0.03066844493150711, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.066844510613009e-05, "grad_norm": 18.144052505493164, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8791748881340027, "num_tokens": 558424917.0, "step": 14643 }, { "epoch": 1.86286731967943, "ewc_loss": 0.03068319708108902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.068319711019285e-05, "grad_norm": 18.19626235961914, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.864446759223938, "num_tokens": 558459831.0, "step": 14644 }, { "epoch": 1.8629945299580206, "ewc_loss": 0.03076876886188984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.076876964769326e-05, "grad_norm": 18.15748405456543, "learning_rate": 1e-06, "loss": 0.358, "mean_token_accuracy": 0.8856790065765381, "num_tokens": 558498600.0, "step": 14645 }, { "epoch": 1.8631217402366111, "ewc_loss": 0.030698109418153763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069810918532312e-05, "grad_norm": 18.145097732543945, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8603995442390442, "num_tokens": 558537465.0, "step": 14646 }, { "epoch": 1.8632489505152017, "ewc_loss": 0.03075295127928257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.075295171584003e-05, "grad_norm": 18.164287567138672, "learning_rate": 1e-06, "loss": 0.3438, "mean_token_accuracy": 0.8899710178375244, "num_tokens": 558575357.0, "step": 14647 }, { "epoch": 1.8633761607937922, "ewc_loss": 0.030737239867448807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.07372392853722e-05, "grad_norm": 18.142271041870117, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8675342202186584, "num_tokens": 558619422.0, "step": 14648 }, { "epoch": 1.8635033710723827, "ewc_loss": 0.030735457316040993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.073545667575672e-05, "grad_norm": 18.126117706298828, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8614662289619446, "num_tokens": 558657677.0, "step": 14649 }, { "epoch": 1.8636305813509733, "ewc_loss": 0.030754007399082184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.075400672969408e-05, "grad_norm": 18.174930572509766, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.8778373003005981, "num_tokens": 558692912.0, "step": 14650 }, { "epoch": 1.8637577916295638, "ewc_loss": 0.030806055292487144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0806055292487144e-05, "grad_norm": 18.163633346557617, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8824479579925537, "num_tokens": 558733470.0, "step": 14651 }, { "epoch": 1.8638850019081543, "ewc_loss": 0.030748942866921425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.074894266319461e-05, "grad_norm": 18.202390670776367, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8658877611160278, "num_tokens": 558771899.0, "step": 14652 }, { "epoch": 1.8640122121867448, "ewc_loss": 0.03087448514997959, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.087448567384854e-05, "grad_norm": 18.257200241088867, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8766293525695801, "num_tokens": 558813002.0, "step": 14653 }, { "epoch": 1.8641394224653351, "ewc_loss": 0.030729668214917183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.072966865147464e-05, "grad_norm": 18.153024673461914, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.854941725730896, "num_tokens": 558850827.0, "step": 14654 }, { "epoch": 1.8642666327439257, "ewc_loss": 0.030692005529999733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069200465688482e-05, "grad_norm": 18.23019790649414, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8673592805862427, "num_tokens": 558889003.0, "step": 14655 }, { "epoch": 1.8643938430225162, "ewc_loss": 0.030787089839577675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0787090508965775e-05, "grad_norm": 18.142982482910156, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8768337965011597, "num_tokens": 558925849.0, "step": 14656 }, { "epoch": 1.8645210533011067, "ewc_loss": 0.030703337863087654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.070333696086891e-05, "grad_norm": 18.22861099243164, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8701931238174438, "num_tokens": 558967471.0, "step": 14657 }, { "epoch": 1.8646482635796973, "ewc_loss": 0.03075377829372883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0753777537029237e-05, "grad_norm": 18.130464553833008, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8739857077598572, "num_tokens": 559001938.0, "step": 14658 }, { "epoch": 1.8647754738582878, "ewc_loss": 0.030671002343297005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.067100260523148e-05, "grad_norm": 18.157920837402344, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8762415051460266, "num_tokens": 559040515.0, "step": 14659 }, { "epoch": 1.864902684136878, "ewc_loss": 0.030787333846092224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.078733425354585e-05, "grad_norm": 18.247663497924805, "learning_rate": 1e-06, "loss": 0.3757, "mean_token_accuracy": 0.8826062083244324, "num_tokens": 559077702.0, "step": 14660 }, { "epoch": 1.8650298944154686, "ewc_loss": 0.030707694590091705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0707695259479806e-05, "grad_norm": 18.174989700317383, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8632872104644775, "num_tokens": 559114741.0, "step": 14661 }, { "epoch": 1.8651571046940592, "ewc_loss": 0.03071569837629795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.071569881285541e-05, "grad_norm": 18.157262802124023, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8493106365203857, "num_tokens": 559155844.0, "step": 14662 }, { "epoch": 1.8652843149726497, "ewc_loss": 0.030707618221640587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.070761886192486e-05, "grad_norm": 18.212116241455078, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8723974823951721, "num_tokens": 559194964.0, "step": 14663 }, { "epoch": 1.8654115252512402, "ewc_loss": 0.03073885850608349, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0738858185941353e-05, "grad_norm": 18.09303855895996, "learning_rate": 1e-06, "loss": 0.3799, "mean_token_accuracy": 0.8764610290527344, "num_tokens": 559230083.0, "step": 14664 }, { "epoch": 1.8655387355298307, "ewc_loss": 0.03073044680058956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.073044717893936e-05, "grad_norm": 18.244747161865234, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8812879323959351, "num_tokens": 559268397.0, "step": 14665 }, { "epoch": 1.8656659458084213, "ewc_loss": 0.03076022118330002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0760220397496596e-05, "grad_norm": 18.093017578125, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.861930251121521, "num_tokens": 559302024.0, "step": 14666 }, { "epoch": 1.8657931560870118, "ewc_loss": 0.030668586492538452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.066858698730357e-05, "grad_norm": 18.1715087890625, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.868238627910614, "num_tokens": 559343192.0, "step": 14667 }, { "epoch": 1.8659203663656023, "ewc_loss": 0.030803920701146126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.080391979892738e-05, "grad_norm": 18.144269943237305, "learning_rate": 1e-06, "loss": 0.3373, "mean_token_accuracy": 0.8930612802505493, "num_tokens": 559380616.0, "step": 14668 }, { "epoch": 1.8660475766441929, "ewc_loss": 0.030679097399115562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.067909710807726e-05, "grad_norm": 18.146142959594727, "learning_rate": 1e-06, "loss": 0.3795, "mean_token_accuracy": 0.8810174465179443, "num_tokens": 559413621.0, "step": 14669 }, { "epoch": 1.8661747869227834, "ewc_loss": 0.030789902433753014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.078990266658366e-05, "grad_norm": 18.223989486694336, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8798733949661255, "num_tokens": 559447274.0, "step": 14670 }, { "epoch": 1.866301997201374, "ewc_loss": 0.03073790855705738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.07379086734727e-05, "grad_norm": 18.086322784423828, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8802750110626221, "num_tokens": 559484306.0, "step": 14671 }, { "epoch": 1.8664292074799644, "ewc_loss": 0.03075803816318512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.075803761021234e-05, "grad_norm": 18.194610595703125, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.8769326210021973, "num_tokens": 559527126.0, "step": 14672 }, { "epoch": 1.866556417758555, "ewc_loss": 0.03080727718770504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.080727765336633e-05, "grad_norm": 18.06243324279785, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8746911883354187, "num_tokens": 559568342.0, "step": 14673 }, { "epoch": 1.8666836280371455, "ewc_loss": 0.030777672305703163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0777671781834215e-05, "grad_norm": 18.212963104248047, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8687466979026794, "num_tokens": 559609597.0, "step": 14674 }, { "epoch": 1.866810838315736, "ewc_loss": 0.030859021469950676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.085902062593959e-05, "grad_norm": 18.1339054107666, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8685615062713623, "num_tokens": 559646392.0, "step": 14675 }, { "epoch": 1.8669380485943265, "ewc_loss": 0.030743084847927094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0743085517315194e-05, "grad_norm": 18.089513778686523, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8749400973320007, "num_tokens": 559684836.0, "step": 14676 }, { "epoch": 1.867065258872917, "ewc_loss": 0.030859122052788734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.085912248934619e-05, "grad_norm": 18.13279151916504, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8765689730644226, "num_tokens": 559727793.0, "step": 14677 }, { "epoch": 1.8671924691515076, "ewc_loss": 0.030764572322368622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.076457142014988e-05, "grad_norm": 18.15629768371582, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8597480654716492, "num_tokens": 559768154.0, "step": 14678 }, { "epoch": 1.867319679430098, "ewc_loss": 0.030747175216674805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0747174605494365e-05, "grad_norm": 18.09667205810547, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8719426393508911, "num_tokens": 559798732.0, "step": 14679 }, { "epoch": 1.8674468897086884, "ewc_loss": 0.030760886147618294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0760886147618294e-05, "grad_norm": 18.151634216308594, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.870121955871582, "num_tokens": 559842796.0, "step": 14680 }, { "epoch": 1.867574099987279, "ewc_loss": 0.030792562291026115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.079256202909164e-05, "grad_norm": 18.093748092651367, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8681391477584839, "num_tokens": 559884198.0, "step": 14681 }, { "epoch": 1.8677013102658695, "ewc_loss": 0.030728965997695923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.072896652156487e-05, "grad_norm": 18.157363891601562, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8568084239959717, "num_tokens": 559916749.0, "step": 14682 }, { "epoch": 1.86782852054446, "ewc_loss": 0.03083992935717106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0839928513159975e-05, "grad_norm": 18.172645568847656, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8671422004699707, "num_tokens": 559958430.0, "step": 14683 }, { "epoch": 1.8679557308230506, "ewc_loss": 0.030694693326950073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.069469312322326e-05, "grad_norm": 18.126483917236328, "learning_rate": 1e-06, "loss": 0.3568, "mean_token_accuracy": 0.8872907161712646, "num_tokens": 559995720.0, "step": 14684 }, { "epoch": 1.8680829411016409, "ewc_loss": 0.030764764174818993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0764764233026654e-05, "grad_norm": 18.166259765625, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8752085566520691, "num_tokens": 560033304.0, "step": 14685 }, { "epoch": 1.8682101513802314, "ewc_loss": 0.03078542836010456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0785427952650934e-05, "grad_norm": 18.172626495361328, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8702535629272461, "num_tokens": 560069879.0, "step": 14686 }, { "epoch": 1.868337361658822, "ewc_loss": 0.030720721930265427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0720722861588e-05, "grad_norm": 18.18931770324707, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8691504001617432, "num_tokens": 560106853.0, "step": 14687 }, { "epoch": 1.8684645719374124, "ewc_loss": 0.030752351507544518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0752351449336857e-05, "grad_norm": 18.165878295898438, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8726150989532471, "num_tokens": 560144225.0, "step": 14688 }, { "epoch": 1.868591782216003, "ewc_loss": 0.030751114711165428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0751114536542445e-05, "grad_norm": 18.24094581604004, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8648104071617126, "num_tokens": 560186520.0, "step": 14689 }, { "epoch": 1.8687189924945935, "ewc_loss": 0.03075331822037697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.075331915169954e-05, "grad_norm": 18.17405128479004, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8740235567092896, "num_tokens": 560225458.0, "step": 14690 }, { "epoch": 1.868846202773184, "ewc_loss": 0.030692311003804207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0692310247104615e-05, "grad_norm": 18.171201705932617, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8760213255882263, "num_tokens": 560263401.0, "step": 14691 }, { "epoch": 1.8689734130517746, "ewc_loss": 0.030768966302275658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0768966098548844e-05, "grad_norm": 18.102567672729492, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8673858642578125, "num_tokens": 560305026.0, "step": 14692 }, { "epoch": 1.869100623330365, "ewc_loss": 0.030665287747979164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.066528734052554e-05, "grad_norm": 18.151229858398438, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8394312858581543, "num_tokens": 560354823.0, "step": 14693 }, { "epoch": 1.8692278336089556, "ewc_loss": 0.030815936625003815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0815936042927206e-05, "grad_norm": 18.17829132080078, "learning_rate": 1e-06, "loss": 0.3542, "mean_token_accuracy": 0.8859577775001526, "num_tokens": 560391888.0, "step": 14694 }, { "epoch": 1.8693550438875461, "ewc_loss": 0.030725862830877304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.072586332564242e-05, "grad_norm": 18.180177688598633, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8728278279304504, "num_tokens": 560425350.0, "step": 14695 }, { "epoch": 1.8694822541661367, "ewc_loss": 0.03074387088418007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0743871320737526e-05, "grad_norm": 18.178470611572266, "learning_rate": 1e-06, "loss": 0.3675, "mean_token_accuracy": 0.8810287714004517, "num_tokens": 560458521.0, "step": 14696 }, { "epoch": 1.8696094644447272, "ewc_loss": 0.030748603865504265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.074860433116555e-05, "grad_norm": 18.124292373657227, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.864681601524353, "num_tokens": 560506378.0, "step": 14697 }, { "epoch": 1.8697366747233177, "ewc_loss": 0.030798710882663727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0798710213275626e-05, "grad_norm": 18.179187774658203, "learning_rate": 1e-06, "loss": 0.3324, "mean_token_accuracy": 0.8932616710662842, "num_tokens": 560544055.0, "step": 14698 }, { "epoch": 1.8698638850019083, "ewc_loss": 0.03076246567070484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.076246503042057e-05, "grad_norm": 18.178007125854492, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8803722262382507, "num_tokens": 560577240.0, "step": 14699 }, { "epoch": 1.8699910952804988, "ewc_loss": 0.030771378427743912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0771378078497946e-05, "grad_norm": 18.14113426208496, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8666662573814392, "num_tokens": 560615662.0, "step": 14700 }, { "epoch": 1.8701183055590893, "ewc_loss": 0.03075423277914524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.075423228438012e-05, "grad_norm": 18.132173538208008, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8659883141517639, "num_tokens": 560658386.0, "step": 14701 }, { "epoch": 1.8702455158376798, "ewc_loss": 0.030781051144003868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0781051464146e-05, "grad_norm": 18.170242309570312, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8695876002311707, "num_tokens": 560696366.0, "step": 14702 }, { "epoch": 1.8703727261162701, "ewc_loss": 0.03082955628633499, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.082955663558096e-05, "grad_norm": 18.194984436035156, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8599283695220947, "num_tokens": 560736419.0, "step": 14703 }, { "epoch": 1.8704999363948607, "ewc_loss": 0.030818337574601173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0818337108939886e-05, "grad_norm": 18.143301010131836, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8696690797805786, "num_tokens": 560777839.0, "step": 14704 }, { "epoch": 1.8706271466734512, "ewc_loss": 0.03079998679459095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0799987143836915e-05, "grad_norm": 18.26587677001953, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.8808659315109253, "num_tokens": 560816167.0, "step": 14705 }, { "epoch": 1.8707543569520417, "ewc_loss": 0.030825838446617126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.082583862124011e-05, "grad_norm": 18.174957275390625, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8571711778640747, "num_tokens": 560854524.0, "step": 14706 }, { "epoch": 1.8708815672306323, "ewc_loss": 0.03077048249542713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.07704831357114e-05, "grad_norm": 18.196809768676758, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8686879873275757, "num_tokens": 560894393.0, "step": 14707 }, { "epoch": 1.8710087775092228, "ewc_loss": 0.03085591085255146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.085591015405953e-05, "grad_norm": 18.176729202270508, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8817125558853149, "num_tokens": 560938640.0, "step": 14708 }, { "epoch": 1.871135987787813, "ewc_loss": 0.03076275996863842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0762759706703946e-05, "grad_norm": 18.214740753173828, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8704434037208557, "num_tokens": 560976805.0, "step": 14709 }, { "epoch": 1.8712631980664036, "ewc_loss": 0.03081626258790493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0816263461019844e-05, "grad_norm": 18.153108596801758, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8682171106338501, "num_tokens": 561018940.0, "step": 14710 }, { "epoch": 1.8713904083449941, "ewc_loss": 0.030738648027181625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.073864718317054e-05, "grad_norm": 18.13227653503418, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8760491013526917, "num_tokens": 561065228.0, "step": 14711 }, { "epoch": 1.8715176186235847, "ewc_loss": 0.030774371698498726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.077437213505618e-05, "grad_norm": 18.20939826965332, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8633290529251099, "num_tokens": 561107540.0, "step": 14712 }, { "epoch": 1.8716448289021752, "ewc_loss": 0.030839689075946808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.083968840655871e-05, "grad_norm": 18.323932647705078, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8520653247833252, "num_tokens": 561144774.0, "step": 14713 }, { "epoch": 1.8717720391807657, "ewc_loss": 0.03079027682542801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.079027737840079e-05, "grad_norm": 18.20269012451172, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8512110710144043, "num_tokens": 561186774.0, "step": 14714 }, { "epoch": 1.8718992494593563, "ewc_loss": 0.030678093433380127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06780930259265e-05, "grad_norm": 18.23299789428711, "learning_rate": 1e-06, "loss": 0.3579, "mean_token_accuracy": 0.8835411071777344, "num_tokens": 561223997.0, "step": 14715 }, { "epoch": 1.8720264597379468, "ewc_loss": 0.030825361609458923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.082536204601638e-05, "grad_norm": 18.224699020385742, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8785068988800049, "num_tokens": 561261583.0, "step": 14716 }, { "epoch": 1.8721536700165373, "ewc_loss": 0.030702169984579086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0702169169671834e-05, "grad_norm": 18.269798278808594, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8760496377944946, "num_tokens": 561302681.0, "step": 14717 }, { "epoch": 1.8722808802951278, "ewc_loss": 0.030744923278689384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0744922696612775e-05, "grad_norm": 18.179954528808594, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8696291446685791, "num_tokens": 561338402.0, "step": 14718 }, { "epoch": 1.8724080905737184, "ewc_loss": 0.030649643391370773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.064964403165504e-05, "grad_norm": 18.246749877929688, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8745893239974976, "num_tokens": 561372892.0, "step": 14719 }, { "epoch": 1.872535300852309, "ewc_loss": 0.03075031004846096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.075031054322608e-05, "grad_norm": 18.26136589050293, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8673272728919983, "num_tokens": 561409474.0, "step": 14720 }, { "epoch": 1.8726625111308994, "ewc_loss": 0.030663711950182915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.066371209570207e-05, "grad_norm": 18.098134994506836, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8445501327514648, "num_tokens": 561450152.0, "step": 14721 }, { "epoch": 1.87278972140949, "ewc_loss": 0.030752282589673996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.075228232773952e-05, "grad_norm": 18.237539291381836, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8640244007110596, "num_tokens": 561489448.0, "step": 14722 }, { "epoch": 1.8729169316880805, "ewc_loss": 0.030825525522232056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.08255257550627e-05, "grad_norm": 18.24160385131836, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8481889367103577, "num_tokens": 561529684.0, "step": 14723 }, { "epoch": 1.873044141966671, "ewc_loss": 0.030703499913215637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.070350066991523e-05, "grad_norm": 18.157926559448242, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8717530965805054, "num_tokens": 561564259.0, "step": 14724 }, { "epoch": 1.8731713522452615, "ewc_loss": 0.030783982947468758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.078398367506452e-05, "grad_norm": 18.16961669921875, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.878823459148407, "num_tokens": 561609650.0, "step": 14725 }, { "epoch": 1.873298562523852, "ewc_loss": 0.030775684863328934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.077568544540554e-05, "grad_norm": 18.204429626464844, "learning_rate": 1e-06, "loss": 0.3846, "mean_token_accuracy": 0.8773813247680664, "num_tokens": 561645520.0, "step": 14726 }, { "epoch": 1.8734257728024426, "ewc_loss": 0.03071858361363411, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.071858373004943e-05, "grad_norm": 18.1622257232666, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8763875961303711, "num_tokens": 561682771.0, "step": 14727 }, { "epoch": 1.873552983081033, "ewc_loss": 0.030743852257728577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.074385313084349e-05, "grad_norm": 18.128416061401367, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8604766130447388, "num_tokens": 561721317.0, "step": 14728 }, { "epoch": 1.8736801933596234, "ewc_loss": 0.030821379274129868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0821378459222615e-05, "grad_norm": 18.256359100341797, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8715077638626099, "num_tokens": 561763178.0, "step": 14729 }, { "epoch": 1.873807403638214, "ewc_loss": 0.030825071036815643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.082507100771181e-05, "grad_norm": 18.1152400970459, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8711456060409546, "num_tokens": 561802292.0, "step": 14730 }, { "epoch": 1.8739346139168045, "ewc_loss": 0.03067629598081112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.06762958643958e-05, "grad_norm": 18.186931610107422, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8798808455467224, "num_tokens": 561838610.0, "step": 14731 }, { "epoch": 1.874061824195395, "ewc_loss": 0.030840681865811348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0840681574773043e-05, "grad_norm": 18.219236373901367, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8748879432678223, "num_tokens": 561881992.0, "step": 14732 }, { "epoch": 1.8741890344739855, "ewc_loss": 0.030777916312217712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.077791552641429e-05, "grad_norm": 18.228282928466797, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8607178926467896, "num_tokens": 561917661.0, "step": 14733 }, { "epoch": 1.8743162447525759, "ewc_loss": 0.03080875799059868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0808758310740814e-05, "grad_norm": 18.157957077026367, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8713959455490112, "num_tokens": 561958218.0, "step": 14734 }, { "epoch": 1.8744434550311664, "ewc_loss": 0.030738480389118195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0738479836145416e-05, "grad_norm": 18.190231323242188, "learning_rate": 1e-06, "loss": 0.361, "mean_token_accuracy": 0.8825083374977112, "num_tokens": 561993882.0, "step": 14735 }, { "epoch": 1.874570665309757, "ewc_loss": 0.03081304393708706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.081304384977557e-05, "grad_norm": 18.17283058166504, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8719173073768616, "num_tokens": 562035443.0, "step": 14736 }, { "epoch": 1.8746978755883474, "ewc_loss": 0.030721941962838173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.072194158448838e-05, "grad_norm": 18.17719268798828, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8696816563606262, "num_tokens": 562072806.0, "step": 14737 }, { "epoch": 1.874825085866938, "ewc_loss": 0.030756110325455666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.075610948144458e-05, "grad_norm": 18.111289978027344, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.85444176197052, "num_tokens": 562113652.0, "step": 14738 }, { "epoch": 1.8749522961455285, "ewc_loss": 0.03073248453438282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.073248444707133e-05, "grad_norm": 18.23536491394043, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8627105951309204, "num_tokens": 562150600.0, "step": 14739 }, { "epoch": 1.875079506424119, "ewc_loss": 0.03083672747015953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0836727091809735e-05, "grad_norm": 18.15447425842285, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8727836012840271, "num_tokens": 562187730.0, "step": 14740 }, { "epoch": 1.8752067167027096, "ewc_loss": 0.030775271356105804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0775270715821534e-05, "grad_norm": 18.114513397216797, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8713364601135254, "num_tokens": 562228229.0, "step": 14741 }, { "epoch": 1.8753339269813, "ewc_loss": 0.030821628868579865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0821629479760304e-05, "grad_norm": 18.189319610595703, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8760448694229126, "num_tokens": 562262293.0, "step": 14742 }, { "epoch": 1.8754611372598906, "ewc_loss": 0.03080672025680542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.080672104260884e-05, "grad_norm": 18.157093048095703, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8676140308380127, "num_tokens": 562303389.0, "step": 14743 }, { "epoch": 1.8755883475384811, "ewc_loss": 0.030830159783363342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0830160540062934e-05, "grad_norm": 18.157445907592773, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8646567463874817, "num_tokens": 562344461.0, "step": 14744 }, { "epoch": 1.8757155578170717, "ewc_loss": 0.030799295753240585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.079929592786357e-05, "grad_norm": 18.159870147705078, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8531316518783569, "num_tokens": 562387068.0, "step": 14745 }, { "epoch": 1.8758427680956622, "ewc_loss": 0.030853526666760445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0853527277940884e-05, "grad_norm": 18.12928581237793, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8690389394760132, "num_tokens": 562428225.0, "step": 14746 }, { "epoch": 1.8759699783742527, "ewc_loss": 0.030793918296694756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0793918995186687e-05, "grad_norm": 18.19061279296875, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8711428046226501, "num_tokens": 562463879.0, "step": 14747 }, { "epoch": 1.8760971886528433, "ewc_loss": 0.03086664527654648, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.086664582951926e-05, "grad_norm": 18.146217346191406, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8640493154525757, "num_tokens": 562502327.0, "step": 14748 }, { "epoch": 1.8762243989314338, "ewc_loss": 0.03079017624258995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.079017551499419e-05, "grad_norm": 18.22049331665039, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8748249411582947, "num_tokens": 562542162.0, "step": 14749 }, { "epoch": 1.8763516092100243, "ewc_loss": 0.03084452636539936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.084452691837214e-05, "grad_norm": 18.124040603637695, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8656836152076721, "num_tokens": 562583364.0, "step": 14750 }, { "epoch": 1.8764788194886148, "ewc_loss": 0.030770715326070786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0770715966355056e-05, "grad_norm": 18.221359252929688, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.872845470905304, "num_tokens": 562615870.0, "step": 14751 }, { "epoch": 1.8766060297672051, "ewc_loss": 0.030827807262539864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0827806767774746e-05, "grad_norm": 18.14011001586914, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8562199473381042, "num_tokens": 562655086.0, "step": 14752 }, { "epoch": 1.8767332400457957, "ewc_loss": 0.030721155926585197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0721155781066045e-05, "grad_norm": 18.137889862060547, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8634375929832458, "num_tokens": 562697532.0, "step": 14753 }, { "epoch": 1.8768604503243862, "ewc_loss": 0.03088594600558281, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.088594530709088e-05, "grad_norm": 18.197372436523438, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8508122563362122, "num_tokens": 562731057.0, "step": 14754 }, { "epoch": 1.8769876606029767, "ewc_loss": 0.03076997771859169, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0769977456657216e-05, "grad_norm": 18.20781707763672, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8587076663970947, "num_tokens": 562770402.0, "step": 14755 }, { "epoch": 1.8771148708815673, "ewc_loss": 0.03085264004766941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0852639611111954e-05, "grad_norm": 18.222206115722656, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8670977354049683, "num_tokens": 562806191.0, "step": 14756 }, { "epoch": 1.8772420811601578, "ewc_loss": 0.030808616429567337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.080861642956734e-05, "grad_norm": 18.18099021911621, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8690201044082642, "num_tokens": 562846864.0, "step": 14757 }, { "epoch": 1.877369291438748, "ewc_loss": 0.030841369181871414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0841369152767584e-05, "grad_norm": 18.21694564819336, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8653866648674011, "num_tokens": 562882887.0, "step": 14758 }, { "epoch": 1.8774965017173386, "ewc_loss": 0.03083129972219467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0831299227429554e-05, "grad_norm": 18.14973258972168, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8699302077293396, "num_tokens": 562919096.0, "step": 14759 }, { "epoch": 1.8776237119959291, "ewc_loss": 0.030830033123493195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0830033210804686e-05, "grad_norm": 18.143211364746094, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8435878157615662, "num_tokens": 562959336.0, "step": 14760 }, { "epoch": 1.8777509222745197, "ewc_loss": 0.03089914843440056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0899147532181814e-05, "grad_norm": 18.139020919799805, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8695817589759827, "num_tokens": 562997883.0, "step": 14761 }, { "epoch": 1.8778781325531102, "ewc_loss": 0.03085995838046074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.085995922447182e-05, "grad_norm": 18.199237823486328, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8790016174316406, "num_tokens": 563036763.0, "step": 14762 }, { "epoch": 1.8780053428317007, "ewc_loss": 0.03086099959909916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.086099968641065e-05, "grad_norm": 18.13162612915039, "learning_rate": 1e-06, "loss": 0.3659, "mean_token_accuracy": 0.8858550786972046, "num_tokens": 563076815.0, "step": 14763 }, { "epoch": 1.8781325531102913, "ewc_loss": 0.030842173844575882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.084217314608395e-05, "grad_norm": 18.258432388305664, "learning_rate": 1e-06, "loss": 0.3727, "mean_token_accuracy": 0.8808941841125488, "num_tokens": 563110511.0, "step": 14764 }, { "epoch": 1.8782597633888818, "ewc_loss": 0.030963808298110962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096380896749906e-05, "grad_norm": 18.187156677246094, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.864824652671814, "num_tokens": 563140757.0, "step": 14765 }, { "epoch": 1.8783869736674723, "ewc_loss": 0.030813131481409073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.081313116126694e-05, "grad_norm": 18.148231506347656, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8695571422576904, "num_tokens": 563179961.0, "step": 14766 }, { "epoch": 1.8785141839460628, "ewc_loss": 0.030947918072342873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0947918276069686e-05, "grad_norm": 18.21089744567871, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8773813247680664, "num_tokens": 563221355.0, "step": 14767 }, { "epoch": 1.8786413942246534, "ewc_loss": 0.030835632234811783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.08356320601888e-05, "grad_norm": 18.108715057373047, "learning_rate": 1e-06, "loss": 0.3408, "mean_token_accuracy": 0.8904199004173279, "num_tokens": 563257427.0, "step": 14768 }, { "epoch": 1.878768604503244, "ewc_loss": 0.03089224174618721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0892242648405954e-05, "grad_norm": 18.250701904296875, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8675689697265625, "num_tokens": 563294371.0, "step": 14769 }, { "epoch": 1.8788958147818344, "ewc_loss": 0.03089210018515587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.089210076723248e-05, "grad_norm": 18.1813907623291, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8802326917648315, "num_tokens": 563338592.0, "step": 14770 }, { "epoch": 1.879023025060425, "ewc_loss": 0.030850406736135483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.08504058921244e-05, "grad_norm": 18.225019454956055, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8738925457000732, "num_tokens": 563373853.0, "step": 14771 }, { "epoch": 1.8791502353390155, "ewc_loss": 0.030901584774255753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0901584977982566e-05, "grad_norm": 18.25312042236328, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8695617318153381, "num_tokens": 563411137.0, "step": 14772 }, { "epoch": 1.879277445617606, "ewc_loss": 0.030820414423942566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0820414394838735e-05, "grad_norm": 18.12725257873535, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8738322257995605, "num_tokens": 563450294.0, "step": 14773 }, { "epoch": 1.8794046558961965, "ewc_loss": 0.030774733051657677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0774732294958085e-05, "grad_norm": 18.074934005737305, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.872162401676178, "num_tokens": 563496858.0, "step": 14774 }, { "epoch": 1.879531866174787, "ewc_loss": 0.03089805692434311, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.089805613853969e-05, "grad_norm": 18.245094299316406, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8655070066452026, "num_tokens": 563539566.0, "step": 14775 }, { "epoch": 1.8796590764533776, "ewc_loss": 0.030881427228450775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.088142693741247e-05, "grad_norm": 18.186372756958008, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8788486123085022, "num_tokens": 563574363.0, "step": 14776 }, { "epoch": 1.879786286731968, "ewc_loss": 0.03082260489463806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0822604458080605e-05, "grad_norm": 18.148677825927734, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8717297315597534, "num_tokens": 563605540.0, "step": 14777 }, { "epoch": 1.8799134970105584, "ewc_loss": 0.03084772452712059, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.084772470174357e-05, "grad_norm": 18.178598403930664, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8568702340126038, "num_tokens": 563646640.0, "step": 14778 }, { "epoch": 1.880040707289149, "ewc_loss": 0.03088969551026821, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.088969606324099e-05, "grad_norm": 18.218778610229492, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8808975219726562, "num_tokens": 563683576.0, "step": 14779 }, { "epoch": 1.8801679175677395, "ewc_loss": 0.03081611357629299, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.081611430388875e-05, "grad_norm": 18.17456817626953, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8623104095458984, "num_tokens": 563724088.0, "step": 14780 }, { "epoch": 1.88029512784633, "ewc_loss": 0.030873771756887436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.087377263000235e-05, "grad_norm": 18.264394760131836, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8691999912261963, "num_tokens": 563760897.0, "step": 14781 }, { "epoch": 1.8804223381249205, "ewc_loss": 0.03082137554883957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.082137482124381e-05, "grad_norm": 18.12269401550293, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8754767179489136, "num_tokens": 563800158.0, "step": 14782 }, { "epoch": 1.8805495484035109, "ewc_loss": 0.030814077705144882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.081407703575678e-05, "grad_norm": 18.15212059020996, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8768602609634399, "num_tokens": 563833288.0, "step": 14783 }, { "epoch": 1.8806767586821014, "ewc_loss": 0.030901195481419563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.090119571425021e-05, "grad_norm": 18.179214477539062, "learning_rate": 1e-06, "loss": 0.3712, "mean_token_accuracy": 0.8814281225204468, "num_tokens": 563874507.0, "step": 14784 }, { "epoch": 1.880803968960692, "ewc_loss": 0.03084045648574829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0840456020087004e-05, "grad_norm": 18.118488311767578, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8698722124099731, "num_tokens": 563912452.0, "step": 14785 }, { "epoch": 1.8809311792392824, "ewc_loss": 0.03085327334702015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.085327261942439e-05, "grad_norm": 18.177440643310547, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8532506227493286, "num_tokens": 563950565.0, "step": 14786 }, { "epoch": 1.881058389517873, "ewc_loss": 0.030831001698970795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.083100091316737e-05, "grad_norm": 18.126062393188477, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8611542582511902, "num_tokens": 563984924.0, "step": 14787 }, { "epoch": 1.8811855997964635, "ewc_loss": 0.030873125419020653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.087312506977469e-05, "grad_norm": 18.158788681030273, "learning_rate": 1e-06, "loss": 0.3852, "mean_token_accuracy": 0.8769604563713074, "num_tokens": 564022887.0, "step": 14788 }, { "epoch": 1.881312810075054, "ewc_loss": 0.030872324481606483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.087232471443713e-05, "grad_norm": 18.155744552612305, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8698989152908325, "num_tokens": 564064829.0, "step": 14789 }, { "epoch": 1.8814400203536445, "ewc_loss": 0.0308188758790493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0818875529803336e-05, "grad_norm": 18.185636520385742, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.85020512342453, "num_tokens": 564100858.0, "step": 14790 }, { "epoch": 1.881567230632235, "ewc_loss": 0.030844425782561302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.084442505496554e-05, "grad_norm": 18.091482162475586, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8637456297874451, "num_tokens": 564141477.0, "step": 14791 }, { "epoch": 1.8816944409108256, "ewc_loss": 0.030834177508950233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0834176868665963e-05, "grad_norm": 18.22602081298828, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8633326888084412, "num_tokens": 564179271.0, "step": 14792 }, { "epoch": 1.8818216511894161, "ewc_loss": 0.030939558520913124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.093955820077099e-05, "grad_norm": 18.176252365112305, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8762222528457642, "num_tokens": 564224336.0, "step": 14793 }, { "epoch": 1.8819488614680067, "ewc_loss": 0.03081689216196537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.081689283135347e-05, "grad_norm": 18.142667770385742, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8644822239875793, "num_tokens": 564265157.0, "step": 14794 }, { "epoch": 1.8820760717465972, "ewc_loss": 0.030817154794931412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.081715476582758e-05, "grad_norm": 18.163759231567383, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8765427470207214, "num_tokens": 564295023.0, "step": 14795 }, { "epoch": 1.8822032820251877, "ewc_loss": 0.03085704892873764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.085704884142615e-05, "grad_norm": 18.149887084960938, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8722515106201172, "num_tokens": 564333700.0, "step": 14796 }, { "epoch": 1.8823304923037782, "ewc_loss": 0.03083515167236328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0835151846986264e-05, "grad_norm": 18.204172134399414, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8566529750823975, "num_tokens": 564374342.0, "step": 14797 }, { "epoch": 1.8824577025823688, "ewc_loss": 0.03086293488740921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0862935091136023e-05, "grad_norm": 18.156709671020508, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8723735809326172, "num_tokens": 564422877.0, "step": 14798 }, { "epoch": 1.8825849128609593, "ewc_loss": 0.030827486887574196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.082748662563972e-05, "grad_norm": 18.161685943603516, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8704110383987427, "num_tokens": 564464713.0, "step": 14799 }, { "epoch": 1.8827121231395498, "ewc_loss": 0.03091922216117382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0919221899239346e-05, "grad_norm": 18.265464782714844, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8596800565719604, "num_tokens": 564500667.0, "step": 14800 }, { "epoch": 1.8828393334181401, "ewc_loss": 0.030835486948490143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0835486541036516e-05, "grad_norm": 18.172836303710938, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.8785861730575562, "num_tokens": 564541460.0, "step": 14801 }, { "epoch": 1.8829665436967307, "ewc_loss": 0.03072892688214779, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.072892650379799e-05, "grad_norm": 18.165767669677734, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.862639307975769, "num_tokens": 564580983.0, "step": 14802 }, { "epoch": 1.8830937539753212, "ewc_loss": 0.030836261808872223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.083626143052243e-05, "grad_norm": 18.195215225219727, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8784423470497131, "num_tokens": 564615967.0, "step": 14803 }, { "epoch": 1.8832209642539117, "ewc_loss": 0.030863221734762192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0863222491461784e-05, "grad_norm": 18.245975494384766, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8629624843597412, "num_tokens": 564653255.0, "step": 14804 }, { "epoch": 1.8833481745325023, "ewc_loss": 0.030793849378824234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.079384987358935e-05, "grad_norm": 18.162189483642578, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8689572811126709, "num_tokens": 564688420.0, "step": 14805 }, { "epoch": 1.8834753848110928, "ewc_loss": 0.030799321830272675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.079932139371522e-05, "grad_norm": 18.11568832397461, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8645663261413574, "num_tokens": 564733123.0, "step": 14806 }, { "epoch": 1.883602595089683, "ewc_loss": 0.03083132766187191, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.083132833126001e-05, "grad_norm": 18.167469024658203, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8770444393157959, "num_tokens": 564767712.0, "step": 14807 }, { "epoch": 1.8837298053682736, "ewc_loss": 0.0308823324739933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0882332794135436e-05, "grad_norm": 18.142925262451172, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8678715229034424, "num_tokens": 564810663.0, "step": 14808 }, { "epoch": 1.8838570156468641, "ewc_loss": 0.030830563977360725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.083056435571052e-05, "grad_norm": 18.192853927612305, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8690428733825684, "num_tokens": 564848689.0, "step": 14809 }, { "epoch": 1.8839842259254547, "ewc_loss": 0.03089751861989498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.089751771767624e-05, "grad_norm": 18.14162826538086, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.858890950679779, "num_tokens": 564887426.0, "step": 14810 }, { "epoch": 1.8841114362040452, "ewc_loss": 0.030834689736366272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.083468982367776e-05, "grad_norm": 18.1773681640625, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8742461204528809, "num_tokens": 564924852.0, "step": 14811 }, { "epoch": 1.8842386464826357, "ewc_loss": 0.030865434557199478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0865434382576495e-05, "grad_norm": 18.126127243041992, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8669928312301636, "num_tokens": 564964201.0, "step": 14812 }, { "epoch": 1.8843658567612263, "ewc_loss": 0.030836930498480797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.083693081862293e-05, "grad_norm": 18.161500930786133, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8813353180885315, "num_tokens": 565004939.0, "step": 14813 }, { "epoch": 1.8844930670398168, "ewc_loss": 0.030869342386722565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.086934157181531e-05, "grad_norm": 18.174734115600586, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.874419629573822, "num_tokens": 565044067.0, "step": 14814 }, { "epoch": 1.8846202773184073, "ewc_loss": 0.030855033546686172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.085503340116702e-05, "grad_norm": 18.07990837097168, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8716691732406616, "num_tokens": 565082114.0, "step": 14815 }, { "epoch": 1.8847474875969978, "ewc_loss": 0.030864229425787926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.086423021159135e-05, "grad_norm": 18.176267623901367, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8694977760314941, "num_tokens": 565118350.0, "step": 14816 }, { "epoch": 1.8848746978755884, "ewc_loss": 0.030876271426677704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.087627192144282e-05, "grad_norm": 18.134384155273438, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8638392686843872, "num_tokens": 565153313.0, "step": 14817 }, { "epoch": 1.885001908154179, "ewc_loss": 0.030851341784000397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.085134085267782e-05, "grad_norm": 18.172428131103516, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8578358292579651, "num_tokens": 565191332.0, "step": 14818 }, { "epoch": 1.8851291184327694, "ewc_loss": 0.030922695994377136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.092269616900012e-05, "grad_norm": 18.241260528564453, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8676083087921143, "num_tokens": 565229937.0, "step": 14819 }, { "epoch": 1.88525632871136, "ewc_loss": 0.030882690101861954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.088268931605853e-05, "grad_norm": 18.106353759765625, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8645838499069214, "num_tokens": 565266271.0, "step": 14820 }, { "epoch": 1.8853835389899505, "ewc_loss": 0.03086535818874836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0865357985021546e-05, "grad_norm": 18.263029098510742, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8673444986343384, "num_tokens": 565300593.0, "step": 14821 }, { "epoch": 1.885510749268541, "ewc_loss": 0.03092748299241066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.092748374911025e-05, "grad_norm": 18.130525588989258, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8696331977844238, "num_tokens": 565341801.0, "step": 14822 }, { "epoch": 1.8856379595471315, "ewc_loss": 0.030839720740914345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.083972114836797e-05, "grad_norm": 18.2546329498291, "learning_rate": 1e-06, "loss": 0.3468, "mean_token_accuracy": 0.8881899118423462, "num_tokens": 565374380.0, "step": 14823 }, { "epoch": 1.885765169825722, "ewc_loss": 0.030959902331233025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0959901778260246e-05, "grad_norm": 18.178390502929688, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8619189262390137, "num_tokens": 565413096.0, "step": 14824 }, { "epoch": 1.8858923801043126, "ewc_loss": 0.03086678497493267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0866784072713926e-05, "grad_norm": 18.240129470825195, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8790386915206909, "num_tokens": 565453292.0, "step": 14825 }, { "epoch": 1.886019590382903, "ewc_loss": 0.03094707801938057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.094707790296525e-05, "grad_norm": 18.285263061523438, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8658079504966736, "num_tokens": 565489311.0, "step": 14826 }, { "epoch": 1.8861468006614934, "ewc_loss": 0.030860770493745804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0860770493745804e-05, "grad_norm": 18.205347061157227, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8543480634689331, "num_tokens": 565526293.0, "step": 14827 }, { "epoch": 1.886274010940084, "ewc_loss": 0.030826952308416367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.082695184275508e-05, "grad_norm": 18.16077995300293, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8631284832954407, "num_tokens": 565569215.0, "step": 14828 }, { "epoch": 1.8864012212186745, "ewc_loss": 0.03085247613489628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0852475902065635e-05, "grad_norm": 18.1793270111084, "learning_rate": 1e-06, "loss": 0.3757, "mean_token_accuracy": 0.8801331520080566, "num_tokens": 565610755.0, "step": 14829 }, { "epoch": 1.886528431497265, "ewc_loss": 0.03082186169922352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.082186231040396e-05, "grad_norm": 18.207948684692383, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8757789134979248, "num_tokens": 565641964.0, "step": 14830 }, { "epoch": 1.8866556417758555, "ewc_loss": 0.03083217889070511, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.083217961830087e-05, "grad_norm": 18.16498565673828, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8368472456932068, "num_tokens": 565682852.0, "step": 14831 }, { "epoch": 1.8867828520544458, "ewc_loss": 0.030809525400400162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.080952592426911e-05, "grad_norm": 18.09576988220215, "learning_rate": 1e-06, "loss": 0.3569, "mean_token_accuracy": 0.88643878698349, "num_tokens": 565721352.0, "step": 14832 }, { "epoch": 1.8869100623330364, "ewc_loss": 0.030836818739771843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.083681804127991e-05, "grad_norm": 18.12877082824707, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.87462317943573, "num_tokens": 565764045.0, "step": 14833 }, { "epoch": 1.887037272611627, "ewc_loss": 0.03096085786819458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096085856668651e-05, "grad_norm": 18.240644454956055, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8782703876495361, "num_tokens": 565797617.0, "step": 14834 }, { "epoch": 1.8871644828902174, "ewc_loss": 0.030887458473443985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.088745870627463e-05, "grad_norm": 18.101675033569336, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8699254989624023, "num_tokens": 565839254.0, "step": 14835 }, { "epoch": 1.887291693168808, "ewc_loss": 0.03091607801616192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.091607868555002e-05, "grad_norm": 18.162498474121094, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8704428672790527, "num_tokens": 565874742.0, "step": 14836 }, { "epoch": 1.8874189034473985, "ewc_loss": 0.03093714639544487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.093714622082189e-05, "grad_norm": 18.168331146240234, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8709568977355957, "num_tokens": 565911043.0, "step": 14837 }, { "epoch": 1.887546113725989, "ewc_loss": 0.030936390161514282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.093638952123001e-05, "grad_norm": 18.19852638244629, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.866203784942627, "num_tokens": 565950228.0, "step": 14838 }, { "epoch": 1.8876733240045795, "ewc_loss": 0.03094218112528324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.09421811834909e-05, "grad_norm": 18.153200149536133, "learning_rate": 1e-06, "loss": 0.3652, "mean_token_accuracy": 0.8813422322273254, "num_tokens": 565982891.0, "step": 14839 }, { "epoch": 1.88780053428317, "ewc_loss": 0.030928000807762146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.092800034210086e-05, "grad_norm": 18.24863624572754, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8687558770179749, "num_tokens": 566022601.0, "step": 14840 }, { "epoch": 1.8879277445617606, "ewc_loss": 0.030957188457250595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0957187846070156e-05, "grad_norm": 18.15162467956543, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8709743022918701, "num_tokens": 566054928.0, "step": 14841 }, { "epoch": 1.8880549548403511, "ewc_loss": 0.03081447072327137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.081446993746795e-05, "grad_norm": 18.215452194213867, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8639109134674072, "num_tokens": 566094900.0, "step": 14842 }, { "epoch": 1.8881821651189417, "ewc_loss": 0.030982675030827522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098267552559264e-05, "grad_norm": 18.178831100463867, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8684640526771545, "num_tokens": 566125230.0, "step": 14843 }, { "epoch": 1.8883093753975322, "ewc_loss": 0.03088909573853016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.088909579673782e-05, "grad_norm": 18.15083122253418, "learning_rate": 1e-06, "loss": 0.3552, "mean_token_accuracy": 0.8852978944778442, "num_tokens": 566165320.0, "step": 14844 }, { "epoch": 1.8884365856761227, "ewc_loss": 0.03089292347431183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.089292295044288e-05, "grad_norm": 18.0657901763916, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8599917888641357, "num_tokens": 566202769.0, "step": 14845 }, { "epoch": 1.8885637959547132, "ewc_loss": 0.030882282182574272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.088228186243214e-05, "grad_norm": 18.20718765258789, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8713681101799011, "num_tokens": 566245874.0, "step": 14846 }, { "epoch": 1.8886910062333038, "ewc_loss": 0.03094923496246338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.094923522439785e-05, "grad_norm": 18.0979061126709, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8530676364898682, "num_tokens": 566284855.0, "step": 14847 }, { "epoch": 1.8888182165118943, "ewc_loss": 0.030877957120537758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0877956305630505e-05, "grad_norm": 18.210124969482422, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8619584441184998, "num_tokens": 566323422.0, "step": 14848 }, { "epoch": 1.8889454267904848, "ewc_loss": 0.030964743345975876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0964743928052485e-05, "grad_norm": 18.126779556274414, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8775343894958496, "num_tokens": 566364357.0, "step": 14849 }, { "epoch": 1.8890726370690751, "ewc_loss": 0.030879953876137733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.08799535559956e-05, "grad_norm": 18.1982421875, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8668100833892822, "num_tokens": 566404498.0, "step": 14850 }, { "epoch": 1.8891998473476657, "ewc_loss": 0.03094870038330555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.094870044151321e-05, "grad_norm": 18.111175537109375, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8663526773452759, "num_tokens": 566442960.0, "step": 14851 }, { "epoch": 1.8893270576262562, "ewc_loss": 0.030919905751943588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.091990583925508e-05, "grad_norm": 18.120471954345703, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8791382908821106, "num_tokens": 566475087.0, "step": 14852 }, { "epoch": 1.8894542679048467, "ewc_loss": 0.03090280294418335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.090280370088294e-05, "grad_norm": 18.130809783935547, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8605685830116272, "num_tokens": 566514337.0, "step": 14853 }, { "epoch": 1.8895814781834372, "ewc_loss": 0.030963139608502388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096313957939856e-05, "grad_norm": 18.25885009765625, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8599426746368408, "num_tokens": 566553204.0, "step": 14854 }, { "epoch": 1.8897086884620278, "ewc_loss": 0.03096769005060196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096769069088623e-05, "grad_norm": 18.13158416748047, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.868462085723877, "num_tokens": 566589697.0, "step": 14855 }, { "epoch": 1.889835898740618, "ewc_loss": 0.030967988073825836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096798900514841e-05, "grad_norm": 18.14259910583496, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8554903268814087, "num_tokens": 566629680.0, "step": 14856 }, { "epoch": 1.8899631090192086, "ewc_loss": 0.03098168969154358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098168963333592e-05, "grad_norm": 18.251628875732422, "learning_rate": 1e-06, "loss": 0.3305, "mean_token_accuracy": 0.8924113512039185, "num_tokens": 566670931.0, "step": 14857 }, { "epoch": 1.8900903192977991, "ewc_loss": 0.031018564477562904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.10185641865246e-05, "grad_norm": 18.161596298217773, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.86441570520401, "num_tokens": 566706673.0, "step": 14858 }, { "epoch": 1.8902175295763897, "ewc_loss": 0.03091053105890751, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0910530767869204e-05, "grad_norm": 18.230005264282227, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8668013215065002, "num_tokens": 566749503.0, "step": 14859 }, { "epoch": 1.8903447398549802, "ewc_loss": 0.030980609357357025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098060915363021e-05, "grad_norm": 18.16990852355957, "learning_rate": 1e-06, "loss": 0.3613, "mean_token_accuracy": 0.8845938444137573, "num_tokens": 566790700.0, "step": 14860 }, { "epoch": 1.8904719501335707, "ewc_loss": 0.030916497111320496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0916497053112835e-05, "grad_norm": 18.180831909179688, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8690973520278931, "num_tokens": 566832995.0, "step": 14861 }, { "epoch": 1.8905991604121613, "ewc_loss": 0.030924715101718903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0924715247238055e-05, "grad_norm": 18.17680549621582, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8580752611160278, "num_tokens": 566868190.0, "step": 14862 }, { "epoch": 1.8907263706907518, "ewc_loss": 0.03091740608215332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.091740654781461e-05, "grad_norm": 18.256168365478516, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8423459529876709, "num_tokens": 566906637.0, "step": 14863 }, { "epoch": 1.8908535809693423, "ewc_loss": 0.030962960794568062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096296131843701e-05, "grad_norm": 18.12974739074707, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8698105812072754, "num_tokens": 566946006.0, "step": 14864 }, { "epoch": 1.8909807912479328, "ewc_loss": 0.030819987878203392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0819988751318306e-05, "grad_norm": 18.176780700683594, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8574862480163574, "num_tokens": 566987726.0, "step": 14865 }, { "epoch": 1.8911080015265234, "ewc_loss": 0.03097512014210224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.097511944361031e-05, "grad_norm": 18.190242767333984, "learning_rate": 1e-06, "loss": 0.36, "mean_token_accuracy": 0.8847982883453369, "num_tokens": 567027432.0, "step": 14866 }, { "epoch": 1.891235211805114, "ewc_loss": 0.030927088111639023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.092708720942028e-05, "grad_norm": 18.189905166625977, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8760345578193665, "num_tokens": 567060607.0, "step": 14867 }, { "epoch": 1.8913624220837044, "ewc_loss": 0.03088587336242199, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.088587254751474e-05, "grad_norm": 18.078468322753906, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8637769818305969, "num_tokens": 567100691.0, "step": 14868 }, { "epoch": 1.891489632362295, "ewc_loss": 0.030934272333979607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0934272217564285e-05, "grad_norm": 18.158832550048828, "learning_rate": 1e-06, "loss": 0.3446, "mean_token_accuracy": 0.8888859748840332, "num_tokens": 567139460.0, "step": 14869 }, { "epoch": 1.8916168426408855, "ewc_loss": 0.030973242595791817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.097324224654585e-05, "grad_norm": 18.225801467895508, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8578640222549438, "num_tokens": 567185384.0, "step": 14870 }, { "epoch": 1.891744052919476, "ewc_loss": 0.03094295598566532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.094295607297681e-05, "grad_norm": 18.248714447021484, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8690640926361084, "num_tokens": 567219898.0, "step": 14871 }, { "epoch": 1.8918712631980665, "ewc_loss": 0.030921543017029762, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.092154292971827e-05, "grad_norm": 18.219327926635742, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.876873254776001, "num_tokens": 567260322.0, "step": 14872 }, { "epoch": 1.891998473476657, "ewc_loss": 0.03090623766183853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0906237952876836e-05, "grad_norm": 18.225618362426758, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8650068044662476, "num_tokens": 567293739.0, "step": 14873 }, { "epoch": 1.8921256837552476, "ewc_loss": 0.030901096761226654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0901097488822415e-05, "grad_norm": 18.238292694091797, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8711919784545898, "num_tokens": 567327282.0, "step": 14874 }, { "epoch": 1.892252894033838, "ewc_loss": 0.030856946483254433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.085694697801955e-05, "grad_norm": 18.1806640625, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.863049328327179, "num_tokens": 567366402.0, "step": 14875 }, { "epoch": 1.8923801043124284, "ewc_loss": 0.030956409871578217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.095640931860544e-05, "grad_norm": 18.2110538482666, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8627948760986328, "num_tokens": 567409924.0, "step": 14876 }, { "epoch": 1.892507314591019, "ewc_loss": 0.03088383562862873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0883835279382765e-05, "grad_norm": 18.204715728759766, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8721712231636047, "num_tokens": 567450355.0, "step": 14877 }, { "epoch": 1.8926345248696095, "ewc_loss": 0.030912332236766815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0912331567378715e-05, "grad_norm": 18.184917449951172, "learning_rate": 1e-06, "loss": 0.3599, "mean_token_accuracy": 0.8831192255020142, "num_tokens": 567487701.0, "step": 14878 }, { "epoch": 1.8927617351482, "ewc_loss": 0.030843399465084076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.084339914494194e-05, "grad_norm": 18.176780700683594, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8693281412124634, "num_tokens": 567527619.0, "step": 14879 }, { "epoch": 1.8928889454267905, "ewc_loss": 0.030799241736531258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.079924135818146e-05, "grad_norm": 18.11973762512207, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8691657781600952, "num_tokens": 567567016.0, "step": 14880 }, { "epoch": 1.8930161557053808, "ewc_loss": 0.030903439968824387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.090344034717418e-05, "grad_norm": 18.178146362304688, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8573347330093384, "num_tokens": 567605465.0, "step": 14881 }, { "epoch": 1.8931433659839714, "ewc_loss": 0.030896611511707306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.089661186095327e-05, "grad_norm": 18.153339385986328, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8732117414474487, "num_tokens": 567646680.0, "step": 14882 }, { "epoch": 1.893270576262562, "ewc_loss": 0.030892351642251015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.089235178777017e-05, "grad_norm": 18.183151245117188, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8711510300636292, "num_tokens": 567685685.0, "step": 14883 }, { "epoch": 1.8933977865411524, "ewc_loss": 0.03088599629700184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.088599623879418e-05, "grad_norm": 18.18299674987793, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8586719036102295, "num_tokens": 567717222.0, "step": 14884 }, { "epoch": 1.893524996819743, "ewc_loss": 0.03084721975028515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.084721902268939e-05, "grad_norm": 18.144962310791016, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8601677417755127, "num_tokens": 567761050.0, "step": 14885 }, { "epoch": 1.8936522070983335, "ewc_loss": 0.030903542414307594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.090354221058078e-05, "grad_norm": 18.197952270507812, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8681461215019226, "num_tokens": 567801423.0, "step": 14886 }, { "epoch": 1.893779417376924, "ewc_loss": 0.030903775244951248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0903775041224435e-05, "grad_norm": 18.17317771911621, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8603605628013611, "num_tokens": 567844614.0, "step": 14887 }, { "epoch": 1.8939066276555145, "ewc_loss": 0.030895190313458443, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.08951894112397e-05, "grad_norm": 18.194387435913086, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8724480867385864, "num_tokens": 567878803.0, "step": 14888 }, { "epoch": 1.894033837934105, "ewc_loss": 0.03089943900704384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.089943857048638e-05, "grad_norm": 18.182180404663086, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.8820528388023376, "num_tokens": 567909419.0, "step": 14889 }, { "epoch": 1.8941610482126956, "ewc_loss": 0.030878404155373573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.087840377702378e-05, "grad_norm": 18.180959701538086, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8705345392227173, "num_tokens": 567949221.0, "step": 14890 }, { "epoch": 1.8942882584912861, "ewc_loss": 0.030879179015755653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.087917866650969e-05, "grad_norm": 18.117523193359375, "learning_rate": 1e-06, "loss": 0.3542, "mean_token_accuracy": 0.8862380981445312, "num_tokens": 567985101.0, "step": 14891 }, { "epoch": 1.8944154687698767, "ewc_loss": 0.030927181243896484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.092718179686926e-05, "grad_norm": 18.236730575561523, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8527055978775024, "num_tokens": 568030542.0, "step": 14892 }, { "epoch": 1.8945426790484672, "ewc_loss": 0.030953243374824524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.095324427704327e-05, "grad_norm": 18.111862182617188, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8722338676452637, "num_tokens": 568070723.0, "step": 14893 }, { "epoch": 1.8946698893270577, "ewc_loss": 0.030869392678141594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.086939250351861e-05, "grad_norm": 18.248748779296875, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8621782064437866, "num_tokens": 568113444.0, "step": 14894 }, { "epoch": 1.8947970996056482, "ewc_loss": 0.03096490353345871, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096490399912e-05, "grad_norm": 18.12955665588379, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8731511235237122, "num_tokens": 568152646.0, "step": 14895 }, { "epoch": 1.8949243098842388, "ewc_loss": 0.03084571659564972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.084571653744206e-05, "grad_norm": 18.240018844604492, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8703120946884155, "num_tokens": 568187046.0, "step": 14896 }, { "epoch": 1.8950515201628293, "ewc_loss": 0.031025974079966545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1025974749354646e-05, "grad_norm": 18.172958374023438, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8654575347900391, "num_tokens": 568225052.0, "step": 14897 }, { "epoch": 1.8951787304414198, "ewc_loss": 0.030863698571920395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.086369906668551e-05, "grad_norm": 18.212444305419922, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8615638017654419, "num_tokens": 568258299.0, "step": 14898 }, { "epoch": 1.8953059407200101, "ewc_loss": 0.031000353395938873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.10003524646163e-05, "grad_norm": 18.20026397705078, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8754845261573792, "num_tokens": 568297137.0, "step": 14899 }, { "epoch": 1.8954331509986007, "ewc_loss": 0.030868519097566605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.086851938860491e-05, "grad_norm": 18.15179443359375, "learning_rate": 1e-06, "loss": 0.3354, "mean_token_accuracy": 0.8904644250869751, "num_tokens": 568331102.0, "step": 14900 }, { "epoch": 1.8955603612771912, "ewc_loss": 0.030957471579313278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.095747160841711e-05, "grad_norm": 18.189117431640625, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8674378395080566, "num_tokens": 568373926.0, "step": 14901 }, { "epoch": 1.8956875715557817, "ewc_loss": 0.030929775908589363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.092977567575872e-05, "grad_norm": 18.149383544921875, "learning_rate": 1e-06, "loss": 0.3767, "mean_token_accuracy": 0.878330647945404, "num_tokens": 568406470.0, "step": 14902 }, { "epoch": 1.8958147818343722, "ewc_loss": 0.03090246580541134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.090246536885388e-05, "grad_norm": 18.209360122680664, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8772064447402954, "num_tokens": 568443452.0, "step": 14903 }, { "epoch": 1.8959419921129628, "ewc_loss": 0.030981652438640594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098165325354785e-05, "grad_norm": 18.11321258544922, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8647335767745972, "num_tokens": 568480085.0, "step": 14904 }, { "epoch": 1.896069202391553, "ewc_loss": 0.03098115138709545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098115121247247e-05, "grad_norm": 18.297523498535156, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8663918972015381, "num_tokens": 568519191.0, "step": 14905 }, { "epoch": 1.8961964126701436, "ewc_loss": 0.031023239716887474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.102323898929171e-05, "grad_norm": 18.161903381347656, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8757167458534241, "num_tokens": 568554224.0, "step": 14906 }, { "epoch": 1.8963236229487341, "ewc_loss": 0.030931031331419945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0931030778447166e-05, "grad_norm": 18.194414138793945, "learning_rate": 1e-06, "loss": 0.3512, "mean_token_accuracy": 0.8866322040557861, "num_tokens": 568586102.0, "step": 14907 }, { "epoch": 1.8964508332273247, "ewc_loss": 0.03099980391561985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.099980312981643e-05, "grad_norm": 18.14080047607422, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8648465871810913, "num_tokens": 568625224.0, "step": 14908 }, { "epoch": 1.8965780435059152, "ewc_loss": 0.030974891036748886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0974890250945464e-05, "grad_norm": 18.15242576599121, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8491095304489136, "num_tokens": 568665588.0, "step": 14909 }, { "epoch": 1.8967052537845057, "ewc_loss": 0.031040553003549576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.104055213043466e-05, "grad_norm": 18.174644470214844, "learning_rate": 1e-06, "loss": 0.3548, "mean_token_accuracy": 0.8850662708282471, "num_tokens": 568698408.0, "step": 14910 }, { "epoch": 1.8968324640630962, "ewc_loss": 0.030978666618466377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0978666472947225e-05, "grad_norm": 18.210494995117188, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8581369519233704, "num_tokens": 568731982.0, "step": 14911 }, { "epoch": 1.8969596743416868, "ewc_loss": 0.031032519415020943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1032519473228604e-05, "grad_norm": 18.136093139648438, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8690929412841797, "num_tokens": 568769388.0, "step": 14912 }, { "epoch": 1.8970868846202773, "ewc_loss": 0.030978156253695488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.097815715591423e-05, "grad_norm": 18.227819442749023, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8768240213394165, "num_tokens": 568810329.0, "step": 14913 }, { "epoch": 1.8972140948988678, "ewc_loss": 0.031036412343382835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.103641211055219e-05, "grad_norm": 18.162179946899414, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8746699094772339, "num_tokens": 568852241.0, "step": 14914 }, { "epoch": 1.8973413051774584, "ewc_loss": 0.03098301775753498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098301749560051e-05, "grad_norm": 18.210500717163086, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8651615381240845, "num_tokens": 568889660.0, "step": 14915 }, { "epoch": 1.8974685154560489, "ewc_loss": 0.031026072800159454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.102607297478244e-05, "grad_norm": 18.142759323120117, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8679286241531372, "num_tokens": 568934489.0, "step": 14916 }, { "epoch": 1.8975957257346394, "ewc_loss": 0.030988922342658043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0988921935204417e-05, "grad_norm": 18.203264236450195, "learning_rate": 1e-06, "loss": 0.3329, "mean_token_accuracy": 0.8915446996688843, "num_tokens": 568971904.0, "step": 14917 }, { "epoch": 1.89772293601323, "ewc_loss": 0.03103926032781601, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1039260647958145e-05, "grad_norm": 18.250347137451172, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8518263697624207, "num_tokens": 569011863.0, "step": 14918 }, { "epoch": 1.8978501462918205, "ewc_loss": 0.03100169077515602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.100169124081731e-05, "grad_norm": 18.207508087158203, "learning_rate": 1e-06, "loss": 0.3477, "mean_token_accuracy": 0.8914192914962769, "num_tokens": 569050295.0, "step": 14919 }, { "epoch": 1.897977356570411, "ewc_loss": 0.03101137839257717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1011379178380594e-05, "grad_norm": 18.270994186401367, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8634039759635925, "num_tokens": 569087013.0, "step": 14920 }, { "epoch": 1.8981045668490015, "ewc_loss": 0.03099580854177475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.099580862908624e-05, "grad_norm": 18.19095230102539, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8635579943656921, "num_tokens": 569125226.0, "step": 14921 }, { "epoch": 1.898231777127592, "ewc_loss": 0.03100407123565674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1004070478957146e-05, "grad_norm": 18.25698471069336, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8664730191230774, "num_tokens": 569161910.0, "step": 14922 }, { "epoch": 1.8983589874061826, "ewc_loss": 0.03097488358616829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.097488297498785e-05, "grad_norm": 18.1722412109375, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8725447654724121, "num_tokens": 569196519.0, "step": 14923 }, { "epoch": 1.898486197684773, "ewc_loss": 0.030982526019215584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098252636846155e-05, "grad_norm": 18.287525177001953, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8654966950416565, "num_tokens": 569234404.0, "step": 14924 }, { "epoch": 1.8986134079633634, "ewc_loss": 0.030978597700595856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.097859735134989e-05, "grad_norm": 18.153438568115234, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8613506555557251, "num_tokens": 569272728.0, "step": 14925 }, { "epoch": 1.898740618241954, "ewc_loss": 0.030910611152648926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.091061080340296e-05, "grad_norm": 18.23065757751465, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8616803884506226, "num_tokens": 569315434.0, "step": 14926 }, { "epoch": 1.8988678285205445, "ewc_loss": 0.030921604484319687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.092160477535799e-05, "grad_norm": 18.156862258911133, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8540287017822266, "num_tokens": 569357473.0, "step": 14927 }, { "epoch": 1.898995038799135, "ewc_loss": 0.030947506427764893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0947507184464484e-05, "grad_norm": 18.23940658569336, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8579398393630981, "num_tokens": 569398747.0, "step": 14928 }, { "epoch": 1.8991222490777255, "ewc_loss": 0.030929189175367355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.092918996117078e-05, "grad_norm": 18.168184280395508, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8765764832496643, "num_tokens": 569434855.0, "step": 14929 }, { "epoch": 1.8992494593563158, "ewc_loss": 0.030928004533052444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0928003980079666e-05, "grad_norm": 18.140260696411133, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8699899911880493, "num_tokens": 569475236.0, "step": 14930 }, { "epoch": 1.8993766696349064, "ewc_loss": 0.030941825360059738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.094182466156781e-05, "grad_norm": 18.224098205566406, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8685336112976074, "num_tokens": 569513599.0, "step": 14931 }, { "epoch": 1.899503879913497, "ewc_loss": 0.030923834070563316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.092383485636674e-05, "grad_norm": 18.15458869934082, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8764630556106567, "num_tokens": 569559059.0, "step": 14932 }, { "epoch": 1.8996310901920874, "ewc_loss": 0.03097955323755741, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0979554139776155e-05, "grad_norm": 18.2622127532959, "learning_rate": 1e-06, "loss": 0.3648, "mean_token_accuracy": 0.8874239921569824, "num_tokens": 569595240.0, "step": 14933 }, { "epoch": 1.899758300470678, "ewc_loss": 0.030922409147024155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.092240876867436e-05, "grad_norm": 18.17012596130371, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8591141700744629, "num_tokens": 569629336.0, "step": 14934 }, { "epoch": 1.8998855107492685, "ewc_loss": 0.03088999353349209, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.088999437750317e-05, "grad_norm": 18.23130989074707, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8807195425033569, "num_tokens": 569663613.0, "step": 14935 }, { "epoch": 1.900012721027859, "ewc_loss": 0.031043874099850655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.104387360508554e-05, "grad_norm": 18.17804718017578, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8540446162223816, "num_tokens": 569702884.0, "step": 14936 }, { "epoch": 1.9001399313064495, "ewc_loss": 0.030916668474674225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.091666803811677e-05, "grad_norm": 18.181259155273438, "learning_rate": 1e-06, "loss": 0.3577, "mean_token_accuracy": 0.8843647837638855, "num_tokens": 569744618.0, "step": 14937 }, { "epoch": 1.90026714158504, "ewc_loss": 0.03099665604531765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0996656278148293e-05, "grad_norm": 18.197099685668945, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8781638145446777, "num_tokens": 569777083.0, "step": 14938 }, { "epoch": 1.9003943518636306, "ewc_loss": 0.03093012422323227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.09301249217242e-05, "grad_norm": 18.151790618896484, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8767669200897217, "num_tokens": 569809652.0, "step": 14939 }, { "epoch": 1.9005215621422211, "ewc_loss": 0.031003855168819427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.100385583820753e-05, "grad_norm": 18.216407775878906, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8753050565719604, "num_tokens": 569846465.0, "step": 14940 }, { "epoch": 1.9006487724208116, "ewc_loss": 0.030960239470005035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0960240110289305e-05, "grad_norm": 18.165998458862305, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8635002374649048, "num_tokens": 569883655.0, "step": 14941 }, { "epoch": 1.9007759826994022, "ewc_loss": 0.030962662771344185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096266300417483e-05, "grad_norm": 18.166536331176758, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.868273138999939, "num_tokens": 569923358.0, "step": 14942 }, { "epoch": 1.9009031929779927, "ewc_loss": 0.031001541763544083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.100154208368622e-05, "grad_norm": 18.227630615234375, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8666960000991821, "num_tokens": 569961393.0, "step": 14943 }, { "epoch": 1.9010304032565832, "ewc_loss": 0.03093874640762806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.093874693149701e-05, "grad_norm": 18.11564826965332, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8427289724349976, "num_tokens": 570002092.0, "step": 14944 }, { "epoch": 1.9011576135351738, "ewc_loss": 0.030959704890847206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.095970532740466e-05, "grad_norm": 18.252927780151367, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8554030656814575, "num_tokens": 570037699.0, "step": 14945 }, { "epoch": 1.9012848238137643, "ewc_loss": 0.031014740467071533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1014740670798346e-05, "grad_norm": 18.217164993286133, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8472819328308105, "num_tokens": 570077693.0, "step": 14946 }, { "epoch": 1.9014120340923548, "ewc_loss": 0.030919890850782394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.091989128733985e-05, "grad_norm": 18.14616584777832, "learning_rate": 1e-06, "loss": 0.3542, "mean_token_accuracy": 0.8891716599464417, "num_tokens": 570115354.0, "step": 14947 }, { "epoch": 1.9015392443709451, "ewc_loss": 0.03105079010128975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.105078940279782e-05, "grad_norm": 18.206872940063477, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.877138614654541, "num_tokens": 570154387.0, "step": 14948 }, { "epoch": 1.9016664546495357, "ewc_loss": 0.030973872169852257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.097387161687948e-05, "grad_norm": 18.15447998046875, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8652235269546509, "num_tokens": 570189743.0, "step": 14949 }, { "epoch": 1.9017936649281262, "ewc_loss": 0.030926775187253952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.092677434324287e-05, "grad_norm": 18.122907638549805, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8819005489349365, "num_tokens": 570224009.0, "step": 14950 }, { "epoch": 1.9019208752067167, "ewc_loss": 0.03102775663137436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.102775735897012e-05, "grad_norm": 18.156164169311523, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8763737678527832, "num_tokens": 570266703.0, "step": 14951 }, { "epoch": 1.9020480854853072, "ewc_loss": 0.031041095033288002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.104109418927692e-05, "grad_norm": 18.207448959350586, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8699917197227478, "num_tokens": 570308241.0, "step": 14952 }, { "epoch": 1.9021752957638978, "ewc_loss": 0.031078336760401726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.107833617832512e-05, "grad_norm": 18.2470760345459, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8699597120285034, "num_tokens": 570346931.0, "step": 14953 }, { "epoch": 1.902302506042488, "ewc_loss": 0.030984599143266678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098460001638159e-05, "grad_norm": 18.166414260864258, "learning_rate": 1e-06, "loss": 0.3596, "mean_token_accuracy": 0.8837595582008362, "num_tokens": 570380373.0, "step": 14954 }, { "epoch": 1.9024297163210786, "ewc_loss": 0.030982835218310356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098283559666015e-05, "grad_norm": 18.179462432861328, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8815451860427856, "num_tokens": 570420106.0, "step": 14955 }, { "epoch": 1.9025569265996691, "ewc_loss": 0.03093811683356762, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.093811756116338e-05, "grad_norm": 18.1868839263916, "learning_rate": 1e-06, "loss": 0.3449, "mean_token_accuracy": 0.8916859030723572, "num_tokens": 570464534.0, "step": 14956 }, { "epoch": 1.9026841368782597, "ewc_loss": 0.030994301661849022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0994302505860105e-05, "grad_norm": 18.15641975402832, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8591284155845642, "num_tokens": 570505734.0, "step": 14957 }, { "epoch": 1.9028113471568502, "ewc_loss": 0.030946336686611176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.09463357552886e-05, "grad_norm": 18.219640731811523, "learning_rate": 1e-06, "loss": 0.3694, "mean_token_accuracy": 0.8794865608215332, "num_tokens": 570545913.0, "step": 14958 }, { "epoch": 1.9029385574354407, "ewc_loss": 0.03099360689520836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.099360765190795e-05, "grad_norm": 18.25287437438965, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.87689208984375, "num_tokens": 570584397.0, "step": 14959 }, { "epoch": 1.9030657677140312, "ewc_loss": 0.030966049060225487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096604996244423e-05, "grad_norm": 18.175363540649414, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8757510185241699, "num_tokens": 570622285.0, "step": 14960 }, { "epoch": 1.9031929779926218, "ewc_loss": 0.03090187907218933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.090187965426594e-05, "grad_norm": 18.24645233154297, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8547418117523193, "num_tokens": 570656331.0, "step": 14961 }, { "epoch": 1.9033201882712123, "ewc_loss": 0.030961276963353157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0961276934249327e-05, "grad_norm": 18.192964553833008, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8742725253105164, "num_tokens": 570699944.0, "step": 14962 }, { "epoch": 1.9034473985498028, "ewc_loss": 0.0309408251196146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.094082421739586e-05, "grad_norm": 18.213151931762695, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8642879128456116, "num_tokens": 570737649.0, "step": 14963 }, { "epoch": 1.9035746088283934, "ewc_loss": 0.030909337103366852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.090933751082048e-05, "grad_norm": 18.138282775878906, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8674212694168091, "num_tokens": 570773193.0, "step": 14964 }, { "epoch": 1.9037018191069839, "ewc_loss": 0.03092316910624504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.092316910624504e-05, "grad_norm": 18.2230281829834, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8638123273849487, "num_tokens": 570809117.0, "step": 14965 }, { "epoch": 1.9038290293855744, "ewc_loss": 0.030948501080274582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.094850035267882e-05, "grad_norm": 18.131275177001953, "learning_rate": 1e-06, "loss": 0.372, "mean_token_accuracy": 0.882493257522583, "num_tokens": 570844268.0, "step": 14966 }, { "epoch": 1.903956239664165, "ewc_loss": 0.030888641253113747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0888641049386933e-05, "grad_norm": 18.20783805847168, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8619341850280762, "num_tokens": 570890166.0, "step": 14967 }, { "epoch": 1.9040834499427555, "ewc_loss": 0.030972685664892197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0972685635788366e-05, "grad_norm": 18.236522674560547, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8654807806015015, "num_tokens": 570921419.0, "step": 14968 }, { "epoch": 1.904210660221346, "ewc_loss": 0.030949926003813744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.09499264403712e-05, "grad_norm": 18.2120304107666, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8598026037216187, "num_tokens": 570966444.0, "step": 14969 }, { "epoch": 1.9043378704999365, "ewc_loss": 0.03090631030499935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.090631071245298e-05, "grad_norm": 18.216753005981445, "learning_rate": 1e-06, "loss": 0.3822, "mean_token_accuracy": 0.879928469657898, "num_tokens": 571011656.0, "step": 14970 }, { "epoch": 1.904465080778527, "ewc_loss": 0.030974581837654114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.097458102274686e-05, "grad_norm": 18.26902961730957, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8742038011550903, "num_tokens": 571047828.0, "step": 14971 }, { "epoch": 1.9045922910571176, "ewc_loss": 0.03090033307671547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0900333513272926e-05, "grad_norm": 18.181495666503906, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8696265816688538, "num_tokens": 571084541.0, "step": 14972 }, { "epoch": 1.9047195013357079, "ewc_loss": 0.030915725976228714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.091572580160573e-05, "grad_norm": 18.22296142578125, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8645423650741577, "num_tokens": 571120978.0, "step": 14973 }, { "epoch": 1.9048467116142984, "ewc_loss": 0.03096417337656021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096417276537977e-05, "grad_norm": 18.212711334228516, "learning_rate": 1e-06, "loss": 0.3629, "mean_token_accuracy": 0.8838868141174316, "num_tokens": 571160181.0, "step": 14974 }, { "epoch": 1.904973921892889, "ewc_loss": 0.03093431517481804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.093431587330997e-05, "grad_norm": 18.303083419799805, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8752711415290833, "num_tokens": 571194648.0, "step": 14975 }, { "epoch": 1.9051011321714795, "ewc_loss": 0.03090693987905979, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0906940082786605e-05, "grad_norm": 18.118709564208984, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8655296564102173, "num_tokens": 571238449.0, "step": 14976 }, { "epoch": 1.90522834245007, "ewc_loss": 0.0308525487780571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0852548661641777e-05, "grad_norm": 18.195934295654297, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8711069822311401, "num_tokens": 571277266.0, "step": 14977 }, { "epoch": 1.9053555527286605, "ewc_loss": 0.03094024956226349, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0940249416744336e-05, "grad_norm": 18.249855041503906, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8812559843063354, "num_tokens": 571316891.0, "step": 14978 }, { "epoch": 1.9054827630072508, "ewc_loss": 0.030925478786230087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0925479222787544e-05, "grad_norm": 18.261232376098633, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8713866472244263, "num_tokens": 571353097.0, "step": 14979 }, { "epoch": 1.9056099732858414, "ewc_loss": 0.03089940920472145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0899409466655925e-05, "grad_norm": 18.14106559753418, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.884445309638977, "num_tokens": 571392217.0, "step": 14980 }, { "epoch": 1.905737183564432, "ewc_loss": 0.03089798428118229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0897983378963545e-05, "grad_norm": 18.209430694580078, "learning_rate": 1e-06, "loss": 0.3761, "mean_token_accuracy": 0.8780057430267334, "num_tokens": 571430440.0, "step": 14981 }, { "epoch": 1.9058643938430224, "ewc_loss": 0.03089258074760437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0892580980435014e-05, "grad_norm": 18.118249893188477, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8664853572845459, "num_tokens": 571472002.0, "step": 14982 }, { "epoch": 1.905991604121613, "ewc_loss": 0.03091568872332573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.091568942181766e-05, "grad_norm": 18.27128028869629, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8507068157196045, "num_tokens": 571509368.0, "step": 14983 }, { "epoch": 1.9061188144002035, "ewc_loss": 0.030944211408495903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.094421117566526e-05, "grad_norm": 18.17216682434082, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8558767437934875, "num_tokens": 571543898.0, "step": 14984 }, { "epoch": 1.906246024678794, "ewc_loss": 0.030889365822076797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0889365007169545e-05, "grad_norm": 18.241579055786133, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8639253377914429, "num_tokens": 571584395.0, "step": 14985 }, { "epoch": 1.9063732349573845, "ewc_loss": 0.03096555918455124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0965558835305274e-05, "grad_norm": 18.261388778686523, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8699600696563721, "num_tokens": 571624792.0, "step": 14986 }, { "epoch": 1.906500445235975, "ewc_loss": 0.030949871987104416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0949871870689094e-05, "grad_norm": 18.174137115478516, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8721662163734436, "num_tokens": 571667197.0, "step": 14987 }, { "epoch": 1.9066276555145656, "ewc_loss": 0.030900031328201294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.090003156103194e-05, "grad_norm": 18.240711212158203, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8640753030776978, "num_tokens": 571703648.0, "step": 14988 }, { "epoch": 1.9067548657931561, "ewc_loss": 0.03092390112578869, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0923900339985266e-05, "grad_norm": 18.242294311523438, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8620993494987488, "num_tokens": 571738980.0, "step": 14989 }, { "epoch": 1.9068820760717466, "ewc_loss": 0.030941829085350037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0941828299546614e-05, "grad_norm": 18.19121742248535, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.872613251209259, "num_tokens": 571774540.0, "step": 14990 }, { "epoch": 1.9070092863503372, "ewc_loss": 0.030966609716415405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096661021118052e-05, "grad_norm": 18.243267059326172, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8758556842803955, "num_tokens": 571805286.0, "step": 14991 }, { "epoch": 1.9071364966289277, "ewc_loss": 0.03093772381544113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0937724659452215e-05, "grad_norm": 18.259687423706055, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8636810779571533, "num_tokens": 571847936.0, "step": 14992 }, { "epoch": 1.9072637069075182, "ewc_loss": 0.030966853722929955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.09668539557606e-05, "grad_norm": 18.204578399658203, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8666177988052368, "num_tokens": 571885454.0, "step": 14993 }, { "epoch": 1.9073909171861088, "ewc_loss": 0.030960291624069214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0960291041992605e-05, "grad_norm": 18.215871810913086, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8684014678001404, "num_tokens": 571924870.0, "step": 14994 }, { "epoch": 1.9075181274646993, "ewc_loss": 0.03091672621667385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.091672624577768e-05, "grad_norm": 18.25844383239746, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8497909307479858, "num_tokens": 571957870.0, "step": 14995 }, { "epoch": 1.9076453377432898, "ewc_loss": 0.030932512134313583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.093251143582165e-05, "grad_norm": 18.246362686157227, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8630534410476685, "num_tokens": 571993645.0, "step": 14996 }, { "epoch": 1.9077725480218801, "ewc_loss": 0.030917298048734665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0917297408450395e-05, "grad_norm": 18.155445098876953, "learning_rate": 1e-06, "loss": 0.3799, "mean_token_accuracy": 0.8790297508239746, "num_tokens": 572028594.0, "step": 14997 }, { "epoch": 1.9078997583004706, "ewc_loss": 0.030896412208676338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.089641177211888e-05, "grad_norm": 18.194246292114258, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8437191247940063, "num_tokens": 572066971.0, "step": 14998 }, { "epoch": 1.9080269685790612, "ewc_loss": 0.030964406207203865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0964405596023425e-05, "grad_norm": 18.2286319732666, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8732147216796875, "num_tokens": 572107609.0, "step": 14999 }, { "epoch": 1.9081541788576517, "ewc_loss": 0.030974198132753372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0974199034972116e-05, "grad_norm": 18.20016098022461, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8782472610473633, "num_tokens": 572146940.0, "step": 15000 }, { "epoch": 1.9082813891362422, "ewc_loss": 0.03095209039747715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.095209103776142e-05, "grad_norm": 18.239866256713867, "learning_rate": 1e-06, "loss": 0.3291, "mean_token_accuracy": 0.8951030969619751, "num_tokens": 572184010.0, "step": 15001 }, { "epoch": 1.9084085994148328, "ewc_loss": 0.030978934839367867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.097893568337895e-05, "grad_norm": 18.250635147094727, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8532603979110718, "num_tokens": 572223184.0, "step": 15002 }, { "epoch": 1.908535809693423, "ewc_loss": 0.030958127230405807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0958126444602385e-05, "grad_norm": 18.195518493652344, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8706026077270508, "num_tokens": 572261770.0, "step": 15003 }, { "epoch": 1.9086630199720136, "ewc_loss": 0.030964162200689316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096416185144335e-05, "grad_norm": 18.226770401000977, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8725813627243042, "num_tokens": 572303229.0, "step": 15004 }, { "epoch": 1.9087902302506041, "ewc_loss": 0.03098895400762558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098895467701368e-05, "grad_norm": 18.238065719604492, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.857703685760498, "num_tokens": 572337839.0, "step": 15005 }, { "epoch": 1.9089174405291947, "ewc_loss": 0.030995739623904228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0995739507488906e-05, "grad_norm": 18.147064208984375, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.87024986743927, "num_tokens": 572373604.0, "step": 15006 }, { "epoch": 1.9090446508077852, "ewc_loss": 0.030992312356829643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0992312531452626e-05, "grad_norm": 18.29388427734375, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8774526119232178, "num_tokens": 572410425.0, "step": 15007 }, { "epoch": 1.9091718610863757, "ewc_loss": 0.031087340787053108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.108734017587267e-05, "grad_norm": 18.21396255493164, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8792890310287476, "num_tokens": 572443998.0, "step": 15008 }, { "epoch": 1.9092990713649662, "ewc_loss": 0.030932005494832993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.093200575676747e-05, "grad_norm": 18.241806030273438, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8573868274688721, "num_tokens": 572485337.0, "step": 15009 }, { "epoch": 1.9094262816435568, "ewc_loss": 0.03106318786740303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.106318763457239e-05, "grad_norm": 18.266489028930664, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.858534574508667, "num_tokens": 572528273.0, "step": 15010 }, { "epoch": 1.9095534919221473, "ewc_loss": 0.03099602274596691, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.099602326983586e-05, "grad_norm": 18.26580047607422, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8724504709243774, "num_tokens": 572569467.0, "step": 15011 }, { "epoch": 1.9096807022007378, "ewc_loss": 0.031026072800159454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.102607297478244e-05, "grad_norm": 18.255237579345703, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8777163028717041, "num_tokens": 572609626.0, "step": 15012 }, { "epoch": 1.9098079124793284, "ewc_loss": 0.030991705134510994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.099170498899184e-05, "grad_norm": 18.19685935974121, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8533660173416138, "num_tokens": 572652907.0, "step": 15013 }, { "epoch": 1.9099351227579189, "ewc_loss": 0.030964449048042297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096444925176911e-05, "grad_norm": 18.200040817260742, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8742662668228149, "num_tokens": 572687460.0, "step": 15014 }, { "epoch": 1.9100623330365094, "ewc_loss": 0.031022774055600166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1022773328004405e-05, "grad_norm": 18.23818016052246, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8722357749938965, "num_tokens": 572723937.0, "step": 15015 }, { "epoch": 1.9101895433151, "ewc_loss": 0.03101174347102642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.10117429762613e-05, "grad_norm": 18.238954544067383, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8658499717712402, "num_tokens": 572764929.0, "step": 15016 }, { "epoch": 1.9103167535936905, "ewc_loss": 0.0309926625341177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.099266177741811e-05, "grad_norm": 18.17011070251465, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8669983744621277, "num_tokens": 572802545.0, "step": 15017 }, { "epoch": 1.910443963872281, "ewc_loss": 0.030987858772277832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0987859645392746e-05, "grad_norm": 18.20770835876465, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8642609715461731, "num_tokens": 572850008.0, "step": 15018 }, { "epoch": 1.9105711741508715, "ewc_loss": 0.03100387379527092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1003874028101563e-05, "grad_norm": 18.13111114501953, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8774434328079224, "num_tokens": 572888968.0, "step": 15019 }, { "epoch": 1.910698384429462, "ewc_loss": 0.030963463708758354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096346335951239e-05, "grad_norm": 18.180694580078125, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.869083046913147, "num_tokens": 572926314.0, "step": 15020 }, { "epoch": 1.9108255947080524, "ewc_loss": 0.031041355803608894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.104135612375103e-05, "grad_norm": 18.139665603637695, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8733412027359009, "num_tokens": 572961789.0, "step": 15021 }, { "epoch": 1.9109528049866429, "ewc_loss": 0.03094634599983692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.094634666922502e-05, "grad_norm": 18.26058578491211, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8708083629608154, "num_tokens": 572993680.0, "step": 15022 }, { "epoch": 1.9110800152652334, "ewc_loss": 0.031014205887913704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1014205887913704e-05, "grad_norm": 18.13880157470703, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8717470169067383, "num_tokens": 573024003.0, "step": 15023 }, { "epoch": 1.911207225543824, "ewc_loss": 0.030947886407375336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.094788553426042e-05, "grad_norm": 18.298927307128906, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8784891366958618, "num_tokens": 573055138.0, "step": 15024 }, { "epoch": 1.9113344358224145, "ewc_loss": 0.031053703278303146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.10537034238223e-05, "grad_norm": 18.161890029907227, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8749666213989258, "num_tokens": 573092373.0, "step": 15025 }, { "epoch": 1.911461646101005, "ewc_loss": 0.031026335433125496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.102633490925655e-05, "grad_norm": 18.305858612060547, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8650283813476562, "num_tokens": 573131486.0, "step": 15026 }, { "epoch": 1.9115888563795955, "ewc_loss": 0.031068043783307076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1068044336279854e-05, "grad_norm": 18.20448112487793, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8791121244430542, "num_tokens": 573170114.0, "step": 15027 }, { "epoch": 1.9117160666581858, "ewc_loss": 0.03097989223897457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0979892471805215e-05, "grad_norm": 18.302398681640625, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8735330104827881, "num_tokens": 573200906.0, "step": 15028 }, { "epoch": 1.9118432769367764, "ewc_loss": 0.03104451298713684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1044513889355585e-05, "grad_norm": 18.21897315979004, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8573532104492188, "num_tokens": 573243731.0, "step": 15029 }, { "epoch": 1.9119704872153669, "ewc_loss": 0.03102813847362995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1028139346744865e-05, "grad_norm": 18.35497283935547, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8763284683227539, "num_tokens": 573283335.0, "step": 15030 }, { "epoch": 1.9120976974939574, "ewc_loss": 0.031008362770080566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1008363293949515e-05, "grad_norm": 18.21144676208496, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8569380044937134, "num_tokens": 573326628.0, "step": 15031 }, { "epoch": 1.912224907772548, "ewc_loss": 0.031025178730487823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.102517803199589e-05, "grad_norm": 18.22770881652832, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.875965416431427, "num_tokens": 573369025.0, "step": 15032 }, { "epoch": 1.9123521180511385, "ewc_loss": 0.0310466680675745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.104666757280938e-05, "grad_norm": 18.210540771484375, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8636789917945862, "num_tokens": 573411923.0, "step": 15033 }, { "epoch": 1.912479328329729, "ewc_loss": 0.031029706820845604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.102970731561072e-05, "grad_norm": 18.2731990814209, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8655178546905518, "num_tokens": 573450455.0, "step": 15034 }, { "epoch": 1.9126065386083195, "ewc_loss": 0.031018782407045364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1018782465253025e-05, "grad_norm": 18.25543212890625, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8610529899597168, "num_tokens": 573492886.0, "step": 15035 }, { "epoch": 1.91273374888691, "ewc_loss": 0.030981700867414474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098170054727234e-05, "grad_norm": 18.238800048828125, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8744188547134399, "num_tokens": 573532073.0, "step": 15036 }, { "epoch": 1.9128609591655006, "ewc_loss": 0.030995888635516167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.099588866462e-05, "grad_norm": 18.235179901123047, "learning_rate": 1e-06, "loss": 0.3564, "mean_token_accuracy": 0.885353684425354, "num_tokens": 573573632.0, "step": 15037 }, { "epoch": 1.9129881694440911, "ewc_loss": 0.0309233870357275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0923387384973466e-05, "grad_norm": 18.224924087524414, "learning_rate": 1e-06, "loss": 0.3723, "mean_token_accuracy": 0.8792914152145386, "num_tokens": 573610194.0, "step": 15038 }, { "epoch": 1.9131153797226816, "ewc_loss": 0.030970262363553047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.097026274190284e-05, "grad_norm": 18.242265701293945, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8624334335327148, "num_tokens": 573650237.0, "step": 15039 }, { "epoch": 1.9132425900012722, "ewc_loss": 0.030952082946896553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0952083761803806e-05, "grad_norm": 18.26250457763672, "learning_rate": 1e-06, "loss": 0.3995, "mean_token_accuracy": 0.8735243082046509, "num_tokens": 573686859.0, "step": 15040 }, { "epoch": 1.9133698002798627, "ewc_loss": 0.03100089728832245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.100089816143736e-05, "grad_norm": 18.226228713989258, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8704798817634583, "num_tokens": 573726770.0, "step": 15041 }, { "epoch": 1.9134970105584532, "ewc_loss": 0.030922234058380127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.092223414569162e-05, "grad_norm": 18.178367614746094, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8645709753036499, "num_tokens": 573768668.0, "step": 15042 }, { "epoch": 1.9136242208370438, "ewc_loss": 0.030932802706956863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.093280247412622e-05, "grad_norm": 18.182680130004883, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8602568507194519, "num_tokens": 573807880.0, "step": 15043 }, { "epoch": 1.9137514311156343, "ewc_loss": 0.03090040199458599, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.090040263487026e-05, "grad_norm": 18.15479278564453, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8678090572357178, "num_tokens": 573847018.0, "step": 15044 }, { "epoch": 1.9138786413942248, "ewc_loss": 0.03091384470462799, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0913844966562465e-05, "grad_norm": 18.18754768371582, "learning_rate": 1e-06, "loss": 0.3602, "mean_token_accuracy": 0.8854669332504272, "num_tokens": 573881193.0, "step": 15045 }, { "epoch": 1.9140058516728151, "ewc_loss": 0.031012164428830147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1012164981802925e-05, "grad_norm": 18.240985870361328, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.8780367374420166, "num_tokens": 573918457.0, "step": 15046 }, { "epoch": 1.9141330619514056, "ewc_loss": 0.030933227390050888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.093322811764665e-05, "grad_norm": 18.178958892822266, "learning_rate": 1e-06, "loss": 0.3465, "mean_token_accuracy": 0.888548731803894, "num_tokens": 573958818.0, "step": 15047 }, { "epoch": 1.9142602722299962, "ewc_loss": 0.030974585562944412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.097458466072567e-05, "grad_norm": 18.307479858398438, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8644008636474609, "num_tokens": 573996735.0, "step": 15048 }, { "epoch": 1.9143874825085867, "ewc_loss": 0.03090912289917469, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.090912287007086e-05, "grad_norm": 18.13473129272461, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8735420107841492, "num_tokens": 574035005.0, "step": 15049 }, { "epoch": 1.9145146927871772, "ewc_loss": 0.03089461848139763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0894618248566985e-05, "grad_norm": 18.223388671875, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8689810633659363, "num_tokens": 574077817.0, "step": 15050 }, { "epoch": 1.9146419030657678, "ewc_loss": 0.031025294214487076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.102529444731772e-05, "grad_norm": 18.25863265991211, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8778053522109985, "num_tokens": 574113083.0, "step": 15051 }, { "epoch": 1.914769113344358, "ewc_loss": 0.030892016366124153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0892017093719915e-05, "grad_norm": 18.201326370239258, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8739882111549377, "num_tokens": 574147324.0, "step": 15052 }, { "epoch": 1.9148963236229486, "ewc_loss": 0.030937040224671364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.093704071943648e-05, "grad_norm": 18.20682716369629, "learning_rate": 1e-06, "loss": 0.3592, "mean_token_accuracy": 0.8849290609359741, "num_tokens": 574186108.0, "step": 15053 }, { "epoch": 1.9150235339015391, "ewc_loss": 0.03095947578549385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0959476134739816e-05, "grad_norm": 18.3065242767334, "learning_rate": 1e-06, "loss": 0.3645, "mean_token_accuracy": 0.8826619386672974, "num_tokens": 574218076.0, "step": 15054 }, { "epoch": 1.9151507441801296, "ewc_loss": 0.030978616327047348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0978615541243926e-05, "grad_norm": 18.10744857788086, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8803970813751221, "num_tokens": 574256416.0, "step": 15055 }, { "epoch": 1.9152779544587202, "ewc_loss": 0.030930738896131516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.09307397401426e-05, "grad_norm": 18.22159194946289, "learning_rate": 1e-06, "loss": 0.3668, "mean_token_accuracy": 0.8837419748306274, "num_tokens": 574295262.0, "step": 15056 }, { "epoch": 1.9154051647373107, "ewc_loss": 0.031008534133434296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.100853427895345e-05, "grad_norm": 18.17953872680664, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8596562147140503, "num_tokens": 574329836.0, "step": 15057 }, { "epoch": 1.9155323750159012, "ewc_loss": 0.03095540590584278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.095540523645468e-05, "grad_norm": 18.19684410095215, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8730388879776001, "num_tokens": 574363189.0, "step": 15058 }, { "epoch": 1.9156595852944918, "ewc_loss": 0.031016705557703972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1016705179354176e-05, "grad_norm": 18.239120483398438, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.8788842558860779, "num_tokens": 574396374.0, "step": 15059 }, { "epoch": 1.9157867955730823, "ewc_loss": 0.03109780326485634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1097803002921864e-05, "grad_norm": 18.30329132080078, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8741730451583862, "num_tokens": 574435488.0, "step": 15060 }, { "epoch": 1.9159140058516728, "ewc_loss": 0.030944233760237694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.09442330035381e-05, "grad_norm": 18.237478256225586, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8703731298446655, "num_tokens": 574475261.0, "step": 15061 }, { "epoch": 1.9160412161302633, "ewc_loss": 0.030964452773332596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096445288974792e-05, "grad_norm": 18.199289321899414, "learning_rate": 1e-06, "loss": 0.3408, "mean_token_accuracy": 0.8894898295402527, "num_tokens": 574513531.0, "step": 15062 }, { "epoch": 1.9161684264088539, "ewc_loss": 0.03101327084004879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.101327092736028e-05, "grad_norm": 18.310684204101562, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8569482564926147, "num_tokens": 574551736.0, "step": 15063 }, { "epoch": 1.9162956366874444, "ewc_loss": 0.031044434756040573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.104443385382183e-05, "grad_norm": 18.244632720947266, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8646179437637329, "num_tokens": 574589978.0, "step": 15064 }, { "epoch": 1.916422846966035, "ewc_loss": 0.030935119837522507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.093511986662634e-05, "grad_norm": 18.25179672241211, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8499020338058472, "num_tokens": 574628845.0, "step": 15065 }, { "epoch": 1.9165500572446255, "ewc_loss": 0.030949831008911133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0949831852922216e-05, "grad_norm": 18.18254280090332, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.86765456199646, "num_tokens": 574660700.0, "step": 15066 }, { "epoch": 1.916677267523216, "ewc_loss": 0.03094995953142643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0949959182180464e-05, "grad_norm": 18.252582550048828, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8616712093353271, "num_tokens": 574692195.0, "step": 15067 }, { "epoch": 1.9168044778018065, "ewc_loss": 0.031010517850518227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.101051697740331e-05, "grad_norm": 18.20062255859375, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8611066341400146, "num_tokens": 574732462.0, "step": 15068 }, { "epoch": 1.916931688080397, "ewc_loss": 0.030948230996727943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0948231142247096e-05, "grad_norm": 18.26507568359375, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8772792816162109, "num_tokens": 574769353.0, "step": 15069 }, { "epoch": 1.9170588983589874, "ewc_loss": 0.031024256721138954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.10242576233577e-05, "grad_norm": 18.21826934814453, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8700911402702332, "num_tokens": 574804958.0, "step": 15070 }, { "epoch": 1.9171861086375779, "ewc_loss": 0.03097253292798996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.097253284067847e-05, "grad_norm": 18.261415481567383, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.870370626449585, "num_tokens": 574843966.0, "step": 15071 }, { "epoch": 1.9173133189161684, "ewc_loss": 0.031060729175806046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.106072836089879e-05, "grad_norm": 18.310461044311523, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.874566376209259, "num_tokens": 574882916.0, "step": 15072 }, { "epoch": 1.917440529194759, "ewc_loss": 0.03098369762301445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098369779763743e-05, "grad_norm": 18.196712493896484, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8794739842414856, "num_tokens": 574922154.0, "step": 15073 }, { "epoch": 1.9175677394733495, "ewc_loss": 0.031008832156658173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.100883259321563e-05, "grad_norm": 18.27813720703125, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8675204515457153, "num_tokens": 574956894.0, "step": 15074 }, { "epoch": 1.91769494975194, "ewc_loss": 0.03107181377708912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1071813282324e-05, "grad_norm": 18.24623680114746, "learning_rate": 1e-06, "loss": 0.3648, "mean_token_accuracy": 0.8850111365318298, "num_tokens": 574998343.0, "step": 15075 }, { "epoch": 1.9178221600305305, "ewc_loss": 0.03097541444003582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0975414119893685e-05, "grad_norm": 18.26117706298828, "learning_rate": 1e-06, "loss": 0.384, "mean_token_accuracy": 0.8795743584632874, "num_tokens": 575036689.0, "step": 15076 }, { "epoch": 1.9179493703091208, "ewc_loss": 0.031111711636185646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1111710995901376e-05, "grad_norm": 18.201276779174805, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8653517365455627, "num_tokens": 575075510.0, "step": 15077 }, { "epoch": 1.9180765805877114, "ewc_loss": 0.030945779755711555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0945779144531116e-05, "grad_norm": 18.237211227416992, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8677324056625366, "num_tokens": 575115797.0, "step": 15078 }, { "epoch": 1.9182037908663019, "ewc_loss": 0.031056206673383713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.105620635324158e-05, "grad_norm": 18.26046371459961, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8572694063186646, "num_tokens": 575155240.0, "step": 15079 }, { "epoch": 1.9183310011448924, "ewc_loss": 0.0309304092079401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0930408684071153e-05, "grad_norm": 18.192636489868164, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.857128918170929, "num_tokens": 575191048.0, "step": 15080 }, { "epoch": 1.918458211423483, "ewc_loss": 0.03098602592945099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098602610407397e-05, "grad_norm": 18.271530151367188, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.85959792137146, "num_tokens": 575227778.0, "step": 15081 }, { "epoch": 1.9185854217020735, "ewc_loss": 0.031060051172971725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1060051696840674e-05, "grad_norm": 18.273130416870117, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8719545602798462, "num_tokens": 575269774.0, "step": 15082 }, { "epoch": 1.918712631980664, "ewc_loss": 0.031009621918201447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.100962203461677e-05, "grad_norm": 18.255207061767578, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8626881241798401, "num_tokens": 575310710.0, "step": 15083 }, { "epoch": 1.9188398422592545, "ewc_loss": 0.031098607927560806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.109860699623823e-05, "grad_norm": 18.34707260131836, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8779407739639282, "num_tokens": 575344073.0, "step": 15084 }, { "epoch": 1.918967052537845, "ewc_loss": 0.03104233555495739, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.104233474005014e-05, "grad_norm": 18.25482940673828, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8612152338027954, "num_tokens": 575386737.0, "step": 15085 }, { "epoch": 1.9190942628164356, "ewc_loss": 0.030996255576610565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0996256100479513e-05, "grad_norm": 18.229446411132812, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8745304346084595, "num_tokens": 575424287.0, "step": 15086 }, { "epoch": 1.919221473095026, "ewc_loss": 0.031020183116197586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1020183087093756e-05, "grad_norm": 18.21205711364746, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8730326890945435, "num_tokens": 575463859.0, "step": 15087 }, { "epoch": 1.9193486833736166, "ewc_loss": 0.031057676300406456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.105767609667964e-05, "grad_norm": 18.30034637451172, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8651708960533142, "num_tokens": 575500838.0, "step": 15088 }, { "epoch": 1.9194758936522072, "ewc_loss": 0.031047137454152107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.10471368720755e-05, "grad_norm": 18.230241775512695, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8619475960731506, "num_tokens": 575535164.0, "step": 15089 }, { "epoch": 1.9196031039307977, "ewc_loss": 0.03097192943096161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.097192893619649e-05, "grad_norm": 18.253076553344727, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8762809634208679, "num_tokens": 575569493.0, "step": 15090 }, { "epoch": 1.9197303142093882, "ewc_loss": 0.03106711618602276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1067116651684046e-05, "grad_norm": 18.310791015625, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8555952310562134, "num_tokens": 575603396.0, "step": 15091 }, { "epoch": 1.9198575244879788, "ewc_loss": 0.030980808660387993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.09808092424646e-05, "grad_norm": 18.23647117614746, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.877281904220581, "num_tokens": 575638255.0, "step": 15092 }, { "epoch": 1.9199847347665693, "ewc_loss": 0.031021932139992714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.102193295489997e-05, "grad_norm": 18.25947380065918, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8606041669845581, "num_tokens": 575685281.0, "step": 15093 }, { "epoch": 1.9201119450451598, "ewc_loss": 0.031035825610160828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.103582639596425e-05, "grad_norm": 18.272397994995117, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8772329092025757, "num_tokens": 575725030.0, "step": 15094 }, { "epoch": 1.9202391553237501, "ewc_loss": 0.031047668308019638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1047668016981333e-05, "grad_norm": 18.19117546081543, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8648228645324707, "num_tokens": 575764237.0, "step": 15095 }, { "epoch": 1.9203663656023406, "ewc_loss": 0.031032107770442963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.10321083816234e-05, "grad_norm": 18.268524169921875, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8684004545211792, "num_tokens": 575796732.0, "step": 15096 }, { "epoch": 1.9204935758809312, "ewc_loss": 0.031111940741539, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.111194018856622e-05, "grad_norm": 18.282194137573242, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8744544386863708, "num_tokens": 575833133.0, "step": 15097 }, { "epoch": 1.9206207861595217, "ewc_loss": 0.03103986382484436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.103986455244012e-05, "grad_norm": 18.276588439941406, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8661609888076782, "num_tokens": 575870920.0, "step": 15098 }, { "epoch": 1.9207479964381122, "ewc_loss": 0.031070826575160027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.107082739006728e-05, "grad_norm": 18.271678924560547, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.873221755027771, "num_tokens": 575909132.0, "step": 15099 }, { "epoch": 1.9208752067167028, "ewc_loss": 0.031070267781615257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.107026714133099e-05, "grad_norm": 18.258899688720703, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8708274364471436, "num_tokens": 575947979.0, "step": 15100 }, { "epoch": 1.921002416995293, "ewc_loss": 0.031018050387501717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.10180512315128e-05, "grad_norm": 18.256446838378906, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8591669201850891, "num_tokens": 575988888.0, "step": 15101 }, { "epoch": 1.9211296272738836, "ewc_loss": 0.03107389807701111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1073897844180465e-05, "grad_norm": 18.23746109008789, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8719642758369446, "num_tokens": 576028614.0, "step": 15102 }, { "epoch": 1.9212568375524741, "ewc_loss": 0.031060652807354927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1060651963343844e-05, "grad_norm": 18.31476593017578, "learning_rate": 1e-06, "loss": 0.3227, "mean_token_accuracy": 0.8982137441635132, "num_tokens": 576066648.0, "step": 15103 }, { "epoch": 1.9213840478310646, "ewc_loss": 0.03105616196990013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.105616269749589e-05, "grad_norm": 18.208009719848633, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8684054017066956, "num_tokens": 576104045.0, "step": 15104 }, { "epoch": 1.9215112581096552, "ewc_loss": 0.031055526807904243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.105552605120465e-05, "grad_norm": 18.360984802246094, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8628370761871338, "num_tokens": 576145419.0, "step": 15105 }, { "epoch": 1.9216384683882457, "ewc_loss": 0.03108804300427437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.108804230578244e-05, "grad_norm": 18.223901748657227, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8664255142211914, "num_tokens": 576191410.0, "step": 15106 }, { "epoch": 1.9217656786668362, "ewc_loss": 0.030944855883717537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0944855097914115e-05, "grad_norm": 18.15949058532715, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8647454977035522, "num_tokens": 576226383.0, "step": 15107 }, { "epoch": 1.9218928889454268, "ewc_loss": 0.03109666518867016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1096664315555245e-05, "grad_norm": 18.30104637145996, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8626904487609863, "num_tokens": 576259958.0, "step": 15108 }, { "epoch": 1.9220200992240173, "ewc_loss": 0.031098349019885063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.109834869974293e-05, "grad_norm": 18.241384506225586, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8730202913284302, "num_tokens": 576299467.0, "step": 15109 }, { "epoch": 1.9221473095026078, "ewc_loss": 0.031044790521264076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1044790375744924e-05, "grad_norm": 18.24274444580078, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8640648126602173, "num_tokens": 576334137.0, "step": 15110 }, { "epoch": 1.9222745197811983, "ewc_loss": 0.031053077429533005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.105307769146748e-05, "grad_norm": 18.17215347290039, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8665666580200195, "num_tokens": 576379582.0, "step": 15111 }, { "epoch": 1.9224017300597889, "ewc_loss": 0.03106250800192356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.106250733253546e-05, "grad_norm": 18.25018310546875, "learning_rate": 1e-06, "loss": 0.3641, "mean_token_accuracy": 0.8842225074768066, "num_tokens": 576414590.0, "step": 15112 }, { "epoch": 1.9225289403383794, "ewc_loss": 0.031150834634900093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115083381999284e-05, "grad_norm": 18.27595329284668, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8622230291366577, "num_tokens": 576445023.0, "step": 15113 }, { "epoch": 1.92265615061697, "ewc_loss": 0.031072573736310005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.107257361989468e-05, "grad_norm": 18.20607566833496, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.867889404296875, "num_tokens": 576477861.0, "step": 15114 }, { "epoch": 1.9227833608955605, "ewc_loss": 0.03106195107102394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1061950721777976e-05, "grad_norm": 18.20655632019043, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.86647629737854, "num_tokens": 576513121.0, "step": 15115 }, { "epoch": 1.922910571174151, "ewc_loss": 0.031112849712371826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1112849683267996e-05, "grad_norm": 18.191648483276367, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.87653648853302, "num_tokens": 576551039.0, "step": 15116 }, { "epoch": 1.9230377814527415, "ewc_loss": 0.03113606758415699, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1136067264014855e-05, "grad_norm": 18.263818740844727, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8624141216278076, "num_tokens": 576590651.0, "step": 15117 }, { "epoch": 1.923164991731332, "ewc_loss": 0.03114285133779049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.114285209449008e-05, "grad_norm": 18.243867874145508, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.861418604850769, "num_tokens": 576629119.0, "step": 15118 }, { "epoch": 1.9232922020099223, "ewc_loss": 0.03115064464509487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115064464509487e-05, "grad_norm": 18.2618408203125, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.8796253800392151, "num_tokens": 576664240.0, "step": 15119 }, { "epoch": 1.9234194122885129, "ewc_loss": 0.03113619051873684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1136190955294296e-05, "grad_norm": 18.191692352294922, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8645268678665161, "num_tokens": 576701946.0, "step": 15120 }, { "epoch": 1.9235466225671034, "ewc_loss": 0.03118276037275791, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.118276072200388e-05, "grad_norm": 18.250003814697266, "learning_rate": 1e-06, "loss": 0.3743, "mean_token_accuracy": 0.8805989027023315, "num_tokens": 576733541.0, "step": 15121 }, { "epoch": 1.923673832845694, "ewc_loss": 0.031170472502708435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.117047162959352e-05, "grad_norm": 18.294384002685547, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8569960594177246, "num_tokens": 576770930.0, "step": 15122 }, { "epoch": 1.9238010431242845, "ewc_loss": 0.031130312010645866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1130311981542036e-05, "grad_norm": 18.14117431640625, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8699007034301758, "num_tokens": 576807455.0, "step": 15123 }, { "epoch": 1.923928253402875, "ewc_loss": 0.031159767881035805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115976869594306e-05, "grad_norm": 18.316120147705078, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8773233890533447, "num_tokens": 576849577.0, "step": 15124 }, { "epoch": 1.9240554636814655, "ewc_loss": 0.031171118840575218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1171119189821184e-05, "grad_norm": 18.22420310974121, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8642128705978394, "num_tokens": 576883506.0, "step": 15125 }, { "epoch": 1.9241826739600558, "ewc_loss": 0.031070733442902565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1070732802618295e-05, "grad_norm": 18.2685604095459, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8532397747039795, "num_tokens": 576923892.0, "step": 15126 }, { "epoch": 1.9243098842386464, "ewc_loss": 0.03120962344110012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.120962355751544e-05, "grad_norm": 18.225067138671875, "learning_rate": 1e-06, "loss": 0.352, "mean_token_accuracy": 0.8866746425628662, "num_tokens": 576970832.0, "step": 15127 }, { "epoch": 1.9244370945172369, "ewc_loss": 0.031116396188735962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.111639671260491e-05, "grad_norm": 18.330890655517578, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8530021905899048, "num_tokens": 577006990.0, "step": 15128 }, { "epoch": 1.9245643047958274, "ewc_loss": 0.031183935701847076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.118393578915857e-05, "grad_norm": 18.263158798217773, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8740664720535278, "num_tokens": 577044932.0, "step": 15129 }, { "epoch": 1.924691515074418, "ewc_loss": 0.031088363379240036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.108836244791746e-05, "grad_norm": 18.280845642089844, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8535243272781372, "num_tokens": 577076217.0, "step": 15130 }, { "epoch": 1.9248187253530085, "ewc_loss": 0.031131159514188766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.113115963060409e-05, "grad_norm": 18.207687377929688, "learning_rate": 1e-06, "loss": 0.355, "mean_token_accuracy": 0.8851362466812134, "num_tokens": 577112696.0, "step": 15131 }, { "epoch": 1.924945935631599, "ewc_loss": 0.031085485592484474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.108548480668105e-05, "grad_norm": 18.271020889282227, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8688651323318481, "num_tokens": 577156397.0, "step": 15132 }, { "epoch": 1.9250731459101895, "ewc_loss": 0.03120438940823078, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.120438850601204e-05, "grad_norm": 18.340635299682617, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8744338154792786, "num_tokens": 577194192.0, "step": 15133 }, { "epoch": 1.92520035618878, "ewc_loss": 0.03110126219689846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.110126272076741e-05, "grad_norm": 18.2199764251709, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8646564483642578, "num_tokens": 577232639.0, "step": 15134 }, { "epoch": 1.9253275664673706, "ewc_loss": 0.0310985054820776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.109850513283163e-05, "grad_norm": 18.297470092773438, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8782134056091309, "num_tokens": 577267349.0, "step": 15135 }, { "epoch": 1.925454776745961, "ewc_loss": 0.03111620806157589, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.111620753770694e-05, "grad_norm": 18.317014694213867, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8651139140129089, "num_tokens": 577308329.0, "step": 15136 }, { "epoch": 1.9255819870245516, "ewc_loss": 0.031048046424984932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.104804636677727e-05, "grad_norm": 18.230592727661133, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8711217641830444, "num_tokens": 577347825.0, "step": 15137 }, { "epoch": 1.9257091973031422, "ewc_loss": 0.031051669269800186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1051669793669134e-05, "grad_norm": 18.31549644470215, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8695192933082581, "num_tokens": 577379981.0, "step": 15138 }, { "epoch": 1.9258364075817327, "ewc_loss": 0.031090551987290382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.109055251115933e-05, "grad_norm": 18.248462677001953, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8732300996780396, "num_tokens": 577414134.0, "step": 15139 }, { "epoch": 1.9259636178603232, "ewc_loss": 0.030961858108639717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.096185901085846e-05, "grad_norm": 18.290897369384766, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8679570555686951, "num_tokens": 577451170.0, "step": 15140 }, { "epoch": 1.9260908281389137, "ewc_loss": 0.03108198568224907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.108198507106863e-05, "grad_norm": 18.32221031188965, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8566327095031738, "num_tokens": 577487447.0, "step": 15141 }, { "epoch": 1.9262180384175043, "ewc_loss": 0.031037142500281334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.103714334429242e-05, "grad_norm": 18.287405014038086, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8744264841079712, "num_tokens": 577525732.0, "step": 15142 }, { "epoch": 1.9263452486960948, "ewc_loss": 0.03110126219689846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.110126272076741e-05, "grad_norm": 18.279144287109375, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8674659132957458, "num_tokens": 577563303.0, "step": 15143 }, { "epoch": 1.926472458974685, "ewc_loss": 0.031046489253640175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1046489311847836e-05, "grad_norm": 18.214012145996094, "learning_rate": 1e-06, "loss": 0.3648, "mean_token_accuracy": 0.8836542963981628, "num_tokens": 577607592.0, "step": 15144 }, { "epoch": 1.9265996692532756, "ewc_loss": 0.031060408800840378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.106040821876377e-05, "grad_norm": 18.302003860473633, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8615635633468628, "num_tokens": 577651646.0, "step": 15145 }, { "epoch": 1.9267268795318662, "ewc_loss": 0.03108794428408146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1087944080354646e-05, "grad_norm": 18.292354583740234, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.8775949478149414, "num_tokens": 577689530.0, "step": 15146 }, { "epoch": 1.9268540898104567, "ewc_loss": 0.031048297882080078, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.104829738731496e-05, "grad_norm": 18.20237159729004, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8644109964370728, "num_tokens": 577732524.0, "step": 15147 }, { "epoch": 1.9269813000890472, "ewc_loss": 0.03103264980018139, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.103265044046566e-05, "grad_norm": 18.21736717224121, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8738930225372314, "num_tokens": 577767164.0, "step": 15148 }, { "epoch": 1.9271085103676378, "ewc_loss": 0.03110472299158573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.110472243861295e-05, "grad_norm": 18.263900756835938, "learning_rate": 1e-06, "loss": 0.3716, "mean_token_accuracy": 0.8776306509971619, "num_tokens": 577802347.0, "step": 15149 }, { "epoch": 1.927235720646228, "ewc_loss": 0.031006857752799988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.100685717072338e-05, "grad_norm": 18.179859161376953, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8617023825645447, "num_tokens": 577842899.0, "step": 15150 }, { "epoch": 1.9273629309248186, "ewc_loss": 0.031020328402519226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.102032860624604e-05, "grad_norm": 18.225177764892578, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8696798086166382, "num_tokens": 577885137.0, "step": 15151 }, { "epoch": 1.9274901412034091, "ewc_loss": 0.031067458912730217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.106745862169191e-05, "grad_norm": 18.229154586791992, "learning_rate": 1e-06, "loss": 0.3575, "mean_token_accuracy": 0.8831936717033386, "num_tokens": 577925727.0, "step": 15152 }, { "epoch": 1.9276173514819996, "ewc_loss": 0.031006377190351486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.100637695752084e-05, "grad_norm": 18.16606330871582, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8732658624649048, "num_tokens": 577965399.0, "step": 15153 }, { "epoch": 1.9277445617605902, "ewc_loss": 0.03103940561413765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.103940616711043e-05, "grad_norm": 18.251972198486328, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8706635236740112, "num_tokens": 578007608.0, "step": 15154 }, { "epoch": 1.9278717720391807, "ewc_loss": 0.03112068958580494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.112068952759728e-05, "grad_norm": 18.213577270507812, "learning_rate": 1e-06, "loss": 0.3707, "mean_token_accuracy": 0.8812551498413086, "num_tokens": 578043425.0, "step": 15155 }, { "epoch": 1.9279989823177712, "ewc_loss": 0.03103252314031124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.103252311120741e-05, "grad_norm": 18.291851043701172, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8677845001220703, "num_tokens": 578075838.0, "step": 15156 }, { "epoch": 1.9281261925963618, "ewc_loss": 0.03108866512775421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.108866440015845e-05, "grad_norm": 18.225202560424805, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8638980388641357, "num_tokens": 578110113.0, "step": 15157 }, { "epoch": 1.9282534028749523, "ewc_loss": 0.03102877549827099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1028775993036106e-05, "grad_norm": 18.237504959106445, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8705853223800659, "num_tokens": 578145743.0, "step": 15158 }, { "epoch": 1.9283806131535428, "ewc_loss": 0.0310759749263525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1075975130079314e-05, "grad_norm": 18.235637664794922, "learning_rate": 1e-06, "loss": 0.3735, "mean_token_accuracy": 0.8775662183761597, "num_tokens": 578181588.0, "step": 15159 }, { "epoch": 1.9285078234321333, "ewc_loss": 0.031066734343767166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.10667346639093e-05, "grad_norm": 18.172780990600586, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8677352666854858, "num_tokens": 578222048.0, "step": 15160 }, { "epoch": 1.9286350337107239, "ewc_loss": 0.03108810819685459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1088107789400965e-05, "grad_norm": 18.211414337158203, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.872397780418396, "num_tokens": 578262664.0, "step": 15161 }, { "epoch": 1.9287622439893144, "ewc_loss": 0.03113831952214241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1138319172896445e-05, "grad_norm": 18.26025390625, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8588207364082336, "num_tokens": 578303431.0, "step": 15162 }, { "epoch": 1.928889454267905, "ewc_loss": 0.031100211665034294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.110021134489216e-05, "grad_norm": 18.188928604125977, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8594288229942322, "num_tokens": 578338978.0, "step": 15163 }, { "epoch": 1.9290166645464955, "ewc_loss": 0.03113533928990364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.113533966825344e-05, "grad_norm": 18.283573150634766, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.866240382194519, "num_tokens": 578376340.0, "step": 15164 }, { "epoch": 1.929143874825086, "ewc_loss": 0.031111815944314003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.111181649728678e-05, "grad_norm": 18.19343376159668, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8721216917037964, "num_tokens": 578414714.0, "step": 15165 }, { "epoch": 1.9292710851036765, "ewc_loss": 0.03107401356101036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.107401425950229e-05, "grad_norm": 18.248199462890625, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8566411733627319, "num_tokens": 578453668.0, "step": 15166 }, { "epoch": 1.929398295382267, "ewc_loss": 0.03118947334587574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.118947279290296e-05, "grad_norm": 18.18602180480957, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8621636033058167, "num_tokens": 578494148.0, "step": 15167 }, { "epoch": 1.9295255056608573, "ewc_loss": 0.03107018768787384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.107018710579723e-05, "grad_norm": 18.302412033081055, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8500083684921265, "num_tokens": 578530743.0, "step": 15168 }, { "epoch": 1.9296527159394479, "ewc_loss": 0.031214680522680283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.12146803480573e-05, "grad_norm": 18.288148880004883, "learning_rate": 1e-06, "loss": 0.341, "mean_token_accuracy": 0.8911813497543335, "num_tokens": 578567613.0, "step": 15169 }, { "epoch": 1.9297799262180384, "ewc_loss": 0.03109004721045494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1090046832105145e-05, "grad_norm": 18.2613582611084, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8687528371810913, "num_tokens": 578602981.0, "step": 15170 }, { "epoch": 1.929907136496629, "ewc_loss": 0.03111494518816471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.111494515906088e-05, "grad_norm": 18.252607345581055, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.863053560256958, "num_tokens": 578641027.0, "step": 15171 }, { "epoch": 1.9300343467752195, "ewc_loss": 0.031105300411581993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.110530087724328e-05, "grad_norm": 18.1923828125, "learning_rate": 1e-06, "loss": 0.3774, "mean_token_accuracy": 0.8817310333251953, "num_tokens": 578683006.0, "step": 15172 }, { "epoch": 1.93016155705381, "ewc_loss": 0.03113982453942299, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.113982529612258e-05, "grad_norm": 18.276851654052734, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8747289776802063, "num_tokens": 578716722.0, "step": 15173 }, { "epoch": 1.9302887673324005, "ewc_loss": 0.031162705272436142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.116270454484038e-05, "grad_norm": 18.216285705566406, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8501734733581543, "num_tokens": 578750740.0, "step": 15174 }, { "epoch": 1.9304159776109908, "ewc_loss": 0.03109556995332241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.109556928393431e-05, "grad_norm": 18.182462692260742, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8721414804458618, "num_tokens": 578791358.0, "step": 15175 }, { "epoch": 1.9305431878895813, "ewc_loss": 0.03115934692323208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1159346690401435e-05, "grad_norm": 18.315898895263672, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8535066843032837, "num_tokens": 578834514.0, "step": 15176 }, { "epoch": 1.9306703981681719, "ewc_loss": 0.03118300437927246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.118300446658395e-05, "grad_norm": 18.246801376342773, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8617920875549316, "num_tokens": 578875869.0, "step": 15177 }, { "epoch": 1.9307976084467624, "ewc_loss": 0.0311054065823555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1105406378628686e-05, "grad_norm": 18.2691593170166, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8735318183898926, "num_tokens": 578921079.0, "step": 15178 }, { "epoch": 1.930924818725353, "ewc_loss": 0.031187381595373154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1187380955088884e-05, "grad_norm": 18.228343963623047, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8680121898651123, "num_tokens": 578959837.0, "step": 15179 }, { "epoch": 1.9310520290039435, "ewc_loss": 0.031062474474310875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.10624745907262e-05, "grad_norm": 18.21291160583496, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.841754138469696, "num_tokens": 579001629.0, "step": 15180 }, { "epoch": 1.931179239282534, "ewc_loss": 0.031160732731223106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.116073276032694e-05, "grad_norm": 18.275636672973633, "learning_rate": 1e-06, "loss": 0.3409, "mean_token_accuracy": 0.8923416137695312, "num_tokens": 579036762.0, "step": 15181 }, { "epoch": 1.9313064495611245, "ewc_loss": 0.031131751835346222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1131752621149644e-05, "grad_norm": 18.336071014404297, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.847589910030365, "num_tokens": 579078455.0, "step": 15182 }, { "epoch": 1.931433659839715, "ewc_loss": 0.031172778457403183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.117277810815722e-05, "grad_norm": 18.2955322265625, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8732737302780151, "num_tokens": 579116942.0, "step": 15183 }, { "epoch": 1.9315608701183056, "ewc_loss": 0.031118258833885193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.111825935775414e-05, "grad_norm": 18.284936904907227, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.8813636302947998, "num_tokens": 579155180.0, "step": 15184 }, { "epoch": 1.931688080396896, "ewc_loss": 0.031112689524888992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1112689612200484e-05, "grad_norm": 18.254619598388672, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8621587753295898, "num_tokens": 579194336.0, "step": 15185 }, { "epoch": 1.9318152906754866, "ewc_loss": 0.031053561717271805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.105356154264882e-05, "grad_norm": 18.237157821655273, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8661489486694336, "num_tokens": 579226609.0, "step": 15186 }, { "epoch": 1.9319425009540772, "ewc_loss": 0.03111928515136242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.111928526777774e-05, "grad_norm": 18.270084381103516, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8741182088851929, "num_tokens": 579259288.0, "step": 15187 }, { "epoch": 1.9320697112326677, "ewc_loss": 0.031136037781834602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.11360381601844e-05, "grad_norm": 18.219112396240234, "learning_rate": 1e-06, "loss": 0.3846, "mean_token_accuracy": 0.8779808878898621, "num_tokens": 579294797.0, "step": 15188 }, { "epoch": 1.9321969215112582, "ewc_loss": 0.03117687813937664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.117687811027281e-05, "grad_norm": 18.30304527282715, "learning_rate": 1e-06, "loss": 0.3718, "mean_token_accuracy": 0.8801767826080322, "num_tokens": 579334352.0, "step": 15189 }, { "epoch": 1.9323241317898487, "ewc_loss": 0.03106805868446827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.106805888819508e-05, "grad_norm": 18.151996612548828, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.85888671875, "num_tokens": 579379618.0, "step": 15190 }, { "epoch": 1.9324513420684393, "ewc_loss": 0.031102696433663368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.11026960844174e-05, "grad_norm": 18.265321731567383, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8695141077041626, "num_tokens": 579420230.0, "step": 15191 }, { "epoch": 1.9325785523470298, "ewc_loss": 0.031133048236370087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.113304774160497e-05, "grad_norm": 18.262012481689453, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8699429631233215, "num_tokens": 579458236.0, "step": 15192 }, { "epoch": 1.93270576262562, "ewc_loss": 0.031157290562987328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115729123237543e-05, "grad_norm": 18.26141357421875, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8686391115188599, "num_tokens": 579495561.0, "step": 15193 }, { "epoch": 1.9328329729042106, "ewc_loss": 0.03113340027630329, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.113340062554926e-05, "grad_norm": 18.231142044067383, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8688974976539612, "num_tokens": 579530225.0, "step": 15194 }, { "epoch": 1.9329601831828012, "ewc_loss": 0.031178340315818787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.117834057775326e-05, "grad_norm": 18.32134437561035, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8617244958877563, "num_tokens": 579570143.0, "step": 15195 }, { "epoch": 1.9330873934613917, "ewc_loss": 0.031150706112384796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115070649073459e-05, "grad_norm": 18.214569091796875, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.8826958537101746, "num_tokens": 579607105.0, "step": 15196 }, { "epoch": 1.9332146037399822, "ewc_loss": 0.031098654493689537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1098654289962724e-05, "grad_norm": 18.30401611328125, "learning_rate": 1e-06, "loss": 0.3555, "mean_token_accuracy": 0.8843250274658203, "num_tokens": 579640018.0, "step": 15197 }, { "epoch": 1.9333418140185727, "ewc_loss": 0.0312198419123888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.121984263998456e-05, "grad_norm": 18.26557159423828, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8644980788230896, "num_tokens": 579681250.0, "step": 15198 }, { "epoch": 1.933469024297163, "ewc_loss": 0.031110307201743126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.111030673608184e-05, "grad_norm": 18.227685928344727, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.85764479637146, "num_tokens": 579718535.0, "step": 15199 }, { "epoch": 1.9335962345757536, "ewc_loss": 0.031183507293462753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.118350650765933e-05, "grad_norm": 18.278823852539062, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8584864735603333, "num_tokens": 579756959.0, "step": 15200 }, { "epoch": 1.933723444854344, "ewc_loss": 0.03113875910639763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.11387593683321e-05, "grad_norm": 18.221446990966797, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8597604036331177, "num_tokens": 579798266.0, "step": 15201 }, { "epoch": 1.9338506551329346, "ewc_loss": 0.03116002306342125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1160023354459554e-05, "grad_norm": 18.277952194213867, "learning_rate": 1e-06, "loss": 0.3553, "mean_token_accuracy": 0.8891268968582153, "num_tokens": 579831869.0, "step": 15202 }, { "epoch": 1.9339778654115252, "ewc_loss": 0.031232811510562897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1232812034431845e-05, "grad_norm": 18.292205810546875, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.870884358882904, "num_tokens": 579874168.0, "step": 15203 }, { "epoch": 1.9341050756901157, "ewc_loss": 0.031149210408329964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.114921128144488e-05, "grad_norm": 18.232070922851562, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8760396838188171, "num_tokens": 579916313.0, "step": 15204 }, { "epoch": 1.9342322859687062, "ewc_loss": 0.031182030215859413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.118202948826365e-05, "grad_norm": 18.18574333190918, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.88080233335495, "num_tokens": 579959022.0, "step": 15205 }, { "epoch": 1.9343594962472968, "ewc_loss": 0.031158149242401123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.11581497953739e-05, "grad_norm": 18.230520248413086, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8552840948104858, "num_tokens": 580003675.0, "step": 15206 }, { "epoch": 1.9344867065258873, "ewc_loss": 0.031192081049084663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1192081223707646e-05, "grad_norm": 18.226987838745117, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8792729377746582, "num_tokens": 580044384.0, "step": 15207 }, { "epoch": 1.9346139168044778, "ewc_loss": 0.031134536489844322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.113453567493707e-05, "grad_norm": 18.20866584777832, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8591992855072021, "num_tokens": 580078955.0, "step": 15208 }, { "epoch": 1.9347411270830683, "ewc_loss": 0.03114102967083454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.114102946710773e-05, "grad_norm": 18.19754981994629, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8664096593856812, "num_tokens": 580116793.0, "step": 15209 }, { "epoch": 1.9348683373616589, "ewc_loss": 0.03117353841662407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.11735384457279e-05, "grad_norm": 18.23870086669922, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8679755330085754, "num_tokens": 580153622.0, "step": 15210 }, { "epoch": 1.9349955476402494, "ewc_loss": 0.0311641413718462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.116414154646918e-05, "grad_norm": 18.18025779724121, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8673200011253357, "num_tokens": 580195287.0, "step": 15211 }, { "epoch": 1.93512275791884, "ewc_loss": 0.031221454963088036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1221454264596105e-05, "grad_norm": 18.304418563842773, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8584589958190918, "num_tokens": 580229601.0, "step": 15212 }, { "epoch": 1.9352499681974304, "ewc_loss": 0.031209103763103485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.120910332654603e-05, "grad_norm": 18.220083236694336, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8722054958343506, "num_tokens": 580268079.0, "step": 15213 }, { "epoch": 1.935377178476021, "ewc_loss": 0.031155560165643692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115555955446325e-05, "grad_norm": 18.263134002685547, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8517458438873291, "num_tokens": 580305379.0, "step": 15214 }, { "epoch": 1.9355043887546115, "ewc_loss": 0.031206956133246422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1206956919049844e-05, "grad_norm": 18.23135757446289, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8692560195922852, "num_tokens": 580338438.0, "step": 15215 }, { "epoch": 1.935631599033202, "ewc_loss": 0.031143171712756157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1143172236625105e-05, "grad_norm": 18.238182067871094, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8693915009498596, "num_tokens": 580370017.0, "step": 15216 }, { "epoch": 1.9357588093117923, "ewc_loss": 0.031200114637613297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1200113880913705e-05, "grad_norm": 18.279260635375977, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8802269697189331, "num_tokens": 580409572.0, "step": 15217 }, { "epoch": 1.9358860195903829, "ewc_loss": 0.031162293627858162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.116229345323518e-05, "grad_norm": 18.152069091796875, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8744754195213318, "num_tokens": 580448481.0, "step": 15218 }, { "epoch": 1.9360132298689734, "ewc_loss": 0.031232016161084175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.123201531707309e-05, "grad_norm": 18.252092361450195, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8806231617927551, "num_tokens": 580481919.0, "step": 15219 }, { "epoch": 1.936140440147564, "ewc_loss": 0.03119342401623726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.119342363788746e-05, "grad_norm": 18.240751266479492, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8643372058868408, "num_tokens": 580520758.0, "step": 15220 }, { "epoch": 1.9362676504261545, "ewc_loss": 0.031181881204247475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.118188033113256e-05, "grad_norm": 18.258569717407227, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.860673189163208, "num_tokens": 580559839.0, "step": 15221 }, { "epoch": 1.936394860704745, "ewc_loss": 0.031253207474946976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.12532065436244e-05, "grad_norm": 18.30353355407715, "learning_rate": 1e-06, "loss": 0.3326, "mean_token_accuracy": 0.8951414227485657, "num_tokens": 580591767.0, "step": 15222 }, { "epoch": 1.9365220709833355, "ewc_loss": 0.031177973374724388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1177973141893744e-05, "grad_norm": 18.25345230102539, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8781726956367493, "num_tokens": 580633180.0, "step": 15223 }, { "epoch": 1.9366492812619258, "ewc_loss": 0.031175924465060234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.117592495982535e-05, "grad_norm": 18.34868812561035, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.871989369392395, "num_tokens": 580677045.0, "step": 15224 }, { "epoch": 1.9367764915405163, "ewc_loss": 0.031146137043833733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1146137189352885e-05, "grad_norm": 18.284425735473633, "learning_rate": 1e-06, "loss": 0.3676, "mean_token_accuracy": 0.8826143741607666, "num_tokens": 580717423.0, "step": 15225 }, { "epoch": 1.9369037018191069, "ewc_loss": 0.03105541691184044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.105541691184044e-05, "grad_norm": 18.23479461669922, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8750728368759155, "num_tokens": 580756677.0, "step": 15226 }, { "epoch": 1.9370309120976974, "ewc_loss": 0.031124621629714966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1124622182687744e-05, "grad_norm": 18.303634643554688, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8745181560516357, "num_tokens": 580796993.0, "step": 15227 }, { "epoch": 1.937158122376288, "ewc_loss": 0.031171180307865143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1171181035460904e-05, "grad_norm": 18.25385093688965, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8733327388763428, "num_tokens": 580834616.0, "step": 15228 }, { "epoch": 1.9372853326548785, "ewc_loss": 0.03109687566757202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1096875318326056e-05, "grad_norm": 18.323028564453125, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8720719814300537, "num_tokens": 580873718.0, "step": 15229 }, { "epoch": 1.937412542933469, "ewc_loss": 0.03126438334584236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.126438241451979e-05, "grad_norm": 18.367286682128906, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8650722503662109, "num_tokens": 580911678.0, "step": 15230 }, { "epoch": 1.9375397532120595, "ewc_loss": 0.031086387112736702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.108638702542521e-05, "grad_norm": 18.30703353881836, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8764587044715881, "num_tokens": 580953282.0, "step": 15231 }, { "epoch": 1.93766696349065, "ewc_loss": 0.03104241192340851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1042411137605086e-05, "grad_norm": 18.235042572021484, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.875162661075592, "num_tokens": 580997954.0, "step": 15232 }, { "epoch": 1.9377941737692406, "ewc_loss": 0.03105776384472847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.105776340817101e-05, "grad_norm": 18.34173011779785, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8715239763259888, "num_tokens": 581035623.0, "step": 15233 }, { "epoch": 1.937921384047831, "ewc_loss": 0.031049003824591637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1049003155203536e-05, "grad_norm": 18.26993179321289, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8787626028060913, "num_tokens": 581068727.0, "step": 15234 }, { "epoch": 1.9380485943264216, "ewc_loss": 0.031058188527822495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.105818905169144e-05, "grad_norm": 18.375076293945312, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8678333759307861, "num_tokens": 581111180.0, "step": 15235 }, { "epoch": 1.9381758046050122, "ewc_loss": 0.031044965609908104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1044964998727664e-05, "grad_norm": 18.22776985168457, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8493386507034302, "num_tokens": 581152373.0, "step": 15236 }, { "epoch": 1.9383030148836027, "ewc_loss": 0.03096972405910492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0969724321039394e-05, "grad_norm": 18.336687088012695, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8679378032684326, "num_tokens": 581195854.0, "step": 15237 }, { "epoch": 1.9384302251621932, "ewc_loss": 0.03108120709657669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.108120654360391e-05, "grad_norm": 18.27065086364746, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8427785038948059, "num_tokens": 581236414.0, "step": 15238 }, { "epoch": 1.9385574354407837, "ewc_loss": 0.030981911346316338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098191155004315e-05, "grad_norm": 18.28306770324707, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8537187576293945, "num_tokens": 581275349.0, "step": 15239 }, { "epoch": 1.9386846457193743, "ewc_loss": 0.031050607562065125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1050607503857464e-05, "grad_norm": 18.301794052124023, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8773846626281738, "num_tokens": 581313231.0, "step": 15240 }, { "epoch": 1.9388118559979648, "ewc_loss": 0.031065000221133232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.106499934801832e-05, "grad_norm": 18.242412567138672, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8713301420211792, "num_tokens": 581355295.0, "step": 15241 }, { "epoch": 1.938939066276555, "ewc_loss": 0.030987808480858803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0987808713689446e-05, "grad_norm": 18.23777961730957, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8735841512680054, "num_tokens": 581392633.0, "step": 15242 }, { "epoch": 1.9390662765551456, "ewc_loss": 0.031101731583476067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1101732020033523e-05, "grad_norm": 18.278385162353516, "learning_rate": 1e-06, "loss": 0.3716, "mean_token_accuracy": 0.8846727013587952, "num_tokens": 581425552.0, "step": 15243 }, { "epoch": 1.9391934868337362, "ewc_loss": 0.031103864312171936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.110386387561448e-05, "grad_norm": 18.240009307861328, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8585171699523926, "num_tokens": 581467707.0, "step": 15244 }, { "epoch": 1.9393206971123267, "ewc_loss": 0.031054532155394554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1054532882990316e-05, "grad_norm": 18.261493682861328, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.875666081905365, "num_tokens": 581504888.0, "step": 15245 }, { "epoch": 1.9394479073909172, "ewc_loss": 0.031095365062355995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.109536555712111e-05, "grad_norm": 18.237138748168945, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8686809539794922, "num_tokens": 581537800.0, "step": 15246 }, { "epoch": 1.9395751176695077, "ewc_loss": 0.031081268563866615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.108126838924363e-05, "grad_norm": 18.293331146240234, "learning_rate": 1e-06, "loss": 0.3445, "mean_token_accuracy": 0.8908604383468628, "num_tokens": 581570594.0, "step": 15247 }, { "epoch": 1.939702327948098, "ewc_loss": 0.03111172467470169, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1111725547816604e-05, "grad_norm": 18.32231330871582, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8634600043296814, "num_tokens": 581610764.0, "step": 15248 }, { "epoch": 1.9398295382266886, "ewc_loss": 0.031131282448768616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.113128332188353e-05, "grad_norm": 18.293018341064453, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8694906234741211, "num_tokens": 581652341.0, "step": 15249 }, { "epoch": 1.939956748505279, "ewc_loss": 0.03104020282626152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.104020288446918e-05, "grad_norm": 18.312480926513672, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8624678254127502, "num_tokens": 581689978.0, "step": 15250 }, { "epoch": 1.9400839587838696, "ewc_loss": 0.03110630437731743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.110630495939404e-05, "grad_norm": 18.29873275756836, "learning_rate": 1e-06, "loss": 0.3774, "mean_token_accuracy": 0.8785766363143921, "num_tokens": 581728811.0, "step": 15251 }, { "epoch": 1.9402111690624602, "ewc_loss": 0.031064432114362717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.106443182332441e-05, "grad_norm": 18.248472213745117, "learning_rate": 1e-06, "loss": 0.3712, "mean_token_accuracy": 0.8786523342132568, "num_tokens": 581768016.0, "step": 15252 }, { "epoch": 1.9403383793410507, "ewc_loss": 0.031083429232239723, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1083429348655045e-05, "grad_norm": 18.22052574157715, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8732022047042847, "num_tokens": 581805405.0, "step": 15253 }, { "epoch": 1.9404655896196412, "ewc_loss": 0.0310245081782341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.102450864389539e-05, "grad_norm": 18.254390716552734, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8512814044952393, "num_tokens": 581845302.0, "step": 15254 }, { "epoch": 1.9405927998982317, "ewc_loss": 0.031164079904556274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.116407970082946e-05, "grad_norm": 18.273767471313477, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.8788156509399414, "num_tokens": 581875391.0, "step": 15255 }, { "epoch": 1.9407200101768223, "ewc_loss": 0.031155508011579514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115550862275995e-05, "grad_norm": 18.319477081298828, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.869721531867981, "num_tokens": 581916580.0, "step": 15256 }, { "epoch": 1.9408472204554128, "ewc_loss": 0.031158441677689552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115844083367847e-05, "grad_norm": 18.375537872314453, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8711007833480835, "num_tokens": 581958539.0, "step": 15257 }, { "epoch": 1.9409744307340033, "ewc_loss": 0.0311002004891634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.110020043095574e-05, "grad_norm": 18.294532775878906, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8695071339607239, "num_tokens": 581996981.0, "step": 15258 }, { "epoch": 1.9411016410125939, "ewc_loss": 0.03107447549700737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.107447628281079e-05, "grad_norm": 18.28056526184082, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8800030946731567, "num_tokens": 582036068.0, "step": 15259 }, { "epoch": 1.9412288512911844, "ewc_loss": 0.03107433393597603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1074334401637316e-05, "grad_norm": 18.282413482666016, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8728711605072021, "num_tokens": 582073322.0, "step": 15260 }, { "epoch": 1.941356061569775, "ewc_loss": 0.0310842152684927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.108421515207738e-05, "grad_norm": 18.486310958862305, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8733277916908264, "num_tokens": 582110736.0, "step": 15261 }, { "epoch": 1.9414832718483654, "ewc_loss": 0.031125349923968315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.112534977844916e-05, "grad_norm": 18.32611656188965, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8539078235626221, "num_tokens": 582153431.0, "step": 15262 }, { "epoch": 1.941610482126956, "ewc_loss": 0.031003225594758987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.10032264678739e-05, "grad_norm": 18.319005966186523, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8520454168319702, "num_tokens": 582191793.0, "step": 15263 }, { "epoch": 1.9417376924055465, "ewc_loss": 0.03102947026491165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.102947084698826e-05, "grad_norm": 18.234079360961914, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.861828088760376, "num_tokens": 582227923.0, "step": 15264 }, { "epoch": 1.941864902684137, "ewc_loss": 0.031073471531271935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1073472200660035e-05, "grad_norm": 18.37661361694336, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8560128211975098, "num_tokens": 582268029.0, "step": 15265 }, { "epoch": 1.9419921129627273, "ewc_loss": 0.031107764691114426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.110776378889568e-05, "grad_norm": 18.280235290527344, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8645421266555786, "num_tokens": 582302173.0, "step": 15266 }, { "epoch": 1.9421193232413179, "ewc_loss": 0.030956314876675606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.0956314731156453e-05, "grad_norm": 18.356346130371094, "learning_rate": 1e-06, "loss": 0.3604, "mean_token_accuracy": 0.8847391605377197, "num_tokens": 582341642.0, "step": 15267 }, { "epoch": 1.9422465335199084, "ewc_loss": 0.031077634543180466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.107763404841535e-05, "grad_norm": 18.245424270629883, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8735730051994324, "num_tokens": 582378960.0, "step": 15268 }, { "epoch": 1.942373743798499, "ewc_loss": 0.03098675049841404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.098675006185658e-05, "grad_norm": 18.204559326171875, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8771269917488098, "num_tokens": 582422149.0, "step": 15269 }, { "epoch": 1.9425009540770894, "ewc_loss": 0.031152671203017235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115267099929042e-05, "grad_norm": 18.347455978393555, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8537431955337524, "num_tokens": 582462657.0, "step": 15270 }, { "epoch": 1.94262816435568, "ewc_loss": 0.031091826036572456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.109182580374181e-05, "grad_norm": 18.170385360717773, "learning_rate": 1e-06, "loss": 0.3624, "mean_token_accuracy": 0.8826358318328857, "num_tokens": 582499264.0, "step": 15271 }, { "epoch": 1.9427553746342705, "ewc_loss": 0.031163210049271584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1163210223894566e-05, "grad_norm": 18.295352935791016, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.873666524887085, "num_tokens": 582534744.0, "step": 15272 }, { "epoch": 1.9428825849128608, "ewc_loss": 0.031128136441111565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1128136470215395e-05, "grad_norm": 18.309846878051758, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8668125867843628, "num_tokens": 582577010.0, "step": 15273 }, { "epoch": 1.9430097951914513, "ewc_loss": 0.031065309420228004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.106530857621692e-05, "grad_norm": 18.336273193359375, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8665566444396973, "num_tokens": 582617103.0, "step": 15274 }, { "epoch": 1.9431370054700419, "ewc_loss": 0.03109079599380493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1090796255739406e-05, "grad_norm": 18.303367614746094, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.864551305770874, "num_tokens": 582661754.0, "step": 15275 }, { "epoch": 1.9432642157486324, "ewc_loss": 0.031040919944643974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.104091956629418e-05, "grad_norm": 18.294525146484375, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8719266057014465, "num_tokens": 582701701.0, "step": 15276 }, { "epoch": 1.943391426027223, "ewc_loss": 0.03111959435045719, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1119594495976344e-05, "grad_norm": 18.244674682617188, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8614800572395325, "num_tokens": 582741977.0, "step": 15277 }, { "epoch": 1.9435186363058135, "ewc_loss": 0.031039301306009293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.103930066572502e-05, "grad_norm": 18.249950408935547, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8792284727096558, "num_tokens": 582783420.0, "step": 15278 }, { "epoch": 1.943645846584404, "ewc_loss": 0.031062951311469078, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1062951165949926e-05, "grad_norm": 18.263111114501953, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8758355379104614, "num_tokens": 582819464.0, "step": 15279 }, { "epoch": 1.9437730568629945, "ewc_loss": 0.03109229914844036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1092298740986735e-05, "grad_norm": 18.239709854125977, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8762944936752319, "num_tokens": 582860757.0, "step": 15280 }, { "epoch": 1.943900267141585, "ewc_loss": 0.031102564185857773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.110256511718035e-05, "grad_norm": 18.217700958251953, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8655864596366882, "num_tokens": 582910300.0, "step": 15281 }, { "epoch": 1.9440274774201756, "ewc_loss": 0.031069736927747726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.106973599642515e-05, "grad_norm": 18.232101440429688, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8623386025428772, "num_tokens": 582956333.0, "step": 15282 }, { "epoch": 1.944154687698766, "ewc_loss": 0.031101616099476814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1101615604711697e-05, "grad_norm": 18.225515365600586, "learning_rate": 1e-06, "loss": 0.3781, "mean_token_accuracy": 0.8784847855567932, "num_tokens": 582994068.0, "step": 15283 }, { "epoch": 1.9442818979773566, "ewc_loss": 0.031102051958441734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.110205216216855e-05, "grad_norm": 18.185937881469727, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8683462738990784, "num_tokens": 583027829.0, "step": 15284 }, { "epoch": 1.9444091082559471, "ewc_loss": 0.03113512136042118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.113512138952501e-05, "grad_norm": 18.223913192749023, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8681122064590454, "num_tokens": 583068881.0, "step": 15285 }, { "epoch": 1.9445363185345377, "ewc_loss": 0.031196774914860725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1196774216368794e-05, "grad_norm": 18.283794403076172, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8739897012710571, "num_tokens": 583100097.0, "step": 15286 }, { "epoch": 1.9446635288131282, "ewc_loss": 0.031129533424973488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.112953345407732e-05, "grad_norm": 18.245967864990234, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8695186972618103, "num_tokens": 583135340.0, "step": 15287 }, { "epoch": 1.9447907390917187, "ewc_loss": 0.031138606369495392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1138606573222205e-05, "grad_norm": 18.277761459350586, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.869625985622406, "num_tokens": 583172032.0, "step": 15288 }, { "epoch": 1.9449179493703093, "ewc_loss": 0.031133541837334633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.113354250672273e-05, "grad_norm": 18.193601608276367, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.874221682548523, "num_tokens": 583205244.0, "step": 15289 }, { "epoch": 1.9450451596488998, "ewc_loss": 0.031133022159337997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.113302227575332e-05, "grad_norm": 18.3252010345459, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8664799332618713, "num_tokens": 583247537.0, "step": 15290 }, { "epoch": 1.94517236992749, "ewc_loss": 0.031152259558439255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115225990768522e-05, "grad_norm": 18.243396759033203, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8629543781280518, "num_tokens": 583285349.0, "step": 15291 }, { "epoch": 1.9452995802060806, "ewc_loss": 0.03108270838856697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.108270902885124e-05, "grad_norm": 18.259675979614258, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8720280528068542, "num_tokens": 583323810.0, "step": 15292 }, { "epoch": 1.9454267904846712, "ewc_loss": 0.03122881054878235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1228810257744044e-05, "grad_norm": 18.32459831237793, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8820380568504333, "num_tokens": 583354244.0, "step": 15293 }, { "epoch": 1.9455540007632617, "ewc_loss": 0.031096868216991425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.109686804236844e-05, "grad_norm": 18.243436813354492, "learning_rate": 1e-06, "loss": 0.3627, "mean_token_accuracy": 0.8836426734924316, "num_tokens": 583391764.0, "step": 15294 }, { "epoch": 1.9456812110418522, "ewc_loss": 0.031095284968614578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.109528552158736e-05, "grad_norm": 18.32924461364746, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8573819398880005, "num_tokens": 583426305.0, "step": 15295 }, { "epoch": 1.9458084213204427, "ewc_loss": 0.03112536482512951, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.112536433036439e-05, "grad_norm": 18.250736236572266, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8661224246025085, "num_tokens": 583465252.0, "step": 15296 }, { "epoch": 1.945935631599033, "ewc_loss": 0.031140025705099106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.114002538495697e-05, "grad_norm": 18.34978485107422, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8626078367233276, "num_tokens": 583510001.0, "step": 15297 }, { "epoch": 1.9460628418776236, "ewc_loss": 0.031172480434179306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1172479793895036e-05, "grad_norm": 18.27264976501465, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8717593550682068, "num_tokens": 583547380.0, "step": 15298 }, { "epoch": 1.946190052156214, "ewc_loss": 0.03106856532394886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.106856456724927e-05, "grad_norm": 18.282196044921875, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8709516525268555, "num_tokens": 583588852.0, "step": 15299 }, { "epoch": 1.9463172624348046, "ewc_loss": 0.03113236092031002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.113236016361043e-05, "grad_norm": 18.26569175720215, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.877333402633667, "num_tokens": 583634006.0, "step": 15300 }, { "epoch": 1.9464444727133952, "ewc_loss": 0.03111839108169079, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1118390324991196e-05, "grad_norm": 18.31089973449707, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.856874406337738, "num_tokens": 583679430.0, "step": 15301 }, { "epoch": 1.9465716829919857, "ewc_loss": 0.03112029656767845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.112029662588611e-05, "grad_norm": 18.305885314941406, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.877128005027771, "num_tokens": 583708856.0, "step": 15302 }, { "epoch": 1.9466988932705762, "ewc_loss": 0.03103146143257618, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.103146082139574e-05, "grad_norm": 18.26905059814453, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8691505193710327, "num_tokens": 583750737.0, "step": 15303 }, { "epoch": 1.9468261035491667, "ewc_loss": 0.031147221103310585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.11472213070374e-05, "grad_norm": 18.27117156982422, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8760591745376587, "num_tokens": 583786342.0, "step": 15304 }, { "epoch": 1.9469533138277573, "ewc_loss": 0.03106941655278206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.106941585429013e-05, "grad_norm": 18.191762924194336, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8760877847671509, "num_tokens": 583821681.0, "step": 15305 }, { "epoch": 1.9470805241063478, "ewc_loss": 0.03112151101231575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.112151171080768e-05, "grad_norm": 18.242977142333984, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8522897958755493, "num_tokens": 583855640.0, "step": 15306 }, { "epoch": 1.9472077343849383, "ewc_loss": 0.031172705814242363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1172705348581076e-05, "grad_norm": 18.322546005249023, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8808282613754272, "num_tokens": 583888047.0, "step": 15307 }, { "epoch": 1.9473349446635289, "ewc_loss": 0.031196020543575287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1196021154755726e-05, "grad_norm": 18.25023651123047, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8693406581878662, "num_tokens": 583923220.0, "step": 15308 }, { "epoch": 1.9474621549421194, "ewc_loss": 0.03119412250816822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1194122129818425e-05, "grad_norm": 18.276906967163086, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8654626607894897, "num_tokens": 583965718.0, "step": 15309 }, { "epoch": 1.94758936522071, "ewc_loss": 0.031236806884407997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.123680653516203e-05, "grad_norm": 18.30768394470215, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8762437701225281, "num_tokens": 584014027.0, "step": 15310 }, { "epoch": 1.9477165754993004, "ewc_loss": 0.031139032915234566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1139032216742635e-05, "grad_norm": 18.305805206298828, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8744816184043884, "num_tokens": 584045019.0, "step": 15311 }, { "epoch": 1.947843785777891, "ewc_loss": 0.031202219426631927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.120222027064301e-05, "grad_norm": 18.29144859313965, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.854891836643219, "num_tokens": 584086839.0, "step": 15312 }, { "epoch": 1.9479709960564815, "ewc_loss": 0.031173191964626312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1173192837741226e-05, "grad_norm": 18.23556137084961, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8840655088424683, "num_tokens": 584123785.0, "step": 15313 }, { "epoch": 1.948098206335072, "ewc_loss": 0.031223025172948837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.122302587144077e-05, "grad_norm": 18.383255004882812, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8622449636459351, "num_tokens": 584163768.0, "step": 15314 }, { "epoch": 1.9482254166136623, "ewc_loss": 0.031199714168906212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1199713703244925e-05, "grad_norm": 18.240299224853516, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.866947591304779, "num_tokens": 584201668.0, "step": 15315 }, { "epoch": 1.9483526268922529, "ewc_loss": 0.031210165470838547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.12101656163577e-05, "grad_norm": 18.367658615112305, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8783679008483887, "num_tokens": 584236626.0, "step": 15316 }, { "epoch": 1.9484798371708434, "ewc_loss": 0.031195878982543945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.119587927358225e-05, "grad_norm": 18.24214744567871, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8546475768089294, "num_tokens": 584279811.0, "step": 15317 }, { "epoch": 1.948607047449434, "ewc_loss": 0.031150132417678833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115013169008307e-05, "grad_norm": 18.302175521850586, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8707967400550842, "num_tokens": 584317314.0, "step": 15318 }, { "epoch": 1.9487342577280244, "ewc_loss": 0.0312332846224308, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.123328497167677e-05, "grad_norm": 18.295352935791016, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8753138184547424, "num_tokens": 584355768.0, "step": 15319 }, { "epoch": 1.948861468006615, "ewc_loss": 0.03118710406124592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1187104468699545e-05, "grad_norm": 18.269901275634766, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8685277700424194, "num_tokens": 584386205.0, "step": 15320 }, { "epoch": 1.9489886782852053, "ewc_loss": 0.03121843747794628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1218438380165026e-05, "grad_norm": 18.276432037353516, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8815507292747498, "num_tokens": 584419721.0, "step": 15321 }, { "epoch": 1.9491158885637958, "ewc_loss": 0.03121926449239254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1219264201354235e-05, "grad_norm": 18.345766067504883, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8683285713195801, "num_tokens": 584451029.0, "step": 15322 }, { "epoch": 1.9492430988423863, "ewc_loss": 0.031225180253386497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1225179554894567e-05, "grad_norm": 18.209745407104492, "learning_rate": 1e-06, "loss": 0.3659, "mean_token_accuracy": 0.8847879767417908, "num_tokens": 584483991.0, "step": 15323 }, { "epoch": 1.9493703091209769, "ewc_loss": 0.03120708465576172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.120708424830809e-05, "grad_norm": 18.3165283203125, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8686548471450806, "num_tokens": 584517652.0, "step": 15324 }, { "epoch": 1.9494975193995674, "ewc_loss": 0.03127119317650795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.127119271084666e-05, "grad_norm": 18.187936782836914, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8784166574478149, "num_tokens": 584561058.0, "step": 15325 }, { "epoch": 1.949624729678158, "ewc_loss": 0.03120417706668377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1204177503241226e-05, "grad_norm": 18.30592155456543, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8740967512130737, "num_tokens": 584595097.0, "step": 15326 }, { "epoch": 1.9497519399567484, "ewc_loss": 0.03137011453509331, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137011299259029e-05, "grad_norm": 18.32468605041504, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8775548338890076, "num_tokens": 584639073.0, "step": 15327 }, { "epoch": 1.949879150235339, "ewc_loss": 0.03126244619488716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1262447009794414e-05, "grad_norm": 18.351238250732422, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8612580299377441, "num_tokens": 584680517.0, "step": 15328 }, { "epoch": 1.9500063605139295, "ewc_loss": 0.03128201514482498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.128201569779776e-05, "grad_norm": 18.235958099365234, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8698987364768982, "num_tokens": 584717796.0, "step": 15329 }, { "epoch": 1.95013357079252, "ewc_loss": 0.03126814588904381, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.126814772258513e-05, "grad_norm": 18.296659469604492, "learning_rate": 1e-06, "loss": 0.3607, "mean_token_accuracy": 0.8834060430526733, "num_tokens": 584755595.0, "step": 15330 }, { "epoch": 1.9502607810711106, "ewc_loss": 0.031275276094675064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.127527452306822e-05, "grad_norm": 18.24028205871582, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8608036041259766, "num_tokens": 584798681.0, "step": 15331 }, { "epoch": 1.950387991349701, "ewc_loss": 0.03124229609966278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.124229624518193e-05, "grad_norm": 18.299097061157227, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8736971020698547, "num_tokens": 584835424.0, "step": 15332 }, { "epoch": 1.9505152016282916, "ewc_loss": 0.031292591243982315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1292591302189976e-05, "grad_norm": 18.299779891967773, "learning_rate": 1e-06, "loss": 0.364, "mean_token_accuracy": 0.8837454319000244, "num_tokens": 584872063.0, "step": 15333 }, { "epoch": 1.9506424119068821, "ewc_loss": 0.031221359968185425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.122135967714712e-05, "grad_norm": 18.334369659423828, "learning_rate": 1e-06, "loss": 0.3587, "mean_token_accuracy": 0.8861576318740845, "num_tokens": 584905850.0, "step": 15334 }, { "epoch": 1.9507696221854727, "ewc_loss": 0.03123016282916069, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.123016358586028e-05, "grad_norm": 18.368366241455078, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8792678713798523, "num_tokens": 584946480.0, "step": 15335 }, { "epoch": 1.9508968324640632, "ewc_loss": 0.031199783086776733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.119978282484226e-05, "grad_norm": 18.314037322998047, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.8771584033966064, "num_tokens": 584977382.0, "step": 15336 }, { "epoch": 1.9510240427426537, "ewc_loss": 0.03119889460504055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.119889515801333e-05, "grad_norm": 18.30094337463379, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8732175827026367, "num_tokens": 585010885.0, "step": 15337 }, { "epoch": 1.9511512530212443, "ewc_loss": 0.031229326501488686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.122932685073465e-05, "grad_norm": 18.273025512695312, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8691497445106506, "num_tokens": 585048208.0, "step": 15338 }, { "epoch": 1.9512784632998348, "ewc_loss": 0.031225524842739105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.122552516288124e-05, "grad_norm": 18.328500747680664, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.866142749786377, "num_tokens": 585087477.0, "step": 15339 }, { "epoch": 1.951405673578425, "ewc_loss": 0.031241390854120255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.124139038845897e-05, "grad_norm": 18.31289291381836, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8703881502151489, "num_tokens": 585120860.0, "step": 15340 }, { "epoch": 1.9515328838570156, "ewc_loss": 0.03120829537510872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1208295695250854e-05, "grad_norm": 18.23817253112793, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8722183108329773, "num_tokens": 585161887.0, "step": 15341 }, { "epoch": 1.9516600941356061, "ewc_loss": 0.03126528114080429, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.126528099528514e-05, "grad_norm": 18.354843139648438, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8792926669120789, "num_tokens": 585200496.0, "step": 15342 }, { "epoch": 1.9517873044141967, "ewc_loss": 0.03132753074169159, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1327530450653285e-05, "grad_norm": 18.264522552490234, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8708583116531372, "num_tokens": 585243976.0, "step": 15343 }, { "epoch": 1.9519145146927872, "ewc_loss": 0.031208764761686325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.120876499451697e-05, "grad_norm": 18.35966682434082, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8654186725616455, "num_tokens": 585282366.0, "step": 15344 }, { "epoch": 1.9520417249713777, "ewc_loss": 0.03126469999551773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1264698918676004e-05, "grad_norm": 18.251304626464844, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.86875319480896, "num_tokens": 585325224.0, "step": 15345 }, { "epoch": 1.952168935249968, "ewc_loss": 0.03122522681951523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.122522684861906e-05, "grad_norm": 18.28181266784668, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8785586953163147, "num_tokens": 585361954.0, "step": 15346 }, { "epoch": 1.9522961455285586, "ewc_loss": 0.03132602944970131, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1326027965405956e-05, "grad_norm": 18.290719985961914, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8672263622283936, "num_tokens": 585406033.0, "step": 15347 }, { "epoch": 1.952423355807149, "ewc_loss": 0.03128238022327423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.128237949567847e-05, "grad_norm": 18.35006332397461, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8693034648895264, "num_tokens": 585440457.0, "step": 15348 }, { "epoch": 1.9525505660857396, "ewc_loss": 0.031262028962373734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.12620286422316e-05, "grad_norm": 18.260623931884766, "learning_rate": 1e-06, "loss": 0.3441, "mean_token_accuracy": 0.8924914598464966, "num_tokens": 585473451.0, "step": 15349 }, { "epoch": 1.9526777763643302, "ewc_loss": 0.03121366910636425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.121366898994893e-05, "grad_norm": 18.31427574157715, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8737608194351196, "num_tokens": 585509652.0, "step": 15350 }, { "epoch": 1.9528049866429207, "ewc_loss": 0.031248288229107857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.124828799627721e-05, "grad_norm": 18.27992820739746, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8751881718635559, "num_tokens": 585545591.0, "step": 15351 }, { "epoch": 1.9529321969215112, "ewc_loss": 0.03114570863544941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.114570790785365e-05, "grad_norm": 18.314674377441406, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8592739701271057, "num_tokens": 585583757.0, "step": 15352 }, { "epoch": 1.9530594072001017, "ewc_loss": 0.03123817779123783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1238178053172305e-05, "grad_norm": 18.27885627746582, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8593647480010986, "num_tokens": 585623897.0, "step": 15353 }, { "epoch": 1.9531866174786923, "ewc_loss": 0.03113926202058792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.113926140940748e-05, "grad_norm": 18.23806381225586, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8741376399993896, "num_tokens": 585661043.0, "step": 15354 }, { "epoch": 1.9533138277572828, "ewc_loss": 0.03126634657382965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1266346923075616e-05, "grad_norm": 18.28786849975586, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8442962765693665, "num_tokens": 585699201.0, "step": 15355 }, { "epoch": 1.9534410380358733, "ewc_loss": 0.03125526010990143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.12552583636716e-05, "grad_norm": 18.276596069335938, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.863588273525238, "num_tokens": 585735325.0, "step": 15356 }, { "epoch": 1.9535682483144639, "ewc_loss": 0.03127165511250496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.127165473415516e-05, "grad_norm": 18.25217628479004, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.8781644701957703, "num_tokens": 585769909.0, "step": 15357 }, { "epoch": 1.9536954585930544, "ewc_loss": 0.031186725944280624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.118672611890361e-05, "grad_norm": 18.25827980041504, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8600801229476929, "num_tokens": 585804590.0, "step": 15358 }, { "epoch": 1.953822668871645, "ewc_loss": 0.031238945201039314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.12389456667006e-05, "grad_norm": 18.199750900268555, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8660477995872498, "num_tokens": 585843349.0, "step": 15359 }, { "epoch": 1.9539498791502354, "ewc_loss": 0.03126367926597595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.126368028461002e-05, "grad_norm": 18.281936645507812, "learning_rate": 1e-06, "loss": 0.366, "mean_token_accuracy": 0.8830063343048096, "num_tokens": 585881846.0, "step": 15360 }, { "epoch": 1.954077089428826, "ewc_loss": 0.03128253296017647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.128253229078837e-05, "grad_norm": 18.243947982788086, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8687528371810913, "num_tokens": 585917848.0, "step": 15361 }, { "epoch": 1.9542042997074165, "ewc_loss": 0.031214073300361633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1214072805596516e-05, "grad_norm": 18.2259521484375, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8759483098983765, "num_tokens": 585961249.0, "step": 15362 }, { "epoch": 1.954331509986007, "ewc_loss": 0.03130761906504631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.130761979264207e-05, "grad_norm": 18.162996292114258, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.878979504108429, "num_tokens": 586005795.0, "step": 15363 }, { "epoch": 1.9544587202645973, "ewc_loss": 0.0312713161110878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1271316402126104e-05, "grad_norm": 18.299386978149414, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8802167177200317, "num_tokens": 586041523.0, "step": 15364 }, { "epoch": 1.9545859305431879, "ewc_loss": 0.031350716948509216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.135071528959088e-05, "grad_norm": 18.23700714111328, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8620911836624146, "num_tokens": 586077800.0, "step": 15365 }, { "epoch": 1.9547131408217784, "ewc_loss": 0.03124619647860527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1246196158463135e-05, "grad_norm": 18.3308162689209, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8654162883758545, "num_tokens": 586116010.0, "step": 15366 }, { "epoch": 1.954840351100369, "ewc_loss": 0.03133831173181534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1338313419837505e-05, "grad_norm": 18.300308227539062, "learning_rate": 1e-06, "loss": 0.3635, "mean_token_accuracy": 0.8822525143623352, "num_tokens": 586155359.0, "step": 15367 }, { "epoch": 1.9549675613789594, "ewc_loss": 0.03121650591492653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.121650661341846e-05, "grad_norm": 18.260072708129883, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8766024112701416, "num_tokens": 586188190.0, "step": 15368 }, { "epoch": 1.95509477165755, "ewc_loss": 0.03131211921572685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1312119972426444e-05, "grad_norm": 18.297780990600586, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8701390624046326, "num_tokens": 586225971.0, "step": 15369 }, { "epoch": 1.9552219819361403, "ewc_loss": 0.031213486567139626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1213487091008574e-05, "grad_norm": 18.26563262939453, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8672391772270203, "num_tokens": 586263295.0, "step": 15370 }, { "epoch": 1.9553491922147308, "ewc_loss": 0.031272098422050476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.127209856756963e-05, "grad_norm": 18.428424835205078, "learning_rate": 1e-06, "loss": 0.3363, "mean_token_accuracy": 0.8924624919891357, "num_tokens": 586293547.0, "step": 15371 }, { "epoch": 1.9554764024933213, "ewc_loss": 0.031292665749788284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.129266406176612e-05, "grad_norm": 18.29364585876465, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8663802742958069, "num_tokens": 586333180.0, "step": 15372 }, { "epoch": 1.9556036127719119, "ewc_loss": 0.03119046613574028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.11904659611173e-05, "grad_norm": 18.288188934326172, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8715711832046509, "num_tokens": 586368780.0, "step": 15373 }, { "epoch": 1.9557308230505024, "ewc_loss": 0.0312423687428236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1242369004758075e-05, "grad_norm": 18.344911575317383, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8555371761322021, "num_tokens": 586404177.0, "step": 15374 }, { "epoch": 1.955858033329093, "ewc_loss": 0.03121940605342388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.121940608252771e-05, "grad_norm": 18.244089126586914, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8562030792236328, "num_tokens": 586446917.0, "step": 15375 }, { "epoch": 1.9559852436076834, "ewc_loss": 0.031221268698573112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.122126872767694e-05, "grad_norm": 18.315696716308594, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8715367913246155, "num_tokens": 586485832.0, "step": 15376 }, { "epoch": 1.956112453886274, "ewc_loss": 0.031303126364946365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1303126888815314e-05, "grad_norm": 18.27060890197754, "learning_rate": 1e-06, "loss": 0.3781, "mean_token_accuracy": 0.8809170722961426, "num_tokens": 586524089.0, "step": 15377 }, { "epoch": 1.9562396641648645, "ewc_loss": 0.031275682151317596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1275681976694614e-05, "grad_norm": 18.362855911254883, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8742641806602478, "num_tokens": 586564097.0, "step": 15378 }, { "epoch": 1.956366874443455, "ewc_loss": 0.031225619837641716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1225619750330225e-05, "grad_norm": 18.26557731628418, "learning_rate": 1e-06, "loss": 0.3662, "mean_token_accuracy": 0.8841078281402588, "num_tokens": 586600653.0, "step": 15379 }, { "epoch": 1.9564940847220456, "ewc_loss": 0.031254030764102936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.125403236481361e-05, "grad_norm": 18.34429359436035, "learning_rate": 1e-06, "loss": 0.384, "mean_token_accuracy": 0.8764617443084717, "num_tokens": 586639527.0, "step": 15380 }, { "epoch": 1.956621295000636, "ewc_loss": 0.031231218948960304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.123121859971434e-05, "grad_norm": 18.192508697509766, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8763198256492615, "num_tokens": 586670885.0, "step": 15381 }, { "epoch": 1.9567485052792266, "ewc_loss": 0.03125089034438133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1250889151124284e-05, "grad_norm": 18.304792404174805, "learning_rate": 1e-06, "loss": 0.366, "mean_token_accuracy": 0.8841903209686279, "num_tokens": 586709875.0, "step": 15382 }, { "epoch": 1.9568757155578171, "ewc_loss": 0.03128259256482124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.128259413642809e-05, "grad_norm": 18.20313835144043, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.865071713924408, "num_tokens": 586753164.0, "step": 15383 }, { "epoch": 1.9570029258364077, "ewc_loss": 0.031235627830028534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1235627830028534e-05, "grad_norm": 18.317302703857422, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8778365850448608, "num_tokens": 586793737.0, "step": 15384 }, { "epoch": 1.9571301361149982, "ewc_loss": 0.031322184950113297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.132218625978567e-05, "grad_norm": 18.22905921936035, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8666536808013916, "num_tokens": 586833428.0, "step": 15385 }, { "epoch": 1.9572573463935887, "ewc_loss": 0.031214196234941483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.121419649687596e-05, "grad_norm": 18.337736129760742, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8485125303268433, "num_tokens": 586871202.0, "step": 15386 }, { "epoch": 1.9573845566721793, "ewc_loss": 0.03127354383468628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1273542845156044e-05, "grad_norm": 18.235084533691406, "learning_rate": 1e-06, "loss": 0.3625, "mean_token_accuracy": 0.8817260265350342, "num_tokens": 586905537.0, "step": 15387 }, { "epoch": 1.9575117669507698, "ewc_loss": 0.031260229647159576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1260231480700895e-05, "grad_norm": 18.354000091552734, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8619090914726257, "num_tokens": 586940378.0, "step": 15388 }, { "epoch": 1.95763897722936, "ewc_loss": 0.03126631677150726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.126631781924516e-05, "grad_norm": 18.298442840576172, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8810381889343262, "num_tokens": 586978806.0, "step": 15389 }, { "epoch": 1.9577661875079506, "ewc_loss": 0.03122399002313614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.122398993582465e-05, "grad_norm": 18.278043746948242, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8641332983970642, "num_tokens": 587007367.0, "step": 15390 }, { "epoch": 1.9578933977865411, "ewc_loss": 0.03130442649126053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1304425647249445e-05, "grad_norm": 18.32158088684082, "learning_rate": 1e-06, "loss": 0.3391, "mean_token_accuracy": 0.8927837014198303, "num_tokens": 587040422.0, "step": 15391 }, { "epoch": 1.9580206080651317, "ewc_loss": 0.03127724677324295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.127724630758166e-05, "grad_norm": 18.337066650390625, "learning_rate": 1e-06, "loss": 0.3589, "mean_token_accuracy": 0.8877502679824829, "num_tokens": 587070238.0, "step": 15392 }, { "epoch": 1.9581478183437222, "ewc_loss": 0.031248528510332108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.124852810287848e-05, "grad_norm": 18.293685913085938, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8774077296257019, "num_tokens": 587105973.0, "step": 15393 }, { "epoch": 1.9582750286223127, "ewc_loss": 0.0312398299574852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1239829695550725e-05, "grad_norm": 18.3222713470459, "learning_rate": 1e-06, "loss": 0.3568, "mean_token_accuracy": 0.8837759494781494, "num_tokens": 587137975.0, "step": 15394 }, { "epoch": 1.958402238900903, "ewc_loss": 0.03130863979458809, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.130863842670806e-05, "grad_norm": 18.288177490234375, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8677561283111572, "num_tokens": 587169176.0, "step": 15395 }, { "epoch": 1.9585294491794936, "ewc_loss": 0.031331390142440796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.133139034616761e-05, "grad_norm": 18.29022979736328, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8690440058708191, "num_tokens": 587205570.0, "step": 15396 }, { "epoch": 1.958656659458084, "ewc_loss": 0.0313444547355175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1344454328063875e-05, "grad_norm": 18.366811752319336, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8630162477493286, "num_tokens": 587246729.0, "step": 15397 }, { "epoch": 1.9587838697366746, "ewc_loss": 0.031352024525403976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.135202496196143e-05, "grad_norm": 18.31264305114746, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8758964538574219, "num_tokens": 587284537.0, "step": 15398 }, { "epoch": 1.9589110800152651, "ewc_loss": 0.031322579830884933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.132257916149683e-05, "grad_norm": 18.316795349121094, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8762578964233398, "num_tokens": 587324394.0, "step": 15399 }, { "epoch": 1.9590382902938557, "ewc_loss": 0.03134046122431755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.134045982733369e-05, "grad_norm": 18.390962600708008, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8791226744651794, "num_tokens": 587359935.0, "step": 15400 }, { "epoch": 1.9591655005724462, "ewc_loss": 0.031292762607336044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.129276228719391e-05, "grad_norm": 18.242727279663086, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.8833434581756592, "num_tokens": 587402081.0, "step": 15401 }, { "epoch": 1.9592927108510367, "ewc_loss": 0.03124692291021347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1246923754224554e-05, "grad_norm": 18.395349502563477, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8687245845794678, "num_tokens": 587442939.0, "step": 15402 }, { "epoch": 1.9594199211296273, "ewc_loss": 0.031359124928712845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1359126296592876e-05, "grad_norm": 18.318315505981445, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.862826943397522, "num_tokens": 587479155.0, "step": 15403 }, { "epoch": 1.9595471314082178, "ewc_loss": 0.031250789761543274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.125079092569649e-05, "grad_norm": 18.425296783447266, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8604223728179932, "num_tokens": 587519285.0, "step": 15404 }, { "epoch": 1.9596743416868083, "ewc_loss": 0.03126499429345131, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.126499359495938e-05, "grad_norm": 18.18804359436035, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8529878258705139, "num_tokens": 587563978.0, "step": 15405 }, { "epoch": 1.9598015519653988, "ewc_loss": 0.031217340379953384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1217339710565284e-05, "grad_norm": 18.448026657104492, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8654099702835083, "num_tokens": 587599052.0, "step": 15406 }, { "epoch": 1.9599287622439894, "ewc_loss": 0.0312781035900116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.127810487058014e-05, "grad_norm": 18.24092674255371, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8744542002677917, "num_tokens": 587639980.0, "step": 15407 }, { "epoch": 1.96005597252258, "ewc_loss": 0.03119710087776184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.119710163446143e-05, "grad_norm": 18.47675895690918, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8677985668182373, "num_tokens": 587673836.0, "step": 15408 }, { "epoch": 1.9601831828011704, "ewc_loss": 0.03133029490709305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1330295314546674e-05, "grad_norm": 18.300575256347656, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8721746206283569, "num_tokens": 587712896.0, "step": 15409 }, { "epoch": 1.960310393079761, "ewc_loss": 0.03114243410527706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1142433726927266e-05, "grad_norm": 18.32927131652832, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8658546805381775, "num_tokens": 587747352.0, "step": 15410 }, { "epoch": 1.9604376033583515, "ewc_loss": 0.031240452080965042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.124045178992674e-05, "grad_norm": 18.29644775390625, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8700011968612671, "num_tokens": 587790234.0, "step": 15411 }, { "epoch": 1.960564813636942, "ewc_loss": 0.031147904694080353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.114790524705313e-05, "grad_norm": 18.33177947998047, "learning_rate": 1e-06, "loss": 0.3654, "mean_token_accuracy": 0.8849822282791138, "num_tokens": 587830185.0, "step": 15412 }, { "epoch": 1.9606920239155323, "ewc_loss": 0.03113732300698757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.11373223667033e-05, "grad_norm": 18.269351959228516, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8727387189865112, "num_tokens": 587866265.0, "step": 15413 }, { "epoch": 1.9608192341941229, "ewc_loss": 0.031165041029453278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1165040127234533e-05, "grad_norm": 18.297386169433594, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.8829726576805115, "num_tokens": 587904956.0, "step": 15414 }, { "epoch": 1.9609464444727134, "ewc_loss": 0.031168133020401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.116813240922056e-05, "grad_norm": 18.2978572845459, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8785083889961243, "num_tokens": 587942369.0, "step": 15415 }, { "epoch": 1.961073654751304, "ewc_loss": 0.031159739941358566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.11597395921126e-05, "grad_norm": 18.35669708251953, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8764603137969971, "num_tokens": 587979340.0, "step": 15416 }, { "epoch": 1.9612008650298944, "ewc_loss": 0.031185703352093697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1185703846858814e-05, "grad_norm": 18.32788848876953, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8676745891571045, "num_tokens": 588021812.0, "step": 15417 }, { "epoch": 1.961328075308485, "ewc_loss": 0.031154537573456764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115453728241846e-05, "grad_norm": 18.308799743652344, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8673720359802246, "num_tokens": 588064978.0, "step": 15418 }, { "epoch": 1.9614552855870753, "ewc_loss": 0.031132781878113747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.113278216915205e-05, "grad_norm": 18.358680725097656, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8678639531135559, "num_tokens": 588098142.0, "step": 15419 }, { "epoch": 1.9615824958656658, "ewc_loss": 0.03111635334789753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1116353056859225e-05, "grad_norm": 18.271780014038086, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8782981634140015, "num_tokens": 588141808.0, "step": 15420 }, { "epoch": 1.9617097061442563, "ewc_loss": 0.03115088865160942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115088838967495e-05, "grad_norm": 18.357646942138672, "learning_rate": 1e-06, "loss": 0.3724, "mean_token_accuracy": 0.8788372874259949, "num_tokens": 588178137.0, "step": 15421 }, { "epoch": 1.9618369164228469, "ewc_loss": 0.031214497983455658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1214498449116945e-05, "grad_norm": 18.33110809326172, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8650434017181396, "num_tokens": 588215646.0, "step": 15422 }, { "epoch": 1.9619641267014374, "ewc_loss": 0.031115273013710976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.111527257715352e-05, "grad_norm": 18.28624725341797, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8632411360740662, "num_tokens": 588251345.0, "step": 15423 }, { "epoch": 1.962091336980028, "ewc_loss": 0.031225906684994698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1225907150655985e-05, "grad_norm": 18.39594268798828, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8776791095733643, "num_tokens": 588291222.0, "step": 15424 }, { "epoch": 1.9622185472586184, "ewc_loss": 0.031230712309479713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.123071292066015e-05, "grad_norm": 18.316375732421875, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.863359808921814, "num_tokens": 588329480.0, "step": 15425 }, { "epoch": 1.962345757537209, "ewc_loss": 0.031166572123765945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.116657171631232e-05, "grad_norm": 18.333139419555664, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8623204231262207, "num_tokens": 588367143.0, "step": 15426 }, { "epoch": 1.9624729678157995, "ewc_loss": 0.03127255290746689, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1272553314920515e-05, "grad_norm": 18.43193817138672, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.8794958591461182, "num_tokens": 588408363.0, "step": 15427 }, { "epoch": 1.96260017809439, "ewc_loss": 0.031150083988904953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115008439635858e-05, "grad_norm": 18.32305335998535, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8783973455429077, "num_tokens": 588445397.0, "step": 15428 }, { "epoch": 1.9627273883729806, "ewc_loss": 0.03122173808515072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.122173802694306e-05, "grad_norm": 18.42122459411621, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.87702476978302, "num_tokens": 588480436.0, "step": 15429 }, { "epoch": 1.962854598651571, "ewc_loss": 0.031208015978336334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.120801557088271e-05, "grad_norm": 18.375877380371094, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8695257902145386, "num_tokens": 588526918.0, "step": 15430 }, { "epoch": 1.9629818089301616, "ewc_loss": 0.031183572486042976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.118357199127786e-05, "grad_norm": 18.40487289428711, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8723433017730713, "num_tokens": 588561300.0, "step": 15431 }, { "epoch": 1.9631090192087521, "ewc_loss": 0.031148847192525864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.114884748356417e-05, "grad_norm": 18.393014907836914, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8622453808784485, "num_tokens": 588603393.0, "step": 15432 }, { "epoch": 1.9632362294873427, "ewc_loss": 0.031124277040362358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.112427657470107e-05, "grad_norm": 18.390058517456055, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8786409497261047, "num_tokens": 588643806.0, "step": 15433 }, { "epoch": 1.9633634397659332, "ewc_loss": 0.031123850494623184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.112385093118064e-05, "grad_norm": 18.418088912963867, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8704272508621216, "num_tokens": 588685090.0, "step": 15434 }, { "epoch": 1.9634906500445237, "ewc_loss": 0.031104616820812225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.110461693722755e-05, "grad_norm": 18.260709762573242, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8593572378158569, "num_tokens": 588720983.0, "step": 15435 }, { "epoch": 1.9636178603231143, "ewc_loss": 0.031123995780944824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1123996450332925e-05, "grad_norm": 18.442577362060547, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8644704222679138, "num_tokens": 588760143.0, "step": 15436 }, { "epoch": 1.9637450706017048, "ewc_loss": 0.031181709840893745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.118170934612863e-05, "grad_norm": 18.231176376342773, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8736981153488159, "num_tokens": 588796636.0, "step": 15437 }, { "epoch": 1.963872280880295, "ewc_loss": 0.03102513775229454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1025138014229015e-05, "grad_norm": 18.351947784423828, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8682347536087036, "num_tokens": 588831681.0, "step": 15438 }, { "epoch": 1.9639994911588856, "ewc_loss": 0.031170591711997986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1170591682894155e-05, "grad_norm": 18.169422149658203, "learning_rate": 1e-06, "loss": 0.3727, "mean_token_accuracy": 0.8817508220672607, "num_tokens": 588875408.0, "step": 15439 }, { "epoch": 1.9641267014374761, "ewc_loss": 0.03114873170852661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.114873106824234e-05, "grad_norm": 18.481046676635742, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8760335445404053, "num_tokens": 588913583.0, "step": 15440 }, { "epoch": 1.9642539117160667, "ewc_loss": 0.0312504768371582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.125047805951908e-05, "grad_norm": 18.274295806884766, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.8807399272918701, "num_tokens": 588949833.0, "step": 15441 }, { "epoch": 1.9643811219946572, "ewc_loss": 0.031053787097334862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.105378709733486e-05, "grad_norm": 18.359420776367188, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8764398694038391, "num_tokens": 588985678.0, "step": 15442 }, { "epoch": 1.9645083322732477, "ewc_loss": 0.031202755868434906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.120275505352765e-05, "grad_norm": 18.35679054260254, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8706551790237427, "num_tokens": 589018276.0, "step": 15443 }, { "epoch": 1.964635542551838, "ewc_loss": 0.03108869679272175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1088697141967714e-05, "grad_norm": 18.279870986938477, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8697713613510132, "num_tokens": 589061900.0, "step": 15444 }, { "epoch": 1.9647627528304286, "ewc_loss": 0.031167423352599144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.116742300335318e-05, "grad_norm": 18.343170166015625, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8733364343643188, "num_tokens": 589102740.0, "step": 15445 }, { "epoch": 1.964889963109019, "ewc_loss": 0.031154097989201546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.11540970869828e-05, "grad_norm": 18.239768981933594, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8821327090263367, "num_tokens": 589135276.0, "step": 15446 }, { "epoch": 1.9650171733876096, "ewc_loss": 0.031154613941907883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.115461367997341e-05, "grad_norm": 18.296554565429688, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.871254563331604, "num_tokens": 589173390.0, "step": 15447 }, { "epoch": 1.9651443836662001, "ewc_loss": 0.031171130016446114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1171130103757605e-05, "grad_norm": 18.365678787231445, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8643736839294434, "num_tokens": 589214592.0, "step": 15448 }, { "epoch": 1.9652715939447907, "ewc_loss": 0.031210359185934067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1210358429234475e-05, "grad_norm": 18.24913215637207, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8720812201499939, "num_tokens": 589250682.0, "step": 15449 }, { "epoch": 1.9653988042233812, "ewc_loss": 0.031136708334088326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.11367075482849e-05, "grad_norm": 18.315120697021484, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8647266626358032, "num_tokens": 589288326.0, "step": 15450 }, { "epoch": 1.9655260145019717, "ewc_loss": 0.03118998557329178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.118998574791476e-05, "grad_norm": 18.26167869567871, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8669763803482056, "num_tokens": 589321632.0, "step": 15451 }, { "epoch": 1.9656532247805623, "ewc_loss": 0.03122236393392086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.122236375929788e-05, "grad_norm": 18.333457946777344, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.862514853477478, "num_tokens": 589361134.0, "step": 15452 }, { "epoch": 1.9657804350591528, "ewc_loss": 0.03121977671980858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1219777156366035e-05, "grad_norm": 18.241283416748047, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8741397857666016, "num_tokens": 589402338.0, "step": 15453 }, { "epoch": 1.9659076453377433, "ewc_loss": 0.031195471063256264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1195471819955856e-05, "grad_norm": 18.368030548095703, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8660786747932434, "num_tokens": 589441080.0, "step": 15454 }, { "epoch": 1.9660348556163338, "ewc_loss": 0.031302981078624725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.130298136966303e-05, "grad_norm": 18.32203483581543, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8681900501251221, "num_tokens": 589484019.0, "step": 15455 }, { "epoch": 1.9661620658949244, "ewc_loss": 0.031212173402309418, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1212173780659214e-05, "grad_norm": 18.363248825073242, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8664916753768921, "num_tokens": 589525890.0, "step": 15456 }, { "epoch": 1.966289276173515, "ewc_loss": 0.03127333149313927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.127333184238523e-05, "grad_norm": 18.369815826416016, "learning_rate": 1e-06, "loss": 0.3767, "mean_token_accuracy": 0.8772316575050354, "num_tokens": 589560644.0, "step": 15457 }, { "epoch": 1.9664164864521054, "ewc_loss": 0.03127172589302063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1271727493731305e-05, "grad_norm": 18.372209548950195, "learning_rate": 1e-06, "loss": 0.3478, "mean_token_accuracy": 0.890557587146759, "num_tokens": 589600863.0, "step": 15458 }, { "epoch": 1.966543696730696, "ewc_loss": 0.03122781030833721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1227809813572094e-05, "grad_norm": 18.30900001525879, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8708857297897339, "num_tokens": 589637948.0, "step": 15459 }, { "epoch": 1.9666709070092865, "ewc_loss": 0.03125765174627304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.125765215372667e-05, "grad_norm": 18.37450408935547, "learning_rate": 1e-06, "loss": 0.3671, "mean_token_accuracy": 0.8838415145874023, "num_tokens": 589678061.0, "step": 15460 }, { "epoch": 1.966798117287877, "ewc_loss": 0.031283147633075714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1283147109206766e-05, "grad_norm": 18.31222152709961, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8606250882148743, "num_tokens": 589718050.0, "step": 15461 }, { "epoch": 1.9669253275664673, "ewc_loss": 0.031215200200676918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1215200579026714e-05, "grad_norm": 18.294212341308594, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8746744990348816, "num_tokens": 589759206.0, "step": 15462 }, { "epoch": 1.9670525378450578, "ewc_loss": 0.03126386180520058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.126386218355037e-05, "grad_norm": 18.290695190429688, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8607504367828369, "num_tokens": 589800218.0, "step": 15463 }, { "epoch": 1.9671797481236484, "ewc_loss": 0.031244803220033646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.124480281258002e-05, "grad_norm": 18.2855281829834, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.8722648620605469, "num_tokens": 589843105.0, "step": 15464 }, { "epoch": 1.967306958402239, "ewc_loss": 0.031233109533786774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1233110348694026e-05, "grad_norm": 18.299903869628906, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8622716069221497, "num_tokens": 589881560.0, "step": 15465 }, { "epoch": 1.9674341686808294, "ewc_loss": 0.03126419708132744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1264196877600625e-05, "grad_norm": 18.318470001220703, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8592211604118347, "num_tokens": 589922755.0, "step": 15466 }, { "epoch": 1.96756137895942, "ewc_loss": 0.03123527206480503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.123527130810544e-05, "grad_norm": 18.24976348876953, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8528209924697876, "num_tokens": 589964982.0, "step": 15467 }, { "epoch": 1.9676885892380103, "ewc_loss": 0.031230615451931953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.123061469523236e-05, "grad_norm": 18.28495979309082, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8576269149780273, "num_tokens": 590003371.0, "step": 15468 }, { "epoch": 1.9678157995166008, "ewc_loss": 0.0312681719660759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1268173188436776e-05, "grad_norm": 18.30097007751465, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8681039214134216, "num_tokens": 590048361.0, "step": 15469 }, { "epoch": 1.9679430097951913, "ewc_loss": 0.031247563660144806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.12475640384946e-05, "grad_norm": 18.249414443969727, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8688448667526245, "num_tokens": 590083141.0, "step": 15470 }, { "epoch": 1.9680702200737819, "ewc_loss": 0.03131918981671333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1319188565248623e-05, "grad_norm": 18.37883186340332, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8647087812423706, "num_tokens": 590128161.0, "step": 15471 }, { "epoch": 1.9681974303523724, "ewc_loss": 0.031286999583244324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1286999728763476e-05, "grad_norm": 18.25202178955078, "learning_rate": 1e-06, "loss": 0.3586, "mean_token_accuracy": 0.8859052062034607, "num_tokens": 590166101.0, "step": 15472 }, { "epoch": 1.968324640630963, "ewc_loss": 0.031162424013018608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1162424420472234e-05, "grad_norm": 18.272233963012695, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8695881962776184, "num_tokens": 590207643.0, "step": 15473 }, { "epoch": 1.9684518509095534, "ewc_loss": 0.03131500631570816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.131500488962047e-05, "grad_norm": 18.322898864746094, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8636905550956726, "num_tokens": 590247567.0, "step": 15474 }, { "epoch": 1.968579061188144, "ewc_loss": 0.031243616715073586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.124361683148891e-05, "grad_norm": 18.278106689453125, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8667889833450317, "num_tokens": 590287940.0, "step": 15475 }, { "epoch": 1.9687062714667345, "ewc_loss": 0.03126031532883644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.126031515421346e-05, "grad_norm": 18.321653366088867, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8652799725532532, "num_tokens": 590322558.0, "step": 15476 }, { "epoch": 1.968833481745325, "ewc_loss": 0.03126584365963936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.126584488200024e-05, "grad_norm": 18.254623413085938, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.869293212890625, "num_tokens": 590362913.0, "step": 15477 }, { "epoch": 1.9689606920239155, "ewc_loss": 0.031229719519615173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.122971975244582e-05, "grad_norm": 18.336069107055664, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8631354570388794, "num_tokens": 590405877.0, "step": 15478 }, { "epoch": 1.969087902302506, "ewc_loss": 0.031264886260032654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.126488809357397e-05, "grad_norm": 18.239290237426758, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8620116710662842, "num_tokens": 590445373.0, "step": 15479 }, { "epoch": 1.9692151125810966, "ewc_loss": 0.031197160482406616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1197159842122346e-05, "grad_norm": 18.3471622467041, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8597076535224915, "num_tokens": 590483319.0, "step": 15480 }, { "epoch": 1.9693423228596871, "ewc_loss": 0.03127657622098923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.127657691948116e-05, "grad_norm": 18.31763458251953, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8566765785217285, "num_tokens": 590522433.0, "step": 15481 }, { "epoch": 1.9694695331382777, "ewc_loss": 0.031227922067046165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1227922590915114e-05, "grad_norm": 18.312646865844727, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8752442002296448, "num_tokens": 590564342.0, "step": 15482 }, { "epoch": 1.9695967434168682, "ewc_loss": 0.03125583380460739, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.125583316432312e-05, "grad_norm": 18.305339813232422, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8730589151382446, "num_tokens": 590594527.0, "step": 15483 }, { "epoch": 1.9697239536954587, "ewc_loss": 0.03128702566027641, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1287025194615126e-05, "grad_norm": 18.299060821533203, "learning_rate": 1e-06, "loss": 0.368, "mean_token_accuracy": 0.8823686242103577, "num_tokens": 590630573.0, "step": 15484 }, { "epoch": 1.9698511639740492, "ewc_loss": 0.03123072348535061, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1230723834596574e-05, "grad_norm": 18.270055770874023, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8719067573547363, "num_tokens": 590669333.0, "step": 15485 }, { "epoch": 1.9699783742526398, "ewc_loss": 0.03129502013325691, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.129502147203311e-05, "grad_norm": 18.335721969604492, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.8855197429656982, "num_tokens": 590709884.0, "step": 15486 }, { "epoch": 1.97010558453123, "ewc_loss": 0.03130478039383888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.130478216917254e-05, "grad_norm": 18.358572006225586, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8762083053588867, "num_tokens": 590746109.0, "step": 15487 }, { "epoch": 1.9702327948098206, "ewc_loss": 0.03129143640398979, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.129143806290813e-05, "grad_norm": 18.306129455566406, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8554521799087524, "num_tokens": 590787881.0, "step": 15488 }, { "epoch": 1.9703600050884111, "ewc_loss": 0.03129967674612999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.129967808490619e-05, "grad_norm": 18.339006423950195, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8559900522232056, "num_tokens": 590826861.0, "step": 15489 }, { "epoch": 1.9704872153670017, "ewc_loss": 0.03126007691025734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1260078685591e-05, "grad_norm": 18.229768753051758, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8792344927787781, "num_tokens": 590863104.0, "step": 15490 }, { "epoch": 1.9706144256455922, "ewc_loss": 0.03130987659096718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.130987533950247e-05, "grad_norm": 18.350242614746094, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.8775575757026672, "num_tokens": 590900582.0, "step": 15491 }, { "epoch": 1.9707416359241827, "ewc_loss": 0.03133596107363701, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1335959647549316e-05, "grad_norm": 18.30673599243164, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8771872520446777, "num_tokens": 590937735.0, "step": 15492 }, { "epoch": 1.970868846202773, "ewc_loss": 0.031310223042964935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.131022458546795e-05, "grad_norm": 18.425079345703125, "learning_rate": 1e-06, "loss": 0.3569, "mean_token_accuracy": 0.8845552802085876, "num_tokens": 590973089.0, "step": 15493 }, { "epoch": 1.9709960564813636, "ewc_loss": 0.0313376858830452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.133768404950388e-05, "grad_norm": 18.281747817993164, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8578623533248901, "num_tokens": 591012117.0, "step": 15494 }, { "epoch": 1.971123266759954, "ewc_loss": 0.03123726136982441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.123726128251292e-05, "grad_norm": 18.343181610107422, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8518354892730713, "num_tokens": 591044978.0, "step": 15495 }, { "epoch": 1.9712504770385446, "ewc_loss": 0.03135295584797859, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.135295628453605e-05, "grad_norm": 18.328907012939453, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8680182695388794, "num_tokens": 591085562.0, "step": 15496 }, { "epoch": 1.9713776873171351, "ewc_loss": 0.03127848729491234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.127848685835488e-05, "grad_norm": 18.291473388671875, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8609758615493774, "num_tokens": 591120140.0, "step": 15497 }, { "epoch": 1.9715048975957257, "ewc_loss": 0.03139336779713631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1393366953125224e-05, "grad_norm": 18.377721786499023, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8618152737617493, "num_tokens": 591153952.0, "step": 15498 }, { "epoch": 1.9716321078743162, "ewc_loss": 0.03125663474202156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.125663351966068e-05, "grad_norm": 18.25348472595215, "learning_rate": 1e-06, "loss": 0.3613, "mean_token_accuracy": 0.8869173526763916, "num_tokens": 591192983.0, "step": 15499 }, { "epoch": 1.9717593181529067, "ewc_loss": 0.03135976940393448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.135977021884173e-05, "grad_norm": 18.427688598632812, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8717435002326965, "num_tokens": 591226260.0, "step": 15500 }, { "epoch": 1.9718865284314973, "ewc_loss": 0.0313531756401062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1353174563264474e-05, "grad_norm": 18.347625732421875, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8575586080551147, "num_tokens": 591259361.0, "step": 15501 }, { "epoch": 1.9720137387100878, "ewc_loss": 0.03130808100104332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.130808181595057e-05, "grad_norm": 18.300731658935547, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.870809018611908, "num_tokens": 591299753.0, "step": 15502 }, { "epoch": 1.9721409489886783, "ewc_loss": 0.03135745972394943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.135746010229923e-05, "grad_norm": 18.30927848815918, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8729647994041443, "num_tokens": 591335008.0, "step": 15503 }, { "epoch": 1.9722681592672688, "ewc_loss": 0.03139479458332062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1394793040817603e-05, "grad_norm": 18.1903076171875, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8774935603141785, "num_tokens": 591373001.0, "step": 15504 }, { "epoch": 1.9723953695458594, "ewc_loss": 0.03136143460869789, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.136143641313538e-05, "grad_norm": 18.338966369628906, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.873145341873169, "num_tokens": 591408637.0, "step": 15505 }, { "epoch": 1.97252257982445, "ewc_loss": 0.03144369274377823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.144369111396372e-05, "grad_norm": 18.30850601196289, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8679391741752625, "num_tokens": 591444908.0, "step": 15506 }, { "epoch": 1.9726497901030404, "ewc_loss": 0.03138091787695885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1380917789647356e-05, "grad_norm": 18.261184692382812, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8727995157241821, "num_tokens": 591484641.0, "step": 15507 }, { "epoch": 1.972777000381631, "ewc_loss": 0.03144710510969162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1447103538084775e-05, "grad_norm": 18.3823184967041, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8669359683990479, "num_tokens": 591521859.0, "step": 15508 }, { "epoch": 1.9729042106602215, "ewc_loss": 0.03146882355213165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.146882227156311e-05, "grad_norm": 18.33414649963379, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8695987462997437, "num_tokens": 591559124.0, "step": 15509 }, { "epoch": 1.973031420938812, "ewc_loss": 0.031375352293252945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1375351682072505e-05, "grad_norm": 18.275318145751953, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8689399361610413, "num_tokens": 591597170.0, "step": 15510 }, { "epoch": 1.9731586312174023, "ewc_loss": 0.03142949193716049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1429492082679644e-05, "grad_norm": 18.426555633544922, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8769617676734924, "num_tokens": 591634784.0, "step": 15511 }, { "epoch": 1.9732858414959928, "ewc_loss": 0.031375180929899216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137518069706857e-05, "grad_norm": 18.315279006958008, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8748313188552856, "num_tokens": 591671904.0, "step": 15512 }, { "epoch": 1.9734130517745834, "ewc_loss": 0.031306736171245575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.130673576379195e-05, "grad_norm": 18.34107780456543, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.872409462928772, "num_tokens": 591712990.0, "step": 15513 }, { "epoch": 1.973540262053174, "ewc_loss": 0.03138260915875435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.138260944979265e-05, "grad_norm": 18.335704803466797, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.866975724697113, "num_tokens": 591748579.0, "step": 15514 }, { "epoch": 1.9736674723317644, "ewc_loss": 0.03140470013022423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1404699257109314e-05, "grad_norm": 18.442319869995117, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8640896081924438, "num_tokens": 591785865.0, "step": 15515 }, { "epoch": 1.973794682610355, "ewc_loss": 0.03134409338235855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.134409416816197e-05, "grad_norm": 18.278234481811523, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.8796776533126831, "num_tokens": 591820087.0, "step": 15516 }, { "epoch": 1.9739218928889453, "ewc_loss": 0.03129297494888306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.129297328996472e-05, "grad_norm": 18.347383499145508, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8696901798248291, "num_tokens": 591859713.0, "step": 15517 }, { "epoch": 1.9740491031675358, "ewc_loss": 0.03141385316848755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.141385241178796e-05, "grad_norm": 18.363191604614258, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.8776469230651855, "num_tokens": 591893142.0, "step": 15518 }, { "epoch": 1.9741763134461263, "ewc_loss": 0.03132162243127823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.132162237307057e-05, "grad_norm": 18.3248348236084, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8653268814086914, "num_tokens": 591931529.0, "step": 15519 }, { "epoch": 1.9743035237247168, "ewc_loss": 0.03128637745976448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.128637763438746e-05, "grad_norm": 18.315799713134766, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.8757296204566956, "num_tokens": 591976248.0, "step": 15520 }, { "epoch": 1.9744307340033074, "ewc_loss": 0.031391993165016174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.139199179713614e-05, "grad_norm": 18.3770751953125, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.868537425994873, "num_tokens": 592018008.0, "step": 15521 }, { "epoch": 1.974557944281898, "ewc_loss": 0.03137349337339401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137349267490208e-05, "grad_norm": 18.3602352142334, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8563534021377563, "num_tokens": 592062934.0, "step": 15522 }, { "epoch": 1.9746851545604884, "ewc_loss": 0.0312732569873333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1273255444830284e-05, "grad_norm": 18.24514389038086, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8692541122436523, "num_tokens": 592103129.0, "step": 15523 }, { "epoch": 1.974812364839079, "ewc_loss": 0.031303249299526215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1303250580094755e-05, "grad_norm": 18.39788818359375, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8661626577377319, "num_tokens": 592142116.0, "step": 15524 }, { "epoch": 1.9749395751176695, "ewc_loss": 0.03138591721653938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.13859163725283e-05, "grad_norm": 18.32482147216797, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8718193769454956, "num_tokens": 592182507.0, "step": 15525 }, { "epoch": 1.97506678539626, "ewc_loss": 0.03132733330130577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.13273339997977e-05, "grad_norm": 18.385679244995117, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8488471508026123, "num_tokens": 592225593.0, "step": 15526 }, { "epoch": 1.9751939956748505, "ewc_loss": 0.03140130639076233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.14013050228823e-05, "grad_norm": 18.46974754333496, "learning_rate": 1e-06, "loss": 0.3358, "mean_token_accuracy": 0.8945268392562866, "num_tokens": 592270676.0, "step": 15527 }, { "epoch": 1.975321205953441, "ewc_loss": 0.031215105205774307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.121510599157773e-05, "grad_norm": 18.3187313079834, "learning_rate": 1e-06, "loss": 0.3481, "mean_token_accuracy": 0.8889399766921997, "num_tokens": 592305537.0, "step": 15528 }, { "epoch": 1.9754484162320316, "ewc_loss": 0.03123057447373867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1230574677465484e-05, "grad_norm": 18.402116775512695, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8775472640991211, "num_tokens": 592338289.0, "step": 15529 }, { "epoch": 1.9755756265106221, "ewc_loss": 0.0313606932759285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.136069426545873e-05, "grad_norm": 18.331783294677734, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8731491565704346, "num_tokens": 592379189.0, "step": 15530 }, { "epoch": 1.9757028367892127, "ewc_loss": 0.031160004436969757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.116000516456552e-05, "grad_norm": 18.315439224243164, "learning_rate": 1e-06, "loss": 0.3652, "mean_token_accuracy": 0.8839669823646545, "num_tokens": 592421875.0, "step": 15531 }, { "epoch": 1.9758300470678032, "ewc_loss": 0.03131071850657463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1310719350585714e-05, "grad_norm": 18.3783016204834, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8524730205535889, "num_tokens": 592457438.0, "step": 15532 }, { "epoch": 1.9759572573463937, "ewc_loss": 0.03131426125764847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.131426274194382e-05, "grad_norm": 18.340959548950195, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.8819081783294678, "num_tokens": 592493230.0, "step": 15533 }, { "epoch": 1.9760844676249842, "ewc_loss": 0.03126523271203041, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1265233701560646e-05, "grad_norm": 18.35591697692871, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8548998832702637, "num_tokens": 592533656.0, "step": 15534 }, { "epoch": 1.9762116779035748, "ewc_loss": 0.031330406665802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1330408091889694e-05, "grad_norm": 18.312728881835938, "learning_rate": 1e-06, "loss": 0.3597, "mean_token_accuracy": 0.88409024477005, "num_tokens": 592571622.0, "step": 15535 }, { "epoch": 1.976338888182165, "ewc_loss": 0.031286824494600296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1286825105780736e-05, "grad_norm": 18.31072425842285, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.865319013595581, "num_tokens": 592613260.0, "step": 15536 }, { "epoch": 1.9764660984607556, "ewc_loss": 0.031355906277894974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.13559066853486e-05, "grad_norm": 18.34669303894043, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8682601451873779, "num_tokens": 592648872.0, "step": 15537 }, { "epoch": 1.9765933087393461, "ewc_loss": 0.0312739834189415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.12739830405917e-05, "grad_norm": 18.275705337524414, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8710281252861023, "num_tokens": 592682336.0, "step": 15538 }, { "epoch": 1.9767205190179367, "ewc_loss": 0.03129718452692032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.129718606942333e-05, "grad_norm": 18.344558715820312, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.8841436505317688, "num_tokens": 592716412.0, "step": 15539 }, { "epoch": 1.9768477292965272, "ewc_loss": 0.03135102987289429, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.135102815576829e-05, "grad_norm": 18.37906265258789, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8748363256454468, "num_tokens": 592753936.0, "step": 15540 }, { "epoch": 1.9769749395751177, "ewc_loss": 0.03133754804730415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1337549444288015e-05, "grad_norm": 18.37952423095703, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8715224862098694, "num_tokens": 592792731.0, "step": 15541 }, { "epoch": 1.977102149853708, "ewc_loss": 0.031260449439287186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.126044975942932e-05, "grad_norm": 18.262826919555664, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8743019104003906, "num_tokens": 592832035.0, "step": 15542 }, { "epoch": 1.9772293601322986, "ewc_loss": 0.031282953917980194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.128295429632999e-05, "grad_norm": 18.312484741210938, "learning_rate": 1e-06, "loss": 0.3441, "mean_token_accuracy": 0.8885961174964905, "num_tokens": 592869693.0, "step": 15543 }, { "epoch": 1.977356570410889, "ewc_loss": 0.031322550028562546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1322550057666376e-05, "grad_norm": 18.28932762145996, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8741265535354614, "num_tokens": 592909648.0, "step": 15544 }, { "epoch": 1.9774837806894796, "ewc_loss": 0.03134793043136597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.134792859782465e-05, "grad_norm": 18.33196258544922, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8747316598892212, "num_tokens": 592942539.0, "step": 15545 }, { "epoch": 1.9776109909680701, "ewc_loss": 0.031381361186504364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.138136162306182e-05, "grad_norm": 18.282384872436523, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8702119588851929, "num_tokens": 592983680.0, "step": 15546 }, { "epoch": 1.9777382012466607, "ewc_loss": 0.03132941946387291, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1329418561654165e-05, "grad_norm": 18.322336196899414, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8593337535858154, "num_tokens": 593023161.0, "step": 15547 }, { "epoch": 1.9778654115252512, "ewc_loss": 0.03138284757733345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1382845918415114e-05, "grad_norm": 18.324020385742188, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.852643609046936, "num_tokens": 593066612.0, "step": 15548 }, { "epoch": 1.9779926218038417, "ewc_loss": 0.03137347102165222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137347084702924e-05, "grad_norm": 18.33409309387207, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8593769669532776, "num_tokens": 593105224.0, "step": 15549 }, { "epoch": 1.9781198320824323, "ewc_loss": 0.0313677042722702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.136770465062e-05, "grad_norm": 18.4040584564209, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8722406029701233, "num_tokens": 593143683.0, "step": 15550 }, { "epoch": 1.9782470423610228, "ewc_loss": 0.031348925083875656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.134892540401779e-05, "grad_norm": 18.208341598510742, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.856448769569397, "num_tokens": 593179827.0, "step": 15551 }, { "epoch": 1.9783742526396133, "ewc_loss": 0.03137252852320671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.13725286105182e-05, "grad_norm": 18.36795997619629, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8709466457366943, "num_tokens": 593220936.0, "step": 15552 }, { "epoch": 1.9785014629182038, "ewc_loss": 0.03137779235839844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1377792765852064e-05, "grad_norm": 18.24071502685547, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8671372532844543, "num_tokens": 593258503.0, "step": 15553 }, { "epoch": 1.9786286731967944, "ewc_loss": 0.031368035823106766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1368035706691444e-05, "grad_norm": 18.345930099487305, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8502206802368164, "num_tokens": 593293738.0, "step": 15554 }, { "epoch": 1.978755883475385, "ewc_loss": 0.031399644911289215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1399646104546264e-05, "grad_norm": 18.28243637084961, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8611711263656616, "num_tokens": 593338037.0, "step": 15555 }, { "epoch": 1.9788830937539754, "ewc_loss": 0.03133060783147812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1330608180724084e-05, "grad_norm": 18.28955078125, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8554807901382446, "num_tokens": 593383509.0, "step": 15556 }, { "epoch": 1.979010304032566, "ewc_loss": 0.0314427874982357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1442788895219564e-05, "grad_norm": 18.297826766967773, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8746272325515747, "num_tokens": 593417734.0, "step": 15557 }, { "epoch": 1.9791375143111565, "ewc_loss": 0.031346943229436874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1346942705567926e-05, "grad_norm": 18.34088706970215, "learning_rate": 1e-06, "loss": 0.3613, "mean_token_accuracy": 0.8830902576446533, "num_tokens": 593454502.0, "step": 15558 }, { "epoch": 1.979264724589747, "ewc_loss": 0.031383708119392395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1383708119392395e-05, "grad_norm": 18.29817008972168, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8669239282608032, "num_tokens": 593498864.0, "step": 15559 }, { "epoch": 1.9793919348683373, "ewc_loss": 0.031443387269973755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.144338552374393e-05, "grad_norm": 18.38148307800293, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8673505783081055, "num_tokens": 593540125.0, "step": 15560 }, { "epoch": 1.9795191451469278, "ewc_loss": 0.031343478709459305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1343479349743575e-05, "grad_norm": 18.259849548339844, "learning_rate": 1e-06, "loss": 0.3655, "mean_token_accuracy": 0.8811770677566528, "num_tokens": 593566937.0, "step": 15561 }, { "epoch": 1.9796463554255184, "ewc_loss": 0.03135097399353981, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1350973586086184e-05, "grad_norm": 18.32328987121582, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8664783835411072, "num_tokens": 593600109.0, "step": 15562 }, { "epoch": 1.979773565704109, "ewc_loss": 0.031461965292692184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1461964681511745e-05, "grad_norm": 18.3471622467041, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8699327707290649, "num_tokens": 593638982.0, "step": 15563 }, { "epoch": 1.9799007759826994, "ewc_loss": 0.031350184231996536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1350184144685045e-05, "grad_norm": 18.312360763549805, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8610385060310364, "num_tokens": 593677270.0, "step": 15564 }, { "epoch": 1.98002798626129, "ewc_loss": 0.0314013808965683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1401381420437247e-05, "grad_norm": 18.31075096130371, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.877164900302887, "num_tokens": 593714056.0, "step": 15565 }, { "epoch": 1.9801551965398803, "ewc_loss": 0.03142736107110977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.142736022709869e-05, "grad_norm": 18.303123474121094, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8597636222839355, "num_tokens": 593753516.0, "step": 15566 }, { "epoch": 1.9802824068184708, "ewc_loss": 0.03137794882059097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137794919894077e-05, "grad_norm": 18.34547233581543, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8733073472976685, "num_tokens": 593789276.0, "step": 15567 }, { "epoch": 1.9804096170970613, "ewc_loss": 0.031372904777526855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137290332233533e-05, "grad_norm": 18.326505661010742, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8751081824302673, "num_tokens": 593828112.0, "step": 15568 }, { "epoch": 1.9805368273756518, "ewc_loss": 0.03142447769641876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.142447894788347e-05, "grad_norm": 18.27437973022461, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8581117391586304, "num_tokens": 593866370.0, "step": 15569 }, { "epoch": 1.9806640376542424, "ewc_loss": 0.03137625381350517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1376253900816664e-05, "grad_norm": 18.25928497314453, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8747528791427612, "num_tokens": 593907863.0, "step": 15570 }, { "epoch": 1.980791247932833, "ewc_loss": 0.03144587203860283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.144587026326917e-05, "grad_norm": 18.31405258178711, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8819065093994141, "num_tokens": 593945254.0, "step": 15571 }, { "epoch": 1.9809184582114234, "ewc_loss": 0.03146706148982048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.146706148982048e-05, "grad_norm": 18.365764617919922, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8735705614089966, "num_tokens": 593987105.0, "step": 15572 }, { "epoch": 1.981045668490014, "ewc_loss": 0.03141317889094353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1413179385708645e-05, "grad_norm": 18.22500991821289, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.8818596601486206, "num_tokens": 594024091.0, "step": 15573 }, { "epoch": 1.9811728787686045, "ewc_loss": 0.03136201947927475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1362018489744514e-05, "grad_norm": 18.37139320373535, "learning_rate": 1e-06, "loss": 0.3756, "mean_token_accuracy": 0.8814246654510498, "num_tokens": 594056018.0, "step": 15574 }, { "epoch": 1.981300089047195, "ewc_loss": 0.031465791165828705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1465791835216805e-05, "grad_norm": 18.275697708129883, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8735077381134033, "num_tokens": 594102297.0, "step": 15575 }, { "epoch": 1.9814272993257855, "ewc_loss": 0.031342919915914536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.134291910100728e-05, "grad_norm": 18.403972625732422, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8720294237136841, "num_tokens": 594140278.0, "step": 15576 }, { "epoch": 1.981554509604376, "ewc_loss": 0.031456638127565384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.145663868053816e-05, "grad_norm": 18.299453735351562, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.874312162399292, "num_tokens": 594175025.0, "step": 15577 }, { "epoch": 1.9816817198829666, "ewc_loss": 0.03130423650145531, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1304236472351477e-05, "grad_norm": 18.321147918701172, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8694481253623962, "num_tokens": 594217906.0, "step": 15578 }, { "epoch": 1.9818089301615571, "ewc_loss": 0.0314394049346447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.143940557492897e-05, "grad_norm": 18.31513214111328, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8532708287239075, "num_tokens": 594264331.0, "step": 15579 }, { "epoch": 1.9819361404401477, "ewc_loss": 0.03138085827231407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.138085958198644e-05, "grad_norm": 18.351058959960938, "learning_rate": 1e-06, "loss": 0.3545, "mean_token_accuracy": 0.885793149471283, "num_tokens": 594300220.0, "step": 15580 }, { "epoch": 1.9820633507187382, "ewc_loss": 0.0313943512737751, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1394352845381945e-05, "grad_norm": 18.346115112304688, "learning_rate": 1e-06, "loss": 0.3844, "mean_token_accuracy": 0.8770087957382202, "num_tokens": 594337849.0, "step": 15581 }, { "epoch": 1.9821905609973287, "ewc_loss": 0.031355224549770355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.135522274533287e-05, "grad_norm": 18.348548889160156, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.864008903503418, "num_tokens": 594379295.0, "step": 15582 }, { "epoch": 1.9823177712759192, "ewc_loss": 0.03134358674287796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.134358848910779e-05, "grad_norm": 18.347734451293945, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8671412467956543, "num_tokens": 594408350.0, "step": 15583 }, { "epoch": 1.9824449815545098, "ewc_loss": 0.031364552676677704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.136455416097306e-05, "grad_norm": 18.35127830505371, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8599075078964233, "num_tokens": 594455110.0, "step": 15584 }, { "epoch": 1.9825721918331, "ewc_loss": 0.03132377192378044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.132377241854556e-05, "grad_norm": 18.33479881286621, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8744808435440063, "num_tokens": 594499231.0, "step": 15585 }, { "epoch": 1.9826994021116906, "ewc_loss": 0.03132922574877739, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.132922574877739e-05, "grad_norm": 18.29279327392578, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8526001572608948, "num_tokens": 594541478.0, "step": 15586 }, { "epoch": 1.9828266123902811, "ewc_loss": 0.031331077218055725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.13310774799902e-05, "grad_norm": 18.37605094909668, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8734815716743469, "num_tokens": 594579075.0, "step": 15587 }, { "epoch": 1.9829538226688717, "ewc_loss": 0.03137485310435295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1374853278975934e-05, "grad_norm": 18.37618637084961, "learning_rate": 1e-06, "loss": 0.3567, "mean_token_accuracy": 0.8872898817062378, "num_tokens": 594616189.0, "step": 15588 }, { "epoch": 1.9830810329474622, "ewc_loss": 0.03128184750676155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1281848350772634e-05, "grad_norm": 18.32330894470215, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8771470785140991, "num_tokens": 594656633.0, "step": 15589 }, { "epoch": 1.9832082432260527, "ewc_loss": 0.03128805384039879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.128805474261753e-05, "grad_norm": 18.341554641723633, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8528798818588257, "num_tokens": 594696833.0, "step": 15590 }, { "epoch": 1.983335453504643, "ewc_loss": 0.03137139603495598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.13713971991092e-05, "grad_norm": 18.3275146484375, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8585875630378723, "num_tokens": 594735138.0, "step": 15591 }, { "epoch": 1.9834626637832335, "ewc_loss": 0.03129865601658821, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.12986558128614e-05, "grad_norm": 18.361783981323242, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8646242618560791, "num_tokens": 594769587.0, "step": 15592 }, { "epoch": 1.983589874061824, "ewc_loss": 0.0313652902841568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.136528903269209e-05, "grad_norm": 18.314023971557617, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8758605718612671, "num_tokens": 594803635.0, "step": 15593 }, { "epoch": 1.9837170843404146, "ewc_loss": 0.0313127338886261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.131273479084484e-05, "grad_norm": 18.382244110107422, "learning_rate": 1e-06, "loss": 0.3532, "mean_token_accuracy": 0.8864811658859253, "num_tokens": 594839872.0, "step": 15594 }, { "epoch": 1.9838442946190051, "ewc_loss": 0.03137880563735962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1378804123960435e-05, "grad_norm": 18.395402908325195, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8583483695983887, "num_tokens": 594879208.0, "step": 15595 }, { "epoch": 1.9839715048975957, "ewc_loss": 0.031303610652685165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.130361073999666e-05, "grad_norm": 18.340084075927734, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.8774233460426331, "num_tokens": 594910377.0, "step": 15596 }, { "epoch": 1.9840987151761862, "ewc_loss": 0.03128018602728844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.128018579445779e-05, "grad_norm": 18.31216049194336, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.87325119972229, "num_tokens": 594956574.0, "step": 15597 }, { "epoch": 1.9842259254547767, "ewc_loss": 0.03131546825170517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.131546691292897e-05, "grad_norm": 18.319292068481445, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8552722334861755, "num_tokens": 594998036.0, "step": 15598 }, { "epoch": 1.9843531357333672, "ewc_loss": 0.03130451217293739, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1304512958740816e-05, "grad_norm": 18.368877410888672, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8657429218292236, "num_tokens": 595031404.0, "step": 15599 }, { "epoch": 1.9844803460119578, "ewc_loss": 0.03132559359073639, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.132559504592791e-05, "grad_norm": 18.273832321166992, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.879429817199707, "num_tokens": 595069800.0, "step": 15600 }, { "epoch": 1.9846075562905483, "ewc_loss": 0.03132212907075882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1322128052124754e-05, "grad_norm": 18.336715698242188, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8762527704238892, "num_tokens": 595110987.0, "step": 15601 }, { "epoch": 1.9847347665691388, "ewc_loss": 0.03139658272266388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.139658292639069e-05, "grad_norm": 18.290870666503906, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8739567995071411, "num_tokens": 595149192.0, "step": 15602 }, { "epoch": 1.9848619768477294, "ewc_loss": 0.03132203221321106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.132203346467577e-05, "grad_norm": 18.384241104125977, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8689590692520142, "num_tokens": 595182993.0, "step": 15603 }, { "epoch": 1.9849891871263199, "ewc_loss": 0.031397320330142975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.139732143608853e-05, "grad_norm": 18.280166625976562, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8679807186126709, "num_tokens": 595216177.0, "step": 15604 }, { "epoch": 1.9851163974049104, "ewc_loss": 0.03133483603596687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1334835512097925e-05, "grad_norm": 18.310476303100586, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8718335628509521, "num_tokens": 595253379.0, "step": 15605 }, { "epoch": 1.985243607683501, "ewc_loss": 0.031471025198698044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.147102688672021e-05, "grad_norm": 18.354515075683594, "learning_rate": 1e-06, "loss": 0.3893, "mean_token_accuracy": 0.87481689453125, "num_tokens": 595292878.0, "step": 15606 }, { "epoch": 1.9853708179620915, "ewc_loss": 0.03135037049651146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1350369681604207e-05, "grad_norm": 18.320701599121094, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8731129169464111, "num_tokens": 595329082.0, "step": 15607 }, { "epoch": 1.985498028240682, "ewc_loss": 0.031401779502630234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.140177796012722e-05, "grad_norm": 18.30401039123535, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8632888793945312, "num_tokens": 595367246.0, "step": 15608 }, { "epoch": 1.9856252385192723, "ewc_loss": 0.031357232481241226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.135723090963438e-05, "grad_norm": 18.302183151245117, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8772522211074829, "num_tokens": 595404450.0, "step": 15609 }, { "epoch": 1.9857524487978628, "ewc_loss": 0.0314045324921608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.140453191008419e-05, "grad_norm": 18.37897300720215, "learning_rate": 1e-06, "loss": 0.3624, "mean_token_accuracy": 0.8836503028869629, "num_tokens": 595439718.0, "step": 15610 }, { "epoch": 1.9858796590764534, "ewc_loss": 0.031349968165159225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1349969503935426e-05, "grad_norm": 18.288606643676758, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8733489513397217, "num_tokens": 595473617.0, "step": 15611 }, { "epoch": 1.986006869355044, "ewc_loss": 0.03137192130088806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137192106805742e-05, "grad_norm": 18.366239547729492, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8686013221740723, "num_tokens": 595516432.0, "step": 15612 }, { "epoch": 1.9861340796336344, "ewc_loss": 0.03140917420387268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.140917397104204e-05, "grad_norm": 18.401195526123047, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8721464276313782, "num_tokens": 595556278.0, "step": 15613 }, { "epoch": 1.986261289912225, "ewc_loss": 0.03132852911949158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1328530894825235e-05, "grad_norm": 18.23550033569336, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8768616318702698, "num_tokens": 595598713.0, "step": 15614 }, { "epoch": 1.9863885001908153, "ewc_loss": 0.03136802092194557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1368021154776216e-05, "grad_norm": 18.419286727905273, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8738311529159546, "num_tokens": 595630597.0, "step": 15615 }, { "epoch": 1.9865157104694058, "ewc_loss": 0.031381189823150635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.138119063805789e-05, "grad_norm": 18.225868225097656, "learning_rate": 1e-06, "loss": 0.3457, "mean_token_accuracy": 0.8888046741485596, "num_tokens": 595664163.0, "step": 15616 }, { "epoch": 1.9866429207479963, "ewc_loss": 0.031333938241004944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.133393693133257e-05, "grad_norm": 18.384695053100586, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.8803615570068359, "num_tokens": 595698809.0, "step": 15617 }, { "epoch": 1.9867701310265868, "ewc_loss": 0.031478382647037506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.147838287986815e-05, "grad_norm": 18.345138549804688, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8668926954269409, "num_tokens": 595741166.0, "step": 15618 }, { "epoch": 1.9868973413051774, "ewc_loss": 0.03131489455699921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1314895750256255e-05, "grad_norm": 18.332643508911133, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8663485050201416, "num_tokens": 595787079.0, "step": 15619 }, { "epoch": 1.987024551583768, "ewc_loss": 0.031440701335668564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1440700695384294e-05, "grad_norm": 18.41395378112793, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8467411994934082, "num_tokens": 595829085.0, "step": 15620 }, { "epoch": 1.9871517618623584, "ewc_loss": 0.03137444704771042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137444582534954e-05, "grad_norm": 18.319602966308594, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8696487545967102, "num_tokens": 595870646.0, "step": 15621 }, { "epoch": 1.987278972140949, "ewc_loss": 0.031309835612773895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.130983532173559e-05, "grad_norm": 18.359811782836914, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.860875129699707, "num_tokens": 595918473.0, "step": 15622 }, { "epoch": 1.9874061824195395, "ewc_loss": 0.03138245269656181, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.138245301670395e-05, "grad_norm": 18.36258316040039, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8613415956497192, "num_tokens": 595954097.0, "step": 15623 }, { "epoch": 1.98753339269813, "ewc_loss": 0.03124563954770565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.124563954770565e-05, "grad_norm": 18.30164337158203, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8530405759811401, "num_tokens": 595990191.0, "step": 15624 }, { "epoch": 1.9876606029767205, "ewc_loss": 0.031396158039569855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.139615728287026e-05, "grad_norm": 18.432119369506836, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8641703128814697, "num_tokens": 596027109.0, "step": 15625 }, { "epoch": 1.987787813255311, "ewc_loss": 0.03129476308822632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.129476317553781e-05, "grad_norm": 18.24147605895996, "learning_rate": 1e-06, "loss": 0.3694, "mean_token_accuracy": 0.878920316696167, "num_tokens": 596067930.0, "step": 15626 }, { "epoch": 1.9879150235339016, "ewc_loss": 0.03133013844490051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.133013888145797e-05, "grad_norm": 18.452903747558594, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8814620971679688, "num_tokens": 596105809.0, "step": 15627 }, { "epoch": 1.9880422338124921, "ewc_loss": 0.03139916807413101, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1399169529322535e-05, "grad_norm": 18.378280639648438, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8741129040718079, "num_tokens": 596144220.0, "step": 15628 }, { "epoch": 1.9881694440910826, "ewc_loss": 0.031248364597558975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.124836439383216e-05, "grad_norm": 18.33910369873047, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.849063515663147, "num_tokens": 596183825.0, "step": 15629 }, { "epoch": 1.9882966543696732, "ewc_loss": 0.03132686764001846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1326868338510394e-05, "grad_norm": 18.321266174316406, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8794468641281128, "num_tokens": 596232333.0, "step": 15630 }, { "epoch": 1.9884238646482637, "ewc_loss": 0.03125713765621185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.125713919871487e-05, "grad_norm": 18.391202926635742, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.8795602917671204, "num_tokens": 596272138.0, "step": 15631 }, { "epoch": 1.9885510749268542, "ewc_loss": 0.03131489455699921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1314895750256255e-05, "grad_norm": 18.316831588745117, "learning_rate": 1e-06, "loss": 0.3711, "mean_token_accuracy": 0.8808633685112, "num_tokens": 596310254.0, "step": 15632 }, { "epoch": 1.9886782852054448, "ewc_loss": 0.03125518560409546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.125518560409546e-05, "grad_norm": 18.428037643432617, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8684412240982056, "num_tokens": 596350396.0, "step": 15633 }, { "epoch": 1.988805495484035, "ewc_loss": 0.031359151005744934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1359151762444526e-05, "grad_norm": 18.354764938354492, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8747298717498779, "num_tokens": 596388809.0, "step": 15634 }, { "epoch": 1.9889327057626256, "ewc_loss": 0.031194347888231277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1194347684504464e-05, "grad_norm": 18.371335983276367, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.878996729850769, "num_tokens": 596427366.0, "step": 15635 }, { "epoch": 1.9890599160412161, "ewc_loss": 0.03130623698234558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.130623736069538e-05, "grad_norm": 18.31557846069336, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8617598414421082, "num_tokens": 596469999.0, "step": 15636 }, { "epoch": 1.9891871263198067, "ewc_loss": 0.03126487135887146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.126486990367994e-05, "grad_norm": 18.4836368560791, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8553732633590698, "num_tokens": 596506773.0, "step": 15637 }, { "epoch": 1.9893143365983972, "ewc_loss": 0.031286224722862244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1286224839277565e-05, "grad_norm": 18.299802780151367, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8719360828399658, "num_tokens": 596547440.0, "step": 15638 }, { "epoch": 1.9894415468769877, "ewc_loss": 0.031165679916739464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.116568041150458e-05, "grad_norm": 18.330997467041016, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8785641193389893, "num_tokens": 596580212.0, "step": 15639 }, { "epoch": 1.989568757155578, "ewc_loss": 0.03126341104507446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.126341107417829e-05, "grad_norm": 18.39152717590332, "learning_rate": 1e-06, "loss": 0.3424, "mean_token_accuracy": 0.8923311829566956, "num_tokens": 596611790.0, "step": 15640 }, { "epoch": 1.9896959674341685, "ewc_loss": 0.031238079071044922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1238079827744514e-05, "grad_norm": 18.36663818359375, "learning_rate": 1e-06, "loss": 0.342, "mean_token_accuracy": 0.8858421444892883, "num_tokens": 596646984.0, "step": 15641 }, { "epoch": 1.989823177712759, "ewc_loss": 0.0312384981662035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.123849819530733e-05, "grad_norm": 18.389116287231445, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8677138686180115, "num_tokens": 596688362.0, "step": 15642 }, { "epoch": 1.9899503879913496, "ewc_loss": 0.031248444691300392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.124844442936592e-05, "grad_norm": 18.324268341064453, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.873516857624054, "num_tokens": 596725298.0, "step": 15643 }, { "epoch": 1.9900775982699401, "ewc_loss": 0.031206993386149406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1206993298837915e-05, "grad_norm": 18.37841796875, "learning_rate": 1e-06, "loss": 0.3753, "mean_token_accuracy": 0.8784182071685791, "num_tokens": 596757400.0, "step": 15644 }, { "epoch": 1.9902048085485307, "ewc_loss": 0.03125914931297302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.125915100099519e-05, "grad_norm": 18.281265258789062, "learning_rate": 1e-06, "loss": 0.3604, "mean_token_accuracy": 0.883726954460144, "num_tokens": 596790533.0, "step": 15645 }, { "epoch": 1.9903320188271212, "ewc_loss": 0.031170431524515152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.117043161182664e-05, "grad_norm": 18.360939025878906, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8647794723510742, "num_tokens": 596834467.0, "step": 15646 }, { "epoch": 1.9904592291057117, "ewc_loss": 0.03134531527757645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1345316529041156e-05, "grad_norm": 18.347978591918945, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8587392568588257, "num_tokens": 596873262.0, "step": 15647 }, { "epoch": 1.9905864393843022, "ewc_loss": 0.031276341527700424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.12763404508587e-05, "grad_norm": 18.38796043395996, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8714256286621094, "num_tokens": 596912833.0, "step": 15648 }, { "epoch": 1.9907136496628928, "ewc_loss": 0.03129426762461662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1294268410420045e-05, "grad_norm": 18.255020141601562, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8796046376228333, "num_tokens": 596953554.0, "step": 15649 }, { "epoch": 1.9908408599414833, "ewc_loss": 0.03126421198248863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1264211429515854e-05, "grad_norm": 18.28305435180664, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.866401731967926, "num_tokens": 596991206.0, "step": 15650 }, { "epoch": 1.9909680702200738, "ewc_loss": 0.03137660771608353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137660678476095e-05, "grad_norm": 18.344818115234375, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8728952407836914, "num_tokens": 597027222.0, "step": 15651 }, { "epoch": 1.9910952804986644, "ewc_loss": 0.03131943196058273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.13194323098287e-05, "grad_norm": 18.26732635498047, "learning_rate": 1e-06, "loss": 0.3738, "mean_token_accuracy": 0.8806381821632385, "num_tokens": 597069040.0, "step": 15652 }, { "epoch": 1.9912224907772549, "ewc_loss": 0.03138866275548935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1388663046527654e-05, "grad_norm": 18.32108497619629, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.863821268081665, "num_tokens": 597112529.0, "step": 15653 }, { "epoch": 1.9913497010558454, "ewc_loss": 0.031388938426971436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.138893953291699e-05, "grad_norm": 18.324682235717773, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8737446665763855, "num_tokens": 597148304.0, "step": 15654 }, { "epoch": 1.991476911334436, "ewc_loss": 0.031328655779361725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1328654586104676e-05, "grad_norm": 18.361942291259766, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.880035400390625, "num_tokens": 597184244.0, "step": 15655 }, { "epoch": 1.9916041216130265, "ewc_loss": 0.03141288831830025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.141288834740408e-05, "grad_norm": 18.26006507873535, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8655698299407959, "num_tokens": 597218178.0, "step": 15656 }, { "epoch": 1.991731331891617, "ewc_loss": 0.031353678554296494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.135368024231866e-05, "grad_norm": 18.310413360595703, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8687443733215332, "num_tokens": 597259256.0, "step": 15657 }, { "epoch": 1.9918585421702073, "ewc_loss": 0.03145357221364975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1453571864403784e-05, "grad_norm": 18.36280059814453, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8813997507095337, "num_tokens": 597296584.0, "step": 15658 }, { "epoch": 1.9919857524487978, "ewc_loss": 0.031403325498104095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1403324101120234e-05, "grad_norm": 18.33161163330078, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8568439483642578, "num_tokens": 597341116.0, "step": 15659 }, { "epoch": 1.9921129627273884, "ewc_loss": 0.03137702867388725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1377028790302575e-05, "grad_norm": 18.347597122192383, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8688039779663086, "num_tokens": 597375595.0, "step": 15660 }, { "epoch": 1.9922401730059789, "ewc_loss": 0.031389541923999786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.138954343739897e-05, "grad_norm": 18.306726455688477, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8575550317764282, "num_tokens": 597419226.0, "step": 15661 }, { "epoch": 1.9923673832845694, "ewc_loss": 0.0313408300280571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.134083090117201e-05, "grad_norm": 18.267255783081055, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8514646291732788, "num_tokens": 597461242.0, "step": 15662 }, { "epoch": 1.99249459356316, "ewc_loss": 0.03142908215522766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.142908099107444e-05, "grad_norm": 18.421472549438477, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8703656196594238, "num_tokens": 597500030.0, "step": 15663 }, { "epoch": 1.9926218038417502, "ewc_loss": 0.0314517505466938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.145174923702143e-05, "grad_norm": 18.28536605834961, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8564407229423523, "num_tokens": 597540428.0, "step": 15664 }, { "epoch": 1.9927490141203408, "ewc_loss": 0.03137772157788277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137772000627592e-05, "grad_norm": 18.342805862426758, "learning_rate": 1e-06, "loss": 0.3589, "mean_token_accuracy": 0.8814968466758728, "num_tokens": 597575070.0, "step": 15665 }, { "epoch": 1.9928762243989313, "ewc_loss": 0.03140270337462425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.140270200674422e-05, "grad_norm": 18.31521224975586, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8691948652267456, "num_tokens": 597617425.0, "step": 15666 }, { "epoch": 1.9930034346775218, "ewc_loss": 0.031386569142341614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.138656757073477e-05, "grad_norm": 18.363431930541992, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8777533769607544, "num_tokens": 597660489.0, "step": 15667 }, { "epoch": 1.9931306449561124, "ewc_loss": 0.0314180850982666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1418083381140605e-05, "grad_norm": 18.308496475219727, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.8763909339904785, "num_tokens": 597703626.0, "step": 15668 }, { "epoch": 1.993257855234703, "ewc_loss": 0.031410086899995804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.141008710372262e-05, "grad_norm": 18.395572662353516, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8667577505111694, "num_tokens": 597741176.0, "step": 15669 }, { "epoch": 1.9933850655132934, "ewc_loss": 0.03142824396491051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.142824425594881e-05, "grad_norm": 18.36925506591797, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8586862087249756, "num_tokens": 597779763.0, "step": 15670 }, { "epoch": 1.993512275791884, "ewc_loss": 0.03132457658648491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1324576411861926e-05, "grad_norm": 18.293245315551758, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8736873865127563, "num_tokens": 597813300.0, "step": 15671 }, { "epoch": 1.9936394860704745, "ewc_loss": 0.031363699585199356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.136369923595339e-05, "grad_norm": 18.339941024780273, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8608731627464294, "num_tokens": 597849062.0, "step": 15672 }, { "epoch": 1.993766696349065, "ewc_loss": 0.03136587515473366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.136587474728003e-05, "grad_norm": 18.277917861938477, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8721182346343994, "num_tokens": 597886565.0, "step": 15673 }, { "epoch": 1.9938939066276555, "ewc_loss": 0.031432464718818665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.143246431136504e-05, "grad_norm": 18.48484230041504, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8733915090560913, "num_tokens": 597925097.0, "step": 15674 }, { "epoch": 1.994021116906246, "ewc_loss": 0.03142581880092621, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.142581772408448e-05, "grad_norm": 18.213727951049805, "learning_rate": 1e-06, "loss": 0.3684, "mean_token_accuracy": 0.8840594291687012, "num_tokens": 597960786.0, "step": 15675 }, { "epoch": 1.9941483271848366, "ewc_loss": 0.03129357472062111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.129357355646789e-05, "grad_norm": 18.342548370361328, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8527203798294067, "num_tokens": 598000442.0, "step": 15676 }, { "epoch": 1.9942755374634271, "ewc_loss": 0.031453900039196014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.145389928249642e-05, "grad_norm": 18.256853103637695, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8687586784362793, "num_tokens": 598042946.0, "step": 15677 }, { "epoch": 1.9944027477420176, "ewc_loss": 0.03136835992336273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1368359486805275e-05, "grad_norm": 18.351091384887695, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8509445190429688, "num_tokens": 598081659.0, "step": 15678 }, { "epoch": 1.9945299580206082, "ewc_loss": 0.03147576004266739, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.147575989714824e-05, "grad_norm": 18.39064598083496, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8656176328659058, "num_tokens": 598120203.0, "step": 15679 }, { "epoch": 1.9946571682991987, "ewc_loss": 0.031441979110240936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.144197762594558e-05, "grad_norm": 18.382043838500977, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8583998680114746, "num_tokens": 598154779.0, "step": 15680 }, { "epoch": 1.9947843785777892, "ewc_loss": 0.03144469112157822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1444691558135673e-05, "grad_norm": 18.376663208007812, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8703446388244629, "num_tokens": 598197075.0, "step": 15681 }, { "epoch": 1.9949115888563798, "ewc_loss": 0.03141586855053902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1415867852047086e-05, "grad_norm": 18.37687110900879, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8625848293304443, "num_tokens": 598240539.0, "step": 15682 }, { "epoch": 1.99503879913497, "ewc_loss": 0.03139834478497505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1398343708133325e-05, "grad_norm": 18.418489456176758, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8724051117897034, "num_tokens": 598281776.0, "step": 15683 }, { "epoch": 1.9951660094135606, "ewc_loss": 0.031412795186042786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.141279375995509e-05, "grad_norm": 18.31163787841797, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8773549795150757, "num_tokens": 598322715.0, "step": 15684 }, { "epoch": 1.9952932196921511, "ewc_loss": 0.031370993703603745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137099338346161e-05, "grad_norm": 18.382137298583984, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8649927973747253, "num_tokens": 598364495.0, "step": 15685 }, { "epoch": 1.9954204299707416, "ewc_loss": 0.031375814229249954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1375813705381006e-05, "grad_norm": 18.2479248046875, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8674721717834473, "num_tokens": 598407102.0, "step": 15686 }, { "epoch": 1.9955476402493322, "ewc_loss": 0.031338270753622055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.133826976409182e-05, "grad_norm": 18.37096405029297, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8576743006706238, "num_tokens": 598448600.0, "step": 15687 }, { "epoch": 1.9956748505279227, "ewc_loss": 0.03142924606800079, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.142924470012076e-05, "grad_norm": 18.367835998535156, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8603982329368591, "num_tokens": 598491140.0, "step": 15688 }, { "epoch": 1.995802060806513, "ewc_loss": 0.031362082809209824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.136208397336304e-05, "grad_norm": 18.41042137145996, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8672452569007874, "num_tokens": 598527831.0, "step": 15689 }, { "epoch": 1.9959292710851035, "ewc_loss": 0.03138527646660805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.138527608825825e-05, "grad_norm": 18.327985763549805, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8575483560562134, "num_tokens": 598568680.0, "step": 15690 }, { "epoch": 1.996056481363694, "ewc_loss": 0.03141486644744873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1414867407875136e-05, "grad_norm": 18.434459686279297, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8587331771850586, "num_tokens": 598606254.0, "step": 15691 }, { "epoch": 1.9961836916422846, "ewc_loss": 0.03137565031647682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137564999633469e-05, "grad_norm": 18.352397918701172, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8717495203018188, "num_tokens": 598642400.0, "step": 15692 }, { "epoch": 1.9963109019208751, "ewc_loss": 0.031310614198446274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.131061384920031e-05, "grad_norm": 18.381677627563477, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8484691381454468, "num_tokens": 598678379.0, "step": 15693 }, { "epoch": 1.9964381121994657, "ewc_loss": 0.03140610083937645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1406099878950045e-05, "grad_norm": 18.43095588684082, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8683797717094421, "num_tokens": 598718234.0, "step": 15694 }, { "epoch": 1.9965653224780562, "ewc_loss": 0.03134151175618172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.134151120320894e-05, "grad_norm": 18.371360778808594, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8731160163879395, "num_tokens": 598754897.0, "step": 15695 }, { "epoch": 1.9966925327566467, "ewc_loss": 0.03128119558095932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1281197152566165e-05, "grad_norm": 18.400699615478516, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8652309775352478, "num_tokens": 598785268.0, "step": 15696 }, { "epoch": 1.9968197430352372, "ewc_loss": 0.03133244439959526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1332445360021666e-05, "grad_norm": 18.36894989013672, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.875818133354187, "num_tokens": 598827131.0, "step": 15697 }, { "epoch": 1.9969469533138278, "ewc_loss": 0.03134451061487198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.134450889774598e-05, "grad_norm": 18.38732147216797, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8713029026985168, "num_tokens": 598859507.0, "step": 15698 }, { "epoch": 1.9970741635924183, "ewc_loss": 0.03134296089410782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.134295911877416e-05, "grad_norm": 18.37717056274414, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.860858142375946, "num_tokens": 598893280.0, "step": 15699 }, { "epoch": 1.9972013738710088, "ewc_loss": 0.031411174684762955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.141117485938594e-05, "grad_norm": 18.429079055786133, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.8832826614379883, "num_tokens": 598932759.0, "step": 15700 }, { "epoch": 1.9973285841495994, "ewc_loss": 0.031334709376096725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.133470818283968e-05, "grad_norm": 18.30463409423828, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8801224231719971, "num_tokens": 598972899.0, "step": 15701 }, { "epoch": 1.9974557944281899, "ewc_loss": 0.0313815213739872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.138152169412933e-05, "grad_norm": 18.39487648010254, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.871648371219635, "num_tokens": 599012811.0, "step": 15702 }, { "epoch": 1.9975830047067804, "ewc_loss": 0.03146425262093544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1464252970181406e-05, "grad_norm": 18.367408752441406, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8612799644470215, "num_tokens": 599055632.0, "step": 15703 }, { "epoch": 1.997710214985371, "ewc_loss": 0.031419530510902405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.141953129670583e-05, "grad_norm": 18.41556167602539, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8527755737304688, "num_tokens": 599090924.0, "step": 15704 }, { "epoch": 1.9978374252639615, "ewc_loss": 0.031439732760190964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.143973299302161e-05, "grad_norm": 18.33151626586914, "learning_rate": 1e-06, "loss": 0.3596, "mean_token_accuracy": 0.8833556771278381, "num_tokens": 599130434.0, "step": 15705 }, { "epoch": 1.997964635542552, "ewc_loss": 0.03138316422700882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.138316606055014e-05, "grad_norm": 18.40015411376953, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8694466352462769, "num_tokens": 599171018.0, "step": 15706 }, { "epoch": 1.9980918458211423, "ewc_loss": 0.03147947043180466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.147947063553147e-05, "grad_norm": 18.3714542388916, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8611419796943665, "num_tokens": 599212445.0, "step": 15707 }, { "epoch": 1.9982190560997328, "ewc_loss": 0.03140369430184364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.140369517495856e-05, "grad_norm": 18.41344451904297, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8749790191650391, "num_tokens": 599250591.0, "step": 15708 }, { "epoch": 1.9983462663783234, "ewc_loss": 0.03144184872508049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1441850296687335e-05, "grad_norm": 18.449115753173828, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8661748170852661, "num_tokens": 599284987.0, "step": 15709 }, { "epoch": 1.9984734766569139, "ewc_loss": 0.031342726200819016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.134272628813051e-05, "grad_norm": 18.36993980407715, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.866881251335144, "num_tokens": 599324656.0, "step": 15710 }, { "epoch": 1.9986006869355044, "ewc_loss": 0.031439077109098434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.143907815683633e-05, "grad_norm": 18.363649368286133, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8804815411567688, "num_tokens": 599364014.0, "step": 15711 }, { "epoch": 1.998727897214095, "ewc_loss": 0.031374845653772354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137484600301832e-05, "grad_norm": 18.38421630859375, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8619605302810669, "num_tokens": 599396781.0, "step": 15712 }, { "epoch": 1.9988551074926852, "ewc_loss": 0.031398043036460876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1398041755892336e-05, "grad_norm": 18.3726749420166, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.879361093044281, "num_tokens": 599433240.0, "step": 15713 }, { "epoch": 1.9989823177712758, "ewc_loss": 0.031425099819898605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1425101042259485e-05, "grad_norm": 18.415042877197266, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8798235654830933, "num_tokens": 599474883.0, "step": 15714 }, { "epoch": 1.9991095280498663, "ewc_loss": 0.03134710341691971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.134710277663544e-05, "grad_norm": 18.36754608154297, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.871286928653717, "num_tokens": 599508492.0, "step": 15715 }, { "epoch": 1.9992367383284568, "ewc_loss": 0.03139643371105194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.13964337692596e-05, "grad_norm": 18.376916885375977, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8722146153450012, "num_tokens": 599545946.0, "step": 15716 }, { "epoch": 1.9993639486070474, "ewc_loss": 0.03138049691915512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.138049578410573e-05, "grad_norm": 18.403379440307617, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8836584091186523, "num_tokens": 599586217.0, "step": 15717 }, { "epoch": 1.9994911588856379, "ewc_loss": 0.031376663595438004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1376664992421865e-05, "grad_norm": 18.373031616210938, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8775155544281006, "num_tokens": 599620431.0, "step": 15718 }, { "epoch": 1.9996183691642284, "ewc_loss": 0.03140329197049141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.140329135931097e-05, "grad_norm": 18.457746505737305, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8563541173934937, "num_tokens": 599659959.0, "step": 15719 }, { "epoch": 1.999745579442819, "ewc_loss": 0.031379278749227524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137927706120536e-05, "grad_norm": 18.328857421875, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8697731494903564, "num_tokens": 599696558.0, "step": 15720 }, { "epoch": 1.9998727897214095, "ewc_loss": 0.03134848177433014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1348481570603326e-05, "grad_norm": 18.497026443481445, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8594598770141602, "num_tokens": 599734925.0, "step": 15721 }, { "epoch": 2.0, "ewc_loss": 0.03144562989473343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.14456301566679e-05, "grad_norm": 18.410755157470703, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8745255470275879, "num_tokens": 599772613.0, "step": 15722 }, { "epoch": 2.0001272102785905, "ewc_loss": 0.03129349276423454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1293493520934135e-05, "grad_norm": 18.32694435119629, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8755834102630615, "num_tokens": 599813892.0, "step": 15723 }, { "epoch": 2.000254420557181, "ewc_loss": 0.0313982330262661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.139823456876911e-05, "grad_norm": 18.511505126953125, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.872164785861969, "num_tokens": 599852807.0, "step": 15724 }, { "epoch": 2.0003816308357716, "ewc_loss": 0.03138381242752075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.13838136207778e-05, "grad_norm": 18.425682067871094, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.8787054419517517, "num_tokens": 599893712.0, "step": 15725 }, { "epoch": 2.000508841114362, "ewc_loss": 0.03127307817339897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1273077183868736e-05, "grad_norm": 18.327871322631836, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8746060132980347, "num_tokens": 599929954.0, "step": 15726 }, { "epoch": 2.0006360513929526, "ewc_loss": 0.03140055015683174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.140055196126923e-05, "grad_norm": 18.559249877929688, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8705369234085083, "num_tokens": 599967763.0, "step": 15727 }, { "epoch": 2.000763261671543, "ewc_loss": 0.031396206468343735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.139620821457356e-05, "grad_norm": 18.370962142944336, "learning_rate": 1e-06, "loss": 0.3578, "mean_token_accuracy": 0.8811333775520325, "num_tokens": 600008938.0, "step": 15728 }, { "epoch": 2.0008904719501337, "ewc_loss": 0.031226031482219696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1226030841935426e-05, "grad_norm": 18.276336669921875, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8690841794013977, "num_tokens": 600051849.0, "step": 15729 }, { "epoch": 2.0010176822287242, "ewc_loss": 0.0313970148563385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1397015845868737e-05, "grad_norm": 18.494081497192383, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8787631988525391, "num_tokens": 600088568.0, "step": 15730 }, { "epoch": 2.0011448925073148, "ewc_loss": 0.031381528824567795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.138152897008695e-05, "grad_norm": 18.430593490600586, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8764686584472656, "num_tokens": 600131553.0, "step": 15731 }, { "epoch": 2.0012721027859053, "ewc_loss": 0.031247422099113464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1247422157321125e-05, "grad_norm": 18.396291732788086, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8731461763381958, "num_tokens": 600173198.0, "step": 15732 }, { "epoch": 2.001399313064496, "ewc_loss": 0.03134538605809212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.134538565063849e-05, "grad_norm": 18.4644775390625, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8776185512542725, "num_tokens": 600211347.0, "step": 15733 }, { "epoch": 2.0015265233430863, "ewc_loss": 0.03128563240170479, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.128563184873201e-05, "grad_norm": 18.357532501220703, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8665159940719604, "num_tokens": 600249953.0, "step": 15734 }, { "epoch": 2.0016537336216764, "ewc_loss": 0.03131866455078125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.13186646963004e-05, "grad_norm": 18.441926956176758, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8655624389648438, "num_tokens": 600291211.0, "step": 15735 }, { "epoch": 2.001780943900267, "ewc_loss": 0.03134648874402046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.134648795821704e-05, "grad_norm": 18.366188049316406, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.877653956413269, "num_tokens": 600332906.0, "step": 15736 }, { "epoch": 2.0019081541788575, "ewc_loss": 0.031274352222681046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.127435047645122e-05, "grad_norm": 18.410390853881836, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8726152181625366, "num_tokens": 600370570.0, "step": 15737 }, { "epoch": 2.002035364457448, "ewc_loss": 0.03132030367851257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.13203054247424e-05, "grad_norm": 18.403966903686523, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8644760847091675, "num_tokens": 600409368.0, "step": 15738 }, { "epoch": 2.0021625747360385, "ewc_loss": 0.03129006549715996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1290066544897854e-05, "grad_norm": 18.359106063842773, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8768364191055298, "num_tokens": 600442019.0, "step": 15739 }, { "epoch": 2.002289785014629, "ewc_loss": 0.03133760392665863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.133760401397012e-05, "grad_norm": 18.40140724182129, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.8814640045166016, "num_tokens": 600471665.0, "step": 15740 }, { "epoch": 2.0024169952932196, "ewc_loss": 0.031377892941236496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.137789462925866e-05, "grad_norm": 18.38007926940918, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8722652792930603, "num_tokens": 600508804.0, "step": 15741 }, { "epoch": 2.00254420557181, "ewc_loss": 0.03136281669139862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.136281520710327e-05, "grad_norm": 18.435514450073242, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8621839284896851, "num_tokens": 600543895.0, "step": 15742 }, { "epoch": 2.0026714158504006, "ewc_loss": 0.03143826872110367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1438270525541157e-05, "grad_norm": 18.398359298706055, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8746603727340698, "num_tokens": 600585818.0, "step": 15743 }, { "epoch": 2.002798626128991, "ewc_loss": 0.03136318176984787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1363182642962784e-05, "grad_norm": 18.378387451171875, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8777191638946533, "num_tokens": 600623106.0, "step": 15744 }, { "epoch": 2.0029258364075817, "ewc_loss": 0.03140294924378395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.14029493893031e-05, "grad_norm": 18.28812026977539, "learning_rate": 1e-06, "loss": 0.3264, "mean_token_accuracy": 0.8938525915145874, "num_tokens": 600662662.0, "step": 15745 }, { "epoch": 2.0030530466861722, "ewc_loss": 0.03142327442765236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1423274776898324e-05, "grad_norm": 18.44338035583496, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8597021698951721, "num_tokens": 600699416.0, "step": 15746 }, { "epoch": 2.0031802569647628, "ewc_loss": 0.03148892521858215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.14889257424511e-05, "grad_norm": 18.343666076660156, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8667404651641846, "num_tokens": 600739351.0, "step": 15747 }, { "epoch": 2.0033074672433533, "ewc_loss": 0.03144414722919464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.144414586131461e-05, "grad_norm": 18.403886795043945, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.875058114528656, "num_tokens": 600781198.0, "step": 15748 }, { "epoch": 2.003434677521944, "ewc_loss": 0.03154854476451874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.154854493914172e-05, "grad_norm": 18.39859390258789, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8721187114715576, "num_tokens": 600821743.0, "step": 15749 }, { "epoch": 2.0035618878005343, "ewc_loss": 0.03144720569252968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1447205401491374e-05, "grad_norm": 18.399660110473633, "learning_rate": 1e-06, "loss": 0.3485, "mean_token_accuracy": 0.8883326649665833, "num_tokens": 600862463.0, "step": 15750 }, { "epoch": 2.003689098079125, "ewc_loss": 0.03148065134882927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1480652978643775e-05, "grad_norm": 18.361265182495117, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8642836809158325, "num_tokens": 600902629.0, "step": 15751 }, { "epoch": 2.0038163083577154, "ewc_loss": 0.03148573264479637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1485731597058475e-05, "grad_norm": 18.40537452697754, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8717223405838013, "num_tokens": 600941778.0, "step": 15752 }, { "epoch": 2.003943518636306, "ewc_loss": 0.03154420852661133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.154420846840367e-05, "grad_norm": 18.411802291870117, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.8813593983650208, "num_tokens": 600979425.0, "step": 15753 }, { "epoch": 2.0040707289148965, "ewc_loss": 0.03147300332784653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.147300230921246e-05, "grad_norm": 18.45115089416504, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8686109185218811, "num_tokens": 601017882.0, "step": 15754 }, { "epoch": 2.004197939193487, "ewc_loss": 0.031482890248298645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1482890335610136e-05, "grad_norm": 18.419374465942383, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8677163124084473, "num_tokens": 601053394.0, "step": 15755 }, { "epoch": 2.0043251494720775, "ewc_loss": 0.031421512365341187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.142151399515569e-05, "grad_norm": 18.349382400512695, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8768117427825928, "num_tokens": 601094940.0, "step": 15756 }, { "epoch": 2.004452359750668, "ewc_loss": 0.03148327395319939, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.148327232338488e-05, "grad_norm": 18.474721908569336, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.874428391456604, "num_tokens": 601132963.0, "step": 15757 }, { "epoch": 2.0045795700292586, "ewc_loss": 0.03143355995416641, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.143355934298597e-05, "grad_norm": 18.371856689453125, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8731916546821594, "num_tokens": 601168511.0, "step": 15758 }, { "epoch": 2.0047067803078487, "ewc_loss": 0.03138548880815506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.138548709102906e-05, "grad_norm": 18.39626693725586, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8734105229377747, "num_tokens": 601208922.0, "step": 15759 }, { "epoch": 2.004833990586439, "ewc_loss": 0.03144135698676109, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.144135553156957e-05, "grad_norm": 18.384292602539062, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8681609630584717, "num_tokens": 601253814.0, "step": 15760 }, { "epoch": 2.0049612008650297, "ewc_loss": 0.03143980726599693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.143980575259775e-05, "grad_norm": 18.364274978637695, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8754075765609741, "num_tokens": 601295584.0, "step": 15761 }, { "epoch": 2.0050884111436202, "ewc_loss": 0.031411122530698776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.141112392768264e-05, "grad_norm": 18.303062438964844, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8657999038696289, "num_tokens": 601329134.0, "step": 15762 }, { "epoch": 2.0052156214222108, "ewc_loss": 0.03146630898118019, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.146630842820741e-05, "grad_norm": 18.417436599731445, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8816973567008972, "num_tokens": 601369100.0, "step": 15763 }, { "epoch": 2.0053428317008013, "ewc_loss": 0.031498659402132034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.149866097373888e-05, "grad_norm": 18.325057983398438, "learning_rate": 1e-06, "loss": 0.3476, "mean_token_accuracy": 0.887708306312561, "num_tokens": 601407876.0, "step": 15764 }, { "epoch": 2.005470041979392, "ewc_loss": 0.03148063272237778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.148063115077093e-05, "grad_norm": 18.378009796142578, "learning_rate": 1e-06, "loss": 0.3571, "mean_token_accuracy": 0.8861507177352905, "num_tokens": 601446996.0, "step": 15765 }, { "epoch": 2.0055972522579824, "ewc_loss": 0.03143833205103874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.143833237118088e-05, "grad_norm": 18.266681671142578, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8650439381599426, "num_tokens": 601482158.0, "step": 15766 }, { "epoch": 2.005724462536573, "ewc_loss": 0.03148748353123665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.148748510284349e-05, "grad_norm": 18.458362579345703, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.8788647651672363, "num_tokens": 601517940.0, "step": 15767 }, { "epoch": 2.0058516728151634, "ewc_loss": 0.03148987516760826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.148987525491975e-05, "grad_norm": 18.312807083129883, "learning_rate": 1e-06, "loss": 0.3265, "mean_token_accuracy": 0.8932909369468689, "num_tokens": 601553823.0, "step": 15768 }, { "epoch": 2.005978883093754, "ewc_loss": 0.0314750075340271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.147500683553517e-05, "grad_norm": 18.504098892211914, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8613361120223999, "num_tokens": 601595482.0, "step": 15769 }, { "epoch": 2.0061060933723445, "ewc_loss": 0.0315411239862442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1541123462375253e-05, "grad_norm": 18.31633758544922, "learning_rate": 1e-06, "loss": 0.3691, "mean_token_accuracy": 0.8811787366867065, "num_tokens": 601632451.0, "step": 15770 }, { "epoch": 2.006233303650935, "ewc_loss": 0.03140884265303612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.140884291497059e-05, "grad_norm": 18.43459701538086, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8711714744567871, "num_tokens": 601667573.0, "step": 15771 }, { "epoch": 2.0063605139295255, "ewc_loss": 0.03161625936627388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.161625863867812e-05, "grad_norm": 18.4481201171875, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8791388869285583, "num_tokens": 601704494.0, "step": 15772 }, { "epoch": 2.006487724208116, "ewc_loss": 0.03148078918457031, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.148078758385964e-05, "grad_norm": 18.414676666259766, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8641625046730042, "num_tokens": 601745307.0, "step": 15773 }, { "epoch": 2.0066149344867066, "ewc_loss": 0.03145574405789375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.145574373775162e-05, "grad_norm": 18.380029678344727, "learning_rate": 1e-06, "loss": 0.3473, "mean_token_accuracy": 0.8894211053848267, "num_tokens": 601782551.0, "step": 15774 }, { "epoch": 2.006742144765297, "ewc_loss": 0.0314597561955452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.145975642837584e-05, "grad_norm": 18.40484619140625, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8747515678405762, "num_tokens": 601823378.0, "step": 15775 }, { "epoch": 2.0068693550438876, "ewc_loss": 0.03151499852538109, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.151499913656153e-05, "grad_norm": 18.355361938476562, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8742925524711609, "num_tokens": 601860881.0, "step": 15776 }, { "epoch": 2.006996565322478, "ewc_loss": 0.031401269137859344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.140126864309423e-05, "grad_norm": 18.337034225463867, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8721097111701965, "num_tokens": 601903502.0, "step": 15777 }, { "epoch": 2.0071237756010687, "ewc_loss": 0.031572822481393814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1572821171721444e-05, "grad_norm": 18.488494873046875, "learning_rate": 1e-06, "loss": 0.3442, "mean_token_accuracy": 0.8913446664810181, "num_tokens": 601937097.0, "step": 15778 }, { "epoch": 2.007250985879659, "ewc_loss": 0.03155563771724701, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.155563899781555e-05, "grad_norm": 18.38352394104004, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8772189617156982, "num_tokens": 601978050.0, "step": 15779 }, { "epoch": 2.0073781961582498, "ewc_loss": 0.03148803487420082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1488034437643364e-05, "grad_norm": 18.382774353027344, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8722472190856934, "num_tokens": 602018167.0, "step": 15780 }, { "epoch": 2.0075054064368403, "ewc_loss": 0.03153187781572342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1531879358226433e-05, "grad_norm": 18.44246482849121, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8598565459251404, "num_tokens": 602054485.0, "step": 15781 }, { "epoch": 2.007632616715431, "ewc_loss": 0.03151629865169525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.151629789499566e-05, "grad_norm": 18.485700607299805, "learning_rate": 1e-06, "loss": 0.3811, "mean_token_accuracy": 0.8791517019271851, "num_tokens": 602092881.0, "step": 15782 }, { "epoch": 2.0077598269940213, "ewc_loss": 0.031502291560173035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1502291676588356e-05, "grad_norm": 18.43561363220215, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8751031160354614, "num_tokens": 602127424.0, "step": 15783 }, { "epoch": 2.0078870372726114, "ewc_loss": 0.03146407753229141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1464078347198665e-05, "grad_norm": 18.412918090820312, "learning_rate": 1e-06, "loss": 0.3694, "mean_token_accuracy": 0.8811877965927124, "num_tokens": 602168340.0, "step": 15784 }, { "epoch": 2.008014247551202, "ewc_loss": 0.0314759686589241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.147596726194024e-05, "grad_norm": 18.43718719482422, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8647240400314331, "num_tokens": 602207894.0, "step": 15785 }, { "epoch": 2.0081414578297925, "ewc_loss": 0.03145096078515053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.145095979562029e-05, "grad_norm": 18.311809539794922, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8762559294700623, "num_tokens": 602245924.0, "step": 15786 }, { "epoch": 2.008268668108383, "ewc_loss": 0.031474385410547256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1474384741159156e-05, "grad_norm": 18.50654411315918, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8722426891326904, "num_tokens": 602278417.0, "step": 15787 }, { "epoch": 2.0083958783869735, "ewc_loss": 0.03154788911342621, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1547890102956444e-05, "grad_norm": 18.339147567749023, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8775829672813416, "num_tokens": 602318431.0, "step": 15788 }, { "epoch": 2.008523088665564, "ewc_loss": 0.03143732249736786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1437321013072506e-05, "grad_norm": 18.37895393371582, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8649210929870605, "num_tokens": 602356315.0, "step": 15789 }, { "epoch": 2.0086502989441546, "ewc_loss": 0.0315389558672905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.153895522700623e-05, "grad_norm": 18.420392990112305, "learning_rate": 1e-06, "loss": 0.3428, "mean_token_accuracy": 0.8880144953727722, "num_tokens": 602398865.0, "step": 15790 }, { "epoch": 2.008777509222745, "ewc_loss": 0.03142675384879112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1426752684637904e-05, "grad_norm": 18.38105010986328, "learning_rate": 1e-06, "loss": 0.3603, "mean_token_accuracy": 0.8851721286773682, "num_tokens": 602435835.0, "step": 15791 }, { "epoch": 2.0089047195013356, "ewc_loss": 0.031493958085775375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.149395706714131e-05, "grad_norm": 18.43263816833496, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8817710280418396, "num_tokens": 602470359.0, "step": 15792 }, { "epoch": 2.009031929779926, "ewc_loss": 0.03145147114992142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.145147275063209e-05, "grad_norm": 18.369243621826172, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.8786777853965759, "num_tokens": 602512132.0, "step": 15793 }, { "epoch": 2.0091591400585167, "ewc_loss": 0.03150799497961998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1507996027357876e-05, "grad_norm": 18.416709899902344, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8716103434562683, "num_tokens": 602552700.0, "step": 15794 }, { "epoch": 2.0092863503371072, "ewc_loss": 0.031527504324913025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.15275028697215e-05, "grad_norm": 18.40435028076172, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8741592168807983, "num_tokens": 602587779.0, "step": 15795 }, { "epoch": 2.0094135606156978, "ewc_loss": 0.03146106004714966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.146105882478878e-05, "grad_norm": 18.412118911743164, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8642728328704834, "num_tokens": 602623834.0, "step": 15796 }, { "epoch": 2.0095407708942883, "ewc_loss": 0.031498368829488754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.149836993543431e-05, "grad_norm": 18.334369659423828, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.8844951391220093, "num_tokens": 602660141.0, "step": 15797 }, { "epoch": 2.009667981172879, "ewc_loss": 0.03150112181901932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.150112024741247e-05, "grad_norm": 18.386943817138672, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8779563903808594, "num_tokens": 602698990.0, "step": 15798 }, { "epoch": 2.0097951914514693, "ewc_loss": 0.031575094908475876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1575094908475876e-05, "grad_norm": 18.477069854736328, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.856509804725647, "num_tokens": 602732053.0, "step": 15799 }, { "epoch": 2.00992240173006, "ewc_loss": 0.03156397119164467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.156396996928379e-05, "grad_norm": 18.339622497558594, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8744298219680786, "num_tokens": 602770975.0, "step": 15800 }, { "epoch": 2.0100496120086504, "ewc_loss": 0.031511496752500534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.15114957629703e-05, "grad_norm": 18.46872329711914, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8653839826583862, "num_tokens": 602813207.0, "step": 15801 }, { "epoch": 2.010176822287241, "ewc_loss": 0.03161971643567085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1619714718544856e-05, "grad_norm": 18.39957618713379, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8801028728485107, "num_tokens": 602849951.0, "step": 15802 }, { "epoch": 2.0103040325658315, "ewc_loss": 0.03152036294341087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.152036151732318e-05, "grad_norm": 18.48109245300293, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8649978637695312, "num_tokens": 602891418.0, "step": 15803 }, { "epoch": 2.010431242844422, "ewc_loss": 0.03153536841273308, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1535368179902434e-05, "grad_norm": 18.34849739074707, "learning_rate": 1e-06, "loss": 0.3777, "mean_token_accuracy": 0.8801184892654419, "num_tokens": 602933039.0, "step": 15804 }, { "epoch": 2.0105584531230125, "ewc_loss": 0.03152022510766983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.152022691210732e-05, "grad_norm": 18.434185028076172, "learning_rate": 1e-06, "loss": 0.3509, "mean_token_accuracy": 0.8885551691055298, "num_tokens": 602965546.0, "step": 15805 }, { "epoch": 2.010685663401603, "ewc_loss": 0.03160501644015312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1605017284164205e-05, "grad_norm": 18.4249267578125, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8741692304611206, "num_tokens": 603004473.0, "step": 15806 }, { "epoch": 2.0108128736801936, "ewc_loss": 0.031529609113931656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.152960925945081e-05, "grad_norm": 18.378742218017578, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8670302629470825, "num_tokens": 603041644.0, "step": 15807 }, { "epoch": 2.0109400839587837, "ewc_loss": 0.031512558460235596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.151255805278197e-05, "grad_norm": 18.4361629486084, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8624557256698608, "num_tokens": 603084698.0, "step": 15808 }, { "epoch": 2.011067294237374, "ewc_loss": 0.03156723827123642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.156723687425256e-05, "grad_norm": 18.397979736328125, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8585251569747925, "num_tokens": 603121059.0, "step": 15809 }, { "epoch": 2.0111945045159647, "ewc_loss": 0.03147183731198311, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.147183815599419e-05, "grad_norm": 18.37738800048828, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8713741898536682, "num_tokens": 603156604.0, "step": 15810 }, { "epoch": 2.0113217147945552, "ewc_loss": 0.031590305268764496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1590305297868326e-05, "grad_norm": 18.438779830932617, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8736933469772339, "num_tokens": 603183743.0, "step": 15811 }, { "epoch": 2.0114489250731458, "ewc_loss": 0.03154450282454491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.154450314468704e-05, "grad_norm": 18.332040786743164, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8505836725234985, "num_tokens": 603222099.0, "step": 15812 }, { "epoch": 2.0115761353517363, "ewc_loss": 0.031559184193611145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1559186027152464e-05, "grad_norm": 18.415712356567383, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8552057147026062, "num_tokens": 603263467.0, "step": 15813 }, { "epoch": 2.011703345630327, "ewc_loss": 0.03157707676291466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.157707760692574e-05, "grad_norm": 18.37605094909668, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8549749851226807, "num_tokens": 603296726.0, "step": 15814 }, { "epoch": 2.0118305559089174, "ewc_loss": 0.03154489025473595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.154488877044059e-05, "grad_norm": 18.37739372253418, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8792408108711243, "num_tokens": 603329691.0, "step": 15815 }, { "epoch": 2.011957766187508, "ewc_loss": 0.03156907483935356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.156907405355014e-05, "grad_norm": 18.402982711791992, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.8813183307647705, "num_tokens": 603371619.0, "step": 15816 }, { "epoch": 2.0120849764660984, "ewc_loss": 0.03165120631456375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.165120506309904e-05, "grad_norm": 18.397489547729492, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8764009475708008, "num_tokens": 603400018.0, "step": 15817 }, { "epoch": 2.012212186744689, "ewc_loss": 0.03165169060230255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1651688914280385e-05, "grad_norm": 18.358753204345703, "learning_rate": 1e-06, "loss": 0.3776, "mean_token_accuracy": 0.8767493963241577, "num_tokens": 603436971.0, "step": 15818 }, { "epoch": 2.0123393970232795, "ewc_loss": 0.03163415566086769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.16341538564302e-05, "grad_norm": 18.326507568359375, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8745574355125427, "num_tokens": 603472802.0, "step": 15819 }, { "epoch": 2.01246660730187, "ewc_loss": 0.03170246630907059, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.170246782246977e-05, "grad_norm": 18.407447814941406, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8799856305122375, "num_tokens": 603506547.0, "step": 15820 }, { "epoch": 2.0125938175804605, "ewc_loss": 0.03167663887143135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.167663817293942e-05, "grad_norm": 18.35622215270996, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8573643565177917, "num_tokens": 603544967.0, "step": 15821 }, { "epoch": 2.012721027859051, "ewc_loss": 0.03169526532292366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.169526462443173e-05, "grad_norm": 18.38948631286621, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8684065341949463, "num_tokens": 603582522.0, "step": 15822 }, { "epoch": 2.0128482381376416, "ewc_loss": 0.03174498304724693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1744981242809445e-05, "grad_norm": 18.35837173461914, "learning_rate": 1e-06, "loss": 0.3611, "mean_token_accuracy": 0.882423996925354, "num_tokens": 603619584.0, "step": 15823 }, { "epoch": 2.012975448416232, "ewc_loss": 0.03169150650501251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1691506592324004e-05, "grad_norm": 18.371597290039062, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8621447682380676, "num_tokens": 603660542.0, "step": 15824 }, { "epoch": 2.0131026586948226, "ewc_loss": 0.031718865036964417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.171886419295333e-05, "grad_norm": 18.356985092163086, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.875008225440979, "num_tokens": 603698900.0, "step": 15825 }, { "epoch": 2.013229868973413, "ewc_loss": 0.03172796592116356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172796641592868e-05, "grad_norm": 18.46597671508789, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8562612533569336, "num_tokens": 603733019.0, "step": 15826 }, { "epoch": 2.0133570792520037, "ewc_loss": 0.031720634549856186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172063588863239e-05, "grad_norm": 18.344257354736328, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.8820318579673767, "num_tokens": 603772071.0, "step": 15827 }, { "epoch": 2.013484289530594, "ewc_loss": 0.03164073824882507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.164073859807104e-05, "grad_norm": 18.40749740600586, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8667585849761963, "num_tokens": 603811008.0, "step": 15828 }, { "epoch": 2.0136114998091847, "ewc_loss": 0.03171143680810928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.171143544022925e-05, "grad_norm": 18.338029861450195, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.864301323890686, "num_tokens": 603852647.0, "step": 15829 }, { "epoch": 2.0137387100877753, "ewc_loss": 0.031667206436395645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.166720489389263e-05, "grad_norm": 18.413694381713867, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8723239898681641, "num_tokens": 603894198.0, "step": 15830 }, { "epoch": 2.013865920366366, "ewc_loss": 0.03175069019198418, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.175068923155777e-05, "grad_norm": 18.39495086669922, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8703210353851318, "num_tokens": 603930168.0, "step": 15831 }, { "epoch": 2.0139931306449563, "ewc_loss": 0.0316191241145134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.161912536597811e-05, "grad_norm": 18.392162322998047, "learning_rate": 1e-06, "loss": 0.3584, "mean_token_accuracy": 0.8829930424690247, "num_tokens": 603966189.0, "step": 15832 }, { "epoch": 2.0141203409235464, "ewc_loss": 0.031672991812229156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1672992918174714e-05, "grad_norm": 18.358858108520508, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.869533896446228, "num_tokens": 604011110.0, "step": 15833 }, { "epoch": 2.014247551202137, "ewc_loss": 0.031653761863708496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.165376256220043e-05, "grad_norm": 18.492401123046875, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8569462299346924, "num_tokens": 604044256.0, "step": 15834 }, { "epoch": 2.0143747614807275, "ewc_loss": 0.031762588769197464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176258906023577e-05, "grad_norm": 18.39374351501465, "learning_rate": 1e-06, "loss": 0.3656, "mean_token_accuracy": 0.882748007774353, "num_tokens": 604078652.0, "step": 15835 }, { "epoch": 2.014501971759318, "ewc_loss": 0.031547822058200836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.154782098135911e-05, "grad_norm": 18.38230323791504, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8824659585952759, "num_tokens": 604112302.0, "step": 15836 }, { "epoch": 2.0146291820379085, "ewc_loss": 0.03172842785716057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172842843923718e-05, "grad_norm": 18.363492965698242, "learning_rate": 1e-06, "loss": 0.343, "mean_token_accuracy": 0.8890768885612488, "num_tokens": 604150011.0, "step": 15837 }, { "epoch": 2.014756392316499, "ewc_loss": 0.03159734234213829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.159734114888124e-05, "grad_norm": 18.41743278503418, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8740339279174805, "num_tokens": 604188859.0, "step": 15838 }, { "epoch": 2.0148836025950896, "ewc_loss": 0.031730227172374725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.173022560076788e-05, "grad_norm": 18.402809143066406, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8685884475708008, "num_tokens": 604232273.0, "step": 15839 }, { "epoch": 2.01501081287368, "ewc_loss": 0.031578656286001205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.157865648972802e-05, "grad_norm": 18.330690383911133, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8801840543746948, "num_tokens": 604265236.0, "step": 15840 }, { "epoch": 2.0151380231522706, "ewc_loss": 0.03172175586223602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172175638610497e-05, "grad_norm": 18.375333786010742, "learning_rate": 1e-06, "loss": 0.3518, "mean_token_accuracy": 0.8873649835586548, "num_tokens": 604304784.0, "step": 15841 }, { "epoch": 2.015265233430861, "ewc_loss": 0.03167980909347534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1679810490459204e-05, "grad_norm": 18.377565383911133, "learning_rate": 1e-06, "loss": 0.3176, "mean_token_accuracy": 0.8984434604644775, "num_tokens": 604341461.0, "step": 15842 }, { "epoch": 2.0153924437094517, "ewc_loss": 0.03168264403939247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.168264447594993e-05, "grad_norm": 18.387948989868164, "learning_rate": 1e-06, "loss": 0.3641, "mean_token_accuracy": 0.8797646760940552, "num_tokens": 604378628.0, "step": 15843 }, { "epoch": 2.0155196539880422, "ewc_loss": 0.03167550265789032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.167550312355161e-05, "grad_norm": 18.398174285888672, "learning_rate": 1e-06, "loss": 0.3618, "mean_token_accuracy": 0.8843536972999573, "num_tokens": 604418070.0, "step": 15844 }, { "epoch": 2.0156468642666328, "ewc_loss": 0.031666889786720276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1666888389736414e-05, "grad_norm": 18.42976951599121, "learning_rate": 1e-06, "loss": 0.3515, "mean_token_accuracy": 0.8862226009368896, "num_tokens": 604454445.0, "step": 15845 }, { "epoch": 2.0157740745452233, "ewc_loss": 0.0316334031522274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.163340443279594e-05, "grad_norm": 18.410512924194336, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.8777716159820557, "num_tokens": 604488559.0, "step": 15846 }, { "epoch": 2.015901284823814, "ewc_loss": 0.031624674797058105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.162467328365892e-05, "grad_norm": 18.404695510864258, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8673922419548035, "num_tokens": 604522538.0, "step": 15847 }, { "epoch": 2.0160284951024043, "ewc_loss": 0.03164273872971535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.164273948641494e-05, "grad_norm": 18.45207405090332, "learning_rate": 1e-06, "loss": 0.3258, "mean_token_accuracy": 0.8929696679115295, "num_tokens": 604561750.0, "step": 15848 }, { "epoch": 2.016155705380995, "ewc_loss": 0.03163990005850792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.16398982249666e-05, "grad_norm": 18.395992279052734, "learning_rate": 1e-06, "loss": 0.354, "mean_token_accuracy": 0.8869174122810364, "num_tokens": 604594961.0, "step": 15849 }, { "epoch": 2.0162829156595854, "ewc_loss": 0.031546883285045624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.154688238282688e-05, "grad_norm": 18.316856384277344, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8647326231002808, "num_tokens": 604631301.0, "step": 15850 }, { "epoch": 2.016410125938176, "ewc_loss": 0.03160626441240311, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.160626511089504e-05, "grad_norm": 18.399246215820312, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8546062111854553, "num_tokens": 604666049.0, "step": 15851 }, { "epoch": 2.0165373362167665, "ewc_loss": 0.031657781451940536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1657782528782263e-05, "grad_norm": 18.346662521362305, "learning_rate": 1e-06, "loss": 0.3301, "mean_token_accuracy": 0.8958268165588379, "num_tokens": 604701984.0, "step": 15852 }, { "epoch": 2.016664546495357, "ewc_loss": 0.03168938308954239, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.168938201270066e-05, "grad_norm": 18.45320701599121, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.882567286491394, "num_tokens": 604738414.0, "step": 15853 }, { "epoch": 2.0167917567739475, "ewc_loss": 0.031685274094343185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1685274734627455e-05, "grad_norm": 18.375574111938477, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8763478994369507, "num_tokens": 604779961.0, "step": 15854 }, { "epoch": 2.016918967052538, "ewc_loss": 0.031633712351322174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1633713660994545e-05, "grad_norm": 18.377151489257812, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8675650358200073, "num_tokens": 604819729.0, "step": 15855 }, { "epoch": 2.0170461773311286, "ewc_loss": 0.03171263262629509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1712632335256785e-05, "grad_norm": 18.440021514892578, "learning_rate": 1e-06, "loss": 0.3607, "mean_token_accuracy": 0.883857250213623, "num_tokens": 604864108.0, "step": 15856 }, { "epoch": 2.0171733876097186, "ewc_loss": 0.03165512531995773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1655126804253086e-05, "grad_norm": 18.398588180541992, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8683165311813354, "num_tokens": 604905582.0, "step": 15857 }, { "epoch": 2.017300597888309, "ewc_loss": 0.031658049672842026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.165804810123518e-05, "grad_norm": 18.438060760498047, "learning_rate": 1e-06, "loss": 0.3664, "mean_token_accuracy": 0.8819668292999268, "num_tokens": 604943931.0, "step": 15858 }, { "epoch": 2.0174278081668997, "ewc_loss": 0.031604133546352386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.160413325531408e-05, "grad_norm": 18.38223648071289, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8710432052612305, "num_tokens": 604982100.0, "step": 15859 }, { "epoch": 2.0175550184454902, "ewc_loss": 0.031637802720069885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1637802749173716e-05, "grad_norm": 18.415151596069336, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.862870991230011, "num_tokens": 605026289.0, "step": 15860 }, { "epoch": 2.0176822287240808, "ewc_loss": 0.031670499593019485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1670500902691856e-05, "grad_norm": 18.38759422302246, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8782542943954468, "num_tokens": 605070214.0, "step": 15861 }, { "epoch": 2.0178094390026713, "ewc_loss": 0.03159225732088089, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1592258892487735e-05, "grad_norm": 18.39054298400879, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8700302839279175, "num_tokens": 605113495.0, "step": 15862 }, { "epoch": 2.017936649281262, "ewc_loss": 0.03165939450263977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1659394153393805e-05, "grad_norm": 18.3983211517334, "learning_rate": 1e-06, "loss": 0.3724, "mean_token_accuracy": 0.8783619999885559, "num_tokens": 605145717.0, "step": 15863 }, { "epoch": 2.0180638595598523, "ewc_loss": 0.03166488930583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1664887501392514e-05, "grad_norm": 18.600440979003906, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8675829172134399, "num_tokens": 605180930.0, "step": 15864 }, { "epoch": 2.018191069838443, "ewc_loss": 0.031625304371118546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.162530265399255e-05, "grad_norm": 18.338788986206055, "learning_rate": 1e-06, "loss": 0.369, "mean_token_accuracy": 0.8804832696914673, "num_tokens": 605214004.0, "step": 15865 }, { "epoch": 2.0183182801170334, "ewc_loss": 0.031523559242486954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1523559300694615e-05, "grad_norm": 18.41837501525879, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8746578693389893, "num_tokens": 605253244.0, "step": 15866 }, { "epoch": 2.018445490395624, "ewc_loss": 0.031596507877111435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.159650805173442e-05, "grad_norm": 18.38312530517578, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8701645135879517, "num_tokens": 605293832.0, "step": 15867 }, { "epoch": 2.0185727006742145, "ewc_loss": 0.03159152716398239, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.159152765874751e-05, "grad_norm": 18.400020599365234, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8847622871398926, "num_tokens": 605333811.0, "step": 15868 }, { "epoch": 2.018699910952805, "ewc_loss": 0.03158580884337425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.158580875606276e-05, "grad_norm": 18.38371467590332, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8742496967315674, "num_tokens": 605371560.0, "step": 15869 }, { "epoch": 2.0188271212313955, "ewc_loss": 0.031654320657253265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.165431917295791e-05, "grad_norm": 18.389652252197266, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8716252446174622, "num_tokens": 605410555.0, "step": 15870 }, { "epoch": 2.018954331509986, "ewc_loss": 0.03155677393078804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.155677404720336e-05, "grad_norm": 18.339284896850586, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8684677481651306, "num_tokens": 605451851.0, "step": 15871 }, { "epoch": 2.0190815417885766, "ewc_loss": 0.031637392938137054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1637391657568514e-05, "grad_norm": 18.37766456604004, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8657932281494141, "num_tokens": 605490442.0, "step": 15872 }, { "epoch": 2.019208752067167, "ewc_loss": 0.03162992373108864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1629922887077555e-05, "grad_norm": 18.368499755859375, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8725835084915161, "num_tokens": 605525237.0, "step": 15873 }, { "epoch": 2.0193359623457576, "ewc_loss": 0.03165072202682495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.16507212119177e-05, "grad_norm": 18.38820457458496, "learning_rate": 1e-06, "loss": 0.3776, "mean_token_accuracy": 0.8788658380508423, "num_tokens": 605563624.0, "step": 15874 }, { "epoch": 2.019463172624348, "ewc_loss": 0.031658731400966644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1658732041250914e-05, "grad_norm": 18.404253005981445, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.8835840225219727, "num_tokens": 605597419.0, "step": 15875 }, { "epoch": 2.0195903829029387, "ewc_loss": 0.03170279785990715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1702798878541216e-05, "grad_norm": 18.431827545166016, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8678436279296875, "num_tokens": 605639954.0, "step": 15876 }, { "epoch": 2.019717593181529, "ewc_loss": 0.0316457599401474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1645759008824825e-05, "grad_norm": 18.338964462280273, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8742122650146484, "num_tokens": 605678226.0, "step": 15877 }, { "epoch": 2.0198448034601197, "ewc_loss": 0.031571343541145325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1571344152325764e-05, "grad_norm": 18.375154495239258, "learning_rate": 1e-06, "loss": 0.3596, "mean_token_accuracy": 0.8828209638595581, "num_tokens": 605719925.0, "step": 15878 }, { "epoch": 2.0199720137387103, "ewc_loss": 0.03170241788029671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.170241689076647e-05, "grad_norm": 18.455768585205078, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8556591868400574, "num_tokens": 605761282.0, "step": 15879 }, { "epoch": 2.020099224017301, "ewc_loss": 0.03159702196717262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.159702100674622e-05, "grad_norm": 18.294620513916016, "learning_rate": 1e-06, "loss": 0.3721, "mean_token_accuracy": 0.8832808136940002, "num_tokens": 605800681.0, "step": 15880 }, { "epoch": 2.0202264342958913, "ewc_loss": 0.03162013366818428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.162013308610767e-05, "grad_norm": 18.501747131347656, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8570740222930908, "num_tokens": 605836282.0, "step": 15881 }, { "epoch": 2.0203536445744814, "ewc_loss": 0.03171254321932793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1712545023765415e-05, "grad_norm": 18.398056030273438, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8760846853256226, "num_tokens": 605880360.0, "step": 15882 }, { "epoch": 2.020480854853072, "ewc_loss": 0.031581293791532516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.158129402436316e-05, "grad_norm": 18.37844467163086, "learning_rate": 1e-06, "loss": 0.3494, "mean_token_accuracy": 0.8841719627380371, "num_tokens": 605918363.0, "step": 15883 }, { "epoch": 2.0206080651316625, "ewc_loss": 0.03168941289186478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.168941111653112e-05, "grad_norm": 18.40334701538086, "learning_rate": 1e-06, "loss": 0.3461, "mean_token_accuracy": 0.8882632851600647, "num_tokens": 605953032.0, "step": 15884 }, { "epoch": 2.020735275410253, "ewc_loss": 0.03163980692625046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.163980727549642e-05, "grad_norm": 18.40675926208496, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8813740015029907, "num_tokens": 605992633.0, "step": 15885 }, { "epoch": 2.0208624856888435, "ewc_loss": 0.031620901077985764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.162090069963597e-05, "grad_norm": 18.40666961669922, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8763803243637085, "num_tokens": 606031776.0, "step": 15886 }, { "epoch": 2.020989695967434, "ewc_loss": 0.03164602816104889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.164602821925655e-05, "grad_norm": 18.508028030395508, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8680486679077148, "num_tokens": 606070614.0, "step": 15887 }, { "epoch": 2.0211169062460246, "ewc_loss": 0.031612567603588104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.161256609018892e-05, "grad_norm": 18.3973388671875, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8733187913894653, "num_tokens": 606115898.0, "step": 15888 }, { "epoch": 2.021244116524615, "ewc_loss": 0.031579602509737015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.157960236421786e-05, "grad_norm": 18.48963737487793, "learning_rate": 1e-06, "loss": 0.3763, "mean_token_accuracy": 0.8792604804039001, "num_tokens": 606150231.0, "step": 15889 }, { "epoch": 2.0213713268032056, "ewc_loss": 0.03167058154940605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.167058093822561e-05, "grad_norm": 18.409698486328125, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8659324049949646, "num_tokens": 606191527.0, "step": 15890 }, { "epoch": 2.021498537081796, "ewc_loss": 0.03160149231553078, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.160149208270013e-05, "grad_norm": 18.531089782714844, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8666349649429321, "num_tokens": 606229070.0, "step": 15891 }, { "epoch": 2.0216257473603867, "ewc_loss": 0.03160174563527107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.160174674121663e-05, "grad_norm": 18.397930145263672, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8756880164146423, "num_tokens": 606269555.0, "step": 15892 }, { "epoch": 2.021752957638977, "ewc_loss": 0.03150274232029915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1502742785960436e-05, "grad_norm": 18.45722198486328, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8696088790893555, "num_tokens": 606302040.0, "step": 15893 }, { "epoch": 2.0218801679175677, "ewc_loss": 0.03159485384821892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.159485277137719e-05, "grad_norm": 18.393966674804688, "learning_rate": 1e-06, "loss": 0.3707, "mean_token_accuracy": 0.8834625482559204, "num_tokens": 606341769.0, "step": 15894 }, { "epoch": 2.0220073781961583, "ewc_loss": 0.031522009521722794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1522009521722794e-05, "grad_norm": 18.364675521850586, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8611823916435242, "num_tokens": 606379989.0, "step": 15895 }, { "epoch": 2.022134588474749, "ewc_loss": 0.031576208770275116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1576208129990846e-05, "grad_norm": 18.39401626586914, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8686229586601257, "num_tokens": 606414396.0, "step": 15896 }, { "epoch": 2.0222617987533393, "ewc_loss": 0.031581610441207886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.158161052851938e-05, "grad_norm": 18.37684440612793, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8777910470962524, "num_tokens": 606445949.0, "step": 15897 }, { "epoch": 2.02238900903193, "ewc_loss": 0.03158196806907654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.158196705044247e-05, "grad_norm": 18.418548583984375, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8757230043411255, "num_tokens": 606484751.0, "step": 15898 }, { "epoch": 2.0225162193105204, "ewc_loss": 0.031649671494960785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.164966983604245e-05, "grad_norm": 18.364259719848633, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8773632049560547, "num_tokens": 606525390.0, "step": 15899 }, { "epoch": 2.022643429589111, "ewc_loss": 0.03159065544605255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.159065454383381e-05, "grad_norm": 18.411039352416992, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8706254959106445, "num_tokens": 606563364.0, "step": 15900 }, { "epoch": 2.0227706398677014, "ewc_loss": 0.031610164791345596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.161016502417624e-05, "grad_norm": 18.36809730529785, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8771141767501831, "num_tokens": 606598140.0, "step": 15901 }, { "epoch": 2.022897850146292, "ewc_loss": 0.03166420757770538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.166420719935559e-05, "grad_norm": 18.32095718383789, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8675426244735718, "num_tokens": 606636480.0, "step": 15902 }, { "epoch": 2.0230250604248825, "ewc_loss": 0.03168029338121414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.168029434164055e-05, "grad_norm": 18.48934555053711, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8788090944290161, "num_tokens": 606676044.0, "step": 15903 }, { "epoch": 2.023152270703473, "ewc_loss": 0.0317317470908165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1731746275909245e-05, "grad_norm": 18.42600440979004, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8679054975509644, "num_tokens": 606713262.0, "step": 15904 }, { "epoch": 2.0232794809820636, "ewc_loss": 0.03165457397699356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.165457383147441e-05, "grad_norm": 18.312368392944336, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8711999654769897, "num_tokens": 606754844.0, "step": 15905 }, { "epoch": 2.0234066912606536, "ewc_loss": 0.03166630491614342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.166630631312728e-05, "grad_norm": 18.35689353942871, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8749876022338867, "num_tokens": 606800910.0, "step": 15906 }, { "epoch": 2.023533901539244, "ewc_loss": 0.03174318000674248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1743180443299934e-05, "grad_norm": 18.37755584716797, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8690834641456604, "num_tokens": 606841855.0, "step": 15907 }, { "epoch": 2.0236611118178347, "ewc_loss": 0.031658709049224854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.165871021337807e-05, "grad_norm": 18.38767433166504, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8633180856704712, "num_tokens": 606888261.0, "step": 15908 }, { "epoch": 2.0237883220964252, "ewc_loss": 0.031671009957790375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.167101021972485e-05, "grad_norm": 18.356399536132812, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8654454946517944, "num_tokens": 606926235.0, "step": 15909 }, { "epoch": 2.0239155323750158, "ewc_loss": 0.03166598081588745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.166598253301345e-05, "grad_norm": 18.373905181884766, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8697576522827148, "num_tokens": 606962527.0, "step": 15910 }, { "epoch": 2.0240427426536063, "ewc_loss": 0.03177083656191826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1770836358191445e-05, "grad_norm": 18.448625564575195, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8786848187446594, "num_tokens": 607008924.0, "step": 15911 }, { "epoch": 2.024169952932197, "ewc_loss": 0.03165518864989281, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.165518864989281e-05, "grad_norm": 18.33096694946289, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8702906370162964, "num_tokens": 607049788.0, "step": 15912 }, { "epoch": 2.0242971632107873, "ewc_loss": 0.03163766860961914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.163766814395785e-05, "grad_norm": 18.4363956451416, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8677244186401367, "num_tokens": 607087592.0, "step": 15913 }, { "epoch": 2.024424373489378, "ewc_loss": 0.031675808131694794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.16758087137714e-05, "grad_norm": 18.38087272644043, "learning_rate": 1e-06, "loss": 0.3404, "mean_token_accuracy": 0.889051616191864, "num_tokens": 607119954.0, "step": 15914 }, { "epoch": 2.0245515837679684, "ewc_loss": 0.0316234827041626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1623483664589e-05, "grad_norm": 18.42816162109375, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8644487857818604, "num_tokens": 607156511.0, "step": 15915 }, { "epoch": 2.024678794046559, "ewc_loss": 0.031652193516492844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.165219459333457e-05, "grad_norm": 18.31987190246582, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.8828858733177185, "num_tokens": 607198292.0, "step": 15916 }, { "epoch": 2.0248060043251495, "ewc_loss": 0.031622596085071564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.162259599776007e-05, "grad_norm": 18.44921875, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8719492554664612, "num_tokens": 607237572.0, "step": 15917 }, { "epoch": 2.02493321460374, "ewc_loss": 0.03173980489373207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.173980439896695e-05, "grad_norm": 18.410680770874023, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.864871084690094, "num_tokens": 607282321.0, "step": 15918 }, { "epoch": 2.0250604248823305, "ewc_loss": 0.031587254256010056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1587253033649176e-05, "grad_norm": 18.3737850189209, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8723984956741333, "num_tokens": 607313355.0, "step": 15919 }, { "epoch": 2.025187635160921, "ewc_loss": 0.03166628256440163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.166628084727563e-05, "grad_norm": 18.40898895263672, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8714717626571655, "num_tokens": 607351961.0, "step": 15920 }, { "epoch": 2.0253148454395116, "ewc_loss": 0.03162078186869621, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1620780646335334e-05, "grad_norm": 18.37363624572754, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.876246452331543, "num_tokens": 607391368.0, "step": 15921 }, { "epoch": 2.025442055718102, "ewc_loss": 0.03164704144001007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.164703957736492e-05, "grad_norm": 18.450271606445312, "learning_rate": 1e-06, "loss": 0.3328, "mean_token_accuracy": 0.8930103778839111, "num_tokens": 607427328.0, "step": 15922 }, { "epoch": 2.0255692659966926, "ewc_loss": 0.03164668381214142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1646683055441827e-05, "grad_norm": 18.375486373901367, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8700764179229736, "num_tokens": 607468243.0, "step": 15923 }, { "epoch": 2.025696476275283, "ewc_loss": 0.03165903314948082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.16590339934919e-05, "grad_norm": 18.467580795288086, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8582977652549744, "num_tokens": 607502466.0, "step": 15924 }, { "epoch": 2.0258236865538737, "ewc_loss": 0.031696733087301254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1696734367869794e-05, "grad_norm": 18.48252296447754, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.864193320274353, "num_tokens": 607544093.0, "step": 15925 }, { "epoch": 2.025950896832464, "ewc_loss": 0.0316496342420578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.164963345625438e-05, "grad_norm": 18.37818145751953, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.88013756275177, "num_tokens": 607576054.0, "step": 15926 }, { "epoch": 2.0260781071110547, "ewc_loss": 0.031589243561029434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1589243008056656e-05, "grad_norm": 18.463394165039062, "learning_rate": 1e-06, "loss": 0.353, "mean_token_accuracy": 0.8869417309761047, "num_tokens": 607608462.0, "step": 15927 }, { "epoch": 2.0262053173896453, "ewc_loss": 0.031704727560281754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1704727007308975e-05, "grad_norm": 18.526622772216797, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8688234090805054, "num_tokens": 607645272.0, "step": 15928 }, { "epoch": 2.026332527668236, "ewc_loss": 0.03160206228494644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.160206324537285e-05, "grad_norm": 18.37374496459961, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8764376640319824, "num_tokens": 607682585.0, "step": 15929 }, { "epoch": 2.0264597379468263, "ewc_loss": 0.03166339546442032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1663395930081606e-05, "grad_norm": 18.506423950195312, "learning_rate": 1e-06, "loss": 0.3393, "mean_token_accuracy": 0.8907032012939453, "num_tokens": 607721302.0, "step": 15930 }, { "epoch": 2.0265869482254164, "ewc_loss": 0.03166043013334274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1660430977353826e-05, "grad_norm": 18.367311477661133, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.869218111038208, "num_tokens": 607758357.0, "step": 15931 }, { "epoch": 2.026714158504007, "ewc_loss": 0.03162169083952904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1621690141037107e-05, "grad_norm": 18.48925018310547, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8733134269714355, "num_tokens": 607795467.0, "step": 15932 }, { "epoch": 2.0268413687825975, "ewc_loss": 0.03161676228046417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.16167606797535e-05, "grad_norm": 18.42786407470703, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8690294027328491, "num_tokens": 607833192.0, "step": 15933 }, { "epoch": 2.026968579061188, "ewc_loss": 0.03162282705307007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1622828828403726e-05, "grad_norm": 18.43356704711914, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.8822578191757202, "num_tokens": 607866672.0, "step": 15934 }, { "epoch": 2.0270957893397785, "ewc_loss": 0.03169047087430954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.169046976836398e-05, "grad_norm": 18.592697143554688, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8751046657562256, "num_tokens": 607907976.0, "step": 15935 }, { "epoch": 2.027222999618369, "ewc_loss": 0.03163563087582588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.163563087582588e-05, "grad_norm": 18.32909393310547, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8774349093437195, "num_tokens": 607946009.0, "step": 15936 }, { "epoch": 2.0273502098969596, "ewc_loss": 0.031546998769044876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1546998798148707e-05, "grad_norm": 18.49955177307129, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.8793041706085205, "num_tokens": 607986800.0, "step": 15937 }, { "epoch": 2.02747742017555, "ewc_loss": 0.031657617539167404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1657618819735944e-05, "grad_norm": 18.430315017700195, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8690274953842163, "num_tokens": 608021807.0, "step": 15938 }, { "epoch": 2.0276046304541406, "ewc_loss": 0.031559817492961884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.15598190354649e-05, "grad_norm": 18.379358291625977, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8729085326194763, "num_tokens": 608060294.0, "step": 15939 }, { "epoch": 2.027731840732731, "ewc_loss": 0.03169830143451691, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.169830233673565e-05, "grad_norm": 18.45720100402832, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8739893436431885, "num_tokens": 608100790.0, "step": 15940 }, { "epoch": 2.0278590510113217, "ewc_loss": 0.03159309923648834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.159309926559217e-05, "grad_norm": 18.304410934448242, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.8695923089981079, "num_tokens": 608142164.0, "step": 15941 }, { "epoch": 2.027986261289912, "ewc_loss": 0.03161858022212982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.161857966915704e-05, "grad_norm": 18.374963760375977, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.875445544719696, "num_tokens": 608184175.0, "step": 15942 }, { "epoch": 2.0281134715685027, "ewc_loss": 0.03172993287444115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172993456246331e-05, "grad_norm": 18.400985717773438, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8681703209877014, "num_tokens": 608226801.0, "step": 15943 }, { "epoch": 2.0282406818470933, "ewc_loss": 0.03162272647023201, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.162272696499713e-05, "grad_norm": 18.41741371154785, "learning_rate": 1e-06, "loss": 0.3627, "mean_token_accuracy": 0.8824415802955627, "num_tokens": 608265155.0, "step": 15944 }, { "epoch": 2.028367892125684, "ewc_loss": 0.03164181858301163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1641819077776745e-05, "grad_norm": 18.371614456176758, "learning_rate": 1e-06, "loss": 0.3637, "mean_token_accuracy": 0.8840510845184326, "num_tokens": 608303817.0, "step": 15945 }, { "epoch": 2.0284951024042743, "ewc_loss": 0.03161456063389778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.161455970257521e-05, "grad_norm": 18.440568923950195, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.8806433081626892, "num_tokens": 608338845.0, "step": 15946 }, { "epoch": 2.028622312682865, "ewc_loss": 0.03165283799171448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1652838515583426e-05, "grad_norm": 18.333728790283203, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8744208812713623, "num_tokens": 608377661.0, "step": 15947 }, { "epoch": 2.0287495229614554, "ewc_loss": 0.03158687800168991, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1586878321832046e-05, "grad_norm": 18.424110412597656, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8733704686164856, "num_tokens": 608412897.0, "step": 15948 }, { "epoch": 2.028876733240046, "ewc_loss": 0.03171851485967636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.171851494698785e-05, "grad_norm": 18.37507438659668, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8660722970962524, "num_tokens": 608453584.0, "step": 15949 }, { "epoch": 2.0290039435186364, "ewc_loss": 0.0316370353102684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.163703513564542e-05, "grad_norm": 18.43272590637207, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8707383871078491, "num_tokens": 608492211.0, "step": 15950 }, { "epoch": 2.029131153797227, "ewc_loss": 0.03167407959699631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1674080673838034e-05, "grad_norm": 18.3885440826416, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8705803155899048, "num_tokens": 608534317.0, "step": 15951 }, { "epoch": 2.0292583640758175, "ewc_loss": 0.03166280314326286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.166280293953605e-05, "grad_norm": 18.46123695373535, "learning_rate": 1e-06, "loss": 0.3579, "mean_token_accuracy": 0.8836215138435364, "num_tokens": 608569763.0, "step": 15952 }, { "epoch": 2.029385574354408, "ewc_loss": 0.03163791447877884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1637915526516736e-05, "grad_norm": 18.353961944580078, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.877193808555603, "num_tokens": 608612145.0, "step": 15953 }, { "epoch": 2.0295127846329986, "ewc_loss": 0.03162410855293274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.162410939694382e-05, "grad_norm": 18.391267776489258, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.873305082321167, "num_tokens": 608649528.0, "step": 15954 }, { "epoch": 2.0296399949115886, "ewc_loss": 0.03166986256837845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1669864256400615e-05, "grad_norm": 18.440954208374023, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8679110407829285, "num_tokens": 608686973.0, "step": 15955 }, { "epoch": 2.029767205190179, "ewc_loss": 0.03168331831693649, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.168331750202924e-05, "grad_norm": 18.484539031982422, "learning_rate": 1e-06, "loss": 0.3873, "mean_token_accuracy": 0.8762792944908142, "num_tokens": 608729169.0, "step": 15956 }, { "epoch": 2.0298944154687697, "ewc_loss": 0.03170906379818916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.170906347804703e-05, "grad_norm": 18.550954818725586, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8732795715332031, "num_tokens": 608764502.0, "step": 15957 }, { "epoch": 2.0300216257473602, "ewc_loss": 0.03165845945477486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.165845919284038e-05, "grad_norm": 18.455432891845703, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.874904990196228, "num_tokens": 608805190.0, "step": 15958 }, { "epoch": 2.0301488360259508, "ewc_loss": 0.03159342333674431, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1593423045706004e-05, "grad_norm": 18.443513870239258, "learning_rate": 1e-06, "loss": 0.3659, "mean_token_accuracy": 0.8804258108139038, "num_tokens": 608840006.0, "step": 15959 }, { "epoch": 2.0302760463045413, "ewc_loss": 0.03163431957364082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.163432120345533e-05, "grad_norm": 18.441747665405273, "learning_rate": 1e-06, "loss": 0.367, "mean_token_accuracy": 0.8809535503387451, "num_tokens": 608871551.0, "step": 15960 }, { "epoch": 2.030403256583132, "ewc_loss": 0.031627919524908066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.162791836075485e-05, "grad_norm": 18.438940048217773, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.852608323097229, "num_tokens": 608909610.0, "step": 15961 }, { "epoch": 2.0305304668617223, "ewc_loss": 0.03164258599281311, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.164258669130504e-05, "grad_norm": 18.343164443969727, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8708736896514893, "num_tokens": 608950758.0, "step": 15962 }, { "epoch": 2.030657677140313, "ewc_loss": 0.031663671135902405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1663672416470945e-05, "grad_norm": 18.456972122192383, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8615450263023376, "num_tokens": 608990695.0, "step": 15963 }, { "epoch": 2.0307848874189034, "ewc_loss": 0.031700994819402695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17009944410529e-05, "grad_norm": 18.449880599975586, "learning_rate": 1e-06, "loss": 0.3489, "mean_token_accuracy": 0.8880053758621216, "num_tokens": 609030978.0, "step": 15964 }, { "epoch": 2.030912097697494, "ewc_loss": 0.031632717698812485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.16327168548014e-05, "grad_norm": 18.4328556060791, "learning_rate": 1e-06, "loss": 0.3625, "mean_token_accuracy": 0.8841031193733215, "num_tokens": 609071974.0, "step": 15965 }, { "epoch": 2.0310393079760845, "ewc_loss": 0.031660374253988266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.166037276969291e-05, "grad_norm": 18.44585609436035, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8770221471786499, "num_tokens": 609111268.0, "step": 15966 }, { "epoch": 2.031166518254675, "ewc_loss": 0.03163288161158562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.163288056384772e-05, "grad_norm": 18.375335693359375, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8698154091835022, "num_tokens": 609149681.0, "step": 15967 }, { "epoch": 2.0312937285332655, "ewc_loss": 0.03162858262658119, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1628584110876545e-05, "grad_norm": 18.488832473754883, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8795463442802429, "num_tokens": 609189121.0, "step": 15968 }, { "epoch": 2.031420938811856, "ewc_loss": 0.0316636823117733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1663683330407366e-05, "grad_norm": 18.37127685546875, "learning_rate": 1e-06, "loss": 0.3654, "mean_token_accuracy": 0.8830393552780151, "num_tokens": 609228070.0, "step": 15969 }, { "epoch": 2.0315481490904466, "ewc_loss": 0.031646907329559326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1646908610127866e-05, "grad_norm": 18.50895881652832, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8763964176177979, "num_tokens": 609269718.0, "step": 15970 }, { "epoch": 2.031675359369037, "ewc_loss": 0.03169861435890198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.169861520291306e-05, "grad_norm": 18.45115089416504, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8773936033248901, "num_tokens": 609307236.0, "step": 15971 }, { "epoch": 2.0318025696476276, "ewc_loss": 0.03160162642598152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1601626687915996e-05, "grad_norm": 18.549015045166016, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8569254875183105, "num_tokens": 609342993.0, "step": 15972 }, { "epoch": 2.031929779926218, "ewc_loss": 0.031669437885284424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1669438612880185e-05, "grad_norm": 18.44614028930664, "learning_rate": 1e-06, "loss": 0.365, "mean_token_accuracy": 0.8806816339492798, "num_tokens": 609376817.0, "step": 15973 }, { "epoch": 2.0320569902048087, "ewc_loss": 0.031604744493961334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.160474443575367e-05, "grad_norm": 18.50870132446289, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8731421232223511, "num_tokens": 609411209.0, "step": 15974 }, { "epoch": 2.032184200483399, "ewc_loss": 0.03166734427213669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.16673431370873e-05, "grad_norm": 18.509485244750977, "learning_rate": 1e-06, "loss": 0.336, "mean_token_accuracy": 0.8919042348861694, "num_tokens": 609449055.0, "step": 15975 }, { "epoch": 2.0323114107619897, "ewc_loss": 0.03158394992351532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.158394974889234e-05, "grad_norm": 18.464710235595703, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8746824264526367, "num_tokens": 609486332.0, "step": 15976 }, { "epoch": 2.0324386210405803, "ewc_loss": 0.031600091606378555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.16000914608594e-05, "grad_norm": 18.434234619140625, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8615944385528564, "num_tokens": 609526924.0, "step": 15977 }, { "epoch": 2.032565831319171, "ewc_loss": 0.031563349068164825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.156334787490778e-05, "grad_norm": 18.430095672607422, "learning_rate": 1e-06, "loss": 0.3459, "mean_token_accuracy": 0.8883547186851501, "num_tokens": 609559542.0, "step": 15978 }, { "epoch": 2.032693041597761, "ewc_loss": 0.03160226345062256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.160226333420724e-05, "grad_norm": 18.4909725189209, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8671335577964783, "num_tokens": 609598989.0, "step": 15979 }, { "epoch": 2.0328202518763514, "ewc_loss": 0.03159525245428085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.159525294904597e-05, "grad_norm": 18.461156845092773, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8624873161315918, "num_tokens": 609635790.0, "step": 15980 }, { "epoch": 2.032947462154942, "ewc_loss": 0.03161607310175896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.161607310175896e-05, "grad_norm": 18.51959800720215, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8660403490066528, "num_tokens": 609670120.0, "step": 15981 }, { "epoch": 2.0330746724335325, "ewc_loss": 0.03156200796365738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.156200909870677e-05, "grad_norm": 18.423599243164062, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.874457597732544, "num_tokens": 609713260.0, "step": 15982 }, { "epoch": 2.033201882712123, "ewc_loss": 0.03160136193037033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.160136111546308e-05, "grad_norm": 18.439193725585938, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8811390995979309, "num_tokens": 609753236.0, "step": 15983 }, { "epoch": 2.0333290929907135, "ewc_loss": 0.03155161067843437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.15516117552761e-05, "grad_norm": 18.423938751220703, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8771706223487854, "num_tokens": 609791629.0, "step": 15984 }, { "epoch": 2.033456303269304, "ewc_loss": 0.03157711774110794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.157711762469262e-05, "grad_norm": 18.458051681518555, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8588260412216187, "num_tokens": 609833457.0, "step": 15985 }, { "epoch": 2.0335835135478946, "ewc_loss": 0.0316317155957222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.163171641062945e-05, "grad_norm": 18.450639724731445, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8675739169120789, "num_tokens": 609866007.0, "step": 15986 }, { "epoch": 2.033710723826485, "ewc_loss": 0.03157300129532814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.15730030706618e-05, "grad_norm": 18.46309471130371, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8790791034698486, "num_tokens": 609903346.0, "step": 15987 }, { "epoch": 2.0338379341050756, "ewc_loss": 0.03163808584213257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.163808651152067e-05, "grad_norm": 18.42569351196289, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8789231777191162, "num_tokens": 609935885.0, "step": 15988 }, { "epoch": 2.033965144383666, "ewc_loss": 0.031575482338666916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.157548053422943e-05, "grad_norm": 18.442554473876953, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8683881163597107, "num_tokens": 609971127.0, "step": 15989 }, { "epoch": 2.0340923546622567, "ewc_loss": 0.031648650765419006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.164865120197646e-05, "grad_norm": 18.49512481689453, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8658309578895569, "num_tokens": 610016719.0, "step": 15990 }, { "epoch": 2.034219564940847, "ewc_loss": 0.03165313974022865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1653140467824414e-05, "grad_norm": 18.412919998168945, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8637927770614624, "num_tokens": 610057002.0, "step": 15991 }, { "epoch": 2.0343467752194377, "ewc_loss": 0.031560588628053665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1560590286972e-05, "grad_norm": 18.438291549682617, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8795781135559082, "num_tokens": 610092779.0, "step": 15992 }, { "epoch": 2.0344739854980283, "ewc_loss": 0.031647343188524246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.164734152960591e-05, "grad_norm": 18.436513900756836, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8650830984115601, "num_tokens": 610137092.0, "step": 15993 }, { "epoch": 2.034601195776619, "ewc_loss": 0.03164086118340492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.164086228935048e-05, "grad_norm": 18.457645416259766, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.8832138776779175, "num_tokens": 610178632.0, "step": 15994 }, { "epoch": 2.0347284060552093, "ewc_loss": 0.031626854091882706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.162685243296437e-05, "grad_norm": 18.49704933166504, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.873296856880188, "num_tokens": 610219789.0, "step": 15995 }, { "epoch": 2.0348556163338, "ewc_loss": 0.031599704176187515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.159970583510585e-05, "grad_norm": 18.452871322631836, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8602356910705566, "num_tokens": 610259104.0, "step": 15996 }, { "epoch": 2.0349828266123904, "ewc_loss": 0.031576383858919144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1576382752973586e-05, "grad_norm": 18.4683780670166, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8676168918609619, "num_tokens": 610296317.0, "step": 15997 }, { "epoch": 2.035110036890981, "ewc_loss": 0.031608838587999344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.160883716191165e-05, "grad_norm": 18.40578842163086, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.871719241142273, "num_tokens": 610339300.0, "step": 15998 }, { "epoch": 2.0352372471695714, "ewc_loss": 0.031567592173814774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.156759339617565e-05, "grad_norm": 18.466594696044922, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8701537847518921, "num_tokens": 610373917.0, "step": 15999 }, { "epoch": 2.035364457448162, "ewc_loss": 0.0316593237221241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.165932503179647e-05, "grad_norm": 18.42033576965332, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.87378990650177, "num_tokens": 610419571.0, "step": 16000 }, { "epoch": 2.0354916677267525, "ewc_loss": 0.031636349856853485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1636351195629686e-05, "grad_norm": 18.45816993713379, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8650391101837158, "num_tokens": 610458073.0, "step": 16001 }, { "epoch": 2.035618878005343, "ewc_loss": 0.031651683151721954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.165168163832277e-05, "grad_norm": 18.403799057006836, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8634803295135498, "num_tokens": 610497223.0, "step": 16002 }, { "epoch": 2.0357460882839336, "ewc_loss": 0.031598884612321854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.159888365189545e-05, "grad_norm": 18.4223575592041, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8537600040435791, "num_tokens": 610532563.0, "step": 16003 }, { "epoch": 2.0358732985625236, "ewc_loss": 0.031673263758420944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.167326212860644e-05, "grad_norm": 18.43097496032715, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.8842365741729736, "num_tokens": 610566644.0, "step": 16004 }, { "epoch": 2.036000508841114, "ewc_loss": 0.031649548560380936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1649549782741815e-05, "grad_norm": 18.492921829223633, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8616317510604858, "num_tokens": 610604755.0, "step": 16005 }, { "epoch": 2.0361277191197047, "ewc_loss": 0.031648386269807816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1648385629523546e-05, "grad_norm": 18.480958938598633, "learning_rate": 1e-06, "loss": 0.3799, "mean_token_accuracy": 0.8779323101043701, "num_tokens": 610645905.0, "step": 16006 }, { "epoch": 2.036254929398295, "ewc_loss": 0.031644146889448166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1644147384213284e-05, "grad_norm": 18.52410316467285, "learning_rate": 1e-06, "loss": 0.3426, "mean_token_accuracy": 0.8904819488525391, "num_tokens": 610686800.0, "step": 16007 }, { "epoch": 2.0363821396768857, "ewc_loss": 0.03162980079650879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1629799195798114e-05, "grad_norm": 18.389862060546875, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8737600445747375, "num_tokens": 610728291.0, "step": 16008 }, { "epoch": 2.0365093499554763, "ewc_loss": 0.031616777181625366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.161677886964753e-05, "grad_norm": 18.60822105407715, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8728548884391785, "num_tokens": 610759818.0, "step": 16009 }, { "epoch": 2.036636560234067, "ewc_loss": 0.031670961529016495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.167096292600036e-05, "grad_norm": 18.37958526611328, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8599671721458435, "num_tokens": 610800125.0, "step": 16010 }, { "epoch": 2.0367637705126573, "ewc_loss": 0.03156780079007149, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.156780076096766e-05, "grad_norm": 18.530641555786133, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8652145862579346, "num_tokens": 610839261.0, "step": 16011 }, { "epoch": 2.036890980791248, "ewc_loss": 0.031760264188051224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176026439177804e-05, "grad_norm": 18.546669006347656, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.878270149230957, "num_tokens": 610879092.0, "step": 16012 }, { "epoch": 2.0370181910698384, "ewc_loss": 0.031608935445547104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.160893538733944e-05, "grad_norm": 18.46554946899414, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8740237951278687, "num_tokens": 610914494.0, "step": 16013 }, { "epoch": 2.037145401348429, "ewc_loss": 0.031617421656847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.161742279189639e-05, "grad_norm": 18.479215621948242, "learning_rate": 1e-06, "loss": 0.3667, "mean_token_accuracy": 0.8839405179023743, "num_tokens": 610950784.0, "step": 16014 }, { "epoch": 2.0372726116270194, "ewc_loss": 0.03165128827095032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1651288736611605e-05, "grad_norm": 18.48046875, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8637343645095825, "num_tokens": 610989254.0, "step": 16015 }, { "epoch": 2.03739982190561, "ewc_loss": 0.031626734882593155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.162673601764254e-05, "grad_norm": 18.5433292388916, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8663369417190552, "num_tokens": 611022668.0, "step": 16016 }, { "epoch": 2.0375270321842005, "ewc_loss": 0.03164590895175934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1645908165955916e-05, "grad_norm": 18.439908981323242, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8728805184364319, "num_tokens": 611062110.0, "step": 16017 }, { "epoch": 2.037654242462791, "ewc_loss": 0.03162830322980881, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.16283039865084e-05, "grad_norm": 18.498668670654297, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8783231973648071, "num_tokens": 611098663.0, "step": 16018 }, { "epoch": 2.0377814527413816, "ewc_loss": 0.03170930966734886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.170931086060591e-05, "grad_norm": 18.43921661376953, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.869788646697998, "num_tokens": 611134058.0, "step": 16019 }, { "epoch": 2.037908663019972, "ewc_loss": 0.0315924733877182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.159247353323735e-05, "grad_norm": 18.494325637817383, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8745312690734863, "num_tokens": 611174748.0, "step": 16020 }, { "epoch": 2.0380358732985626, "ewc_loss": 0.03168042376637459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.16804253088776e-05, "grad_norm": 18.4915828704834, "learning_rate": 1e-06, "loss": 0.3463, "mean_token_accuracy": 0.8896794319152832, "num_tokens": 611216792.0, "step": 16021 }, { "epoch": 2.038163083577153, "ewc_loss": 0.03163086250424385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1630861485609785e-05, "grad_norm": 18.50345230102539, "learning_rate": 1e-06, "loss": 0.361, "mean_token_accuracy": 0.8830676078796387, "num_tokens": 611253996.0, "step": 16022 }, { "epoch": 2.0382902938557437, "ewc_loss": 0.031614113599061966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1614112231181934e-05, "grad_norm": 18.431018829345703, "learning_rate": 1e-06, "loss": 0.36, "mean_token_accuracy": 0.8846363425254822, "num_tokens": 611295351.0, "step": 16023 }, { "epoch": 2.038417504134334, "ewc_loss": 0.03161821886897087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.161821950925514e-05, "grad_norm": 18.507165908813477, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8741564154624939, "num_tokens": 611330513.0, "step": 16024 }, { "epoch": 2.0385447144129247, "ewc_loss": 0.03160205855965614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.160205960739404e-05, "grad_norm": 18.419782638549805, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.881946325302124, "num_tokens": 611369679.0, "step": 16025 }, { "epoch": 2.0386719246915153, "ewc_loss": 0.031599678099155426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1599676731275395e-05, "grad_norm": 18.543603897094727, "learning_rate": 1e-06, "loss": 0.3686, "mean_token_accuracy": 0.8817446231842041, "num_tokens": 611410398.0, "step": 16026 }, { "epoch": 2.038799134970106, "ewc_loss": 0.0316615104675293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.166151145705953e-05, "grad_norm": 18.461101531982422, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8672661781311035, "num_tokens": 611447151.0, "step": 16027 }, { "epoch": 2.0389263452486963, "ewc_loss": 0.03156435862183571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.156435923301615e-05, "grad_norm": 18.381715774536133, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8660586476325989, "num_tokens": 611486923.0, "step": 16028 }, { "epoch": 2.0390535555272864, "ewc_loss": 0.031600192189216614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1600193324266e-05, "grad_norm": 18.46288299560547, "learning_rate": 1e-06, "loss": 0.3757, "mean_token_accuracy": 0.8801616430282593, "num_tokens": 611519502.0, "step": 16029 }, { "epoch": 2.039180765805877, "ewc_loss": 0.031635019928216934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.163501969538629e-05, "grad_norm": 18.412940979003906, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8612036108970642, "num_tokens": 611557647.0, "step": 16030 }, { "epoch": 2.0393079760844675, "ewc_loss": 0.03162487968802452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1624880648450926e-05, "grad_norm": 18.485015869140625, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.872460126876831, "num_tokens": 611597405.0, "step": 16031 }, { "epoch": 2.039435186363058, "ewc_loss": 0.03172203153371811, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172203287249431e-05, "grad_norm": 18.46752166748047, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8685234189033508, "num_tokens": 611635996.0, "step": 16032 }, { "epoch": 2.0395623966416485, "ewc_loss": 0.031648073345422745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1648072763346136e-05, "grad_norm": 18.421104431152344, "learning_rate": 1e-06, "loss": 0.3712, "mean_token_accuracy": 0.8819313645362854, "num_tokens": 611677900.0, "step": 16033 }, { "epoch": 2.039689606920239, "ewc_loss": 0.03167067468166351, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1670675525674596e-05, "grad_norm": 18.46869659423828, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8761606216430664, "num_tokens": 611719184.0, "step": 16034 }, { "epoch": 2.0398168171988296, "ewc_loss": 0.031676895916461945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.167689646943472e-05, "grad_norm": 18.434593200683594, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8716052770614624, "num_tokens": 611761270.0, "step": 16035 }, { "epoch": 2.03994402747742, "ewc_loss": 0.031598303467035294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.159830521326512e-05, "grad_norm": 18.395938873291016, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8729109764099121, "num_tokens": 611794939.0, "step": 16036 }, { "epoch": 2.0400712377560106, "ewc_loss": 0.03165833279490471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1658331863582134e-05, "grad_norm": 18.424589157104492, "learning_rate": 1e-06, "loss": 0.3611, "mean_token_accuracy": 0.8848468065261841, "num_tokens": 611834093.0, "step": 16037 }, { "epoch": 2.040198448034601, "ewc_loss": 0.031728025525808334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172802462358959e-05, "grad_norm": 18.480112075805664, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8694765567779541, "num_tokens": 611871327.0, "step": 16038 }, { "epoch": 2.0403256583131917, "ewc_loss": 0.03170955553650856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1709554605185986e-05, "grad_norm": 18.433757781982422, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8669029474258423, "num_tokens": 611907837.0, "step": 16039 }, { "epoch": 2.040452868591782, "ewc_loss": 0.03164331614971161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.164331792504527e-05, "grad_norm": 18.442121505737305, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8568596243858337, "num_tokens": 611950964.0, "step": 16040 }, { "epoch": 2.0405800788703727, "ewc_loss": 0.0317346565425396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.173465665895492e-05, "grad_norm": 18.470029830932617, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8692858815193176, "num_tokens": 611985867.0, "step": 16041 }, { "epoch": 2.0407072891489633, "ewc_loss": 0.03167296573519707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1672967452323064e-05, "grad_norm": 18.4498291015625, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.877042293548584, "num_tokens": 612017819.0, "step": 16042 }, { "epoch": 2.040834499427554, "ewc_loss": 0.0316704697906971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.167046816088259e-05, "grad_norm": 18.376327514648438, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8590449690818787, "num_tokens": 612060669.0, "step": 16043 }, { "epoch": 2.0409617097061443, "ewc_loss": 0.03168840333819389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1688403396401554e-05, "grad_norm": 18.45880699157715, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8602553009986877, "num_tokens": 612099801.0, "step": 16044 }, { "epoch": 2.041088919984735, "ewc_loss": 0.03170917183160782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.170917261741124e-05, "grad_norm": 18.37734603881836, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.878075361251831, "num_tokens": 612139454.0, "step": 16045 }, { "epoch": 2.0412161302633254, "ewc_loss": 0.03176609054207802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176608879584819e-05, "grad_norm": 18.430809020996094, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8551963567733765, "num_tokens": 612181695.0, "step": 16046 }, { "epoch": 2.041343340541916, "ewc_loss": 0.03174725919961929, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1747258617542684e-05, "grad_norm": 18.383655548095703, "learning_rate": 1e-06, "loss": 0.3499, "mean_token_accuracy": 0.8879196047782898, "num_tokens": 612223200.0, "step": 16047 }, { "epoch": 2.0414705508205064, "ewc_loss": 0.031712424010038376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.171242497046478e-05, "grad_norm": 18.42128562927246, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8744882345199585, "num_tokens": 612265946.0, "step": 16048 }, { "epoch": 2.041597761099097, "ewc_loss": 0.03171517699956894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.171517528244294e-05, "grad_norm": 18.366352081298828, "learning_rate": 1e-06, "loss": 0.3483, "mean_token_accuracy": 0.8894307613372803, "num_tokens": 612301246.0, "step": 16049 }, { "epoch": 2.0417249713776875, "ewc_loss": 0.03176843374967575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1768435292178765e-05, "grad_norm": 18.54336166381836, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8590649366378784, "num_tokens": 612337789.0, "step": 16050 }, { "epoch": 2.041852181656278, "ewc_loss": 0.03176560252904892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176560130668804e-05, "grad_norm": 18.344507217407227, "learning_rate": 1e-06, "loss": 0.349, "mean_token_accuracy": 0.8863030672073364, "num_tokens": 612372391.0, "step": 16051 }, { "epoch": 2.0419793919348685, "ewc_loss": 0.03168487921357155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.168487819493748e-05, "grad_norm": 18.438392639160156, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8778753280639648, "num_tokens": 612408806.0, "step": 16052 }, { "epoch": 2.0421066022134586, "ewc_loss": 0.03177563473582268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1775634852238e-05, "grad_norm": 18.42038917541504, "learning_rate": 1e-06, "loss": 0.3575, "mean_token_accuracy": 0.8847032785415649, "num_tokens": 612446349.0, "step": 16053 }, { "epoch": 2.042233812492049, "ewc_loss": 0.031686924397945404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.168692273902707e-05, "grad_norm": 18.326229095458984, "learning_rate": 1e-06, "loss": 0.353, "mean_token_accuracy": 0.8868210315704346, "num_tokens": 612489899.0, "step": 16054 }, { "epoch": 2.0423610227706397, "ewc_loss": 0.03179718181490898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1797182600712404e-05, "grad_norm": 18.459299087524414, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8684297204017639, "num_tokens": 612532314.0, "step": 16055 }, { "epoch": 2.04248823304923, "ewc_loss": 0.031787075102329254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17870762955863e-05, "grad_norm": 18.495262145996094, "learning_rate": 1e-06, "loss": 0.3665, "mean_token_accuracy": 0.8810256719589233, "num_tokens": 612568033.0, "step": 16056 }, { "epoch": 2.0426154433278207, "ewc_loss": 0.031711578369140625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.171157732140273e-05, "grad_norm": 18.453018188476562, "learning_rate": 1e-06, "loss": 0.3584, "mean_token_accuracy": 0.8816507458686829, "num_tokens": 612605160.0, "step": 16057 }, { "epoch": 2.0427426536064113, "ewc_loss": 0.03174707293510437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.174707308062352e-05, "grad_norm": 18.470502853393555, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8620409369468689, "num_tokens": 612647851.0, "step": 16058 }, { "epoch": 2.042869863885002, "ewc_loss": 0.03172401711344719, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1724015570944175e-05, "grad_norm": 18.472660064697266, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8738065361976624, "num_tokens": 612684770.0, "step": 16059 }, { "epoch": 2.0429970741635923, "ewc_loss": 0.03172660619020462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1726605811854824e-05, "grad_norm": 18.453567504882812, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8628782033920288, "num_tokens": 612721689.0, "step": 16060 }, { "epoch": 2.043124284442183, "ewc_loss": 0.03172124922275543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1721250707050785e-05, "grad_norm": 18.49085235595703, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8727461695671082, "num_tokens": 612757256.0, "step": 16061 }, { "epoch": 2.0432514947207734, "ewc_loss": 0.031690020114183426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.16900186589919e-05, "grad_norm": 18.3763427734375, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8794499039649963, "num_tokens": 612797806.0, "step": 16062 }, { "epoch": 2.043378704999364, "ewc_loss": 0.031694162636995316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.169416231685318e-05, "grad_norm": 18.500967025756836, "learning_rate": 1e-06, "loss": 0.3515, "mean_token_accuracy": 0.88532555103302, "num_tokens": 612833507.0, "step": 16063 }, { "epoch": 2.0435059152779544, "ewc_loss": 0.03172226995229721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172226934111677e-05, "grad_norm": 18.48761558532715, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8795728087425232, "num_tokens": 612872920.0, "step": 16064 }, { "epoch": 2.043633125556545, "ewc_loss": 0.031684499233961105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1684499845141545e-05, "grad_norm": 18.455148696899414, "learning_rate": 1e-06, "loss": 0.3691, "mean_token_accuracy": 0.8799680471420288, "num_tokens": 612909534.0, "step": 16065 }, { "epoch": 2.0437603358351355, "ewc_loss": 0.031685721129179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.168572220602073e-05, "grad_norm": 18.43444061279297, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8701679110527039, "num_tokens": 612950531.0, "step": 16066 }, { "epoch": 2.043887546113726, "ewc_loss": 0.03167758882045746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.167758768540807e-05, "grad_norm": 18.46866798400879, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8760631084442139, "num_tokens": 612984638.0, "step": 16067 }, { "epoch": 2.0440147563923166, "ewc_loss": 0.031723033636808395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172303331666626e-05, "grad_norm": 18.46178436279297, "learning_rate": 1e-06, "loss": 0.358, "mean_token_accuracy": 0.8861371278762817, "num_tokens": 613020632.0, "step": 16068 }, { "epoch": 2.044141966670907, "ewc_loss": 0.031620003283023834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1620002118870616e-05, "grad_norm": 18.407880783081055, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.878631591796875, "num_tokens": 613057381.0, "step": 16069 }, { "epoch": 2.0442691769494976, "ewc_loss": 0.03169349953532219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.169350020471029e-05, "grad_norm": 18.481929779052734, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8732396960258484, "num_tokens": 613094287.0, "step": 16070 }, { "epoch": 2.044396387228088, "ewc_loss": 0.031694844365119934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.169484261889011e-05, "grad_norm": 18.418413162231445, "learning_rate": 1e-06, "loss": 0.3607, "mean_token_accuracy": 0.8809814453125, "num_tokens": 613126315.0, "step": 16071 }, { "epoch": 2.0445235975066787, "ewc_loss": 0.031692035496234894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.169203409925103e-05, "grad_norm": 18.514965057373047, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.870292603969574, "num_tokens": 613160970.0, "step": 16072 }, { "epoch": 2.044650807785269, "ewc_loss": 0.03169138729572296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.169138653902337e-05, "grad_norm": 18.42036247253418, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8761228322982788, "num_tokens": 613204685.0, "step": 16073 }, { "epoch": 2.0447780180638597, "ewc_loss": 0.03162235766649246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.162235952913761e-05, "grad_norm": 18.457382202148438, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.8822510838508606, "num_tokens": 613242394.0, "step": 16074 }, { "epoch": 2.0449052283424503, "ewc_loss": 0.03170637786388397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1706378649687394e-05, "grad_norm": 18.40630531311035, "learning_rate": 1e-06, "loss": 0.3652, "mean_token_accuracy": 0.8827776908874512, "num_tokens": 613283517.0, "step": 16075 }, { "epoch": 2.045032438621041, "ewc_loss": 0.031661707907915115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1661707907915115e-05, "grad_norm": 18.464984893798828, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.8804473280906677, "num_tokens": 613317237.0, "step": 16076 }, { "epoch": 2.045159648899631, "ewc_loss": 0.03170027956366539, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17002777592279e-05, "grad_norm": 18.361404418945312, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8773273825645447, "num_tokens": 613355741.0, "step": 16077 }, { "epoch": 2.0452868591782214, "ewc_loss": 0.03173714503645897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.173714503645897e-05, "grad_norm": 18.49637222290039, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8655345439910889, "num_tokens": 613392131.0, "step": 16078 }, { "epoch": 2.045414069456812, "ewc_loss": 0.03172745183110237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172745346091688e-05, "grad_norm": 18.433603286743164, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8652505874633789, "num_tokens": 613433687.0, "step": 16079 }, { "epoch": 2.0455412797354025, "ewc_loss": 0.03164096549153328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.164096415275708e-05, "grad_norm": 18.388874053955078, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.8780395984649658, "num_tokens": 613472882.0, "step": 16080 }, { "epoch": 2.045668490013993, "ewc_loss": 0.03175428509712219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.175428355461918e-05, "grad_norm": 18.468841552734375, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.879070520401001, "num_tokens": 613515869.0, "step": 16081 }, { "epoch": 2.0457957002925835, "ewc_loss": 0.031741272658109665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.174127414240502e-05, "grad_norm": 18.40077781677246, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.875179648399353, "num_tokens": 613551835.0, "step": 16082 }, { "epoch": 2.045922910571174, "ewc_loss": 0.03164573758840561, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.164573718095198e-05, "grad_norm": 18.406911849975586, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8716886043548584, "num_tokens": 613592277.0, "step": 16083 }, { "epoch": 2.0460501208497646, "ewc_loss": 0.031781964004039764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.178196493536234e-05, "grad_norm": 18.411813735961914, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.8745098114013672, "num_tokens": 613625594.0, "step": 16084 }, { "epoch": 2.046177331128355, "ewc_loss": 0.031686604022979736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1686602596892044e-05, "grad_norm": 18.396928787231445, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8771662712097168, "num_tokens": 613659924.0, "step": 16085 }, { "epoch": 2.0463045414069456, "ewc_loss": 0.031736597418785095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17365957016591e-05, "grad_norm": 18.473690032958984, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8576837778091431, "num_tokens": 613697067.0, "step": 16086 }, { "epoch": 2.046431751685536, "ewc_loss": 0.0318145751953125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.181457577738911e-05, "grad_norm": 18.417978286743164, "learning_rate": 1e-06, "loss": 0.3714, "mean_token_accuracy": 0.8810828924179077, "num_tokens": 613738211.0, "step": 16087 }, { "epoch": 2.0465589619641267, "ewc_loss": 0.031657591462135315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.165758971590549e-05, "grad_norm": 18.489561080932617, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.880123496055603, "num_tokens": 613782590.0, "step": 16088 }, { "epoch": 2.046686172242717, "ewc_loss": 0.03173968195915222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.173968070768751e-05, "grad_norm": 18.432435989379883, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.881922721862793, "num_tokens": 613825770.0, "step": 16089 }, { "epoch": 2.0468133825213077, "ewc_loss": 0.03171265497803688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.171265416312963e-05, "grad_norm": 18.431791305541992, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.8791005611419678, "num_tokens": 613864025.0, "step": 16090 }, { "epoch": 2.0469405927998983, "ewc_loss": 0.03170730546116829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17073063342832e-05, "grad_norm": 18.417530059814453, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.8773052096366882, "num_tokens": 613907181.0, "step": 16091 }, { "epoch": 2.047067803078489, "ewc_loss": 0.03172861412167549, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172861397615634e-05, "grad_norm": 18.47346305847168, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8657251000404358, "num_tokens": 613951258.0, "step": 16092 }, { "epoch": 2.0471950133570793, "ewc_loss": 0.03166966885328293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.166966780554503e-05, "grad_norm": 18.445024490356445, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8727943301200867, "num_tokens": 613991221.0, "step": 16093 }, { "epoch": 2.04732222363567, "ewc_loss": 0.03170780837535858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.170780837535858e-05, "grad_norm": 18.381654739379883, "learning_rate": 1e-06, "loss": 0.361, "mean_token_accuracy": 0.8844622373580933, "num_tokens": 614026862.0, "step": 16094 }, { "epoch": 2.0474494339142604, "ewc_loss": 0.031633827835321426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1633826438337564e-05, "grad_norm": 18.3729248046875, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8652933835983276, "num_tokens": 614067560.0, "step": 16095 }, { "epoch": 2.047576644192851, "ewc_loss": 0.03174501284956932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.174501398461871e-05, "grad_norm": 18.427200317382812, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.878399133682251, "num_tokens": 614106974.0, "step": 16096 }, { "epoch": 2.0477038544714414, "ewc_loss": 0.03169902041554451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.169901901856065e-05, "grad_norm": 18.32411003112793, "learning_rate": 1e-06, "loss": 0.3078, "mean_token_accuracy": 0.9003604054450989, "num_tokens": 614145641.0, "step": 16097 }, { "epoch": 2.047831064750032, "ewc_loss": 0.03170516714453697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.170516720274463e-05, "grad_norm": 18.43385124206543, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8781116604804993, "num_tokens": 614181476.0, "step": 16098 }, { "epoch": 2.0479582750286225, "ewc_loss": 0.03180362656712532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.180362546117976e-05, "grad_norm": 18.453550338745117, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8697805404663086, "num_tokens": 614219212.0, "step": 16099 }, { "epoch": 2.048085485307213, "ewc_loss": 0.03173692896962166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.173693039570935e-05, "grad_norm": 18.399240493774414, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8463886976242065, "num_tokens": 614264017.0, "step": 16100 }, { "epoch": 2.0482126955858035, "ewc_loss": 0.031684283167123795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.168428156641312e-05, "grad_norm": 18.298315048217773, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8599041700363159, "num_tokens": 614299031.0, "step": 16101 }, { "epoch": 2.0483399058643936, "ewc_loss": 0.03176320716738701, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1763207516632974e-05, "grad_norm": 18.494232177734375, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.8801355361938477, "num_tokens": 614334387.0, "step": 16102 }, { "epoch": 2.048467116142984, "ewc_loss": 0.0318114347755909, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.181143620167859e-05, "grad_norm": 18.41767692565918, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8758252263069153, "num_tokens": 614372438.0, "step": 16103 }, { "epoch": 2.0485943264215747, "ewc_loss": 0.03171556442975998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17155645461753e-05, "grad_norm": 18.42142105102539, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8701292872428894, "num_tokens": 614409976.0, "step": 16104 }, { "epoch": 2.048721536700165, "ewc_loss": 0.03177574276924133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177574399160221e-05, "grad_norm": 18.48748016357422, "learning_rate": 1e-06, "loss": 0.3597, "mean_token_accuracy": 0.883253812789917, "num_tokens": 614448740.0, "step": 16105 }, { "epoch": 2.0488487469787557, "ewc_loss": 0.031772859394550323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177285907440819e-05, "grad_norm": 18.474393844604492, "learning_rate": 1e-06, "loss": 0.3679, "mean_token_accuracy": 0.8788760304450989, "num_tokens": 614484644.0, "step": 16106 }, { "epoch": 2.0489759572573463, "ewc_loss": 0.03171036019921303, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.171035859850235e-05, "grad_norm": 18.36174774169922, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8557779788970947, "num_tokens": 614521210.0, "step": 16107 }, { "epoch": 2.049103167535937, "ewc_loss": 0.031717974692583084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17179765261244e-05, "grad_norm": 18.437082290649414, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8692076206207275, "num_tokens": 614561783.0, "step": 16108 }, { "epoch": 2.0492303778145273, "ewc_loss": 0.03178089112043381, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.178089173161425e-05, "grad_norm": 18.418676376342773, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8751572370529175, "num_tokens": 614598643.0, "step": 16109 }, { "epoch": 2.049357588093118, "ewc_loss": 0.03172136843204498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172136712237261e-05, "grad_norm": 18.390636444091797, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8743772506713867, "num_tokens": 614633520.0, "step": 16110 }, { "epoch": 2.0494847983717084, "ewc_loss": 0.031824368983507156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.18243692163378e-05, "grad_norm": 18.488805770874023, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.861556887626648, "num_tokens": 614673266.0, "step": 16111 }, { "epoch": 2.049612008650299, "ewc_loss": 0.03175174072384834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.175174060743302e-05, "grad_norm": 18.37896728515625, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8649115562438965, "num_tokens": 614707201.0, "step": 16112 }, { "epoch": 2.0497392189288894, "ewc_loss": 0.03177682310342789, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177682447130792e-05, "grad_norm": 18.518346786499023, "learning_rate": 1e-06, "loss": 0.3761, "mean_token_accuracy": 0.8794762492179871, "num_tokens": 614746542.0, "step": 16113 }, { "epoch": 2.04986642920748, "ewc_loss": 0.03187860548496246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.187860420439392e-05, "grad_norm": 18.526124954223633, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8642400503158569, "num_tokens": 614781804.0, "step": 16114 }, { "epoch": 2.0499936394860705, "ewc_loss": 0.031756699085235596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.175669917254709e-05, "grad_norm": 18.457279205322266, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8731651306152344, "num_tokens": 614817281.0, "step": 16115 }, { "epoch": 2.050120849764661, "ewc_loss": 0.03182513266801834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182513319188729e-05, "grad_norm": 18.426292419433594, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8653028011322021, "num_tokens": 614853666.0, "step": 16116 }, { "epoch": 2.0502480600432516, "ewc_loss": 0.031756192445755005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17561934934929e-05, "grad_norm": 18.48682975769043, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8751023411750793, "num_tokens": 614891221.0, "step": 16117 }, { "epoch": 2.050375270321842, "ewc_loss": 0.03185396268963814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.185396417393349e-05, "grad_norm": 18.440418243408203, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.8805962800979614, "num_tokens": 614925956.0, "step": 16118 }, { "epoch": 2.0505024806004326, "ewc_loss": 0.03174552693963051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.174552693963051e-05, "grad_norm": 18.384607315063477, "learning_rate": 1e-06, "loss": 0.3686, "mean_token_accuracy": 0.8803386688232422, "num_tokens": 614959934.0, "step": 16119 }, { "epoch": 2.050629690879023, "ewc_loss": 0.031828053295612335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1828054488869384e-05, "grad_norm": 18.454742431640625, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8708304166793823, "num_tokens": 615000579.0, "step": 16120 }, { "epoch": 2.0507569011576137, "ewc_loss": 0.03181573376059532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1815732654649764e-05, "grad_norm": 18.4204044342041, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8679227232933044, "num_tokens": 615037371.0, "step": 16121 }, { "epoch": 2.050884111436204, "ewc_loss": 0.03186215087771416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.186214962624945e-05, "grad_norm": 18.379718780517578, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8652629852294922, "num_tokens": 615075948.0, "step": 16122 }, { "epoch": 2.0510113217147947, "ewc_loss": 0.03185100108385086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.185099922120571e-05, "grad_norm": 18.483318328857422, "learning_rate": 1e-06, "loss": 0.3534, "mean_token_accuracy": 0.8866598010063171, "num_tokens": 615114619.0, "step": 16123 }, { "epoch": 2.0511385319933853, "ewc_loss": 0.03187871724367142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.187871698173694e-05, "grad_norm": 18.38338279724121, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8745880126953125, "num_tokens": 615149260.0, "step": 16124 }, { "epoch": 2.051265742271976, "ewc_loss": 0.03184179216623306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.184179149684496e-05, "grad_norm": 18.547264099121094, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8784928321838379, "num_tokens": 615183284.0, "step": 16125 }, { "epoch": 2.0513929525505663, "ewc_loss": 0.031912725418806076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1912724807625636e-05, "grad_norm": 18.479936599731445, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.879095733165741, "num_tokens": 615220015.0, "step": 16126 }, { "epoch": 2.0515201628291564, "ewc_loss": 0.03175817430019379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.175817255396396e-05, "grad_norm": 18.50580406188965, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8687726259231567, "num_tokens": 615255614.0, "step": 16127 }, { "epoch": 2.051647373107747, "ewc_loss": 0.03184567019343376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.184566958225332e-05, "grad_norm": 18.33313751220703, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8757306337356567, "num_tokens": 615290232.0, "step": 16128 }, { "epoch": 2.0517745833863374, "ewc_loss": 0.03179549425840378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179549457854591e-05, "grad_norm": 18.387880325317383, "learning_rate": 1e-06, "loss": 0.3714, "mean_token_accuracy": 0.880103349685669, "num_tokens": 615328204.0, "step": 16129 }, { "epoch": 2.051901793664928, "ewc_loss": 0.031891338527202606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1891337130218744e-05, "grad_norm": 18.468795776367188, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.865469753742218, "num_tokens": 615362846.0, "step": 16130 }, { "epoch": 2.0520290039435185, "ewc_loss": 0.03183074668049812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.183074659318663e-05, "grad_norm": 18.4521484375, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8736696839332581, "num_tokens": 615405404.0, "step": 16131 }, { "epoch": 2.052156214222109, "ewc_loss": 0.03183600679039955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1836007110541686e-05, "grad_norm": 18.408662796020508, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8633894324302673, "num_tokens": 615439835.0, "step": 16132 }, { "epoch": 2.0522834245006996, "ewc_loss": 0.03187441825866699, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.187441689078696e-05, "grad_norm": 18.508703231811523, "learning_rate": 1e-06, "loss": 0.3622, "mean_token_accuracy": 0.8851077556610107, "num_tokens": 615477832.0, "step": 16133 }, { "epoch": 2.05241063477929, "ewc_loss": 0.03187078982591629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.187078982591629e-05, "grad_norm": 18.449243545532227, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8665106296539307, "num_tokens": 615518089.0, "step": 16134 }, { "epoch": 2.0525378450578806, "ewc_loss": 0.03185701742768288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.185701643815264e-05, "grad_norm": 18.460336685180664, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8789523839950562, "num_tokens": 615557286.0, "step": 16135 }, { "epoch": 2.052665055336471, "ewc_loss": 0.03181082382798195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1810825021239e-05, "grad_norm": 18.436508178710938, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.877686083316803, "num_tokens": 615593284.0, "step": 16136 }, { "epoch": 2.0527922656150617, "ewc_loss": 0.0318656787276268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1865678465692326e-05, "grad_norm": 18.462890625, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8693863153457642, "num_tokens": 615633579.0, "step": 16137 }, { "epoch": 2.052919475893652, "ewc_loss": 0.031888604164123535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.188860500813462e-05, "grad_norm": 18.39394187927246, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.8789352178573608, "num_tokens": 615680135.0, "step": 16138 }, { "epoch": 2.0530466861722427, "ewc_loss": 0.031804174184799194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1804174795979634e-05, "grad_norm": 18.45203971862793, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8637603521347046, "num_tokens": 615714801.0, "step": 16139 }, { "epoch": 2.0531738964508333, "ewc_loss": 0.03188374638557434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1883744668448344e-05, "grad_norm": 18.519702911376953, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8666576743125916, "num_tokens": 615750538.0, "step": 16140 }, { "epoch": 2.053301106729424, "ewc_loss": 0.031843047589063644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.184304659953341e-05, "grad_norm": 18.384870529174805, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.879929780960083, "num_tokens": 615789465.0, "step": 16141 }, { "epoch": 2.0534283170080143, "ewc_loss": 0.03179018944501877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179019040544517e-05, "grad_norm": 18.534032821655273, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.860572874546051, "num_tokens": 615832553.0, "step": 16142 }, { "epoch": 2.053555527286605, "ewc_loss": 0.03186166286468506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.18616621370893e-05, "grad_norm": 18.389202117919922, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8704245090484619, "num_tokens": 615868341.0, "step": 16143 }, { "epoch": 2.0536827375651954, "ewc_loss": 0.03179911524057388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179911436745897e-05, "grad_norm": 18.548538208007812, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.8832775354385376, "num_tokens": 615904324.0, "step": 16144 }, { "epoch": 2.053809947843786, "ewc_loss": 0.0318705253303051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.187052425346337e-05, "grad_norm": 18.412612915039062, "learning_rate": 1e-06, "loss": 0.3685, "mean_token_accuracy": 0.8810285925865173, "num_tokens": 615947656.0, "step": 16145 }, { "epoch": 2.0539371581223764, "ewc_loss": 0.03174781799316406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1747818866278976e-05, "grad_norm": 18.479759216308594, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8686329126358032, "num_tokens": 615989050.0, "step": 16146 }, { "epoch": 2.054064368400967, "ewc_loss": 0.031875647604465485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.187564652762376e-05, "grad_norm": 18.49860954284668, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8789203763008118, "num_tokens": 616027638.0, "step": 16147 }, { "epoch": 2.0541915786795575, "ewc_loss": 0.031757503747940063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1757503165863454e-05, "grad_norm": 18.43017578125, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.8825893402099609, "num_tokens": 616066392.0, "step": 16148 }, { "epoch": 2.054318788958148, "ewc_loss": 0.03176872432231903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1768722692504525e-05, "grad_norm": 18.458633422851562, "learning_rate": 1e-06, "loss": 0.359, "mean_token_accuracy": 0.8841803073883057, "num_tokens": 616108303.0, "step": 16149 }, { "epoch": 2.0544459992367385, "ewc_loss": 0.03182309493422508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182309592375532e-05, "grad_norm": 18.495115280151367, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8662048578262329, "num_tokens": 616154174.0, "step": 16150 }, { "epoch": 2.0545732095153286, "ewc_loss": 0.03168726712465286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.168726834701374e-05, "grad_norm": 18.442554473876953, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8722387552261353, "num_tokens": 616193954.0, "step": 16151 }, { "epoch": 2.054700419793919, "ewc_loss": 0.03175978735089302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.175978781655431e-05, "grad_norm": 18.466495513916016, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8792072534561157, "num_tokens": 616232771.0, "step": 16152 }, { "epoch": 2.0548276300725097, "ewc_loss": 0.031758859753608704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17588601319585e-05, "grad_norm": 18.491207122802734, "learning_rate": 1e-06, "loss": 0.3651, "mean_token_accuracy": 0.8854815363883972, "num_tokens": 616265151.0, "step": 16153 }, { "epoch": 2.0549548403511, "ewc_loss": 0.03170649707317352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.170649870298803e-05, "grad_norm": 18.403512954711914, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8689547181129456, "num_tokens": 616303886.0, "step": 16154 }, { "epoch": 2.0550820506296907, "ewc_loss": 0.031780876219272614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.178087717969902e-05, "grad_norm": 18.536603927612305, "learning_rate": 1e-06, "loss": 0.3577, "mean_token_accuracy": 0.8834758996963501, "num_tokens": 616340040.0, "step": 16155 }, { "epoch": 2.0552092609082813, "ewc_loss": 0.03176966309547424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176966492901556e-05, "grad_norm": 18.489595413208008, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8797455430030823, "num_tokens": 616371329.0, "step": 16156 }, { "epoch": 2.055336471186872, "ewc_loss": 0.03172411024570465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172411015839316e-05, "grad_norm": 18.478515625, "learning_rate": 1e-06, "loss": 0.3514, "mean_token_accuracy": 0.8884837627410889, "num_tokens": 616406383.0, "step": 16157 }, { "epoch": 2.0554636814654623, "ewc_loss": 0.031680457293987274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1680458050686866e-05, "grad_norm": 18.412473678588867, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8620086908340454, "num_tokens": 616445600.0, "step": 16158 }, { "epoch": 2.055590891744053, "ewc_loss": 0.03173806518316269, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.173806544509716e-05, "grad_norm": 18.513689041137695, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8668901324272156, "num_tokens": 616485376.0, "step": 16159 }, { "epoch": 2.0557181020226434, "ewc_loss": 0.03177953138947487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1779531127540395e-05, "grad_norm": 18.521846771240234, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8654201030731201, "num_tokens": 616527104.0, "step": 16160 }, { "epoch": 2.055845312301234, "ewc_loss": 0.03168196603655815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.168196781189181e-05, "grad_norm": 18.34627342224121, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8761817216873169, "num_tokens": 616567557.0, "step": 16161 }, { "epoch": 2.0559725225798244, "ewc_loss": 0.03177705779671669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177705730195157e-05, "grad_norm": 18.51239776611328, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.8804216980934143, "num_tokens": 616601612.0, "step": 16162 }, { "epoch": 2.056099732858415, "ewc_loss": 0.03178001567721367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.178001497872174e-05, "grad_norm": 18.407114028930664, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.871802031993866, "num_tokens": 616638019.0, "step": 16163 }, { "epoch": 2.0562269431370055, "ewc_loss": 0.03176126256585121, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176126119797118e-05, "grad_norm": 18.4514217376709, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8719379901885986, "num_tokens": 616675224.0, "step": 16164 }, { "epoch": 2.056354153415596, "ewc_loss": 0.03181494027376175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.181493957526982e-05, "grad_norm": 18.542465209960938, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.87244713306427, "num_tokens": 616713628.0, "step": 16165 }, { "epoch": 2.0564813636941865, "ewc_loss": 0.03175260126590729, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17526028084103e-05, "grad_norm": 18.38078498840332, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8720576763153076, "num_tokens": 616755985.0, "step": 16166 }, { "epoch": 2.056608573972777, "ewc_loss": 0.031776025891304016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177602411597036e-05, "grad_norm": 18.489519119262695, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.874737560749054, "num_tokens": 616786564.0, "step": 16167 }, { "epoch": 2.0567357842513676, "ewc_loss": 0.031890153884887695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.189015478710644e-05, "grad_norm": 18.47919464111328, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8473743200302124, "num_tokens": 616823285.0, "step": 16168 }, { "epoch": 2.056862994529958, "ewc_loss": 0.03184063360095024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.184063461958431e-05, "grad_norm": 18.490081787109375, "learning_rate": 1e-06, "loss": 0.362, "mean_token_accuracy": 0.8823148012161255, "num_tokens": 616864465.0, "step": 16169 }, { "epoch": 2.0569902048085487, "ewc_loss": 0.03182271867990494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182271757395938e-05, "grad_norm": 18.491016387939453, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8654104471206665, "num_tokens": 616906670.0, "step": 16170 }, { "epoch": 2.057117415087139, "ewc_loss": 0.03181421011686325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.181420834152959e-05, "grad_norm": 18.46318817138672, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.872492790222168, "num_tokens": 616947656.0, "step": 16171 }, { "epoch": 2.0572446253657297, "ewc_loss": 0.03174798563122749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17479862133041e-05, "grad_norm": 18.460739135742188, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8676221966743469, "num_tokens": 616980161.0, "step": 16172 }, { "epoch": 2.0573718356443202, "ewc_loss": 0.03182871639728546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1828716601012275e-05, "grad_norm": 18.517627716064453, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.874322772026062, "num_tokens": 617015281.0, "step": 16173 }, { "epoch": 2.0574990459229108, "ewc_loss": 0.031798575073480606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179857594659552e-05, "grad_norm": 18.407251358032227, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8761197328567505, "num_tokens": 617054174.0, "step": 16174 }, { "epoch": 2.057626256201501, "ewc_loss": 0.03182227537035942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1822273740544915e-05, "grad_norm": 18.500234603881836, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8729163408279419, "num_tokens": 617095160.0, "step": 16175 }, { "epoch": 2.0577534664800914, "ewc_loss": 0.03183852508664131, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1838524591876194e-05, "grad_norm": 18.428667068481445, "learning_rate": 1e-06, "loss": 0.3763, "mean_token_accuracy": 0.8788526058197021, "num_tokens": 617130781.0, "step": 16176 }, { "epoch": 2.057880676758682, "ewc_loss": 0.031844064593315125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1844065233599395e-05, "grad_norm": 18.517290115356445, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8724069595336914, "num_tokens": 617169473.0, "step": 16177 }, { "epoch": 2.0580078870372724, "ewc_loss": 0.031857941299676895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.185794048476964e-05, "grad_norm": 18.481786727905273, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8678987622261047, "num_tokens": 617212208.0, "step": 16178 }, { "epoch": 2.058135097315863, "ewc_loss": 0.031796328723430634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179632767569274e-05, "grad_norm": 18.416887283325195, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8765265941619873, "num_tokens": 617246055.0, "step": 16179 }, { "epoch": 2.0582623075944535, "ewc_loss": 0.03181610628962517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1816107366466895e-05, "grad_norm": 18.463918685913086, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.869897723197937, "num_tokens": 617285614.0, "step": 16180 }, { "epoch": 2.058389517873044, "ewc_loss": 0.03183980658650398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.183980516041629e-05, "grad_norm": 18.427532196044922, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.8818594217300415, "num_tokens": 617323401.0, "step": 16181 }, { "epoch": 2.0585167281516346, "ewc_loss": 0.03181317076086998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.181317151756957e-05, "grad_norm": 18.446779251098633, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.859502375125885, "num_tokens": 617372550.0, "step": 16182 }, { "epoch": 2.058643938430225, "ewc_loss": 0.03183881938457489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.183881926815957e-05, "grad_norm": 18.482009887695312, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8803451061248779, "num_tokens": 617407894.0, "step": 16183 }, { "epoch": 2.0587711487088156, "ewc_loss": 0.031773991882801056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1773990485817194e-05, "grad_norm": 18.457242965698242, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8665187358856201, "num_tokens": 617450117.0, "step": 16184 }, { "epoch": 2.058898358987406, "ewc_loss": 0.03181908652186394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.18190868711099e-05, "grad_norm": 18.586118698120117, "learning_rate": 1e-06, "loss": 0.3712, "mean_token_accuracy": 0.8799638748168945, "num_tokens": 617485885.0, "step": 16185 }, { "epoch": 2.0590255692659967, "ewc_loss": 0.03176205977797508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176206155330874e-05, "grad_norm": 18.47247886657715, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8577499389648438, "num_tokens": 617520511.0, "step": 16186 }, { "epoch": 2.059152779544587, "ewc_loss": 0.03173346817493439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1733467039885e-05, "grad_norm": 18.498092651367188, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8679130673408508, "num_tokens": 617558288.0, "step": 16187 }, { "epoch": 2.0592799898231777, "ewc_loss": 0.03182319924235344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1823197787161916e-05, "grad_norm": 18.43801498413086, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8732137084007263, "num_tokens": 617602991.0, "step": 16188 }, { "epoch": 2.0594072001017683, "ewc_loss": 0.03176955506205559, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176955578965135e-05, "grad_norm": 18.53044891357422, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8705809712409973, "num_tokens": 617639629.0, "step": 16189 }, { "epoch": 2.059534410380359, "ewc_loss": 0.03181532770395279, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.181532883900218e-05, "grad_norm": 18.501096725463867, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8670283555984497, "num_tokens": 617676218.0, "step": 16190 }, { "epoch": 2.0596616206589493, "ewc_loss": 0.03168241307139397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1682411645306274e-05, "grad_norm": 18.436067581176758, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8663009405136108, "num_tokens": 617716270.0, "step": 16191 }, { "epoch": 2.05978883093754, "ewc_loss": 0.03179158270359039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179158375132829e-05, "grad_norm": 18.487064361572266, "learning_rate": 1e-06, "loss": 0.3682, "mean_token_accuracy": 0.8833801746368408, "num_tokens": 617755559.0, "step": 16192 }, { "epoch": 2.0599160412161304, "ewc_loss": 0.03180423378944397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.180423300364055e-05, "grad_norm": 18.485830307006836, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.875622570514679, "num_tokens": 617793522.0, "step": 16193 }, { "epoch": 2.060043251494721, "ewc_loss": 0.03174242004752159, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.174242010572925e-05, "grad_norm": 18.451509475708008, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8789049386978149, "num_tokens": 617828005.0, "step": 16194 }, { "epoch": 2.0601704617733114, "ewc_loss": 0.0318082757294178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1808274798095226e-05, "grad_norm": 18.47399139404297, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8709509968757629, "num_tokens": 617863824.0, "step": 16195 }, { "epoch": 2.060297672051902, "ewc_loss": 0.03178398683667183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.178398765157908e-05, "grad_norm": 18.47406578063965, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8697763681411743, "num_tokens": 617903648.0, "step": 16196 }, { "epoch": 2.0604248823304925, "ewc_loss": 0.03174075111746788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17407502734568e-05, "grad_norm": 18.459503173828125, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.871050238609314, "num_tokens": 617946229.0, "step": 16197 }, { "epoch": 2.060552092609083, "ewc_loss": 0.031808435916900635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.180843486916274e-05, "grad_norm": 18.50236701965332, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8733733892440796, "num_tokens": 617990486.0, "step": 16198 }, { "epoch": 2.0606793028876735, "ewc_loss": 0.031768228858709335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176822792738676e-05, "grad_norm": 18.472822189331055, "learning_rate": 1e-06, "loss": 0.3605, "mean_token_accuracy": 0.8848229050636292, "num_tokens": 618031743.0, "step": 16199 }, { "epoch": 2.0608065131662636, "ewc_loss": 0.031737878918647766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1737879908178e-05, "grad_norm": 18.46871566772461, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8712283372879028, "num_tokens": 618070181.0, "step": 16200 }, { "epoch": 2.060933723444854, "ewc_loss": 0.03180090710520744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1800907891010866e-05, "grad_norm": 18.554121017456055, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8798471689224243, "num_tokens": 618106396.0, "step": 16201 }, { "epoch": 2.0610609337234447, "ewc_loss": 0.031747929751873016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.174792800564319e-05, "grad_norm": 18.433683395385742, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8783949613571167, "num_tokens": 618148281.0, "step": 16202 }, { "epoch": 2.061188144002035, "ewc_loss": 0.03169974684715271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1699746614322066e-05, "grad_norm": 18.50294303894043, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8670390844345093, "num_tokens": 618182721.0, "step": 16203 }, { "epoch": 2.0613153542806257, "ewc_loss": 0.03174582123756409, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.174582161591388e-05, "grad_norm": 18.42522430419922, "learning_rate": 1e-06, "loss": 0.3476, "mean_token_accuracy": 0.8887691497802734, "num_tokens": 618219497.0, "step": 16204 }, { "epoch": 2.0614425645592163, "ewc_loss": 0.03169781342148781, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.16978148475755e-05, "grad_norm": 18.432443618774414, "learning_rate": 1e-06, "loss": 0.3486, "mean_token_accuracy": 0.8898499011993408, "num_tokens": 618258643.0, "step": 16205 }, { "epoch": 2.061569774837807, "ewc_loss": 0.03178468719124794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1784686143510044e-05, "grad_norm": 18.506013870239258, "learning_rate": 1e-06, "loss": 0.3441, "mean_token_accuracy": 0.8890087008476257, "num_tokens": 618292682.0, "step": 16206 }, { "epoch": 2.0616969851163973, "ewc_loss": 0.03177075460553169, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177075632265769e-05, "grad_norm": 18.422679901123047, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.859929084777832, "num_tokens": 618331375.0, "step": 16207 }, { "epoch": 2.061824195394988, "ewc_loss": 0.03174014389514923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.174014273099601e-05, "grad_norm": 18.486173629760742, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8792679309844971, "num_tokens": 618367452.0, "step": 16208 }, { "epoch": 2.0619514056735784, "ewc_loss": 0.031777530908584595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1777530239196494e-05, "grad_norm": 18.45399284362793, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8732396364212036, "num_tokens": 618410470.0, "step": 16209 }, { "epoch": 2.062078615952169, "ewc_loss": 0.031770259141922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177025791956112e-05, "grad_norm": 18.430112838745117, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.866661548614502, "num_tokens": 618447524.0, "step": 16210 }, { "epoch": 2.0622058262307594, "ewc_loss": 0.031765345484018326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1765346648171544e-05, "grad_norm": 18.483474731445312, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8553717136383057, "num_tokens": 618481793.0, "step": 16211 }, { "epoch": 2.06233303650935, "ewc_loss": 0.03179742768406868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179742634529248e-05, "grad_norm": 18.49742889404297, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.881535530090332, "num_tokens": 618517097.0, "step": 16212 }, { "epoch": 2.0624602467879405, "ewc_loss": 0.0317850261926651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17850244755391e-05, "grad_norm": 18.461626052856445, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8722274303436279, "num_tokens": 618554221.0, "step": 16213 }, { "epoch": 2.062587457066531, "ewc_loss": 0.031787510961294174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.178750921506435e-05, "grad_norm": 18.5018367767334, "learning_rate": 1e-06, "loss": 0.3647, "mean_token_accuracy": 0.8844088315963745, "num_tokens": 618591279.0, "step": 16214 }, { "epoch": 2.0627146673451215, "ewc_loss": 0.03182090073823929, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182090222253464e-05, "grad_norm": 18.465633392333984, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.86195307970047, "num_tokens": 618628168.0, "step": 16215 }, { "epoch": 2.062841877623712, "ewc_loss": 0.031807925552129745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1807925552129745e-05, "grad_norm": 18.514564514160156, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8794981837272644, "num_tokens": 618664580.0, "step": 16216 }, { "epoch": 2.0629690879023026, "ewc_loss": 0.03176724910736084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176724931108765e-05, "grad_norm": 18.383153915405273, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.862119197845459, "num_tokens": 618699007.0, "step": 16217 }, { "epoch": 2.063096298180893, "ewc_loss": 0.0318356454372406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1835646950639784e-05, "grad_norm": 18.560562133789062, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8745845556259155, "num_tokens": 618736630.0, "step": 16218 }, { "epoch": 2.0632235084594837, "ewc_loss": 0.03187404200434685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.187404217896983e-05, "grad_norm": 18.497819900512695, "learning_rate": 1e-06, "loss": 0.365, "mean_token_accuracy": 0.8815162181854248, "num_tokens": 618777868.0, "step": 16219 }, { "epoch": 2.063350718738074, "ewc_loss": 0.031843189150094986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1843188480706885e-05, "grad_norm": 18.508861541748047, "learning_rate": 1e-06, "loss": 0.3765, "mean_token_accuracy": 0.8776532411575317, "num_tokens": 618819134.0, "step": 16220 }, { "epoch": 2.0634779290166647, "ewc_loss": 0.03185435011982918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.185434979968704e-05, "grad_norm": 18.504243850708008, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.873683512210846, "num_tokens": 618862231.0, "step": 16221 }, { "epoch": 2.0636051392952552, "ewc_loss": 0.03182198479771614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1821986340219155e-05, "grad_norm": 18.41931915283203, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8667654991149902, "num_tokens": 618906816.0, "step": 16222 }, { "epoch": 2.0637323495738458, "ewc_loss": 0.03186221793293953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.186221874784678e-05, "grad_norm": 18.533546447753906, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8756250739097595, "num_tokens": 618943239.0, "step": 16223 }, { "epoch": 2.0638595598524363, "ewc_loss": 0.03181419149041176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.181419015163556e-05, "grad_norm": 18.498199462890625, "learning_rate": 1e-06, "loss": 0.3462, "mean_token_accuracy": 0.8872459530830383, "num_tokens": 618978641.0, "step": 16224 }, { "epoch": 2.0639867701310264, "ewc_loss": 0.03180093318223953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1800933356862515e-05, "grad_norm": 18.45863914489746, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8746470808982849, "num_tokens": 619020522.0, "step": 16225 }, { "epoch": 2.064113980409617, "ewc_loss": 0.03174997493624687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.174997618771158e-05, "grad_norm": 18.411457061767578, "learning_rate": 1e-06, "loss": 0.3616, "mean_token_accuracy": 0.8863556385040283, "num_tokens": 619060769.0, "step": 16226 }, { "epoch": 2.0642411906882074, "ewc_loss": 0.03176672011613846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1766721804160625e-05, "grad_norm": 18.538013458251953, "learning_rate": 1e-06, "loss": 0.3572, "mean_token_accuracy": 0.8851896524429321, "num_tokens": 619098711.0, "step": 16227 }, { "epoch": 2.064368400966798, "ewc_loss": 0.031805165112018585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.180516432621516e-05, "grad_norm": 18.4954776763916, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8702665567398071, "num_tokens": 619133673.0, "step": 16228 }, { "epoch": 2.0644956112453885, "ewc_loss": 0.03167494758963585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.167494651279412e-05, "grad_norm": 18.445436477661133, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8678504228591919, "num_tokens": 619174554.0, "step": 16229 }, { "epoch": 2.064622821523979, "ewc_loss": 0.031708795577287674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1708794267615303e-05, "grad_norm": 18.4359188079834, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8640764355659485, "num_tokens": 619211095.0, "step": 16230 }, { "epoch": 2.0647500318025696, "ewc_loss": 0.031774722039699554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177472171955742e-05, "grad_norm": 18.594768524169922, "learning_rate": 1e-06, "loss": 0.3568, "mean_token_accuracy": 0.8843833804130554, "num_tokens": 619246346.0, "step": 16231 }, { "epoch": 2.06487724208116, "ewc_loss": 0.03175976499915123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1759765988681465e-05, "grad_norm": 18.446205139160156, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8720812201499939, "num_tokens": 619280629.0, "step": 16232 }, { "epoch": 2.0650044523597506, "ewc_loss": 0.031701359897851944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.170135823893361e-05, "grad_norm": 18.536911010742188, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8548318147659302, "num_tokens": 619320928.0, "step": 16233 }, { "epoch": 2.065131662638341, "ewc_loss": 0.03176159784197807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176159953000024e-05, "grad_norm": 18.445499420166016, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8752448558807373, "num_tokens": 619369747.0, "step": 16234 }, { "epoch": 2.0652588729169317, "ewc_loss": 0.03169738128781319, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.169738192809746e-05, "grad_norm": 18.581666946411133, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8755090236663818, "num_tokens": 619408694.0, "step": 16235 }, { "epoch": 2.065386083195522, "ewc_loss": 0.031777240335941315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177723920089193e-05, "grad_norm": 18.46915626525879, "learning_rate": 1e-06, "loss": 0.3575, "mean_token_accuracy": 0.8821420073509216, "num_tokens": 619446858.0, "step": 16236 }, { "epoch": 2.0655132934741127, "ewc_loss": 0.03171711415052414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.171711432514712e-05, "grad_norm": 18.522144317626953, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8743395805358887, "num_tokens": 619482281.0, "step": 16237 }, { "epoch": 2.0656405037527032, "ewc_loss": 0.03177332505583763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1773324735695496e-05, "grad_norm": 18.44498634338379, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8835885524749756, "num_tokens": 619520309.0, "step": 16238 }, { "epoch": 2.065767714031294, "ewc_loss": 0.03171564266085625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.171564094373025e-05, "grad_norm": 18.455469131469727, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8770175576210022, "num_tokens": 619554662.0, "step": 16239 }, { "epoch": 2.0658949243098843, "ewc_loss": 0.03177793323993683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177793405484408e-05, "grad_norm": 18.456623077392578, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8706220388412476, "num_tokens": 619599632.0, "step": 16240 }, { "epoch": 2.066022134588475, "ewc_loss": 0.03171711787581444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.171711796312593e-05, "grad_norm": 18.423683166503906, "learning_rate": 1e-06, "loss": 0.3506, "mean_token_accuracy": 0.886713981628418, "num_tokens": 619639414.0, "step": 16241 }, { "epoch": 2.0661493448670654, "ewc_loss": 0.03177902474999428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177902544848621e-05, "grad_norm": 18.45667266845703, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8761124014854431, "num_tokens": 619675878.0, "step": 16242 }, { "epoch": 2.066276555145656, "ewc_loss": 0.031724538654088974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1724539439892396e-05, "grad_norm": 18.421125411987305, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.877214789390564, "num_tokens": 619709486.0, "step": 16243 }, { "epoch": 2.0664037654242464, "ewc_loss": 0.031719036400318146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1719035177957267e-05, "grad_norm": 18.400352478027344, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8745076656341553, "num_tokens": 619753576.0, "step": 16244 }, { "epoch": 2.066530975702837, "ewc_loss": 0.03178026154637337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.178026236128062e-05, "grad_norm": 18.446809768676758, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8753509521484375, "num_tokens": 619792520.0, "step": 16245 }, { "epoch": 2.0666581859814275, "ewc_loss": 0.031753115355968475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17531157634221e-05, "grad_norm": 18.505809783935547, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8540490865707397, "num_tokens": 619827871.0, "step": 16246 }, { "epoch": 2.066785396260018, "ewc_loss": 0.031801942735910416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.180194107699208e-05, "grad_norm": 18.405900955200195, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8694216012954712, "num_tokens": 619864527.0, "step": 16247 }, { "epoch": 2.0669126065386085, "ewc_loss": 0.031773075461387634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177307371515781e-05, "grad_norm": 18.483816146850586, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8757658004760742, "num_tokens": 619904934.0, "step": 16248 }, { "epoch": 2.0670398168171986, "ewc_loss": 0.03184732049703598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.184732122463174e-05, "grad_norm": 18.48621940612793, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8821408748626709, "num_tokens": 619945472.0, "step": 16249 }, { "epoch": 2.067167027095789, "ewc_loss": 0.03179517760872841, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1795178074389696e-05, "grad_norm": 18.49851417541504, "learning_rate": 1e-06, "loss": 0.3569, "mean_token_accuracy": 0.8854040503501892, "num_tokens": 619981738.0, "step": 16250 }, { "epoch": 2.0672942373743797, "ewc_loss": 0.03178132325410843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.178132465109229e-05, "grad_norm": 18.45661163330078, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.876717209815979, "num_tokens": 620024593.0, "step": 16251 }, { "epoch": 2.06742144765297, "ewc_loss": 0.0317709855735302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1770985515322536e-05, "grad_norm": 18.48115348815918, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.8815398216247559, "num_tokens": 620064059.0, "step": 16252 }, { "epoch": 2.0675486579315607, "ewc_loss": 0.0317729227244854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1772924558026716e-05, "grad_norm": 18.423948287963867, "learning_rate": 1e-06, "loss": 0.3799, "mean_token_accuracy": 0.876600980758667, "num_tokens": 620100028.0, "step": 16253 }, { "epoch": 2.0676758682101513, "ewc_loss": 0.031737279146909714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.173727964167483e-05, "grad_norm": 18.454423904418945, "learning_rate": 1e-06, "loss": 0.3503, "mean_token_accuracy": 0.8881343007087708, "num_tokens": 620135181.0, "step": 16254 }, { "epoch": 2.067803078488742, "ewc_loss": 0.0317452996969223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.174530138494447e-05, "grad_norm": 18.489038467407227, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8762916326522827, "num_tokens": 620174887.0, "step": 16255 }, { "epoch": 2.0679302887673323, "ewc_loss": 0.03177420794963837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177420876454562e-05, "grad_norm": 18.461889266967773, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8747643232345581, "num_tokens": 620212924.0, "step": 16256 }, { "epoch": 2.068057499045923, "ewc_loss": 0.03176606819033623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176606696797535e-05, "grad_norm": 18.50994873046875, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8654689788818359, "num_tokens": 620251088.0, "step": 16257 }, { "epoch": 2.0681847093245134, "ewc_loss": 0.03179801255464554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179801205988042e-05, "grad_norm": 18.47262191772461, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8743391036987305, "num_tokens": 620287888.0, "step": 16258 }, { "epoch": 2.068311919603104, "ewc_loss": 0.031772904098033905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1772902730153874e-05, "grad_norm": 18.534643173217773, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8726084232330322, "num_tokens": 620324937.0, "step": 16259 }, { "epoch": 2.0684391298816944, "ewc_loss": 0.03180292993783951, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.180293060722761e-05, "grad_norm": 18.482826232910156, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8752646446228027, "num_tokens": 620363390.0, "step": 16260 }, { "epoch": 2.068566340160285, "ewc_loss": 0.03172588720917702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172588549205102e-05, "grad_norm": 18.49199104309082, "learning_rate": 1e-06, "loss": 0.368, "mean_token_accuracy": 0.8821996450424194, "num_tokens": 620403707.0, "step": 16261 }, { "epoch": 2.0686935504388755, "ewc_loss": 0.03175990656018257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.175990786985494e-05, "grad_norm": 18.5268611907959, "learning_rate": 1e-06, "loss": 0.3761, "mean_token_accuracy": 0.8783767223358154, "num_tokens": 620445994.0, "step": 16262 }, { "epoch": 2.068820760717466, "ewc_loss": 0.03172328323125839, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.172328433720395e-05, "grad_norm": 18.502601623535156, "learning_rate": 1e-06, "loss": 0.3584, "mean_token_accuracy": 0.8816118836402893, "num_tokens": 620480883.0, "step": 16263 }, { "epoch": 2.0689479709960565, "ewc_loss": 0.031784772872924805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1784773455001414e-05, "grad_norm": 18.471771240234375, "learning_rate": 1e-06, "loss": 0.3536, "mean_token_accuracy": 0.8872911334037781, "num_tokens": 620518935.0, "step": 16264 }, { "epoch": 2.069075181274647, "ewc_loss": 0.03174858167767525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1748582841828465e-05, "grad_norm": 18.515411376953125, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.867933988571167, "num_tokens": 620551985.0, "step": 16265 }, { "epoch": 2.0692023915532376, "ewc_loss": 0.031764015555381775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176401514792815e-05, "grad_norm": 18.42748260498047, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8703430891036987, "num_tokens": 620592809.0, "step": 16266 }, { "epoch": 2.069329601831828, "ewc_loss": 0.03170544654130936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.170544732711278e-05, "grad_norm": 18.39572525024414, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8615392446517944, "num_tokens": 620627951.0, "step": 16267 }, { "epoch": 2.0694568121104187, "ewc_loss": 0.031820476055145264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182047657901421e-05, "grad_norm": 18.52960777282715, "learning_rate": 1e-06, "loss": 0.3776, "mean_token_accuracy": 0.8797929883003235, "num_tokens": 620660892.0, "step": 16268 }, { "epoch": 2.069584022389009, "ewc_loss": 0.031882986426353455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.188298796885647e-05, "grad_norm": 18.554443359375, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8513408899307251, "num_tokens": 620698165.0, "step": 16269 }, { "epoch": 2.0697112326675997, "ewc_loss": 0.031875237822532654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1875239073997363e-05, "grad_norm": 18.462907791137695, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8658462166786194, "num_tokens": 620734758.0, "step": 16270 }, { "epoch": 2.0698384429461902, "ewc_loss": 0.031772855669260025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177285543642938e-05, "grad_norm": 18.454500198364258, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.856132984161377, "num_tokens": 620770224.0, "step": 16271 }, { "epoch": 2.0699656532247808, "ewc_loss": 0.031899042427539825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1899042369332165e-05, "grad_norm": 18.45637321472168, "learning_rate": 1e-06, "loss": 0.3564, "mean_token_accuracy": 0.8819299936294556, "num_tokens": 620805724.0, "step": 16272 }, { "epoch": 2.070092863503371, "ewc_loss": 0.031855035573244095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1855033739702776e-05, "grad_norm": 18.506017684936523, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8633692264556885, "num_tokens": 620847622.0, "step": 16273 }, { "epoch": 2.0702200737819614, "ewc_loss": 0.03187771886587143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.18777201755438e-05, "grad_norm": 18.448034286499023, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8707363605499268, "num_tokens": 620879042.0, "step": 16274 }, { "epoch": 2.070347284060552, "ewc_loss": 0.03188987076282501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1889871024759486e-05, "grad_norm": 18.565185546875, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8709368705749512, "num_tokens": 620922685.0, "step": 16275 }, { "epoch": 2.0704744943391424, "ewc_loss": 0.03188329190015793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1883293559076265e-05, "grad_norm": 18.474262237548828, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8687853217124939, "num_tokens": 620954854.0, "step": 16276 }, { "epoch": 2.070601704617733, "ewc_loss": 0.03189554810523987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1895549909677356e-05, "grad_norm": 18.540321350097656, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8722449541091919, "num_tokens": 620994581.0, "step": 16277 }, { "epoch": 2.0707289148963235, "ewc_loss": 0.031912487000226974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1912488339003175e-05, "grad_norm": 18.444351196289062, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8748959302902222, "num_tokens": 621030052.0, "step": 16278 }, { "epoch": 2.070856125174914, "ewc_loss": 0.03188737854361534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.188737900927663e-05, "grad_norm": 18.457015991210938, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.8830296993255615, "num_tokens": 621072474.0, "step": 16279 }, { "epoch": 2.0709833354535045, "ewc_loss": 0.0319189690053463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1918967579258606e-05, "grad_norm": 18.544382095336914, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8778640031814575, "num_tokens": 621104365.0, "step": 16280 }, { "epoch": 2.071110545732095, "ewc_loss": 0.03195783868432045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.195783938281238e-05, "grad_norm": 18.481094360351562, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8800193071365356, "num_tokens": 621140497.0, "step": 16281 }, { "epoch": 2.0712377560106856, "ewc_loss": 0.03188079968094826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.18807979056146e-05, "grad_norm": 18.498218536376953, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8764994740486145, "num_tokens": 621177431.0, "step": 16282 }, { "epoch": 2.071364966289276, "ewc_loss": 0.03199395537376404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.199395723640919e-05, "grad_norm": 18.62639617919922, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8772484064102173, "num_tokens": 621212874.0, "step": 16283 }, { "epoch": 2.0714921765678667, "ewc_loss": 0.03192274644970894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1922747439239174e-05, "grad_norm": 18.4570255279541, "learning_rate": 1e-06, "loss": 0.3789, "mean_token_accuracy": 0.8801921606063843, "num_tokens": 621249927.0, "step": 16284 }, { "epoch": 2.071619386846457, "ewc_loss": 0.03185979649424553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.185979585396126e-05, "grad_norm": 18.569143295288086, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8494744300842285, "num_tokens": 621280794.0, "step": 16285 }, { "epoch": 2.0717465971250477, "ewc_loss": 0.031942930072546005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.194293094566092e-05, "grad_norm": 18.48336410522461, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.858177661895752, "num_tokens": 621321167.0, "step": 16286 }, { "epoch": 2.0718738074036382, "ewc_loss": 0.03185194730758667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.185194873367436e-05, "grad_norm": 18.566537857055664, "learning_rate": 1e-06, "loss": 0.35, "mean_token_accuracy": 0.8864321708679199, "num_tokens": 621357962.0, "step": 16287 }, { "epoch": 2.0720010176822288, "ewc_loss": 0.03194892778992653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1948926334735006e-05, "grad_norm": 18.482255935668945, "learning_rate": 1e-06, "loss": 0.3401, "mean_token_accuracy": 0.892388105392456, "num_tokens": 621398933.0, "step": 16288 }, { "epoch": 2.0721282279608193, "ewc_loss": 0.0318714901804924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.187149195582606e-05, "grad_norm": 18.57588005065918, "learning_rate": 1e-06, "loss": 0.339, "mean_token_accuracy": 0.8931722640991211, "num_tokens": 621442334.0, "step": 16289 }, { "epoch": 2.07225543823941, "ewc_loss": 0.031940095126628876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1940096960170195e-05, "grad_norm": 18.54934310913086, "learning_rate": 1e-06, "loss": 0.379, "mean_token_accuracy": 0.8767467737197876, "num_tokens": 621478706.0, "step": 16290 }, { "epoch": 2.0723826485180004, "ewc_loss": 0.03183653578162193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1836534617468715e-05, "grad_norm": 18.557104110717773, "learning_rate": 1e-06, "loss": 0.3673, "mean_token_accuracy": 0.8807836771011353, "num_tokens": 621513911.0, "step": 16291 }, { "epoch": 2.072509858796591, "ewc_loss": 0.03181420639157295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1814204703550786e-05, "grad_norm": 18.483417510986328, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8694195747375488, "num_tokens": 621554080.0, "step": 16292 }, { "epoch": 2.0726370690751814, "ewc_loss": 0.03185788914561272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.185788955306634e-05, "grad_norm": 18.534088134765625, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8770526051521301, "num_tokens": 621592059.0, "step": 16293 }, { "epoch": 2.072764279353772, "ewc_loss": 0.031859222799539566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.185922105330974e-05, "grad_norm": 18.568166732788086, "learning_rate": 1e-06, "loss": 0.3546, "mean_token_accuracy": 0.8860181570053101, "num_tokens": 621629626.0, "step": 16294 }, { "epoch": 2.0728914896323625, "ewc_loss": 0.03179936110973358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179936175001785e-05, "grad_norm": 18.50631332397461, "learning_rate": 1e-06, "loss": 0.3776, "mean_token_accuracy": 0.8797649145126343, "num_tokens": 621671341.0, "step": 16295 }, { "epoch": 2.073018699910953, "ewc_loss": 0.03178446367383003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.178446422680281e-05, "grad_norm": 18.445262908935547, "learning_rate": 1e-06, "loss": 0.3375, "mean_token_accuracy": 0.8884156346321106, "num_tokens": 621708756.0, "step": 16296 }, { "epoch": 2.0731459101895435, "ewc_loss": 0.03185282275080681, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1852821848588064e-05, "grad_norm": 18.570798873901367, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.8793208599090576, "num_tokens": 621748009.0, "step": 16297 }, { "epoch": 2.0732731204681336, "ewc_loss": 0.031889524310827255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.188952541677281e-05, "grad_norm": 18.50164794921875, "learning_rate": 1e-06, "loss": 0.3588, "mean_token_accuracy": 0.8832359313964844, "num_tokens": 621788695.0, "step": 16298 }, { "epoch": 2.073400330746724, "ewc_loss": 0.03181752935051918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.181752981618047e-05, "grad_norm": 18.602270126342773, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8739016652107239, "num_tokens": 621828099.0, "step": 16299 }, { "epoch": 2.0735275410253147, "ewc_loss": 0.03179493546485901, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179493432980962e-05, "grad_norm": 18.51192855834961, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8579901456832886, "num_tokens": 621867033.0, "step": 16300 }, { "epoch": 2.073654751303905, "ewc_loss": 0.03176306188106537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176306199748069e-05, "grad_norm": 18.518810272216797, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8645943403244019, "num_tokens": 621907435.0, "step": 16301 }, { "epoch": 2.0737819615824957, "ewc_loss": 0.03181079775094986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.181079591740854e-05, "grad_norm": 18.43638801574707, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8664544224739075, "num_tokens": 621948995.0, "step": 16302 }, { "epoch": 2.0739091718610863, "ewc_loss": 0.031779199838638306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177920007146895e-05, "grad_norm": 18.475296020507812, "learning_rate": 1e-06, "loss": 0.3637, "mean_token_accuracy": 0.8859164714813232, "num_tokens": 621991109.0, "step": 16303 }, { "epoch": 2.074036382139677, "ewc_loss": 0.03179750218987465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179750274284743e-05, "grad_norm": 18.48591423034668, "learning_rate": 1e-06, "loss": 0.3682, "mean_token_accuracy": 0.8812756538391113, "num_tokens": 622026585.0, "step": 16304 }, { "epoch": 2.0741635924182673, "ewc_loss": 0.031803376972675323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.180337807862088e-05, "grad_norm": 18.472375869750977, "learning_rate": 1e-06, "loss": 0.3667, "mean_token_accuracy": 0.8843226432800293, "num_tokens": 622065208.0, "step": 16305 }, { "epoch": 2.074290802696858, "ewc_loss": 0.03181160241365433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1811603548703715e-05, "grad_norm": 18.504962921142578, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8660921454429626, "num_tokens": 622104838.0, "step": 16306 }, { "epoch": 2.0744180129754484, "ewc_loss": 0.03188428655266762, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.18842867272906e-05, "grad_norm": 18.556114196777344, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8655865788459778, "num_tokens": 622136879.0, "step": 16307 }, { "epoch": 2.074545223254039, "ewc_loss": 0.03179074823856354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179074701620266e-05, "grad_norm": 18.392099380493164, "learning_rate": 1e-06, "loss": 0.3573, "mean_token_accuracy": 0.8862220644950867, "num_tokens": 622175938.0, "step": 16308 }, { "epoch": 2.0746724335326294, "ewc_loss": 0.03183063119649887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1830630177864805e-05, "grad_norm": 18.54941749572754, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.8832367658615112, "num_tokens": 622216663.0, "step": 16309 }, { "epoch": 2.07479964381122, "ewc_loss": 0.03184323012828827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1843228498473763e-05, "grad_norm": 18.499879837036133, "learning_rate": 1e-06, "loss": 0.3551, "mean_token_accuracy": 0.8831451535224915, "num_tokens": 622250450.0, "step": 16310 }, { "epoch": 2.0749268540898105, "ewc_loss": 0.0317598357796669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17598351102788e-05, "grad_norm": 18.42232894897461, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8675442934036255, "num_tokens": 622291919.0, "step": 16311 }, { "epoch": 2.075054064368401, "ewc_loss": 0.03184443339705467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.184443266945891e-05, "grad_norm": 18.499841690063477, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8741520643234253, "num_tokens": 622325892.0, "step": 16312 }, { "epoch": 2.0751812746469915, "ewc_loss": 0.0317881740629673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1788174965186045e-05, "grad_norm": 18.439556121826172, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.8752132058143616, "num_tokens": 622356966.0, "step": 16313 }, { "epoch": 2.075308484925582, "ewc_loss": 0.031823258846998215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182325963280164e-05, "grad_norm": 18.467329025268555, "learning_rate": 1e-06, "loss": 0.3714, "mean_token_accuracy": 0.8778942823410034, "num_tokens": 622389931.0, "step": 16314 }, { "epoch": 2.0754356952041726, "ewc_loss": 0.03191583976149559, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.191583891748451e-05, "grad_norm": 18.510770797729492, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8740003108978271, "num_tokens": 622427817.0, "step": 16315 }, { "epoch": 2.075562905482763, "ewc_loss": 0.031866464763879776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.186646426911466e-05, "grad_norm": 18.422536849975586, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8652624487876892, "num_tokens": 622470353.0, "step": 16316 }, { "epoch": 2.0756901157613536, "ewc_loss": 0.031823355704545975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182335422025062e-05, "grad_norm": 18.409378051757812, "learning_rate": 1e-06, "loss": 0.3337, "mean_token_accuracy": 0.8906958103179932, "num_tokens": 622507577.0, "step": 16317 }, { "epoch": 2.075817326039944, "ewc_loss": 0.03190387040376663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1903869967209175e-05, "grad_norm": 18.494413375854492, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8708112239837646, "num_tokens": 622544746.0, "step": 16318 }, { "epoch": 2.0759445363185347, "ewc_loss": 0.031965743750333786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.196574471076019e-05, "grad_norm": 18.581218719482422, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8678874969482422, "num_tokens": 622583688.0, "step": 16319 }, { "epoch": 2.0760717465971252, "ewc_loss": 0.03187921643257141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1879215384833515e-05, "grad_norm": 18.522964477539062, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8758283853530884, "num_tokens": 622619911.0, "step": 16320 }, { "epoch": 2.0761989568757158, "ewc_loss": 0.03183930739760399, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.183930675731972e-05, "grad_norm": 18.41545295715332, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8801291584968567, "num_tokens": 622665732.0, "step": 16321 }, { "epoch": 2.0763261671543063, "ewc_loss": 0.03188518062233925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1885181670077145e-05, "grad_norm": 18.5037899017334, "learning_rate": 1e-06, "loss": 0.3646, "mean_token_accuracy": 0.8817830681800842, "num_tokens": 622705746.0, "step": 16322 }, { "epoch": 2.0764533774328964, "ewc_loss": 0.03189035877585411, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1890358513919637e-05, "grad_norm": 18.446125030517578, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8767158389091492, "num_tokens": 622745481.0, "step": 16323 }, { "epoch": 2.076580587711487, "ewc_loss": 0.03187139332294464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.187139373039827e-05, "grad_norm": 18.567506790161133, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8775103092193604, "num_tokens": 622786910.0, "step": 16324 }, { "epoch": 2.0767077979900774, "ewc_loss": 0.031889818608760834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.188982009305619e-05, "grad_norm": 18.51039695739746, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8708711862564087, "num_tokens": 622826976.0, "step": 16325 }, { "epoch": 2.076835008268668, "ewc_loss": 0.03178626298904419, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1786261388333514e-05, "grad_norm": 18.567256927490234, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8756637573242188, "num_tokens": 622858945.0, "step": 16326 }, { "epoch": 2.0769622185472585, "ewc_loss": 0.03184692561626434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.184692468494177e-05, "grad_norm": 18.470199584960938, "learning_rate": 1e-06, "loss": 0.3562, "mean_token_accuracy": 0.886908233165741, "num_tokens": 622893229.0, "step": 16327 }, { "epoch": 2.077089428825849, "ewc_loss": 0.031793370842933655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179336999892257e-05, "grad_norm": 18.53078269958496, "learning_rate": 1e-06, "loss": 0.3593, "mean_token_accuracy": 0.884661078453064, "num_tokens": 622933701.0, "step": 16328 }, { "epoch": 2.0772166391044395, "ewc_loss": 0.03182809799909592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182809814461507e-05, "grad_norm": 18.509382247924805, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8609226942062378, "num_tokens": 622971226.0, "step": 16329 }, { "epoch": 2.07734384938303, "ewc_loss": 0.03180016204714775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.180016210535541e-05, "grad_norm": 18.43436050415039, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8723227977752686, "num_tokens": 623009232.0, "step": 16330 }, { "epoch": 2.0774710596616206, "ewc_loss": 0.03181063383817673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.181063220836222e-05, "grad_norm": 18.56771469116211, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8691655397415161, "num_tokens": 623048005.0, "step": 16331 }, { "epoch": 2.077598269940211, "ewc_loss": 0.031872738152742386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.187273978255689e-05, "grad_norm": 18.45229721069336, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.874904990196228, "num_tokens": 623088474.0, "step": 16332 }, { "epoch": 2.0777254802188017, "ewc_loss": 0.03178918734192848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1789186323294416e-05, "grad_norm": 18.50031089782715, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.875952422618866, "num_tokens": 623123700.0, "step": 16333 }, { "epoch": 2.077852690497392, "ewc_loss": 0.03188740462064743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.188740447512828e-05, "grad_norm": 18.432369232177734, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.87751305103302, "num_tokens": 623161406.0, "step": 16334 }, { "epoch": 2.0779799007759827, "ewc_loss": 0.03182947263121605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182947330060415e-05, "grad_norm": 18.521827697753906, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8661496639251709, "num_tokens": 623199770.0, "step": 16335 }, { "epoch": 2.0781071110545732, "ewc_loss": 0.031880270689725876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.188027039868757e-05, "grad_norm": 18.47340202331543, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8784047961235046, "num_tokens": 623234635.0, "step": 16336 }, { "epoch": 2.0782343213331638, "ewc_loss": 0.03184422478079796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1844225304666907e-05, "grad_norm": 18.548433303833008, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8772841691970825, "num_tokens": 623273695.0, "step": 16337 }, { "epoch": 2.0783615316117543, "ewc_loss": 0.03188510611653328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1885105272522196e-05, "grad_norm": 18.512073516845703, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.860065221786499, "num_tokens": 623315278.0, "step": 16338 }, { "epoch": 2.078488741890345, "ewc_loss": 0.031823158264160156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182315776939504e-05, "grad_norm": 18.430498123168945, "learning_rate": 1e-06, "loss": 0.3653, "mean_token_accuracy": 0.8835099935531616, "num_tokens": 623351753.0, "step": 16339 }, { "epoch": 2.0786159521689354, "ewc_loss": 0.03189070150256157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.18907004839275e-05, "grad_norm": 18.628677368164062, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.857237696647644, "num_tokens": 623387812.0, "step": 16340 }, { "epoch": 2.078743162447526, "ewc_loss": 0.03184628114104271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1846280762692913e-05, "grad_norm": 18.404247283935547, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.8787686824798584, "num_tokens": 623426784.0, "step": 16341 }, { "epoch": 2.0788703727261164, "ewc_loss": 0.031776413321495056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.177641337970272e-05, "grad_norm": 18.610671997070312, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8520544767379761, "num_tokens": 623463362.0, "step": 16342 }, { "epoch": 2.078997583004707, "ewc_loss": 0.03191109001636505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.191109135514125e-05, "grad_norm": 18.47770118713379, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8679070472717285, "num_tokens": 623500108.0, "step": 16343 }, { "epoch": 2.0791247932832975, "ewc_loss": 0.03176955506205559, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176955578965135e-05, "grad_norm": 18.571556091308594, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8719590306282043, "num_tokens": 623537159.0, "step": 16344 }, { "epoch": 2.079252003561888, "ewc_loss": 0.031859271228313446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.185927198501304e-05, "grad_norm": 18.47930908203125, "learning_rate": 1e-06, "loss": 0.3354, "mean_token_accuracy": 0.8932790756225586, "num_tokens": 623576256.0, "step": 16345 }, { "epoch": 2.0793792138404785, "ewc_loss": 0.03179379925131798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179379928042181e-05, "grad_norm": 18.564821243286133, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.871861457824707, "num_tokens": 623612454.0, "step": 16346 }, { "epoch": 2.0795064241190686, "ewc_loss": 0.031896572560071945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.189657218172215e-05, "grad_norm": 18.53817367553711, "learning_rate": 1e-06, "loss": 0.3441, "mean_token_accuracy": 0.8880568146705627, "num_tokens": 623654392.0, "step": 16347 }, { "epoch": 2.079633634397659, "ewc_loss": 0.031798407435417175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1798408599570394e-05, "grad_norm": 18.48859214782715, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8568892478942871, "num_tokens": 623697350.0, "step": 16348 }, { "epoch": 2.0797608446762497, "ewc_loss": 0.03189844638109207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.18984457408078e-05, "grad_norm": 18.558813095092773, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8786871433258057, "num_tokens": 623736272.0, "step": 16349 }, { "epoch": 2.07988805495484, "ewc_loss": 0.031853366643190384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.185336754540913e-05, "grad_norm": 18.498632431030273, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8653881549835205, "num_tokens": 623781319.0, "step": 16350 }, { "epoch": 2.0800152652334307, "ewc_loss": 0.03183099254965782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1830993975745514e-05, "grad_norm": 18.56232452392578, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8751621842384338, "num_tokens": 623821944.0, "step": 16351 }, { "epoch": 2.0801424755120212, "ewc_loss": 0.031853485852479935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.185348759870976e-05, "grad_norm": 18.53631019592285, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8704607486724854, "num_tokens": 623858399.0, "step": 16352 }, { "epoch": 2.0802696857906118, "ewc_loss": 0.031780458986759186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17804588121362e-05, "grad_norm": 18.474042892456055, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8759729266166687, "num_tokens": 623897839.0, "step": 16353 }, { "epoch": 2.0803968960692023, "ewc_loss": 0.03182736784219742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1827366910874844e-05, "grad_norm": 18.512182235717773, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8588772416114807, "num_tokens": 623933127.0, "step": 16354 }, { "epoch": 2.080524106347793, "ewc_loss": 0.03189556300640106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1895564461592585e-05, "grad_norm": 18.531538009643555, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8745045065879822, "num_tokens": 623972140.0, "step": 16355 }, { "epoch": 2.0806513166263834, "ewc_loss": 0.03189275413751602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.18927523039747e-05, "grad_norm": 18.52481460571289, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8646168112754822, "num_tokens": 624008875.0, "step": 16356 }, { "epoch": 2.080778526904974, "ewc_loss": 0.0317830890417099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.178308907081373e-05, "grad_norm": 18.53069496154785, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.872153639793396, "num_tokens": 624045162.0, "step": 16357 }, { "epoch": 2.0809057371835644, "ewc_loss": 0.03183276206254959, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.183276203344576e-05, "grad_norm": 18.503040313720703, "learning_rate": 1e-06, "loss": 0.3581, "mean_token_accuracy": 0.8845731019973755, "num_tokens": 624085275.0, "step": 16358 }, { "epoch": 2.081032947462155, "ewc_loss": 0.03179103508591652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179103441652842e-05, "grad_norm": 18.504064559936523, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8719675540924072, "num_tokens": 624127479.0, "step": 16359 }, { "epoch": 2.0811601577407455, "ewc_loss": 0.03185952827334404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1859526643529534e-05, "grad_norm": 18.524991989135742, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8767244219779968, "num_tokens": 624165231.0, "step": 16360 }, { "epoch": 2.081287368019336, "ewc_loss": 0.0319116972386837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1911698897602037e-05, "grad_norm": 18.535741806030273, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.863672137260437, "num_tokens": 624205109.0, "step": 16361 }, { "epoch": 2.0814145782979265, "ewc_loss": 0.03176359087228775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176358950440772e-05, "grad_norm": 18.437467575073242, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8701285123825073, "num_tokens": 624241849.0, "step": 16362 }, { "epoch": 2.081541788576517, "ewc_loss": 0.03179717808961868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1797178962733597e-05, "grad_norm": 18.549039840698242, "learning_rate": 1e-06, "loss": 0.3761, "mean_token_accuracy": 0.875739336013794, "num_tokens": 624274174.0, "step": 16363 }, { "epoch": 2.0816689988551076, "ewc_loss": 0.03186021000146866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.186021058354527e-05, "grad_norm": 18.403989791870117, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8720721006393433, "num_tokens": 624311526.0, "step": 16364 }, { "epoch": 2.081796209133698, "ewc_loss": 0.031759753823280334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1759755074745044e-05, "grad_norm": 18.583980560302734, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8605954051017761, "num_tokens": 624346657.0, "step": 16365 }, { "epoch": 2.0819234194122886, "ewc_loss": 0.031929176300764084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1929175747791305e-05, "grad_norm": 18.448625564575195, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8594958782196045, "num_tokens": 624386563.0, "step": 16366 }, { "epoch": 2.082050629690879, "ewc_loss": 0.031809162348508835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.180916246492416e-05, "grad_norm": 18.565448760986328, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.8842334747314453, "num_tokens": 624427299.0, "step": 16367 }, { "epoch": 2.0821778399694697, "ewc_loss": 0.03194742277264595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.194742384948768e-05, "grad_norm": 18.509366989135742, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8576841354370117, "num_tokens": 624461253.0, "step": 16368 }, { "epoch": 2.0823050502480602, "ewc_loss": 0.03179445117712021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179445047862828e-05, "grad_norm": 18.538461685180664, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.882843017578125, "num_tokens": 624498699.0, "step": 16369 }, { "epoch": 2.0824322605266508, "ewc_loss": 0.03193734586238861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.193734664819203e-05, "grad_norm": 18.547130584716797, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.871745228767395, "num_tokens": 624534655.0, "step": 16370 }, { "epoch": 2.082559470805241, "ewc_loss": 0.03184588998556137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1845891498960555e-05, "grad_norm": 18.479915618896484, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8770119547843933, "num_tokens": 624575342.0, "step": 16371 }, { "epoch": 2.0826866810838314, "ewc_loss": 0.03184543922543526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1845440389588475e-05, "grad_norm": 18.53738021850586, "learning_rate": 1e-06, "loss": 0.3844, "mean_token_accuracy": 0.8787237405776978, "num_tokens": 624608493.0, "step": 16372 }, { "epoch": 2.082813891362422, "ewc_loss": 0.0318618081510067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.186180765624158e-05, "grad_norm": 18.508207321166992, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8574632406234741, "num_tokens": 624655777.0, "step": 16373 }, { "epoch": 2.0829411016410124, "ewc_loss": 0.03191140294075012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.191140422131866e-05, "grad_norm": 18.53524398803711, "learning_rate": 1e-06, "loss": 0.3582, "mean_token_accuracy": 0.8843132853507996, "num_tokens": 624699349.0, "step": 16374 }, { "epoch": 2.083068311919603, "ewc_loss": 0.03191622719168663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1916228181216866e-05, "grad_norm": 18.520906448364258, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.8826624155044556, "num_tokens": 624730146.0, "step": 16375 }, { "epoch": 2.0831955221981935, "ewc_loss": 0.03186352550983429, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.186352478223853e-05, "grad_norm": 18.51320457458496, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8658488988876343, "num_tokens": 624773186.0, "step": 16376 }, { "epoch": 2.083322732476784, "ewc_loss": 0.031883083283901215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.188308255630545e-05, "grad_norm": 18.446514129638672, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.872769296169281, "num_tokens": 624810256.0, "step": 16377 }, { "epoch": 2.0834499427553745, "ewc_loss": 0.03184189647436142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.184189699823037e-05, "grad_norm": 18.602060317993164, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8640817999839783, "num_tokens": 624848166.0, "step": 16378 }, { "epoch": 2.083577153033965, "ewc_loss": 0.03191271051764488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.191271025571041e-05, "grad_norm": 18.470579147338867, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8735538721084595, "num_tokens": 624887601.0, "step": 16379 }, { "epoch": 2.0837043633125556, "ewc_loss": 0.03186364099383354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1863641197560355e-05, "grad_norm": 18.611562728881836, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8742333054542542, "num_tokens": 624921160.0, "step": 16380 }, { "epoch": 2.083831573591146, "ewc_loss": 0.03193305432796478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1933053833199665e-05, "grad_norm": 18.451311111450195, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8671830892562866, "num_tokens": 624959053.0, "step": 16381 }, { "epoch": 2.0839587838697367, "ewc_loss": 0.031894464045763016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.189446579199284e-05, "grad_norm": 18.595251083374023, "learning_rate": 1e-06, "loss": 0.354, "mean_token_accuracy": 0.8870089054107666, "num_tokens": 624995272.0, "step": 16382 }, { "epoch": 2.084085994148327, "ewc_loss": 0.03193213418126106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.193213342456147e-05, "grad_norm": 18.515806198120117, "learning_rate": 1e-06, "loss": 0.3397, "mean_token_accuracy": 0.8895490169525146, "num_tokens": 625033388.0, "step": 16383 }, { "epoch": 2.0842132044269177, "ewc_loss": 0.03190147876739502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1901479815132916e-05, "grad_norm": 18.603729248046875, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8636693954467773, "num_tokens": 625071103.0, "step": 16384 }, { "epoch": 2.0843404147055082, "ewc_loss": 0.031899675726890564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.18996753776446e-05, "grad_norm": 18.526100158691406, "learning_rate": 1e-06, "loss": 0.3574, "mean_token_accuracy": 0.8866628408432007, "num_tokens": 625109562.0, "step": 16385 }, { "epoch": 2.0844676249840988, "ewc_loss": 0.03179888799786568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179888881277293e-05, "grad_norm": 18.457494735717773, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.879432201385498, "num_tokens": 625147438.0, "step": 16386 }, { "epoch": 2.0845948352626893, "ewc_loss": 0.03190069645643234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.190069764968939e-05, "grad_norm": 18.62090492248535, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8840256929397583, "num_tokens": 625184293.0, "step": 16387 }, { "epoch": 2.08472204554128, "ewc_loss": 0.03188334032893181, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.188334085280076e-05, "grad_norm": 18.444149017333984, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8761274814605713, "num_tokens": 625218201.0, "step": 16388 }, { "epoch": 2.0848492558198704, "ewc_loss": 0.031847719103097916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1847717764321715e-05, "grad_norm": 18.596059799194336, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8725632429122925, "num_tokens": 625252072.0, "step": 16389 }, { "epoch": 2.084976466098461, "ewc_loss": 0.03192037716507912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.192037547705695e-05, "grad_norm": 18.55201530456543, "learning_rate": 1e-06, "loss": 0.3574, "mean_token_accuracy": 0.8862255811691284, "num_tokens": 625286990.0, "step": 16390 }, { "epoch": 2.0851036763770514, "ewc_loss": 0.03182699531316757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182699583703652e-05, "grad_norm": 18.4688777923584, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.8854250311851501, "num_tokens": 625326579.0, "step": 16391 }, { "epoch": 2.085230886655642, "ewc_loss": 0.03186974301934242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.186974208801985e-05, "grad_norm": 18.472511291503906, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.8786691427230835, "num_tokens": 625367317.0, "step": 16392 }, { "epoch": 2.0853580969342325, "ewc_loss": 0.0319155678153038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1915566069073975e-05, "grad_norm": 18.54998207092285, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8686587810516357, "num_tokens": 625401047.0, "step": 16393 }, { "epoch": 2.085485307212823, "ewc_loss": 0.03187567740678787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1875675631454214e-05, "grad_norm": 18.487009048461914, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8604578971862793, "num_tokens": 625441878.0, "step": 16394 }, { "epoch": 2.0856125174914135, "ewc_loss": 0.03191366791725159, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.191366704413667e-05, "grad_norm": 18.559951782226562, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8728929162025452, "num_tokens": 625477007.0, "step": 16395 }, { "epoch": 2.0857397277700036, "ewc_loss": 0.03187459334731102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.187459515174851e-05, "grad_norm": 18.427968978881836, "learning_rate": 1e-06, "loss": 0.3587, "mean_token_accuracy": 0.8860881328582764, "num_tokens": 625519908.0, "step": 16396 }, { "epoch": 2.085866938048594, "ewc_loss": 0.031866248697042465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.186624962836504e-05, "grad_norm": 18.52987289428711, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.876738429069519, "num_tokens": 625554946.0, "step": 16397 }, { "epoch": 2.0859941483271847, "ewc_loss": 0.03195422515273094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.195422686985694e-05, "grad_norm": 18.555753707885742, "learning_rate": 1e-06, "loss": 0.3538, "mean_token_accuracy": 0.8845049142837524, "num_tokens": 625592121.0, "step": 16398 }, { "epoch": 2.086121358605775, "ewc_loss": 0.0318782739341259, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.187827314832248e-05, "grad_norm": 18.457237243652344, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8721639513969421, "num_tokens": 625630596.0, "step": 16399 }, { "epoch": 2.0862485688843657, "ewc_loss": 0.031812794506549835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1812793167773634e-05, "grad_norm": 18.38738441467285, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8572515249252319, "num_tokens": 625669535.0, "step": 16400 }, { "epoch": 2.0863757791629562, "ewc_loss": 0.031916480511426926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1916479201754555e-05, "grad_norm": 18.50640106201172, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8718105554580688, "num_tokens": 625710072.0, "step": 16401 }, { "epoch": 2.0865029894415468, "ewc_loss": 0.03195376694202423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1953768484527245e-05, "grad_norm": 18.507097244262695, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.8814485669136047, "num_tokens": 625746328.0, "step": 16402 }, { "epoch": 2.0866301997201373, "ewc_loss": 0.03189217299222946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1892173865344375e-05, "grad_norm": 18.515918731689453, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8746719360351562, "num_tokens": 625787108.0, "step": 16403 }, { "epoch": 2.086757409998728, "ewc_loss": 0.031910769641399384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.191077121300623e-05, "grad_norm": 18.490461349487305, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8776947855949402, "num_tokens": 625827887.0, "step": 16404 }, { "epoch": 2.0868846202773184, "ewc_loss": 0.031881801784038544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.188180198776536e-05, "grad_norm": 18.56308364868164, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.873942494392395, "num_tokens": 625863475.0, "step": 16405 }, { "epoch": 2.087011830555909, "ewc_loss": 0.03191705420613289, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1917054002406076e-05, "grad_norm": 18.54210090637207, "learning_rate": 1e-06, "loss": 0.3458, "mean_token_accuracy": 0.889541745185852, "num_tokens": 625902369.0, "step": 16406 }, { "epoch": 2.0871390408344994, "ewc_loss": 0.031825076788663864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182507862220518e-05, "grad_norm": 18.525415420532227, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8495035171508789, "num_tokens": 625941054.0, "step": 16407 }, { "epoch": 2.08726625111309, "ewc_loss": 0.03181643411517143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.181643478455953e-05, "grad_norm": 18.525373458862305, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8556207418441772, "num_tokens": 625981113.0, "step": 16408 }, { "epoch": 2.0873934613916805, "ewc_loss": 0.031859420239925385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.185942114214413e-05, "grad_norm": 18.611000061035156, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8709756135940552, "num_tokens": 626014758.0, "step": 16409 }, { "epoch": 2.087520671670271, "ewc_loss": 0.031896255910396576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.189625567756593e-05, "grad_norm": 18.644014358520508, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8598375916481018, "num_tokens": 626054442.0, "step": 16410 }, { "epoch": 2.0876478819488615, "ewc_loss": 0.03175957500934601, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.17595768137835e-05, "grad_norm": 18.46287727355957, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8570321798324585, "num_tokens": 626093993.0, "step": 16411 }, { "epoch": 2.087775092227452, "ewc_loss": 0.03180943429470062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.180943531333469e-05, "grad_norm": 18.592073440551758, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.856703519821167, "num_tokens": 626131242.0, "step": 16412 }, { "epoch": 2.0879023025060426, "ewc_loss": 0.0319037102162838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.190370989614166e-05, "grad_norm": 18.620805740356445, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8582608699798584, "num_tokens": 626162137.0, "step": 16413 }, { "epoch": 2.088029512784633, "ewc_loss": 0.03180327266454697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1803272577235475e-05, "grad_norm": 18.51894760131836, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.874616801738739, "num_tokens": 626201031.0, "step": 16414 }, { "epoch": 2.0881567230632236, "ewc_loss": 0.0318223275244236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182232831022702e-05, "grad_norm": 18.467140197753906, "learning_rate": 1e-06, "loss": 0.3565, "mean_token_accuracy": 0.8843014240264893, "num_tokens": 626243647.0, "step": 16415 }, { "epoch": 2.088283933341814, "ewc_loss": 0.031798847019672394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1798845157027245e-05, "grad_norm": 18.55693817138672, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.8794184327125549, "num_tokens": 626281346.0, "step": 16416 }, { "epoch": 2.0884111436204047, "ewc_loss": 0.03188713639974594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.188713526469655e-05, "grad_norm": 18.560638427734375, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.8829102516174316, "num_tokens": 626315964.0, "step": 16417 }, { "epoch": 2.0885383538989952, "ewc_loss": 0.031815193593502045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1815194233786315e-05, "grad_norm": 18.431798934936523, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8673915863037109, "num_tokens": 626367920.0, "step": 16418 }, { "epoch": 2.0886655641775858, "ewc_loss": 0.031948406249284744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.194840610376559e-05, "grad_norm": 18.52328109741211, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8704468011856079, "num_tokens": 626412123.0, "step": 16419 }, { "epoch": 2.0887927744561763, "ewc_loss": 0.03190586343407631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.190586357959546e-05, "grad_norm": 18.569490432739258, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8467761278152466, "num_tokens": 626451560.0, "step": 16420 }, { "epoch": 2.0889199847347664, "ewc_loss": 0.03194909915328026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1949100957717746e-05, "grad_norm": 18.549123764038086, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8666660189628601, "num_tokens": 626489361.0, "step": 16421 }, { "epoch": 2.089047195013357, "ewc_loss": 0.03178693726658821, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1786938052391633e-05, "grad_norm": 18.46513557434082, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.8885324597358704, "num_tokens": 626527150.0, "step": 16422 }, { "epoch": 2.0891744052919474, "ewc_loss": 0.031903889030218124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.190388815710321e-05, "grad_norm": 18.626554489135742, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8661252856254578, "num_tokens": 626568110.0, "step": 16423 }, { "epoch": 2.089301615570538, "ewc_loss": 0.03186861425638199, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.186861431458965e-05, "grad_norm": 18.59235954284668, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8756051063537598, "num_tokens": 626607215.0, "step": 16424 }, { "epoch": 2.0894288258491285, "ewc_loss": 0.03179395571351051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.179395571351051e-05, "grad_norm": 18.520858764648438, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.8782183527946472, "num_tokens": 626644424.0, "step": 16425 }, { "epoch": 2.089556036127719, "ewc_loss": 0.03182968869805336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182968794135377e-05, "grad_norm": 18.553606033325195, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8522219657897949, "num_tokens": 626689509.0, "step": 16426 }, { "epoch": 2.0896832464063095, "ewc_loss": 0.031869228929281235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.186922913300805e-05, "grad_norm": 18.575984954833984, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8637551069259644, "num_tokens": 626733866.0, "step": 16427 }, { "epoch": 2.0898104566849, "ewc_loss": 0.031829070299863815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182906948495656e-05, "grad_norm": 18.6306209564209, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.8818762302398682, "num_tokens": 626767465.0, "step": 16428 }, { "epoch": 2.0899376669634906, "ewc_loss": 0.031810808926820755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.181081046932377e-05, "grad_norm": 18.478490829467773, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8605037927627563, "num_tokens": 626806049.0, "step": 16429 }, { "epoch": 2.090064877242081, "ewc_loss": 0.031768232583999634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.176823156536557e-05, "grad_norm": 18.545867919921875, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8594540357589722, "num_tokens": 626848917.0, "step": 16430 }, { "epoch": 2.0901920875206716, "ewc_loss": 0.03188522532582283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.188522532582283e-05, "grad_norm": 18.60874366760254, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8598568439483643, "num_tokens": 626884442.0, "step": 16431 }, { "epoch": 2.090319297799262, "ewc_loss": 0.031801894307136536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.180189378326759e-05, "grad_norm": 18.490171432495117, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8714407682418823, "num_tokens": 626922517.0, "step": 16432 }, { "epoch": 2.0904465080778527, "ewc_loss": 0.031800009310245514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1800009310245514e-05, "grad_norm": 18.52284049987793, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.871800422668457, "num_tokens": 626962880.0, "step": 16433 }, { "epoch": 2.0905737183564432, "ewc_loss": 0.03189181908965111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.189181734342128e-05, "grad_norm": 18.57330894470215, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.8799749612808228, "num_tokens": 627008115.0, "step": 16434 }, { "epoch": 2.0907009286350338, "ewc_loss": 0.031817976385354996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.181797728757374e-05, "grad_norm": 18.540176391601562, "learning_rate": 1e-06, "loss": 0.3544, "mean_token_accuracy": 0.8839442729949951, "num_tokens": 627043935.0, "step": 16435 }, { "epoch": 2.0908281389136243, "ewc_loss": 0.031903352588415146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.190335337421857e-05, "grad_norm": 18.641103744506836, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8754932880401611, "num_tokens": 627078495.0, "step": 16436 }, { "epoch": 2.090955349192215, "ewc_loss": 0.03191046044230461, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1910461984807625e-05, "grad_norm": 18.59517478942871, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8775765895843506, "num_tokens": 627116652.0, "step": 16437 }, { "epoch": 2.0910825594708053, "ewc_loss": 0.031826067715883255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.182606815244071e-05, "grad_norm": 18.531169891357422, "learning_rate": 1e-06, "loss": 0.3659, "mean_token_accuracy": 0.8812621831893921, "num_tokens": 627152326.0, "step": 16438 }, { "epoch": 2.091209769749396, "ewc_loss": 0.03188641741871834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1886418582871556e-05, "grad_norm": 18.61916160583496, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.8820887207984924, "num_tokens": 627189502.0, "step": 16439 }, { "epoch": 2.0913369800279864, "ewc_loss": 0.03183550760149956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1835508707445115e-05, "grad_norm": 18.537006378173828, "learning_rate": 1e-06, "loss": 0.3472, "mean_token_accuracy": 0.8892634510993958, "num_tokens": 627225438.0, "step": 16440 }, { "epoch": 2.091464190306577, "ewc_loss": 0.031897157430648804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.189715789631009e-05, "grad_norm": 18.626264572143555, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8695782423019409, "num_tokens": 627258154.0, "step": 16441 }, { "epoch": 2.0915914005851675, "ewc_loss": 0.03186934441328049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1869345548329875e-05, "grad_norm": 18.496309280395508, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8655089139938354, "num_tokens": 627301804.0, "step": 16442 }, { "epoch": 2.091718610863758, "ewc_loss": 0.03184026852250099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.184026718372479e-05, "grad_norm": 18.53887939453125, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8765698671340942, "num_tokens": 627344301.0, "step": 16443 }, { "epoch": 2.0918458211423485, "ewc_loss": 0.03189447522163391, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1894476705929264e-05, "grad_norm": 18.550844192504883, "learning_rate": 1e-06, "loss": 0.3634, "mean_token_accuracy": 0.8818244338035583, "num_tokens": 627381068.0, "step": 16444 }, { "epoch": 2.0919730314209386, "ewc_loss": 0.031889908015728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.188990740454756e-05, "grad_norm": 18.58274269104004, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8662838339805603, "num_tokens": 627423410.0, "step": 16445 }, { "epoch": 2.092100241699529, "ewc_loss": 0.03188931569457054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1889314414002e-05, "grad_norm": 18.491397857666016, "learning_rate": 1e-06, "loss": 0.3542, "mean_token_accuracy": 0.8834874629974365, "num_tokens": 627456593.0, "step": 16446 }, { "epoch": 2.0922274519781197, "ewc_loss": 0.031830400228500366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.183040098519996e-05, "grad_norm": 18.612680435180664, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8758926391601562, "num_tokens": 627493742.0, "step": 16447 }, { "epoch": 2.09235466225671, "ewc_loss": 0.03191521018743515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.191520954715088e-05, "grad_norm": 18.484010696411133, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8803728222846985, "num_tokens": 627529809.0, "step": 16448 }, { "epoch": 2.0924818725353007, "ewc_loss": 0.03178274258971214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.178274346282706e-05, "grad_norm": 18.550283432006836, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8767650127410889, "num_tokens": 627570693.0, "step": 16449 }, { "epoch": 2.0926090828138912, "ewc_loss": 0.03199174255132675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.199174170731567e-05, "grad_norm": 18.490375518798828, "learning_rate": 1e-06, "loss": 0.3692, "mean_token_accuracy": 0.8805825710296631, "num_tokens": 627607608.0, "step": 16450 }, { "epoch": 2.0927362930924818, "ewc_loss": 0.03188347816467285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1883479095995426e-05, "grad_norm": 18.606002807617188, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8719686269760132, "num_tokens": 627642102.0, "step": 16451 }, { "epoch": 2.0928635033710723, "ewc_loss": 0.03194364905357361, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1943647627485916e-05, "grad_norm": 18.49860191345215, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.868193507194519, "num_tokens": 627678838.0, "step": 16452 }, { "epoch": 2.092990713649663, "ewc_loss": 0.03189556673169136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.189556809957139e-05, "grad_norm": 18.56388282775879, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8779706954956055, "num_tokens": 627714952.0, "step": 16453 }, { "epoch": 2.0931179239282534, "ewc_loss": 0.031893447041511536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.189344715792686e-05, "grad_norm": 18.427284240722656, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8639342784881592, "num_tokens": 627754710.0, "step": 16454 }, { "epoch": 2.093245134206844, "ewc_loss": 0.03187296912074089, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.187296897522174e-05, "grad_norm": 18.567808151245117, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8777923583984375, "num_tokens": 627791504.0, "step": 16455 }, { "epoch": 2.0933723444854344, "ewc_loss": 0.03197519853711128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.197519981767982e-05, "grad_norm": 18.45984649658203, "learning_rate": 1e-06, "loss": 0.3565, "mean_token_accuracy": 0.8872560262680054, "num_tokens": 627829683.0, "step": 16456 }, { "epoch": 2.093499554764025, "ewc_loss": 0.03189975768327713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.189975905115716e-05, "grad_norm": 18.55465316772461, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8744494318962097, "num_tokens": 627869826.0, "step": 16457 }, { "epoch": 2.0936267650426155, "ewc_loss": 0.03194310516119003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.194310556864366e-05, "grad_norm": 18.464950561523438, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8703059554100037, "num_tokens": 627904165.0, "step": 16458 }, { "epoch": 2.093753975321206, "ewc_loss": 0.031847745180130005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.184774686815217e-05, "grad_norm": 18.534912109375, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8787674903869629, "num_tokens": 627936559.0, "step": 16459 }, { "epoch": 2.0938811855997965, "ewc_loss": 0.03197474032640457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.197474143235013e-05, "grad_norm": 18.50489616394043, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8780760765075684, "num_tokens": 627971742.0, "step": 16460 }, { "epoch": 2.094008395878387, "ewc_loss": 0.031921178102493286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.192117947037332e-05, "grad_norm": 18.56972885131836, "learning_rate": 1e-06, "loss": 0.3613, "mean_token_accuracy": 0.8830087184906006, "num_tokens": 628013104.0, "step": 16461 }, { "epoch": 2.0941356061569776, "ewc_loss": 0.032012343406677246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201234358130023e-05, "grad_norm": 18.56751823425293, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8706091642379761, "num_tokens": 628049378.0, "step": 16462 }, { "epoch": 2.094262816435568, "ewc_loss": 0.03190259262919426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1902593036647886e-05, "grad_norm": 18.5435848236084, "learning_rate": 1e-06, "loss": 0.3384, "mean_token_accuracy": 0.8914628028869629, "num_tokens": 628087213.0, "step": 16463 }, { "epoch": 2.0943900267141586, "ewc_loss": 0.03195640817284584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1956409657141194e-05, "grad_norm": 18.475027084350586, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8796622157096863, "num_tokens": 628122060.0, "step": 16464 }, { "epoch": 2.094517236992749, "ewc_loss": 0.031904712319374084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.190471397829242e-05, "grad_norm": 18.49917984008789, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.8767116665840149, "num_tokens": 628166554.0, "step": 16465 }, { "epoch": 2.0946444472713397, "ewc_loss": 0.03196898475289345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.196898614987731e-05, "grad_norm": 18.51354217529297, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8722931742668152, "num_tokens": 628199953.0, "step": 16466 }, { "epoch": 2.09477165754993, "ewc_loss": 0.031994469463825226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.199447019142099e-05, "grad_norm": 18.556804656982422, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8765155076980591, "num_tokens": 628237044.0, "step": 16467 }, { "epoch": 2.0948988678285207, "ewc_loss": 0.03190723434090614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1907235097605735e-05, "grad_norm": 18.459394454956055, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8761806488037109, "num_tokens": 628275773.0, "step": 16468 }, { "epoch": 2.095026078107111, "ewc_loss": 0.03201601281762123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2016014301916584e-05, "grad_norm": 18.60199737548828, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8603392839431763, "num_tokens": 628314549.0, "step": 16469 }, { "epoch": 2.0951532883857014, "ewc_loss": 0.031964290887117386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1964289519237354e-05, "grad_norm": 18.499055862426758, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8697140216827393, "num_tokens": 628349417.0, "step": 16470 }, { "epoch": 2.095280498664292, "ewc_loss": 0.03189678117632866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1896779546514153e-05, "grad_norm": 18.468385696411133, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8786966800689697, "num_tokens": 628384435.0, "step": 16471 }, { "epoch": 2.0954077089428824, "ewc_loss": 0.03197959065437317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.197959085809998e-05, "grad_norm": 18.57244300842285, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.861197829246521, "num_tokens": 628424124.0, "step": 16472 }, { "epoch": 2.095534919221473, "ewc_loss": 0.03202592581510544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.20259241561871e-05, "grad_norm": 18.52096939086914, "learning_rate": 1e-06, "loss": 0.384, "mean_token_accuracy": 0.8757908344268799, "num_tokens": 628462412.0, "step": 16473 }, { "epoch": 2.0956621295000635, "ewc_loss": 0.03196980059146881, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.19698010571301e-05, "grad_norm": 18.512489318847656, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.855073094367981, "num_tokens": 628499066.0, "step": 16474 }, { "epoch": 2.095789339778654, "ewc_loss": 0.03203631937503815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.203631786163896e-05, "grad_norm": 18.580490112304688, "learning_rate": 1e-06, "loss": 0.3526, "mean_token_accuracy": 0.887095034122467, "num_tokens": 628532379.0, "step": 16475 }, { "epoch": 2.0959165500572445, "ewc_loss": 0.032042644917964935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2042644306784496e-05, "grad_norm": 18.49102783203125, "learning_rate": 1e-06, "loss": 0.3626, "mean_token_accuracy": 0.883072555065155, "num_tokens": 628570497.0, "step": 16476 }, { "epoch": 2.096043760335835, "ewc_loss": 0.03196686506271362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1966865208232775e-05, "grad_norm": 18.56757926940918, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8791131973266602, "num_tokens": 628608298.0, "step": 16477 }, { "epoch": 2.0961709706144256, "ewc_loss": 0.03204041346907616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.204041422577575e-05, "grad_norm": 18.595130920410156, "learning_rate": 1e-06, "loss": 0.3343, "mean_token_accuracy": 0.8918837904930115, "num_tokens": 628641217.0, "step": 16478 }, { "epoch": 2.096298180893016, "ewc_loss": 0.03197600692510605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1976007448975e-05, "grad_norm": 18.45638084411621, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8746494054794312, "num_tokens": 628682480.0, "step": 16479 }, { "epoch": 2.0964253911716066, "ewc_loss": 0.03198719024658203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1987190595828e-05, "grad_norm": 18.56439208984375, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8725189566612244, "num_tokens": 628718359.0, "step": 16480 }, { "epoch": 2.096552601450197, "ewc_loss": 0.03202129900455475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.202130028512329e-05, "grad_norm": 18.556562423706055, "learning_rate": 1e-06, "loss": 0.3604, "mean_token_accuracy": 0.8815107345581055, "num_tokens": 628752214.0, "step": 16481 }, { "epoch": 2.0966798117287877, "ewc_loss": 0.03195997700095177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.195997851435095e-05, "grad_norm": 18.514638900756836, "learning_rate": 1e-06, "loss": 0.3513, "mean_token_accuracy": 0.8878436088562012, "num_tokens": 628787882.0, "step": 16482 }, { "epoch": 2.0968070220073782, "ewc_loss": 0.0319487601518631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.194875898770988e-05, "grad_norm": 18.474435806274414, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8716199994087219, "num_tokens": 628824910.0, "step": 16483 }, { "epoch": 2.0969342322859688, "ewc_loss": 0.031969401985406876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.196940087946132e-05, "grad_norm": 18.554494857788086, "learning_rate": 1e-06, "loss": 0.3522, "mean_token_accuracy": 0.8850323557853699, "num_tokens": 628860657.0, "step": 16484 }, { "epoch": 2.0970614425645593, "ewc_loss": 0.03197941929101944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.197941987309605e-05, "grad_norm": 18.509971618652344, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8580827713012695, "num_tokens": 628900619.0, "step": 16485 }, { "epoch": 2.09718865284315, "ewc_loss": 0.03198358789086342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1983588996808976e-05, "grad_norm": 18.55191993713379, "learning_rate": 1e-06, "loss": 0.3664, "mean_token_accuracy": 0.8857100009918213, "num_tokens": 628944304.0, "step": 16486 }, { "epoch": 2.0973158631217403, "ewc_loss": 0.03199963644146919, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.199963612132706e-05, "grad_norm": 18.59961700439453, "learning_rate": 1e-06, "loss": 0.3757, "mean_token_accuracy": 0.8786475658416748, "num_tokens": 628980490.0, "step": 16487 }, { "epoch": 2.097443073400331, "ewc_loss": 0.03203028813004494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.203028973075561e-05, "grad_norm": 18.63222885131836, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.8784194588661194, "num_tokens": 629011924.0, "step": 16488 }, { "epoch": 2.0975702836789214, "ewc_loss": 0.03197191655635834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1971918360795826e-05, "grad_norm": 18.55132293701172, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8497860431671143, "num_tokens": 629049514.0, "step": 16489 }, { "epoch": 2.097697493957512, "ewc_loss": 0.03194868192076683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.194868259015493e-05, "grad_norm": 18.51731300354004, "learning_rate": 1e-06, "loss": 0.3357, "mean_token_accuracy": 0.8920583128929138, "num_tokens": 629085374.0, "step": 16490 }, { "epoch": 2.0978247042361025, "ewc_loss": 0.031971342861652374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1971343560144305e-05, "grad_norm": 18.62546730041504, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.871009111404419, "num_tokens": 629126307.0, "step": 16491 }, { "epoch": 2.097951914514693, "ewc_loss": 0.032038282603025436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.203828237019479e-05, "grad_norm": 18.57905387878418, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8743773698806763, "num_tokens": 629159693.0, "step": 16492 }, { "epoch": 2.0980791247932835, "ewc_loss": 0.031939804553985596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.193980592186563e-05, "grad_norm": 18.532291412353516, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8800156116485596, "num_tokens": 629198544.0, "step": 16493 }, { "epoch": 2.0982063350718736, "ewc_loss": 0.031934816390275955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.19348146149423e-05, "grad_norm": 18.496641159057617, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8665176630020142, "num_tokens": 629240956.0, "step": 16494 }, { "epoch": 2.098333545350464, "ewc_loss": 0.03192747384309769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1927473173709586e-05, "grad_norm": 18.5487117767334, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8758708238601685, "num_tokens": 629276196.0, "step": 16495 }, { "epoch": 2.0984607556290547, "ewc_loss": 0.031996674835681915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1996674806578085e-05, "grad_norm": 18.534832000732422, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8624991178512573, "num_tokens": 629311398.0, "step": 16496 }, { "epoch": 2.098587965907645, "ewc_loss": 0.03193134069442749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1931340345181525e-05, "grad_norm": 18.50287437438965, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8695532083511353, "num_tokens": 629349488.0, "step": 16497 }, { "epoch": 2.0987151761862357, "ewc_loss": 0.03201720863580704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201720755896531e-05, "grad_norm": 18.509965896606445, "learning_rate": 1e-06, "loss": 0.359, "mean_token_accuracy": 0.886927604675293, "num_tokens": 629388023.0, "step": 16498 }, { "epoch": 2.0988423864648262, "ewc_loss": 0.03197598084807396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.197598198312335e-05, "grad_norm": 18.471343994140625, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8734803795814514, "num_tokens": 629432140.0, "step": 16499 }, { "epoch": 2.0989695967434168, "ewc_loss": 0.03199392557144165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1993924494599923e-05, "grad_norm": 18.573450088500977, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8604587912559509, "num_tokens": 629480690.0, "step": 16500 }, { "epoch": 2.0990968070220073, "ewc_loss": 0.03202568739652634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.202568768756464e-05, "grad_norm": 18.5589542388916, "learning_rate": 1e-06, "loss": 0.3443, "mean_token_accuracy": 0.8912701606750488, "num_tokens": 629522500.0, "step": 16501 }, { "epoch": 2.099224017300598, "ewc_loss": 0.031917132437229156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.191713403793983e-05, "grad_norm": 18.50823974609375, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8744632005691528, "num_tokens": 629563450.0, "step": 16502 }, { "epoch": 2.0993512275791884, "ewc_loss": 0.03196398913860321, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1963987566996366e-05, "grad_norm": 18.50658416748047, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8836562633514404, "num_tokens": 629603753.0, "step": 16503 }, { "epoch": 2.099478437857779, "ewc_loss": 0.03193746879696846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1937470339471474e-05, "grad_norm": 18.597352981567383, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8765636086463928, "num_tokens": 629637828.0, "step": 16504 }, { "epoch": 2.0996056481363694, "ewc_loss": 0.03195492550730705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.19549253617879e-05, "grad_norm": 18.584081649780273, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.882449746131897, "num_tokens": 629674636.0, "step": 16505 }, { "epoch": 2.09973285841496, "ewc_loss": 0.031901437789201736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.190143615938723e-05, "grad_norm": 18.538530349731445, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8647308945655823, "num_tokens": 629717046.0, "step": 16506 }, { "epoch": 2.0998600686935505, "ewc_loss": 0.03193797543644905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.193797601852566e-05, "grad_norm": 18.526416778564453, "learning_rate": 1e-06, "loss": 0.3579, "mean_token_accuracy": 0.8846839666366577, "num_tokens": 629750331.0, "step": 16507 }, { "epoch": 2.099987278972141, "ewc_loss": 0.031948402523994446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1948402465786785e-05, "grad_norm": 18.602169036865234, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8561943769454956, "num_tokens": 629790095.0, "step": 16508 }, { "epoch": 2.1001144892507315, "ewc_loss": 0.031950097531080246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.195009776391089e-05, "grad_norm": 18.517831802368164, "learning_rate": 1e-06, "loss": 0.3373, "mean_token_accuracy": 0.8889485597610474, "num_tokens": 629823035.0, "step": 16509 }, { "epoch": 2.100241699529322, "ewc_loss": 0.03197973594069481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1979736377252266e-05, "grad_norm": 18.62659454345703, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8696773052215576, "num_tokens": 629857096.0, "step": 16510 }, { "epoch": 2.1003689098079126, "ewc_loss": 0.03192988410592079, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.192988515365869e-05, "grad_norm": 18.55025291442871, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8739054799079895, "num_tokens": 629893448.0, "step": 16511 }, { "epoch": 2.100496120086503, "ewc_loss": 0.031889453530311584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.188945265719667e-05, "grad_norm": 18.59343910217285, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8714083433151245, "num_tokens": 629931301.0, "step": 16512 }, { "epoch": 2.1006233303650936, "ewc_loss": 0.031968504190444946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1968505936674774e-05, "grad_norm": 18.609580993652344, "learning_rate": 1e-06, "loss": 0.3632, "mean_token_accuracy": 0.8832280039787292, "num_tokens": 629963481.0, "step": 16513 }, { "epoch": 2.100750540643684, "ewc_loss": 0.03189747408032417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.189747440046631e-05, "grad_norm": 18.52649688720703, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8702267408370972, "num_tokens": 630002739.0, "step": 16514 }, { "epoch": 2.1008777509222747, "ewc_loss": 0.031942129135131836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.194213059032336e-05, "grad_norm": 18.54550552368164, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8680806756019592, "num_tokens": 630041206.0, "step": 16515 }, { "epoch": 2.101004961200865, "ewc_loss": 0.03200191259384155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.20019134960603e-05, "grad_norm": 18.551589965820312, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8602637052536011, "num_tokens": 630083811.0, "step": 16516 }, { "epoch": 2.1011321714794557, "ewc_loss": 0.03196493908762932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1964940717443824e-05, "grad_norm": 18.55640983581543, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8720653653144836, "num_tokens": 630119823.0, "step": 16517 }, { "epoch": 2.1012593817580463, "ewc_loss": 0.031985603272914886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1985604437068105e-05, "grad_norm": 18.53522491455078, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8748613595962524, "num_tokens": 630163807.0, "step": 16518 }, { "epoch": 2.1013865920366364, "ewc_loss": 0.03196415305137634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.196415491402149e-05, "grad_norm": 18.619705200195312, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8701876401901245, "num_tokens": 630203038.0, "step": 16519 }, { "epoch": 2.101513802315227, "ewc_loss": 0.03197591006755829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1975909223547205e-05, "grad_norm": 18.561565399169922, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8781232237815857, "num_tokens": 630238505.0, "step": 16520 }, { "epoch": 2.1016410125938174, "ewc_loss": 0.032000038772821426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2000039936974645e-05, "grad_norm": 18.53141212463379, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8733571767807007, "num_tokens": 630275279.0, "step": 16521 }, { "epoch": 2.101768222872408, "ewc_loss": 0.031985536217689514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.198553531547077e-05, "grad_norm": 18.519006729125977, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8712815642356873, "num_tokens": 630317288.0, "step": 16522 }, { "epoch": 2.1018954331509985, "ewc_loss": 0.03201732039451599, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201732033630833e-05, "grad_norm": 18.590423583984375, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8754091262817383, "num_tokens": 630352786.0, "step": 16523 }, { "epoch": 2.102022643429589, "ewc_loss": 0.03200574591755867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2005744287744164e-05, "grad_norm": 18.61929702758789, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8746084570884705, "num_tokens": 630391138.0, "step": 16524 }, { "epoch": 2.1021498537081795, "ewc_loss": 0.032002050429582596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2002051739254966e-05, "grad_norm": 18.613128662109375, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8627305030822754, "num_tokens": 630426567.0, "step": 16525 }, { "epoch": 2.10227706398677, "ewc_loss": 0.03193635120987892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.193634984199889e-05, "grad_norm": 18.546030044555664, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8646270036697388, "num_tokens": 630467308.0, "step": 16526 }, { "epoch": 2.1024042742653606, "ewc_loss": 0.031948287039995193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.194828605046496e-05, "grad_norm": 18.560699462890625, "learning_rate": 1e-06, "loss": 0.3658, "mean_token_accuracy": 0.8850893974304199, "num_tokens": 630503333.0, "step": 16527 }, { "epoch": 2.102531484543951, "ewc_loss": 0.031954094767570496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1954095902619883e-05, "grad_norm": 18.590295791625977, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8702200651168823, "num_tokens": 630536838.0, "step": 16528 }, { "epoch": 2.1026586948225416, "ewc_loss": 0.03192751482129097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1927513191476464e-05, "grad_norm": 18.564918518066406, "learning_rate": 1e-06, "loss": 0.3573, "mean_token_accuracy": 0.8869489431381226, "num_tokens": 630570833.0, "step": 16529 }, { "epoch": 2.102785905101132, "ewc_loss": 0.03201077878475189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201077925041318e-05, "grad_norm": 18.564041137695312, "learning_rate": 1e-06, "loss": 0.3656, "mean_token_accuracy": 0.8824689984321594, "num_tokens": 630615043.0, "step": 16530 }, { "epoch": 2.1029131153797227, "ewc_loss": 0.03192656859755516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.192656731698662e-05, "grad_norm": 18.590299606323242, "learning_rate": 1e-06, "loss": 0.3316, "mean_token_accuracy": 0.8962424397468567, "num_tokens": 630651368.0, "step": 16531 }, { "epoch": 2.1030403256583132, "ewc_loss": 0.031926412135362625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1926410883897915e-05, "grad_norm": 18.522966384887695, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.867454469203949, "num_tokens": 630690341.0, "step": 16532 }, { "epoch": 2.1031675359369038, "ewc_loss": 0.03196130692958832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.196130637661554e-05, "grad_norm": 18.62338638305664, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8727716207504272, "num_tokens": 630727168.0, "step": 16533 }, { "epoch": 2.1032947462154943, "ewc_loss": 0.031975191086530685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.197519254172221e-05, "grad_norm": 18.452817916870117, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8786156177520752, "num_tokens": 630766737.0, "step": 16534 }, { "epoch": 2.103421956494085, "ewc_loss": 0.03195152431726456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.195152385160327e-05, "grad_norm": 18.66288948059082, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8718628287315369, "num_tokens": 630803721.0, "step": 16535 }, { "epoch": 2.1035491667726753, "ewc_loss": 0.03204674646258354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2046747946878895e-05, "grad_norm": 18.524646759033203, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8799314498901367, "num_tokens": 630843697.0, "step": 16536 }, { "epoch": 2.103676377051266, "ewc_loss": 0.031914856284856796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.191485666320659e-05, "grad_norm": 18.67513656616211, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.87554532289505, "num_tokens": 630874325.0, "step": 16537 }, { "epoch": 2.1038035873298564, "ewc_loss": 0.03199728578329086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1997285987017676e-05, "grad_norm": 18.523822784423828, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8708498477935791, "num_tokens": 630911456.0, "step": 16538 }, { "epoch": 2.103930797608447, "ewc_loss": 0.03189880773425102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1898805900709704e-05, "grad_norm": 18.576602935791016, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8725718259811401, "num_tokens": 630954418.0, "step": 16539 }, { "epoch": 2.1040580078870375, "ewc_loss": 0.0320061594247818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.200615901732817e-05, "grad_norm": 18.613332748413086, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8742156028747559, "num_tokens": 630991826.0, "step": 16540 }, { "epoch": 2.104185218165628, "ewc_loss": 0.031948767602443695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1948766263667494e-05, "grad_norm": 18.545076370239258, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8752249479293823, "num_tokens": 631025831.0, "step": 16541 }, { "epoch": 2.1043124284442185, "ewc_loss": 0.03196481242775917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1964813388185576e-05, "grad_norm": 18.546350479125977, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8592554330825806, "num_tokens": 631062896.0, "step": 16542 }, { "epoch": 2.1044396387228086, "ewc_loss": 0.03194575011730194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1945750379236415e-05, "grad_norm": 18.60824203491211, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.8811364769935608, "num_tokens": 631099291.0, "step": 16543 }, { "epoch": 2.104566849001399, "ewc_loss": 0.03197147324681282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.197147452738136e-05, "grad_norm": 18.54586410522461, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8664925694465637, "num_tokens": 631141570.0, "step": 16544 }, { "epoch": 2.1046940592799896, "ewc_loss": 0.03198211267590523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.19821119774133e-05, "grad_norm": 18.552505493164062, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8667263984680176, "num_tokens": 631178882.0, "step": 16545 }, { "epoch": 2.10482126955858, "ewc_loss": 0.0319712869822979, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.197128535248339e-05, "grad_norm": 18.5299129486084, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8716210722923279, "num_tokens": 631218059.0, "step": 16546 }, { "epoch": 2.1049484798371707, "ewc_loss": 0.032025087624788284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.202508742106147e-05, "grad_norm": 18.597890853881836, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8751629590988159, "num_tokens": 631252827.0, "step": 16547 }, { "epoch": 2.1050756901157612, "ewc_loss": 0.03201870992779732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201871004421264e-05, "grad_norm": 18.61764907836914, "learning_rate": 1e-06, "loss": 0.3647, "mean_token_accuracy": 0.8804929852485657, "num_tokens": 631294370.0, "step": 16548 }, { "epoch": 2.1052029003943518, "ewc_loss": 0.032037414610385895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.20374128932599e-05, "grad_norm": 18.588438034057617, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8714252710342407, "num_tokens": 631331091.0, "step": 16549 }, { "epoch": 2.1053301106729423, "ewc_loss": 0.031993765383958817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.199376442353241e-05, "grad_norm": 18.652009963989258, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8773930072784424, "num_tokens": 631367285.0, "step": 16550 }, { "epoch": 2.105457320951533, "ewc_loss": 0.032013896852731705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201389699825086e-05, "grad_norm": 18.51470375061035, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8608443737030029, "num_tokens": 631402471.0, "step": 16551 }, { "epoch": 2.1055845312301233, "ewc_loss": 0.03193091228604317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.193091106368229e-05, "grad_norm": 18.659591674804688, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8745155930519104, "num_tokens": 631439706.0, "step": 16552 }, { "epoch": 2.105711741508714, "ewc_loss": 0.032050516456365585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.205051689292304e-05, "grad_norm": 18.546337127685547, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8793879151344299, "num_tokens": 631471437.0, "step": 16553 }, { "epoch": 2.1058389517873044, "ewc_loss": 0.03195330500602722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1953306461218745e-05, "grad_norm": 18.512380599975586, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8712315559387207, "num_tokens": 631515028.0, "step": 16554 }, { "epoch": 2.105966162065895, "ewc_loss": 0.03201346471905708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201346407877281e-05, "grad_norm": 18.612258911132812, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8690202832221985, "num_tokens": 631552946.0, "step": 16555 }, { "epoch": 2.1060933723444855, "ewc_loss": 0.03203006461262703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.203006417606957e-05, "grad_norm": 18.574581146240234, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8790223002433777, "num_tokens": 631596045.0, "step": 16556 }, { "epoch": 2.106220582623076, "ewc_loss": 0.03191453963518143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1914540159050375e-05, "grad_norm": 18.553922653198242, "learning_rate": 1e-06, "loss": 0.3673, "mean_token_accuracy": 0.8823043704032898, "num_tokens": 631630233.0, "step": 16557 }, { "epoch": 2.1063477929016665, "ewc_loss": 0.032035987824201584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.203598680556752e-05, "grad_norm": 18.576608657836914, "learning_rate": 1e-06, "loss": 0.3724, "mean_token_accuracy": 0.8807395100593567, "num_tokens": 631666665.0, "step": 16558 }, { "epoch": 2.106475003180257, "ewc_loss": 0.0319858118891716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.198581180186011e-05, "grad_norm": 18.575468063354492, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8702126741409302, "num_tokens": 631709536.0, "step": 16559 }, { "epoch": 2.1066022134588476, "ewc_loss": 0.031977586448192596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1977586331777275e-05, "grad_norm": 18.581920623779297, "learning_rate": 1e-06, "loss": 0.343, "mean_token_accuracy": 0.8893181085586548, "num_tokens": 631744333.0, "step": 16560 }, { "epoch": 2.106729423737438, "ewc_loss": 0.03199775889515877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.19977589242626e-05, "grad_norm": 18.580204010009766, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8598703145980835, "num_tokens": 631783522.0, "step": 16561 }, { "epoch": 2.1068566340160286, "ewc_loss": 0.03197234123945236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.197234036633745e-05, "grad_norm": 18.544706344604492, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8582358360290527, "num_tokens": 631822579.0, "step": 16562 }, { "epoch": 2.106983844294619, "ewc_loss": 0.03194912523031235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1949126423569396e-05, "grad_norm": 18.529891967773438, "learning_rate": 1e-06, "loss": 0.3711, "mean_token_accuracy": 0.8806951642036438, "num_tokens": 631862642.0, "step": 16563 }, { "epoch": 2.1071110545732097, "ewc_loss": 0.032045576721429825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.204557651770301e-05, "grad_norm": 18.550004959106445, "learning_rate": 1e-06, "loss": 0.3604, "mean_token_accuracy": 0.8844281435012817, "num_tokens": 631903161.0, "step": 16564 }, { "epoch": 2.1072382648518, "ewc_loss": 0.03196573257446289, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.196573379682377e-05, "grad_norm": 18.566476821899414, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8772138357162476, "num_tokens": 631938149.0, "step": 16565 }, { "epoch": 2.1073654751303907, "ewc_loss": 0.03199858218431473, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1998581107473e-05, "grad_norm": 18.573026657104492, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8758913278579712, "num_tokens": 631974717.0, "step": 16566 }, { "epoch": 2.107492685408981, "ewc_loss": 0.03198451176285744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.198451304342598e-05, "grad_norm": 18.589834213256836, "learning_rate": 1e-06, "loss": 0.3777, "mean_token_accuracy": 0.8780702948570251, "num_tokens": 632011147.0, "step": 16567 }, { "epoch": 2.1076198956875714, "ewc_loss": 0.031960614025592804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.196061516064219e-05, "grad_norm": 18.46482276916504, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8805755376815796, "num_tokens": 632051915.0, "step": 16568 }, { "epoch": 2.107747105966162, "ewc_loss": 0.03197984769940376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1979849154595286e-05, "grad_norm": 18.64247703552246, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8659611940383911, "num_tokens": 632083047.0, "step": 16569 }, { "epoch": 2.1078743162447524, "ewc_loss": 0.03203808143734932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.20380822813604e-05, "grad_norm": 18.539932250976562, "learning_rate": 1e-06, "loss": 0.359, "mean_token_accuracy": 0.8863523602485657, "num_tokens": 632119250.0, "step": 16570 }, { "epoch": 2.108001526523343, "ewc_loss": 0.031938083469867706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.193808515788987e-05, "grad_norm": 18.5076904296875, "learning_rate": 1e-06, "loss": 0.3714, "mean_token_accuracy": 0.880710780620575, "num_tokens": 632152696.0, "step": 16571 }, { "epoch": 2.1081287368019335, "ewc_loss": 0.032051656395196915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.205165558028966e-05, "grad_norm": 18.60795021057129, "learning_rate": 1e-06, "loss": 0.3332, "mean_token_accuracy": 0.8946349024772644, "num_tokens": 632188167.0, "step": 16572 }, { "epoch": 2.108255947080524, "ewc_loss": 0.03203406184911728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2034062314778566e-05, "grad_norm": 18.592126846313477, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8729732036590576, "num_tokens": 632229567.0, "step": 16573 }, { "epoch": 2.1083831573591145, "ewc_loss": 0.03196088224649429, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.196088073309511e-05, "grad_norm": 18.577905654907227, "learning_rate": 1e-06, "loss": 0.3564, "mean_token_accuracy": 0.883979082107544, "num_tokens": 632266870.0, "step": 16574 }, { "epoch": 2.108510367637705, "ewc_loss": 0.032003313302993774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.200331411790103e-05, "grad_norm": 18.613834381103516, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8742930889129639, "num_tokens": 632303844.0, "step": 16575 }, { "epoch": 2.1086375779162956, "ewc_loss": 0.032040003687143326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.204000313417055e-05, "grad_norm": 18.540483474731445, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8771502375602722, "num_tokens": 632339343.0, "step": 16576 }, { "epoch": 2.108764788194886, "ewc_loss": 0.032021861523389816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.202186053385958e-05, "grad_norm": 18.61646270751953, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8799068927764893, "num_tokens": 632385516.0, "step": 16577 }, { "epoch": 2.1088919984734766, "ewc_loss": 0.032036494463682175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.203649612260051e-05, "grad_norm": 18.62017250061035, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.879589319229126, "num_tokens": 632420664.0, "step": 16578 }, { "epoch": 2.109019208752067, "ewc_loss": 0.03191404417157173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.191404539393261e-05, "grad_norm": 18.512577056884766, "learning_rate": 1e-06, "loss": 0.372, "mean_token_accuracy": 0.8799731731414795, "num_tokens": 632458082.0, "step": 16579 }, { "epoch": 2.1091464190306577, "ewc_loss": 0.031904853880405426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.190485222148709e-05, "grad_norm": 18.54559898376465, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8630484938621521, "num_tokens": 632495963.0, "step": 16580 }, { "epoch": 2.109273629309248, "ewc_loss": 0.031963277608156204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.196327816112898e-05, "grad_norm": 18.551698684692383, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8764535188674927, "num_tokens": 632529968.0, "step": 16581 }, { "epoch": 2.1094008395878387, "ewc_loss": 0.03191511705517769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.19151185976807e-05, "grad_norm": 18.55626678466797, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8830263614654541, "num_tokens": 632567527.0, "step": 16582 }, { "epoch": 2.1095280498664293, "ewc_loss": 0.03191252425312996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1912524718791246e-05, "grad_norm": 18.597694396972656, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8731735944747925, "num_tokens": 632606192.0, "step": 16583 }, { "epoch": 2.10965526014502, "ewc_loss": 0.03200279176235199, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2002790248952806e-05, "grad_norm": 18.60069465637207, "learning_rate": 1e-06, "loss": 0.3668, "mean_token_accuracy": 0.8851287364959717, "num_tokens": 632645940.0, "step": 16584 }, { "epoch": 2.1097824704236103, "ewc_loss": 0.03191877901554108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.191877840436064e-05, "grad_norm": 18.53652572631836, "learning_rate": 1e-06, "loss": 0.3641, "mean_token_accuracy": 0.8830941319465637, "num_tokens": 632689385.0, "step": 16585 }, { "epoch": 2.109909680702201, "ewc_loss": 0.03198482096195221, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.198482227162458e-05, "grad_norm": 18.654062271118164, "learning_rate": 1e-06, "loss": 0.362, "mean_token_accuracy": 0.8819898366928101, "num_tokens": 632720178.0, "step": 16586 }, { "epoch": 2.1100368909807914, "ewc_loss": 0.03194161504507065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.194161399733275e-05, "grad_norm": 18.63460350036621, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8671282529830933, "num_tokens": 632763249.0, "step": 16587 }, { "epoch": 2.110164101259382, "ewc_loss": 0.031941208988428116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1941210181685165e-05, "grad_norm": 18.540651321411133, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8567065000534058, "num_tokens": 632803293.0, "step": 16588 }, { "epoch": 2.1102913115379724, "ewc_loss": 0.031914930790662766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1914929422782734e-05, "grad_norm": 18.70494842529297, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.865777850151062, "num_tokens": 632838731.0, "step": 16589 }, { "epoch": 2.110418521816563, "ewc_loss": 0.03197932615876198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1979325285647064e-05, "grad_norm": 18.57642364501953, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8660111427307129, "num_tokens": 632875044.0, "step": 16590 }, { "epoch": 2.1105457320951535, "ewc_loss": 0.03186841309070587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.186841422575526e-05, "grad_norm": 18.595325469970703, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.854374349117279, "num_tokens": 632911330.0, "step": 16591 }, { "epoch": 2.1106729423737436, "ewc_loss": 0.031969815492630005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.196981560904533e-05, "grad_norm": 18.58249282836914, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8644448518753052, "num_tokens": 632952122.0, "step": 16592 }, { "epoch": 2.110800152652334, "ewc_loss": 0.031938910484313965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.193891097907908e-05, "grad_norm": 18.49480438232422, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8571768999099731, "num_tokens": 632985602.0, "step": 16593 }, { "epoch": 2.1109273629309246, "ewc_loss": 0.031951893121004105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.195189492544159e-05, "grad_norm": 18.621458053588867, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8753520250320435, "num_tokens": 633023792.0, "step": 16594 }, { "epoch": 2.111054573209515, "ewc_loss": 0.031997762620449066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1997762562241405e-05, "grad_norm": 18.578609466552734, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8755031824111938, "num_tokens": 633065457.0, "step": 16595 }, { "epoch": 2.1111817834881057, "ewc_loss": 0.03193533793091774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.193533848389052e-05, "grad_norm": 18.556074142456055, "learning_rate": 1e-06, "loss": 0.3531, "mean_token_accuracy": 0.8837389945983887, "num_tokens": 633101402.0, "step": 16596 }, { "epoch": 2.1113089937666962, "ewc_loss": 0.03201448917388916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201448998879641e-05, "grad_norm": 18.665800094604492, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8658029437065125, "num_tokens": 633147002.0, "step": 16597 }, { "epoch": 2.1114362040452868, "ewc_loss": 0.03194432333111763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1944324291544035e-05, "grad_norm": 18.512758255004883, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8742207884788513, "num_tokens": 633184209.0, "step": 16598 }, { "epoch": 2.1115634143238773, "ewc_loss": 0.03194429725408554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1944298825692385e-05, "grad_norm": 18.62163734436035, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8792997598648071, "num_tokens": 633224525.0, "step": 16599 }, { "epoch": 2.111690624602468, "ewc_loss": 0.03204389661550522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2043895771494135e-05, "grad_norm": 18.5627384185791, "learning_rate": 1e-06, "loss": 0.3477, "mean_token_accuracy": 0.8893876671791077, "num_tokens": 633253171.0, "step": 16600 }, { "epoch": 2.1118178348810583, "ewc_loss": 0.03202718123793602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2027182896854356e-05, "grad_norm": 18.623247146606445, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8717986345291138, "num_tokens": 633294326.0, "step": 16601 }, { "epoch": 2.111945045159649, "ewc_loss": 0.032062724232673645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.206272594979964e-05, "grad_norm": 18.552261352539062, "learning_rate": 1e-06, "loss": 0.3585, "mean_token_accuracy": 0.8875321745872498, "num_tokens": 633327236.0, "step": 16602 }, { "epoch": 2.1120722554382394, "ewc_loss": 0.03198113664984703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1981136999092996e-05, "grad_norm": 18.615331649780273, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.8849489688873291, "num_tokens": 633368593.0, "step": 16603 }, { "epoch": 2.11219946571683, "ewc_loss": 0.032093342393636703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2093343179440126e-05, "grad_norm": 18.611209869384766, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8537454009056091, "num_tokens": 633402598.0, "step": 16604 }, { "epoch": 2.1123266759954205, "ewc_loss": 0.032029539346694946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.202954030712135e-05, "grad_norm": 18.635787963867188, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8696173429489136, "num_tokens": 633437069.0, "step": 16605 }, { "epoch": 2.112453886274011, "ewc_loss": 0.03197593241930008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.197593105142005e-05, "grad_norm": 18.510196685791016, "learning_rate": 1e-06, "loss": 0.3873, "mean_token_accuracy": 0.8766904473304749, "num_tokens": 633479752.0, "step": 16606 }, { "epoch": 2.1125810965526015, "ewc_loss": 0.031963665038347244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1963663786882535e-05, "grad_norm": 18.642301559448242, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8704003691673279, "num_tokens": 633516949.0, "step": 16607 }, { "epoch": 2.112708306831192, "ewc_loss": 0.0320451557636261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.204515451216139e-05, "grad_norm": 18.54059410095215, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.8790172934532166, "num_tokens": 633547511.0, "step": 16608 }, { "epoch": 2.1128355171097826, "ewc_loss": 0.03197963163256645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.197963087586686e-05, "grad_norm": 18.593441009521484, "learning_rate": 1e-06, "loss": 0.3712, "mean_token_accuracy": 0.8762468099594116, "num_tokens": 633583940.0, "step": 16609 }, { "epoch": 2.112962727388373, "ewc_loss": 0.0320717953145504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.207179543096572e-05, "grad_norm": 18.621976852416992, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8644756078720093, "num_tokens": 633626245.0, "step": 16610 }, { "epoch": 2.1130899376669636, "ewc_loss": 0.03202242776751518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.202242805855349e-05, "grad_norm": 18.54156494140625, "learning_rate": 1e-06, "loss": 0.3811, "mean_token_accuracy": 0.8776459693908691, "num_tokens": 633662889.0, "step": 16611 }, { "epoch": 2.113217147945554, "ewc_loss": 0.03204480931162834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2044808904174715e-05, "grad_norm": 18.632116317749023, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8702250123023987, "num_tokens": 633701461.0, "step": 16612 }, { "epoch": 2.1133443582241447, "ewc_loss": 0.032002370804548264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.200237188138999e-05, "grad_norm": 18.53084373474121, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8757332563400269, "num_tokens": 633741302.0, "step": 16613 }, { "epoch": 2.113471568502735, "ewc_loss": 0.03203025832772255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.203025698894635e-05, "grad_norm": 18.61264991760254, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8802264928817749, "num_tokens": 633780337.0, "step": 16614 }, { "epoch": 2.1135987787813257, "ewc_loss": 0.032052140682935715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2052139431471005e-05, "grad_norm": 18.597272872924805, "learning_rate": 1e-06, "loss": 0.3714, "mean_token_accuracy": 0.8808388710021973, "num_tokens": 633814450.0, "step": 16615 }, { "epoch": 2.1137259890599163, "ewc_loss": 0.03197919577360153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.197919431841001e-05, "grad_norm": 18.5604248046875, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8673796653747559, "num_tokens": 633856252.0, "step": 16616 }, { "epoch": 2.1138531993385064, "ewc_loss": 0.031985990703105927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1985990062821656e-05, "grad_norm": 18.602373123168945, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8633519411087036, "num_tokens": 633895235.0, "step": 16617 }, { "epoch": 2.113980409617097, "ewc_loss": 0.0320754311978817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2075429771794006e-05, "grad_norm": 18.619850158691406, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8763985633850098, "num_tokens": 633938948.0, "step": 16618 }, { "epoch": 2.1141076198956874, "ewc_loss": 0.03204229101538658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.204229142284021e-05, "grad_norm": 18.58669662475586, "learning_rate": 1e-06, "loss": 0.3573, "mean_token_accuracy": 0.8806583881378174, "num_tokens": 633973393.0, "step": 16619 }, { "epoch": 2.114234830174278, "ewc_loss": 0.03201436996459961, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201436993549578e-05, "grad_norm": 18.618288040161133, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8789368271827698, "num_tokens": 634006714.0, "step": 16620 }, { "epoch": 2.1143620404528685, "ewc_loss": 0.03205922991037369, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.205922985216603e-05, "grad_norm": 18.573766708374023, "learning_rate": 1e-06, "loss": 0.38, "mean_token_accuracy": 0.8797189593315125, "num_tokens": 634046104.0, "step": 16621 }, { "epoch": 2.114489250731459, "ewc_loss": 0.03197462484240532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.19746250170283e-05, "grad_norm": 18.5275936126709, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8668925166130066, "num_tokens": 634086401.0, "step": 16622 }, { "epoch": 2.1146164610100495, "ewc_loss": 0.032030850648880005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2030849979491904e-05, "grad_norm": 18.639259338378906, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8664040565490723, "num_tokens": 634125479.0, "step": 16623 }, { "epoch": 2.11474367128864, "ewc_loss": 0.03206836059689522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.206836117897183e-05, "grad_norm": 18.571979522705078, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8562582731246948, "num_tokens": 634163873.0, "step": 16624 }, { "epoch": 2.1148708815672306, "ewc_loss": 0.031978461891412735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1978463084669784e-05, "grad_norm": 18.568628311157227, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8724595308303833, "num_tokens": 634200533.0, "step": 16625 }, { "epoch": 2.114998091845821, "ewc_loss": 0.03203704208135605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2037041819421574e-05, "grad_norm": 18.584827423095703, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8668092489242554, "num_tokens": 634235891.0, "step": 16626 }, { "epoch": 2.1151253021244116, "ewc_loss": 0.03198574483394623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.198574631824158e-05, "grad_norm": 18.6007080078125, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8728058934211731, "num_tokens": 634266748.0, "step": 16627 }, { "epoch": 2.115252512403002, "ewc_loss": 0.032084569334983826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208456837455742e-05, "grad_norm": 18.609970092773438, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8628072738647461, "num_tokens": 634302603.0, "step": 16628 }, { "epoch": 2.1153797226815927, "ewc_loss": 0.03204965218901634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2049651053966954e-05, "grad_norm": 18.544641494750977, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8767679333686829, "num_tokens": 634345471.0, "step": 16629 }, { "epoch": 2.115506932960183, "ewc_loss": 0.03199782595038414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1997824407881126e-05, "grad_norm": 18.499969482421875, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8729645609855652, "num_tokens": 634383902.0, "step": 16630 }, { "epoch": 2.1156341432387737, "ewc_loss": 0.03205037862062454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.205037864972837e-05, "grad_norm": 18.55738639831543, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.868399977684021, "num_tokens": 634421082.0, "step": 16631 }, { "epoch": 2.1157613535173643, "ewc_loss": 0.032047562301158905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2047562854131684e-05, "grad_norm": 18.602819442749023, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8685236573219299, "num_tokens": 634456490.0, "step": 16632 }, { "epoch": 2.115888563795955, "ewc_loss": 0.03210247680544853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.210247814422473e-05, "grad_norm": 18.52181625366211, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8553178310394287, "num_tokens": 634498252.0, "step": 16633 }, { "epoch": 2.1160157740745453, "ewc_loss": 0.032013919204473495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.20139188261237e-05, "grad_norm": 18.506053924560547, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.8816207051277161, "num_tokens": 634540769.0, "step": 16634 }, { "epoch": 2.116142984353136, "ewc_loss": 0.032083600759506226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2083600672194734e-05, "grad_norm": 18.584707260131836, "learning_rate": 1e-06, "loss": 0.3627, "mean_token_accuracy": 0.8813270330429077, "num_tokens": 634573807.0, "step": 16635 }, { "epoch": 2.1162701946317264, "ewc_loss": 0.03207378089427948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2073781767394394e-05, "grad_norm": 18.618505477905273, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8698158264160156, "num_tokens": 634609542.0, "step": 16636 }, { "epoch": 2.116397404910317, "ewc_loss": 0.03209257125854492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.209257192793302e-05, "grad_norm": 18.494600296020508, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.8822833895683289, "num_tokens": 634648641.0, "step": 16637 }, { "epoch": 2.1165246151889074, "ewc_loss": 0.032079800963401794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.207980262232013e-05, "grad_norm": 18.65489387512207, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8719632625579834, "num_tokens": 634685531.0, "step": 16638 }, { "epoch": 2.116651825467498, "ewc_loss": 0.032191451638936996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2191452191909775e-05, "grad_norm": 18.60625648498535, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8758237361907959, "num_tokens": 634717239.0, "step": 16639 }, { "epoch": 2.116779035746088, "ewc_loss": 0.03204251825809479, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.204251697752625e-05, "grad_norm": 18.616941452026367, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.871475875377655, "num_tokens": 634750058.0, "step": 16640 }, { "epoch": 2.1169062460246786, "ewc_loss": 0.03214993700385094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2149935577763245e-05, "grad_norm": 18.524940490722656, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8679525256156921, "num_tokens": 634793480.0, "step": 16641 }, { "epoch": 2.117033456303269, "ewc_loss": 0.03205182030797005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.205181928933598e-05, "grad_norm": 18.554561614990234, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8638163805007935, "num_tokens": 634829923.0, "step": 16642 }, { "epoch": 2.1171606665818596, "ewc_loss": 0.03216861933469772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216862023691647e-05, "grad_norm": 18.542078018188477, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8694109916687012, "num_tokens": 634868499.0, "step": 16643 }, { "epoch": 2.11728787686045, "ewc_loss": 0.03215942904353142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2159430702449754e-05, "grad_norm": 18.619028091430664, "learning_rate": 1e-06, "loss": 0.3647, "mean_token_accuracy": 0.8820455074310303, "num_tokens": 634905922.0, "step": 16644 }, { "epoch": 2.1174150871390407, "ewc_loss": 0.0321965366601944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219653808628209e-05, "grad_norm": 18.584074020385742, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8744479417800903, "num_tokens": 634946190.0, "step": 16645 }, { "epoch": 2.1175422974176312, "ewc_loss": 0.032101795077323914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2101794204209e-05, "grad_norm": 18.503353118896484, "learning_rate": 1e-06, "loss": 0.3526, "mean_token_accuracy": 0.886318564414978, "num_tokens": 634981840.0, "step": 16646 }, { "epoch": 2.1176695076962218, "ewc_loss": 0.03214659541845322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2146595913218334e-05, "grad_norm": 18.53005599975586, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8712915182113647, "num_tokens": 635024755.0, "step": 16647 }, { "epoch": 2.1177967179748123, "ewc_loss": 0.032196663320064545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219666177756153e-05, "grad_norm": 18.562910079956055, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8779106736183167, "num_tokens": 635063076.0, "step": 16648 }, { "epoch": 2.117923928253403, "ewc_loss": 0.032167620956897736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216761979274452e-05, "grad_norm": 18.54962921142578, "learning_rate": 1e-06, "loss": 0.3528, "mean_token_accuracy": 0.8862016201019287, "num_tokens": 635103052.0, "step": 16649 }, { "epoch": 2.1180511385319933, "ewc_loss": 0.03216162696480751, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216162804164924e-05, "grad_norm": 18.56181526184082, "learning_rate": 1e-06, "loss": 0.3675, "mean_token_accuracy": 0.8830478191375732, "num_tokens": 635140917.0, "step": 16650 }, { "epoch": 2.118178348810584, "ewc_loss": 0.032121848315000534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212184674339369e-05, "grad_norm": 18.565582275390625, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8787456750869751, "num_tokens": 635182049.0, "step": 16651 }, { "epoch": 2.1183055590891744, "ewc_loss": 0.03213522583246231, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.213522722944617e-05, "grad_norm": 18.55486488342285, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8696745038032532, "num_tokens": 635217613.0, "step": 16652 }, { "epoch": 2.118432769367765, "ewc_loss": 0.03211416304111481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211416333215311e-05, "grad_norm": 18.52275276184082, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.88270103931427, "num_tokens": 635252716.0, "step": 16653 }, { "epoch": 2.1185599796463555, "ewc_loss": 0.03211800754070282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211800867575221e-05, "grad_norm": 18.62442398071289, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8778843879699707, "num_tokens": 635288919.0, "step": 16654 }, { "epoch": 2.118687189924946, "ewc_loss": 0.03213898092508316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.213898162357509e-05, "grad_norm": 18.61380958557129, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8695893287658691, "num_tokens": 635329781.0, "step": 16655 }, { "epoch": 2.1188144002035365, "ewc_loss": 0.03202425688505173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2024257961893454e-05, "grad_norm": 18.56639862060547, "learning_rate": 1e-06, "loss": 0.3624, "mean_token_accuracy": 0.8841491937637329, "num_tokens": 635363844.0, "step": 16656 }, { "epoch": 2.118941610482127, "ewc_loss": 0.032103508710861206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.210350769222714e-05, "grad_norm": 18.602502822875977, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8714897632598877, "num_tokens": 635397568.0, "step": 16657 }, { "epoch": 2.1190688207607176, "ewc_loss": 0.03212045133113861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2120449759531766e-05, "grad_norm": 18.629629135131836, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8588376641273499, "num_tokens": 635432903.0, "step": 16658 }, { "epoch": 2.119196031039308, "ewc_loss": 0.03210835158824921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.210834984201938e-05, "grad_norm": 18.53285026550293, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8766276836395264, "num_tokens": 635473388.0, "step": 16659 }, { "epoch": 2.1193232413178986, "ewc_loss": 0.032007474452257156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.200747596565634e-05, "grad_norm": 18.597562789916992, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8774868845939636, "num_tokens": 635512966.0, "step": 16660 }, { "epoch": 2.119450451596489, "ewc_loss": 0.03217024356126785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217024277546443e-05, "grad_norm": 18.606557846069336, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8615167737007141, "num_tokens": 635547814.0, "step": 16661 }, { "epoch": 2.1195776618750797, "ewc_loss": 0.032043930143117905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.20439285133034e-05, "grad_norm": 18.56647300720215, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.864928126335144, "num_tokens": 635583120.0, "step": 16662 }, { "epoch": 2.11970487215367, "ewc_loss": 0.03206118941307068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.206119072274305e-05, "grad_norm": 18.56438636779785, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8694407939910889, "num_tokens": 635621531.0, "step": 16663 }, { "epoch": 2.1198320824322607, "ewc_loss": 0.032086942344903946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208694397471845e-05, "grad_norm": 18.58242416381836, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8643720149993896, "num_tokens": 635659641.0, "step": 16664 }, { "epoch": 2.119959292710851, "ewc_loss": 0.03208637982606888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208638008800335e-05, "grad_norm": 18.55478286743164, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8697762489318848, "num_tokens": 635697953.0, "step": 16665 }, { "epoch": 2.1200865029894413, "ewc_loss": 0.0321299284696579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212993033230305e-05, "grad_norm": 18.586254119873047, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8488454818725586, "num_tokens": 635742519.0, "step": 16666 }, { "epoch": 2.120213713268032, "ewc_loss": 0.032135847955942154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2135849323822185e-05, "grad_norm": 18.6236629486084, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8810266256332397, "num_tokens": 635777786.0, "step": 16667 }, { "epoch": 2.1203409235466224, "ewc_loss": 0.032102178782224655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.210217982996255e-05, "grad_norm": 18.5660457611084, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8755932450294495, "num_tokens": 635814490.0, "step": 16668 }, { "epoch": 2.120468133825213, "ewc_loss": 0.032080356031656265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208035559509881e-05, "grad_norm": 18.55784034729004, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8616134524345398, "num_tokens": 635858351.0, "step": 16669 }, { "epoch": 2.1205953441038035, "ewc_loss": 0.032095011323690414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.209501301171258e-05, "grad_norm": 18.6417236328125, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8711204528808594, "num_tokens": 635895585.0, "step": 16670 }, { "epoch": 2.120722554382394, "ewc_loss": 0.03205377236008644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.205377288395539e-05, "grad_norm": 18.552223205566406, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.880730152130127, "num_tokens": 635933591.0, "step": 16671 }, { "epoch": 2.1208497646609845, "ewc_loss": 0.03206101432442665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.206101609976031e-05, "grad_norm": 18.605512619018555, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8701820969581604, "num_tokens": 635972040.0, "step": 16672 }, { "epoch": 2.120976974939575, "ewc_loss": 0.03212592378258705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212592491763644e-05, "grad_norm": 18.606155395507812, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.877313494682312, "num_tokens": 636013287.0, "step": 16673 }, { "epoch": 2.1211041852181656, "ewc_loss": 0.032133057713508606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2133058994077146e-05, "grad_norm": 18.648128509521484, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8679229021072388, "num_tokens": 636046137.0, "step": 16674 }, { "epoch": 2.121231395496756, "ewc_loss": 0.032112814486026764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211281364201568e-05, "grad_norm": 18.57135581970215, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8628266453742981, "num_tokens": 636088870.0, "step": 16675 }, { "epoch": 2.1213586057753466, "ewc_loss": 0.03204679116606712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.204679160262458e-05, "grad_norm": 18.650508880615234, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8835679292678833, "num_tokens": 636130519.0, "step": 16676 }, { "epoch": 2.121485816053937, "ewc_loss": 0.032150689512491226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.215068863937631e-05, "grad_norm": 18.625232696533203, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8766763806343079, "num_tokens": 636165641.0, "step": 16677 }, { "epoch": 2.1216130263325277, "ewc_loss": 0.03201789781451225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201789877493866e-05, "grad_norm": 18.554027557373047, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.861091136932373, "num_tokens": 636203811.0, "step": 16678 }, { "epoch": 2.121740236611118, "ewc_loss": 0.032009102404117584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.200910214218311e-05, "grad_norm": 18.608474731445312, "learning_rate": 1e-06, "loss": 0.3613, "mean_token_accuracy": 0.8854667544364929, "num_tokens": 636239503.0, "step": 16679 }, { "epoch": 2.1218674468897087, "ewc_loss": 0.03208104148507118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208104317309335e-05, "grad_norm": 18.559385299682617, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.877153217792511, "num_tokens": 636278994.0, "step": 16680 }, { "epoch": 2.1219946571682993, "ewc_loss": 0.03208222985267639, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208222915418446e-05, "grad_norm": 18.673416137695312, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8695124983787537, "num_tokens": 636318025.0, "step": 16681 }, { "epoch": 2.12212186744689, "ewc_loss": 0.03211143612861633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211143484804779e-05, "grad_norm": 18.5562744140625, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8756020069122314, "num_tokens": 636357289.0, "step": 16682 }, { "epoch": 2.1222490777254803, "ewc_loss": 0.031992357224226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.199235652573407e-05, "grad_norm": 18.591495513916016, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8585577607154846, "num_tokens": 636399067.0, "step": 16683 }, { "epoch": 2.122376288004071, "ewc_loss": 0.032106466591358185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2106465368997306e-05, "grad_norm": 18.62742805480957, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.8786924481391907, "num_tokens": 636442418.0, "step": 16684 }, { "epoch": 2.1225034982826614, "ewc_loss": 0.03200444579124451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.200444552931003e-05, "grad_norm": 18.63378143310547, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8620433807373047, "num_tokens": 636484273.0, "step": 16685 }, { "epoch": 2.122630708561252, "ewc_loss": 0.032056767493486404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2056766940513626e-05, "grad_norm": 18.5760555267334, "learning_rate": 1e-06, "loss": 0.3777, "mean_token_accuracy": 0.8834500312805176, "num_tokens": 636525675.0, "step": 16686 }, { "epoch": 2.1227579188398424, "ewc_loss": 0.03200135752558708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.200135688530281e-05, "grad_norm": 18.67980194091797, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8726516962051392, "num_tokens": 636559652.0, "step": 16687 }, { "epoch": 2.122885129118433, "ewc_loss": 0.03201469033956528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.20146900776308e-05, "grad_norm": 18.561511993408203, "learning_rate": 1e-06, "loss": 0.3418, "mean_token_accuracy": 0.8921890258789062, "num_tokens": 636597900.0, "step": 16688 }, { "epoch": 2.1230123393970235, "ewc_loss": 0.03194214776158333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1942148780217394e-05, "grad_norm": 18.58980369567871, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8758983612060547, "num_tokens": 636635822.0, "step": 16689 }, { "epoch": 2.1231395496756136, "ewc_loss": 0.03201894834637642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.20189465128351e-05, "grad_norm": 18.564973831176758, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8651565313339233, "num_tokens": 636667275.0, "step": 16690 }, { "epoch": 2.123266759954204, "ewc_loss": 0.03196389228105545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.196389297954738e-05, "grad_norm": 18.634498596191406, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8768323659896851, "num_tokens": 636704723.0, "step": 16691 }, { "epoch": 2.1233939702327946, "ewc_loss": 0.032005373388528824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.200537321390584e-05, "grad_norm": 18.66105842590332, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8787893056869507, "num_tokens": 636748209.0, "step": 16692 }, { "epoch": 2.123521180511385, "ewc_loss": 0.03205755725502968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2057556381914765e-05, "grad_norm": 18.643627166748047, "learning_rate": 1e-06, "loss": 0.3789, "mean_token_accuracy": 0.8795101046562195, "num_tokens": 636785052.0, "step": 16693 }, { "epoch": 2.1236483907899757, "ewc_loss": 0.031960923224687576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1960924388840795e-05, "grad_norm": 18.573806762695312, "learning_rate": 1e-06, "loss": 0.347, "mean_token_accuracy": 0.886539101600647, "num_tokens": 636816073.0, "step": 16694 }, { "epoch": 2.123775601068566, "ewc_loss": 0.031989045441150665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1989045965019614e-05, "grad_norm": 18.653024673461914, "learning_rate": 1e-06, "loss": 0.3771, "mean_token_accuracy": 0.880973756313324, "num_tokens": 636848963.0, "step": 16695 }, { "epoch": 2.1239028113471567, "ewc_loss": 0.03201677277684212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201677100150846e-05, "grad_norm": 18.590547561645508, "learning_rate": 1e-06, "loss": 0.3495, "mean_token_accuracy": 0.8877673149108887, "num_tokens": 636880632.0, "step": 16696 }, { "epoch": 2.1240300216257473, "ewc_loss": 0.03195979818701744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1959796615410596e-05, "grad_norm": 18.628028869628906, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8613680601119995, "num_tokens": 636923597.0, "step": 16697 }, { "epoch": 2.124157231904338, "ewc_loss": 0.032073892652988434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2073890906758606e-05, "grad_norm": 18.603641510009766, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8685638904571533, "num_tokens": 636962250.0, "step": 16698 }, { "epoch": 2.1242844421829283, "ewc_loss": 0.03204910084605217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2049101719167084e-05, "grad_norm": 18.69107437133789, "learning_rate": 1e-06, "loss": 0.3672, "mean_token_accuracy": 0.8806804418563843, "num_tokens": 636997973.0, "step": 16699 }, { "epoch": 2.124411652461519, "ewc_loss": 0.03202911093831062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2029111025622115e-05, "grad_norm": 18.638954162597656, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8621759414672852, "num_tokens": 637037320.0, "step": 16700 }, { "epoch": 2.1245388627401094, "ewc_loss": 0.03196415677666664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.19641585520003e-05, "grad_norm": 18.64225196838379, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8536786437034607, "num_tokens": 637072267.0, "step": 16701 }, { "epoch": 2.1246660730187, "ewc_loss": 0.032019179314374924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2019179343478754e-05, "grad_norm": 18.590476989746094, "learning_rate": 1e-06, "loss": 0.3485, "mean_token_accuracy": 0.8883965015411377, "num_tokens": 637108782.0, "step": 16702 }, { "epoch": 2.1247932832972904, "ewc_loss": 0.03197649493813515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.197649493813515e-05, "grad_norm": 18.643396377563477, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8741762638092041, "num_tokens": 637147368.0, "step": 16703 }, { "epoch": 2.124920493575881, "ewc_loss": 0.03205554932355881, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.205554821761325e-05, "grad_norm": 18.551490783691406, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8835703134536743, "num_tokens": 637181909.0, "step": 16704 }, { "epoch": 2.1250477038544715, "ewc_loss": 0.03199195861816406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1991959986044094e-05, "grad_norm": 18.670303344726562, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8753364086151123, "num_tokens": 637218498.0, "step": 16705 }, { "epoch": 2.125174914133062, "ewc_loss": 0.03210366889834404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.210366776329465e-05, "grad_norm": 18.63189697265625, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8757497072219849, "num_tokens": 637254825.0, "step": 16706 }, { "epoch": 2.1253021244116526, "ewc_loss": 0.031987521797418594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.198752165189944e-05, "grad_norm": 18.604198455810547, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.864382266998291, "num_tokens": 637301593.0, "step": 16707 }, { "epoch": 2.125429334690243, "ewc_loss": 0.03204454109072685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.204453969374299e-05, "grad_norm": 18.63385581970215, "learning_rate": 1e-06, "loss": 0.3676, "mean_token_accuracy": 0.8858801126480103, "num_tokens": 637341676.0, "step": 16708 }, { "epoch": 2.1255565449688336, "ewc_loss": 0.03204523026943207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.204523090971634e-05, "grad_norm": 18.591136932373047, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8791756629943848, "num_tokens": 637378840.0, "step": 16709 }, { "epoch": 2.125683755247424, "ewc_loss": 0.03206418827176094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.206418841728009e-05, "grad_norm": 18.643444061279297, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8563783168792725, "num_tokens": 637412711.0, "step": 16710 }, { "epoch": 2.1258109655260147, "ewc_loss": 0.03202471882104874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2024719985201955e-05, "grad_norm": 18.6179256439209, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8729991316795349, "num_tokens": 637448287.0, "step": 16711 }, { "epoch": 2.125938175804605, "ewc_loss": 0.03207267075777054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.207267218385823e-05, "grad_norm": 18.585323333740234, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8723845481872559, "num_tokens": 637487899.0, "step": 16712 }, { "epoch": 2.1260653860831957, "ewc_loss": 0.03205648437142372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.205648317816667e-05, "grad_norm": 18.64674949645996, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.8784350156784058, "num_tokens": 637528161.0, "step": 16713 }, { "epoch": 2.1261925963617863, "ewc_loss": 0.03211165964603424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211166040273383e-05, "grad_norm": 18.65529441833496, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8626901507377625, "num_tokens": 637565692.0, "step": 16714 }, { "epoch": 2.1263198066403763, "ewc_loss": 0.031991876661777496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.199187631253153e-05, "grad_norm": 18.567485809326172, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8609470129013062, "num_tokens": 637600368.0, "step": 16715 }, { "epoch": 2.126447016918967, "ewc_loss": 0.03204880282282829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.20488034049049e-05, "grad_norm": 18.617902755737305, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8717145919799805, "num_tokens": 637635732.0, "step": 16716 }, { "epoch": 2.1265742271975574, "ewc_loss": 0.03212853521108627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212853698641993e-05, "grad_norm": 18.59238052368164, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8697100281715393, "num_tokens": 637677127.0, "step": 16717 }, { "epoch": 2.126701437476148, "ewc_loss": 0.03210564702749252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.210564682376571e-05, "grad_norm": 18.612367630004883, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8733044266700745, "num_tokens": 637715278.0, "step": 16718 }, { "epoch": 2.1268286477547385, "ewc_loss": 0.03207126632332802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.207126792403869e-05, "grad_norm": 18.56355094909668, "learning_rate": 1e-06, "loss": 0.3737, "mean_token_accuracy": 0.8774410486221313, "num_tokens": 637742937.0, "step": 16719 }, { "epoch": 2.126955858033329, "ewc_loss": 0.03210843726992607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.210843715351075e-05, "grad_norm": 18.632984161376953, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8771752119064331, "num_tokens": 637776761.0, "step": 16720 }, { "epoch": 2.1270830683119195, "ewc_loss": 0.03211507201194763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2115072826854885e-05, "grad_norm": 18.572811126708984, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8758337497711182, "num_tokens": 637812197.0, "step": 16721 }, { "epoch": 2.12721027859051, "ewc_loss": 0.03209579363465309, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2095795177156106e-05, "grad_norm": 18.573848724365234, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8779582381248474, "num_tokens": 637850442.0, "step": 16722 }, { "epoch": 2.1273374888691006, "ewc_loss": 0.03217492997646332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2174928492167965e-05, "grad_norm": 18.680166244506836, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8753243088722229, "num_tokens": 637889352.0, "step": 16723 }, { "epoch": 2.127464699147691, "ewc_loss": 0.032169196754693985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216919503756799e-05, "grad_norm": 18.642087936401367, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8567275404930115, "num_tokens": 637927195.0, "step": 16724 }, { "epoch": 2.1275919094262816, "ewc_loss": 0.03210112452507019, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2101124816108495e-05, "grad_norm": 18.63254737854004, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8633445501327515, "num_tokens": 637969431.0, "step": 16725 }, { "epoch": 2.127719119704872, "ewc_loss": 0.03216329962015152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216329787392169e-05, "grad_norm": 18.532649993896484, "learning_rate": 1e-06, "loss": 0.3416, "mean_token_accuracy": 0.8907368183135986, "num_tokens": 638013910.0, "step": 16726 }, { "epoch": 2.1278463299834627, "ewc_loss": 0.0320882573723793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208825728506781e-05, "grad_norm": 18.61068344116211, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8641742467880249, "num_tokens": 638051848.0, "step": 16727 }, { "epoch": 2.127973540262053, "ewc_loss": 0.03217579051852226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2175790693145245e-05, "grad_norm": 18.586162567138672, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8746826648712158, "num_tokens": 638092192.0, "step": 16728 }, { "epoch": 2.1281007505406437, "ewc_loss": 0.03213825821876526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.213825766579248e-05, "grad_norm": 18.54230499267578, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.8828531503677368, "num_tokens": 638134306.0, "step": 16729 }, { "epoch": 2.1282279608192343, "ewc_loss": 0.03213677182793617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.213677337043919e-05, "grad_norm": 18.587223052978516, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8608648180961609, "num_tokens": 638177988.0, "step": 16730 }, { "epoch": 2.128355171097825, "ewc_loss": 0.0321730375289917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217303674318828e-05, "grad_norm": 18.614036560058594, "learning_rate": 1e-06, "loss": 0.3585, "mean_token_accuracy": 0.8847286701202393, "num_tokens": 638212237.0, "step": 16731 }, { "epoch": 2.1284823813764153, "ewc_loss": 0.0321209616959095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2120962714543566e-05, "grad_norm": 18.52978515625, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8670196533203125, "num_tokens": 638252492.0, "step": 16732 }, { "epoch": 2.128609591655006, "ewc_loss": 0.03212832659482956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212832598364912e-05, "grad_norm": 18.689315795898438, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.8744297027587891, "num_tokens": 638294308.0, "step": 16733 }, { "epoch": 2.1287368019335964, "ewc_loss": 0.03217461705207825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2174615625990555e-05, "grad_norm": 18.5544376373291, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8773001432418823, "num_tokens": 638330178.0, "step": 16734 }, { "epoch": 2.128864012212187, "ewc_loss": 0.03210993856191635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.210993963875808e-05, "grad_norm": 18.671070098876953, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8539879322052002, "num_tokens": 638372731.0, "step": 16735 }, { "epoch": 2.1289912224907774, "ewc_loss": 0.032178524881601334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217852645320818e-05, "grad_norm": 18.545732498168945, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8699228167533875, "num_tokens": 638407522.0, "step": 16736 }, { "epoch": 2.129118432769368, "ewc_loss": 0.032025836408138275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.202583684469573e-05, "grad_norm": 18.677574157714844, "learning_rate": 1e-06, "loss": 0.3873, "mean_token_accuracy": 0.8759000301361084, "num_tokens": 638443262.0, "step": 16737 }, { "epoch": 2.129245643047958, "ewc_loss": 0.03217410296201706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2174102670978755e-05, "grad_norm": 18.615461349487305, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8740319013595581, "num_tokens": 638482729.0, "step": 16738 }, { "epoch": 2.129372853326549, "ewc_loss": 0.03212274610996246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212274532415904e-05, "grad_norm": 18.66889190673828, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.8817592859268188, "num_tokens": 638520034.0, "step": 16739 }, { "epoch": 2.129500063605139, "ewc_loss": 0.03213866427540779, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2138665119418874e-05, "grad_norm": 18.640769958496094, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8815701007843018, "num_tokens": 638558631.0, "step": 16740 }, { "epoch": 2.1296272738837296, "ewc_loss": 0.03208203613758087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2082036341307685e-05, "grad_norm": 18.59021759033203, "learning_rate": 1e-06, "loss": 0.3469, "mean_token_accuracy": 0.8850639462471008, "num_tokens": 638593565.0, "step": 16741 }, { "epoch": 2.12975448416232, "ewc_loss": 0.03202619031071663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.202618972864002e-05, "grad_norm": 18.648229598999023, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8782975077629089, "num_tokens": 638625952.0, "step": 16742 }, { "epoch": 2.1298816944409107, "ewc_loss": 0.032099172472953796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.209917122148909e-05, "grad_norm": 18.63953971862793, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.870815098285675, "num_tokens": 638662000.0, "step": 16743 }, { "epoch": 2.130008904719501, "ewc_loss": 0.03198856860399246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1988569389795884e-05, "grad_norm": 18.653364181518555, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8601628541946411, "num_tokens": 638706020.0, "step": 16744 }, { "epoch": 2.1301361149980917, "ewc_loss": 0.032095734030008316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2095733331516385e-05, "grad_norm": 18.708683013916016, "learning_rate": 1e-06, "loss": 0.3771, "mean_token_accuracy": 0.88100665807724, "num_tokens": 638745354.0, "step": 16745 }, { "epoch": 2.1302633252766823, "ewc_loss": 0.03208767622709274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208767520845868e-05, "grad_norm": 18.650299072265625, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8678693771362305, "num_tokens": 638785497.0, "step": 16746 }, { "epoch": 2.130390535555273, "ewc_loss": 0.0319816954433918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.198169724782929e-05, "grad_norm": 18.668067932128906, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8541157245635986, "num_tokens": 638821650.0, "step": 16747 }, { "epoch": 2.1305177458338633, "ewc_loss": 0.032016322016716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201632353011519e-05, "grad_norm": 18.579059600830078, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8603898882865906, "num_tokens": 638855589.0, "step": 16748 }, { "epoch": 2.130644956112454, "ewc_loss": 0.03205207362771034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.205207394785248e-05, "grad_norm": 18.665916442871094, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8733862638473511, "num_tokens": 638895766.0, "step": 16749 }, { "epoch": 2.1307721663910444, "ewc_loss": 0.03211968392133713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211968578398228e-05, "grad_norm": 18.6217041015625, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.864620566368103, "num_tokens": 638932622.0, "step": 16750 }, { "epoch": 2.130899376669635, "ewc_loss": 0.03207617625594139, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.207617555744946e-05, "grad_norm": 18.673704147338867, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8642645478248596, "num_tokens": 638971588.0, "step": 16751 }, { "epoch": 2.1310265869482254, "ewc_loss": 0.03215303644537926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.215303513570689e-05, "grad_norm": 18.666040420532227, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8788483142852783, "num_tokens": 639009921.0, "step": 16752 }, { "epoch": 2.131153797226816, "ewc_loss": 0.03214346989989281, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.214347088942304e-05, "grad_norm": 18.766292572021484, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8726259469985962, "num_tokens": 639048160.0, "step": 16753 }, { "epoch": 2.1312810075054065, "ewc_loss": 0.032079119235277176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.20791186823044e-05, "grad_norm": 18.639808654785156, "learning_rate": 1e-06, "loss": 0.3432, "mean_token_accuracy": 0.8857038617134094, "num_tokens": 639080260.0, "step": 16754 }, { "epoch": 2.131408217783997, "ewc_loss": 0.03208165988326073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2081661629490554e-05, "grad_norm": 18.668502807617188, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8666667342185974, "num_tokens": 639118758.0, "step": 16755 }, { "epoch": 2.1315354280625876, "ewc_loss": 0.0321507453918457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2150746847037226e-05, "grad_norm": 18.70042610168457, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8747045993804932, "num_tokens": 639159528.0, "step": 16756 }, { "epoch": 2.131662638341178, "ewc_loss": 0.03204187750816345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.20418766932562e-05, "grad_norm": 18.683887481689453, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8776119947433472, "num_tokens": 639200394.0, "step": 16757 }, { "epoch": 2.1317898486197686, "ewc_loss": 0.032081615179777145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208161433576606e-05, "grad_norm": 18.636653900146484, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8651070594787598, "num_tokens": 639240938.0, "step": 16758 }, { "epoch": 2.131917058898359, "ewc_loss": 0.0320538692176342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2053867471404374e-05, "grad_norm": 18.589521408081055, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8692099452018738, "num_tokens": 639278097.0, "step": 16759 }, { "epoch": 2.1320442691769497, "ewc_loss": 0.03210899978876114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.210900104022585e-05, "grad_norm": 18.746301651000977, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8731188774108887, "num_tokens": 639324512.0, "step": 16760 }, { "epoch": 2.13217147945554, "ewc_loss": 0.032111190259456635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211119110346772e-05, "grad_norm": 18.59970474243164, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8784946203231812, "num_tokens": 639366833.0, "step": 16761 }, { "epoch": 2.1322986897341307, "ewc_loss": 0.03203083574771881, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2030835427576676e-05, "grad_norm": 18.703561782836914, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8590487241744995, "num_tokens": 639405765.0, "step": 16762 }, { "epoch": 2.132425900012721, "ewc_loss": 0.032121479511260986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212147930753417e-05, "grad_norm": 18.631986618041992, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8700045347213745, "num_tokens": 639439622.0, "step": 16763 }, { "epoch": 2.1325531102913113, "ewc_loss": 0.03201204165816307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201204162905924e-05, "grad_norm": 18.686471939086914, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.8845203518867493, "num_tokens": 639479943.0, "step": 16764 }, { "epoch": 2.132680320569902, "ewc_loss": 0.032074201852083206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.207420013495721e-05, "grad_norm": 18.602659225463867, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8690170049667358, "num_tokens": 639520487.0, "step": 16765 }, { "epoch": 2.1328075308484924, "ewc_loss": 0.031995560973882675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1995561585063115e-05, "grad_norm": 18.635862350463867, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8732534646987915, "num_tokens": 639558997.0, "step": 16766 }, { "epoch": 2.132934741127083, "ewc_loss": 0.032063793390989304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.206379187759012e-05, "grad_norm": 18.654884338378906, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8731245994567871, "num_tokens": 639594569.0, "step": 16767 }, { "epoch": 2.1330619514056735, "ewc_loss": 0.03206368908286095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.206369001418352e-05, "grad_norm": 18.60172462463379, "learning_rate": 1e-06, "loss": 0.3456, "mean_token_accuracy": 0.8871957659721375, "num_tokens": 639629827.0, "step": 16768 }, { "epoch": 2.133189161684264, "ewc_loss": 0.03197401016950607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1974010198609903e-05, "grad_norm": 18.59321403503418, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8676947355270386, "num_tokens": 639667634.0, "step": 16769 }, { "epoch": 2.1333163719628545, "ewc_loss": 0.032041821628808975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.204182212357409e-05, "grad_norm": 18.655227661132812, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8724615573883057, "num_tokens": 639700536.0, "step": 16770 }, { "epoch": 2.133443582241445, "ewc_loss": 0.03203519806265831, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.203519736416638e-05, "grad_norm": 18.58195686340332, "learning_rate": 1e-06, "loss": 0.3822, "mean_token_accuracy": 0.8757591843605042, "num_tokens": 639744006.0, "step": 16771 }, { "epoch": 2.1335707925200356, "ewc_loss": 0.032025862485170364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.202586231054738e-05, "grad_norm": 18.600187301635742, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8785847425460815, "num_tokens": 639784950.0, "step": 16772 }, { "epoch": 2.133698002798626, "ewc_loss": 0.03210147097706795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.210147042409517e-05, "grad_norm": 18.652997970581055, "learning_rate": 1e-06, "loss": 0.3492, "mean_token_accuracy": 0.8871358036994934, "num_tokens": 639823567.0, "step": 16773 }, { "epoch": 2.1338252130772166, "ewc_loss": 0.03205679729580879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.205679604434408e-05, "grad_norm": 18.554759979248047, "learning_rate": 1e-06, "loss": 0.3475, "mean_token_accuracy": 0.8850446343421936, "num_tokens": 639863386.0, "step": 16774 }, { "epoch": 2.133952423355807, "ewc_loss": 0.032019950449466705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201995059498586e-05, "grad_norm": 18.625459671020508, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8624060153961182, "num_tokens": 639901219.0, "step": 16775 }, { "epoch": 2.1340796336343977, "ewc_loss": 0.03212563693523407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212563751731068e-05, "grad_norm": 18.61458969116211, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8583000302314758, "num_tokens": 639935240.0, "step": 16776 }, { "epoch": 2.134206843912988, "ewc_loss": 0.0320945642888546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.209456554031931e-05, "grad_norm": 18.679967880249023, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8657025098800659, "num_tokens": 639975370.0, "step": 16777 }, { "epoch": 2.1343340541915787, "ewc_loss": 0.032087620347738266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208762063877657e-05, "grad_norm": 18.571348190307617, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8560665845870972, "num_tokens": 640020463.0, "step": 16778 }, { "epoch": 2.1344612644701693, "ewc_loss": 0.0320761576294899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2076157367555425e-05, "grad_norm": 18.67621421813965, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8682405948638916, "num_tokens": 640057142.0, "step": 16779 }, { "epoch": 2.13458847474876, "ewc_loss": 0.032163847237825394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216384720872156e-05, "grad_norm": 18.620546340942383, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8697770833969116, "num_tokens": 640098879.0, "step": 16780 }, { "epoch": 2.1347156850273503, "ewc_loss": 0.0320560485124588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.205605025868863e-05, "grad_norm": 18.557811737060547, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8755040168762207, "num_tokens": 640136077.0, "step": 16781 }, { "epoch": 2.134842895305941, "ewc_loss": 0.0321292020380497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212920273654163e-05, "grad_norm": 18.67062759399414, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8602629899978638, "num_tokens": 640175160.0, "step": 16782 }, { "epoch": 2.1349701055845314, "ewc_loss": 0.032110683619976044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211068542441353e-05, "grad_norm": 18.58858871459961, "learning_rate": 1e-06, "loss": 0.3249, "mean_token_accuracy": 0.8984349966049194, "num_tokens": 640216670.0, "step": 16783 }, { "epoch": 2.135097315863122, "ewc_loss": 0.032116636633872986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2116637157741934e-05, "grad_norm": 18.599580764770508, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8711032271385193, "num_tokens": 640254872.0, "step": 16784 }, { "epoch": 2.1352245261417124, "ewc_loss": 0.032092828303575516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.209282658644952e-05, "grad_norm": 18.611614227294922, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8717713952064514, "num_tokens": 640289319.0, "step": 16785 }, { "epoch": 2.135351736420303, "ewc_loss": 0.03213178366422653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.213178206351586e-05, "grad_norm": 18.606800079345703, "learning_rate": 1e-06, "loss": 0.3474, "mean_token_accuracy": 0.8880517482757568, "num_tokens": 640323897.0, "step": 16786 }, { "epoch": 2.1354789466988935, "ewc_loss": 0.0320797935128212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2079795346362516e-05, "grad_norm": 18.565265655517578, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8791401386260986, "num_tokens": 640366203.0, "step": 16787 }, { "epoch": 2.1356061569774836, "ewc_loss": 0.03210495039820671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2104951969813555e-05, "grad_norm": 18.583093643188477, "learning_rate": 1e-06, "loss": 0.3667, "mean_token_accuracy": 0.8831157684326172, "num_tokens": 640408089.0, "step": 16788 }, { "epoch": 2.135733367256074, "ewc_loss": 0.032060567289590836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2060568628367037e-05, "grad_norm": 18.555749893188477, "learning_rate": 1e-06, "loss": 0.3663, "mean_token_accuracy": 0.8826669454574585, "num_tokens": 640443532.0, "step": 16789 }, { "epoch": 2.1358605775346646, "ewc_loss": 0.03209977596998215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2099775125971064e-05, "grad_norm": 18.603878021240234, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8675870895385742, "num_tokens": 640480425.0, "step": 16790 }, { "epoch": 2.135987787813255, "ewc_loss": 0.03217146918177605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217146877432242e-05, "grad_norm": 18.623395919799805, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8757514953613281, "num_tokens": 640525658.0, "step": 16791 }, { "epoch": 2.1361149980918457, "ewc_loss": 0.032095808535814285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2095809729071334e-05, "grad_norm": 18.55181884765625, "learning_rate": 1e-06, "loss": 0.3648, "mean_token_accuracy": 0.8831567168235779, "num_tokens": 640563276.0, "step": 16792 }, { "epoch": 2.136242208370436, "ewc_loss": 0.03211929649114609, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211929652024992e-05, "grad_norm": 18.62550163269043, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8674732446670532, "num_tokens": 640608150.0, "step": 16793 }, { "epoch": 2.1363694186490267, "ewc_loss": 0.032171837985515594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2171839848160744e-05, "grad_norm": 18.636573791503906, "learning_rate": 1e-06, "loss": 0.3795, "mean_token_accuracy": 0.875637412071228, "num_tokens": 640646127.0, "step": 16794 }, { "epoch": 2.1364966289276173, "ewc_loss": 0.032073598355054855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.207359986845404e-05, "grad_norm": 18.535526275634766, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.8794782161712646, "num_tokens": 640683195.0, "step": 16795 }, { "epoch": 2.136623839206208, "ewc_loss": 0.03209063783288002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.209063652320765e-05, "grad_norm": 18.60903549194336, "learning_rate": 1e-06, "loss": 0.3493, "mean_token_accuracy": 0.8875288963317871, "num_tokens": 640724102.0, "step": 16796 }, { "epoch": 2.1367510494847983, "ewc_loss": 0.03213506191968918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2135063520399854e-05, "grad_norm": 18.630739212036133, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8709372282028198, "num_tokens": 640762624.0, "step": 16797 }, { "epoch": 2.136878259763389, "ewc_loss": 0.032068002969026566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.206800465704873e-05, "grad_norm": 18.532623291015625, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8688603639602661, "num_tokens": 640799437.0, "step": 16798 }, { "epoch": 2.1370054700419794, "ewc_loss": 0.03212464973330498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212464798707515e-05, "grad_norm": 18.610309600830078, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.8802945017814636, "num_tokens": 640837115.0, "step": 16799 }, { "epoch": 2.13713268032057, "ewc_loss": 0.03215261176228523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2152613130165264e-05, "grad_norm": 18.619020462036133, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8735278844833374, "num_tokens": 640882312.0, "step": 16800 }, { "epoch": 2.1372598905991604, "ewc_loss": 0.03208509087562561, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208509224350564e-05, "grad_norm": 18.57386016845703, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8732476830482483, "num_tokens": 640921032.0, "step": 16801 }, { "epoch": 2.137387100877751, "ewc_loss": 0.03208457678556442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2084575650515035e-05, "grad_norm": 18.615116119384766, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8749204874038696, "num_tokens": 640957424.0, "step": 16802 }, { "epoch": 2.1375143111563415, "ewc_loss": 0.03209621459245682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.209621354471892e-05, "grad_norm": 18.539323806762695, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8648358583450317, "num_tokens": 640992985.0, "step": 16803 }, { "epoch": 2.137641521434932, "ewc_loss": 0.03204716742038727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.204716631444171e-05, "grad_norm": 18.57755470275879, "learning_rate": 1e-06, "loss": 0.3632, "mean_token_accuracy": 0.881563127040863, "num_tokens": 641030379.0, "step": 16804 }, { "epoch": 2.1377687317135226, "ewc_loss": 0.032149117439985275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.214911703253165e-05, "grad_norm": 18.641780853271484, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8661144971847534, "num_tokens": 641068526.0, "step": 16805 }, { "epoch": 2.137895941992113, "ewc_loss": 0.03211922571063042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211922739865258e-05, "grad_norm": 18.59295654296875, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8682776093482971, "num_tokens": 641103351.0, "step": 16806 }, { "epoch": 2.1380231522707036, "ewc_loss": 0.03208757936954498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208758062100969e-05, "grad_norm": 18.663251876831055, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8753844499588013, "num_tokens": 641143570.0, "step": 16807 }, { "epoch": 2.138150362549294, "ewc_loss": 0.03213827684521675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2138275855686516e-05, "grad_norm": 18.588220596313477, "learning_rate": 1e-06, "loss": 0.378, "mean_token_accuracy": 0.8793889284133911, "num_tokens": 641180625.0, "step": 16808 }, { "epoch": 2.1382775728278847, "ewc_loss": 0.0321158766746521, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211587682017125e-05, "grad_norm": 18.715442657470703, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8533696532249451, "num_tokens": 641226770.0, "step": 16809 }, { "epoch": 2.138404783106475, "ewc_loss": 0.032115399837493896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211540024494752e-05, "grad_norm": 18.609434127807617, "learning_rate": 1e-06, "loss": 0.3628, "mean_token_accuracy": 0.8852332830429077, "num_tokens": 641269682.0, "step": 16810 }, { "epoch": 2.1385319933850653, "ewc_loss": 0.03198660910129547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.198660851921886e-05, "grad_norm": 18.701379776000977, "learning_rate": 1e-06, "loss": 0.3502, "mean_token_accuracy": 0.8859453797340393, "num_tokens": 641303803.0, "step": 16811 }, { "epoch": 2.1386592036636562, "ewc_loss": 0.032103199511766434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.210319846402854e-05, "grad_norm": 18.565025329589844, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8687275052070618, "num_tokens": 641343141.0, "step": 16812 }, { "epoch": 2.1387864139422463, "ewc_loss": 0.03207674250006676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2076743082143366e-05, "grad_norm": 18.67367935180664, "learning_rate": 1e-06, "loss": 0.3798, "mean_token_accuracy": 0.8782909512519836, "num_tokens": 641384006.0, "step": 16813 }, { "epoch": 2.138913624220837, "ewc_loss": 0.032109204679727554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2109204767039046e-05, "grad_norm": 18.632122039794922, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8534009456634521, "num_tokens": 641423998.0, "step": 16814 }, { "epoch": 2.1390408344994274, "ewc_loss": 0.032054077833890915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2054078474175185e-05, "grad_norm": 18.67057991027832, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8729701042175293, "num_tokens": 641457886.0, "step": 16815 }, { "epoch": 2.139168044778018, "ewc_loss": 0.03211064636707306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2110645406646654e-05, "grad_norm": 18.58675765991211, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8740490674972534, "num_tokens": 641503386.0, "step": 16816 }, { "epoch": 2.1392952550566084, "ewc_loss": 0.0320158451795578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201584331691265e-05, "grad_norm": 18.591083526611328, "learning_rate": 1e-06, "loss": 0.3571, "mean_token_accuracy": 0.8838979601860046, "num_tokens": 641539533.0, "step": 16817 }, { "epoch": 2.139422465335199, "ewc_loss": 0.032061994075775146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2061994716059417e-05, "grad_norm": 18.593873977661133, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8685176372528076, "num_tokens": 641574020.0, "step": 16818 }, { "epoch": 2.1395496756137895, "ewc_loss": 0.032037846744060516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.203784581273794e-05, "grad_norm": 18.63483428955078, "learning_rate": 1e-06, "loss": 0.3789, "mean_token_accuracy": 0.876785397529602, "num_tokens": 641605206.0, "step": 16819 }, { "epoch": 2.13967688589238, "ewc_loss": 0.0320659801363945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.206597830285318e-05, "grad_norm": 18.626707077026367, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8633427619934082, "num_tokens": 641649857.0, "step": 16820 }, { "epoch": 2.1398040961709706, "ewc_loss": 0.03209860250353813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.209860369679518e-05, "grad_norm": 18.6784610748291, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8698872923851013, "num_tokens": 641683360.0, "step": 16821 }, { "epoch": 2.139931306449561, "ewc_loss": 0.03209966421127319, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2099662348628044e-05, "grad_norm": 18.669801712036133, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8741812705993652, "num_tokens": 641720105.0, "step": 16822 }, { "epoch": 2.1400585167281516, "ewc_loss": 0.03198200464248657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.198200647602789e-05, "grad_norm": 18.548364639282227, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8763116598129272, "num_tokens": 641750718.0, "step": 16823 }, { "epoch": 2.140185727006742, "ewc_loss": 0.032063692808151245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.206369365216233e-05, "grad_norm": 18.593454360961914, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8722898960113525, "num_tokens": 641791620.0, "step": 16824 }, { "epoch": 2.1403129372853327, "ewc_loss": 0.03215212747454643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2152125641005114e-05, "grad_norm": 18.552291870117188, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8620654344558716, "num_tokens": 641833566.0, "step": 16825 }, { "epoch": 2.140440147563923, "ewc_loss": 0.032063644379377365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.206364272045903e-05, "grad_norm": 18.566173553466797, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.8798415064811707, "num_tokens": 641869080.0, "step": 16826 }, { "epoch": 2.1405673578425137, "ewc_loss": 0.032191745936870575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219174686819315e-05, "grad_norm": 18.593650817871094, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8713269233703613, "num_tokens": 641910195.0, "step": 16827 }, { "epoch": 2.1406945681211043, "ewc_loss": 0.0321323461830616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2132345950230956e-05, "grad_norm": 18.6374454498291, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8606731295585632, "num_tokens": 641947270.0, "step": 16828 }, { "epoch": 2.140821778399695, "ewc_loss": 0.03216560557484627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216560435248539e-05, "grad_norm": 18.582115173339844, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8750163912773132, "num_tokens": 641986472.0, "step": 16829 }, { "epoch": 2.1409489886782853, "ewc_loss": 0.032150618731975555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.215061951777898e-05, "grad_norm": 18.596088409423828, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8593149185180664, "num_tokens": 642028130.0, "step": 16830 }, { "epoch": 2.141076198956876, "ewc_loss": 0.032177750021219254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217775156372227e-05, "grad_norm": 18.62422752380371, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8685133457183838, "num_tokens": 642067097.0, "step": 16831 }, { "epoch": 2.1412034092354664, "ewc_loss": 0.03213000297546387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.213000309187919e-05, "grad_norm": 18.541034698486328, "learning_rate": 1e-06, "loss": 0.3574, "mean_token_accuracy": 0.8841836452484131, "num_tokens": 642099628.0, "step": 16832 }, { "epoch": 2.141330619514057, "ewc_loss": 0.032127488404512405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212748924852349e-05, "grad_norm": 18.676668167114258, "learning_rate": 1e-06, "loss": 0.3239, "mean_token_accuracy": 0.8957151174545288, "num_tokens": 642135941.0, "step": 16833 }, { "epoch": 2.1414578297926474, "ewc_loss": 0.032275550067424774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2275551347993314e-05, "grad_norm": 18.638202667236328, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8673279881477356, "num_tokens": 642169465.0, "step": 16834 }, { "epoch": 2.141585040071238, "ewc_loss": 0.032131001353263855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.213099989807233e-05, "grad_norm": 18.667253494262695, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8772687911987305, "num_tokens": 642208961.0, "step": 16835 }, { "epoch": 2.141712250349828, "ewc_loss": 0.032175157219171524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217515768483281e-05, "grad_norm": 18.605106353759766, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8726908564567566, "num_tokens": 642249691.0, "step": 16836 }, { "epoch": 2.141839460628419, "ewc_loss": 0.0321168377995491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2116837246576324e-05, "grad_norm": 18.604074478149414, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.877135157585144, "num_tokens": 642292425.0, "step": 16837 }, { "epoch": 2.141966670907009, "ewc_loss": 0.03219347447156906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219347490812652e-05, "grad_norm": 18.68448257446289, "learning_rate": 1e-06, "loss": 0.3577, "mean_token_accuracy": 0.8827074766159058, "num_tokens": 642330982.0, "step": 16838 }, { "epoch": 2.1420938811855996, "ewc_loss": 0.03213443607091904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2134437788045034e-05, "grad_norm": 18.563125610351562, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8751842975616455, "num_tokens": 642365414.0, "step": 16839 }, { "epoch": 2.14222109146419, "ewc_loss": 0.032121773809194565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212177398381755e-05, "grad_norm": 18.66141700744629, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8682306408882141, "num_tokens": 642398660.0, "step": 16840 }, { "epoch": 2.1423483017427807, "ewc_loss": 0.03220893815159798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2208939956035465e-05, "grad_norm": 18.586185455322266, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.880396842956543, "num_tokens": 642432605.0, "step": 16841 }, { "epoch": 2.142475512021371, "ewc_loss": 0.03216621279716492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216621189494617e-05, "grad_norm": 18.68474578857422, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8821080923080444, "num_tokens": 642472041.0, "step": 16842 }, { "epoch": 2.1426027222999617, "ewc_loss": 0.032184142619371414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.218414349248633e-05, "grad_norm": 18.586837768554688, "learning_rate": 1e-06, "loss": 0.3575, "mean_token_accuracy": 0.8839014768600464, "num_tokens": 642512399.0, "step": 16843 }, { "epoch": 2.1427299325785523, "ewc_loss": 0.03212762251496315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212762385373935e-05, "grad_norm": 18.644386291503906, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8682807087898254, "num_tokens": 642550829.0, "step": 16844 }, { "epoch": 2.142857142857143, "ewc_loss": 0.03218144178390503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.218144047423266e-05, "grad_norm": 18.641071319580078, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8733107447624207, "num_tokens": 642591255.0, "step": 16845 }, { "epoch": 2.1429843531357333, "ewc_loss": 0.03214678540825844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.21467850881163e-05, "grad_norm": 18.607759475708008, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8764383792877197, "num_tokens": 642625772.0, "step": 16846 }, { "epoch": 2.143111563414324, "ewc_loss": 0.03219108656048775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2191088394029066e-05, "grad_norm": 18.564313888549805, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8679215908050537, "num_tokens": 642667596.0, "step": 16847 }, { "epoch": 2.1432387736929144, "ewc_loss": 0.03216174989938736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216175173292868e-05, "grad_norm": 18.754749298095703, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8747410774230957, "num_tokens": 642702159.0, "step": 16848 }, { "epoch": 2.143365983971505, "ewc_loss": 0.03217780962586403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217780977138318e-05, "grad_norm": 18.55800437927246, "learning_rate": 1e-06, "loss": 0.3459, "mean_token_accuracy": 0.8882789611816406, "num_tokens": 642737307.0, "step": 16849 }, { "epoch": 2.1434931942500954, "ewc_loss": 0.032059621065855026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.205962275387719e-05, "grad_norm": 18.75168800354004, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8649435043334961, "num_tokens": 642776660.0, "step": 16850 }, { "epoch": 2.143620404528686, "ewc_loss": 0.03219885751605034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2198859116761014e-05, "grad_norm": 18.57849884033203, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8680248856544495, "num_tokens": 642814131.0, "step": 16851 }, { "epoch": 2.1437476148072765, "ewc_loss": 0.032035455107688904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.203545566066168e-05, "grad_norm": 18.612058639526367, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8553520441055298, "num_tokens": 642854138.0, "step": 16852 }, { "epoch": 2.143874825085867, "ewc_loss": 0.03219760209321976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219760037609376e-05, "grad_norm": 18.76215171813965, "learning_rate": 1e-06, "loss": 0.3795, "mean_token_accuracy": 0.8806672096252441, "num_tokens": 642893767.0, "step": 16853 }, { "epoch": 2.1440020353644575, "ewc_loss": 0.03219062089920044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219062273274176e-05, "grad_norm": 18.570486068725586, "learning_rate": 1e-06, "loss": 0.3432, "mean_token_accuracy": 0.8872764706611633, "num_tokens": 642928286.0, "step": 16854 }, { "epoch": 2.144129245643048, "ewc_loss": 0.03213656693696976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.213656600564718e-05, "grad_norm": 18.7186222076416, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8816065788269043, "num_tokens": 642971220.0, "step": 16855 }, { "epoch": 2.1442564559216386, "ewc_loss": 0.032168176025152206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2168176403502e-05, "grad_norm": 18.544349670410156, "learning_rate": 1e-06, "loss": 0.3811, "mean_token_accuracy": 0.8779654502868652, "num_tokens": 643013618.0, "step": 16856 }, { "epoch": 2.144383666200229, "ewc_loss": 0.03211458772420883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211458897567354e-05, "grad_norm": 18.682836532592773, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8772059082984924, "num_tokens": 643058122.0, "step": 16857 }, { "epoch": 2.1445108764788197, "ewc_loss": 0.032265014946460724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.226501576136798e-05, "grad_norm": 18.716773986816406, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8619470596313477, "num_tokens": 643096662.0, "step": 16858 }, { "epoch": 2.14463808675741, "ewc_loss": 0.032088275998830795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208827547496185e-05, "grad_norm": 18.674877166748047, "learning_rate": 1e-06, "loss": 0.3518, "mean_token_accuracy": 0.8861833214759827, "num_tokens": 643137923.0, "step": 16859 }, { "epoch": 2.1447652970360007, "ewc_loss": 0.03211696445941925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211696457583457e-05, "grad_norm": 18.72548484802246, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.879912257194519, "num_tokens": 643176918.0, "step": 16860 }, { "epoch": 2.144892507314591, "ewc_loss": 0.03209967166185379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2099673262564465e-05, "grad_norm": 18.69106101989746, "learning_rate": 1e-06, "loss": 0.3425, "mean_token_accuracy": 0.8920036554336548, "num_tokens": 643217525.0, "step": 16861 }, { "epoch": 2.1450197175931813, "ewc_loss": 0.032018084079027176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.201808431185782e-05, "grad_norm": 18.590009689331055, "learning_rate": 1e-06, "loss": 0.3399, "mean_token_accuracy": 0.8907668590545654, "num_tokens": 643255131.0, "step": 16862 }, { "epoch": 2.145146927871772, "ewc_loss": 0.03204714134335518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.204714084859006e-05, "grad_norm": 18.614137649536133, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8826450109481812, "num_tokens": 643292092.0, "step": 16863 }, { "epoch": 2.1452741381503624, "ewc_loss": 0.032078564167022705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.207856570952572e-05, "grad_norm": 18.635042190551758, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8719127774238586, "num_tokens": 643330510.0, "step": 16864 }, { "epoch": 2.145401348428953, "ewc_loss": 0.03209582716226578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.209582791896537e-05, "grad_norm": 18.68539810180664, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8697651624679565, "num_tokens": 643372900.0, "step": 16865 }, { "epoch": 2.1455285587075434, "ewc_loss": 0.032074492424726486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2074491173261777e-05, "grad_norm": 18.744306564331055, "learning_rate": 1e-06, "loss": 0.372, "mean_token_accuracy": 0.8811901211738586, "num_tokens": 643412866.0, "step": 16866 }, { "epoch": 2.145655768986134, "ewc_loss": 0.0320778414607048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.207784175174311e-05, "grad_norm": 18.672161102294922, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8581821918487549, "num_tokens": 643452846.0, "step": 16867 }, { "epoch": 2.1457829792647245, "ewc_loss": 0.032025691121816635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.202569132554345e-05, "grad_norm": 18.673442840576172, "learning_rate": 1e-06, "loss": 0.3756, "mean_token_accuracy": 0.8794140815734863, "num_tokens": 643488264.0, "step": 16868 }, { "epoch": 2.145910189543315, "ewc_loss": 0.03199924901127815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.1999250495573506e-05, "grad_norm": 18.62884521484375, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8630302548408508, "num_tokens": 643528131.0, "step": 16869 }, { "epoch": 2.1460373998219056, "ewc_loss": 0.03194819390773773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.194819510099478e-05, "grad_norm": 18.64566421508789, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8750263452529907, "num_tokens": 643564041.0, "step": 16870 }, { "epoch": 2.146164610100496, "ewc_loss": 0.03211810439825058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211810326320119e-05, "grad_norm": 18.700204849243164, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8619519472122192, "num_tokens": 643601507.0, "step": 16871 }, { "epoch": 2.1462918203790866, "ewc_loss": 0.0320616215467453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2061620004242286e-05, "grad_norm": 18.63813591003418, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8585739135742188, "num_tokens": 643647161.0, "step": 16872 }, { "epoch": 2.146419030657677, "ewc_loss": 0.032059114426374435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.20591134368442e-05, "grad_norm": 18.709394454956055, "learning_rate": 1e-06, "loss": 0.3738, "mean_token_accuracy": 0.8794646263122559, "num_tokens": 643687114.0, "step": 16873 }, { "epoch": 2.1465462409362677, "ewc_loss": 0.032042246311903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.204224776709452e-05, "grad_norm": 18.542699813842773, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8601235151290894, "num_tokens": 643722110.0, "step": 16874 }, { "epoch": 2.146673451214858, "ewc_loss": 0.032032039016485214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.203203959856182e-05, "grad_norm": 18.661405563354492, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8518663644790649, "num_tokens": 643765770.0, "step": 16875 }, { "epoch": 2.1468006614934487, "ewc_loss": 0.03216814249753952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216814366169274e-05, "grad_norm": 18.63678550720215, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8664834499359131, "num_tokens": 643810097.0, "step": 16876 }, { "epoch": 2.1469278717720393, "ewc_loss": 0.03202017769217491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.20201761496719e-05, "grad_norm": 18.533653259277344, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8494231700897217, "num_tokens": 643847904.0, "step": 16877 }, { "epoch": 2.14705508205063, "ewc_loss": 0.03218573331832886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.218573328922503e-05, "grad_norm": 18.651782989501953, "learning_rate": 1e-06, "loss": 0.3651, "mean_token_accuracy": 0.8818124532699585, "num_tokens": 643889157.0, "step": 16878 }, { "epoch": 2.1471822923292203, "ewc_loss": 0.032118987292051315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2118987292051315e-05, "grad_norm": 18.597206115722656, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8763890266418457, "num_tokens": 643931181.0, "step": 16879 }, { "epoch": 2.147309502607811, "ewc_loss": 0.03207658231258392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2076583011075854e-05, "grad_norm": 18.643680572509766, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8717215061187744, "num_tokens": 643965415.0, "step": 16880 }, { "epoch": 2.1474367128864014, "ewc_loss": 0.032064370810985565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.206437031622045e-05, "grad_norm": 18.54384994506836, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8589715957641602, "num_tokens": 644010647.0, "step": 16881 }, { "epoch": 2.147563923164992, "ewc_loss": 0.0321466438472271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2146643206942827e-05, "grad_norm": 18.68192481994629, "learning_rate": 1e-06, "loss": 0.3453, "mean_token_accuracy": 0.8890290856361389, "num_tokens": 644044347.0, "step": 16882 }, { "epoch": 2.1476911334435824, "ewc_loss": 0.032204434275627136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.220443250029348e-05, "grad_norm": 18.596202850341797, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.8812112808227539, "num_tokens": 644083456.0, "step": 16883 }, { "epoch": 2.147818343722173, "ewc_loss": 0.03209531679749489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.209531496395357e-05, "grad_norm": 18.653486251831055, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8651756644248962, "num_tokens": 644123851.0, "step": 16884 }, { "epoch": 2.1479455540007635, "ewc_loss": 0.03215264528989792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.215264587197453e-05, "grad_norm": 18.59033966064453, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8836582899093628, "num_tokens": 644168880.0, "step": 16885 }, { "epoch": 2.1480727642793536, "ewc_loss": 0.03207389637827873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2073894544737414e-05, "grad_norm": 18.67198371887207, "learning_rate": 1e-06, "loss": 0.3795, "mean_token_accuracy": 0.8757947087287903, "num_tokens": 644204532.0, "step": 16886 }, { "epoch": 2.148199974557944, "ewc_loss": 0.03218311071395874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2183110306505114e-05, "grad_norm": 18.664508819580078, "learning_rate": 1e-06, "loss": 0.3383, "mean_token_accuracy": 0.8915517926216125, "num_tokens": 644246160.0, "step": 16887 }, { "epoch": 2.1483271848365346, "ewc_loss": 0.032045457512140274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.204545646440238e-05, "grad_norm": 18.570526123046875, "learning_rate": 1e-06, "loss": 0.3502, "mean_token_accuracy": 0.8860050439834595, "num_tokens": 644284881.0, "step": 16888 }, { "epoch": 2.148454395115125, "ewc_loss": 0.03212152048945427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212151932530105e-05, "grad_norm": 18.624921798706055, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8675329685211182, "num_tokens": 644326661.0, "step": 16889 }, { "epoch": 2.1485816053937157, "ewc_loss": 0.032083041965961456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208304042345844e-05, "grad_norm": 18.60199737548828, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.878478467464447, "num_tokens": 644364420.0, "step": 16890 }, { "epoch": 2.148708815672306, "ewc_loss": 0.032063812017440796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.206381370546296e-05, "grad_norm": 18.652000427246094, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.857598066329956, "num_tokens": 644398626.0, "step": 16891 }, { "epoch": 2.1488360259508967, "ewc_loss": 0.03208066150546074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2080661185318604e-05, "grad_norm": 18.560684204101562, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.874859631061554, "num_tokens": 644434772.0, "step": 16892 }, { "epoch": 2.1489632362294873, "ewc_loss": 0.032096657902002335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2096657378133386e-05, "grad_norm": 18.6328067779541, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8719034194946289, "num_tokens": 644474425.0, "step": 16893 }, { "epoch": 2.149090446508078, "ewc_loss": 0.032124023884534836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212402225472033e-05, "grad_norm": 18.636207580566406, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.867534875869751, "num_tokens": 644507796.0, "step": 16894 }, { "epoch": 2.1492176567866683, "ewc_loss": 0.03213848918676376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2138490496436134e-05, "grad_norm": 18.60774040222168, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8695828318595886, "num_tokens": 644544172.0, "step": 16895 }, { "epoch": 2.149344867065259, "ewc_loss": 0.032176386564970016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217638732166961e-05, "grad_norm": 18.687726974487305, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8728393316268921, "num_tokens": 644582217.0, "step": 16896 }, { "epoch": 2.1494720773438494, "ewc_loss": 0.032166071236133575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2166070013772696e-05, "grad_norm": 18.66266441345215, "learning_rate": 1e-06, "loss": 0.3707, "mean_token_accuracy": 0.8800936937332153, "num_tokens": 644620028.0, "step": 16897 }, { "epoch": 2.14959928762244, "ewc_loss": 0.032153576612472534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.215357719454914e-05, "grad_norm": 18.587257385253906, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8583928942680359, "num_tokens": 644663743.0, "step": 16898 }, { "epoch": 2.1497264979010304, "ewc_loss": 0.03217073157429695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217073026462458e-05, "grad_norm": 18.704742431640625, "learning_rate": 1e-06, "loss": 0.3776, "mean_token_accuracy": 0.8767591714859009, "num_tokens": 644700914.0, "step": 16899 }, { "epoch": 2.149853708179621, "ewc_loss": 0.03219098970293999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2190990168601274e-05, "grad_norm": 18.702611923217773, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.870663046836853, "num_tokens": 644744232.0, "step": 16900 }, { "epoch": 2.1499809184582115, "ewc_loss": 0.03211800009012222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2118001399794593e-05, "grad_norm": 18.616613388061523, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8677003383636475, "num_tokens": 644780851.0, "step": 16901 }, { "epoch": 2.150108128736802, "ewc_loss": 0.0322284959256649, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.222849409212358e-05, "grad_norm": 18.80088233947754, "learning_rate": 1e-06, "loss": 0.3712, "mean_token_accuracy": 0.8820129632949829, "num_tokens": 644822103.0, "step": 16902 }, { "epoch": 2.1502353390153925, "ewc_loss": 0.032201532274484634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2201533031184226e-05, "grad_norm": 18.682254791259766, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8709460496902466, "num_tokens": 644861209.0, "step": 16903 }, { "epoch": 2.150362549293983, "ewc_loss": 0.03210228309035301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.210228169336915e-05, "grad_norm": 18.654762268066406, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8613255023956299, "num_tokens": 644899860.0, "step": 16904 }, { "epoch": 2.1504897595725736, "ewc_loss": 0.03219093009829521, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2190928322961554e-05, "grad_norm": 18.69892692565918, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8701596260070801, "num_tokens": 644943101.0, "step": 16905 }, { "epoch": 2.150616969851164, "ewc_loss": 0.032171472907066345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217147241230123e-05, "grad_norm": 18.626907348632812, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8628578782081604, "num_tokens": 644981786.0, "step": 16906 }, { "epoch": 2.1507441801297547, "ewc_loss": 0.03216095641255379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2160955015569925e-05, "grad_norm": 18.6720027923584, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8758074641227722, "num_tokens": 645017949.0, "step": 16907 }, { "epoch": 2.150871390408345, "ewc_loss": 0.032192524522542953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219252539565787e-05, "grad_norm": 18.699491500854492, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8730990886688232, "num_tokens": 645053443.0, "step": 16908 }, { "epoch": 2.1509986006869353, "ewc_loss": 0.032147254794836044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.214725438738242e-05, "grad_norm": 18.756731033325195, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.875508725643158, "num_tokens": 645089916.0, "step": 16909 }, { "epoch": 2.1511258109655262, "ewc_loss": 0.032151006162166595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.215100514353253e-05, "grad_norm": 18.574411392211914, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.858079731464386, "num_tokens": 645134531.0, "step": 16910 }, { "epoch": 2.1512530212441163, "ewc_loss": 0.032118067145347595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211806688341312e-05, "grad_norm": 18.63102149963379, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8687399625778198, "num_tokens": 645172608.0, "step": 16911 }, { "epoch": 2.151380231522707, "ewc_loss": 0.032260581851005554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.226058106520213e-05, "grad_norm": 18.615909576416016, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8700191974639893, "num_tokens": 645209895.0, "step": 16912 }, { "epoch": 2.1515074418012974, "ewc_loss": 0.03216356784105301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216356708435342e-05, "grad_norm": 18.653697967529297, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8789178729057312, "num_tokens": 645245259.0, "step": 16913 }, { "epoch": 2.151634652079888, "ewc_loss": 0.0321359746158123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2135973015101627e-05, "grad_norm": 18.520788192749023, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8687252998352051, "num_tokens": 645284962.0, "step": 16914 }, { "epoch": 2.1517618623584784, "ewc_loss": 0.03217809274792671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2178093533730134e-05, "grad_norm": 18.67923927307129, "learning_rate": 1e-06, "loss": 0.3535, "mean_token_accuracy": 0.8883110880851746, "num_tokens": 645322352.0, "step": 16915 }, { "epoch": 2.151889072637069, "ewc_loss": 0.03222576528787613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2225765608018264e-05, "grad_norm": 18.642627716064453, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8727664947509766, "num_tokens": 645358401.0, "step": 16916 }, { "epoch": 2.1520162829156595, "ewc_loss": 0.03217899799346924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.21789993904531e-05, "grad_norm": 18.631959915161133, "learning_rate": 1e-06, "loss": 0.3554, "mean_token_accuracy": 0.8853743076324463, "num_tokens": 645399147.0, "step": 16917 }, { "epoch": 2.15214349319425, "ewc_loss": 0.03217211365699768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2172112696571276e-05, "grad_norm": 18.643749237060547, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.85502028465271, "num_tokens": 645437286.0, "step": 16918 }, { "epoch": 2.1522707034728406, "ewc_loss": 0.0322052501142025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2205251045525074e-05, "grad_norm": 18.704811096191406, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8669937252998352, "num_tokens": 645477853.0, "step": 16919 }, { "epoch": 2.152397913751431, "ewc_loss": 0.03218535706400871, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2185358577407897e-05, "grad_norm": 18.598840713500977, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8638026714324951, "num_tokens": 645520476.0, "step": 16920 }, { "epoch": 2.1525251240300216, "ewc_loss": 0.03215550631284714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.21555053233169e-05, "grad_norm": 18.642070770263672, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8595459461212158, "num_tokens": 645561539.0, "step": 16921 }, { "epoch": 2.152652334308612, "ewc_loss": 0.032177675515413284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217767516616732e-05, "grad_norm": 18.652151107788086, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8685903549194336, "num_tokens": 645602950.0, "step": 16922 }, { "epoch": 2.1527795445872027, "ewc_loss": 0.032149821519851685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2149822800420225e-05, "grad_norm": 18.580331802368164, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8631052374839783, "num_tokens": 645639215.0, "step": 16923 }, { "epoch": 2.152906754865793, "ewc_loss": 0.03219040110707283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2190400816034526e-05, "grad_norm": 18.69143295288086, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8724404573440552, "num_tokens": 645677287.0, "step": 16924 }, { "epoch": 2.1530339651443837, "ewc_loss": 0.03222735598683357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.222735540475696e-05, "grad_norm": 18.64420509338379, "learning_rate": 1e-06, "loss": 0.3658, "mean_token_accuracy": 0.8808696269989014, "num_tokens": 645710784.0, "step": 16925 }, { "epoch": 2.1531611754229742, "ewc_loss": 0.03220321238040924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.22032137773931e-05, "grad_norm": 18.688173294067383, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8687927722930908, "num_tokens": 645750568.0, "step": 16926 }, { "epoch": 2.1532883857015648, "ewc_loss": 0.032221030443906784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.222102895961143e-05, "grad_norm": 18.56367301940918, "learning_rate": 1e-06, "loss": 0.3673, "mean_token_accuracy": 0.8782275915145874, "num_tokens": 645783911.0, "step": 16927 }, { "epoch": 2.1534155959801553, "ewc_loss": 0.032198306173086166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2198306143982336e-05, "grad_norm": 18.74691390991211, "learning_rate": 1e-06, "loss": 0.3557, "mean_token_accuracy": 0.8895437717437744, "num_tokens": 645822524.0, "step": 16928 }, { "epoch": 2.153542806258746, "ewc_loss": 0.032293692231178284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.229369394830428e-05, "grad_norm": 18.635690689086914, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8691390752792358, "num_tokens": 645856924.0, "step": 16929 }, { "epoch": 2.1536700165373364, "ewc_loss": 0.032105933874845505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.210593422409147e-05, "grad_norm": 18.612812042236328, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8665230870246887, "num_tokens": 645896995.0, "step": 16930 }, { "epoch": 2.153797226815927, "ewc_loss": 0.03230944275856018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.230944275856018e-05, "grad_norm": 18.659032821655273, "learning_rate": 1e-06, "loss": 0.3718, "mean_token_accuracy": 0.8804837465286255, "num_tokens": 645941768.0, "step": 16931 }, { "epoch": 2.1539244370945174, "ewc_loss": 0.032197918742895126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2197920518228784e-05, "grad_norm": 18.612194061279297, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8719503283500671, "num_tokens": 645981909.0, "step": 16932 }, { "epoch": 2.154051647373108, "ewc_loss": 0.032231781631708145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2231782824965194e-05, "grad_norm": 18.676000595092773, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8644011616706848, "num_tokens": 646022034.0, "step": 16933 }, { "epoch": 2.154178857651698, "ewc_loss": 0.03224416449666023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224416286684573e-05, "grad_norm": 18.623783111572266, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8720905184745789, "num_tokens": 646056714.0, "step": 16934 }, { "epoch": 2.1543060679302886, "ewc_loss": 0.032228533178567886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.222853410989046e-05, "grad_norm": 18.6949405670166, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8656101226806641, "num_tokens": 646095668.0, "step": 16935 }, { "epoch": 2.154433278208879, "ewc_loss": 0.03225533291697502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.22553314617835e-05, "grad_norm": 18.646696090698242, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8574557900428772, "num_tokens": 646136745.0, "step": 16936 }, { "epoch": 2.1545604884874696, "ewc_loss": 0.0322125107049942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.221251245122403e-05, "grad_norm": 18.680206298828125, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8700778484344482, "num_tokens": 646180314.0, "step": 16937 }, { "epoch": 2.15468769876606, "ewc_loss": 0.03223743289709091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.223743260605261e-05, "grad_norm": 18.6705322265625, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8756163120269775, "num_tokens": 646218191.0, "step": 16938 }, { "epoch": 2.1548149090446507, "ewc_loss": 0.03219949081540108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219949212507345e-05, "grad_norm": 18.613121032714844, "learning_rate": 1e-06, "loss": 0.3565, "mean_token_accuracy": 0.8877645134925842, "num_tokens": 646256350.0, "step": 16939 }, { "epoch": 2.154942119323241, "ewc_loss": 0.0321883000433445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2188301702262834e-05, "grad_norm": 18.660789489746094, "learning_rate": 1e-06, "loss": 0.3176, "mean_token_accuracy": 0.8950113654136658, "num_tokens": 646290915.0, "step": 16940 }, { "epoch": 2.1550693296018317, "ewc_loss": 0.03225266560912132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.22526648233179e-05, "grad_norm": 18.677963256835938, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8657945394515991, "num_tokens": 646325838.0, "step": 16941 }, { "epoch": 2.1551965398804223, "ewc_loss": 0.032184991985559464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.218499114154838e-05, "grad_norm": 18.69686508178711, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8548864126205444, "num_tokens": 646362742.0, "step": 16942 }, { "epoch": 2.155323750159013, "ewc_loss": 0.032187577337026596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.218757774448022e-05, "grad_norm": 18.641490936279297, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8600475788116455, "num_tokens": 646403958.0, "step": 16943 }, { "epoch": 2.1554509604376033, "ewc_loss": 0.03221221640706062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2212217774940655e-05, "grad_norm": 18.718643188476562, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8673713803291321, "num_tokens": 646442872.0, "step": 16944 }, { "epoch": 2.155578170716194, "ewc_loss": 0.032217349857091904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.221735096303746e-05, "grad_norm": 18.617813110351562, "learning_rate": 1e-06, "loss": 0.378, "mean_token_accuracy": 0.8794696927070618, "num_tokens": 646484642.0, "step": 16945 }, { "epoch": 2.1557053809947844, "ewc_loss": 0.032163042575120926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2163043215405196e-05, "grad_norm": 18.69842529296875, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8818761110305786, "num_tokens": 646521688.0, "step": 16946 }, { "epoch": 2.155832591273375, "ewc_loss": 0.03226182237267494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.226182161597535e-05, "grad_norm": 18.6844482421875, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8802646398544312, "num_tokens": 646560182.0, "step": 16947 }, { "epoch": 2.1559598015519654, "ewc_loss": 0.032159075140953064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.215907418052666e-05, "grad_norm": 18.662975311279297, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8776247501373291, "num_tokens": 646593252.0, "step": 16948 }, { "epoch": 2.156087011830556, "ewc_loss": 0.03220692276954651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2206924515776336e-05, "grad_norm": 18.641857147216797, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.871505856513977, "num_tokens": 646635514.0, "step": 16949 }, { "epoch": 2.1562142221091465, "ewc_loss": 0.032146114856004715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.21461157000158e-05, "grad_norm": 18.669654846191406, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8594209551811218, "num_tokens": 646674785.0, "step": 16950 }, { "epoch": 2.156341432387737, "ewc_loss": 0.03220851346850395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2208514312515035e-05, "grad_norm": 18.611295700073242, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8721380233764648, "num_tokens": 646713832.0, "step": 16951 }, { "epoch": 2.1564686426663275, "ewc_loss": 0.03218608349561691, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.218608253519051e-05, "grad_norm": 18.654878616333008, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8771306276321411, "num_tokens": 646752218.0, "step": 16952 }, { "epoch": 2.156595852944918, "ewc_loss": 0.03226947784423828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.226947956136428e-05, "grad_norm": 18.72126579284668, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8723532557487488, "num_tokens": 646786196.0, "step": 16953 }, { "epoch": 2.1567230632235086, "ewc_loss": 0.032210297882556915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.221029692213051e-05, "grad_norm": 18.662578582763672, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8670459389686584, "num_tokens": 646821462.0, "step": 16954 }, { "epoch": 2.156850273502099, "ewc_loss": 0.03216296061873436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216295954189263e-05, "grad_norm": 18.618860244750977, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.8798935413360596, "num_tokens": 646856613.0, "step": 16955 }, { "epoch": 2.1569774837806897, "ewc_loss": 0.032199010252952576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219901191187091e-05, "grad_norm": 18.632461547851562, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8837946653366089, "num_tokens": 646893013.0, "step": 16956 }, { "epoch": 2.15710469405928, "ewc_loss": 0.03219112753868103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2191128411795944e-05, "grad_norm": 18.626745223999023, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8778786659240723, "num_tokens": 646936180.0, "step": 16957 }, { "epoch": 2.1572319043378707, "ewc_loss": 0.032264526933431625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2264528272207826e-05, "grad_norm": 18.62620735168457, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8726733326911926, "num_tokens": 646979314.0, "step": 16958 }, { "epoch": 2.157359114616461, "ewc_loss": 0.03224498778581619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224498868803494e-05, "grad_norm": 18.672536849975586, "learning_rate": 1e-06, "loss": 0.3718, "mean_token_accuracy": 0.8833154439926147, "num_tokens": 647020136.0, "step": 16959 }, { "epoch": 2.1574863248950513, "ewc_loss": 0.03221755474805832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.221755468985066e-05, "grad_norm": 18.63897705078125, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.877410888671875, "num_tokens": 647052970.0, "step": 16960 }, { "epoch": 2.157613535173642, "ewc_loss": 0.0322016179561615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.220161670469679e-05, "grad_norm": 18.64259910583496, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8678953647613525, "num_tokens": 647093195.0, "step": 16961 }, { "epoch": 2.1577407454522324, "ewc_loss": 0.03220357373356819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2203573937295005e-05, "grad_norm": 18.58485221862793, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8734642267227173, "num_tokens": 647133800.0, "step": 16962 }, { "epoch": 2.157867955730823, "ewc_loss": 0.0322394073009491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.223940802854486e-05, "grad_norm": 18.654619216918945, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8571637868881226, "num_tokens": 647167704.0, "step": 16963 }, { "epoch": 2.1579951660094134, "ewc_loss": 0.0322534404695034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.225343971280381e-05, "grad_norm": 18.5871524810791, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8606555461883545, "num_tokens": 647208474.0, "step": 16964 }, { "epoch": 2.158122376288004, "ewc_loss": 0.03228751942515373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2287520298268646e-05, "grad_norm": 18.806028366088867, "learning_rate": 1e-06, "loss": 0.3571, "mean_token_accuracy": 0.8863369226455688, "num_tokens": 647242131.0, "step": 16965 }, { "epoch": 2.1582495865665945, "ewc_loss": 0.03229190409183502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.229190406273119e-05, "grad_norm": 18.631649017333984, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8546683192253113, "num_tokens": 647280952.0, "step": 16966 }, { "epoch": 2.158376796845185, "ewc_loss": 0.032184481620788574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.218448182451539e-05, "grad_norm": 18.733251571655273, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8680953979492188, "num_tokens": 647318411.0, "step": 16967 }, { "epoch": 2.1585040071237755, "ewc_loss": 0.032306820154190063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.230681977584027e-05, "grad_norm": 18.709684371948242, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8607833385467529, "num_tokens": 647361002.0, "step": 16968 }, { "epoch": 2.158631217402366, "ewc_loss": 0.03223239630460739, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.223239764338359e-05, "grad_norm": 18.75490951538086, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8768978118896484, "num_tokens": 647402995.0, "step": 16969 }, { "epoch": 2.1587584276809566, "ewc_loss": 0.03220910206437111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2209103665081784e-05, "grad_norm": 18.7126522064209, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8753212094306946, "num_tokens": 647441220.0, "step": 16970 }, { "epoch": 2.158885637959547, "ewc_loss": 0.03222109377384186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.222109444322996e-05, "grad_norm": 18.68147087097168, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8656744956970215, "num_tokens": 647476832.0, "step": 16971 }, { "epoch": 2.1590128482381377, "ewc_loss": 0.03217547386884689, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217547418898903e-05, "grad_norm": 18.689844131469727, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.875277042388916, "num_tokens": 647516855.0, "step": 16972 }, { "epoch": 2.159140058516728, "ewc_loss": 0.032206833362579346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.220683356630616e-05, "grad_norm": 18.63480567932129, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8698665499687195, "num_tokens": 647556913.0, "step": 16973 }, { "epoch": 2.1592672687953187, "ewc_loss": 0.032172948122024536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217294943169691e-05, "grad_norm": 18.66722869873047, "learning_rate": 1e-06, "loss": 0.3432, "mean_token_accuracy": 0.8896433115005493, "num_tokens": 647591242.0, "step": 16974 }, { "epoch": 2.1593944790739092, "ewc_loss": 0.03222094476222992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.222094528609887e-05, "grad_norm": 18.64133644104004, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8765029907226562, "num_tokens": 647632156.0, "step": 16975 }, { "epoch": 2.1595216893524998, "ewc_loss": 0.03216390684247017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216390541638248e-05, "grad_norm": 18.611595153808594, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8790522813796997, "num_tokens": 647669857.0, "step": 16976 }, { "epoch": 2.1596488996310903, "ewc_loss": 0.032169610261917114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2169609767151996e-05, "grad_norm": 18.66395378112793, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8610830307006836, "num_tokens": 647715580.0, "step": 16977 }, { "epoch": 2.159776109909681, "ewc_loss": 0.0322221964597702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.222219675080851e-05, "grad_norm": 18.688392639160156, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8645197153091431, "num_tokens": 647755917.0, "step": 16978 }, { "epoch": 2.1599033201882714, "ewc_loss": 0.03216863423585892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2168634788831696e-05, "grad_norm": 18.660587310791016, "learning_rate": 1e-06, "loss": 0.355, "mean_token_accuracy": 0.889281690120697, "num_tokens": 647802205.0, "step": 16979 }, { "epoch": 2.160030530466862, "ewc_loss": 0.03215732052922249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.215732067474164e-05, "grad_norm": 18.614900588989258, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.85945063829422, "num_tokens": 647846627.0, "step": 16980 }, { "epoch": 2.1601577407454524, "ewc_loss": 0.03218292072415352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2182921131607145e-05, "grad_norm": 18.706684112548828, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8637995719909668, "num_tokens": 647885033.0, "step": 16981 }, { "epoch": 2.160284951024043, "ewc_loss": 0.032268088310956955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.226808985345997e-05, "grad_norm": 18.65471839904785, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8626605272293091, "num_tokens": 647923370.0, "step": 16982 }, { "epoch": 2.1604121613026335, "ewc_loss": 0.032093942165374756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2093943445943296e-05, "grad_norm": 18.664085388183594, "learning_rate": 1e-06, "loss": 0.3716, "mean_token_accuracy": 0.8788175582885742, "num_tokens": 647963431.0, "step": 16983 }, { "epoch": 2.1605393715812236, "ewc_loss": 0.03219594061374664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2195941457757726e-05, "grad_norm": 18.62664031982422, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.865734338760376, "num_tokens": 648002854.0, "step": 16984 }, { "epoch": 2.160666581859814, "ewc_loss": 0.032083455473184586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.208345515304245e-05, "grad_norm": 18.64797592163086, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8612474203109741, "num_tokens": 648041082.0, "step": 16985 }, { "epoch": 2.1607937921384046, "ewc_loss": 0.032198261469602585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219826248823665e-05, "grad_norm": 18.68102264404297, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8655596375465393, "num_tokens": 648082109.0, "step": 16986 }, { "epoch": 2.160921002416995, "ewc_loss": 0.032142773270606995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.214277239749208e-05, "grad_norm": 18.673179626464844, "learning_rate": 1e-06, "loss": 0.3446, "mean_token_accuracy": 0.8870710730552673, "num_tokens": 648117398.0, "step": 16987 }, { "epoch": 2.1610482126955857, "ewc_loss": 0.03213947266340256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.213947275071405e-05, "grad_norm": 18.63270378112793, "learning_rate": 1e-06, "loss": 0.3627, "mean_token_accuracy": 0.8802711963653564, "num_tokens": 648158997.0, "step": 16988 }, { "epoch": 2.161175422974176, "ewc_loss": 0.03211743012070656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211743023712188e-05, "grad_norm": 18.626609802246094, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8701560497283936, "num_tokens": 648194334.0, "step": 16989 }, { "epoch": 2.1613026332527667, "ewc_loss": 0.03212631493806839, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.21263141813688e-05, "grad_norm": 18.629711151123047, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8693156242370605, "num_tokens": 648230601.0, "step": 16990 }, { "epoch": 2.1614298435313573, "ewc_loss": 0.0321846678853035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.218466736143455e-05, "grad_norm": 18.642471313476562, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.866644024848938, "num_tokens": 648274979.0, "step": 16991 }, { "epoch": 2.161557053809948, "ewc_loss": 0.03215028718113899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.215028846170753e-05, "grad_norm": 18.705093383789062, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.857601523399353, "num_tokens": 648319229.0, "step": 16992 }, { "epoch": 2.1616842640885383, "ewc_loss": 0.03216567635536194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216567711206153e-05, "grad_norm": 18.67743682861328, "learning_rate": 1e-06, "loss": 0.3585, "mean_token_accuracy": 0.8842135667800903, "num_tokens": 648354550.0, "step": 16993 }, { "epoch": 2.161811474367129, "ewc_loss": 0.03215107321739197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2151074265129864e-05, "grad_norm": 18.642539978027344, "learning_rate": 1e-06, "loss": 0.3632, "mean_token_accuracy": 0.8847500085830688, "num_tokens": 648389820.0, "step": 16994 }, { "epoch": 2.1619386846457194, "ewc_loss": 0.03215469419956207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.215469405404292e-05, "grad_norm": 18.68195343017578, "learning_rate": 1e-06, "loss": 0.3517, "mean_token_accuracy": 0.8870455026626587, "num_tokens": 648423841.0, "step": 16995 }, { "epoch": 2.16206589492431, "ewc_loss": 0.03217914700508118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217914854758419e-05, "grad_norm": 18.694772720336914, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8601154088973999, "num_tokens": 648459865.0, "step": 16996 }, { "epoch": 2.1621931052029004, "ewc_loss": 0.032198239117860794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219824066036381e-05, "grad_norm": 18.757198333740234, "learning_rate": 1e-06, "loss": 0.3335, "mean_token_accuracy": 0.8905340433120728, "num_tokens": 648492639.0, "step": 16997 }, { "epoch": 2.162320315481491, "ewc_loss": 0.0321689210832119, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2168922189157456e-05, "grad_norm": 18.662477493286133, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8778717517852783, "num_tokens": 648533456.0, "step": 16998 }, { "epoch": 2.1624475257600815, "ewc_loss": 0.032166920602321625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2166921300813556e-05, "grad_norm": 18.684280395507812, "learning_rate": 1e-06, "loss": 0.3567, "mean_token_accuracy": 0.8866615295410156, "num_tokens": 648571955.0, "step": 16999 }, { "epoch": 2.162574736038672, "ewc_loss": 0.03221384435892105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2213843951467425e-05, "grad_norm": 18.637479782104492, "learning_rate": 1e-06, "loss": 0.3424, "mean_token_accuracy": 0.8893494009971619, "num_tokens": 648614284.0, "step": 17000 }, { "epoch": 2.1627019463172625, "ewc_loss": 0.032189637422561646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2189636840485036e-05, "grad_norm": 18.713592529296875, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.8813914060592651, "num_tokens": 648654399.0, "step": 17001 }, { "epoch": 2.162829156595853, "ewc_loss": 0.03219195082783699, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219195059500635e-05, "grad_norm": 18.630319595336914, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8754451274871826, "num_tokens": 648694490.0, "step": 17002 }, { "epoch": 2.1629563668744436, "ewc_loss": 0.03212045878171921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212045703548938e-05, "grad_norm": 18.68438148498535, "learning_rate": 1e-06, "loss": 0.3738, "mean_token_accuracy": 0.8788854479789734, "num_tokens": 648730665.0, "step": 17003 }, { "epoch": 2.163083577153034, "ewc_loss": 0.03227191045880318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2271909731207415e-05, "grad_norm": 18.675785064697266, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8776858448982239, "num_tokens": 648765289.0, "step": 17004 }, { "epoch": 2.1632107874316246, "ewc_loss": 0.032134708017110825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.213470699847676e-05, "grad_norm": 18.64676856994629, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8731563091278076, "num_tokens": 648803982.0, "step": 17005 }, { "epoch": 2.163337997710215, "ewc_loss": 0.032257452607154846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.225745240342803e-05, "grad_norm": 18.726707458496094, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8791927695274353, "num_tokens": 648840992.0, "step": 17006 }, { "epoch": 2.1634652079888053, "ewc_loss": 0.03219687193632126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219687278033234e-05, "grad_norm": 18.71564292907715, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8690218329429626, "num_tokens": 648883621.0, "step": 17007 }, { "epoch": 2.1635924182673962, "ewc_loss": 0.03223589062690735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.22358901030384e-05, "grad_norm": 18.753847122192383, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8584496974945068, "num_tokens": 648922258.0, "step": 17008 }, { "epoch": 2.1637196285459863, "ewc_loss": 0.03211105987429619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.211106013623066e-05, "grad_norm": 18.629446029663086, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8725626468658447, "num_tokens": 648965010.0, "step": 17009 }, { "epoch": 2.163846838824577, "ewc_loss": 0.03217320144176483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2173200452234596e-05, "grad_norm": 18.67831039428711, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8684102892875671, "num_tokens": 649008124.0, "step": 17010 }, { "epoch": 2.1639740491031674, "ewc_loss": 0.032180771231651306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2180771086132154e-05, "grad_norm": 18.634897232055664, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.87303227186203, "num_tokens": 649044743.0, "step": 17011 }, { "epoch": 2.164101259381758, "ewc_loss": 0.032181818038225174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2181818824028596e-05, "grad_norm": 18.655900955200195, "learning_rate": 1e-06, "loss": 0.3536, "mean_token_accuracy": 0.88570237159729, "num_tokens": 649078410.0, "step": 17012 }, { "epoch": 2.1642284696603484, "ewc_loss": 0.03222961723804474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2229618227574974e-05, "grad_norm": 18.696672439575195, "learning_rate": 1e-06, "loss": 0.3679, "mean_token_accuracy": 0.8837851881980896, "num_tokens": 649115415.0, "step": 17013 }, { "epoch": 2.164355679938939, "ewc_loss": 0.0322098508477211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.220984945073724e-05, "grad_norm": 18.657081604003906, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8707731366157532, "num_tokens": 649156266.0, "step": 17014 }, { "epoch": 2.1644828902175295, "ewc_loss": 0.03216679021716118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.21667903335765e-05, "grad_norm": 18.70745277404785, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8795925378799438, "num_tokens": 649192552.0, "step": 17015 }, { "epoch": 2.16461010049612, "ewc_loss": 0.03216719254851341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.216719414922409e-05, "grad_norm": 18.628692626953125, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8755474090576172, "num_tokens": 649224163.0, "step": 17016 }, { "epoch": 2.1647373107747105, "ewc_loss": 0.03218458220362663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2184583687921986e-05, "grad_norm": 18.731552124023438, "learning_rate": 1e-06, "loss": 0.329, "mean_token_accuracy": 0.892070472240448, "num_tokens": 649259348.0, "step": 17017 }, { "epoch": 2.164864521053301, "ewc_loss": 0.03222234919667244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2222349545918405e-05, "grad_norm": 18.55497932434082, "learning_rate": 1e-06, "loss": 0.3469, "mean_token_accuracy": 0.8894020915031433, "num_tokens": 649296875.0, "step": 17018 }, { "epoch": 2.1649917313318916, "ewc_loss": 0.032175831496715546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217583071091212e-05, "grad_norm": 18.75907325744629, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8622410297393799, "num_tokens": 649333455.0, "step": 17019 }, { "epoch": 2.165118941610482, "ewc_loss": 0.03229793533682823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.229793583159335e-05, "grad_norm": 18.636037826538086, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8743115663528442, "num_tokens": 649364567.0, "step": 17020 }, { "epoch": 2.1652461518890727, "ewc_loss": 0.03215660899877548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.215660763089545e-05, "grad_norm": 18.704025268554688, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8574628829956055, "num_tokens": 649400579.0, "step": 17021 }, { "epoch": 2.165373362167663, "ewc_loss": 0.032287754118442535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.22877531289123e-05, "grad_norm": 18.730804443359375, "learning_rate": 1e-06, "loss": 0.354, "mean_token_accuracy": 0.8852179050445557, "num_tokens": 649432940.0, "step": 17022 }, { "epoch": 2.1655005724462537, "ewc_loss": 0.032214678823947906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2214680686593056e-05, "grad_norm": 18.714374542236328, "learning_rate": 1e-06, "loss": 0.3737, "mean_token_accuracy": 0.8766069412231445, "num_tokens": 649466695.0, "step": 17023 }, { "epoch": 2.1656277827248442, "ewc_loss": 0.03222539275884628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.222539453417994e-05, "grad_norm": 18.725830078125, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8714097738265991, "num_tokens": 649504578.0, "step": 17024 }, { "epoch": 2.1657549930034348, "ewc_loss": 0.032233718782663345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.223371822969057e-05, "grad_norm": 18.701107025146484, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8637375831604004, "num_tokens": 649541536.0, "step": 17025 }, { "epoch": 2.1658822032820253, "ewc_loss": 0.032206982374191284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.220698272343725e-05, "grad_norm": 18.678098678588867, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8705813884735107, "num_tokens": 649581280.0, "step": 17026 }, { "epoch": 2.166009413560616, "ewc_loss": 0.032251469790935516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2251471566269174e-05, "grad_norm": 18.73394012451172, "learning_rate": 1e-06, "loss": 0.3345, "mean_token_accuracy": 0.895301878452301, "num_tokens": 649619837.0, "step": 17027 }, { "epoch": 2.1661366238392064, "ewc_loss": 0.03227873891592026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2278738217428327e-05, "grad_norm": 18.706911087036133, "learning_rate": 1e-06, "loss": 0.3516, "mean_token_accuracy": 0.8847720623016357, "num_tokens": 649654708.0, "step": 17028 }, { "epoch": 2.166263834117797, "ewc_loss": 0.032154496759176254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.215449760318734e-05, "grad_norm": 18.611797332763672, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8534328937530518, "num_tokens": 649691709.0, "step": 17029 }, { "epoch": 2.1663910443963874, "ewc_loss": 0.03227698430418968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.227698471164331e-05, "grad_norm": 18.704345703125, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8699387907981873, "num_tokens": 649734876.0, "step": 17030 }, { "epoch": 2.166518254674978, "ewc_loss": 0.03225332871079445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.225332693546079e-05, "grad_norm": 18.731733322143555, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.882301390171051, "num_tokens": 649775477.0, "step": 17031 }, { "epoch": 2.166645464953568, "ewc_loss": 0.03223196789622307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2231968361884356e-05, "grad_norm": 18.703338623046875, "learning_rate": 1e-06, "loss": 0.3611, "mean_token_accuracy": 0.8850467205047607, "num_tokens": 649816136.0, "step": 17032 }, { "epoch": 2.1667726752321586, "ewc_loss": 0.03218488022685051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.218488200218417e-05, "grad_norm": 18.688941955566406, "learning_rate": 1e-06, "loss": 0.3594, "mean_token_accuracy": 0.8827250599861145, "num_tokens": 649847942.0, "step": 17033 }, { "epoch": 2.166899885510749, "ewc_loss": 0.03224724158644676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2247240596916527e-05, "grad_norm": 18.648597717285156, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8706830739974976, "num_tokens": 649887875.0, "step": 17034 }, { "epoch": 2.1670270957893396, "ewc_loss": 0.032242193818092346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224219472031109e-05, "grad_norm": 18.700992584228516, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8616777062416077, "num_tokens": 649920557.0, "step": 17035 }, { "epoch": 2.16715430606793, "ewc_loss": 0.032265555113554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2265554182231426e-05, "grad_norm": 18.6632022857666, "learning_rate": 1e-06, "loss": 0.3655, "mean_token_accuracy": 0.8823525905609131, "num_tokens": 649959704.0, "step": 17036 }, { "epoch": 2.1672815163465207, "ewc_loss": 0.03215983137488365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2159830880118534e-05, "grad_norm": 18.626039505004883, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8781227469444275, "num_tokens": 650000433.0, "step": 17037 }, { "epoch": 2.167408726625111, "ewc_loss": 0.03224268555641174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224268584745005e-05, "grad_norm": 18.611404418945312, "learning_rate": 1e-06, "loss": 0.361, "mean_token_accuracy": 0.8868272304534912, "num_tokens": 650038793.0, "step": 17038 }, { "epoch": 2.1675359369037017, "ewc_loss": 0.03220430016517639, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.220430153305642e-05, "grad_norm": 18.673189163208008, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.8831593990325928, "num_tokens": 650072877.0, "step": 17039 }, { "epoch": 2.1676631471822922, "ewc_loss": 0.03226172551512718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2261727028526366e-05, "grad_norm": 18.61503791809082, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8615540266036987, "num_tokens": 650109245.0, "step": 17040 }, { "epoch": 2.1677903574608828, "ewc_loss": 0.03223135322332382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.223135354346596e-05, "grad_norm": 18.71025276184082, "learning_rate": 1e-06, "loss": 0.3507, "mean_token_accuracy": 0.8869110345840454, "num_tokens": 650150254.0, "step": 17041 }, { "epoch": 2.1679175677394733, "ewc_loss": 0.03230365365743637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.23036547342781e-05, "grad_norm": 18.677295684814453, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8545024991035461, "num_tokens": 650189587.0, "step": 17042 }, { "epoch": 2.168044778018064, "ewc_loss": 0.03222697600722313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2226977054961026e-05, "grad_norm": 18.684160232543945, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8700717687606812, "num_tokens": 650225443.0, "step": 17043 }, { "epoch": 2.1681719882966544, "ewc_loss": 0.03228994086384773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228993955417536e-05, "grad_norm": 18.623632431030273, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8704651594161987, "num_tokens": 650262319.0, "step": 17044 }, { "epoch": 2.168299198575245, "ewc_loss": 0.03222081437706947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.222081431886181e-05, "grad_norm": 18.69028091430664, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8588892817497253, "num_tokens": 650297137.0, "step": 17045 }, { "epoch": 2.1684264088538354, "ewc_loss": 0.03235456347465515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.235456460970454e-05, "grad_norm": 18.633222579956055, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8727875351905823, "num_tokens": 650335521.0, "step": 17046 }, { "epoch": 2.168553619132426, "ewc_loss": 0.032211918383836746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.221191946067847e-05, "grad_norm": 18.695240020751953, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8683139085769653, "num_tokens": 650373050.0, "step": 17047 }, { "epoch": 2.1686808294110165, "ewc_loss": 0.032254938036203384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.225493856007233e-05, "grad_norm": 18.615829467773438, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.8808426260948181, "num_tokens": 650407438.0, "step": 17048 }, { "epoch": 2.168808039689607, "ewc_loss": 0.032215218991041183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2215219107456505e-05, "grad_norm": 18.62467384338379, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8690736889839172, "num_tokens": 650447793.0, "step": 17049 }, { "epoch": 2.1689352499681975, "ewc_loss": 0.03231533244252205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.231533264624886e-05, "grad_norm": 18.67035675048828, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8698018789291382, "num_tokens": 650487167.0, "step": 17050 }, { "epoch": 2.169062460246788, "ewc_loss": 0.0322553813457489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.22553823934868e-05, "grad_norm": 18.646053314208984, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8616107106208801, "num_tokens": 650522960.0, "step": 17051 }, { "epoch": 2.1691896705253786, "ewc_loss": 0.03224008157849312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224008105462417e-05, "grad_norm": 18.71515655517578, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8679620027542114, "num_tokens": 650564161.0, "step": 17052 }, { "epoch": 2.169316880803969, "ewc_loss": 0.03231048956513405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.231049049645662e-05, "grad_norm": 18.651872634887695, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.8804347515106201, "num_tokens": 650599201.0, "step": 17053 }, { "epoch": 2.1694440910825596, "ewc_loss": 0.03222053498029709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2220534194493666e-05, "grad_norm": 18.633094787597656, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8548803329467773, "num_tokens": 650637385.0, "step": 17054 }, { "epoch": 2.16957130136115, "ewc_loss": 0.03231623396277428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.231623486499302e-05, "grad_norm": 18.671058654785156, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8727636337280273, "num_tokens": 650671954.0, "step": 17055 }, { "epoch": 2.1696985116397407, "ewc_loss": 0.03232896327972412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2328964152839035e-05, "grad_norm": 18.713115692138672, "learning_rate": 1e-06, "loss": 0.3392, "mean_token_accuracy": 0.8905220627784729, "num_tokens": 650708475.0, "step": 17056 }, { "epoch": 2.169825721918331, "ewc_loss": 0.03226741403341293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.226741318940185e-05, "grad_norm": 18.64422035217285, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8773287534713745, "num_tokens": 650748089.0, "step": 17057 }, { "epoch": 2.1699529321969213, "ewc_loss": 0.03228430449962616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228430432500318e-05, "grad_norm": 18.665136337280273, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.876006007194519, "num_tokens": 650790869.0, "step": 17058 }, { "epoch": 2.170080142475512, "ewc_loss": 0.03232475742697716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.232475864933804e-05, "grad_norm": 18.727941513061523, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8614224195480347, "num_tokens": 650830697.0, "step": 17059 }, { "epoch": 2.1702073527541024, "ewc_loss": 0.032237373292446136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2237374398391694e-05, "grad_norm": 18.676292419433594, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.8818384408950806, "num_tokens": 650868755.0, "step": 17060 }, { "epoch": 2.170334563032693, "ewc_loss": 0.03225480392575264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.225480395485647e-05, "grad_norm": 18.65862274169922, "learning_rate": 1e-06, "loss": 0.3651, "mean_token_accuracy": 0.8846323490142822, "num_tokens": 650908019.0, "step": 17061 }, { "epoch": 2.1704617733112834, "ewc_loss": 0.03225267678499222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.225267573725432e-05, "grad_norm": 18.66110610961914, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8722744584083557, "num_tokens": 650954195.0, "step": 17062 }, { "epoch": 2.170588983589874, "ewc_loss": 0.03218600153923035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.218600249965675e-05, "grad_norm": 18.66752815246582, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.887224555015564, "num_tokens": 650992740.0, "step": 17063 }, { "epoch": 2.1707161938684645, "ewc_loss": 0.03226237744092941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2262378226732835e-05, "grad_norm": 18.608036041259766, "learning_rate": 1e-06, "loss": 0.35, "mean_token_accuracy": 0.8866056203842163, "num_tokens": 651031524.0, "step": 17064 }, { "epoch": 2.170843404147055, "ewc_loss": 0.03220783919095993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2207837648456916e-05, "grad_norm": 18.633020401000977, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8730189800262451, "num_tokens": 651070004.0, "step": 17065 }, { "epoch": 2.1709706144256455, "ewc_loss": 0.03228706493973732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228706555091776e-05, "grad_norm": 18.741249084472656, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.861304521560669, "num_tokens": 651104474.0, "step": 17066 }, { "epoch": 2.171097824704236, "ewc_loss": 0.032322514802217484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.232251401641406e-05, "grad_norm": 18.64590072631836, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8676525354385376, "num_tokens": 651148913.0, "step": 17067 }, { "epoch": 2.1712250349828266, "ewc_loss": 0.03220609575510025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.220609505660832e-05, "grad_norm": 18.76798439025879, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.8824855089187622, "num_tokens": 651192072.0, "step": 17068 }, { "epoch": 2.171352245261417, "ewc_loss": 0.032249387353658676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224938700441271e-05, "grad_norm": 18.629199981689453, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.865906834602356, "num_tokens": 651226132.0, "step": 17069 }, { "epoch": 2.1714794555400077, "ewc_loss": 0.03217983990907669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217983976355754e-05, "grad_norm": 18.666038513183594, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8769748210906982, "num_tokens": 651263467.0, "step": 17070 }, { "epoch": 2.171606665818598, "ewc_loss": 0.0323205329477787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2320531317964196e-05, "grad_norm": 18.665998458862305, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8649736642837524, "num_tokens": 651309521.0, "step": 17071 }, { "epoch": 2.1717338760971887, "ewc_loss": 0.032208092510700226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.220809230697341e-05, "grad_norm": 18.689027786254883, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.876982569694519, "num_tokens": 651347026.0, "step": 17072 }, { "epoch": 2.1718610863757792, "ewc_loss": 0.032248903065919876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224890315323137e-05, "grad_norm": 18.670045852661133, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8825708627700806, "num_tokens": 651380545.0, "step": 17073 }, { "epoch": 2.1719882966543698, "ewc_loss": 0.03221145272254944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2211453799391165e-05, "grad_norm": 18.760173797607422, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8795503377914429, "num_tokens": 651409428.0, "step": 17074 }, { "epoch": 2.1721155069329603, "ewc_loss": 0.03223263844847679, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.223263774998486e-05, "grad_norm": 18.684249877929688, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8614214062690735, "num_tokens": 651447993.0, "step": 17075 }, { "epoch": 2.172242717211551, "ewc_loss": 0.03219226375222206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2192263461183757e-05, "grad_norm": 18.684965133666992, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8684521913528442, "num_tokens": 651488210.0, "step": 17076 }, { "epoch": 2.1723699274901414, "ewc_loss": 0.032249268144369125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224926695111208e-05, "grad_norm": 18.693296432495117, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8701344728469849, "num_tokens": 651525742.0, "step": 17077 }, { "epoch": 2.172497137768732, "ewc_loss": 0.032250966876745224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.225096588721499e-05, "grad_norm": 18.658479690551758, "learning_rate": 1e-06, "loss": 0.378, "mean_token_accuracy": 0.8809057474136353, "num_tokens": 651565569.0, "step": 17078 }, { "epoch": 2.1726243480473224, "ewc_loss": 0.03223692998290062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.223693056497723e-05, "grad_norm": 18.78594207763672, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.869510293006897, "num_tokens": 651605291.0, "step": 17079 }, { "epoch": 2.172751558325913, "ewc_loss": 0.03225336968898773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2253370591206476e-05, "grad_norm": 18.62382698059082, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8741335272789001, "num_tokens": 651641998.0, "step": 17080 }, { "epoch": 2.1728787686045035, "ewc_loss": 0.032207150012254715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2207150070462376e-05, "grad_norm": 18.7633113861084, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8716379404067993, "num_tokens": 651683040.0, "step": 17081 }, { "epoch": 2.1730059788830935, "ewc_loss": 0.03232494369149208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.23249441862572e-05, "grad_norm": 18.725805282592773, "learning_rate": 1e-06, "loss": 0.3636, "mean_token_accuracy": 0.883169949054718, "num_tokens": 651718067.0, "step": 17082 }, { "epoch": 2.173133189161684, "ewc_loss": 0.032138142734766006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.213814125047065e-05, "grad_norm": 18.65786361694336, "learning_rate": 1e-06, "loss": 0.3474, "mean_token_accuracy": 0.8879753351211548, "num_tokens": 651761995.0, "step": 17083 }, { "epoch": 2.1732603994402746, "ewc_loss": 0.03228987753391266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228987770853564e-05, "grad_norm": 18.75229835510254, "learning_rate": 1e-06, "loss": 0.3538, "mean_token_accuracy": 0.8868699073791504, "num_tokens": 651797780.0, "step": 17084 }, { "epoch": 2.173387609718865, "ewc_loss": 0.03220758214592934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.220758298994042e-05, "grad_norm": 18.699888229370117, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.863959789276123, "num_tokens": 651832931.0, "step": 17085 }, { "epoch": 2.1735148199974557, "ewc_loss": 0.03222053498029709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2220534194493666e-05, "grad_norm": 18.776208877563477, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8681372404098511, "num_tokens": 651868440.0, "step": 17086 }, { "epoch": 2.173642030276046, "ewc_loss": 0.03227552771568298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2275525882141665e-05, "grad_norm": 18.74340057373047, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.876914381980896, "num_tokens": 651900029.0, "step": 17087 }, { "epoch": 2.1737692405546367, "ewc_loss": 0.03218419477343559, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.218419442418963e-05, "grad_norm": 18.736467361450195, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8702511787414551, "num_tokens": 651944892.0, "step": 17088 }, { "epoch": 2.1738964508332272, "ewc_loss": 0.032268673181533813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2268671930069104e-05, "grad_norm": 18.827306747436523, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.8817556500434875, "num_tokens": 651979330.0, "step": 17089 }, { "epoch": 2.1740236611118178, "ewc_loss": 0.03224586322903633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224586180294864e-05, "grad_norm": 18.67909812927246, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8663063049316406, "num_tokens": 652017389.0, "step": 17090 }, { "epoch": 2.1741508713904083, "ewc_loss": 0.032228983938694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.222898521926254e-05, "grad_norm": 18.799503326416016, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.869988203048706, "num_tokens": 652052260.0, "step": 17091 }, { "epoch": 2.174278081668999, "ewc_loss": 0.03227423503994942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.227423439966515e-05, "grad_norm": 18.68116569519043, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8593732714653015, "num_tokens": 652089917.0, "step": 17092 }, { "epoch": 2.1744052919475894, "ewc_loss": 0.03219054266810417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2190542697208e-05, "grad_norm": 18.737205505371094, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8770630955696106, "num_tokens": 652129849.0, "step": 17093 }, { "epoch": 2.17453250222618, "ewc_loss": 0.03227484971284866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2274849218083546e-05, "grad_norm": 18.66216278076172, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8658543229103088, "num_tokens": 652163178.0, "step": 17094 }, { "epoch": 2.1746597125047704, "ewc_loss": 0.032285116612911224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228511559427716e-05, "grad_norm": 18.665008544921875, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8790298104286194, "num_tokens": 652201521.0, "step": 17095 }, { "epoch": 2.174786922783361, "ewc_loss": 0.03228946402668953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228946297895163e-05, "grad_norm": 18.68115997314453, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.874366283416748, "num_tokens": 652234820.0, "step": 17096 }, { "epoch": 2.1749141330619515, "ewc_loss": 0.032368894666433334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2368894608225673e-05, "grad_norm": 18.76512908935547, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8708635568618774, "num_tokens": 652275686.0, "step": 17097 }, { "epoch": 2.175041343340542, "ewc_loss": 0.03236391767859459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236391785321757e-05, "grad_norm": 18.6270694732666, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8652337789535522, "num_tokens": 652313243.0, "step": 17098 }, { "epoch": 2.1751685536191325, "ewc_loss": 0.03227131813764572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.227131674066186e-05, "grad_norm": 18.7266788482666, "learning_rate": 1e-06, "loss": 0.3514, "mean_token_accuracy": 0.8883146047592163, "num_tokens": 652351550.0, "step": 17099 }, { "epoch": 2.175295763897723, "ewc_loss": 0.032330676913261414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2330677640857175e-05, "grad_norm": 18.63273811340332, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8713051080703735, "num_tokens": 652399834.0, "step": 17100 }, { "epoch": 2.1754229741763136, "ewc_loss": 0.03222770616412163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2227704650722444e-05, "grad_norm": 18.624174118041992, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8605728149414062, "num_tokens": 652435839.0, "step": 17101 }, { "epoch": 2.175550184454904, "ewc_loss": 0.03234115242958069, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.234115138184279e-05, "grad_norm": 18.688066482543945, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8704363107681274, "num_tokens": 652475747.0, "step": 17102 }, { "epoch": 2.1756773947334946, "ewc_loss": 0.032383568584918976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.238357021473348e-05, "grad_norm": 18.664613723754883, "learning_rate": 1e-06, "loss": 0.358, "mean_token_accuracy": 0.8853985667228699, "num_tokens": 652517147.0, "step": 17103 }, { "epoch": 2.175804605012085, "ewc_loss": 0.03236974775791168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236974953324534e-05, "grad_norm": 18.732595443725586, "learning_rate": 1e-06, "loss": 0.3557, "mean_token_accuracy": 0.8861167430877686, "num_tokens": 652552811.0, "step": 17104 }, { "epoch": 2.1759318152906753, "ewc_loss": 0.03236350044608116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2363499485654756e-05, "grad_norm": 18.756547927856445, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8818507790565491, "num_tokens": 652587709.0, "step": 17105 }, { "epoch": 2.1760590255692662, "ewc_loss": 0.032304517924785614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.230451693525538e-05, "grad_norm": 18.69914436340332, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8740114569664001, "num_tokens": 652626213.0, "step": 17106 }, { "epoch": 2.1761862358478563, "ewc_loss": 0.03233588486909866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.233588358853012e-05, "grad_norm": 18.754947662353516, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8445244431495667, "num_tokens": 652665692.0, "step": 17107 }, { "epoch": 2.176313446126447, "ewc_loss": 0.0323181189596653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2318119338015094e-05, "grad_norm": 18.705894470214844, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8658788800239563, "num_tokens": 652705515.0, "step": 17108 }, { "epoch": 2.1764406564050374, "ewc_loss": 0.03231175243854523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2311752875102684e-05, "grad_norm": 18.65883445739746, "learning_rate": 1e-06, "loss": 0.3543, "mean_token_accuracy": 0.8851289749145508, "num_tokens": 652743343.0, "step": 17109 }, { "epoch": 2.176567866683628, "ewc_loss": 0.032257337123155594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2257335988106206e-05, "grad_norm": 18.712749481201172, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8683310747146606, "num_tokens": 652784587.0, "step": 17110 }, { "epoch": 2.1766950769622184, "ewc_loss": 0.032334521412849426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.233452298445627e-05, "grad_norm": 18.730276107788086, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8651092648506165, "num_tokens": 652821985.0, "step": 17111 }, { "epoch": 2.176822287240809, "ewc_loss": 0.032243117690086365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224311876692809e-05, "grad_norm": 18.689939498901367, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.8831685781478882, "num_tokens": 652855772.0, "step": 17112 }, { "epoch": 2.1769494975193995, "ewc_loss": 0.03224770724773407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2247706258203834e-05, "grad_norm": 18.739803314208984, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8691019415855408, "num_tokens": 652890497.0, "step": 17113 }, { "epoch": 2.17707670779799, "ewc_loss": 0.03233460709452629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2334606657968834e-05, "grad_norm": 18.747562408447266, "learning_rate": 1e-06, "loss": 0.3618, "mean_token_accuracy": 0.8837715983390808, "num_tokens": 652931611.0, "step": 17114 }, { "epoch": 2.1772039180765805, "ewc_loss": 0.03225398808717728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.225398904760368e-05, "grad_norm": 18.677959442138672, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8744001388549805, "num_tokens": 652972129.0, "step": 17115 }, { "epoch": 2.177331128355171, "ewc_loss": 0.032260630279779434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.226063199690543e-05, "grad_norm": 18.65076446533203, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8673802614212036, "num_tokens": 653009848.0, "step": 17116 }, { "epoch": 2.1774583386337616, "ewc_loss": 0.03220493718981743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2204938179347664e-05, "grad_norm": 18.661151885986328, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.8707679510116577, "num_tokens": 653044614.0, "step": 17117 }, { "epoch": 2.177585548912352, "ewc_loss": 0.03239332512021065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2393323635915294e-05, "grad_norm": 18.734188079833984, "learning_rate": 1e-06, "loss": 0.3538, "mean_token_accuracy": 0.8869620561599731, "num_tokens": 653087614.0, "step": 17118 }, { "epoch": 2.1777127591909426, "ewc_loss": 0.03233160451054573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.233160532545298e-05, "grad_norm": 18.689586639404297, "learning_rate": 1e-06, "loss": 0.3563, "mean_token_accuracy": 0.8865362405776978, "num_tokens": 653120434.0, "step": 17119 }, { "epoch": 2.177839969469533, "ewc_loss": 0.03233756124973297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.233756069676019e-05, "grad_norm": 18.710433959960938, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8834649920463562, "num_tokens": 653159390.0, "step": 17120 }, { "epoch": 2.1779671797481237, "ewc_loss": 0.03237929567694664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.237929558963515e-05, "grad_norm": 18.748268127441406, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8660359382629395, "num_tokens": 653201045.0, "step": 17121 }, { "epoch": 2.1780943900267142, "ewc_loss": 0.03225308656692505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.225308682885952e-05, "grad_norm": 18.638296127319336, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8759344816207886, "num_tokens": 653244968.0, "step": 17122 }, { "epoch": 2.1782216003053048, "ewc_loss": 0.03233470767736435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.233470852137543e-05, "grad_norm": 18.7227840423584, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8664770126342773, "num_tokens": 653281157.0, "step": 17123 }, { "epoch": 2.1783488105838953, "ewc_loss": 0.03230782970786095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.230783113394864e-05, "grad_norm": 18.66707420349121, "learning_rate": 1e-06, "loss": 0.3572, "mean_token_accuracy": 0.8871808052062988, "num_tokens": 653321055.0, "step": 17124 }, { "epoch": 2.178476020862486, "ewc_loss": 0.03229736164212227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.229736103094183e-05, "grad_norm": 18.754064559936523, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8607731461524963, "num_tokens": 653359320.0, "step": 17125 }, { "epoch": 2.1786032311410763, "ewc_loss": 0.03231443464756012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.231443406548351e-05, "grad_norm": 18.627735137939453, "learning_rate": 1e-06, "loss": 0.3286, "mean_token_accuracy": 0.8928104639053345, "num_tokens": 653404186.0, "step": 17126 }, { "epoch": 2.178730441419667, "ewc_loss": 0.032237738370895386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.22377381962724e-05, "grad_norm": 18.690845489501953, "learning_rate": 1e-06, "loss": 0.3654, "mean_token_accuracy": 0.8840764760971069, "num_tokens": 653438122.0, "step": 17127 }, { "epoch": 2.1788576516982574, "ewc_loss": 0.03232172876596451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.232172821299173e-05, "grad_norm": 18.719449996948242, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8737665414810181, "num_tokens": 653474727.0, "step": 17128 }, { "epoch": 2.178984861976848, "ewc_loss": 0.032292187213897705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2292187825078145e-05, "grad_norm": 18.623294830322266, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8793057203292847, "num_tokens": 653513531.0, "step": 17129 }, { "epoch": 2.179112072255438, "ewc_loss": 0.03225110471248627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.225110413040966e-05, "grad_norm": 18.679567337036133, "learning_rate": 1e-06, "loss": 0.3599, "mean_token_accuracy": 0.8845028877258301, "num_tokens": 653555871.0, "step": 17130 }, { "epoch": 2.1792392825340285, "ewc_loss": 0.03233712911605835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.233712777728215e-05, "grad_norm": 18.729074478149414, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8760792016983032, "num_tokens": 653598825.0, "step": 17131 }, { "epoch": 2.179366492812619, "ewc_loss": 0.03222322091460228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.222322266083211e-05, "grad_norm": 18.673511505126953, "learning_rate": 1e-06, "loss": 0.3583, "mean_token_accuracy": 0.8828837275505066, "num_tokens": 653639838.0, "step": 17132 }, { "epoch": 2.1794937030912096, "ewc_loss": 0.032242003828287125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224200554541312e-05, "grad_norm": 18.696640014648438, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8649743795394897, "num_tokens": 653678958.0, "step": 17133 }, { "epoch": 2.1796209133698, "ewc_loss": 0.03221810981631279, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.221811130060814e-05, "grad_norm": 18.688369750976562, "learning_rate": 1e-06, "loss": 0.3653, "mean_token_accuracy": 0.8822673559188843, "num_tokens": 653718043.0, "step": 17134 }, { "epoch": 2.1797481236483907, "ewc_loss": 0.03223330155014992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.223329986212775e-05, "grad_norm": 18.684837341308594, "learning_rate": 1e-06, "loss": 0.3468, "mean_token_accuracy": 0.8887069225311279, "num_tokens": 653756308.0, "step": 17135 }, { "epoch": 2.179875333926981, "ewc_loss": 0.032233867794275284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.223386738682166e-05, "grad_norm": 18.709810256958008, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.8832802176475525, "num_tokens": 653793140.0, "step": 17136 }, { "epoch": 2.1800025442055717, "ewc_loss": 0.03217754140496254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2177540560951456e-05, "grad_norm": 18.723955154418945, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.8801737427711487, "num_tokens": 653823441.0, "step": 17137 }, { "epoch": 2.1801297544841622, "ewc_loss": 0.03219110518693924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.21911065839231e-05, "grad_norm": 18.649085998535156, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8645459413528442, "num_tokens": 653860743.0, "step": 17138 }, { "epoch": 2.1802569647627528, "ewc_loss": 0.03221810981631279, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.221811130060814e-05, "grad_norm": 18.73770523071289, "learning_rate": 1e-06, "loss": 0.3453, "mean_token_accuracy": 0.886832594871521, "num_tokens": 653894785.0, "step": 17139 }, { "epoch": 2.1803841750413433, "ewc_loss": 0.032189685851335526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.218968413420953e-05, "grad_norm": 18.62053108215332, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8710877895355225, "num_tokens": 653932208.0, "step": 17140 }, { "epoch": 2.180511385319934, "ewc_loss": 0.0322122760117054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.221227598260157e-05, "grad_norm": 18.746662139892578, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8604319095611572, "num_tokens": 653968818.0, "step": 17141 }, { "epoch": 2.1806385955985244, "ewc_loss": 0.03223522752523422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.223522799089551e-05, "grad_norm": 18.538297653198242, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8710517883300781, "num_tokens": 654010696.0, "step": 17142 }, { "epoch": 2.180765805877115, "ewc_loss": 0.03214061260223389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.214061143808067e-05, "grad_norm": 18.658531188964844, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8785127401351929, "num_tokens": 654047632.0, "step": 17143 }, { "epoch": 2.1808930161557054, "ewc_loss": 0.03234366327524185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2343661587219685e-05, "grad_norm": 18.715560913085938, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8786017894744873, "num_tokens": 654088200.0, "step": 17144 }, { "epoch": 2.181020226434296, "ewc_loss": 0.03232386335730553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.232386370655149e-05, "grad_norm": 18.687271118164062, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8716919422149658, "num_tokens": 654124225.0, "step": 17145 }, { "epoch": 2.1811474367128865, "ewc_loss": 0.032319292426109314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.231929076719098e-05, "grad_norm": 18.7261962890625, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8768202066421509, "num_tokens": 654163317.0, "step": 17146 }, { "epoch": 2.181274646991477, "ewc_loss": 0.03233420476317406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2334206480300054e-05, "grad_norm": 18.66487693786621, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8663265705108643, "num_tokens": 654199208.0, "step": 17147 }, { "epoch": 2.1814018572700675, "ewc_loss": 0.03228073939681053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228073910577223e-05, "grad_norm": 18.69084358215332, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8627091646194458, "num_tokens": 654239988.0, "step": 17148 }, { "epoch": 2.181529067548658, "ewc_loss": 0.03237933665513992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.237933560740203e-05, "grad_norm": 18.744462966918945, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8691503405570984, "num_tokens": 654276361.0, "step": 17149 }, { "epoch": 2.1816562778272486, "ewc_loss": 0.03233359381556511, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.233359529986046e-05, "grad_norm": 18.72210121154785, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8774043917655945, "num_tokens": 654316690.0, "step": 17150 }, { "epoch": 2.181783488105839, "ewc_loss": 0.03232869505882263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.232869494240731e-05, "grad_norm": 18.758777618408203, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.8830550909042358, "num_tokens": 654354112.0, "step": 17151 }, { "epoch": 2.1819106983844296, "ewc_loss": 0.03231671079993248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.231671144021675e-05, "grad_norm": 18.70245361328125, "learning_rate": 1e-06, "loss": 0.3609, "mean_token_accuracy": 0.8864425420761108, "num_tokens": 654398896.0, "step": 17152 }, { "epoch": 2.18203790866302, "ewc_loss": 0.03229781612753868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2297815778292716e-05, "grad_norm": 18.763031005859375, "learning_rate": 1e-06, "loss": 0.3569, "mean_token_accuracy": 0.8867381811141968, "num_tokens": 654435180.0, "step": 17153 }, { "epoch": 2.1821651189416107, "ewc_loss": 0.032237619161605835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.223761814297177e-05, "grad_norm": 18.646793365478516, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8628944754600525, "num_tokens": 654473958.0, "step": 17154 }, { "epoch": 2.1822923292202008, "ewc_loss": 0.03221515938639641, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.221516089979559e-05, "grad_norm": 18.695444107055664, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8603032827377319, "num_tokens": 654512918.0, "step": 17155 }, { "epoch": 2.1824195394987913, "ewc_loss": 0.03230784460902214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.230784568586387e-05, "grad_norm": 18.697851181030273, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.8825769424438477, "num_tokens": 654552296.0, "step": 17156 }, { "epoch": 2.182546749777382, "ewc_loss": 0.032315321266651154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.231532173231244e-05, "grad_norm": 18.74140739440918, "learning_rate": 1e-06, "loss": 0.3665, "mean_token_accuracy": 0.8811373114585876, "num_tokens": 654592088.0, "step": 17157 }, { "epoch": 2.1826739600559724, "ewc_loss": 0.03228038176894188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228038258384913e-05, "grad_norm": 18.677459716796875, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8671517372131348, "num_tokens": 654635119.0, "step": 17158 }, { "epoch": 2.182801170334563, "ewc_loss": 0.03228745236992836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228745117667131e-05, "grad_norm": 18.695621490478516, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8630528450012207, "num_tokens": 654671204.0, "step": 17159 }, { "epoch": 2.1829283806131534, "ewc_loss": 0.03230750188231468, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2307503715856e-05, "grad_norm": 18.696985244750977, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8724379539489746, "num_tokens": 654702986.0, "step": 17160 }, { "epoch": 2.183055590891744, "ewc_loss": 0.03231450542807579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.231450682505965e-05, "grad_norm": 18.667272567749023, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.850806713104248, "num_tokens": 654745813.0, "step": 17161 }, { "epoch": 2.1831828011703345, "ewc_loss": 0.03226352110505104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.226352055207826e-05, "grad_norm": 18.63829803466797, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8743513822555542, "num_tokens": 654783305.0, "step": 17162 }, { "epoch": 2.183310011448925, "ewc_loss": 0.03235255926847458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.235256008338183e-05, "grad_norm": 18.654098510742188, "learning_rate": 1e-06, "loss": 0.3685, "mean_token_accuracy": 0.8820806741714478, "num_tokens": 654823427.0, "step": 17163 }, { "epoch": 2.1834372217275155, "ewc_loss": 0.032350897789001465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.235089752706699e-05, "grad_norm": 18.66704559326172, "learning_rate": 1e-06, "loss": 0.3642, "mean_token_accuracy": 0.8872036337852478, "num_tokens": 654862400.0, "step": 17164 }, { "epoch": 2.183564432006106, "ewc_loss": 0.032354582101106644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2354582799598575e-05, "grad_norm": 18.68090057373047, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.878441333770752, "num_tokens": 654902484.0, "step": 17165 }, { "epoch": 2.1836916422846966, "ewc_loss": 0.032362308353185654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236230986658484e-05, "grad_norm": 18.644594192504883, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.8796929717063904, "num_tokens": 654946882.0, "step": 17166 }, { "epoch": 2.183818852563287, "ewc_loss": 0.03237967938184738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.237967757740989e-05, "grad_norm": 18.804990768432617, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8652933239936829, "num_tokens": 654977850.0, "step": 17167 }, { "epoch": 2.1839460628418776, "ewc_loss": 0.03240116685628891, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2401167118223384e-05, "grad_norm": 18.66592025756836, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8605045676231384, "num_tokens": 655015352.0, "step": 17168 }, { "epoch": 2.184073273120468, "ewc_loss": 0.032292138785123825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.229214053135365e-05, "grad_norm": 18.666839599609375, "learning_rate": 1e-06, "loss": 0.3914, "mean_token_accuracy": 0.8752279877662659, "num_tokens": 655047488.0, "step": 17169 }, { "epoch": 2.1842004833990587, "ewc_loss": 0.03242471441626549, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242471575504169e-05, "grad_norm": 18.73612403869629, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8711219429969788, "num_tokens": 655083410.0, "step": 17170 }, { "epoch": 2.1843276936776492, "ewc_loss": 0.03240714967250824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.240714795538224e-05, "grad_norm": 18.636398315429688, "learning_rate": 1e-06, "loss": 0.3613, "mean_token_accuracy": 0.8819851875305176, "num_tokens": 655120594.0, "step": 17171 }, { "epoch": 2.1844549039562398, "ewc_loss": 0.03235173597931862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.235173426219262e-05, "grad_norm": 18.757247924804688, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8725271224975586, "num_tokens": 655157310.0, "step": 17172 }, { "epoch": 2.1845821142348303, "ewc_loss": 0.03239715099334717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2397150789620355e-05, "grad_norm": 18.67669105529785, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.8834669589996338, "num_tokens": 655193749.0, "step": 17173 }, { "epoch": 2.184709324513421, "ewc_loss": 0.03230149671435356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.230149741284549e-05, "grad_norm": 18.69315528869629, "learning_rate": 1e-06, "loss": 0.3572, "mean_token_accuracy": 0.8883715867996216, "num_tokens": 655235532.0, "step": 17174 }, { "epoch": 2.1848365347920113, "ewc_loss": 0.03236936032772064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236936026951298e-05, "grad_norm": 18.765674591064453, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8796030282974243, "num_tokens": 655282154.0, "step": 17175 }, { "epoch": 2.184963745070602, "ewc_loss": 0.03233537822961807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.233537790947594e-05, "grad_norm": 18.702749252319336, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8679814338684082, "num_tokens": 655321342.0, "step": 17176 }, { "epoch": 2.1850909553491924, "ewc_loss": 0.03228766843676567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228766945539974e-05, "grad_norm": 18.763704299926758, "learning_rate": 1e-06, "loss": 0.3477, "mean_token_accuracy": 0.8918325304985046, "num_tokens": 655358107.0, "step": 17177 }, { "epoch": 2.185218165627783, "ewc_loss": 0.032388173043727875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.238817225792445e-05, "grad_norm": 18.702550888061523, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8688874244689941, "num_tokens": 655393462.0, "step": 17178 }, { "epoch": 2.1853453759063735, "ewc_loss": 0.032267700880765915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.226770058972761e-05, "grad_norm": 18.691823959350586, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8761584758758545, "num_tokens": 655436114.0, "step": 17179 }, { "epoch": 2.1854725861849635, "ewc_loss": 0.03231816738843918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2318166631739587e-05, "grad_norm": 18.673677444458008, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8774270415306091, "num_tokens": 655473119.0, "step": 17180 }, { "epoch": 2.185599796463554, "ewc_loss": 0.0323086716234684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.230867150705308e-05, "grad_norm": 18.70745086669922, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8684912919998169, "num_tokens": 655515537.0, "step": 17181 }, { "epoch": 2.1857270067421446, "ewc_loss": 0.03228520229458809, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228520290576853e-05, "grad_norm": 18.621440887451172, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8685020208358765, "num_tokens": 655555944.0, "step": 17182 }, { "epoch": 2.185854217020735, "ewc_loss": 0.0323396734893322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.233967436244711e-05, "grad_norm": 18.67771339416504, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8575365543365479, "num_tokens": 655589952.0, "step": 17183 }, { "epoch": 2.1859814272993257, "ewc_loss": 0.032393187284469604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.239318903069943e-05, "grad_norm": 18.650875091552734, "learning_rate": 1e-06, "loss": 0.3514, "mean_token_accuracy": 0.886866569519043, "num_tokens": 655628269.0, "step": 17184 }, { "epoch": 2.186108637577916, "ewc_loss": 0.03235068917274475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2350690162274987e-05, "grad_norm": 18.710847854614258, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8771302700042725, "num_tokens": 655657315.0, "step": 17185 }, { "epoch": 2.1862358478565067, "ewc_loss": 0.032380763441324234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2380761695094407e-05, "grad_norm": 18.6633243560791, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8738337159156799, "num_tokens": 655696117.0, "step": 17186 }, { "epoch": 2.1863630581350972, "ewc_loss": 0.03243770822882652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.243770697736181e-05, "grad_norm": 18.810651779174805, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8571211099624634, "num_tokens": 655738823.0, "step": 17187 }, { "epoch": 2.1864902684136878, "ewc_loss": 0.03236031159758568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2360312616219744e-05, "grad_norm": 18.59566307067871, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.865448534488678, "num_tokens": 655781457.0, "step": 17188 }, { "epoch": 2.1866174786922783, "ewc_loss": 0.03234047070145607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2340471079805866e-05, "grad_norm": 18.705623626708984, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.869186282157898, "num_tokens": 655824704.0, "step": 17189 }, { "epoch": 2.186744688970869, "ewc_loss": 0.03246085345745087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.246085179853253e-05, "grad_norm": 18.79293441772461, "learning_rate": 1e-06, "loss": 0.3601, "mean_token_accuracy": 0.8837629556655884, "num_tokens": 655860013.0, "step": 17190 }, { "epoch": 2.1868718992494594, "ewc_loss": 0.032325662672519684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2325660868082196e-05, "grad_norm": 18.72827911376953, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8721827268600464, "num_tokens": 655897146.0, "step": 17191 }, { "epoch": 2.18699910952805, "ewc_loss": 0.03234551101922989, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.234550968045369e-05, "grad_norm": 18.772045135498047, "learning_rate": 1e-06, "loss": 0.3798, "mean_token_accuracy": 0.8802874088287354, "num_tokens": 655934299.0, "step": 17192 }, { "epoch": 2.1871263198066404, "ewc_loss": 0.03236265107989311, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2362651836592704e-05, "grad_norm": 18.789588928222656, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8781219720840454, "num_tokens": 655976554.0, "step": 17193 }, { "epoch": 2.187253530085231, "ewc_loss": 0.03233466297388077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.233466122765094e-05, "grad_norm": 18.783344268798828, "learning_rate": 1e-06, "loss": 0.3569, "mean_token_accuracy": 0.8853592872619629, "num_tokens": 656011756.0, "step": 17194 }, { "epoch": 2.1873807403638215, "ewc_loss": 0.03228402137756348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2284020562656224e-05, "grad_norm": 18.711400985717773, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8716374635696411, "num_tokens": 656054405.0, "step": 17195 }, { "epoch": 2.187507950642412, "ewc_loss": 0.03222013637423515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.222013765480369e-05, "grad_norm": 18.7323055267334, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.8782565593719482, "num_tokens": 656094668.0, "step": 17196 }, { "epoch": 2.1876351609210025, "ewc_loss": 0.0322469025850296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224690226488747e-05, "grad_norm": 18.709758758544922, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8748521208763123, "num_tokens": 656137664.0, "step": 17197 }, { "epoch": 2.187762371199593, "ewc_loss": 0.03222128748893738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2221287256106734e-05, "grad_norm": 18.763635635375977, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8670746088027954, "num_tokens": 656180611.0, "step": 17198 }, { "epoch": 2.1878895814781836, "ewc_loss": 0.032297395169734955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.229739377275109e-05, "grad_norm": 18.77604103088379, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8663105964660645, "num_tokens": 656217789.0, "step": 17199 }, { "epoch": 2.188016791756774, "ewc_loss": 0.032274194061756134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.227419438189827e-05, "grad_norm": 18.826656341552734, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8714629411697388, "num_tokens": 656261940.0, "step": 17200 }, { "epoch": 2.1881440020353646, "ewc_loss": 0.03221164643764496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.221164661226794e-05, "grad_norm": 18.695466995239258, "learning_rate": 1e-06, "loss": 0.3588, "mean_token_accuracy": 0.8837776184082031, "num_tokens": 656296000.0, "step": 17201 }, { "epoch": 2.188271212313955, "ewc_loss": 0.03219134733080864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219134669052437e-05, "grad_norm": 18.76470375061035, "learning_rate": 1e-06, "loss": 0.3667, "mean_token_accuracy": 0.8842894434928894, "num_tokens": 656340862.0, "step": 17202 }, { "epoch": 2.1883984225925452, "ewc_loss": 0.0322488471865654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224884858354926e-05, "grad_norm": 18.715200424194336, "learning_rate": 1e-06, "loss": 0.3222, "mean_token_accuracy": 0.8975679874420166, "num_tokens": 656376923.0, "step": 17203 }, { "epoch": 2.188525632871136, "ewc_loss": 0.03212472051382065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.212472074665129e-05, "grad_norm": 18.726364135742188, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8710476756095886, "num_tokens": 656414058.0, "step": 17204 }, { "epoch": 2.1886528431497263, "ewc_loss": 0.03221135959029198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.221135921194218e-05, "grad_norm": 18.649343490600586, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8661400675773621, "num_tokens": 656450077.0, "step": 17205 }, { "epoch": 2.188780053428317, "ewc_loss": 0.032217662781476974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.221766382921487e-05, "grad_norm": 18.782384872436523, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8658468723297119, "num_tokens": 656487258.0, "step": 17206 }, { "epoch": 2.1889072637069074, "ewc_loss": 0.03224975988268852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224976171622984e-05, "grad_norm": 18.738876342773438, "learning_rate": 1e-06, "loss": 0.3587, "mean_token_accuracy": 0.8889394998550415, "num_tokens": 656525141.0, "step": 17207 }, { "epoch": 2.189034473985498, "ewc_loss": 0.032145753502845764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.214575190213509e-05, "grad_norm": 18.762977600097656, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.8770537972450256, "num_tokens": 656561517.0, "step": 17208 }, { "epoch": 2.1891616842640884, "ewc_loss": 0.03222173824906349, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2221738365478814e-05, "grad_norm": 18.734994888305664, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8803697824478149, "num_tokens": 656595859.0, "step": 17209 }, { "epoch": 2.189288894542679, "ewc_loss": 0.032198093831539154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2198095141211525e-05, "grad_norm": 18.797948837280273, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8709368109703064, "num_tokens": 656627652.0, "step": 17210 }, { "epoch": 2.1894161048212695, "ewc_loss": 0.03226293995976448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.226293847546913e-05, "grad_norm": 18.700780868530273, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8807270526885986, "num_tokens": 656669951.0, "step": 17211 }, { "epoch": 2.18954331509986, "ewc_loss": 0.032189808785915375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.218980782548897e-05, "grad_norm": 18.782045364379883, "learning_rate": 1e-06, "loss": 0.3561, "mean_token_accuracy": 0.8857483863830566, "num_tokens": 656702216.0, "step": 17212 }, { "epoch": 2.1896705253784505, "ewc_loss": 0.03224180266261101, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2241801818599924e-05, "grad_norm": 18.685375213623047, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8647767305374146, "num_tokens": 656744816.0, "step": 17213 }, { "epoch": 2.189797735657041, "ewc_loss": 0.032194238156080246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.219423888367601e-05, "grad_norm": 18.761117935180664, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8696900606155396, "num_tokens": 656785533.0, "step": 17214 }, { "epoch": 2.1899249459356316, "ewc_loss": 0.03227442130446434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.227441993658431e-05, "grad_norm": 18.691238403320312, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8732101917266846, "num_tokens": 656825375.0, "step": 17215 }, { "epoch": 2.190052156214222, "ewc_loss": 0.032177627086639404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.217762787244283e-05, "grad_norm": 18.70857810974121, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8826515078544617, "num_tokens": 656857951.0, "step": 17216 }, { "epoch": 2.1901793664928126, "ewc_loss": 0.03228697180747986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2286970963468775e-05, "grad_norm": 18.71149444580078, "learning_rate": 1e-06, "loss": 0.3615, "mean_token_accuracy": 0.8829034566879272, "num_tokens": 656896911.0, "step": 17217 }, { "epoch": 2.190306576771403, "ewc_loss": 0.03224993497133255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224993633921258e-05, "grad_norm": 18.72127342224121, "learning_rate": 1e-06, "loss": 0.3645, "mean_token_accuracy": 0.8806124925613403, "num_tokens": 656935723.0, "step": 17218 }, { "epoch": 2.1904337870499937, "ewc_loss": 0.032254625111818314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.225462569389492e-05, "grad_norm": 18.73001480102539, "learning_rate": 1e-06, "loss": 0.352, "mean_token_accuracy": 0.8853659629821777, "num_tokens": 656970881.0, "step": 17219 }, { "epoch": 2.1905609973285842, "ewc_loss": 0.032286617904901505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228661807952449e-05, "grad_norm": 18.724754333496094, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8704055547714233, "num_tokens": 657007721.0, "step": 17220 }, { "epoch": 2.1906882076071748, "ewc_loss": 0.03226688504219055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.226688568247482e-05, "grad_norm": 18.746965408325195, "learning_rate": 1e-06, "loss": 0.3421, "mean_token_accuracy": 0.8852612972259521, "num_tokens": 657042359.0, "step": 17221 }, { "epoch": 2.1908154178857653, "ewc_loss": 0.03231636807322502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.231636947020888e-05, "grad_norm": 18.759235382080078, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8799633979797363, "num_tokens": 657084662.0, "step": 17222 }, { "epoch": 2.190942628164356, "ewc_loss": 0.03225379437208176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2253796234726906e-05, "grad_norm": 18.758901596069336, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8761085271835327, "num_tokens": 657122002.0, "step": 17223 }, { "epoch": 2.1910698384429463, "ewc_loss": 0.032239437103271484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2239437132375315e-05, "grad_norm": 18.647764205932617, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8755790591239929, "num_tokens": 657158768.0, "step": 17224 }, { "epoch": 2.191197048721537, "ewc_loss": 0.032297007739543915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.229700814699754e-05, "grad_norm": 18.729211807250977, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8630090355873108, "num_tokens": 657193445.0, "step": 17225 }, { "epoch": 2.1913242590001274, "ewc_loss": 0.0323343463242054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2334344723494723e-05, "grad_norm": 18.69504165649414, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8740487694740295, "num_tokens": 657234922.0, "step": 17226 }, { "epoch": 2.191451469278718, "ewc_loss": 0.03239991515874863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2399915653513744e-05, "grad_norm": 18.7331485748291, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8657965064048767, "num_tokens": 657270774.0, "step": 17227 }, { "epoch": 2.191578679557308, "ewc_loss": 0.03231099620461464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.231099617551081e-05, "grad_norm": 18.645477294921875, "learning_rate": 1e-06, "loss": 0.3406, "mean_token_accuracy": 0.8886306285858154, "num_tokens": 657307046.0, "step": 17228 }, { "epoch": 2.1917058898358985, "ewc_loss": 0.032355375587940216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.235537587897852e-05, "grad_norm": 18.66948890686035, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8647946119308472, "num_tokens": 657352865.0, "step": 17229 }, { "epoch": 2.191833100114489, "ewc_loss": 0.03237345442175865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.237345299567096e-05, "grad_norm": 18.70370101928711, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8546254634857178, "num_tokens": 657393372.0, "step": 17230 }, { "epoch": 2.1919603103930796, "ewc_loss": 0.032445743680000305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.244574327254668e-05, "grad_norm": 18.685928344726562, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8551235198974609, "num_tokens": 657432445.0, "step": 17231 }, { "epoch": 2.19208752067167, "ewc_loss": 0.03238727152347565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2387270039180294e-05, "grad_norm": 18.739784240722656, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.872961163520813, "num_tokens": 657469145.0, "step": 17232 }, { "epoch": 2.1922147309502606, "ewc_loss": 0.03242526575922966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242526508984156e-05, "grad_norm": 18.697389602661133, "learning_rate": 1e-06, "loss": 0.3543, "mean_token_accuracy": 0.8894395232200623, "num_tokens": 657510369.0, "step": 17233 }, { "epoch": 2.192341941228851, "ewc_loss": 0.03230179846286774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.230179936508648e-05, "grad_norm": 18.65656089782715, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8809462785720825, "num_tokens": 657550257.0, "step": 17234 }, { "epoch": 2.1924691515074417, "ewc_loss": 0.03241298347711563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.241298327338882e-05, "grad_norm": 18.661121368408203, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8768112659454346, "num_tokens": 657587095.0, "step": 17235 }, { "epoch": 2.1925963617860322, "ewc_loss": 0.032343361526727676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2343363272957504e-05, "grad_norm": 18.608617782592773, "learning_rate": 1e-06, "loss": 0.3844, "mean_token_accuracy": 0.8787164688110352, "num_tokens": 657619710.0, "step": 17236 }, { "epoch": 2.1927235720646228, "ewc_loss": 0.03237060829997063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2370608096243814e-05, "grad_norm": 18.74117660522461, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8701616525650024, "num_tokens": 657653316.0, "step": 17237 }, { "epoch": 2.1928507823432133, "ewc_loss": 0.03239716589450836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.239716534153558e-05, "grad_norm": 18.674928665161133, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8700942993164062, "num_tokens": 657693570.0, "step": 17238 }, { "epoch": 2.192977992621804, "ewc_loss": 0.03236238285899162, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236238262616098e-05, "grad_norm": 18.713638305664062, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8757977485656738, "num_tokens": 657736781.0, "step": 17239 }, { "epoch": 2.1931052029003943, "ewc_loss": 0.0324159599840641, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.241595914005302e-05, "grad_norm": 18.741016387939453, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8662331104278564, "num_tokens": 657775603.0, "step": 17240 }, { "epoch": 2.193232413178985, "ewc_loss": 0.03241029009222984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.241029116907157e-05, "grad_norm": 18.7087459564209, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8712027668952942, "num_tokens": 657809065.0, "step": 17241 }, { "epoch": 2.1933596234575754, "ewc_loss": 0.03232288360595703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2322885090252385e-05, "grad_norm": 18.672773361206055, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8720577955245972, "num_tokens": 657851369.0, "step": 17242 }, { "epoch": 2.193486833736166, "ewc_loss": 0.03239278495311737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2392785215051845e-05, "grad_norm": 18.707494735717773, "learning_rate": 1e-06, "loss": 0.3325, "mean_token_accuracy": 0.8953738212585449, "num_tokens": 657889008.0, "step": 17243 }, { "epoch": 2.1936140440147565, "ewc_loss": 0.03233792632818222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.23379244946409e-05, "grad_norm": 18.744365692138672, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8723669052124023, "num_tokens": 657925867.0, "step": 17244 }, { "epoch": 2.193741254293347, "ewc_loss": 0.03239735960960388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.239735815441236e-05, "grad_norm": 18.774869918823242, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8648521304130554, "num_tokens": 657959502.0, "step": 17245 }, { "epoch": 2.1938684645719375, "ewc_loss": 0.03235459700226784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.23545973515138e-05, "grad_norm": 18.723356246948242, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8779886364936829, "num_tokens": 657999063.0, "step": 17246 }, { "epoch": 2.193995674850528, "ewc_loss": 0.03235345333814621, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2353455026168376e-05, "grad_norm": 18.73491096496582, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8648192286491394, "num_tokens": 658033016.0, "step": 17247 }, { "epoch": 2.1941228851291186, "ewc_loss": 0.03232632577419281, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.232632661820389e-05, "grad_norm": 18.700584411621094, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.881045937538147, "num_tokens": 658071259.0, "step": 17248 }, { "epoch": 2.194250095407709, "ewc_loss": 0.03236865997314453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236865813960321e-05, "grad_norm": 18.70896339416504, "learning_rate": 1e-06, "loss": 0.3811, "mean_token_accuracy": 0.87629634141922, "num_tokens": 658103190.0, "step": 17249 }, { "epoch": 2.1943773056862996, "ewc_loss": 0.03235926106572151, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2359261240344495e-05, "grad_norm": 18.636402130126953, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8675018548965454, "num_tokens": 658143346.0, "step": 17250 }, { "epoch": 2.19450451596489, "ewc_loss": 0.032340291887521744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.234029281884432e-05, "grad_norm": 18.743513107299805, "learning_rate": 1e-06, "loss": 0.3626, "mean_token_accuracy": 0.883797824382782, "num_tokens": 658181747.0, "step": 17251 }, { "epoch": 2.1946317262434807, "ewc_loss": 0.03241649642586708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.241649756091647e-05, "grad_norm": 18.66108512878418, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8752477169036865, "num_tokens": 658221770.0, "step": 17252 }, { "epoch": 2.1947589365220708, "ewc_loss": 0.03231414407491684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.231414302717894e-05, "grad_norm": 18.688222885131836, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8787994980812073, "num_tokens": 658260157.0, "step": 17253 }, { "epoch": 2.1948861468006613, "ewc_loss": 0.032422374933958054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242237653466873e-05, "grad_norm": 18.692134857177734, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.864425778388977, "num_tokens": 658301815.0, "step": 17254 }, { "epoch": 2.195013357079252, "ewc_loss": 0.03240274637937546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.240274600102566e-05, "grad_norm": 18.7860050201416, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.876446008682251, "num_tokens": 658340732.0, "step": 17255 }, { "epoch": 2.1951405673578424, "ewc_loss": 0.032432664185762405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2432664738735184e-05, "grad_norm": 18.72705841064453, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8691656589508057, "num_tokens": 658380569.0, "step": 17256 }, { "epoch": 2.195267777636433, "ewc_loss": 0.03233838453888893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2338382879970595e-05, "grad_norm": 18.693218231201172, "learning_rate": 1e-06, "loss": 0.3514, "mean_token_accuracy": 0.8859532475471497, "num_tokens": 658415842.0, "step": 17257 }, { "epoch": 2.1953949879150234, "ewc_loss": 0.03239823132753372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.239823126932606e-05, "grad_norm": 18.708843231201172, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8786144256591797, "num_tokens": 658452861.0, "step": 17258 }, { "epoch": 2.195522198193614, "ewc_loss": 0.032373473048210144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.23734748235438e-05, "grad_norm": 18.77524185180664, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8772449493408203, "num_tokens": 658487019.0, "step": 17259 }, { "epoch": 2.1956494084722045, "ewc_loss": 0.03236030042171478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236030170228332e-05, "grad_norm": 18.66368865966797, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8726636171340942, "num_tokens": 658529290.0, "step": 17260 }, { "epoch": 2.195776618750795, "ewc_loss": 0.032331038266420364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.233103780075908e-05, "grad_norm": 18.703401565551758, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8697293400764465, "num_tokens": 658572118.0, "step": 17261 }, { "epoch": 2.1959038290293855, "ewc_loss": 0.032372016459703445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.237201599404216e-05, "grad_norm": 18.683534622192383, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8715997934341431, "num_tokens": 658615972.0, "step": 17262 }, { "epoch": 2.196031039307976, "ewc_loss": 0.03235175833106041, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.235175972804427e-05, "grad_norm": 18.72893714904785, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.8824632167816162, "num_tokens": 658653848.0, "step": 17263 }, { "epoch": 2.1961582495865666, "ewc_loss": 0.03235011175274849, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.235011172364466e-05, "grad_norm": 18.70240592956543, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8739672899246216, "num_tokens": 658686247.0, "step": 17264 }, { "epoch": 2.196285459865157, "ewc_loss": 0.03234635293483734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.234635369153693e-05, "grad_norm": 18.65935516357422, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8721252083778381, "num_tokens": 658725787.0, "step": 17265 }, { "epoch": 2.1964126701437476, "ewc_loss": 0.03239909186959267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.239909347030334e-05, "grad_norm": 18.737464904785156, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8713284730911255, "num_tokens": 658765957.0, "step": 17266 }, { "epoch": 2.196539880422338, "ewc_loss": 0.03243054077029228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.243054015911184e-05, "grad_norm": 18.7076358795166, "learning_rate": 1e-06, "loss": 0.3553, "mean_token_accuracy": 0.8843531012535095, "num_tokens": 658800996.0, "step": 17267 }, { "epoch": 2.1966670907009287, "ewc_loss": 0.03236446902155876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236447082599625e-05, "grad_norm": 18.7518367767334, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8723289966583252, "num_tokens": 658833779.0, "step": 17268 }, { "epoch": 2.196794300979519, "ewc_loss": 0.03239115700125694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.239115540054627e-05, "grad_norm": 18.725353240966797, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8787174820899963, "num_tokens": 658874425.0, "step": 17269 }, { "epoch": 2.1969215112581097, "ewc_loss": 0.03236748278141022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236748307244852e-05, "grad_norm": 18.701486587524414, "learning_rate": 1e-06, "loss": 0.3559, "mean_token_accuracy": 0.884075403213501, "num_tokens": 658912007.0, "step": 17270 }, { "epoch": 2.1970487215367003, "ewc_loss": 0.03230689465999603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2306896173395216e-05, "grad_norm": 18.722209930419922, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8651174306869507, "num_tokens": 658956775.0, "step": 17271 }, { "epoch": 2.197175931815291, "ewc_loss": 0.03235770761966705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2357707823393866e-05, "grad_norm": 18.73821258544922, "learning_rate": 1e-06, "loss": 0.3557, "mean_token_accuracy": 0.8876796960830688, "num_tokens": 658997132.0, "step": 17272 }, { "epoch": 2.1973031420938813, "ewc_loss": 0.032289646565914154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228964487789199e-05, "grad_norm": 18.73250389099121, "learning_rate": 1e-06, "loss": 0.3686, "mean_token_accuracy": 0.881829023361206, "num_tokens": 659040437.0, "step": 17273 }, { "epoch": 2.197430352372472, "ewc_loss": 0.032337795943021774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2337797165382653e-05, "grad_norm": 18.823318481445312, "learning_rate": 1e-06, "loss": 0.3458, "mean_token_accuracy": 0.8891153335571289, "num_tokens": 659074461.0, "step": 17274 }, { "epoch": 2.1975575626510624, "ewc_loss": 0.03230719268321991, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.230719084967859e-05, "grad_norm": 18.68259048461914, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8721374869346619, "num_tokens": 659113818.0, "step": 17275 }, { "epoch": 2.197684772929653, "ewc_loss": 0.03224983438849449, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.224983447580598e-05, "grad_norm": 18.673662185668945, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8621089458465576, "num_tokens": 659149297.0, "step": 17276 }, { "epoch": 2.1978119832082434, "ewc_loss": 0.032312020659446716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.231202208553441e-05, "grad_norm": 18.75429344177246, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8777868151664734, "num_tokens": 659187500.0, "step": 17277 }, { "epoch": 2.1979391934868335, "ewc_loss": 0.03234022110700607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2340220059268177e-05, "grad_norm": 18.763107299804688, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8496875166893005, "num_tokens": 659228624.0, "step": 17278 }, { "epoch": 2.198066403765424, "ewc_loss": 0.032324615865945816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.232461676816456e-05, "grad_norm": 18.730363845825195, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.8821990489959717, "num_tokens": 659270317.0, "step": 17279 }, { "epoch": 2.1981936140440146, "ewc_loss": 0.03233848884701729, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2338488381356e-05, "grad_norm": 18.783010482788086, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.875, "num_tokens": 659302133.0, "step": 17280 }, { "epoch": 2.198320824322605, "ewc_loss": 0.032365601509809494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2365602237405255e-05, "grad_norm": 18.7811222076416, "learning_rate": 1e-06, "loss": 0.3664, "mean_token_accuracy": 0.8802475929260254, "num_tokens": 659342469.0, "step": 17281 }, { "epoch": 2.1984480346011956, "ewc_loss": 0.032326385378837585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.232638482586481e-05, "grad_norm": 18.726566314697266, "learning_rate": 1e-06, "loss": 0.3515, "mean_token_accuracy": 0.886449933052063, "num_tokens": 659373786.0, "step": 17282 }, { "epoch": 2.198575244879786, "ewc_loss": 0.03236962854862213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2369629479944706e-05, "grad_norm": 18.82411766052246, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8678128719329834, "num_tokens": 659412498.0, "step": 17283 }, { "epoch": 2.1987024551583767, "ewc_loss": 0.03239520266652107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2395200832979754e-05, "grad_norm": 18.738059997558594, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8605630397796631, "num_tokens": 659446415.0, "step": 17284 }, { "epoch": 2.1988296654369672, "ewc_loss": 0.032300516963005066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.230051515856758e-05, "grad_norm": 18.673084259033203, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8674131631851196, "num_tokens": 659488824.0, "step": 17285 }, { "epoch": 2.1989568757155578, "ewc_loss": 0.0323631577193737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236315751564689e-05, "grad_norm": 18.739988327026367, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8783798217773438, "num_tokens": 659523036.0, "step": 17286 }, { "epoch": 2.1990840859941483, "ewc_loss": 0.032378096133470535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.237809505662881e-05, "grad_norm": 18.752429962158203, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8694043159484863, "num_tokens": 659562780.0, "step": 17287 }, { "epoch": 2.199211296272739, "ewc_loss": 0.03242417797446251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242417733417824e-05, "grad_norm": 18.782102584838867, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.8824383020401001, "num_tokens": 659599364.0, "step": 17288 }, { "epoch": 2.1993385065513293, "ewc_loss": 0.03240499272942543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2404994271928445e-05, "grad_norm": 18.753734588623047, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8685806393623352, "num_tokens": 659636275.0, "step": 17289 }, { "epoch": 2.19946571682992, "ewc_loss": 0.032381508499383926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.238150748074986e-05, "grad_norm": 18.778043746948242, "learning_rate": 1e-06, "loss": 0.3777, "mean_token_accuracy": 0.8745004534721375, "num_tokens": 659670315.0, "step": 17290 }, { "epoch": 2.1995929271085104, "ewc_loss": 0.03239574283361435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.239574289182201e-05, "grad_norm": 18.699094772338867, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8700878620147705, "num_tokens": 659710968.0, "step": 17291 }, { "epoch": 2.199720137387101, "ewc_loss": 0.032397400587797165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2397401810158044e-05, "grad_norm": 18.8153133392334, "learning_rate": 1e-06, "loss": 0.3664, "mean_token_accuracy": 0.8836113214492798, "num_tokens": 659751334.0, "step": 17292 }, { "epoch": 2.1998473476656915, "ewc_loss": 0.03241834044456482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2418342016171664e-05, "grad_norm": 18.66704750061035, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8745241165161133, "num_tokens": 659793867.0, "step": 17293 }, { "epoch": 2.199974557944282, "ewc_loss": 0.032329853624105453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2329851819667965e-05, "grad_norm": 18.690500259399414, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8755831122398376, "num_tokens": 659838724.0, "step": 17294 }, { "epoch": 2.2001017682228725, "ewc_loss": 0.03247992321848869, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2479922083439305e-05, "grad_norm": 18.78311538696289, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8724678754806519, "num_tokens": 659876589.0, "step": 17295 }, { "epoch": 2.200228978501463, "ewc_loss": 0.0324055552482605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.240555452066474e-05, "grad_norm": 18.771995544433594, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8656805753707886, "num_tokens": 659914864.0, "step": 17296 }, { "epoch": 2.2003561887800536, "ewc_loss": 0.03239224851131439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2392246794188395e-05, "grad_norm": 18.752899169921875, "learning_rate": 1e-06, "loss": 0.3615, "mean_token_accuracy": 0.8817344903945923, "num_tokens": 659952482.0, "step": 17297 }, { "epoch": 2.200483399058644, "ewc_loss": 0.03238667547702789, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.238667704863474e-05, "grad_norm": 18.703128814697266, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8746709227561951, "num_tokens": 659985192.0, "step": 17298 }, { "epoch": 2.2006106093372346, "ewc_loss": 0.032374776899814606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.237477721995674e-05, "grad_norm": 18.75848960876465, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.8819040060043335, "num_tokens": 660019505.0, "step": 17299 }, { "epoch": 2.200737819615825, "ewc_loss": 0.03238637000322342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.238637145841494e-05, "grad_norm": 18.70307731628418, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8683372139930725, "num_tokens": 660054942.0, "step": 17300 }, { "epoch": 2.2008650298944152, "ewc_loss": 0.03238889202475548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2388892577728257e-05, "grad_norm": 18.733549118041992, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8742228746414185, "num_tokens": 660092836.0, "step": 17301 }, { "epoch": 2.200992240173006, "ewc_loss": 0.03243328258395195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.243328319513239e-05, "grad_norm": 18.722917556762695, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8691531419754028, "num_tokens": 660129419.0, "step": 17302 }, { "epoch": 2.2011194504515963, "ewc_loss": 0.03238189220428467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.238189310650341e-05, "grad_norm": 18.72922706604004, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8785433173179626, "num_tokens": 660168116.0, "step": 17303 }, { "epoch": 2.201246660730187, "ewc_loss": 0.03245604783296585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.245604966650717e-05, "grad_norm": 18.742488861083984, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8688787221908569, "num_tokens": 660208006.0, "step": 17304 }, { "epoch": 2.2013738710087773, "ewc_loss": 0.03242633864283562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242633829358965e-05, "grad_norm": 18.715757369995117, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8747812509536743, "num_tokens": 660254046.0, "step": 17305 }, { "epoch": 2.201501081287368, "ewc_loss": 0.032447148114442825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2447147532366216e-05, "grad_norm": 18.75176239013672, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8743771314620972, "num_tokens": 660289321.0, "step": 17306 }, { "epoch": 2.2016282915659584, "ewc_loss": 0.03248373046517372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.248373104725033e-05, "grad_norm": 18.78029441833496, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.871658444404602, "num_tokens": 660325052.0, "step": 17307 }, { "epoch": 2.201755501844549, "ewc_loss": 0.0324031263589859, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2403127988800406e-05, "grad_norm": 18.676647186279297, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8650630116462708, "num_tokens": 660357942.0, "step": 17308 }, { "epoch": 2.2018827121231395, "ewc_loss": 0.03242186829447746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2421867217635736e-05, "grad_norm": 18.731325149536133, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8488713502883911, "num_tokens": 660390654.0, "step": 17309 }, { "epoch": 2.20200992240173, "ewc_loss": 0.03248075023293495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.248075154260732e-05, "grad_norm": 18.663799285888672, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.875406801700592, "num_tokens": 660432020.0, "step": 17310 }, { "epoch": 2.2021371326803205, "ewc_loss": 0.03249708190560341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.249708242947236e-05, "grad_norm": 18.75687599182129, "learning_rate": 1e-06, "loss": 0.3707, "mean_token_accuracy": 0.8743663430213928, "num_tokens": 660465051.0, "step": 17311 }, { "epoch": 2.202264342958911, "ewc_loss": 0.03253420814871788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.253420800319873e-05, "grad_norm": 18.688676834106445, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8714686632156372, "num_tokens": 660504967.0, "step": 17312 }, { "epoch": 2.2023915532375016, "ewc_loss": 0.032445698976516724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2445699616800994e-05, "grad_norm": 18.643264770507812, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8644628524780273, "num_tokens": 660543826.0, "step": 17313 }, { "epoch": 2.202518763516092, "ewc_loss": 0.032465141266584396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.246514097554609e-05, "grad_norm": 18.7989501953125, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8700339794158936, "num_tokens": 660583074.0, "step": 17314 }, { "epoch": 2.2026459737946826, "ewc_loss": 0.03250061348080635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.250061490689404e-05, "grad_norm": 18.777542114257812, "learning_rate": 1e-06, "loss": 0.3386, "mean_token_accuracy": 0.890340268611908, "num_tokens": 660619974.0, "step": 17315 }, { "epoch": 2.202773184073273, "ewc_loss": 0.03244607895612717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.244607796659693e-05, "grad_norm": 18.72701644897461, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8803408145904541, "num_tokens": 660654005.0, "step": 17316 }, { "epoch": 2.2029003943518637, "ewc_loss": 0.032459061592817307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.245906191295944e-05, "grad_norm": 18.753881454467773, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.862764835357666, "num_tokens": 660689179.0, "step": 17317 }, { "epoch": 2.203027604630454, "ewc_loss": 0.03249451518058777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.249451401643455e-05, "grad_norm": 18.715957641601562, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8627173900604248, "num_tokens": 660724923.0, "step": 17318 }, { "epoch": 2.2031548149090447, "ewc_loss": 0.03242358937859535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242358798161149e-05, "grad_norm": 18.745582580566406, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.878512442111969, "num_tokens": 660760763.0, "step": 17319 }, { "epoch": 2.2032820251876353, "ewc_loss": 0.03255199268460274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25519940815866e-05, "grad_norm": 18.74130630493164, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8750159740447998, "num_tokens": 660795520.0, "step": 17320 }, { "epoch": 2.203409235466226, "ewc_loss": 0.0323907732963562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.239077341277152e-05, "grad_norm": 18.644887924194336, "learning_rate": 1e-06, "loss": 0.362, "mean_token_accuracy": 0.8841027617454529, "num_tokens": 660834858.0, "step": 17321 }, { "epoch": 2.2035364457448163, "ewc_loss": 0.03244440630078316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2444408134324476e-05, "grad_norm": 18.763702392578125, "learning_rate": 1e-06, "loss": 0.365, "mean_token_accuracy": 0.883540689945221, "num_tokens": 660870319.0, "step": 17322 }, { "epoch": 2.203663656023407, "ewc_loss": 0.032504305243492126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2504303817404434e-05, "grad_norm": 18.699386596679688, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8618717789649963, "num_tokens": 660907313.0, "step": 17323 }, { "epoch": 2.2037908663019974, "ewc_loss": 0.03246266394853592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.246266351197846e-05, "grad_norm": 18.760597229003906, "learning_rate": 1e-06, "loss": 0.3527, "mean_token_accuracy": 0.8893882036209106, "num_tokens": 660948087.0, "step": 17324 }, { "epoch": 2.203918076580588, "ewc_loss": 0.0324663445353508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.246634514653124e-05, "grad_norm": 18.67891502380371, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8804331421852112, "num_tokens": 660984984.0, "step": 17325 }, { "epoch": 2.204045286859178, "ewc_loss": 0.032472942024469376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2472940802108496e-05, "grad_norm": 18.778099060058594, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8712232708930969, "num_tokens": 661022898.0, "step": 17326 }, { "epoch": 2.2041724971377685, "ewc_loss": 0.032540325075387955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.254032344557345e-05, "grad_norm": 18.74787712097168, "learning_rate": 1e-06, "loss": 0.361, "mean_token_accuracy": 0.8835996389389038, "num_tokens": 661058645.0, "step": 17327 }, { "epoch": 2.204299707416359, "ewc_loss": 0.03239252790808678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.239252691855654e-05, "grad_norm": 18.764263153076172, "learning_rate": 1e-06, "loss": 0.3344, "mean_token_accuracy": 0.8966350555419922, "num_tokens": 661096954.0, "step": 17328 }, { "epoch": 2.2044269176949496, "ewc_loss": 0.0324709415435791, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2470939913764596e-05, "grad_norm": 18.714048385620117, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8642603158950806, "num_tokens": 661136947.0, "step": 17329 }, { "epoch": 2.20455412797354, "ewc_loss": 0.032439131289720535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.243913306505419e-05, "grad_norm": 18.786067962646484, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.8761523365974426, "num_tokens": 661175376.0, "step": 17330 }, { "epoch": 2.2046813382521306, "ewc_loss": 0.03245110064744949, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2451102015329525e-05, "grad_norm": 18.791547775268555, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8677337765693665, "num_tokens": 661213238.0, "step": 17331 }, { "epoch": 2.204808548530721, "ewc_loss": 0.032400332391262054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.240033402107656e-05, "grad_norm": 18.75925636291504, "learning_rate": 1e-06, "loss": 0.3429, "mean_token_accuracy": 0.8894072771072388, "num_tokens": 661246512.0, "step": 17332 }, { "epoch": 2.2049357588093117, "ewc_loss": 0.032468896359205246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.246889536967501e-05, "grad_norm": 18.81449317932129, "learning_rate": 1e-06, "loss": 0.3549, "mean_token_accuracy": 0.8851990699768066, "num_tokens": 661288272.0, "step": 17333 }, { "epoch": 2.2050629690879022, "ewc_loss": 0.03238098695874214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2380987249780446e-05, "grad_norm": 18.75877571105957, "learning_rate": 1e-06, "loss": 0.3718, "mean_token_accuracy": 0.8801237344741821, "num_tokens": 661328705.0, "step": 17334 }, { "epoch": 2.2051901793664928, "ewc_loss": 0.032402001321315765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.240200021537021e-05, "grad_norm": 18.789997100830078, "learning_rate": 1e-06, "loss": 0.3757, "mean_token_accuracy": 0.8787816166877747, "num_tokens": 661369877.0, "step": 17335 }, { "epoch": 2.2053173896450833, "ewc_loss": 0.03232095018029213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.232094968552701e-05, "grad_norm": 18.725919723510742, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8773765563964844, "num_tokens": 661405209.0, "step": 17336 }, { "epoch": 2.205444599923674, "ewc_loss": 0.03238009661436081, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.238009594497271e-05, "grad_norm": 18.78098487854004, "learning_rate": 1e-06, "loss": 0.3738, "mean_token_accuracy": 0.8786280751228333, "num_tokens": 661441083.0, "step": 17337 }, { "epoch": 2.2055718102022643, "ewc_loss": 0.03236614540219307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236614429624751e-05, "grad_norm": 18.741750717163086, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8776108622550964, "num_tokens": 661476938.0, "step": 17338 }, { "epoch": 2.205699020480855, "ewc_loss": 0.03238977491855621, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.238977660657838e-05, "grad_norm": 18.799684524536133, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8679273724555969, "num_tokens": 661519281.0, "step": 17339 }, { "epoch": 2.2058262307594454, "ewc_loss": 0.03234273940324783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2342737540602684e-05, "grad_norm": 18.74778175354004, "learning_rate": 1e-06, "loss": 0.3412, "mean_token_accuracy": 0.8892083764076233, "num_tokens": 661555880.0, "step": 17340 }, { "epoch": 2.205953441038036, "ewc_loss": 0.03233643248677254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2336432923329994e-05, "grad_norm": 18.712644577026367, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8660628795623779, "num_tokens": 661595937.0, "step": 17341 }, { "epoch": 2.2060806513166265, "ewc_loss": 0.03239165246486664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.239165380364284e-05, "grad_norm": 18.74518394470215, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8617582321166992, "num_tokens": 661636868.0, "step": 17342 }, { "epoch": 2.206207861595217, "ewc_loss": 0.032334014773368835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.233401366742328e-05, "grad_norm": 18.74225616455078, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8681216239929199, "num_tokens": 661681503.0, "step": 17343 }, { "epoch": 2.2063350718738075, "ewc_loss": 0.03233109042048454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2331088732462376e-05, "grad_norm": 18.82254409790039, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8789774179458618, "num_tokens": 661718438.0, "step": 17344 }, { "epoch": 2.206462282152398, "ewc_loss": 0.03242640197277069, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242640013922937e-05, "grad_norm": 18.713891983032227, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8548107743263245, "num_tokens": 661759048.0, "step": 17345 }, { "epoch": 2.2065894924309886, "ewc_loss": 0.03228039667010307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228039713576436e-05, "grad_norm": 18.75550079345703, "learning_rate": 1e-06, "loss": 0.3667, "mean_token_accuracy": 0.8853517770767212, "num_tokens": 661803131.0, "step": 17346 }, { "epoch": 2.206716702709579, "ewc_loss": 0.032405536621809006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.24055363307707e-05, "grad_norm": 18.728670120239258, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8755511045455933, "num_tokens": 661838139.0, "step": 17347 }, { "epoch": 2.2068439129881696, "ewc_loss": 0.0323481410741806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2348139939131215e-05, "grad_norm": 18.822538375854492, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8682544231414795, "num_tokens": 661878568.0, "step": 17348 }, { "epoch": 2.20697112326676, "ewc_loss": 0.032411858439445496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2411859137937427e-05, "grad_norm": 18.814794540405273, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8686907291412354, "num_tokens": 661909110.0, "step": 17349 }, { "epoch": 2.2070983335453507, "ewc_loss": 0.0323568657040596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.235686745028943e-05, "grad_norm": 18.78447723388672, "learning_rate": 1e-06, "loss": 0.3507, "mean_token_accuracy": 0.8880192637443542, "num_tokens": 661944472.0, "step": 17350 }, { "epoch": 2.2072255438239408, "ewc_loss": 0.032287415117025375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228741479688324e-05, "grad_norm": 18.600353240966797, "learning_rate": 1e-06, "loss": 0.3509, "mean_token_accuracy": 0.8867150545120239, "num_tokens": 661985865.0, "step": 17351 }, { "epoch": 2.2073527541025313, "ewc_loss": 0.03231404721736908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.231404843972996e-05, "grad_norm": 18.746469497680664, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8705830574035645, "num_tokens": 662019460.0, "step": 17352 }, { "epoch": 2.207479964381122, "ewc_loss": 0.03245916590094566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.245916741434485e-05, "grad_norm": 18.708847045898438, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.8823491334915161, "num_tokens": 662059109.0, "step": 17353 }, { "epoch": 2.2076071746597123, "ewc_loss": 0.03238910064101219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.238909994252026e-05, "grad_norm": 18.81324005126953, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8676460981369019, "num_tokens": 662096040.0, "step": 17354 }, { "epoch": 2.207734384938303, "ewc_loss": 0.03244013339281082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.244013350922614e-05, "grad_norm": 18.7182674407959, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.8795588612556458, "num_tokens": 662139351.0, "step": 17355 }, { "epoch": 2.2078615952168934, "ewc_loss": 0.03231210261583328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2312102121068165e-05, "grad_norm": 18.726423263549805, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.8763514757156372, "num_tokens": 662177353.0, "step": 17356 }, { "epoch": 2.207988805495484, "ewc_loss": 0.03242156654596329, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242156526539475e-05, "grad_norm": 18.75847816467285, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8643990755081177, "num_tokens": 662209519.0, "step": 17357 }, { "epoch": 2.2081160157740745, "ewc_loss": 0.03239642456173897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2396423193858936e-05, "grad_norm": 18.721588134765625, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8553897738456726, "num_tokens": 662251311.0, "step": 17358 }, { "epoch": 2.208243226052665, "ewc_loss": 0.032437343150377274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2437343179481104e-05, "grad_norm": 18.84391212463379, "learning_rate": 1e-06, "loss": 0.3668, "mean_token_accuracy": 0.8815488815307617, "num_tokens": 662282770.0, "step": 17359 }, { "epoch": 2.2083704363312555, "ewc_loss": 0.03244761377573013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2447613193653524e-05, "grad_norm": 18.722423553466797, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8718070387840271, "num_tokens": 662326770.0, "step": 17360 }, { "epoch": 2.208497646609846, "ewc_loss": 0.03237300366163254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.237300188629888e-05, "grad_norm": 18.807655334472656, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8745290040969849, "num_tokens": 662366811.0, "step": 17361 }, { "epoch": 2.2086248568884366, "ewc_loss": 0.03244965896010399, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.244965773774311e-05, "grad_norm": 18.735321044921875, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8710070848464966, "num_tokens": 662404517.0, "step": 17362 }, { "epoch": 2.208752067167027, "ewc_loss": 0.032351765781641006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2351767004001886e-05, "grad_norm": 18.77448272705078, "learning_rate": 1e-06, "loss": 0.3753, "mean_token_accuracy": 0.8799539804458618, "num_tokens": 662442091.0, "step": 17363 }, { "epoch": 2.2088792774456176, "ewc_loss": 0.032470542937517166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.247054337407462e-05, "grad_norm": 18.727567672729492, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8500242829322815, "num_tokens": 662483343.0, "step": 17364 }, { "epoch": 2.209006487724208, "ewc_loss": 0.032373152673244476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.237315104342997e-05, "grad_norm": 18.745773315429688, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8687593936920166, "num_tokens": 662521994.0, "step": 17365 }, { "epoch": 2.2091336980027987, "ewc_loss": 0.032369278371334076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236927659600042e-05, "grad_norm": 18.769241333007812, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8707839250564575, "num_tokens": 662562737.0, "step": 17366 }, { "epoch": 2.209260908281389, "ewc_loss": 0.0323680080473423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236800694139674e-05, "grad_norm": 18.69937515258789, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8723067045211792, "num_tokens": 662596927.0, "step": 17367 }, { "epoch": 2.2093881185599797, "ewc_loss": 0.03238096460700035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2380965421907604e-05, "grad_norm": 18.792016983032227, "learning_rate": 1e-06, "loss": 0.3637, "mean_token_accuracy": 0.8823862075805664, "num_tokens": 662632831.0, "step": 17368 }, { "epoch": 2.2095153288385703, "ewc_loss": 0.03242522105574608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2425221434095874e-05, "grad_norm": 18.683103561401367, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8720325231552124, "num_tokens": 662675554.0, "step": 17369 }, { "epoch": 2.209642539117161, "ewc_loss": 0.03237530216574669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.237530108890496e-05, "grad_norm": 18.75286865234375, "learning_rate": 1e-06, "loss": 0.3529, "mean_token_accuracy": 0.8859620094299316, "num_tokens": 662720248.0, "step": 17370 }, { "epoch": 2.2097697493957513, "ewc_loss": 0.03244457766413689, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.244457911932841e-05, "grad_norm": 18.79525375366211, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8748883008956909, "num_tokens": 662761388.0, "step": 17371 }, { "epoch": 2.209896959674342, "ewc_loss": 0.03239652141928673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.239652141928673e-05, "grad_norm": 18.717185974121094, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8742016553878784, "num_tokens": 662798007.0, "step": 17372 }, { "epoch": 2.2100241699529324, "ewc_loss": 0.03234749659895897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.234749601688236e-05, "grad_norm": 18.76384925842285, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.871324896812439, "num_tokens": 662836042.0, "step": 17373 }, { "epoch": 2.210151380231523, "ewc_loss": 0.032370179891586304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.237017881474458e-05, "grad_norm": 18.764019012451172, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8653574585914612, "num_tokens": 662869465.0, "step": 17374 }, { "epoch": 2.2102785905101134, "ewc_loss": 0.03235597163438797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2355972507502884e-05, "grad_norm": 18.823698043823242, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8667706251144409, "num_tokens": 662909440.0, "step": 17375 }, { "epoch": 2.2104058007887035, "ewc_loss": 0.03232237696647644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.232237577321939e-05, "grad_norm": 18.69110870361328, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8703000545501709, "num_tokens": 662949637.0, "step": 17376 }, { "epoch": 2.210533011067294, "ewc_loss": 0.03227382153272629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2273823308059946e-05, "grad_norm": 18.75982093811035, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8731860518455505, "num_tokens": 662987679.0, "step": 17377 }, { "epoch": 2.2106602213458846, "ewc_loss": 0.03236379846930504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236379779991694e-05, "grad_norm": 18.845558166503906, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8559747934341431, "num_tokens": 663027651.0, "step": 17378 }, { "epoch": 2.210787431624475, "ewc_loss": 0.032314468175172806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2314466807292774e-05, "grad_norm": 18.727951049804688, "learning_rate": 1e-06, "loss": 0.3531, "mean_token_accuracy": 0.8870182037353516, "num_tokens": 663063708.0, "step": 17379 }, { "epoch": 2.2109146419030656, "ewc_loss": 0.032343633472919464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.234363248338923e-05, "grad_norm": 18.812625885009766, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8667230606079102, "num_tokens": 663097255.0, "step": 17380 }, { "epoch": 2.211041852181656, "ewc_loss": 0.032376062124967575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2376061426475644e-05, "grad_norm": 18.699148178100586, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8711963891983032, "num_tokens": 663134496.0, "step": 17381 }, { "epoch": 2.2111690624602467, "ewc_loss": 0.032365817576646805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2365816878154874e-05, "grad_norm": 18.863618850708008, "learning_rate": 1e-06, "loss": 0.3645, "mean_token_accuracy": 0.8821251392364502, "num_tokens": 663170278.0, "step": 17382 }, { "epoch": 2.211296272738837, "ewc_loss": 0.0323774516582489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.237745113437995e-05, "grad_norm": 18.70562744140625, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8712630867958069, "num_tokens": 663205407.0, "step": 17383 }, { "epoch": 2.2114234830174277, "ewc_loss": 0.03228771314024925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.228771311114542e-05, "grad_norm": 18.796443939208984, "learning_rate": 1e-06, "loss": 0.3789, "mean_token_accuracy": 0.8783565163612366, "num_tokens": 663243166.0, "step": 17384 }, { "epoch": 2.2115506932960183, "ewc_loss": 0.03242288902401924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242288948968053e-05, "grad_norm": 18.80948829650879, "learning_rate": 1e-06, "loss": 0.3288, "mean_token_accuracy": 0.8958345651626587, "num_tokens": 663282267.0, "step": 17385 }, { "epoch": 2.211677903574609, "ewc_loss": 0.03232400119304657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.232400194974616e-05, "grad_norm": 18.70455551147461, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8666192293167114, "num_tokens": 663318620.0, "step": 17386 }, { "epoch": 2.2118051138531993, "ewc_loss": 0.03238830715417862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2388306863140315e-05, "grad_norm": 18.857357025146484, "learning_rate": 1e-06, "loss": 0.3361, "mean_token_accuracy": 0.8911903500556946, "num_tokens": 663353256.0, "step": 17387 }, { "epoch": 2.21193232413179, "ewc_loss": 0.03244396671652794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.244396793888882e-05, "grad_norm": 18.75514793395996, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8774425387382507, "num_tokens": 663386691.0, "step": 17388 }, { "epoch": 2.2120595344103804, "ewc_loss": 0.032330166548490524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.233016832382418e-05, "grad_norm": 18.900714874267578, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8705801963806152, "num_tokens": 663416550.0, "step": 17389 }, { "epoch": 2.212186744688971, "ewc_loss": 0.032420285046100616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242028469685465e-05, "grad_norm": 18.67728614807129, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8764731884002686, "num_tokens": 663464912.0, "step": 17390 }, { "epoch": 2.2123139549675614, "ewc_loss": 0.032362200319767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2362200727220625e-05, "grad_norm": 18.81913185119629, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8657922744750977, "num_tokens": 663503786.0, "step": 17391 }, { "epoch": 2.212441165246152, "ewc_loss": 0.032418183982372284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.241818558308296e-05, "grad_norm": 18.703718185424805, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8818780183792114, "num_tokens": 663540579.0, "step": 17392 }, { "epoch": 2.2125683755247425, "ewc_loss": 0.03242368996143341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242368984501809e-05, "grad_norm": 18.749347686767578, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8757915496826172, "num_tokens": 663577284.0, "step": 17393 }, { "epoch": 2.212695585803333, "ewc_loss": 0.03245532512664795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.245532570872456e-05, "grad_norm": 18.6939697265625, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8706552982330322, "num_tokens": 663613118.0, "step": 17394 }, { "epoch": 2.2128227960819236, "ewc_loss": 0.032467879354953766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.246788037358783e-05, "grad_norm": 18.781238555908203, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.8783746957778931, "num_tokens": 663651548.0, "step": 17395 }, { "epoch": 2.212950006360514, "ewc_loss": 0.03247775509953499, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.247775384807028e-05, "grad_norm": 18.6702938079834, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8569236993789673, "num_tokens": 663694381.0, "step": 17396 }, { "epoch": 2.2130772166391046, "ewc_loss": 0.032415613532066345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2415613532066345e-05, "grad_norm": 18.721115112304688, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8750408887863159, "num_tokens": 663734686.0, "step": 17397 }, { "epoch": 2.213204426917695, "ewc_loss": 0.03254436329007149, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.254436160204932e-05, "grad_norm": 18.816078186035156, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8656193017959595, "num_tokens": 663768764.0, "step": 17398 }, { "epoch": 2.2133316371962852, "ewc_loss": 0.03242862969636917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242863022023812e-05, "grad_norm": 18.64209747314453, "learning_rate": 1e-06, "loss": 0.3337, "mean_token_accuracy": 0.8938260078430176, "num_tokens": 663807650.0, "step": 17399 }, { "epoch": 2.213458847474876, "ewc_loss": 0.032465141266584396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.246514097554609e-05, "grad_norm": 18.80006980895996, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8714343905448914, "num_tokens": 663849308.0, "step": 17400 }, { "epoch": 2.2135860577534663, "ewc_loss": 0.032549113035202026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.254911280237138e-05, "grad_norm": 18.70067596435547, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8614763021469116, "num_tokens": 663888917.0, "step": 17401 }, { "epoch": 2.213713268032057, "ewc_loss": 0.03245336934924126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.245336847612634e-05, "grad_norm": 18.81644630432129, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8746745586395264, "num_tokens": 663921048.0, "step": 17402 }, { "epoch": 2.2138404783106473, "ewc_loss": 0.032489974051713943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.24899738188833e-05, "grad_norm": 18.638200759887695, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.8789438605308533, "num_tokens": 663963433.0, "step": 17403 }, { "epoch": 2.213967688589238, "ewc_loss": 0.032389622181653976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.238962381146848e-05, "grad_norm": 18.76883316040039, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8575656414031982, "num_tokens": 664003294.0, "step": 17404 }, { "epoch": 2.2140948988678284, "ewc_loss": 0.032542042434215546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2542044209549204e-05, "grad_norm": 18.688594818115234, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8619825839996338, "num_tokens": 664043161.0, "step": 17405 }, { "epoch": 2.214222109146419, "ewc_loss": 0.03245167061686516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.245166954002343e-05, "grad_norm": 18.73773956298828, "learning_rate": 1e-06, "loss": 0.3579, "mean_token_accuracy": 0.8852514028549194, "num_tokens": 664082457.0, "step": 17406 }, { "epoch": 2.2143493194250095, "ewc_loss": 0.03251142427325249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.251142334192991e-05, "grad_norm": 18.673095703125, "learning_rate": 1e-06, "loss": 0.3571, "mean_token_accuracy": 0.8842592239379883, "num_tokens": 664125999.0, "step": 17407 }, { "epoch": 2.2144765297036, "ewc_loss": 0.03243979811668396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.243979881517589e-05, "grad_norm": 18.654787063598633, "learning_rate": 1e-06, "loss": 0.3727, "mean_token_accuracy": 0.8796833753585815, "num_tokens": 664167256.0, "step": 17408 }, { "epoch": 2.2146037399821905, "ewc_loss": 0.03254319354891777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2543193810852244e-05, "grad_norm": 18.77305793762207, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8704180121421814, "num_tokens": 664201887.0, "step": 17409 }, { "epoch": 2.214730950260781, "ewc_loss": 0.032462988048791885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2462987292092294e-05, "grad_norm": 18.69808006286621, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8791863322257996, "num_tokens": 664238986.0, "step": 17410 }, { "epoch": 2.2148581605393716, "ewc_loss": 0.03245432302355766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.24543216265738e-05, "grad_norm": 18.792354583740234, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8708367347717285, "num_tokens": 664283436.0, "step": 17411 }, { "epoch": 2.214985370817962, "ewc_loss": 0.03246879205107689, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.246879350626841e-05, "grad_norm": 18.752782821655273, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8677141666412354, "num_tokens": 664325196.0, "step": 17412 }, { "epoch": 2.2151125810965526, "ewc_loss": 0.03247053548693657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.247053609811701e-05, "grad_norm": 18.745479583740234, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8656785488128662, "num_tokens": 664365722.0, "step": 17413 }, { "epoch": 2.215239791375143, "ewc_loss": 0.03245244547724724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.245244442950934e-05, "grad_norm": 18.76361083984375, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8657903075218201, "num_tokens": 664411570.0, "step": 17414 }, { "epoch": 2.2153670016537337, "ewc_loss": 0.03238314762711525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.238314820919186e-05, "grad_norm": 18.718040466308594, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8768128156661987, "num_tokens": 664453545.0, "step": 17415 }, { "epoch": 2.215494211932324, "ewc_loss": 0.03241254389286041, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.241254307795316e-05, "grad_norm": 18.781320571899414, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8744772672653198, "num_tokens": 664489882.0, "step": 17416 }, { "epoch": 2.2156214222109147, "ewc_loss": 0.03247130662202835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.247130734962411e-05, "grad_norm": 18.75106430053711, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8720933794975281, "num_tokens": 664528720.0, "step": 17417 }, { "epoch": 2.2157486324895053, "ewc_loss": 0.03244515508413315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.244515391997993e-05, "grad_norm": 18.810632705688477, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8646969795227051, "num_tokens": 664569645.0, "step": 17418 }, { "epoch": 2.215875842768096, "ewc_loss": 0.03242836892604828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242836828576401e-05, "grad_norm": 18.74775505065918, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8690941333770752, "num_tokens": 664612033.0, "step": 17419 }, { "epoch": 2.2160030530466863, "ewc_loss": 0.03240962699055672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.240962541894987e-05, "grad_norm": 18.765819549560547, "learning_rate": 1e-06, "loss": 0.37, "mean_token_accuracy": 0.8818404674530029, "num_tokens": 664651408.0, "step": 17420 }, { "epoch": 2.216130263325277, "ewc_loss": 0.03241178020834923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.241177910240367e-05, "grad_norm": 18.707015991210938, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8591539859771729, "num_tokens": 664695764.0, "step": 17421 }, { "epoch": 2.2162574736038674, "ewc_loss": 0.032406289130449295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.240628939238377e-05, "grad_norm": 18.817649841308594, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8773800730705261, "num_tokens": 664735528.0, "step": 17422 }, { "epoch": 2.216384683882458, "ewc_loss": 0.03246941417455673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2469415600644425e-05, "grad_norm": 18.757036209106445, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.866014838218689, "num_tokens": 664775612.0, "step": 17423 }, { "epoch": 2.216511894161048, "ewc_loss": 0.03231615945696831, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.231615846743807e-05, "grad_norm": 18.704954147338867, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8701912760734558, "num_tokens": 664812485.0, "step": 17424 }, { "epoch": 2.2166391044396385, "ewc_loss": 0.03244722634553909, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.244722756789997e-05, "grad_norm": 18.741130828857422, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8591803312301636, "num_tokens": 664853166.0, "step": 17425 }, { "epoch": 2.216766314718229, "ewc_loss": 0.03244440257549286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.244440085836686e-05, "grad_norm": 18.792203903198242, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8628802299499512, "num_tokens": 664888778.0, "step": 17426 }, { "epoch": 2.2168935249968196, "ewc_loss": 0.03238070756196976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.23807071254123e-05, "grad_norm": 18.72306251525879, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8725306987762451, "num_tokens": 664933783.0, "step": 17427 }, { "epoch": 2.21702073527541, "ewc_loss": 0.03238657861948013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2386578823206946e-05, "grad_norm": 18.826576232910156, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8578954339027405, "num_tokens": 664965228.0, "step": 17428 }, { "epoch": 2.2171479455540006, "ewc_loss": 0.032450489699840546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2450490834889933e-05, "grad_norm": 18.720897674560547, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8697203397750854, "num_tokens": 664996691.0, "step": 17429 }, { "epoch": 2.217275155832591, "ewc_loss": 0.032409943640232086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.240994192310609e-05, "grad_norm": 18.747900009155273, "learning_rate": 1e-06, "loss": 0.3614, "mean_token_accuracy": 0.8840686678886414, "num_tokens": 665032006.0, "step": 17430 }, { "epoch": 2.2174023661111817, "ewc_loss": 0.03251155838370323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2511557947145775e-05, "grad_norm": 18.85689926147461, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8636247515678406, "num_tokens": 665071819.0, "step": 17431 }, { "epoch": 2.217529576389772, "ewc_loss": 0.03244320675730705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2443207601318136e-05, "grad_norm": 18.664823532104492, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8622021675109863, "num_tokens": 665118883.0, "step": 17432 }, { "epoch": 2.2176567866683627, "ewc_loss": 0.03236594796180725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.236594784539193e-05, "grad_norm": 18.819398880004883, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8541099429130554, "num_tokens": 665154926.0, "step": 17433 }, { "epoch": 2.2177839969469533, "ewc_loss": 0.03251207619905472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.251207454013638e-05, "grad_norm": 18.755817413330078, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8820518255233765, "num_tokens": 665189910.0, "step": 17434 }, { "epoch": 2.217911207225544, "ewc_loss": 0.03240897133946419, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2408970582764596e-05, "grad_norm": 18.81307029724121, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8803869485855103, "num_tokens": 665235849.0, "step": 17435 }, { "epoch": 2.2180384175041343, "ewc_loss": 0.03247000277042389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2470001315232366e-05, "grad_norm": 18.732072830200195, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8641112446784973, "num_tokens": 665274231.0, "step": 17436 }, { "epoch": 2.218165627782725, "ewc_loss": 0.032396335154771805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2396335882367566e-05, "grad_norm": 18.770902633666992, "learning_rate": 1e-06, "loss": 0.3386, "mean_token_accuracy": 0.8910838961601257, "num_tokens": 665309057.0, "step": 17437 }, { "epoch": 2.2182928380613154, "ewc_loss": 0.03253112733364105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.253112663514912e-05, "grad_norm": 18.840848922729492, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8714439868927002, "num_tokens": 665337031.0, "step": 17438 }, { "epoch": 2.218420048339906, "ewc_loss": 0.03241788595914841, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.241788726882078e-05, "grad_norm": 18.771757125854492, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8726853728294373, "num_tokens": 665376557.0, "step": 17439 }, { "epoch": 2.2185472586184964, "ewc_loss": 0.0324808731675148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2480871595907956e-05, "grad_norm": 18.739072799682617, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8714213371276855, "num_tokens": 665419928.0, "step": 17440 }, { "epoch": 2.218674468897087, "ewc_loss": 0.0324292927980423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242929233238101e-05, "grad_norm": 18.73246192932129, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8796049952507019, "num_tokens": 665458827.0, "step": 17441 }, { "epoch": 2.2188016791756775, "ewc_loss": 0.032472774386405945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.247277345508337e-05, "grad_norm": 18.823720932006836, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8758718967437744, "num_tokens": 665498623.0, "step": 17442 }, { "epoch": 2.218928889454268, "ewc_loss": 0.03250830993056297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.250830923207104e-05, "grad_norm": 18.760528564453125, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8686351776123047, "num_tokens": 665536103.0, "step": 17443 }, { "epoch": 2.2190560997328586, "ewc_loss": 0.03245626389980316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.245626430725679e-05, "grad_norm": 18.710582733154297, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8714255094528198, "num_tokens": 665572089.0, "step": 17444 }, { "epoch": 2.219183310011449, "ewc_loss": 0.03248462453484535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2484625990036875e-05, "grad_norm": 18.77855682373047, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8656418323516846, "num_tokens": 665608088.0, "step": 17445 }, { "epoch": 2.2193105202900396, "ewc_loss": 0.032540615648031235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2540614483878016e-05, "grad_norm": 18.800552368164062, "learning_rate": 1e-06, "loss": 0.3651, "mean_token_accuracy": 0.881779134273529, "num_tokens": 665641883.0, "step": 17446 }, { "epoch": 2.21943773056863, "ewc_loss": 0.032513659447431564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2513660698896274e-05, "grad_norm": 18.750946044921875, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.8784981369972229, "num_tokens": 665682211.0, "step": 17447 }, { "epoch": 2.2195649408472207, "ewc_loss": 0.03255053982138634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255053889006376e-05, "grad_norm": 18.737558364868164, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8784635066986084, "num_tokens": 665726182.0, "step": 17448 }, { "epoch": 2.2196921511258108, "ewc_loss": 0.032504696398973465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25046967191156e-05, "grad_norm": 18.811609268188477, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8770585060119629, "num_tokens": 665760404.0, "step": 17449 }, { "epoch": 2.2198193614044013, "ewc_loss": 0.03248659521341324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.248659413657151e-05, "grad_norm": 18.693923950195312, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8733982443809509, "num_tokens": 665798235.0, "step": 17450 }, { "epoch": 2.219946571682992, "ewc_loss": 0.032530948519706726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2530948374187574e-05, "grad_norm": 18.781103134155273, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8738972544670105, "num_tokens": 665840118.0, "step": 17451 }, { "epoch": 2.2200737819615823, "ewc_loss": 0.032565075904130936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25650762533769e-05, "grad_norm": 18.78626251220703, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8728289604187012, "num_tokens": 665878498.0, "step": 17452 }, { "epoch": 2.220200992240173, "ewc_loss": 0.03255724534392357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255724368500523e-05, "grad_norm": 18.851778030395508, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8697739243507385, "num_tokens": 665924179.0, "step": 17453 }, { "epoch": 2.2203282025187634, "ewc_loss": 0.03248508647084236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2485088013345376e-05, "grad_norm": 18.725801467895508, "learning_rate": 1e-06, "loss": 0.3515, "mean_token_accuracy": 0.886865496635437, "num_tokens": 665963160.0, "step": 17454 }, { "epoch": 2.220455412797354, "ewc_loss": 0.03248382732272148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2483825634699315e-05, "grad_norm": 18.846656799316406, "learning_rate": 1e-06, "loss": 0.3616, "mean_token_accuracy": 0.8822393417358398, "num_tokens": 665998854.0, "step": 17455 }, { "epoch": 2.2205826230759445, "ewc_loss": 0.032504767179489136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2504765840712935e-05, "grad_norm": 18.740989685058594, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.8817856311798096, "num_tokens": 666036420.0, "step": 17456 }, { "epoch": 2.220709833354535, "ewc_loss": 0.03247075527906418, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2470754376845434e-05, "grad_norm": 18.805068969726562, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.854195237159729, "num_tokens": 666076692.0, "step": 17457 }, { "epoch": 2.2208370436331255, "ewc_loss": 0.032531145960092545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.253114482504316e-05, "grad_norm": 18.77495002746582, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8605093955993652, "num_tokens": 666118009.0, "step": 17458 }, { "epoch": 2.220964253911716, "ewc_loss": 0.0324382446706295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.243824539822526e-05, "grad_norm": 18.74481773376465, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8643361926078796, "num_tokens": 666158132.0, "step": 17459 }, { "epoch": 2.2210914641903066, "ewc_loss": 0.03245528042316437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.245528205297887e-05, "grad_norm": 18.79388999938965, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8683464527130127, "num_tokens": 666187195.0, "step": 17460 }, { "epoch": 2.221218674468897, "ewc_loss": 0.032451044768095016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.245104380766861e-05, "grad_norm": 18.712955474853516, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8786437511444092, "num_tokens": 666227790.0, "step": 17461 }, { "epoch": 2.2213458847474876, "ewc_loss": 0.032487984746694565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.248798384447582e-05, "grad_norm": 18.72612953186035, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.866928219795227, "num_tokens": 666268517.0, "step": 17462 }, { "epoch": 2.221473095026078, "ewc_loss": 0.032487839460372925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.248783832532354e-05, "grad_norm": 18.726673126220703, "learning_rate": 1e-06, "loss": 0.3449, "mean_token_accuracy": 0.8901315331459045, "num_tokens": 666313727.0, "step": 17463 }, { "epoch": 2.2216003053046687, "ewc_loss": 0.03252489119768143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2524891139473766e-05, "grad_norm": 18.76872444152832, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8763942122459412, "num_tokens": 666352713.0, "step": 17464 }, { "epoch": 2.221727515583259, "ewc_loss": 0.03257991746068001, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2579919206909835e-05, "grad_norm": 18.773239135742188, "learning_rate": 1e-06, "loss": 0.3741, "mean_token_accuracy": 0.8807305693626404, "num_tokens": 666395065.0, "step": 17465 }, { "epoch": 2.2218547258618497, "ewc_loss": 0.032518547028303146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25185465044342e-05, "grad_norm": 18.68959617614746, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.875385582447052, "num_tokens": 666428019.0, "step": 17466 }, { "epoch": 2.2219819361404403, "ewc_loss": 0.03255496546626091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255496631027199e-05, "grad_norm": 18.770017623901367, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.871320903301239, "num_tokens": 666466830.0, "step": 17467 }, { "epoch": 2.222109146419031, "ewc_loss": 0.032606836408376694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260683661210351e-05, "grad_norm": 18.730426788330078, "learning_rate": 1e-06, "loss": 0.3392, "mean_token_accuracy": 0.8901779651641846, "num_tokens": 666501028.0, "step": 17468 }, { "epoch": 2.2222363566976213, "ewc_loss": 0.03258436545729637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25843648170121e-05, "grad_norm": 18.747812271118164, "learning_rate": 1e-06, "loss": 0.3484, "mean_token_accuracy": 0.8859745264053345, "num_tokens": 666538554.0, "step": 17469 }, { "epoch": 2.222363566976212, "ewc_loss": 0.032574452459812164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2574451324762776e-05, "grad_norm": 18.740833282470703, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8731741905212402, "num_tokens": 666577019.0, "step": 17470 }, { "epoch": 2.2224907772548024, "ewc_loss": 0.03253358602523804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2533585908822715e-05, "grad_norm": 18.672794342041016, "learning_rate": 1e-06, "loss": 0.3528, "mean_token_accuracy": 0.8882384300231934, "num_tokens": 666616597.0, "step": 17471 }, { "epoch": 2.222617987533393, "ewc_loss": 0.0325750969350338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.257509524701163e-05, "grad_norm": 18.802165985107422, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8777837753295898, "num_tokens": 666651523.0, "step": 17472 }, { "epoch": 2.2227451978119834, "ewc_loss": 0.03267664834856987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.26766494254116e-05, "grad_norm": 18.752750396728516, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8649452924728394, "num_tokens": 666689270.0, "step": 17473 }, { "epoch": 2.2228724080905735, "ewc_loss": 0.03249901160597801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2499010558240116e-05, "grad_norm": 18.742515563964844, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8710640668869019, "num_tokens": 666726836.0, "step": 17474 }, { "epoch": 2.222999618369164, "ewc_loss": 0.032604288309812546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260429002693854e-05, "grad_norm": 18.779834747314453, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8715197443962097, "num_tokens": 666761207.0, "step": 17475 }, { "epoch": 2.2231268286477546, "ewc_loss": 0.032502345740795135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.250234658480622e-05, "grad_norm": 18.736572265625, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.882858157157898, "num_tokens": 666800293.0, "step": 17476 }, { "epoch": 2.223254038926345, "ewc_loss": 0.03255009278655052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255009141867049e-05, "grad_norm": 18.792095184326172, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8818030953407288, "num_tokens": 666843046.0, "step": 17477 }, { "epoch": 2.2233812492049356, "ewc_loss": 0.032578833401203156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.257883508922532e-05, "grad_norm": 18.752050399780273, "learning_rate": 1e-06, "loss": 0.3589, "mean_token_accuracy": 0.8872297406196594, "num_tokens": 666885840.0, "step": 17478 }, { "epoch": 2.223508459483526, "ewc_loss": 0.03253878653049469, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2538788218516856e-05, "grad_norm": 18.81774139404297, "learning_rate": 1e-06, "loss": 0.3844, "mean_token_accuracy": 0.8744600415229797, "num_tokens": 666925654.0, "step": 17479 }, { "epoch": 2.2236356697621167, "ewc_loss": 0.03250937536358833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.250937515986152e-05, "grad_norm": 18.73752784729004, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8756855130195618, "num_tokens": 666968098.0, "step": 17480 }, { "epoch": 2.223762880040707, "ewc_loss": 0.032465629279613495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.246562846470624e-05, "grad_norm": 18.831233978271484, "learning_rate": 1e-06, "loss": 0.3639, "mean_token_accuracy": 0.8838635683059692, "num_tokens": 667010349.0, "step": 17481 }, { "epoch": 2.2238900903192977, "ewc_loss": 0.03256838396191597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.256838317611255e-05, "grad_norm": 18.745641708374023, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8660352230072021, "num_tokens": 667047086.0, "step": 17482 }, { "epoch": 2.2240173005978883, "ewc_loss": 0.03247532993555069, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2475330954184756e-05, "grad_norm": 18.780065536499023, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8570915460586548, "num_tokens": 667083955.0, "step": 17483 }, { "epoch": 2.224144510876479, "ewc_loss": 0.032557692378759384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2557691156398505e-05, "grad_norm": 18.785057067871094, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8798532485961914, "num_tokens": 667119751.0, "step": 17484 }, { "epoch": 2.2242717211550693, "ewc_loss": 0.03253932669758797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2539326639380306e-05, "grad_norm": 18.7491455078125, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8698146343231201, "num_tokens": 667150980.0, "step": 17485 }, { "epoch": 2.22439893143366, "ewc_loss": 0.03254447132349014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2544470741413534e-05, "grad_norm": 18.82113265991211, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8710724115371704, "num_tokens": 667185198.0, "step": 17486 }, { "epoch": 2.2245261417122504, "ewc_loss": 0.03254963085055351, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.254962939536199e-05, "grad_norm": 18.783597946166992, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8768097162246704, "num_tokens": 667225286.0, "step": 17487 }, { "epoch": 2.224653351990841, "ewc_loss": 0.03251956030726433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.251956150052138e-05, "grad_norm": 18.81989097595215, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.868590235710144, "num_tokens": 667266294.0, "step": 17488 }, { "epoch": 2.2247805622694314, "ewc_loss": 0.03252950310707092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.252950409660116e-05, "grad_norm": 18.79292106628418, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8714693784713745, "num_tokens": 667302430.0, "step": 17489 }, { "epoch": 2.224907772548022, "ewc_loss": 0.032454513013362885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.245451443945058e-05, "grad_norm": 18.772218704223633, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.86771160364151, "num_tokens": 667342881.0, "step": 17490 }, { "epoch": 2.2250349828266125, "ewc_loss": 0.03253593668341637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2535936043132097e-05, "grad_norm": 18.824508666992188, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8754609823226929, "num_tokens": 667376122.0, "step": 17491 }, { "epoch": 2.225162193105203, "ewc_loss": 0.03251291438937187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.251291491324082e-05, "grad_norm": 18.827531814575195, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8677059412002563, "num_tokens": 667415120.0, "step": 17492 }, { "epoch": 2.2252894033837936, "ewc_loss": 0.032509129494428635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.250912777730264e-05, "grad_norm": 18.793529510498047, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8735038042068481, "num_tokens": 667451014.0, "step": 17493 }, { "epoch": 2.225416613662384, "ewc_loss": 0.03248724713921547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.248724897275679e-05, "grad_norm": 18.846206665039062, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8650831580162048, "num_tokens": 667485600.0, "step": 17494 }, { "epoch": 2.2255438239409746, "ewc_loss": 0.03255739063024521, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2557389204157516e-05, "grad_norm": 18.760652542114258, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8696147203445435, "num_tokens": 667527561.0, "step": 17495 }, { "epoch": 2.225671034219565, "ewc_loss": 0.03244777023792267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.244776962674223e-05, "grad_norm": 18.75161361694336, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8811533451080322, "num_tokens": 667569677.0, "step": 17496 }, { "epoch": 2.225798244498155, "ewc_loss": 0.03255423158407211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255423143855296e-05, "grad_norm": 18.882305145263672, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8740692734718323, "num_tokens": 667605954.0, "step": 17497 }, { "epoch": 2.225925454776746, "ewc_loss": 0.032580792903900146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.258079232182354e-05, "grad_norm": 18.706018447875977, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8771941661834717, "num_tokens": 667645082.0, "step": 17498 }, { "epoch": 2.2260526650553363, "ewc_loss": 0.03246677666902542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.246677806600928e-05, "grad_norm": 18.90951156616211, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8701316118240356, "num_tokens": 667681390.0, "step": 17499 }, { "epoch": 2.226179875333927, "ewc_loss": 0.03260881081223488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260881203459576e-05, "grad_norm": 18.799516677856445, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8638164401054382, "num_tokens": 667722772.0, "step": 17500 }, { "epoch": 2.2263070856125173, "ewc_loss": 0.032501596957445145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.250159716117196e-05, "grad_norm": 18.812715530395508, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8491470813751221, "num_tokens": 667758647.0, "step": 17501 }, { "epoch": 2.226434295891108, "ewc_loss": 0.03256190940737724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2561907573835924e-05, "grad_norm": 18.807384490966797, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8739970922470093, "num_tokens": 667790625.0, "step": 17502 }, { "epoch": 2.2265615061696984, "ewc_loss": 0.03247743844985962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.247743734391406e-05, "grad_norm": 18.710369110107422, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8636955618858337, "num_tokens": 667823724.0, "step": 17503 }, { "epoch": 2.226688716448289, "ewc_loss": 0.032546523958444595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2546522561460733e-05, "grad_norm": 18.821548461914062, "learning_rate": 1e-06, "loss": 0.3518, "mean_token_accuracy": 0.8877906799316406, "num_tokens": 667862548.0, "step": 17504 }, { "epoch": 2.2268159267268794, "ewc_loss": 0.03256158530712128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.256158379372209e-05, "grad_norm": 18.812894821166992, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8660495281219482, "num_tokens": 667905994.0, "step": 17505 }, { "epoch": 2.22694313700547, "ewc_loss": 0.032541483640670776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.254148396081291e-05, "grad_norm": 18.77579116821289, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8675073385238647, "num_tokens": 667945087.0, "step": 17506 }, { "epoch": 2.2270703472840605, "ewc_loss": 0.032507456839084625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.250745794503018e-05, "grad_norm": 18.737215042114258, "learning_rate": 1e-06, "loss": 0.3603, "mean_token_accuracy": 0.8837661743164062, "num_tokens": 667984736.0, "step": 17507 }, { "epoch": 2.227197557562651, "ewc_loss": 0.03252251073718071, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.252251190133393e-05, "grad_norm": 18.734750747680664, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.868905782699585, "num_tokens": 668024020.0, "step": 17508 }, { "epoch": 2.2273247678412416, "ewc_loss": 0.032528188079595566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.252818714827299e-05, "grad_norm": 18.83574104309082, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8690539598464966, "num_tokens": 668059567.0, "step": 17509 }, { "epoch": 2.227451978119832, "ewc_loss": 0.03261929750442505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2619296689517796e-05, "grad_norm": 18.77589225769043, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8800824284553528, "num_tokens": 668099901.0, "step": 17510 }, { "epoch": 2.2275791883984226, "ewc_loss": 0.03254419565200806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2544194255024195e-05, "grad_norm": 18.825143814086914, "learning_rate": 1e-06, "loss": 0.3475, "mean_token_accuracy": 0.8876433372497559, "num_tokens": 668135974.0, "step": 17511 }, { "epoch": 2.227706398677013, "ewc_loss": 0.032559849321842194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255984847783111e-05, "grad_norm": 18.86922836303711, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8756014704704285, "num_tokens": 668179200.0, "step": 17512 }, { "epoch": 2.2278336089556037, "ewc_loss": 0.03244778513908386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.244778417865746e-05, "grad_norm": 18.75899314880371, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8522063493728638, "num_tokens": 668219683.0, "step": 17513 }, { "epoch": 2.227960819234194, "ewc_loss": 0.03250792995095253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2507930882275105e-05, "grad_norm": 18.81584358215332, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8762966990470886, "num_tokens": 668253899.0, "step": 17514 }, { "epoch": 2.2280880295127847, "ewc_loss": 0.03252674639225006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.252674650866538e-05, "grad_norm": 18.79840660095215, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8670623302459717, "num_tokens": 668294140.0, "step": 17515 }, { "epoch": 2.2282152397913753, "ewc_loss": 0.03248130530118942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2481304515386e-05, "grad_norm": 18.68825912475586, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8799008727073669, "num_tokens": 668328442.0, "step": 17516 }, { "epoch": 2.228342450069966, "ewc_loss": 0.03252924606204033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2529245800105855e-05, "grad_norm": 18.822139739990234, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8768997192382812, "num_tokens": 668367903.0, "step": 17517 }, { "epoch": 2.2284696603485563, "ewc_loss": 0.03257116302847862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2571162591921166e-05, "grad_norm": 18.73440933227539, "learning_rate": 1e-06, "loss": 0.3648, "mean_token_accuracy": 0.8858107328414917, "num_tokens": 668409889.0, "step": 17518 }, { "epoch": 2.228596870627147, "ewc_loss": 0.03256506100296974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.256506170146167e-05, "grad_norm": 18.82370376586914, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8768911957740784, "num_tokens": 668451061.0, "step": 17519 }, { "epoch": 2.2287240809057374, "ewc_loss": 0.03260922431945801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260922312620096e-05, "grad_norm": 18.78380584716797, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8515495657920837, "num_tokens": 668493038.0, "step": 17520 }, { "epoch": 2.228851291184328, "ewc_loss": 0.03251805156469345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2518051739316434e-05, "grad_norm": 18.77450942993164, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.880605936050415, "num_tokens": 668528087.0, "step": 17521 }, { "epoch": 2.228978501462918, "ewc_loss": 0.032552171498537064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255217234254815e-05, "grad_norm": 18.813310623168945, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8755629658699036, "num_tokens": 668562325.0, "step": 17522 }, { "epoch": 2.2291057117415085, "ewc_loss": 0.032590899616479874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.259089862694964e-05, "grad_norm": 18.8090763092041, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8763933777809143, "num_tokens": 668601336.0, "step": 17523 }, { "epoch": 2.229232922020099, "ewc_loss": 0.03253140300512314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.253140312153846e-05, "grad_norm": 18.798765182495117, "learning_rate": 1e-06, "loss": 0.3296, "mean_token_accuracy": 0.8921673893928528, "num_tokens": 668637113.0, "step": 17524 }, { "epoch": 2.2293601322986896, "ewc_loss": 0.03254640847444534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2546409784117714e-05, "grad_norm": 18.820295333862305, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8703095316886902, "num_tokens": 668675645.0, "step": 17525 }, { "epoch": 2.22948734257728, "ewc_loss": 0.03254854679107666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2548545277677476e-05, "grad_norm": 18.836055755615234, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8753193616867065, "num_tokens": 668706480.0, "step": 17526 }, { "epoch": 2.2296145528558706, "ewc_loss": 0.032573238015174866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2573239877820015e-05, "grad_norm": 18.850866317749023, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8677819967269897, "num_tokens": 668743956.0, "step": 17527 }, { "epoch": 2.229741763134461, "ewc_loss": 0.03260395675897598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260395533288829e-05, "grad_norm": 18.791278839111328, "learning_rate": 1e-06, "loss": 0.3541, "mean_token_accuracy": 0.8829652070999146, "num_tokens": 668779134.0, "step": 17528 }, { "epoch": 2.2298689734130517, "ewc_loss": 0.03251231461763382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.251231464673765e-05, "grad_norm": 18.81533432006836, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8749697208404541, "num_tokens": 668814978.0, "step": 17529 }, { "epoch": 2.229996183691642, "ewc_loss": 0.032557252794504166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2557254598941654e-05, "grad_norm": 18.74077796936035, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8584255576133728, "num_tokens": 668854194.0, "step": 17530 }, { "epoch": 2.2301233939702327, "ewc_loss": 0.0325644314289093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2564432331128046e-05, "grad_norm": 18.845645904541016, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8664677143096924, "num_tokens": 668893095.0, "step": 17531 }, { "epoch": 2.2302506042488233, "ewc_loss": 0.032600171864032745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2600171834928915e-05, "grad_norm": 18.775550842285156, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8808139562606812, "num_tokens": 668930626.0, "step": 17532 }, { "epoch": 2.230377814527414, "ewc_loss": 0.03254137933254242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2541378459427506e-05, "grad_norm": 18.861785888671875, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.8816964626312256, "num_tokens": 668960941.0, "step": 17533 }, { "epoch": 2.2305050248060043, "ewc_loss": 0.03261736035346985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261736128479242e-05, "grad_norm": 18.691896438598633, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8757954835891724, "num_tokens": 668997771.0, "step": 17534 }, { "epoch": 2.230632235084595, "ewc_loss": 0.03258002921938896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.258002834627405e-05, "grad_norm": 18.790903091430664, "learning_rate": 1e-06, "loss": 0.3544, "mean_token_accuracy": 0.8869930505752563, "num_tokens": 669038714.0, "step": 17535 }, { "epoch": 2.2307594453631854, "ewc_loss": 0.032696302980184555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2696301786927506e-05, "grad_norm": 18.837562561035156, "learning_rate": 1e-06, "loss": 0.3596, "mean_token_accuracy": 0.8828574419021606, "num_tokens": 669072336.0, "step": 17536 }, { "epoch": 2.230886655641776, "ewc_loss": 0.03260333091020584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260332960053347e-05, "grad_norm": 18.720434188842773, "learning_rate": 1e-06, "loss": 0.3599, "mean_token_accuracy": 0.8834629058837891, "num_tokens": 669110303.0, "step": 17537 }, { "epoch": 2.2310138659203664, "ewc_loss": 0.032630011439323425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263001053710468e-05, "grad_norm": 18.85406494140625, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8642374277114868, "num_tokens": 669146874.0, "step": 17538 }, { "epoch": 2.231141076198957, "ewc_loss": 0.03268279507756233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2682793971616775e-05, "grad_norm": 18.75848960876465, "learning_rate": 1e-06, "loss": 0.3613, "mean_token_accuracy": 0.8854583501815796, "num_tokens": 669181682.0, "step": 17539 }, { "epoch": 2.2312682864775475, "ewc_loss": 0.032637856900691986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263785765739158e-05, "grad_norm": 18.847625732421875, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8739867210388184, "num_tokens": 669227201.0, "step": 17540 }, { "epoch": 2.231395496756138, "ewc_loss": 0.032631076872348785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263107646489516e-05, "grad_norm": 18.674076080322266, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8735734820365906, "num_tokens": 669265915.0, "step": 17541 }, { "epoch": 2.2315227070347285, "ewc_loss": 0.03262702748179436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2627027394482866e-05, "grad_norm": 18.844942092895508, "learning_rate": 1e-06, "loss": 0.3724, "mean_token_accuracy": 0.8792750835418701, "num_tokens": 669304130.0, "step": 17542 }, { "epoch": 2.231649917313319, "ewc_loss": 0.03266066312789917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2660664146533236e-05, "grad_norm": 18.75642204284668, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.87168288230896, "num_tokens": 669344225.0, "step": 17543 }, { "epoch": 2.2317771275919096, "ewc_loss": 0.03259078785777092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2590789487585425e-05, "grad_norm": 18.840442657470703, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.855897068977356, "num_tokens": 669385126.0, "step": 17544 }, { "epoch": 2.2319043378705, "ewc_loss": 0.03266901150345802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2669009669916704e-05, "grad_norm": 18.790952682495117, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8629266023635864, "num_tokens": 669425774.0, "step": 17545 }, { "epoch": 2.2320315481490907, "ewc_loss": 0.032592613250017166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.259261211496778e-05, "grad_norm": 18.811582565307617, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8647032976150513, "num_tokens": 669459789.0, "step": 17546 }, { "epoch": 2.2321587584276807, "ewc_loss": 0.03265194222331047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.265194391133264e-05, "grad_norm": 18.853225708007812, "learning_rate": 1e-06, "loss": 0.3496, "mean_token_accuracy": 0.888984203338623, "num_tokens": 669498709.0, "step": 17547 }, { "epoch": 2.2322859687062713, "ewc_loss": 0.03258030116558075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.258030119468458e-05, "grad_norm": 18.812633514404297, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8786183595657349, "num_tokens": 669536021.0, "step": 17548 }, { "epoch": 2.232413178984862, "ewc_loss": 0.03254002332687378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.254002149333246e-05, "grad_norm": 18.83579444885254, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8628495335578918, "num_tokens": 669574316.0, "step": 17549 }, { "epoch": 2.2325403892634523, "ewc_loss": 0.03257432579994202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.257432399550453e-05, "grad_norm": 18.870439529418945, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8644494414329529, "num_tokens": 669611826.0, "step": 17550 }, { "epoch": 2.232667599542043, "ewc_loss": 0.03260190039873123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2601899874862283e-05, "grad_norm": 18.76189613342285, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8707096576690674, "num_tokens": 669655277.0, "step": 17551 }, { "epoch": 2.2327948098206334, "ewc_loss": 0.03251248225569725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2512481993762776e-05, "grad_norm": 18.776845932006836, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8670510649681091, "num_tokens": 669693481.0, "step": 17552 }, { "epoch": 2.232922020099224, "ewc_loss": 0.03253857046365738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.253856993978843e-05, "grad_norm": 18.798580169677734, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8731384873390198, "num_tokens": 669724104.0, "step": 17553 }, { "epoch": 2.2330492303778144, "ewc_loss": 0.03258905187249184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2589050533715636e-05, "grad_norm": 18.859649658203125, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8567674160003662, "num_tokens": 669768725.0, "step": 17554 }, { "epoch": 2.233176440656405, "ewc_loss": 0.03256746008992195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2567459129495546e-05, "grad_norm": 18.716999053955078, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8820692896842957, "num_tokens": 669809508.0, "step": 17555 }, { "epoch": 2.2333036509349955, "ewc_loss": 0.032547395676374435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2547395676374435e-05, "grad_norm": 18.812702178955078, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8752644062042236, "num_tokens": 669848047.0, "step": 17556 }, { "epoch": 2.233430861213586, "ewc_loss": 0.03258891776204109, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.258891956647858e-05, "grad_norm": 18.782106399536133, "learning_rate": 1e-06, "loss": 0.3445, "mean_token_accuracy": 0.8894022703170776, "num_tokens": 669881274.0, "step": 17557 }, { "epoch": 2.2335580714921766, "ewc_loss": 0.03257272392511368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.257272328482941e-05, "grad_norm": 18.857101440429688, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.868242084980011, "num_tokens": 669918517.0, "step": 17558 }, { "epoch": 2.233685281770767, "ewc_loss": 0.03260665759444237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260665835114196e-05, "grad_norm": 18.733427047729492, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8659372329711914, "num_tokens": 669960990.0, "step": 17559 }, { "epoch": 2.2338124920493576, "ewc_loss": 0.03249938413500786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2499385270057246e-05, "grad_norm": 18.77566146850586, "learning_rate": 1e-06, "loss": 0.3767, "mean_token_accuracy": 0.879431962966919, "num_tokens": 669995542.0, "step": 17560 }, { "epoch": 2.233939702327948, "ewc_loss": 0.032641518861055374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.264151746407151e-05, "grad_norm": 18.80204200744629, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.8747117519378662, "num_tokens": 670036798.0, "step": 17561 }, { "epoch": 2.2340669126065387, "ewc_loss": 0.03258132189512253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.258132346672937e-05, "grad_norm": 18.840007781982422, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8732296228408813, "num_tokens": 670071763.0, "step": 17562 }, { "epoch": 2.234194122885129, "ewc_loss": 0.032616425305604935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2616426324239e-05, "grad_norm": 18.811809539794922, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8725852966308594, "num_tokens": 670100864.0, "step": 17563 }, { "epoch": 2.2343213331637197, "ewc_loss": 0.03249817341566086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2498173823114485e-05, "grad_norm": 18.73560333251953, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8624372482299805, "num_tokens": 670144114.0, "step": 17564 }, { "epoch": 2.2344485434423103, "ewc_loss": 0.03256933391094208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25693326885812e-05, "grad_norm": 18.787399291992188, "learning_rate": 1e-06, "loss": 0.3741, "mean_token_accuracy": 0.8772822618484497, "num_tokens": 670182719.0, "step": 17565 }, { "epoch": 2.234575753720901, "ewc_loss": 0.03263627365231514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263627513661049e-05, "grad_norm": 18.850473403930664, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8777464628219604, "num_tokens": 670218059.0, "step": 17566 }, { "epoch": 2.2347029639994913, "ewc_loss": 0.03266501799225807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2665018807165325e-05, "grad_norm": 18.801782608032227, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.869281530380249, "num_tokens": 670259589.0, "step": 17567 }, { "epoch": 2.234830174278082, "ewc_loss": 0.03258844465017319, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.258844299125485e-05, "grad_norm": 18.713619232177734, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.864829421043396, "num_tokens": 670301755.0, "step": 17568 }, { "epoch": 2.2349573845566724, "ewc_loss": 0.032571468502283096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.257146818214096e-05, "grad_norm": 18.767169952392578, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.8771255612373352, "num_tokens": 670339205.0, "step": 17569 }, { "epoch": 2.235084594835263, "ewc_loss": 0.0326126366853714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261263555032201e-05, "grad_norm": 18.750829696655273, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.8768378496170044, "num_tokens": 670382961.0, "step": 17570 }, { "epoch": 2.2352118051138534, "ewc_loss": 0.03256724774837494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2567248126724735e-05, "grad_norm": 18.790252685546875, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8642520904541016, "num_tokens": 670420747.0, "step": 17571 }, { "epoch": 2.2353390153924435, "ewc_loss": 0.03263862431049347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2638625270919874e-05, "grad_norm": 18.81584358215332, "learning_rate": 1e-06, "loss": 0.3473, "mean_token_accuracy": 0.8911203145980835, "num_tokens": 670457906.0, "step": 17572 }, { "epoch": 2.235466225671034, "ewc_loss": 0.032624002546072006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.262400423409417e-05, "grad_norm": 18.772991180419922, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8623424768447876, "num_tokens": 670493187.0, "step": 17573 }, { "epoch": 2.2355934359496246, "ewc_loss": 0.0326089970767498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260899757151492e-05, "grad_norm": 18.751707077026367, "learning_rate": 1e-06, "loss": 0.3511, "mean_token_accuracy": 0.8871487975120544, "num_tokens": 670529750.0, "step": 17574 }, { "epoch": 2.235720646228215, "ewc_loss": 0.0326213575899601, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.262135942350142e-05, "grad_norm": 18.802064895629883, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8714797496795654, "num_tokens": 670570147.0, "step": 17575 }, { "epoch": 2.2358478565068056, "ewc_loss": 0.03262804448604584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.262804602854885e-05, "grad_norm": 18.782344818115234, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8756259083747864, "num_tokens": 670604113.0, "step": 17576 }, { "epoch": 2.235975066785396, "ewc_loss": 0.03255574405193329, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255574483773671e-05, "grad_norm": 18.760316848754883, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8726422786712646, "num_tokens": 670645451.0, "step": 17577 }, { "epoch": 2.2361022770639867, "ewc_loss": 0.03264172375202179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2641724828863516e-05, "grad_norm": 18.841123580932617, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8785029649734497, "num_tokens": 670684055.0, "step": 17578 }, { "epoch": 2.236229487342577, "ewc_loss": 0.0326167494058609, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261675010435283e-05, "grad_norm": 18.75234603881836, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8665200471878052, "num_tokens": 670724397.0, "step": 17579 }, { "epoch": 2.2363566976211677, "ewc_loss": 0.03257764130830765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2577641832176596e-05, "grad_norm": 18.752824783325195, "learning_rate": 1e-06, "loss": 0.3553, "mean_token_accuracy": 0.8835357427597046, "num_tokens": 670761282.0, "step": 17580 }, { "epoch": 2.2364839078997583, "ewc_loss": 0.032581623643636703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.258162541897036e-05, "grad_norm": 18.69373321533203, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8784290552139282, "num_tokens": 670800007.0, "step": 17581 }, { "epoch": 2.236611118178349, "ewc_loss": 0.03252424672245979, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.252424721722491e-05, "grad_norm": 18.767772674560547, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8702783584594727, "num_tokens": 670840040.0, "step": 17582 }, { "epoch": 2.2367383284569393, "ewc_loss": 0.03261099010705948, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2610991183901206e-05, "grad_norm": 18.762577056884766, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8669073581695557, "num_tokens": 670881193.0, "step": 17583 }, { "epoch": 2.23686553873553, "ewc_loss": 0.032587312161922455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2587311579845846e-05, "grad_norm": 18.821334838867188, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8762316703796387, "num_tokens": 670927275.0, "step": 17584 }, { "epoch": 2.2369927490141204, "ewc_loss": 0.03263910859823227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263910912210122e-05, "grad_norm": 18.80776023864746, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.8776454329490662, "num_tokens": 670967327.0, "step": 17585 }, { "epoch": 2.237119959292711, "ewc_loss": 0.03262161463499069, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2621614082017913e-05, "grad_norm": 18.812353134155273, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8776043653488159, "num_tokens": 671009250.0, "step": 17586 }, { "epoch": 2.2372471695713014, "ewc_loss": 0.032556045800447464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25560467899777e-05, "grad_norm": 18.803129196166992, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8781296014785767, "num_tokens": 671045068.0, "step": 17587 }, { "epoch": 2.237374379849892, "ewc_loss": 0.03255876153707504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255876072216779e-05, "grad_norm": 18.802518844604492, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8616698980331421, "num_tokens": 671084786.0, "step": 17588 }, { "epoch": 2.2375015901284825, "ewc_loss": 0.032603565603494644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260356606915593e-05, "grad_norm": 18.81184959411621, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8670043349266052, "num_tokens": 671123235.0, "step": 17589 }, { "epoch": 2.237628800407073, "ewc_loss": 0.03254442289471626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.254442344768904e-05, "grad_norm": 18.75829315185547, "learning_rate": 1e-06, "loss": 0.3487, "mean_token_accuracy": 0.8901932835578918, "num_tokens": 671161702.0, "step": 17590 }, { "epoch": 2.2377560106856635, "ewc_loss": 0.03261487931013107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261488018324599e-05, "grad_norm": 18.849136352539062, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.8850620985031128, "num_tokens": 671195487.0, "step": 17591 }, { "epoch": 2.237883220964254, "ewc_loss": 0.03261640667915344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2616408134344965e-05, "grad_norm": 18.84447479248047, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8704535961151123, "num_tokens": 671232288.0, "step": 17592 }, { "epoch": 2.2380104312428446, "ewc_loss": 0.032512880861759186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2512882171431556e-05, "grad_norm": 18.742870330810547, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8572667241096497, "num_tokens": 671275154.0, "step": 17593 }, { "epoch": 2.238137641521435, "ewc_loss": 0.03255008906126022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255008778069168e-05, "grad_norm": 18.856430053710938, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8640897274017334, "num_tokens": 671312266.0, "step": 17594 }, { "epoch": 2.238264851800025, "ewc_loss": 0.03259079158306122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.259079312556423e-05, "grad_norm": 18.821590423583984, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8740307688713074, "num_tokens": 671349690.0, "step": 17595 }, { "epoch": 2.238392062078616, "ewc_loss": 0.03252646327018738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.252646274631843e-05, "grad_norm": 18.863990783691406, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8776156306266785, "num_tokens": 671387898.0, "step": 17596 }, { "epoch": 2.2385192723572063, "ewc_loss": 0.0325787179172039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2578718673903495e-05, "grad_norm": 18.82034683227539, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8534780144691467, "num_tokens": 671428054.0, "step": 17597 }, { "epoch": 2.238646482635797, "ewc_loss": 0.032493576407432556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.249357541790232e-05, "grad_norm": 18.793323516845703, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.8791705369949341, "num_tokens": 671467096.0, "step": 17598 }, { "epoch": 2.2387736929143873, "ewc_loss": 0.032554253935813904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25542532664258e-05, "grad_norm": 18.770355224609375, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8727313876152039, "num_tokens": 671506752.0, "step": 17599 }, { "epoch": 2.238900903192978, "ewc_loss": 0.032508328557014465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.250832742196508e-05, "grad_norm": 18.750591278076172, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8669097423553467, "num_tokens": 671542403.0, "step": 17600 }, { "epoch": 2.2390281134715684, "ewc_loss": 0.03251100704073906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2511008612345904e-05, "grad_norm": 18.80752182006836, "learning_rate": 1e-06, "loss": 0.3474, "mean_token_accuracy": 0.8844557404518127, "num_tokens": 671578115.0, "step": 17601 }, { "epoch": 2.239155323750159, "ewc_loss": 0.03259431570768356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25943146890495e-05, "grad_norm": 18.757123947143555, "learning_rate": 1e-06, "loss": 0.3641, "mean_token_accuracy": 0.8843542337417603, "num_tokens": 671619854.0, "step": 17602 }, { "epoch": 2.2392825340287494, "ewc_loss": 0.03248834237456322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.248834400437772e-05, "grad_norm": 18.75140380859375, "learning_rate": 1e-06, "loss": 0.3571, "mean_token_accuracy": 0.8873203992843628, "num_tokens": 671661617.0, "step": 17603 }, { "epoch": 2.23940974430734, "ewc_loss": 0.032612185925245285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261218444094993e-05, "grad_norm": 18.789216995239258, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8638810515403748, "num_tokens": 671702231.0, "step": 17604 }, { "epoch": 2.2395369545859305, "ewc_loss": 0.03255042806267738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255042975069955e-05, "grad_norm": 18.78275489807129, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8731168508529663, "num_tokens": 671741483.0, "step": 17605 }, { "epoch": 2.239664164864521, "ewc_loss": 0.03257174417376518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25717446685303e-05, "grad_norm": 18.754793167114258, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8697012662887573, "num_tokens": 671780168.0, "step": 17606 }, { "epoch": 2.2397913751431116, "ewc_loss": 0.032572824507951736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2572825148236006e-05, "grad_norm": 18.790088653564453, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8515658974647522, "num_tokens": 671818980.0, "step": 17607 }, { "epoch": 2.239918585421702, "ewc_loss": 0.03255736455321312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255736373830587e-05, "grad_norm": 18.788557052612305, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8742142915725708, "num_tokens": 671850010.0, "step": 17608 }, { "epoch": 2.2400457957002926, "ewc_loss": 0.03257613256573677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.257613207097165e-05, "grad_norm": 18.742969512939453, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8666536808013916, "num_tokens": 671888506.0, "step": 17609 }, { "epoch": 2.240173005978883, "ewc_loss": 0.0326416902244091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2641688449075446e-05, "grad_norm": 18.86030387878418, "learning_rate": 1e-06, "loss": 0.3694, "mean_token_accuracy": 0.8821638822555542, "num_tokens": 671930965.0, "step": 17610 }, { "epoch": 2.2403002162574737, "ewc_loss": 0.032580915838479996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.258091601310298e-05, "grad_norm": 18.749004364013672, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.882429838180542, "num_tokens": 671967472.0, "step": 17611 }, { "epoch": 2.240427426536064, "ewc_loss": 0.03260919824242592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260919766034931e-05, "grad_norm": 18.824892044067383, "learning_rate": 1e-06, "loss": 0.3223, "mean_token_accuracy": 0.8947473764419556, "num_tokens": 672005986.0, "step": 17612 }, { "epoch": 2.2405546368146547, "ewc_loss": 0.032644543796777725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.264454426243901e-05, "grad_norm": 18.765535354614258, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8738376498222351, "num_tokens": 672044729.0, "step": 17613 }, { "epoch": 2.2406818470932452, "ewc_loss": 0.032565075904130936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25650762533769e-05, "grad_norm": 18.750648498535156, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8741333484649658, "num_tokens": 672080141.0, "step": 17614 }, { "epoch": 2.2408090573718358, "ewc_loss": 0.032660309225320816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.266030762461014e-05, "grad_norm": 18.82579231262207, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8814864754676819, "num_tokens": 672119277.0, "step": 17615 }, { "epoch": 2.2409362676504263, "ewc_loss": 0.032608721405267715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260872108512558e-05, "grad_norm": 18.745975494384766, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8715025782585144, "num_tokens": 672154205.0, "step": 17616 }, { "epoch": 2.241063477929017, "ewc_loss": 0.032588738948106766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2588737667538226e-05, "grad_norm": 18.806137084960938, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.872795581817627, "num_tokens": 672193667.0, "step": 17617 }, { "epoch": 2.2411906882076074, "ewc_loss": 0.03266267478466034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.266267594881356e-05, "grad_norm": 18.711994171142578, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.84434974193573, "num_tokens": 672232360.0, "step": 17618 }, { "epoch": 2.241317898486198, "ewc_loss": 0.03261113539338112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261113670305349e-05, "grad_norm": 18.85525894165039, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8612062931060791, "num_tokens": 672264127.0, "step": 17619 }, { "epoch": 2.241445108764788, "ewc_loss": 0.03269416466355324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2694166293367743e-05, "grad_norm": 18.751413345336914, "learning_rate": 1e-06, "loss": 0.3655, "mean_token_accuracy": 0.8815941214561462, "num_tokens": 672300719.0, "step": 17620 }, { "epoch": 2.2415723190433785, "ewc_loss": 0.03260108083486557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260108132963069e-05, "grad_norm": 18.812183380126953, "learning_rate": 1e-06, "loss": 0.3668, "mean_token_accuracy": 0.8835548162460327, "num_tokens": 672342796.0, "step": 17621 }, { "epoch": 2.241699529321969, "ewc_loss": 0.032668210566043854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2668209314579144e-05, "grad_norm": 18.82158851623535, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8759257197380066, "num_tokens": 672382597.0, "step": 17622 }, { "epoch": 2.2418267396005596, "ewc_loss": 0.032563190907239914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.256319178035483e-05, "grad_norm": 18.760311126708984, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8619472980499268, "num_tokens": 672427350.0, "step": 17623 }, { "epoch": 2.24195394987915, "ewc_loss": 0.03261213004589081, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2612129871267825e-05, "grad_norm": 18.87918472290039, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8679443001747131, "num_tokens": 672468997.0, "step": 17624 }, { "epoch": 2.2420811601577406, "ewc_loss": 0.0326324999332428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263249891460873e-05, "grad_norm": 18.77400779724121, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8800667524337769, "num_tokens": 672505061.0, "step": 17625 }, { "epoch": 2.242208370436331, "ewc_loss": 0.03257909417152405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2579093385720626e-05, "grad_norm": 18.792430877685547, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.871843159198761, "num_tokens": 672544241.0, "step": 17626 }, { "epoch": 2.2423355807149217, "ewc_loss": 0.03263159468770027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2631593057885766e-05, "grad_norm": 18.805959701538086, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.873251736164093, "num_tokens": 672586334.0, "step": 17627 }, { "epoch": 2.242462790993512, "ewc_loss": 0.032630328088998795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.26303270412609e-05, "grad_norm": 18.864503860473633, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8702889084815979, "num_tokens": 672628627.0, "step": 17628 }, { "epoch": 2.2425900012721027, "ewc_loss": 0.03254377469420433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.254377588746138e-05, "grad_norm": 18.778926849365234, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8647802472114563, "num_tokens": 672663046.0, "step": 17629 }, { "epoch": 2.2427172115506933, "ewc_loss": 0.03261231631040573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261231540818699e-05, "grad_norm": 18.760210037231445, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8803114295005798, "num_tokens": 672703394.0, "step": 17630 }, { "epoch": 2.242844421829284, "ewc_loss": 0.032515961676836014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2515959901502356e-05, "grad_norm": 18.757408142089844, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8647061586380005, "num_tokens": 672743262.0, "step": 17631 }, { "epoch": 2.2429716321078743, "ewc_loss": 0.03257162123918533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.257162097725086e-05, "grad_norm": 18.829313278198242, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.840301513671875, "num_tokens": 672782662.0, "step": 17632 }, { "epoch": 2.243098842386465, "ewc_loss": 0.03261953219771385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261953315814026e-05, "grad_norm": 18.78766632080078, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8707553148269653, "num_tokens": 672818400.0, "step": 17633 }, { "epoch": 2.2432260526650554, "ewc_loss": 0.032534100115299225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2534098863834515e-05, "grad_norm": 18.830596923828125, "learning_rate": 1e-06, "loss": 0.3523, "mean_token_accuracy": 0.8857062458992004, "num_tokens": 672858843.0, "step": 17634 }, { "epoch": 2.243353262943646, "ewc_loss": 0.03262105584144592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.262105747126043e-05, "grad_norm": 18.844554901123047, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8749988675117493, "num_tokens": 672900397.0, "step": 17635 }, { "epoch": 2.2434804732222364, "ewc_loss": 0.03248230367898941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.248230495955795e-05, "grad_norm": 18.72090721130371, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8722411394119263, "num_tokens": 672936025.0, "step": 17636 }, { "epoch": 2.243607683500827, "ewc_loss": 0.03258933499455452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.258933429606259e-05, "grad_norm": 18.894874572753906, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8552263975143433, "num_tokens": 672979180.0, "step": 17637 }, { "epoch": 2.2437348937794175, "ewc_loss": 0.032561931759119034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2561933039687574e-05, "grad_norm": 18.7760009765625, "learning_rate": 1e-06, "loss": 0.3635, "mean_token_accuracy": 0.8831419944763184, "num_tokens": 673018053.0, "step": 17638 }, { "epoch": 2.243862104058008, "ewc_loss": 0.03250892832875252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.250892768846825e-05, "grad_norm": 18.873035430908203, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.855096697807312, "num_tokens": 673057596.0, "step": 17639 }, { "epoch": 2.2439893143365985, "ewc_loss": 0.03257701173424721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.257701246184297e-05, "grad_norm": 18.820850372314453, "learning_rate": 1e-06, "loss": 0.3555, "mean_token_accuracy": 0.8852261304855347, "num_tokens": 673097740.0, "step": 17640 }, { "epoch": 2.244116524615189, "ewc_loss": 0.032440267503261566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2440268114442006e-05, "grad_norm": 18.811176300048828, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.8787100315093994, "num_tokens": 673141821.0, "step": 17641 }, { "epoch": 2.2442437348937796, "ewc_loss": 0.03255802392959595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255802221246995e-05, "grad_norm": 18.803279876708984, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8810749053955078, "num_tokens": 673184279.0, "step": 17642 }, { "epoch": 2.24437094517237, "ewc_loss": 0.03251389414072037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.251389352953993e-05, "grad_norm": 18.810009002685547, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8830374479293823, "num_tokens": 673224328.0, "step": 17643 }, { "epoch": 2.2444981554509607, "ewc_loss": 0.032464999705553055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2464999094372615e-05, "grad_norm": 18.84475326538086, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8607825636863708, "num_tokens": 673263328.0, "step": 17644 }, { "epoch": 2.2446253657295507, "ewc_loss": 0.03257940337061882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.257940261391923e-05, "grad_norm": 18.971786499023438, "learning_rate": 1e-06, "loss": 0.384, "mean_token_accuracy": 0.8779251575469971, "num_tokens": 673301663.0, "step": 17645 }, { "epoch": 2.2447525760081413, "ewc_loss": 0.03248181939125061, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.248182110837661e-05, "grad_norm": 18.74055290222168, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8720757961273193, "num_tokens": 673340606.0, "step": 17646 }, { "epoch": 2.244879786286732, "ewc_loss": 0.03239363431930542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.23936328641139e-05, "grad_norm": 18.87419891357422, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8711358904838562, "num_tokens": 673382201.0, "step": 17647 }, { "epoch": 2.2450069965653223, "ewc_loss": 0.03247526288032532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.247526183258742e-05, "grad_norm": 18.719152450561523, "learning_rate": 1e-06, "loss": 0.3692, "mean_token_accuracy": 0.8824492692947388, "num_tokens": 673423330.0, "step": 17648 }, { "epoch": 2.245134206843913, "ewc_loss": 0.03242368996143341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.242368984501809e-05, "grad_norm": 18.832475662231445, "learning_rate": 1e-06, "loss": 0.3349, "mean_token_accuracy": 0.8904696702957153, "num_tokens": 673459690.0, "step": 17649 }, { "epoch": 2.2452614171225034, "ewc_loss": 0.03250432759523392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2504329283256084e-05, "grad_norm": 18.785667419433594, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8697675466537476, "num_tokens": 673494827.0, "step": 17650 }, { "epoch": 2.245388627401094, "ewc_loss": 0.0324515625834465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2451564038638026e-05, "grad_norm": 18.76445960998535, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8624517917633057, "num_tokens": 673540568.0, "step": 17651 }, { "epoch": 2.2455158376796844, "ewc_loss": 0.03249125927686691, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.24912580254022e-05, "grad_norm": 18.80388641357422, "learning_rate": 1e-06, "loss": 0.348, "mean_token_accuracy": 0.8893714547157288, "num_tokens": 673574022.0, "step": 17652 }, { "epoch": 2.245643047958275, "ewc_loss": 0.03248211368918419, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2482112146681175e-05, "grad_norm": 18.7894344329834, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8705251216888428, "num_tokens": 673612493.0, "step": 17653 }, { "epoch": 2.2457702582368655, "ewc_loss": 0.03254915028810501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2549149182159454e-05, "grad_norm": 18.84563446044922, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8702213168144226, "num_tokens": 673653729.0, "step": 17654 }, { "epoch": 2.245897468515456, "ewc_loss": 0.03251360356807709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.251360249123536e-05, "grad_norm": 18.824934005737305, "learning_rate": 1e-06, "loss": 0.3573, "mean_token_accuracy": 0.8861030340194702, "num_tokens": 673695626.0, "step": 17655 }, { "epoch": 2.2460246787940465, "ewc_loss": 0.03249385207891464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.249385190429166e-05, "grad_norm": 18.848562240600586, "learning_rate": 1e-06, "loss": 0.3637, "mean_token_accuracy": 0.8803597092628479, "num_tokens": 673732188.0, "step": 17656 }, { "epoch": 2.246151889072637, "ewc_loss": 0.03253660351037979, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2536601793253794e-05, "grad_norm": 18.858360290527344, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8651263117790222, "num_tokens": 673770259.0, "step": 17657 }, { "epoch": 2.2462790993512276, "ewc_loss": 0.032419055700302124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2419055060017854e-05, "grad_norm": 18.903507232666016, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.870205283164978, "num_tokens": 673805018.0, "step": 17658 }, { "epoch": 2.246406309629818, "ewc_loss": 0.03250262513756752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2502626709174365e-05, "grad_norm": 18.802350997924805, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8671404719352722, "num_tokens": 673846058.0, "step": 17659 }, { "epoch": 2.2465335199084087, "ewc_loss": 0.032469410449266434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.246941196266562e-05, "grad_norm": 18.927101135253906, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8606705665588379, "num_tokens": 673886437.0, "step": 17660 }, { "epoch": 2.246660730186999, "ewc_loss": 0.0325901061296463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.259010554756969e-05, "grad_norm": 18.83160400390625, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8769777417182922, "num_tokens": 673925701.0, "step": 17661 }, { "epoch": 2.2467879404655897, "ewc_loss": 0.03240206465125084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2402065698988736e-05, "grad_norm": 18.78949546813965, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8763251304626465, "num_tokens": 673969854.0, "step": 17662 }, { "epoch": 2.2469151507441802, "ewc_loss": 0.03252657875418663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.252657916164026e-05, "grad_norm": 18.84279441833496, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8751354217529297, "num_tokens": 674005048.0, "step": 17663 }, { "epoch": 2.2470423610227708, "ewc_loss": 0.032506316900253296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2506315619684756e-05, "grad_norm": 18.807415008544922, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8683571219444275, "num_tokens": 674046548.0, "step": 17664 }, { "epoch": 2.2471695713013613, "ewc_loss": 0.03249590843915939, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2495907362317666e-05, "grad_norm": 18.82181167602539, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8679198026657104, "num_tokens": 674086905.0, "step": 17665 }, { "epoch": 2.247296781579952, "ewc_loss": 0.03254595771431923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2545958674745634e-05, "grad_norm": 18.86595344543457, "learning_rate": 1e-06, "loss": 0.3562, "mean_token_accuracy": 0.8831924200057983, "num_tokens": 674124589.0, "step": 17666 }, { "epoch": 2.2474239918585424, "ewc_loss": 0.03253637254238129, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.253637260058895e-05, "grad_norm": 18.754383087158203, "learning_rate": 1e-06, "loss": 0.3601, "mean_token_accuracy": 0.8863337635993958, "num_tokens": 674165135.0, "step": 17667 }, { "epoch": 2.247551202137133, "ewc_loss": 0.032470859587192535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.247085987823084e-05, "grad_norm": 18.8221435546875, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8686047792434692, "num_tokens": 674207269.0, "step": 17668 }, { "epoch": 2.2476784124157234, "ewc_loss": 0.03262651339173317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2626514439471066e-05, "grad_norm": 18.77246856689453, "learning_rate": 1e-06, "loss": 0.3382, "mean_token_accuracy": 0.8909574747085571, "num_tokens": 674240806.0, "step": 17669 }, { "epoch": 2.2478056226943135, "ewc_loss": 0.03253183141350746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25318324030377e-05, "grad_norm": 18.881166458129883, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8654614686965942, "num_tokens": 674274697.0, "step": 17670 }, { "epoch": 2.247932832972904, "ewc_loss": 0.032612405717372894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2612406357657164e-05, "grad_norm": 18.85809326171875, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8649328351020813, "num_tokens": 674311362.0, "step": 17671 }, { "epoch": 2.2480600432514946, "ewc_loss": 0.0325334407389164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.253344038967043e-05, "grad_norm": 18.886127471923828, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8802828788757324, "num_tokens": 674352505.0, "step": 17672 }, { "epoch": 2.248187253530085, "ewc_loss": 0.032530851662158966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.253085014875978e-05, "grad_norm": 18.943376541137695, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8787383437156677, "num_tokens": 674383938.0, "step": 17673 }, { "epoch": 2.2483144638086756, "ewc_loss": 0.032545872032642365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2545871363254264e-05, "grad_norm": 18.76443862915039, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8703392148017883, "num_tokens": 674421665.0, "step": 17674 }, { "epoch": 2.248441674087266, "ewc_loss": 0.032520465552806854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.252046735724434e-05, "grad_norm": 18.902738571166992, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8756140470504761, "num_tokens": 674466065.0, "step": 17675 }, { "epoch": 2.2485688843658567, "ewc_loss": 0.03261110186576843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261110032326542e-05, "grad_norm": 18.86555290222168, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8758738040924072, "num_tokens": 674506868.0, "step": 17676 }, { "epoch": 2.248696094644447, "ewc_loss": 0.03245408460497856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.245408515795134e-05, "grad_norm": 18.829376220703125, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8696796894073486, "num_tokens": 674545756.0, "step": 17677 }, { "epoch": 2.2488233049230377, "ewc_loss": 0.03255937993526459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2559379178564996e-05, "grad_norm": 18.799745559692383, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8693200349807739, "num_tokens": 674590680.0, "step": 17678 }, { "epoch": 2.2489505152016283, "ewc_loss": 0.03249594569206238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2495947380084544e-05, "grad_norm": 18.7734375, "learning_rate": 1e-06, "loss": 0.3675, "mean_token_accuracy": 0.8783175349235535, "num_tokens": 674631624.0, "step": 17679 }, { "epoch": 2.249077725480219, "ewc_loss": 0.03254063427448273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.254063267377205e-05, "grad_norm": 18.828195571899414, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8691716194152832, "num_tokens": 674667365.0, "step": 17680 }, { "epoch": 2.2492049357588093, "ewc_loss": 0.03251180425286293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.251180532970466e-05, "grad_norm": 18.740028381347656, "learning_rate": 1e-06, "loss": 0.3914, "mean_token_accuracy": 0.8725566864013672, "num_tokens": 674706587.0, "step": 17681 }, { "epoch": 2.2493321460374, "ewc_loss": 0.03262082859873772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.262082827859558e-05, "grad_norm": 18.861459732055664, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8720183372497559, "num_tokens": 674744480.0, "step": 17682 }, { "epoch": 2.2494593563159904, "ewc_loss": 0.032606128603219986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2606127206236124e-05, "grad_norm": 18.8759708404541, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.873115062713623, "num_tokens": 674783104.0, "step": 17683 }, { "epoch": 2.249586566594581, "ewc_loss": 0.032627642154693604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2627642212901264e-05, "grad_norm": 18.893054962158203, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8653453588485718, "num_tokens": 674823488.0, "step": 17684 }, { "epoch": 2.2497137768731714, "ewc_loss": 0.032549433410167694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.254943294450641e-05, "grad_norm": 18.791357040405273, "learning_rate": 1e-06, "loss": 0.3692, "mean_token_accuracy": 0.8794129490852356, "num_tokens": 674857322.0, "step": 17685 }, { "epoch": 2.249840987151762, "ewc_loss": 0.03254149854183197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.254149851272814e-05, "grad_norm": 18.80602264404297, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8653169870376587, "num_tokens": 674901566.0, "step": 17686 }, { "epoch": 2.2499681974303525, "ewc_loss": 0.03254016116261482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.254015973652713e-05, "grad_norm": 18.772111892700195, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.8795973062515259, "num_tokens": 674942803.0, "step": 17687 }, { "epoch": 2.250095407708943, "ewc_loss": 0.032535817474126816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.253581598983146e-05, "grad_norm": 18.831777572631836, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8655392527580261, "num_tokens": 674983160.0, "step": 17688 }, { "epoch": 2.2502226179875335, "ewc_loss": 0.032633695751428604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2633695809636265e-05, "grad_norm": 18.799304962158203, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8746911287307739, "num_tokens": 675021243.0, "step": 17689 }, { "epoch": 2.250349828266124, "ewc_loss": 0.03256966546177864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2569663744652644e-05, "grad_norm": 18.8140926361084, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.869631826877594, "num_tokens": 675059435.0, "step": 17690 }, { "epoch": 2.2504770385447146, "ewc_loss": 0.032643742859363556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.264374390710145e-05, "grad_norm": 18.832218170166016, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8765947818756104, "num_tokens": 675091689.0, "step": 17691 }, { "epoch": 2.250604248823305, "ewc_loss": 0.03253374621272087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.253374597989023e-05, "grad_norm": 18.71338653564453, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8681591749191284, "num_tokens": 675130368.0, "step": 17692 }, { "epoch": 2.250731459101895, "ewc_loss": 0.03258345276117325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.258345168433152e-05, "grad_norm": 18.835691452026367, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8768837451934814, "num_tokens": 675171745.0, "step": 17693 }, { "epoch": 2.250858669380486, "ewc_loss": 0.032627686858177185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.262768586864695e-05, "grad_norm": 18.75034523010254, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8720061779022217, "num_tokens": 675206052.0, "step": 17694 }, { "epoch": 2.2509858796590763, "ewc_loss": 0.0325736440718174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25736436934676e-05, "grad_norm": 18.85173988342285, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8684945106506348, "num_tokens": 675243231.0, "step": 17695 }, { "epoch": 2.251113089937667, "ewc_loss": 0.032682154327631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.268215368734673e-05, "grad_norm": 18.848676681518555, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8670745491981506, "num_tokens": 675273582.0, "step": 17696 }, { "epoch": 2.2512403002162573, "ewc_loss": 0.032564036548137665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.256403579143807e-05, "grad_norm": 18.7738037109375, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8693005442619324, "num_tokens": 675314247.0, "step": 17697 }, { "epoch": 2.251367510494848, "ewc_loss": 0.03264092281460762, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2640924473525956e-05, "grad_norm": 18.79633331298828, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.875352144241333, "num_tokens": 675355675.0, "step": 17698 }, { "epoch": 2.2514947207734384, "ewc_loss": 0.032664887607097626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.266488783992827e-05, "grad_norm": 18.824325561523438, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8714447021484375, "num_tokens": 675394290.0, "step": 17699 }, { "epoch": 2.251621931052029, "ewc_loss": 0.03266981616616249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.266981730121188e-05, "grad_norm": 18.87875747680664, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8763841390609741, "num_tokens": 675434251.0, "step": 17700 }, { "epoch": 2.2517491413306194, "ewc_loss": 0.03267274424433708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.267274587415159e-05, "grad_norm": 18.85141372680664, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8798354864120483, "num_tokens": 675474731.0, "step": 17701 }, { "epoch": 2.25187635160921, "ewc_loss": 0.03261201083660126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261200981796719e-05, "grad_norm": 18.805950164794922, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8680424690246582, "num_tokens": 675515180.0, "step": 17702 }, { "epoch": 2.2520035618878005, "ewc_loss": 0.03264849632978439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2648495107423514e-05, "grad_norm": 18.82317352294922, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8637151718139648, "num_tokens": 675553429.0, "step": 17703 }, { "epoch": 2.252130772166391, "ewc_loss": 0.032616499811410904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261649908381514e-05, "grad_norm": 18.922449111938477, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8777542114257812, "num_tokens": 675592919.0, "step": 17704 }, { "epoch": 2.2522579824449815, "ewc_loss": 0.0325518436729908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255184492445551e-05, "grad_norm": 18.751985549926758, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8812835812568665, "num_tokens": 675627528.0, "step": 17705 }, { "epoch": 2.252385192723572, "ewc_loss": 0.03256108611822128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.256108539062552e-05, "grad_norm": 18.845251083374023, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8810075521469116, "num_tokens": 675663721.0, "step": 17706 }, { "epoch": 2.2525124030021626, "ewc_loss": 0.0326630175113678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2663017918821424e-05, "grad_norm": 18.82905387878418, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8680373430252075, "num_tokens": 675705187.0, "step": 17707 }, { "epoch": 2.252639613280753, "ewc_loss": 0.03256945312023163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.256945274188183e-05, "grad_norm": 18.771032333374023, "learning_rate": 1e-06, "loss": 0.3686, "mean_token_accuracy": 0.8833017945289612, "num_tokens": 675752713.0, "step": 17708 }, { "epoch": 2.2527668235593437, "ewc_loss": 0.03266853466629982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2668533094692975e-05, "grad_norm": 18.992305755615234, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8744673728942871, "num_tokens": 675791009.0, "step": 17709 }, { "epoch": 2.252894033837934, "ewc_loss": 0.03255708888173103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255708725191653e-05, "grad_norm": 18.778879165649414, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8765491247177124, "num_tokens": 675828890.0, "step": 17710 }, { "epoch": 2.2530212441165247, "ewc_loss": 0.03254246711730957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2542466215090826e-05, "grad_norm": 18.869348526000977, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8746874332427979, "num_tokens": 675862855.0, "step": 17711 }, { "epoch": 2.2531484543951152, "ewc_loss": 0.03266550600528717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2665506296325475e-05, "grad_norm": 18.87350845336914, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8718414902687073, "num_tokens": 675903032.0, "step": 17712 }, { "epoch": 2.2532756646737058, "ewc_loss": 0.03257138282060623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25713845086284e-05, "grad_norm": 18.867740631103516, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8698505163192749, "num_tokens": 675943035.0, "step": 17713 }, { "epoch": 2.2534028749522963, "ewc_loss": 0.03256642445921898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.256642594351433e-05, "grad_norm": 18.758281707763672, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8759218454360962, "num_tokens": 675983073.0, "step": 17714 }, { "epoch": 2.253530085230887, "ewc_loss": 0.03260738402605057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260738230892457e-05, "grad_norm": 18.858448028564453, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8787738680839539, "num_tokens": 676022341.0, "step": 17715 }, { "epoch": 2.2536572955094774, "ewc_loss": 0.03256012126803398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.256012132624164e-05, "grad_norm": 18.782978057861328, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.870637059211731, "num_tokens": 676060907.0, "step": 17716 }, { "epoch": 2.253784505788068, "ewc_loss": 0.032585062086582184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.258506330894306e-05, "grad_norm": 18.832290649414062, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8722118735313416, "num_tokens": 676100693.0, "step": 17717 }, { "epoch": 2.253911716066658, "ewc_loss": 0.03255511075258255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255511182942428e-05, "grad_norm": 18.85188865661621, "learning_rate": 1e-06, "loss": 0.3532, "mean_token_accuracy": 0.8877429366111755, "num_tokens": 676137526.0, "step": 17718 }, { "epoch": 2.254038926345249, "ewc_loss": 0.032607827335596085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2607826142339036e-05, "grad_norm": 18.736684799194336, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8765209317207336, "num_tokens": 676177245.0, "step": 17719 }, { "epoch": 2.254166136623839, "ewc_loss": 0.03262139856815338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2621399441268295e-05, "grad_norm": 18.851884841918945, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.8799238204956055, "num_tokens": 676214568.0, "step": 17720 }, { "epoch": 2.2542933469024296, "ewc_loss": 0.032652348279953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2652347726980224e-05, "grad_norm": 18.78817367553711, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8747378587722778, "num_tokens": 676252552.0, "step": 17721 }, { "epoch": 2.25442055718102, "ewc_loss": 0.03260917216539383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260917219449766e-05, "grad_norm": 18.865795135498047, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8563358783721924, "num_tokens": 676290480.0, "step": 17722 }, { "epoch": 2.2545477674596106, "ewc_loss": 0.03263091295957565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263091275584884e-05, "grad_norm": 18.789562225341797, "learning_rate": 1e-06, "loss": 0.3534, "mean_token_accuracy": 0.8862892389297485, "num_tokens": 676324950.0, "step": 17723 }, { "epoch": 2.254674977738201, "ewc_loss": 0.03259764611721039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.259764707763679e-05, "grad_norm": 18.900236129760742, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8657017350196838, "num_tokens": 676363851.0, "step": 17724 }, { "epoch": 2.2548021880167917, "ewc_loss": 0.03267034515738487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.267034480813891e-05, "grad_norm": 18.837039947509766, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8820894956588745, "num_tokens": 676399920.0, "step": 17725 }, { "epoch": 2.254929398295382, "ewc_loss": 0.032604433596134186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260443190811202e-05, "grad_norm": 18.850536346435547, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8715486526489258, "num_tokens": 676443243.0, "step": 17726 }, { "epoch": 2.2550566085739727, "ewc_loss": 0.032608307898044586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260830635554157e-05, "grad_norm": 18.800678253173828, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8609110116958618, "num_tokens": 676480272.0, "step": 17727 }, { "epoch": 2.2551838188525632, "ewc_loss": 0.03263026848435402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2630268833599985e-05, "grad_norm": 18.937414169311523, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.869536280632019, "num_tokens": 676516630.0, "step": 17728 }, { "epoch": 2.2553110291311538, "ewc_loss": 0.03264705836772919, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.264705810579471e-05, "grad_norm": 18.720203399658203, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.877576470375061, "num_tokens": 676557842.0, "step": 17729 }, { "epoch": 2.2554382394097443, "ewc_loss": 0.032634347677230835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2634347007842734e-05, "grad_norm": 18.953105926513672, "learning_rate": 1e-06, "loss": 0.3621, "mean_token_accuracy": 0.8840755820274353, "num_tokens": 676599413.0, "step": 17730 }, { "epoch": 2.255565449688335, "ewc_loss": 0.032719604671001434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.271960304118693e-05, "grad_norm": 18.80521583557129, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8656859397888184, "num_tokens": 676637791.0, "step": 17731 }, { "epoch": 2.2556926599669254, "ewc_loss": 0.03256594389677048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2565945730311796e-05, "grad_norm": 18.822010040283203, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8695502877235413, "num_tokens": 676677108.0, "step": 17732 }, { "epoch": 2.255819870245516, "ewc_loss": 0.03274862840771675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2748626836109906e-05, "grad_norm": 18.840862274169922, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8753349781036377, "num_tokens": 676720698.0, "step": 17733 }, { "epoch": 2.2559470805241064, "ewc_loss": 0.03258534520864487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.258534343331121e-05, "grad_norm": 18.87143325805664, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8624632954597473, "num_tokens": 676763719.0, "step": 17734 }, { "epoch": 2.256074290802697, "ewc_loss": 0.032632485032081604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2632484362693503e-05, "grad_norm": 18.73782730102539, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8745102882385254, "num_tokens": 676801083.0, "step": 17735 }, { "epoch": 2.2562015010812875, "ewc_loss": 0.03261146321892738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261146412114613e-05, "grad_norm": 18.953250885009766, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8665497303009033, "num_tokens": 676840861.0, "step": 17736 }, { "epoch": 2.256328711359878, "ewc_loss": 0.03275690972805023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2756910513853654e-05, "grad_norm": 18.79785919189453, "learning_rate": 1e-06, "loss": 0.3577, "mean_token_accuracy": 0.8862490653991699, "num_tokens": 676876864.0, "step": 17737 }, { "epoch": 2.2564559216384685, "ewc_loss": 0.03264275938272476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.264275801484473e-05, "grad_norm": 18.845048904418945, "learning_rate": 1e-06, "loss": 0.3321, "mean_token_accuracy": 0.8935943841934204, "num_tokens": 676920983.0, "step": 17738 }, { "epoch": 2.256583131917059, "ewc_loss": 0.03268035501241684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.268035652581602e-05, "grad_norm": 18.822538375854492, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.8824565410614014, "num_tokens": 676961294.0, "step": 17739 }, { "epoch": 2.2567103421956496, "ewc_loss": 0.03259614109992981, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.259614095441066e-05, "grad_norm": 18.81917953491211, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8660019040107727, "num_tokens": 676996318.0, "step": 17740 }, { "epoch": 2.2568375524742397, "ewc_loss": 0.03264519199728966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2645191822666675e-05, "grad_norm": 18.85246467590332, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8751832246780396, "num_tokens": 677034083.0, "step": 17741 }, { "epoch": 2.2569647627528306, "ewc_loss": 0.03263789415359497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263789403717965e-05, "grad_norm": 18.82634162902832, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8748064041137695, "num_tokens": 677071206.0, "step": 17742 }, { "epoch": 2.2570919730314207, "ewc_loss": 0.032617565244436264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261756501160562e-05, "grad_norm": 18.82183074951172, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8797502517700195, "num_tokens": 677103573.0, "step": 17743 }, { "epoch": 2.2572191833100113, "ewc_loss": 0.03264092281460762, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2640924473525956e-05, "grad_norm": 18.853445053100586, "learning_rate": 1e-06, "loss": 0.3634, "mean_token_accuracy": 0.8833315968513489, "num_tokens": 677141406.0, "step": 17744 }, { "epoch": 2.257346393588602, "ewc_loss": 0.03263741359114647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263741382397711e-05, "grad_norm": 18.79793357849121, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8641959428787231, "num_tokens": 677185968.0, "step": 17745 }, { "epoch": 2.2574736038671923, "ewc_loss": 0.0325922854244709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.259228469687514e-05, "grad_norm": 18.90966033935547, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8663948178291321, "num_tokens": 677229004.0, "step": 17746 }, { "epoch": 2.257600814145783, "ewc_loss": 0.03266308084130287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2663079764461145e-05, "grad_norm": 18.842878341674805, "learning_rate": 1e-06, "loss": 0.3471, "mean_token_accuracy": 0.8837838768959045, "num_tokens": 677266317.0, "step": 17747 }, { "epoch": 2.2577280244243734, "ewc_loss": 0.032537996768951416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.253799513913691e-05, "grad_norm": 18.805110931396484, "learning_rate": 1e-06, "loss": 0.359, "mean_token_accuracy": 0.8843033909797668, "num_tokens": 677302344.0, "step": 17748 }, { "epoch": 2.257855234702964, "ewc_loss": 0.03263315558433533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2633157388772815e-05, "grad_norm": 18.920555114746094, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8641096353530884, "num_tokens": 677342103.0, "step": 17749 }, { "epoch": 2.2579824449815544, "ewc_loss": 0.032592643052339554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.259264485677704e-05, "grad_norm": 18.81910514831543, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.861965000629425, "num_tokens": 677378602.0, "step": 17750 }, { "epoch": 2.258109655260145, "ewc_loss": 0.0325448215007782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2544819987379014e-05, "grad_norm": 18.832136154174805, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8728002905845642, "num_tokens": 677415005.0, "step": 17751 }, { "epoch": 2.2582368655387355, "ewc_loss": 0.03267042338848114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.267042484367266e-05, "grad_norm": 18.81613540649414, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8558903336524963, "num_tokens": 677457908.0, "step": 17752 }, { "epoch": 2.258364075817326, "ewc_loss": 0.03255893662571907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.255893534515053e-05, "grad_norm": 18.866714477539062, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8635144829750061, "num_tokens": 677492759.0, "step": 17753 }, { "epoch": 2.2584912860959165, "ewc_loss": 0.032631997019052505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263199687353335e-05, "grad_norm": 18.761056900024414, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8726128935813904, "num_tokens": 677535771.0, "step": 17754 }, { "epoch": 2.258618496374507, "ewc_loss": 0.032563626766204834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.256362833781168e-05, "grad_norm": 18.84795379638672, "learning_rate": 1e-06, "loss": 0.357, "mean_token_accuracy": 0.8871192336082458, "num_tokens": 677575325.0, "step": 17755 }, { "epoch": 2.2587457066530976, "ewc_loss": 0.032662078738212585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2662079320289195e-05, "grad_norm": 18.789369583129883, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8737400770187378, "num_tokens": 677614132.0, "step": 17756 }, { "epoch": 2.258872916931688, "ewc_loss": 0.032604627311229706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.26046283589676e-05, "grad_norm": 18.879566192626953, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8689649105072021, "num_tokens": 677645657.0, "step": 17757 }, { "epoch": 2.2590001272102787, "ewc_loss": 0.0326501801609993, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.26501794916112e-05, "grad_norm": 18.771390914916992, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.881609320640564, "num_tokens": 677686238.0, "step": 17758 }, { "epoch": 2.259127337488869, "ewc_loss": 0.03266923129558563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.266923158662394e-05, "grad_norm": 18.88751983642578, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8779126405715942, "num_tokens": 677724080.0, "step": 17759 }, { "epoch": 2.2592545477674597, "ewc_loss": 0.03264785557985306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2647854823153466e-05, "grad_norm": 18.806171417236328, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8736780285835266, "num_tokens": 677765768.0, "step": 17760 }, { "epoch": 2.2593817580460502, "ewc_loss": 0.03260774910449982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260774974478409e-05, "grad_norm": 18.820655822753906, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8743011951446533, "num_tokens": 677803937.0, "step": 17761 }, { "epoch": 2.2595089683246408, "ewc_loss": 0.03265294060111046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.265294071752578e-05, "grad_norm": 18.766239166259766, "learning_rate": 1e-06, "loss": 0.35, "mean_token_accuracy": 0.8840513229370117, "num_tokens": 677839685.0, "step": 17762 }, { "epoch": 2.2596361786032313, "ewc_loss": 0.03267059102654457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.267059219069779e-05, "grad_norm": 18.8459415435791, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8599806427955627, "num_tokens": 677880411.0, "step": 17763 }, { "epoch": 2.259763388881822, "ewc_loss": 0.03275180608034134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2751806429587305e-05, "grad_norm": 18.848312377929688, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8821249604225159, "num_tokens": 677921177.0, "step": 17764 }, { "epoch": 2.2598905991604123, "ewc_loss": 0.032636359333992004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2636358810123056e-05, "grad_norm": 18.776269912719727, "learning_rate": 1e-06, "loss": 0.3663, "mean_token_accuracy": 0.8842840790748596, "num_tokens": 677956393.0, "step": 17765 }, { "epoch": 2.2600178094390024, "ewc_loss": 0.03264227509498596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.264227416366339e-05, "grad_norm": 18.91455078125, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8717391490936279, "num_tokens": 677988364.0, "step": 17766 }, { "epoch": 2.2601450197175934, "ewc_loss": 0.03268345445394516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2683456083759665e-05, "grad_norm": 18.821666717529297, "learning_rate": 1e-06, "loss": 0.3756, "mean_token_accuracy": 0.8813810348510742, "num_tokens": 678028093.0, "step": 17767 }, { "epoch": 2.2602722299961835, "ewc_loss": 0.03263067826628685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2630679925205186e-05, "grad_norm": 18.850841522216797, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8748236894607544, "num_tokens": 678068174.0, "step": 17768 }, { "epoch": 2.260399440274774, "ewc_loss": 0.03261450305581093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261450183345005e-05, "grad_norm": 18.827804565429688, "learning_rate": 1e-06, "loss": 0.3781, "mean_token_accuracy": 0.8757811784744263, "num_tokens": 678105784.0, "step": 17769 }, { "epoch": 2.2605266505533645, "ewc_loss": 0.03268354758620262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.268354703322984e-05, "grad_norm": 18.777511596679688, "learning_rate": 1e-06, "loss": 0.3573, "mean_token_accuracy": 0.884651780128479, "num_tokens": 678142873.0, "step": 17770 }, { "epoch": 2.260653860831955, "ewc_loss": 0.03263365104794502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263365215389058e-05, "grad_norm": 18.810773849487305, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8708104491233826, "num_tokens": 678181527.0, "step": 17771 }, { "epoch": 2.2607810711105456, "ewc_loss": 0.03272836282849312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2728363294154406e-05, "grad_norm": 18.851703643798828, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8768353462219238, "num_tokens": 678221225.0, "step": 17772 }, { "epoch": 2.260908281389136, "ewc_loss": 0.032710347324609756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.271034802310169e-05, "grad_norm": 18.81196403503418, "learning_rate": 1e-06, "loss": 0.3615, "mean_token_accuracy": 0.8823931813240051, "num_tokens": 678254767.0, "step": 17773 }, { "epoch": 2.2610354916677267, "ewc_loss": 0.03267452493309975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2674524845788255e-05, "grad_norm": 18.87510108947754, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.876558780670166, "num_tokens": 678290830.0, "step": 17774 }, { "epoch": 2.261162701946317, "ewc_loss": 0.03272705525159836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.272705362178385e-05, "grad_norm": 18.8077335357666, "learning_rate": 1e-06, "loss": 0.3544, "mean_token_accuracy": 0.8850564956665039, "num_tokens": 678327102.0, "step": 17775 }, { "epoch": 2.2612899122249077, "ewc_loss": 0.032704077661037445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.270407614763826e-05, "grad_norm": 18.92710304260254, "learning_rate": 1e-06, "loss": 0.3538, "mean_token_accuracy": 0.8872311115264893, "num_tokens": 678364872.0, "step": 17776 }, { "epoch": 2.2614171225034982, "ewc_loss": 0.03272802382707596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2728024962125346e-05, "grad_norm": 18.842472076416016, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8684899806976318, "num_tokens": 678399233.0, "step": 17777 }, { "epoch": 2.2615443327820888, "ewc_loss": 0.03268427029252052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2684270991012454e-05, "grad_norm": 18.8563175201416, "learning_rate": 1e-06, "loss": 0.3658, "mean_token_accuracy": 0.8824077844619751, "num_tokens": 678434577.0, "step": 17778 }, { "epoch": 2.2616715430606793, "ewc_loss": 0.032741010189056396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2741008908487856e-05, "grad_norm": 18.897010803222656, "learning_rate": 1e-06, "loss": 0.3667, "mean_token_accuracy": 0.8798681497573853, "num_tokens": 678466835.0, "step": 17779 }, { "epoch": 2.26179875333927, "ewc_loss": 0.032687265425920486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.268726504757069e-05, "grad_norm": 18.760761260986328, "learning_rate": 1e-06, "loss": 0.343, "mean_token_accuracy": 0.8889344930648804, "num_tokens": 678506817.0, "step": 17780 }, { "epoch": 2.2619259636178604, "ewc_loss": 0.03263550624251366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.26355075230822e-05, "grad_norm": 18.861408233642578, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8751462697982788, "num_tokens": 678543734.0, "step": 17781 }, { "epoch": 2.262053173896451, "ewc_loss": 0.032764121890068054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.276412098784931e-05, "grad_norm": 18.80368423461914, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8707245588302612, "num_tokens": 678585066.0, "step": 17782 }, { "epoch": 2.2621803841750414, "ewc_loss": 0.032630518078804016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2630519854137674e-05, "grad_norm": 18.850805282592773, "learning_rate": 1e-06, "loss": 0.3405, "mean_token_accuracy": 0.8889725804328918, "num_tokens": 678614951.0, "step": 17783 }, { "epoch": 2.262307594453632, "ewc_loss": 0.03270598500967026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2705986086511984e-05, "grad_norm": 18.84878921508789, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8766708374023438, "num_tokens": 678649671.0, "step": 17784 }, { "epoch": 2.2624348047322225, "ewc_loss": 0.03265847638249397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2658477721270174e-05, "grad_norm": 18.788564682006836, "learning_rate": 1e-06, "loss": 0.3403, "mean_token_accuracy": 0.8909727931022644, "num_tokens": 678693418.0, "step": 17785 }, { "epoch": 2.262562015010813, "ewc_loss": 0.03262485936284065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.262485915911384e-05, "grad_norm": 18.75327491760254, "learning_rate": 1e-06, "loss": 0.3574, "mean_token_accuracy": 0.8852192163467407, "num_tokens": 678730891.0, "step": 17786 }, { "epoch": 2.2626892252894035, "ewc_loss": 0.03271402046084404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.271401874371804e-05, "grad_norm": 18.928871154785156, "learning_rate": 1e-06, "loss": 0.3618, "mean_token_accuracy": 0.8832710981369019, "num_tokens": 678766369.0, "step": 17787 }, { "epoch": 2.262816435567994, "ewc_loss": 0.032620832324028015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.262083191657439e-05, "grad_norm": 18.601253509521484, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8634439706802368, "num_tokens": 678809118.0, "step": 17788 }, { "epoch": 2.2629436458465846, "ewc_loss": 0.0326351672410965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263516555307433e-05, "grad_norm": 18.866960525512695, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8736826181411743, "num_tokens": 678850057.0, "step": 17789 }, { "epoch": 2.263070856125175, "ewc_loss": 0.032782841473817825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2782842026790604e-05, "grad_norm": 18.817365646362305, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8727787733078003, "num_tokens": 678890253.0, "step": 17790 }, { "epoch": 2.263198066403765, "ewc_loss": 0.03262627497315407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.26262743328698e-05, "grad_norm": 18.81256675720215, "learning_rate": 1e-06, "loss": 0.3478, "mean_token_accuracy": 0.8890225887298584, "num_tokens": 678922535.0, "step": 17791 }, { "epoch": 2.263325276682356, "ewc_loss": 0.0327349528670311, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.273495167377405e-05, "grad_norm": 18.807701110839844, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.874101996421814, "num_tokens": 678956136.0, "step": 17792 }, { "epoch": 2.2634524869609463, "ewc_loss": 0.032713763415813446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2713764085201547e-05, "grad_norm": 18.880037307739258, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.8781887292861938, "num_tokens": 678990864.0, "step": 17793 }, { "epoch": 2.263579697239537, "ewc_loss": 0.032658085227012634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.265808481955901e-05, "grad_norm": 18.72864532470703, "learning_rate": 1e-06, "loss": 0.3518, "mean_token_accuracy": 0.8876007795333862, "num_tokens": 679028345.0, "step": 17794 }, { "epoch": 2.2637069075181273, "ewc_loss": 0.03266432508826256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.266432395321317e-05, "grad_norm": 18.78304672241211, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8618007898330688, "num_tokens": 679073679.0, "step": 17795 }, { "epoch": 2.263834117796718, "ewc_loss": 0.03276381269097328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.276381175965071e-05, "grad_norm": 18.864261627197266, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8734394311904907, "num_tokens": 679109274.0, "step": 17796 }, { "epoch": 2.2639613280753084, "ewc_loss": 0.032672081142663956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.267208012402989e-05, "grad_norm": 18.847267150878906, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.875707745552063, "num_tokens": 679141644.0, "step": 17797 }, { "epoch": 2.264088538353899, "ewc_loss": 0.03270183131098747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2701831514714286e-05, "grad_norm": 18.846940994262695, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8593803644180298, "num_tokens": 679184351.0, "step": 17798 }, { "epoch": 2.2642157486324894, "ewc_loss": 0.0327162891626358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.271628884249367e-05, "grad_norm": 18.839162826538086, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8715783357620239, "num_tokens": 679222509.0, "step": 17799 }, { "epoch": 2.26434295891108, "ewc_loss": 0.032700709998607635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.27007110172417e-05, "grad_norm": 18.841575622558594, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8595491051673889, "num_tokens": 679263362.0, "step": 17800 }, { "epoch": 2.2644701691896705, "ewc_loss": 0.03274378180503845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.274378104833886e-05, "grad_norm": 18.844928741455078, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8637394905090332, "num_tokens": 679307859.0, "step": 17801 }, { "epoch": 2.264597379468261, "ewc_loss": 0.03275143355131149, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.275143535574898e-05, "grad_norm": 18.848451614379883, "learning_rate": 1e-06, "loss": 0.3523, "mean_token_accuracy": 0.8842005729675293, "num_tokens": 679344801.0, "step": 17802 }, { "epoch": 2.2647245897468515, "ewc_loss": 0.03272933140397072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.272933099651709e-05, "grad_norm": 18.92389488220215, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8681952953338623, "num_tokens": 679382795.0, "step": 17803 }, { "epoch": 2.264851800025442, "ewc_loss": 0.03273303806781769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.273303809692152e-05, "grad_norm": 18.82724380493164, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8724675178527832, "num_tokens": 679425282.0, "step": 17804 }, { "epoch": 2.2649790103040326, "ewc_loss": 0.0326644703745842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2664469472365454e-05, "grad_norm": 18.815093994140625, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.861923336982727, "num_tokens": 679466881.0, "step": 17805 }, { "epoch": 2.265106220582623, "ewc_loss": 0.032727714627981186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.272771573392674e-05, "grad_norm": 18.862377166748047, "learning_rate": 1e-06, "loss": 0.3497, "mean_token_accuracy": 0.8865275382995605, "num_tokens": 679509268.0, "step": 17806 }, { "epoch": 2.2652334308612136, "ewc_loss": 0.03270948678255081, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.270948582212441e-05, "grad_norm": 18.772397994995117, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8728315830230713, "num_tokens": 679545082.0, "step": 17807 }, { "epoch": 2.265360641139804, "ewc_loss": 0.032703008502721786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2703010219847783e-05, "grad_norm": 18.921817779541016, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8758569359779358, "num_tokens": 679581795.0, "step": 17808 }, { "epoch": 2.2654878514183947, "ewc_loss": 0.03267954662442207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.267954525654204e-05, "grad_norm": 18.77530288696289, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8727438449859619, "num_tokens": 679618978.0, "step": 17809 }, { "epoch": 2.2656150616969852, "ewc_loss": 0.03264458104968071, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2644580642227083e-05, "grad_norm": 18.87885856628418, "learning_rate": 1e-06, "loss": 0.3781, "mean_token_accuracy": 0.8782330751419067, "num_tokens": 679655002.0, "step": 17810 }, { "epoch": 2.2657422719755758, "ewc_loss": 0.03273053467273712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.273053516750224e-05, "grad_norm": 18.789981842041016, "learning_rate": 1e-06, "loss": 0.3382, "mean_token_accuracy": 0.888369619846344, "num_tokens": 679691143.0, "step": 17811 }, { "epoch": 2.2658694822541663, "ewc_loss": 0.032662030309438705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.26620320265647e-05, "grad_norm": 18.862403869628906, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8766012191772461, "num_tokens": 679729168.0, "step": 17812 }, { "epoch": 2.265996692532757, "ewc_loss": 0.03274896368384361, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2748965168138966e-05, "grad_norm": 18.910703659057617, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8731157183647156, "num_tokens": 679773385.0, "step": 17813 }, { "epoch": 2.2661239028113473, "ewc_loss": 0.03261548653244972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.261548772570677e-05, "grad_norm": 18.792072296142578, "learning_rate": 1e-06, "loss": 0.3502, "mean_token_accuracy": 0.8859394192695618, "num_tokens": 679813884.0, "step": 17814 }, { "epoch": 2.266251113089938, "ewc_loss": 0.032668888568878174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.266888961661607e-05, "grad_norm": 18.886524200439453, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8757848143577576, "num_tokens": 679850834.0, "step": 17815 }, { "epoch": 2.266378323368528, "ewc_loss": 0.032674532383680344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.267453212174587e-05, "grad_norm": 18.854782104492188, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.880853533744812, "num_tokens": 679890210.0, "step": 17816 }, { "epoch": 2.266505533647119, "ewc_loss": 0.032593730837106705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.259373261244036e-05, "grad_norm": 18.78049087524414, "learning_rate": 1e-06, "loss": 0.3664, "mean_token_accuracy": 0.8830494284629822, "num_tokens": 679926252.0, "step": 17817 }, { "epoch": 2.266632743925709, "ewc_loss": 0.0326773039996624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2677304261596873e-05, "grad_norm": 18.931049346923828, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8715468645095825, "num_tokens": 679959626.0, "step": 17818 }, { "epoch": 2.2667599542042995, "ewc_loss": 0.032674700021743774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2674699468770996e-05, "grad_norm": 18.79183578491211, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.869763970375061, "num_tokens": 679999973.0, "step": 17819 }, { "epoch": 2.26688716448289, "ewc_loss": 0.03256610035896301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.25661021634005e-05, "grad_norm": 18.829689025878906, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8680052757263184, "num_tokens": 680041081.0, "step": 17820 }, { "epoch": 2.2670143747614806, "ewc_loss": 0.03270343318581581, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2703432225389406e-05, "grad_norm": 18.82790756225586, "learning_rate": 1e-06, "loss": 0.3767, "mean_token_accuracy": 0.8800768852233887, "num_tokens": 680078953.0, "step": 17821 }, { "epoch": 2.267141585040071, "ewc_loss": 0.032618310302495956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2618310797261074e-05, "grad_norm": 18.84786605834961, "learning_rate": 1e-06, "loss": 0.365, "mean_token_accuracy": 0.8833749890327454, "num_tokens": 680112309.0, "step": 17822 }, { "epoch": 2.2672687953186617, "ewc_loss": 0.032691121101379395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.269112130510621e-05, "grad_norm": 18.855417251586914, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8699539303779602, "num_tokens": 680148607.0, "step": 17823 }, { "epoch": 2.267396005597252, "ewc_loss": 0.03268039971590042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.268040018156171e-05, "grad_norm": 18.774105072021484, "learning_rate": 1e-06, "loss": 0.3393, "mean_token_accuracy": 0.8920445442199707, "num_tokens": 680189617.0, "step": 17824 }, { "epoch": 2.2675232158758427, "ewc_loss": 0.032584305852651596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.258430660935119e-05, "grad_norm": 18.804641723632812, "learning_rate": 1e-06, "loss": 0.3406, "mean_token_accuracy": 0.8903152346611023, "num_tokens": 680227940.0, "step": 17825 }, { "epoch": 2.2676504261544332, "ewc_loss": 0.032636113464832306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263611506554298e-05, "grad_norm": 18.837095260620117, "learning_rate": 1e-06, "loss": 0.3541, "mean_token_accuracy": 0.8873031139373779, "num_tokens": 680263473.0, "step": 17826 }, { "epoch": 2.2677776364330238, "ewc_loss": 0.0326494537293911, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2649455533828586e-05, "grad_norm": 18.793920516967773, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8673014044761658, "num_tokens": 680297895.0, "step": 17827 }, { "epoch": 2.2679048467116143, "ewc_loss": 0.03263326734304428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263326652813703e-05, "grad_norm": 18.817367553710938, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.877464234828949, "num_tokens": 680337658.0, "step": 17828 }, { "epoch": 2.268032056990205, "ewc_loss": 0.03272590413689613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.272590402048081e-05, "grad_norm": 18.925111770629883, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8692469596862793, "num_tokens": 680375912.0, "step": 17829 }, { "epoch": 2.2681592672687954, "ewc_loss": 0.0326545424759388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.26545414282009e-05, "grad_norm": 18.798192977905273, "learning_rate": 1e-06, "loss": 0.3576, "mean_token_accuracy": 0.886786699295044, "num_tokens": 680415231.0, "step": 17830 }, { "epoch": 2.268286477547386, "ewc_loss": 0.03263336420059204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263336475356482e-05, "grad_norm": 18.864696502685547, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8702114820480347, "num_tokens": 680460037.0, "step": 17831 }, { "epoch": 2.2684136878259764, "ewc_loss": 0.03267139941453934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2671399821992964e-05, "grad_norm": 18.793418884277344, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8535416722297668, "num_tokens": 680505521.0, "step": 17832 }, { "epoch": 2.268540898104567, "ewc_loss": 0.03264870122075081, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.264870247221552e-05, "grad_norm": 18.863012313842773, "learning_rate": 1e-06, "loss": 0.3497, "mean_token_accuracy": 0.8881715536117554, "num_tokens": 680547682.0, "step": 17833 }, { "epoch": 2.2686681083831575, "ewc_loss": 0.03267521411180496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2675212423782796e-05, "grad_norm": 18.800371170043945, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8820785284042358, "num_tokens": 680587510.0, "step": 17834 }, { "epoch": 2.268795318661748, "ewc_loss": 0.03266126662492752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.266126805101521e-05, "grad_norm": 18.88697624206543, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8791993856430054, "num_tokens": 680624233.0, "step": 17835 }, { "epoch": 2.2689225289403385, "ewc_loss": 0.03264855965971947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.264856059104204e-05, "grad_norm": 18.797443389892578, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8705291152000427, "num_tokens": 680666091.0, "step": 17836 }, { "epoch": 2.269049739218929, "ewc_loss": 0.03260749578475952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260749508626759e-05, "grad_norm": 18.87642478942871, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8818714618682861, "num_tokens": 680703062.0, "step": 17837 }, { "epoch": 2.2691769494975196, "ewc_loss": 0.03269914910197258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.269915032433346e-05, "grad_norm": 18.824583053588867, "learning_rate": 1e-06, "loss": 0.3508, "mean_token_accuracy": 0.8884577751159668, "num_tokens": 680735966.0, "step": 17838 }, { "epoch": 2.2693041597761097, "ewc_loss": 0.03263455256819725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263455437263474e-05, "grad_norm": 18.850194931030273, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.8755988478660583, "num_tokens": 680780010.0, "step": 17839 }, { "epoch": 2.2694313700547006, "ewc_loss": 0.03266065567731857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.266065687057562e-05, "grad_norm": 18.905500411987305, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8742237091064453, "num_tokens": 680819516.0, "step": 17840 }, { "epoch": 2.2695585803332907, "ewc_loss": 0.03268265724182129, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2682655728422105e-05, "grad_norm": 18.865936279296875, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8750871419906616, "num_tokens": 680861374.0, "step": 17841 }, { "epoch": 2.2696857906118812, "ewc_loss": 0.03263452649116516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263452526880428e-05, "grad_norm": 18.86702537536621, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8664821982383728, "num_tokens": 680902447.0, "step": 17842 }, { "epoch": 2.2698130008904718, "ewc_loss": 0.03264733403921127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.264733459218405e-05, "grad_norm": 18.850759506225586, "learning_rate": 1e-06, "loss": 0.3342, "mean_token_accuracy": 0.890863299369812, "num_tokens": 680942376.0, "step": 17843 }, { "epoch": 2.2699402111690623, "ewc_loss": 0.03258544206619263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2585441658739e-05, "grad_norm": 18.80235481262207, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.875709056854248, "num_tokens": 680977060.0, "step": 17844 }, { "epoch": 2.270067421447653, "ewc_loss": 0.03260335326194763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.260335142840631e-05, "grad_norm": 18.90641212463379, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8605207204818726, "num_tokens": 681014504.0, "step": 17845 }, { "epoch": 2.2701946317262434, "ewc_loss": 0.03266122564673424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.266122439526953e-05, "grad_norm": 18.814424514770508, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8583033084869385, "num_tokens": 681053919.0, "step": 17846 }, { "epoch": 2.270321842004834, "ewc_loss": 0.032628513872623444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2628515327814966e-05, "grad_norm": 18.85055923461914, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8748213648796082, "num_tokens": 681090883.0, "step": 17847 }, { "epoch": 2.2704490522834244, "ewc_loss": 0.032701652497053146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.270165325375274e-05, "grad_norm": 18.811830520629883, "learning_rate": 1e-06, "loss": 0.3468, "mean_token_accuracy": 0.8889638185501099, "num_tokens": 681128819.0, "step": 17848 }, { "epoch": 2.270576262562015, "ewc_loss": 0.03263747692108154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.263747566961683e-05, "grad_norm": 18.918914794921875, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.865851879119873, "num_tokens": 681171434.0, "step": 17849 }, { "epoch": 2.2707034728406055, "ewc_loss": 0.032680779695510864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2680778531357646e-05, "grad_norm": 18.855384826660156, "learning_rate": 1e-06, "loss": 0.3568, "mean_token_accuracy": 0.8855596780776978, "num_tokens": 681206297.0, "step": 17850 }, { "epoch": 2.270830683119196, "ewc_loss": 0.03264370188117027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.264370025135577e-05, "grad_norm": 18.909204483032227, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.865976095199585, "num_tokens": 681242521.0, "step": 17851 }, { "epoch": 2.2709578933977865, "ewc_loss": 0.03265462815761566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.265462873969227e-05, "grad_norm": 18.80100440979004, "learning_rate": 1e-06, "loss": 0.3559, "mean_token_accuracy": 0.8849040269851685, "num_tokens": 681282474.0, "step": 17852 }, { "epoch": 2.271085103676377, "ewc_loss": 0.032641857862472534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.264185943407938e-05, "grad_norm": 18.829227447509766, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.87180095911026, "num_tokens": 681317280.0, "step": 17853 }, { "epoch": 2.2712123139549676, "ewc_loss": 0.03269648551940918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.269648368586786e-05, "grad_norm": 18.857004165649414, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8729528784751892, "num_tokens": 681349155.0, "step": 17854 }, { "epoch": 2.271339524233558, "ewc_loss": 0.03264724835753441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.264724728069268e-05, "grad_norm": 18.715984344482422, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8753656148910522, "num_tokens": 681380863.0, "step": 17855 }, { "epoch": 2.2714667345121486, "ewc_loss": 0.03269384428858757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.269384251325391e-05, "grad_norm": 18.874799728393555, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8794739246368408, "num_tokens": 681421970.0, "step": 17856 }, { "epoch": 2.271593944790739, "ewc_loss": 0.03276337683200836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2763375202193856e-05, "grad_norm": 18.773954391479492, "learning_rate": 1e-06, "loss": 0.3385, "mean_token_accuracy": 0.8902073502540588, "num_tokens": 681453511.0, "step": 17857 }, { "epoch": 2.2717211550693297, "ewc_loss": 0.03275562450289726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.275562630733475e-05, "grad_norm": 18.89244270324707, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.861872136592865, "num_tokens": 681496293.0, "step": 17858 }, { "epoch": 2.2718483653479202, "ewc_loss": 0.032790813595056534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.279081283835694e-05, "grad_norm": 18.84473991394043, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8684030175209045, "num_tokens": 681537074.0, "step": 17859 }, { "epoch": 2.2719755756265108, "ewc_loss": 0.032749973237514496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.274997288826853e-05, "grad_norm": 18.849401473999023, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8685566186904907, "num_tokens": 681578911.0, "step": 17860 }, { "epoch": 2.2721027859051013, "ewc_loss": 0.03274563327431679, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.274563277955167e-05, "grad_norm": 18.93465805053711, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8722879886627197, "num_tokens": 681622134.0, "step": 17861 }, { "epoch": 2.272229996183692, "ewc_loss": 0.0327562540769577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.275625567766838e-05, "grad_norm": 18.87746810913086, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8763558864593506, "num_tokens": 681655919.0, "step": 17862 }, { "epoch": 2.2723572064622823, "ewc_loss": 0.03271780163049698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.271780224167742e-05, "grad_norm": 18.929906845092773, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8673779368400574, "num_tokens": 681696539.0, "step": 17863 }, { "epoch": 2.2724844167408724, "ewc_loss": 0.03273138031363487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2731379178585485e-05, "grad_norm": 18.850221633911133, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8731616735458374, "num_tokens": 681734630.0, "step": 17864 }, { "epoch": 2.2726116270194634, "ewc_loss": 0.0326949842274189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.269498483859934e-05, "grad_norm": 18.846921920776367, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8630960583686829, "num_tokens": 681779669.0, "step": 17865 }, { "epoch": 2.2727388372980535, "ewc_loss": 0.03275219351053238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2752192055340856e-05, "grad_norm": 18.86240577697754, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8721970319747925, "num_tokens": 681821920.0, "step": 17866 }, { "epoch": 2.272866047576644, "ewc_loss": 0.03274169936776161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2741700124461204e-05, "grad_norm": 18.875953674316406, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8731788396835327, "num_tokens": 681859239.0, "step": 17867 }, { "epoch": 2.2729932578552345, "ewc_loss": 0.032741643488407135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.274164191680029e-05, "grad_norm": 18.833295822143555, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8629989624023438, "num_tokens": 681896569.0, "step": 17868 }, { "epoch": 2.273120468133825, "ewc_loss": 0.03272354602813721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2723546610213816e-05, "grad_norm": 18.85555076599121, "learning_rate": 1e-06, "loss": 0.3424, "mean_token_accuracy": 0.8889411687850952, "num_tokens": 681932549.0, "step": 17869 }, { "epoch": 2.2732476784124156, "ewc_loss": 0.03276541829109192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.276541974628344e-05, "grad_norm": 18.843170166015625, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8709828853607178, "num_tokens": 681973981.0, "step": 17870 }, { "epoch": 2.273374888691006, "ewc_loss": 0.032696954905986786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.269695662311278e-05, "grad_norm": 18.884756088256836, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.852989912033081, "num_tokens": 682009512.0, "step": 17871 }, { "epoch": 2.2735020989695967, "ewc_loss": 0.03274065628647804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.274065602454357e-05, "grad_norm": 18.885950088500977, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8726540803909302, "num_tokens": 682048058.0, "step": 17872 }, { "epoch": 2.273629309248187, "ewc_loss": 0.03269808366894722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.269808439654298e-05, "grad_norm": 18.838577270507812, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8678165674209595, "num_tokens": 682084181.0, "step": 17873 }, { "epoch": 2.2737565195267777, "ewc_loss": 0.03269251435995102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2692514650989324e-05, "grad_norm": 18.813016891479492, "learning_rate": 1e-06, "loss": 0.3354, "mean_token_accuracy": 0.893868088722229, "num_tokens": 682119613.0, "step": 17874 }, { "epoch": 2.2738837298053682, "ewc_loss": 0.03274550288915634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2745501812314615e-05, "grad_norm": 18.828107833862305, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8785114288330078, "num_tokens": 682159501.0, "step": 17875 }, { "epoch": 2.2740109400839588, "ewc_loss": 0.032720863819122314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.272086541983299e-05, "grad_norm": 18.876018524169922, "learning_rate": 1e-06, "loss": 0.3601, "mean_token_accuracy": 0.8830338716506958, "num_tokens": 682197323.0, "step": 17876 }, { "epoch": 2.2741381503625493, "ewc_loss": 0.03270133584737778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.270133674959652e-05, "grad_norm": 18.796859741210938, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.876952052116394, "num_tokens": 682231150.0, "step": 17877 }, { "epoch": 2.27426536064114, "ewc_loss": 0.03272230550646782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.27223060594406e-05, "grad_norm": 18.86309814453125, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.8765578866004944, "num_tokens": 682269351.0, "step": 17878 }, { "epoch": 2.2743925709197303, "ewc_loss": 0.032781220972537994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.278121948824264e-05, "grad_norm": 18.83234214782715, "learning_rate": 1e-06, "loss": 0.357, "mean_token_accuracy": 0.8859280347824097, "num_tokens": 682303633.0, "step": 17879 }, { "epoch": 2.274519781198321, "ewc_loss": 0.0327681228518486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.276812276453711e-05, "grad_norm": 18.904321670532227, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8734171390533447, "num_tokens": 682340890.0, "step": 17880 }, { "epoch": 2.2746469914769114, "ewc_loss": 0.03279523178935051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.279523298260756e-05, "grad_norm": 18.831096649169922, "learning_rate": 1e-06, "loss": 0.3432, "mean_token_accuracy": 0.8889979124069214, "num_tokens": 682374253.0, "step": 17881 }, { "epoch": 2.274774201755502, "ewc_loss": 0.0327540785074234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.275408016634174e-05, "grad_norm": 18.827192306518555, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8750163316726685, "num_tokens": 682412675.0, "step": 17882 }, { "epoch": 2.2749014120340925, "ewc_loss": 0.03274938091635704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2749379897722974e-05, "grad_norm": 18.901269912719727, "learning_rate": 1e-06, "loss": 0.3518, "mean_token_accuracy": 0.8862631320953369, "num_tokens": 682447864.0, "step": 17883 }, { "epoch": 2.275028622312683, "ewc_loss": 0.03280496224761009, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.280496093793772e-05, "grad_norm": 18.888940811157227, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8753452897071838, "num_tokens": 682486445.0, "step": 17884 }, { "epoch": 2.2751558325912735, "ewc_loss": 0.03272246941924095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.272246976848692e-05, "grad_norm": 18.783056259155273, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8745021820068359, "num_tokens": 682518173.0, "step": 17885 }, { "epoch": 2.275283042869864, "ewc_loss": 0.03279031068086624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.279031079728156e-05, "grad_norm": 18.914634704589844, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8744333982467651, "num_tokens": 682558627.0, "step": 17886 }, { "epoch": 2.2754102531484546, "ewc_loss": 0.03280200809240341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.280200689914636e-05, "grad_norm": 18.75380516052246, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8796032071113586, "num_tokens": 682600821.0, "step": 17887 }, { "epoch": 2.275537463427045, "ewc_loss": 0.03273288160562515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2732881663832814e-05, "grad_norm": 18.88994789123535, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8629497289657593, "num_tokens": 682643947.0, "step": 17888 }, { "epoch": 2.275664673705635, "ewc_loss": 0.032874055206775665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287405343144201e-05, "grad_norm": 18.849367141723633, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.869888186454773, "num_tokens": 682683045.0, "step": 17889 }, { "epoch": 2.275791883984226, "ewc_loss": 0.03270561993122101, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.270561865065247e-05, "grad_norm": 18.872846603393555, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.876096785068512, "num_tokens": 682721617.0, "step": 17890 }, { "epoch": 2.2759190942628162, "ewc_loss": 0.032817620784044266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28176211041864e-05, "grad_norm": 18.85539436340332, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.869113564491272, "num_tokens": 682758339.0, "step": 17891 }, { "epoch": 2.2760463045414068, "ewc_loss": 0.03272082656621933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.272082540206611e-05, "grad_norm": 18.923885345458984, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.8803387880325317, "num_tokens": 682800501.0, "step": 17892 }, { "epoch": 2.2761735148199973, "ewc_loss": 0.03277582675218582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.277582800365053e-05, "grad_norm": 18.77737045288086, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8775708675384521, "num_tokens": 682833231.0, "step": 17893 }, { "epoch": 2.276300725098588, "ewc_loss": 0.03273867815732956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.273867696407251e-05, "grad_norm": 18.936798095703125, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.8802855014801025, "num_tokens": 682873945.0, "step": 17894 }, { "epoch": 2.2764279353771784, "ewc_loss": 0.0328507125377655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.285071215941571e-05, "grad_norm": 18.814409255981445, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8732954263687134, "num_tokens": 682918329.0, "step": 17895 }, { "epoch": 2.276555145655769, "ewc_loss": 0.03270917013287544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.270916931796819e-05, "grad_norm": 18.910507202148438, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8805220127105713, "num_tokens": 682956277.0, "step": 17896 }, { "epoch": 2.2766823559343594, "ewc_loss": 0.03280402719974518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28040259773843e-05, "grad_norm": 18.833932876586914, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.878151535987854, "num_tokens": 682990648.0, "step": 17897 }, { "epoch": 2.27680956621295, "ewc_loss": 0.03266226872801781, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.266226849518716e-05, "grad_norm": 18.904254913330078, "learning_rate": 1e-06, "loss": 0.3458, "mean_token_accuracy": 0.889269232749939, "num_tokens": 683029046.0, "step": 17898 }, { "epoch": 2.2769367764915405, "ewc_loss": 0.032802097499370575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.280209784861654e-05, "grad_norm": 18.92940330505371, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8757191896438599, "num_tokens": 683064541.0, "step": 17899 }, { "epoch": 2.277063986770131, "ewc_loss": 0.03272825479507446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.272825415479019e-05, "grad_norm": 18.90601921081543, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.8755243420600891, "num_tokens": 683105015.0, "step": 17900 }, { "epoch": 2.2771911970487215, "ewc_loss": 0.03273415192961693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.273415131843649e-05, "grad_norm": 18.845731735229492, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8688743710517883, "num_tokens": 683142938.0, "step": 17901 }, { "epoch": 2.277318407327312, "ewc_loss": 0.032669954001903534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.266995554440655e-05, "grad_norm": 18.89128303527832, "learning_rate": 1e-06, "loss": 0.3524, "mean_token_accuracy": 0.8881704807281494, "num_tokens": 683181593.0, "step": 17902 }, { "epoch": 2.2774456176059026, "ewc_loss": 0.03272880241274834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2728803489590064e-05, "grad_norm": 18.812259674072266, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8645948767662048, "num_tokens": 683218641.0, "step": 17903 }, { "epoch": 2.277572827884493, "ewc_loss": 0.03270550072193146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.270550223533064e-05, "grad_norm": 18.854019165039062, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.859406590461731, "num_tokens": 683261479.0, "step": 17904 }, { "epoch": 2.2777000381630836, "ewc_loss": 0.03276599571108818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.276599454693496e-05, "grad_norm": 18.85270118713379, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8494540452957153, "num_tokens": 683300602.0, "step": 17905 }, { "epoch": 2.277827248441674, "ewc_loss": 0.03272005170583725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.27200505125802e-05, "grad_norm": 18.9171142578125, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8739396929740906, "num_tokens": 683337012.0, "step": 17906 }, { "epoch": 2.2779544587202647, "ewc_loss": 0.03279048576951027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2790485420264304e-05, "grad_norm": 18.86472511291504, "learning_rate": 1e-06, "loss": 0.3673, "mean_token_accuracy": 0.8818215727806091, "num_tokens": 683368949.0, "step": 17907 }, { "epoch": 2.2780816689988552, "ewc_loss": 0.03273525834083557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2735257263993844e-05, "grad_norm": 18.94771385192871, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8731461763381958, "num_tokens": 683408741.0, "step": 17908 }, { "epoch": 2.2782088792774458, "ewc_loss": 0.032735660672187805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.273566107964143e-05, "grad_norm": 18.88518524169922, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8647466897964478, "num_tokens": 683447843.0, "step": 17909 }, { "epoch": 2.2783360895560363, "ewc_loss": 0.03272143006324768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.272142930654809e-05, "grad_norm": 18.899089813232422, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8711903691291809, "num_tokens": 683489695.0, "step": 17910 }, { "epoch": 2.278463299834627, "ewc_loss": 0.0326944999396801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2694500987417996e-05, "grad_norm": 18.816875457763672, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8736017942428589, "num_tokens": 683528381.0, "step": 17911 }, { "epoch": 2.2785905101132173, "ewc_loss": 0.03283422812819481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2834228477440774e-05, "grad_norm": 18.938156127929688, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8762611150741577, "num_tokens": 683567261.0, "step": 17912 }, { "epoch": 2.278717720391808, "ewc_loss": 0.03268910199403763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.268910222686827e-05, "grad_norm": 18.767593383789062, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8617614507675171, "num_tokens": 683601083.0, "step": 17913 }, { "epoch": 2.278844930670398, "ewc_loss": 0.032813478261232376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.281347744632512e-05, "grad_norm": 18.97868537902832, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.868282675743103, "num_tokens": 683643494.0, "step": 17914 }, { "epoch": 2.278972140948989, "ewc_loss": 0.0327923409640789, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.279234078945592e-05, "grad_norm": 18.831581115722656, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8700219392776489, "num_tokens": 683684267.0, "step": 17915 }, { "epoch": 2.279099351227579, "ewc_loss": 0.03270520642399788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.270520755904727e-05, "grad_norm": 18.909513473510742, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8638340830802917, "num_tokens": 683723292.0, "step": 17916 }, { "epoch": 2.2792265615061695, "ewc_loss": 0.03289685398340225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289685264462605e-05, "grad_norm": 18.934232711791992, "learning_rate": 1e-06, "loss": 0.3551, "mean_token_accuracy": 0.885347843170166, "num_tokens": 683761378.0, "step": 17917 }, { "epoch": 2.27935377178476, "ewc_loss": 0.032749202102422714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.274920163676143e-05, "grad_norm": 18.887243270874023, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8673417568206787, "num_tokens": 683794838.0, "step": 17918 }, { "epoch": 2.2794809820633506, "ewc_loss": 0.032808803021907806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.280880264355801e-05, "grad_norm": 18.890710830688477, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8573384284973145, "num_tokens": 683832307.0, "step": 17919 }, { "epoch": 2.279608192341941, "ewc_loss": 0.032839588820934296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283958722022362e-05, "grad_norm": 18.871313095092773, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8683975338935852, "num_tokens": 683871848.0, "step": 17920 }, { "epoch": 2.2797354026205316, "ewc_loss": 0.03280176222324371, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.280176315456629e-05, "grad_norm": 18.84955596923828, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8777183294296265, "num_tokens": 683915086.0, "step": 17921 }, { "epoch": 2.279862612899122, "ewc_loss": 0.03275096043944359, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.275095878052525e-05, "grad_norm": 18.866670608520508, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8684258460998535, "num_tokens": 683945411.0, "step": 17922 }, { "epoch": 2.2799898231777127, "ewc_loss": 0.03279382735490799, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.279382872278802e-05, "grad_norm": 18.859468460083008, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8747783899307251, "num_tokens": 683979664.0, "step": 17923 }, { "epoch": 2.2801170334563032, "ewc_loss": 0.0328277163207531, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.282771649537608e-05, "grad_norm": 18.78231430053711, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.864686131477356, "num_tokens": 684025526.0, "step": 17924 }, { "epoch": 2.2802442437348938, "ewc_loss": 0.032886795699596405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.288679727120325e-05, "grad_norm": 18.876224517822266, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8692002892494202, "num_tokens": 684056992.0, "step": 17925 }, { "epoch": 2.2803714540134843, "ewc_loss": 0.03285616263747215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.285616185166873e-05, "grad_norm": 18.86461067199707, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8783331513404846, "num_tokens": 684094515.0, "step": 17926 }, { "epoch": 2.280498664292075, "ewc_loss": 0.032837871462106705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283787009422667e-05, "grad_norm": 18.758840560913086, "learning_rate": 1e-06, "loss": 0.3675, "mean_token_accuracy": 0.8813818693161011, "num_tokens": 684132294.0, "step": 17927 }, { "epoch": 2.2806258745706653, "ewc_loss": 0.03283866494894028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283866317360662e-05, "grad_norm": 18.819917678833008, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.8778542280197144, "num_tokens": 684164010.0, "step": 17928 }, { "epoch": 2.280753084849256, "ewc_loss": 0.032962679862976074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296267823316157e-05, "grad_norm": 18.819541931152344, "learning_rate": 1e-06, "loss": 0.3671, "mean_token_accuracy": 0.8817665576934814, "num_tokens": 684202077.0, "step": 17929 }, { "epoch": 2.2808802951278464, "ewc_loss": 0.03287116065621376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287116123829037e-05, "grad_norm": 18.814802169799805, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8478997349739075, "num_tokens": 684240799.0, "step": 17930 }, { "epoch": 2.281007505406437, "ewc_loss": 0.03293639421463013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293639383628033e-05, "grad_norm": 18.81538200378418, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8716139197349548, "num_tokens": 684284080.0, "step": 17931 }, { "epoch": 2.2811347156850275, "ewc_loss": 0.03295287862420082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2952877518255264e-05, "grad_norm": 18.88567543029785, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8589496612548828, "num_tokens": 684317915.0, "step": 17932 }, { "epoch": 2.281261925963618, "ewc_loss": 0.032906513661146164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290651511633769e-05, "grad_norm": 18.86562728881836, "learning_rate": 1e-06, "loss": 0.2791, "mean_token_accuracy": 0.910914421081543, "num_tokens": 684354665.0, "step": 17933 }, { "epoch": 2.2813891362422085, "ewc_loss": 0.03290754184126854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290754102636129e-05, "grad_norm": 18.791397094726562, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8724203109741211, "num_tokens": 684392866.0, "step": 17934 }, { "epoch": 2.281516346520799, "ewc_loss": 0.03291245549917221, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291245593572967e-05, "grad_norm": 18.917272567749023, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8783530592918396, "num_tokens": 684428422.0, "step": 17935 }, { "epoch": 2.2816435567993896, "ewc_loss": 0.03293123468756676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2931235182331875e-05, "grad_norm": 18.79239273071289, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8611963987350464, "num_tokens": 684467129.0, "step": 17936 }, { "epoch": 2.2817707670779797, "ewc_loss": 0.032820671796798706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.282067336840555e-05, "grad_norm": 18.8720703125, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8681763410568237, "num_tokens": 684500093.0, "step": 17937 }, { "epoch": 2.2818979773565706, "ewc_loss": 0.03296755626797676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296755676274188e-05, "grad_norm": 18.904359817504883, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8650518655776978, "num_tokens": 684541422.0, "step": 17938 }, { "epoch": 2.2820251876351607, "ewc_loss": 0.032937612384557724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293761255918071e-05, "grad_norm": 18.85401725769043, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8712132573127747, "num_tokens": 684579164.0, "step": 17939 }, { "epoch": 2.2821523979137512, "ewc_loss": 0.032905809581279755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290580934844911e-05, "grad_norm": 18.904909133911133, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8582974672317505, "num_tokens": 684618617.0, "step": 17940 }, { "epoch": 2.2822796081923418, "ewc_loss": 0.03292587399482727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.292587280157022e-05, "grad_norm": 18.872249603271484, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8607304096221924, "num_tokens": 684655525.0, "step": 17941 }, { "epoch": 2.2824068184709323, "ewc_loss": 0.03285577520728111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.285577622591518e-05, "grad_norm": 18.834091186523438, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8732020854949951, "num_tokens": 684694525.0, "step": 17942 }, { "epoch": 2.282534028749523, "ewc_loss": 0.03287791833281517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287791696493514e-05, "grad_norm": 18.852977752685547, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8746713399887085, "num_tokens": 684733247.0, "step": 17943 }, { "epoch": 2.2826612390281134, "ewc_loss": 0.03290146216750145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2901461963774636e-05, "grad_norm": 18.86533546447754, "learning_rate": 1e-06, "loss": 0.3489, "mean_token_accuracy": 0.8867702484130859, "num_tokens": 684763642.0, "step": 17944 }, { "epoch": 2.282788449306704, "ewc_loss": 0.03289904072880745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289903906988911e-05, "grad_norm": 18.865196228027344, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8807746171951294, "num_tokens": 684806164.0, "step": 17945 }, { "epoch": 2.2829156595852944, "ewc_loss": 0.03289955481886864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289955566287972e-05, "grad_norm": 18.797956466674805, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.872832179069519, "num_tokens": 684844193.0, "step": 17946 }, { "epoch": 2.283042869863885, "ewc_loss": 0.032872967422008514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287296567577869e-05, "grad_norm": 18.9515380859375, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8797110319137573, "num_tokens": 684882103.0, "step": 17947 }, { "epoch": 2.2831700801424755, "ewc_loss": 0.03285811468958855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.285811544628814e-05, "grad_norm": 18.77541732788086, "learning_rate": 1e-06, "loss": 0.3424, "mean_token_accuracy": 0.8891745805740356, "num_tokens": 684917667.0, "step": 17948 }, { "epoch": 2.283297290421066, "ewc_loss": 0.03281969949603081, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.281969839008525e-05, "grad_norm": 18.92488670349121, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.8781311511993408, "num_tokens": 684954007.0, "step": 17949 }, { "epoch": 2.2834245006996565, "ewc_loss": 0.032968442887067795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296844442957081e-05, "grad_norm": 18.846961975097656, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8700217604637146, "num_tokens": 684994693.0, "step": 17950 }, { "epoch": 2.283551710978247, "ewc_loss": 0.03273574635386467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2735744753153995e-05, "grad_norm": 18.807645797729492, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8621198534965515, "num_tokens": 685032713.0, "step": 17951 }, { "epoch": 2.2836789212568376, "ewc_loss": 0.03288218751549721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2882187952054664e-05, "grad_norm": 18.88441276550293, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8681497573852539, "num_tokens": 685066597.0, "step": 17952 }, { "epoch": 2.283806131535428, "ewc_loss": 0.0328284427523613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28284440911375e-05, "grad_norm": 18.78327751159668, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8691679239273071, "num_tokens": 685106243.0, "step": 17953 }, { "epoch": 2.2839333418140186, "ewc_loss": 0.03287123516201973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287123399786651e-05, "grad_norm": 18.893733978271484, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8660170435905457, "num_tokens": 685145177.0, "step": 17954 }, { "epoch": 2.284060552092609, "ewc_loss": 0.03284332901239395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.284333070041612e-05, "grad_norm": 18.77464485168457, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8807827234268188, "num_tokens": 685189130.0, "step": 17955 }, { "epoch": 2.2841877623711997, "ewc_loss": 0.03282178193330765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2821782951941714e-05, "grad_norm": 18.919618606567383, "learning_rate": 1e-06, "loss": 0.3771, "mean_token_accuracy": 0.8798022866249084, "num_tokens": 685227536.0, "step": 17956 }, { "epoch": 2.28431497264979, "ewc_loss": 0.03291986137628555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2919862860580906e-05, "grad_norm": 18.859312057495117, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8794049024581909, "num_tokens": 685267213.0, "step": 17957 }, { "epoch": 2.2844421829283807, "ewc_loss": 0.03282018005847931, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2820178603287786e-05, "grad_norm": 18.94122886657715, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8722831010818481, "num_tokens": 685301115.0, "step": 17958 }, { "epoch": 2.2845693932069713, "ewc_loss": 0.03286294639110565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2862946682143956e-05, "grad_norm": 18.928970336914062, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8795062899589539, "num_tokens": 685338497.0, "step": 17959 }, { "epoch": 2.284696603485562, "ewc_loss": 0.03284640610218048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.284640479250811e-05, "grad_norm": 18.913475036621094, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8635448813438416, "num_tokens": 685374983.0, "step": 17960 }, { "epoch": 2.2848238137641523, "ewc_loss": 0.032783810049295425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.278380972915329e-05, "grad_norm": 18.894981384277344, "learning_rate": 1e-06, "loss": 0.3691, "mean_token_accuracy": 0.8790425062179565, "num_tokens": 685408859.0, "step": 17961 }, { "epoch": 2.2849510240427424, "ewc_loss": 0.03283803537487984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283803380327299e-05, "grad_norm": 18.89851188659668, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8698297142982483, "num_tokens": 685445103.0, "step": 17962 }, { "epoch": 2.2850782343213334, "ewc_loss": 0.032833874225616455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2833875593496487e-05, "grad_norm": 18.86730194091797, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8695634603500366, "num_tokens": 685484657.0, "step": 17963 }, { "epoch": 2.2852054445999235, "ewc_loss": 0.03280562162399292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.280562305008061e-05, "grad_norm": 18.901582717895508, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.8819103837013245, "num_tokens": 685524034.0, "step": 17964 }, { "epoch": 2.285332654878514, "ewc_loss": 0.03284916281700134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2849162380443886e-05, "grad_norm": 18.934667587280273, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8797295093536377, "num_tokens": 685560368.0, "step": 17965 }, { "epoch": 2.2854598651571045, "ewc_loss": 0.03277264162898064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.277264113421552e-05, "grad_norm": 18.860353469848633, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8698814511299133, "num_tokens": 685598192.0, "step": 17966 }, { "epoch": 2.285587075435695, "ewc_loss": 0.03283204883337021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2832049328135327e-05, "grad_norm": 18.88384246826172, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8764103055000305, "num_tokens": 685636300.0, "step": 17967 }, { "epoch": 2.2857142857142856, "ewc_loss": 0.03277597203850746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2775973522802815e-05, "grad_norm": 18.910566329956055, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8715968132019043, "num_tokens": 685671457.0, "step": 17968 }, { "epoch": 2.285841495992876, "ewc_loss": 0.03282206133008003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.282206307630986e-05, "grad_norm": 18.893125534057617, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.863858163356781, "num_tokens": 685709687.0, "step": 17969 }, { "epoch": 2.2859687062714666, "ewc_loss": 0.03278721868991852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2787218515295535e-05, "grad_norm": 18.857938766479492, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.852461040019989, "num_tokens": 685747854.0, "step": 17970 }, { "epoch": 2.286095916550057, "ewc_loss": 0.03283850476145744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283850310253911e-05, "grad_norm": 18.926366806030273, "learning_rate": 1e-06, "loss": 0.3371, "mean_token_accuracy": 0.8932517766952515, "num_tokens": 685785215.0, "step": 17971 }, { "epoch": 2.2862231268286477, "ewc_loss": 0.03280586004257202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.280585951870307e-05, "grad_norm": 18.952184677124023, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.878081202507019, "num_tokens": 685818871.0, "step": 17972 }, { "epoch": 2.2863503371072382, "ewc_loss": 0.032835908234119415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283590922364965e-05, "grad_norm": 18.925764083862305, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8677462339401245, "num_tokens": 685859089.0, "step": 17973 }, { "epoch": 2.2864775473858288, "ewc_loss": 0.032737381756305695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2737381843617186e-05, "grad_norm": 18.806758880615234, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8650359511375427, "num_tokens": 685900346.0, "step": 17974 }, { "epoch": 2.2866047576644193, "ewc_loss": 0.032788679003715515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.278867734479718e-05, "grad_norm": 19.022024154663086, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8729248642921448, "num_tokens": 685938186.0, "step": 17975 }, { "epoch": 2.28673196794301, "ewc_loss": 0.03284892812371254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.284892954980023e-05, "grad_norm": 18.87318992614746, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8809179067611694, "num_tokens": 685974911.0, "step": 17976 }, { "epoch": 2.2868591782216003, "ewc_loss": 0.0327463299036026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.274633127148263e-05, "grad_norm": 18.897117614746094, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8757234215736389, "num_tokens": 686010430.0, "step": 17977 }, { "epoch": 2.286986388500191, "ewc_loss": 0.032845862209796906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2845862733665854e-05, "grad_norm": 18.93568229675293, "learning_rate": 1e-06, "loss": 0.3419, "mean_token_accuracy": 0.8880079984664917, "num_tokens": 686042727.0, "step": 17978 }, { "epoch": 2.2871135987787814, "ewc_loss": 0.03276296332478523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2762964110588655e-05, "grad_norm": 18.85726547241211, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8763860464096069, "num_tokens": 686078819.0, "step": 17979 }, { "epoch": 2.287240809057372, "ewc_loss": 0.03279552236199379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2795524020912126e-05, "grad_norm": 18.91274642944336, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.876849353313446, "num_tokens": 686117760.0, "step": 17980 }, { "epoch": 2.2873680193359625, "ewc_loss": 0.032821692526340485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2821692002471536e-05, "grad_norm": 18.845251083374023, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8641606569290161, "num_tokens": 686152992.0, "step": 17981 }, { "epoch": 2.287495229614553, "ewc_loss": 0.03282737359404564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2827374525368214e-05, "grad_norm": 18.946529388427734, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8689156770706177, "num_tokens": 686195353.0, "step": 17982 }, { "epoch": 2.2876224398931435, "ewc_loss": 0.03283260762691498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283260593889281e-05, "grad_norm": 18.876174926757812, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8703091144561768, "num_tokens": 686232990.0, "step": 17983 }, { "epoch": 2.287749650171734, "ewc_loss": 0.03279825672507286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.279825614299625e-05, "grad_norm": 18.95001792907715, "learning_rate": 1e-06, "loss": 0.3767, "mean_token_accuracy": 0.8784738183021545, "num_tokens": 686265933.0, "step": 17984 }, { "epoch": 2.2878768604503246, "ewc_loss": 0.03282685950398445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.282685793237761e-05, "grad_norm": 18.848648071289062, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.8796494007110596, "num_tokens": 686307111.0, "step": 17985 }, { "epoch": 2.288004070728915, "ewc_loss": 0.03277510404586792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.277510404586792e-05, "grad_norm": 18.93543815612793, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.875869631767273, "num_tokens": 686343363.0, "step": 17986 }, { "epoch": 2.288131281007505, "ewc_loss": 0.032834604382514954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2834603189257905e-05, "grad_norm": 18.855924606323242, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8733452558517456, "num_tokens": 686380888.0, "step": 17987 }, { "epoch": 2.288258491286096, "ewc_loss": 0.03281339630484581, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.281339741079137e-05, "grad_norm": 18.923555374145508, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8651735186576843, "num_tokens": 686417350.0, "step": 17988 }, { "epoch": 2.2883857015646862, "ewc_loss": 0.032820701599121094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.282070247223601e-05, "grad_norm": 18.869787216186523, "learning_rate": 1e-06, "loss": 0.3647, "mean_token_accuracy": 0.8822216987609863, "num_tokens": 686458658.0, "step": 17989 }, { "epoch": 2.2885129118432768, "ewc_loss": 0.03277277573943138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.277277573943138e-05, "grad_norm": 18.911006927490234, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8768006563186646, "num_tokens": 686493945.0, "step": 17990 }, { "epoch": 2.2886401221218673, "ewc_loss": 0.03286276385188103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28627647832036e-05, "grad_norm": 18.95295524597168, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8742153644561768, "num_tokens": 686527608.0, "step": 17991 }, { "epoch": 2.288767332400458, "ewc_loss": 0.03285494074225426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.285493949078955e-05, "grad_norm": 18.911609649658203, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.8834125995635986, "num_tokens": 686562272.0, "step": 17992 }, { "epoch": 2.2888945426790483, "ewc_loss": 0.0327889658510685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.278896474512294e-05, "grad_norm": 18.9051570892334, "learning_rate": 1e-06, "loss": 0.3626, "mean_token_accuracy": 0.8850417733192444, "num_tokens": 686599937.0, "step": 17993 }, { "epoch": 2.289021752957639, "ewc_loss": 0.03287125378847122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287125218776055e-05, "grad_norm": 18.93605613708496, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8745372295379639, "num_tokens": 686632103.0, "step": 17994 }, { "epoch": 2.2891489632362294, "ewc_loss": 0.03272611275315285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2726111385272816e-05, "grad_norm": 18.854089736938477, "learning_rate": 1e-06, "loss": 0.3714, "mean_token_accuracy": 0.8803186416625977, "num_tokens": 686662802.0, "step": 17995 }, { "epoch": 2.28927617351482, "ewc_loss": 0.03278542682528496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.278542499174364e-05, "grad_norm": 18.848405838012695, "learning_rate": 1e-06, "loss": 0.3317, "mean_token_accuracy": 0.8950012922286987, "num_tokens": 686699386.0, "step": 17996 }, { "epoch": 2.2894033837934105, "ewc_loss": 0.032801054418087006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2801053748698905e-05, "grad_norm": 18.87185287475586, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8772698044776917, "num_tokens": 686738369.0, "step": 17997 }, { "epoch": 2.289530594072001, "ewc_loss": 0.032818105071783066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2818104955367744e-05, "grad_norm": 18.872446060180664, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8679389953613281, "num_tokens": 686774543.0, "step": 17998 }, { "epoch": 2.2896578043505915, "ewc_loss": 0.03285853564739227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.285853745182976e-05, "grad_norm": 19.018957138061523, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8669727444648743, "num_tokens": 686807048.0, "step": 17999 }, { "epoch": 2.289785014629182, "ewc_loss": 0.03283188119530678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28318819811102e-05, "grad_norm": 18.858089447021484, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8758763670921326, "num_tokens": 686844678.0, "step": 18000 }, { "epoch": 2.2899122249077726, "ewc_loss": 0.032749176025390625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.274917617090978e-05, "grad_norm": 18.854633331298828, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8739004731178284, "num_tokens": 686877721.0, "step": 18001 }, { "epoch": 2.290039435186363, "ewc_loss": 0.03288279473781586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.288279549451545e-05, "grad_norm": 18.941375732421875, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8674947023391724, "num_tokens": 686918329.0, "step": 18002 }, { "epoch": 2.2901666454649536, "ewc_loss": 0.03280419483780861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2804193324409425e-05, "grad_norm": 18.92460060119629, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8700100183486938, "num_tokens": 686960050.0, "step": 18003 }, { "epoch": 2.290293855743544, "ewc_loss": 0.03278724476695061, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2787243981147185e-05, "grad_norm": 18.8170108795166, "learning_rate": 1e-06, "loss": 0.3522, "mean_token_accuracy": 0.884306013584137, "num_tokens": 686997849.0, "step": 18004 }, { "epoch": 2.2904210660221347, "ewc_loss": 0.032729946076869965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.272994581493549e-05, "grad_norm": 18.91651153564453, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8780345916748047, "num_tokens": 687032363.0, "step": 18005 }, { "epoch": 2.290548276300725, "ewc_loss": 0.03292766958475113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2927669963100925e-05, "grad_norm": 18.977643966674805, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8583707809448242, "num_tokens": 687069606.0, "step": 18006 }, { "epoch": 2.2906754865793157, "ewc_loss": 0.03277174383401871, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.277174255345017e-05, "grad_norm": 18.86198616027832, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8653964996337891, "num_tokens": 687112857.0, "step": 18007 }, { "epoch": 2.2908026968579063, "ewc_loss": 0.032808538526296616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.280853707110509e-05, "grad_norm": 18.91456413269043, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8721333742141724, "num_tokens": 687152245.0, "step": 18008 }, { "epoch": 2.290929907136497, "ewc_loss": 0.03279968351125717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.279968223068863e-05, "grad_norm": 18.884506225585938, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.873787522315979, "num_tokens": 687193988.0, "step": 18009 }, { "epoch": 2.2910571174150873, "ewc_loss": 0.03276705741882324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.276705683674663e-05, "grad_norm": 18.88027000427246, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8692158460617065, "num_tokens": 687232375.0, "step": 18010 }, { "epoch": 2.291184327693678, "ewc_loss": 0.03278788924217224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.278788790339604e-05, "grad_norm": 18.860328674316406, "learning_rate": 1e-06, "loss": 0.3753, "mean_token_accuracy": 0.8780907988548279, "num_tokens": 687274566.0, "step": 18011 }, { "epoch": 2.291311537972268, "ewc_loss": 0.03281960263848305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2819603802636266e-05, "grad_norm": 18.881765365600586, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8732225298881531, "num_tokens": 687313231.0, "step": 18012 }, { "epoch": 2.291438748250859, "ewc_loss": 0.032832641154527664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283264231868088e-05, "grad_norm": 18.927602767944336, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8795392513275146, "num_tokens": 687354502.0, "step": 18013 }, { "epoch": 2.291565958529449, "ewc_loss": 0.032812196761369705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.281219687778503e-05, "grad_norm": 18.861814498901367, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8707294464111328, "num_tokens": 687399160.0, "step": 18014 }, { "epoch": 2.2916931688080395, "ewc_loss": 0.03275563195347786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2755633583292365e-05, "grad_norm": 18.93752098083496, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8640679717063904, "num_tokens": 687438775.0, "step": 18015 }, { "epoch": 2.29182037908663, "ewc_loss": 0.03281038999557495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28103888023179e-05, "grad_norm": 18.931303024291992, "learning_rate": 1e-06, "loss": 0.3776, "mean_token_accuracy": 0.877469539642334, "num_tokens": 687471435.0, "step": 18016 }, { "epoch": 2.2919475893652206, "ewc_loss": 0.03274887055158615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.274887058068998e-05, "grad_norm": 18.928667068481445, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8762545585632324, "num_tokens": 687508657.0, "step": 18017 }, { "epoch": 2.292074799643811, "ewc_loss": 0.03274719417095184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.274719347245991e-05, "grad_norm": 18.872831344604492, "learning_rate": 1e-06, "loss": 0.3589, "mean_token_accuracy": 0.8838586211204529, "num_tokens": 687545024.0, "step": 18018 }, { "epoch": 2.2922020099224016, "ewc_loss": 0.03274482861161232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.27448287862353e-05, "grad_norm": 18.94317626953125, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8787118196487427, "num_tokens": 687579432.0, "step": 18019 }, { "epoch": 2.292329220200992, "ewc_loss": 0.03277117386460304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.277117502875626e-05, "grad_norm": 18.910133361816406, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8646817803382874, "num_tokens": 687618378.0, "step": 18020 }, { "epoch": 2.2924564304795827, "ewc_loss": 0.03273828700184822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.273828770034015e-05, "grad_norm": 18.804452896118164, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8731335997581482, "num_tokens": 687657119.0, "step": 18021 }, { "epoch": 2.2925836407581732, "ewc_loss": 0.03270864859223366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2708649086998776e-05, "grad_norm": 18.79476547241211, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8638020753860474, "num_tokens": 687691924.0, "step": 18022 }, { "epoch": 2.2927108510367638, "ewc_loss": 0.032831333577632904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283133264631033e-05, "grad_norm": 18.897796630859375, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.851291298866272, "num_tokens": 687724221.0, "step": 18023 }, { "epoch": 2.2928380613153543, "ewc_loss": 0.032899390906095505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28993919538334e-05, "grad_norm": 18.855798721313477, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8763028383255005, "num_tokens": 687765049.0, "step": 18024 }, { "epoch": 2.292965271593945, "ewc_loss": 0.03280112147331238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.280112287029624e-05, "grad_norm": 18.84130859375, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8648216724395752, "num_tokens": 687810222.0, "step": 18025 }, { "epoch": 2.2930924818725353, "ewc_loss": 0.03282094746828079, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.282094621681608e-05, "grad_norm": 18.850875854492188, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8691058158874512, "num_tokens": 687851961.0, "step": 18026 }, { "epoch": 2.293219692151126, "ewc_loss": 0.03276817500591278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.276817369624041e-05, "grad_norm": 18.82361602783203, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8624139428138733, "num_tokens": 687890448.0, "step": 18027 }, { "epoch": 2.2933469024297164, "ewc_loss": 0.03285330533981323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.285330603830516e-05, "grad_norm": 18.909700393676758, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.882980227470398, "num_tokens": 687929509.0, "step": 18028 }, { "epoch": 2.293474112708307, "ewc_loss": 0.032831959426403046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283195837866515e-05, "grad_norm": 18.849422454833984, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8637166023254395, "num_tokens": 687966814.0, "step": 18029 }, { "epoch": 2.2936013229868975, "ewc_loss": 0.0327707938849926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2770793040981516e-05, "grad_norm": 18.916690826416016, "learning_rate": 1e-06, "loss": 0.3691, "mean_token_accuracy": 0.881028413772583, "num_tokens": 688006398.0, "step": 18030 }, { "epoch": 2.293728533265488, "ewc_loss": 0.032873209565877914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287320942035876e-05, "grad_norm": 18.866798400878906, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8722820281982422, "num_tokens": 688044604.0, "step": 18031 }, { "epoch": 2.2938557435440785, "ewc_loss": 0.032788943499326706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.27889429172501e-05, "grad_norm": 18.97681427001953, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8691644668579102, "num_tokens": 688082969.0, "step": 18032 }, { "epoch": 2.293982953822669, "ewc_loss": 0.03285873681306839, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.285873754066415e-05, "grad_norm": 18.83652114868164, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8806859254837036, "num_tokens": 688121442.0, "step": 18033 }, { "epoch": 2.2941101641012596, "ewc_loss": 0.03276415914297104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.276415736763738e-05, "grad_norm": 18.949054718017578, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8745665550231934, "num_tokens": 688161096.0, "step": 18034 }, { "epoch": 2.2942373743798496, "ewc_loss": 0.0328691266477108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2869127608137205e-05, "grad_norm": 18.90473747253418, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8737825155258179, "num_tokens": 688200782.0, "step": 18035 }, { "epoch": 2.2943645846584406, "ewc_loss": 0.0327286496758461, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2728650694480166e-05, "grad_norm": 18.867677688598633, "learning_rate": 1e-06, "loss": 0.3723, "mean_token_accuracy": 0.8807006478309631, "num_tokens": 688241333.0, "step": 18036 }, { "epoch": 2.2944917949370307, "ewc_loss": 0.032810695469379425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28106943925377e-05, "grad_norm": 18.998035430908203, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8716939687728882, "num_tokens": 688278297.0, "step": 18037 }, { "epoch": 2.2946190052156212, "ewc_loss": 0.03282462805509567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.282462785136886e-05, "grad_norm": 18.8692684173584, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8498574495315552, "num_tokens": 688318835.0, "step": 18038 }, { "epoch": 2.2947462154942118, "ewc_loss": 0.032714858651161194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.271485911682248e-05, "grad_norm": 18.924175262451172, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8765048980712891, "num_tokens": 688354814.0, "step": 18039 }, { "epoch": 2.2948734257728023, "ewc_loss": 0.03276616334915161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.276616189396009e-05, "grad_norm": 18.892114639282227, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8756310939788818, "num_tokens": 688390008.0, "step": 18040 }, { "epoch": 2.295000636051393, "ewc_loss": 0.03274949640035629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.27494963130448e-05, "grad_norm": 18.9078369140625, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8759651780128479, "num_tokens": 688430938.0, "step": 18041 }, { "epoch": 2.2951278463299833, "ewc_loss": 0.03275301679968834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2753017876530066e-05, "grad_norm": 18.900793075561523, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8433520793914795, "num_tokens": 688466730.0, "step": 18042 }, { "epoch": 2.295255056608574, "ewc_loss": 0.0327724814414978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.277248106314801e-05, "grad_norm": 18.94727897644043, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8723925352096558, "num_tokens": 688506666.0, "step": 18043 }, { "epoch": 2.2953822668871644, "ewc_loss": 0.03275426849722862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2754269341239706e-05, "grad_norm": 18.841949462890625, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8596205711364746, "num_tokens": 688547726.0, "step": 18044 }, { "epoch": 2.295509477165755, "ewc_loss": 0.032768577337265015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2768577511888e-05, "grad_norm": 18.90773582458496, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8697758913040161, "num_tokens": 688587487.0, "step": 18045 }, { "epoch": 2.2956366874443455, "ewc_loss": 0.032806459814310074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2806459785206243e-05, "grad_norm": 18.86682891845703, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.866904616355896, "num_tokens": 688632454.0, "step": 18046 }, { "epoch": 2.295763897722936, "ewc_loss": 0.032676491886377335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.267649299232289e-05, "grad_norm": 18.81804847717285, "learning_rate": 1e-06, "loss": 0.3731, "mean_token_accuracy": 0.8805074095726013, "num_tokens": 688668489.0, "step": 18047 }, { "epoch": 2.2958911080015265, "ewc_loss": 0.03281131014227867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28113092109561e-05, "grad_norm": 18.933813095092773, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8768908977508545, "num_tokens": 688707899.0, "step": 18048 }, { "epoch": 2.296018318280117, "ewc_loss": 0.03281587362289429, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2815874874359e-05, "grad_norm": 18.889162063598633, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8786677122116089, "num_tokens": 688747171.0, "step": 18049 }, { "epoch": 2.2961455285587076, "ewc_loss": 0.032728683203458786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.272868343628943e-05, "grad_norm": 18.83564567565918, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8714056015014648, "num_tokens": 688789553.0, "step": 18050 }, { "epoch": 2.296272738837298, "ewc_loss": 0.03284343332052231, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2843432563822716e-05, "grad_norm": 18.910594940185547, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8787870407104492, "num_tokens": 688827632.0, "step": 18051 }, { "epoch": 2.2963999491158886, "ewc_loss": 0.03280418738722801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.280418604845181e-05, "grad_norm": 18.930330276489258, "learning_rate": 1e-06, "loss": 0.3672, "mean_token_accuracy": 0.8816936016082764, "num_tokens": 688861505.0, "step": 18052 }, { "epoch": 2.296527159394479, "ewc_loss": 0.03274845331907272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2748452213127166e-05, "grad_norm": 18.967870712280273, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8671039938926697, "num_tokens": 688898270.0, "step": 18053 }, { "epoch": 2.2966543696730697, "ewc_loss": 0.03277081996202469, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2770818506833166e-05, "grad_norm": 18.856943130493164, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8747800588607788, "num_tokens": 688937420.0, "step": 18054 }, { "epoch": 2.29678157995166, "ewc_loss": 0.03267974033951759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2679741707397625e-05, "grad_norm": 18.879451751708984, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8796908855438232, "num_tokens": 688972426.0, "step": 18055 }, { "epoch": 2.2969087902302507, "ewc_loss": 0.032772187143564224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.277218638686463e-05, "grad_norm": 18.90465545654297, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8740095496177673, "num_tokens": 689005801.0, "step": 18056 }, { "epoch": 2.2970360005088413, "ewc_loss": 0.03277203440666199, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2772033591754735e-05, "grad_norm": 18.91588592529297, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8696655035018921, "num_tokens": 689039999.0, "step": 18057 }, { "epoch": 2.297163210787432, "ewc_loss": 0.03280268982052803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2802690839162096e-05, "grad_norm": 18.926273345947266, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8582425713539124, "num_tokens": 689079404.0, "step": 18058 }, { "epoch": 2.2972904210660223, "ewc_loss": 0.032747600227594376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2747600926086307e-05, "grad_norm": 18.848604202270508, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.852575421333313, "num_tokens": 689116197.0, "step": 18059 }, { "epoch": 2.2974176313446124, "ewc_loss": 0.03281169384717941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.281169483670965e-05, "grad_norm": 18.860193252563477, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.874466061592102, "num_tokens": 689153446.0, "step": 18060 }, { "epoch": 2.2975448416232034, "ewc_loss": 0.032846201211214066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.284620106569491e-05, "grad_norm": 18.943960189819336, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8571193218231201, "num_tokens": 689185310.0, "step": 18061 }, { "epoch": 2.2976720519017935, "ewc_loss": 0.032890599220991135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289059895905666e-05, "grad_norm": 18.87894630432129, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.8847773671150208, "num_tokens": 689224150.0, "step": 18062 }, { "epoch": 2.297799262180384, "ewc_loss": 0.03281304985284805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2813048164825886e-05, "grad_norm": 18.881481170654297, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8626741170883179, "num_tokens": 689261206.0, "step": 18063 }, { "epoch": 2.2979264724589745, "ewc_loss": 0.03289390727877617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289390588179231e-05, "grad_norm": 18.923410415649414, "learning_rate": 1e-06, "loss": 0.378, "mean_token_accuracy": 0.879173994064331, "num_tokens": 689299129.0, "step": 18064 }, { "epoch": 2.298053682737565, "ewc_loss": 0.03289181366562843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289181404397823e-05, "grad_norm": 18.892723083496094, "learning_rate": 1e-06, "loss": 0.3451, "mean_token_accuracy": 0.8883419036865234, "num_tokens": 689330554.0, "step": 18065 }, { "epoch": 2.2981808930161556, "ewc_loss": 0.0328918918967247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289189044153318e-05, "grad_norm": 18.845884323120117, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8805667161941528, "num_tokens": 689364084.0, "step": 18066 }, { "epoch": 2.298308103294746, "ewc_loss": 0.032956063747406006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2956064387690276e-05, "grad_norm": 18.90409278869629, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8617007732391357, "num_tokens": 689405254.0, "step": 18067 }, { "epoch": 2.2984353135733366, "ewc_loss": 0.03306160494685173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3061605790862814e-05, "grad_norm": 19.036123275756836, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8644479513168335, "num_tokens": 689441109.0, "step": 18068 }, { "epoch": 2.298562523851927, "ewc_loss": 0.03295213356614113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295213173259981e-05, "grad_norm": 18.83544921875, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8746277689933777, "num_tokens": 689474937.0, "step": 18069 }, { "epoch": 2.2986897341305177, "ewc_loss": 0.03295673057436943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2956730137811974e-05, "grad_norm": 18.954782485961914, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8689275979995728, "num_tokens": 689515971.0, "step": 18070 }, { "epoch": 2.298816944409108, "ewc_loss": 0.033056147396564484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3056148822652176e-05, "grad_norm": 18.89876365661621, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8519524335861206, "num_tokens": 689553620.0, "step": 18071 }, { "epoch": 2.2989441546876987, "ewc_loss": 0.032988835126161575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298883530078456e-05, "grad_norm": 18.962265014648438, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8780063986778259, "num_tokens": 689592157.0, "step": 18072 }, { "epoch": 2.2990713649662893, "ewc_loss": 0.03296000137925148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296000068075955e-05, "grad_norm": 18.882753372192383, "learning_rate": 1e-06, "loss": 0.3466, "mean_token_accuracy": 0.8863062858581543, "num_tokens": 689626889.0, "step": 18073 }, { "epoch": 2.29919857524488, "ewc_loss": 0.032927580177783966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.292757901363075e-05, "grad_norm": 18.87861442565918, "learning_rate": 1e-06, "loss": 0.3635, "mean_token_accuracy": 0.8856408596038818, "num_tokens": 689666161.0, "step": 18074 }, { "epoch": 2.2993257855234703, "ewc_loss": 0.0329706035554409, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.297060538898222e-05, "grad_norm": 18.96103286743164, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8651391863822937, "num_tokens": 689713713.0, "step": 18075 }, { "epoch": 2.299452995802061, "ewc_loss": 0.032923027873039246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2923027902143076e-05, "grad_norm": 18.87015151977539, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8657159209251404, "num_tokens": 689755146.0, "step": 18076 }, { "epoch": 2.2995802060806514, "ewc_loss": 0.03282879292964935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.282879333710298e-05, "grad_norm": 18.846586227416992, "learning_rate": 1e-06, "loss": 0.3663, "mean_token_accuracy": 0.8823452591896057, "num_tokens": 689795925.0, "step": 18077 }, { "epoch": 2.299707416359242, "ewc_loss": 0.03298630192875862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298630326753482e-05, "grad_norm": 18.95187759399414, "learning_rate": 1e-06, "loss": 0.3645, "mean_token_accuracy": 0.8846901655197144, "num_tokens": 689830673.0, "step": 18078 }, { "epoch": 2.2998346266378324, "ewc_loss": 0.032956451177597046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295645001344383e-05, "grad_norm": 18.845964431762695, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.873591423034668, "num_tokens": 689863649.0, "step": 18079 }, { "epoch": 2.299961836916423, "ewc_loss": 0.03295663371682167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295663191238418e-05, "grad_norm": 18.978940963745117, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8733643293380737, "num_tokens": 689901946.0, "step": 18080 }, { "epoch": 2.3000890471950135, "ewc_loss": 0.032968830317258835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296883005532436e-05, "grad_norm": 18.85400390625, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8674090504646301, "num_tokens": 689939968.0, "step": 18081 }, { "epoch": 2.300216257473604, "ewc_loss": 0.032881151884794235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.288115112809464e-05, "grad_norm": 18.99040412902832, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.881763219833374, "num_tokens": 689979792.0, "step": 18082 }, { "epoch": 2.3003434677521946, "ewc_loss": 0.032998617738485336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.299861782579683e-05, "grad_norm": 18.88968849182129, "learning_rate": 1e-06, "loss": 0.3516, "mean_token_accuracy": 0.8858972191810608, "num_tokens": 690013974.0, "step": 18083 }, { "epoch": 2.300470678030785, "ewc_loss": 0.03282184153795242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.282184115960263e-05, "grad_norm": 18.86311149597168, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.8816264867782593, "num_tokens": 690056540.0, "step": 18084 }, { "epoch": 2.300597888309375, "ewc_loss": 0.03293003514409065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2930034649325535e-05, "grad_norm": 18.87631607055664, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8675068616867065, "num_tokens": 690101032.0, "step": 18085 }, { "epoch": 2.300725098587966, "ewc_loss": 0.032886479049921036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.288648076704703e-05, "grad_norm": 18.82648277282715, "learning_rate": 1e-06, "loss": 0.3553, "mean_token_accuracy": 0.8860565423965454, "num_tokens": 690135417.0, "step": 18086 }, { "epoch": 2.3008523088665562, "ewc_loss": 0.032984063029289246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2984062272589654e-05, "grad_norm": 18.9597225189209, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8751751780509949, "num_tokens": 690178552.0, "step": 18087 }, { "epoch": 2.3009795191451468, "ewc_loss": 0.03292369470000267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2923693652264774e-05, "grad_norm": 18.81222915649414, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8707414865493774, "num_tokens": 690221928.0, "step": 18088 }, { "epoch": 2.3011067294237373, "ewc_loss": 0.032896656543016434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289665619377047e-05, "grad_norm": 18.953792572021484, "learning_rate": 1e-06, "loss": 0.3774, "mean_token_accuracy": 0.8797332644462585, "num_tokens": 690257040.0, "step": 18089 }, { "epoch": 2.301233939702328, "ewc_loss": 0.03290550783276558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290550739620812e-05, "grad_norm": 18.91364288330078, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8740401268005371, "num_tokens": 690302720.0, "step": 18090 }, { "epoch": 2.3013611499809183, "ewc_loss": 0.03285183757543564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28518362948671e-05, "grad_norm": 18.886810302734375, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8687738180160522, "num_tokens": 690337042.0, "step": 18091 }, { "epoch": 2.301488360259509, "ewc_loss": 0.03289495408535004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289495361968875e-05, "grad_norm": 18.919626235961914, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8798996210098267, "num_tokens": 690380247.0, "step": 18092 }, { "epoch": 2.3016155705380994, "ewc_loss": 0.03289953991770744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289954111096449e-05, "grad_norm": 18.86414909362793, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8784863948822021, "num_tokens": 690425361.0, "step": 18093 }, { "epoch": 2.30174278081669, "ewc_loss": 0.03283907100558281, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2839070627233014e-05, "grad_norm": 18.984712600708008, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8678239583969116, "num_tokens": 690465137.0, "step": 18094 }, { "epoch": 2.3018699910952805, "ewc_loss": 0.032848600298166275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2848602131707594e-05, "grad_norm": 18.890230178833008, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8719652891159058, "num_tokens": 690501437.0, "step": 18095 }, { "epoch": 2.301997201373871, "ewc_loss": 0.03281175345182419, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.281175304437056e-05, "grad_norm": 18.917203903198242, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8685267567634583, "num_tokens": 690544134.0, "step": 18096 }, { "epoch": 2.3021244116524615, "ewc_loss": 0.03282766789197922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.282766920165159e-05, "grad_norm": 18.862092971801758, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.8826251029968262, "num_tokens": 690581062.0, "step": 18097 }, { "epoch": 2.302251621931052, "ewc_loss": 0.032850731164216995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.285073034930974e-05, "grad_norm": 19.017414093017578, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8777645826339722, "num_tokens": 690615145.0, "step": 18098 }, { "epoch": 2.3023788322096426, "ewc_loss": 0.03284691274166107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28469141095411e-05, "grad_norm": 18.862192153930664, "learning_rate": 1e-06, "loss": 0.3714, "mean_token_accuracy": 0.8798876404762268, "num_tokens": 690655483.0, "step": 18099 }, { "epoch": 2.302506042488233, "ewc_loss": 0.032719116657972336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.271911555202678e-05, "grad_norm": 18.978302001953125, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8709486722946167, "num_tokens": 690694133.0, "step": 18100 }, { "epoch": 2.3026332527668236, "ewc_loss": 0.03280020132660866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2800202461658046e-05, "grad_norm": 18.901174545288086, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8522982597351074, "num_tokens": 690729939.0, "step": 18101 }, { "epoch": 2.302760463045414, "ewc_loss": 0.03271683678030968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.271683817729354e-05, "grad_norm": 18.84370231628418, "learning_rate": 1e-06, "loss": 0.3771, "mean_token_accuracy": 0.8798881769180298, "num_tokens": 690770139.0, "step": 18102 }, { "epoch": 2.3028876733240047, "ewc_loss": 0.0327945277094841, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.279452721471898e-05, "grad_norm": 18.875211715698242, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8752965331077576, "num_tokens": 690811379.0, "step": 18103 }, { "epoch": 2.303014883602595, "ewc_loss": 0.03286284580826759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.286284481873736e-05, "grad_norm": 19.06772232055664, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8696866035461426, "num_tokens": 690844194.0, "step": 18104 }, { "epoch": 2.3031420938811857, "ewc_loss": 0.032879605889320374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287960498710163e-05, "grad_norm": 18.88968849182129, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8672587275505066, "num_tokens": 690882129.0, "step": 18105 }, { "epoch": 2.3032693041597763, "ewc_loss": 0.03274693712592125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2746938813943416e-05, "grad_norm": 18.956361770629883, "learning_rate": 1e-06, "loss": 0.3529, "mean_token_accuracy": 0.8877627849578857, "num_tokens": 690915568.0, "step": 18106 }, { "epoch": 2.303396514438367, "ewc_loss": 0.03283658251166344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283658224972896e-05, "grad_norm": 18.83733558654785, "learning_rate": 1e-06, "loss": 0.3995, "mean_token_accuracy": 0.8714262843132019, "num_tokens": 690953412.0, "step": 18107 }, { "epoch": 2.3035237247169573, "ewc_loss": 0.03284594416618347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.284594276919961e-05, "grad_norm": 19.013259887695312, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8708967566490173, "num_tokens": 690990047.0, "step": 18108 }, { "epoch": 2.303650934995548, "ewc_loss": 0.03291505202651024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291505345259793e-05, "grad_norm": 18.900434494018555, "learning_rate": 1e-06, "loss": 0.38, "mean_token_accuracy": 0.8788369297981262, "num_tokens": 691031010.0, "step": 18109 }, { "epoch": 2.303778145274138, "ewc_loss": 0.03277675434947014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.277675568824634e-05, "grad_norm": 18.94597625732422, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8756681084632874, "num_tokens": 691065830.0, "step": 18110 }, { "epoch": 2.303905355552729, "ewc_loss": 0.03287690132856369, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287690196884796e-05, "grad_norm": 18.862171173095703, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.8848797678947449, "num_tokens": 691101418.0, "step": 18111 }, { "epoch": 2.304032565831319, "ewc_loss": 0.03281654790043831, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.281654790043831e-05, "grad_norm": 18.925783157348633, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.85954749584198, "num_tokens": 691139447.0, "step": 18112 }, { "epoch": 2.3041597761099095, "ewc_loss": 0.03291360288858414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29136018990539e-05, "grad_norm": 18.925247192382812, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8552953004837036, "num_tokens": 691168809.0, "step": 18113 }, { "epoch": 2.3042869863885, "ewc_loss": 0.032855305820703506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2855306926649064e-05, "grad_norm": 18.889951705932617, "learning_rate": 1e-06, "loss": 0.3753, "mean_token_accuracy": 0.879996657371521, "num_tokens": 691200420.0, "step": 18114 }, { "epoch": 2.3044141966670906, "ewc_loss": 0.032905057072639465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290505628683604e-05, "grad_norm": 18.971899032592773, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.868394136428833, "num_tokens": 691238587.0, "step": 18115 }, { "epoch": 2.304541406945681, "ewc_loss": 0.032879289239645004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287928848294541e-05, "grad_norm": 18.944091796875, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8718733787536621, "num_tokens": 691277137.0, "step": 18116 }, { "epoch": 2.3046686172242716, "ewc_loss": 0.03290335088968277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290335007477552e-05, "grad_norm": 18.915773391723633, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8725066184997559, "num_tokens": 691319564.0, "step": 18117 }, { "epoch": 2.304795827502862, "ewc_loss": 0.03288724645972252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.288724474259652e-05, "grad_norm": 18.858749389648438, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8720778226852417, "num_tokens": 691359652.0, "step": 18118 }, { "epoch": 2.3049230377814527, "ewc_loss": 0.03291236609220505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291236498625949e-05, "grad_norm": 18.91103744506836, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8730171918869019, "num_tokens": 691399705.0, "step": 18119 }, { "epoch": 2.305050248060043, "ewc_loss": 0.03291552513837814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291552638984285e-05, "grad_norm": 18.833148956298828, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8671458959579468, "num_tokens": 691439447.0, "step": 18120 }, { "epoch": 2.3051774583386337, "ewc_loss": 0.03294937685132027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.294937778264284e-05, "grad_norm": 18.897741317749023, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8621388673782349, "num_tokens": 691477453.0, "step": 18121 }, { "epoch": 2.3053046686172243, "ewc_loss": 0.03302089124917984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302088953205384e-05, "grad_norm": 18.924753189086914, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.87328040599823, "num_tokens": 691517766.0, "step": 18122 }, { "epoch": 2.305431878895815, "ewc_loss": 0.03291749954223633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29174981743563e-05, "grad_norm": 18.849035263061523, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8731666207313538, "num_tokens": 691553956.0, "step": 18123 }, { "epoch": 2.3055590891744053, "ewc_loss": 0.032926566898822784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2926567655522376e-05, "grad_norm": 18.917543411254883, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8715203404426575, "num_tokens": 691592522.0, "step": 18124 }, { "epoch": 2.305686299452996, "ewc_loss": 0.03301252797245979, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3012529456755146e-05, "grad_norm": 18.876447677612305, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8722245693206787, "num_tokens": 691629202.0, "step": 18125 }, { "epoch": 2.3058135097315864, "ewc_loss": 0.03297599032521248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.297598959761672e-05, "grad_norm": 18.94243049621582, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.876488983631134, "num_tokens": 691665324.0, "step": 18126 }, { "epoch": 2.305940720010177, "ewc_loss": 0.033022090792655945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3022090065060183e-05, "grad_norm": 18.85914421081543, "learning_rate": 1e-06, "loss": 0.3624, "mean_token_accuracy": 0.8834065198898315, "num_tokens": 691704223.0, "step": 18127 }, { "epoch": 2.3060679302887674, "ewc_loss": 0.0329144261777401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291442772024311e-05, "grad_norm": 18.888994216918945, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8699600696563721, "num_tokens": 691745061.0, "step": 18128 }, { "epoch": 2.306195140567358, "ewc_loss": 0.03296478092670441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296478098491207e-05, "grad_norm": 18.911401748657227, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.884719967842102, "num_tokens": 691783630.0, "step": 18129 }, { "epoch": 2.3063223508459485, "ewc_loss": 0.032891713082790375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289171218057163e-05, "grad_norm": 18.90468978881836, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8728374242782593, "num_tokens": 691818283.0, "step": 18130 }, { "epoch": 2.306449561124539, "ewc_loss": 0.03291093930602074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291093889856711e-05, "grad_norm": 18.89402961730957, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8649348020553589, "num_tokens": 691854585.0, "step": 18131 }, { "epoch": 2.3065767714031296, "ewc_loss": 0.03292976692318916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.292976543889381e-05, "grad_norm": 18.88068962097168, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.856723964214325, "num_tokens": 691891784.0, "step": 18132 }, { "epoch": 2.3067039816817196, "ewc_loss": 0.032958291471004486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2958290830720216e-05, "grad_norm": 18.973390579223633, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8749529123306274, "num_tokens": 691927389.0, "step": 18133 }, { "epoch": 2.3068311919603106, "ewc_loss": 0.03289516642689705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289516826043837e-05, "grad_norm": 18.954526901245117, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8713412880897522, "num_tokens": 691962928.0, "step": 18134 }, { "epoch": 2.3069584022389007, "ewc_loss": 0.03283750265836716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283750265836716e-05, "grad_norm": 18.88674545288086, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.873069167137146, "num_tokens": 692007064.0, "step": 18135 }, { "epoch": 2.3070856125174912, "ewc_loss": 0.032879538834095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28795395034831e-05, "grad_norm": 18.882366180419922, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8775702118873596, "num_tokens": 692040668.0, "step": 18136 }, { "epoch": 2.3072128227960818, "ewc_loss": 0.03288200497627258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.288200605311431e-05, "grad_norm": 18.940397262573242, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8746315836906433, "num_tokens": 692077710.0, "step": 18137 }, { "epoch": 2.3073400330746723, "ewc_loss": 0.032918963581323624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2918964279815555e-05, "grad_norm": 18.863624572753906, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.8797977566719055, "num_tokens": 692112316.0, "step": 18138 }, { "epoch": 2.307467243353263, "ewc_loss": 0.032878488302230835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287848812760785e-05, "grad_norm": 18.966333389282227, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8780695796012878, "num_tokens": 692153201.0, "step": 18139 }, { "epoch": 2.3075944536318533, "ewc_loss": 0.03294883668422699, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2948835723800585e-05, "grad_norm": 18.874094009399414, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8675091862678528, "num_tokens": 692192942.0, "step": 18140 }, { "epoch": 2.307721663910444, "ewc_loss": 0.03283960744738579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283960904809646e-05, "grad_norm": 18.87871742248535, "learning_rate": 1e-06, "loss": 0.3525, "mean_token_accuracy": 0.8876729011535645, "num_tokens": 692235515.0, "step": 18141 }, { "epoch": 2.3078488741890344, "ewc_loss": 0.03300553187727928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.300553362350911e-05, "grad_norm": 18.95022201538086, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8646465539932251, "num_tokens": 692271338.0, "step": 18142 }, { "epoch": 2.307976084467625, "ewc_loss": 0.03290284425020218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290284439572133e-05, "grad_norm": 18.876527786254883, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8549764752388, "num_tokens": 692311469.0, "step": 18143 }, { "epoch": 2.3081032947462155, "ewc_loss": 0.032910484820604324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2910484151216224e-05, "grad_norm": 18.968965530395508, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8567108511924744, "num_tokens": 692352052.0, "step": 18144 }, { "epoch": 2.308230505024806, "ewc_loss": 0.032957207411527634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29572067130357e-05, "grad_norm": 18.892757415771484, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8580114841461182, "num_tokens": 692382978.0, "step": 18145 }, { "epoch": 2.3083577153033965, "ewc_loss": 0.03286062180995941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2860622013686225e-05, "grad_norm": 18.857816696166992, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8759235143661499, "num_tokens": 692415651.0, "step": 18146 }, { "epoch": 2.308484925581987, "ewc_loss": 0.03295652195811272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295652277301997e-05, "grad_norm": 18.9224796295166, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8563419580459595, "num_tokens": 692455937.0, "step": 18147 }, { "epoch": 2.3086121358605776, "ewc_loss": 0.033022601157426834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3022599382093176e-05, "grad_norm": 18.905433654785156, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8680788278579712, "num_tokens": 692496277.0, "step": 18148 }, { "epoch": 2.308739346139168, "ewc_loss": 0.03290851414203644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290851600468159e-05, "grad_norm": 18.91843032836914, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8733841180801392, "num_tokens": 692530485.0, "step": 18149 }, { "epoch": 2.3088665564177586, "ewc_loss": 0.03296086564660072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296086651971564e-05, "grad_norm": 18.909826278686523, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8703588247299194, "num_tokens": 692564794.0, "step": 18150 }, { "epoch": 2.308993766696349, "ewc_loss": 0.03294328600168228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.294328416814096e-05, "grad_norm": 18.86893081665039, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8625088930130005, "num_tokens": 692610359.0, "step": 18151 }, { "epoch": 2.3091209769749397, "ewc_loss": 0.03295914828777313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295914939371869e-05, "grad_norm": 18.93401527404785, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.8795828819274902, "num_tokens": 692641560.0, "step": 18152 }, { "epoch": 2.30924818725353, "ewc_loss": 0.03297513723373413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.297513831057586e-05, "grad_norm": 18.90093231201172, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8680799007415771, "num_tokens": 692680114.0, "step": 18153 }, { "epoch": 2.3093753975321207, "ewc_loss": 0.0329725444316864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29725444316864e-05, "grad_norm": 18.920747756958008, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.861447811126709, "num_tokens": 692714199.0, "step": 18154 }, { "epoch": 2.3095026078107113, "ewc_loss": 0.033127088099718094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3127089409390464e-05, "grad_norm": 18.97736358642578, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8750303983688354, "num_tokens": 692752331.0, "step": 18155 }, { "epoch": 2.309629818089302, "ewc_loss": 0.03305260092020035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305260179331526e-05, "grad_norm": 18.97344398498535, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8790766000747681, "num_tokens": 692789167.0, "step": 18156 }, { "epoch": 2.3097570283678923, "ewc_loss": 0.033065542578697205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.306554208393209e-05, "grad_norm": 18.852373123168945, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8729798793792725, "num_tokens": 692829234.0, "step": 18157 }, { "epoch": 2.3098842386464824, "ewc_loss": 0.032990116626024246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2990115869324654e-05, "grad_norm": 18.947114944458008, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8519680500030518, "num_tokens": 692863749.0, "step": 18158 }, { "epoch": 2.3100114489250734, "ewc_loss": 0.03306051343679428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3060514397220686e-05, "grad_norm": 18.932830810546875, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8688803911209106, "num_tokens": 692900972.0, "step": 18159 }, { "epoch": 2.3101386592036635, "ewc_loss": 0.03300010785460472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3000109397107735e-05, "grad_norm": 18.97407341003418, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8673223853111267, "num_tokens": 692938522.0, "step": 18160 }, { "epoch": 2.310265869482254, "ewc_loss": 0.03301383927464485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.30138391291257e-05, "grad_norm": 18.927066802978516, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.8827135562896729, "num_tokens": 692973653.0, "step": 18161 }, { "epoch": 2.3103930797608445, "ewc_loss": 0.03304877132177353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3048771001631394e-05, "grad_norm": 19.09987449645996, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8661317229270935, "num_tokens": 693016325.0, "step": 18162 }, { "epoch": 2.310520290039435, "ewc_loss": 0.03308228403329849, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.308228406240232e-05, "grad_norm": 18.97496223449707, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.882695198059082, "num_tokens": 693052022.0, "step": 18163 }, { "epoch": 2.3106475003180256, "ewc_loss": 0.03285084664821625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.285084676463157e-05, "grad_norm": 18.84836769104004, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.876722514629364, "num_tokens": 693093936.0, "step": 18164 }, { "epoch": 2.310774710596616, "ewc_loss": 0.03301948308944702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.30194816342555e-05, "grad_norm": 18.97701644897461, "learning_rate": 1e-06, "loss": 0.3488, "mean_token_accuracy": 0.8863114714622498, "num_tokens": 693130160.0, "step": 18165 }, { "epoch": 2.3109019208752066, "ewc_loss": 0.032970864325761795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2970863685477525e-05, "grad_norm": 18.924468994140625, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8756082057952881, "num_tokens": 693172405.0, "step": 18166 }, { "epoch": 2.311029131153797, "ewc_loss": 0.0329912044107914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2991203624987975e-05, "grad_norm": 19.000322341918945, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8756120204925537, "num_tokens": 693206499.0, "step": 18167 }, { "epoch": 2.3111563414323877, "ewc_loss": 0.03296668827533722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2966687285806984e-05, "grad_norm": 18.89067840576172, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8721334934234619, "num_tokens": 693244287.0, "step": 18168 }, { "epoch": 2.311283551710978, "ewc_loss": 0.032920222729444504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.292022302048281e-05, "grad_norm": 18.903072357177734, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8645497560501099, "num_tokens": 693286164.0, "step": 18169 }, { "epoch": 2.3114107619895687, "ewc_loss": 0.03296341374516487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29634131048806e-05, "grad_norm": 18.899333953857422, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8797188997268677, "num_tokens": 693330961.0, "step": 18170 }, { "epoch": 2.3115379722681593, "ewc_loss": 0.03291567414999008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2915675546973944e-05, "grad_norm": 18.898775100708008, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.878662109375, "num_tokens": 693375169.0, "step": 18171 }, { "epoch": 2.31166518254675, "ewc_loss": 0.03296248987317085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29624890582636e-05, "grad_norm": 18.919252395629883, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8669476509094238, "num_tokens": 693414127.0, "step": 18172 }, { "epoch": 2.3117923928253403, "ewc_loss": 0.03295203670859337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2952037145150825e-05, "grad_norm": 18.984724044799805, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.868611216545105, "num_tokens": 693450446.0, "step": 18173 }, { "epoch": 2.311919603103931, "ewc_loss": 0.03294455632567406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2944557460723445e-05, "grad_norm": 18.90044593811035, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8643035888671875, "num_tokens": 693490973.0, "step": 18174 }, { "epoch": 2.3120468133825214, "ewc_loss": 0.032864682376384735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.286468199803494e-05, "grad_norm": 18.83854103088379, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8743658661842346, "num_tokens": 693526815.0, "step": 18175 }, { "epoch": 2.312174023661112, "ewc_loss": 0.032911524176597595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291152461315505e-05, "grad_norm": 18.98563003540039, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8663452863693237, "num_tokens": 693566949.0, "step": 18176 }, { "epoch": 2.3123012339397024, "ewc_loss": 0.03286425769329071, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.286425635451451e-05, "grad_norm": 18.85188865661621, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8674123287200928, "num_tokens": 693601116.0, "step": 18177 }, { "epoch": 2.312428444218293, "ewc_loss": 0.03290407732129097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2904077670536935e-05, "grad_norm": 18.935619354248047, "learning_rate": 1e-06, "loss": 0.3192, "mean_token_accuracy": 0.902768611907959, "num_tokens": 693640588.0, "step": 18178 }, { "epoch": 2.3125556544968835, "ewc_loss": 0.03298432379961014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2984324207063764e-05, "grad_norm": 18.901222229003906, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8746050596237183, "num_tokens": 693683352.0, "step": 18179 }, { "epoch": 2.312682864775474, "ewc_loss": 0.032869160175323486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.286916034994647e-05, "grad_norm": 18.90658950805664, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.8786136507987976, "num_tokens": 693722579.0, "step": 18180 }, { "epoch": 2.3128100750540646, "ewc_loss": 0.032902516424655914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290251697762869e-05, "grad_norm": 18.825380325317383, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8745390176773071, "num_tokens": 693766743.0, "step": 18181 }, { "epoch": 2.312937285332655, "ewc_loss": 0.03291858732700348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291858593001962e-05, "grad_norm": 18.95848274230957, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8756009936332703, "num_tokens": 693809699.0, "step": 18182 }, { "epoch": 2.313064495611245, "ewc_loss": 0.032909560948610306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290956010459922e-05, "grad_norm": 18.854780197143555, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8789685964584351, "num_tokens": 693845508.0, "step": 18183 }, { "epoch": 2.313191705889836, "ewc_loss": 0.03287189453840256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28718961100094e-05, "grad_norm": 18.98488998413086, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8724557161331177, "num_tokens": 693883763.0, "step": 18184 }, { "epoch": 2.313318916168426, "ewc_loss": 0.03295388072729111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295388160040602e-05, "grad_norm": 18.910409927368164, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8787456750869751, "num_tokens": 693917093.0, "step": 18185 }, { "epoch": 2.3134461264470167, "ewc_loss": 0.03281451016664505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.281451063230634e-05, "grad_norm": 18.944286346435547, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.873511552810669, "num_tokens": 693949219.0, "step": 18186 }, { "epoch": 2.3135733367256073, "ewc_loss": 0.03291790559887886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291790562798269e-05, "grad_norm": 18.93552017211914, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8674381971359253, "num_tokens": 693992755.0, "step": 18187 }, { "epoch": 2.313700547004198, "ewc_loss": 0.032851409167051315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.285141065134667e-05, "grad_norm": 18.933794021606445, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8581619262695312, "num_tokens": 694030386.0, "step": 18188 }, { "epoch": 2.3138277572827883, "ewc_loss": 0.0328836590051651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.288365769549273e-05, "grad_norm": 18.939495086669922, "learning_rate": 1e-06, "loss": 0.3603, "mean_token_accuracy": 0.8811513185501099, "num_tokens": 694065019.0, "step": 18189 }, { "epoch": 2.313954967561379, "ewc_loss": 0.03286130353808403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.286130231572315e-05, "grad_norm": 18.875810623168945, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8711025714874268, "num_tokens": 694106645.0, "step": 18190 }, { "epoch": 2.3140821778399694, "ewc_loss": 0.03288700059056282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.288700099801645e-05, "grad_norm": 18.88792610168457, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.8775556087493896, "num_tokens": 694149146.0, "step": 18191 }, { "epoch": 2.31420938811856, "ewc_loss": 0.03293764591217041, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293764530098997e-05, "grad_norm": 18.949338912963867, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8571206331253052, "num_tokens": 694193567.0, "step": 18192 }, { "epoch": 2.3143365983971504, "ewc_loss": 0.03291628882288933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291629036539234e-05, "grad_norm": 18.874187469482422, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.8843706846237183, "num_tokens": 694232160.0, "step": 18193 }, { "epoch": 2.314463808675741, "ewc_loss": 0.0329466350376606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2946634746622294e-05, "grad_norm": 19.010623931884766, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8736475706100464, "num_tokens": 694275153.0, "step": 18194 }, { "epoch": 2.3145910189543315, "ewc_loss": 0.032931648194789886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2931649911915883e-05, "grad_norm": 18.8882999420166, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8795350790023804, "num_tokens": 694313722.0, "step": 18195 }, { "epoch": 2.314718229232922, "ewc_loss": 0.03286413103342056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.286413266323507e-05, "grad_norm": 18.974424362182617, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8764311075210571, "num_tokens": 694350544.0, "step": 18196 }, { "epoch": 2.3148454395115126, "ewc_loss": 0.03289641812443733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289641972514801e-05, "grad_norm": 18.83612632751465, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8759204745292664, "num_tokens": 694386552.0, "step": 18197 }, { "epoch": 2.314972649790103, "ewc_loss": 0.032833877950906754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2833879231475294e-05, "grad_norm": 18.91925621032715, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.869444727897644, "num_tokens": 694434368.0, "step": 18198 }, { "epoch": 2.3150998600686936, "ewc_loss": 0.03287835419178009, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287835352239199e-05, "grad_norm": 18.894123077392578, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8727998733520508, "num_tokens": 694479856.0, "step": 18199 }, { "epoch": 2.315227070347284, "ewc_loss": 0.032908014953136444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290801396360621e-05, "grad_norm": 19.03611946105957, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8735123872756958, "num_tokens": 694521107.0, "step": 18200 }, { "epoch": 2.3153542806258747, "ewc_loss": 0.03289084509015083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289084634161554e-05, "grad_norm": 18.931007385253906, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8745887279510498, "num_tokens": 694560886.0, "step": 18201 }, { "epoch": 2.315481490904465, "ewc_loss": 0.032820865511894226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2820866181282327e-05, "grad_norm": 18.95713233947754, "learning_rate": 1e-06, "loss": 0.3873, "mean_token_accuracy": 0.8778707981109619, "num_tokens": 694598404.0, "step": 18202 }, { "epoch": 2.3156087011830557, "ewc_loss": 0.032870110124349594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287010986241512e-05, "grad_norm": 18.91200065612793, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8684864044189453, "num_tokens": 694637162.0, "step": 18203 }, { "epoch": 2.3157359114616463, "ewc_loss": 0.0328320637345314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2832063880050555e-05, "grad_norm": 18.937931060791016, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8768260478973389, "num_tokens": 694677435.0, "step": 18204 }, { "epoch": 2.315863121740237, "ewc_loss": 0.0328020304441452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2802028727019206e-05, "grad_norm": 18.933565139770508, "learning_rate": 1e-06, "loss": 0.3671, "mean_token_accuracy": 0.8821249604225159, "num_tokens": 694713980.0, "step": 18205 }, { "epoch": 2.3159903320188273, "ewc_loss": 0.03279851749539375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.279851807747036e-05, "grad_norm": 18.86883544921875, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8683033585548401, "num_tokens": 694753884.0, "step": 18206 }, { "epoch": 2.316117542297418, "ewc_loss": 0.032807108014822006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2807107345433906e-05, "grad_norm": 18.958669662475586, "learning_rate": 1e-06, "loss": 0.3562, "mean_token_accuracy": 0.885604977607727, "num_tokens": 694785739.0, "step": 18207 }, { "epoch": 2.316244752576008, "ewc_loss": 0.03288545459508896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.288545485702343e-05, "grad_norm": 18.863733291625977, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8713802695274353, "num_tokens": 694827629.0, "step": 18208 }, { "epoch": 2.316371962854599, "ewc_loss": 0.03280879557132721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2808795367600396e-05, "grad_norm": 19.068838119506836, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8702559471130371, "num_tokens": 694868241.0, "step": 18209 }, { "epoch": 2.316499173133189, "ewc_loss": 0.03288692608475685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28869246004615e-05, "grad_norm": 18.949851989746094, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8642902374267578, "num_tokens": 694905298.0, "step": 18210 }, { "epoch": 2.3166263834117795, "ewc_loss": 0.032827261835336685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2827261748025194e-05, "grad_norm": 19.018352508544922, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8653725385665894, "num_tokens": 694938054.0, "step": 18211 }, { "epoch": 2.31675359369037, "ewc_loss": 0.0328584685921669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2858468330232427e-05, "grad_norm": 19.047061920166016, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8707162141799927, "num_tokens": 694978761.0, "step": 18212 }, { "epoch": 2.3168808039689606, "ewc_loss": 0.032835036516189575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283503610873595e-05, "grad_norm": 18.9932804107666, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8702901601791382, "num_tokens": 695015073.0, "step": 18213 }, { "epoch": 2.317008014247551, "ewc_loss": 0.0327892005443573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.27892012137454e-05, "grad_norm": 18.983274459838867, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8677675724029541, "num_tokens": 695052288.0, "step": 18214 }, { "epoch": 2.3171352245261416, "ewc_loss": 0.03283633291721344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2836331229191273e-05, "grad_norm": 18.881221771240234, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8622661828994751, "num_tokens": 695088976.0, "step": 18215 }, { "epoch": 2.317262434804732, "ewc_loss": 0.03277339041233063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.277339055784978e-05, "grad_norm": 18.924964904785156, "learning_rate": 1e-06, "loss": 0.3852, "mean_token_accuracy": 0.875774085521698, "num_tokens": 695128482.0, "step": 18216 }, { "epoch": 2.3173896450833227, "ewc_loss": 0.03287684544920921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287684376118705e-05, "grad_norm": 18.967411041259766, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8700377941131592, "num_tokens": 695164963.0, "step": 18217 }, { "epoch": 2.317516855361913, "ewc_loss": 0.0328376330435276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283763362560421e-05, "grad_norm": 18.915769577026367, "learning_rate": 1e-06, "loss": 0.3367, "mean_token_accuracy": 0.8943048119544983, "num_tokens": 695201515.0, "step": 18218 }, { "epoch": 2.3176440656405037, "ewc_loss": 0.032864753156900406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.286475475761108e-05, "grad_norm": 18.942548751831055, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8792984485626221, "num_tokens": 695242057.0, "step": 18219 }, { "epoch": 2.3177712759190943, "ewc_loss": 0.032888781279325485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2888779969653115e-05, "grad_norm": 19.018892288208008, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8751708269119263, "num_tokens": 695277128.0, "step": 18220 }, { "epoch": 2.317898486197685, "ewc_loss": 0.03285859525203705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2858595659490675e-05, "grad_norm": 18.931865692138672, "learning_rate": 1e-06, "loss": 0.3632, "mean_token_accuracy": 0.8861830234527588, "num_tokens": 695311984.0, "step": 18221 }, { "epoch": 2.3180256964762753, "ewc_loss": 0.032885778695344925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2885778637137264e-05, "grad_norm": 18.919479370117188, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8623098134994507, "num_tokens": 695348457.0, "step": 18222 }, { "epoch": 2.318152906754866, "ewc_loss": 0.03284797817468643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2847976399352774e-05, "grad_norm": 19.01177978515625, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8757123947143555, "num_tokens": 695386518.0, "step": 18223 }, { "epoch": 2.3182801170334564, "ewc_loss": 0.03290209546685219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290209497208707e-05, "grad_norm": 18.886539459228516, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8695318698883057, "num_tokens": 695425365.0, "step": 18224 }, { "epoch": 2.318407327312047, "ewc_loss": 0.03282826766371727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.282826583017595e-05, "grad_norm": 18.995113372802734, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8754294514656067, "num_tokens": 695459559.0, "step": 18225 }, { "epoch": 2.3185345375906374, "ewc_loss": 0.03289571404457092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289571395725943e-05, "grad_norm": 18.954565048217773, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.869379997253418, "num_tokens": 695491386.0, "step": 18226 }, { "epoch": 2.318661747869228, "ewc_loss": 0.0329013429582119, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2901341910474e-05, "grad_norm": 18.994422912597656, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8562389016151428, "num_tokens": 695534660.0, "step": 18227 }, { "epoch": 2.3187889581478185, "ewc_loss": 0.03290114179253578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290114182163961e-05, "grad_norm": 18.88254165649414, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8650799989700317, "num_tokens": 695578127.0, "step": 18228 }, { "epoch": 2.318916168426409, "ewc_loss": 0.03295446187257767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2954463677015156e-05, "grad_norm": 19.032745361328125, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8617733716964722, "num_tokens": 695614181.0, "step": 18229 }, { "epoch": 2.3190433787049995, "ewc_loss": 0.03295573592185974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295573696959764e-05, "grad_norm": 18.871421813964844, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8725925087928772, "num_tokens": 695652753.0, "step": 18230 }, { "epoch": 2.3191705889835896, "ewc_loss": 0.03289009630680084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289009691798128e-05, "grad_norm": 19.01477813720703, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8802868127822876, "num_tokens": 695692449.0, "step": 18231 }, { "epoch": 2.3192977992621806, "ewc_loss": 0.03302876278758049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302876211819239e-05, "grad_norm": 18.947410583496094, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8728743195533752, "num_tokens": 695730812.0, "step": 18232 }, { "epoch": 2.3194250095407707, "ewc_loss": 0.03289395198225975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.28939531755168e-05, "grad_norm": 18.96906089782715, "learning_rate": 1e-06, "loss": 0.349, "mean_token_accuracy": 0.8874536156654358, "num_tokens": 695767577.0, "step": 18233 }, { "epoch": 2.319552219819361, "ewc_loss": 0.03298246115446091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298246156191453e-05, "grad_norm": 18.94864273071289, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8738411664962769, "num_tokens": 695812372.0, "step": 18234 }, { "epoch": 2.3196794300979517, "ewc_loss": 0.03294002637267113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2940028177108616e-05, "grad_norm": 18.968223571777344, "learning_rate": 1e-06, "loss": 0.3438, "mean_token_accuracy": 0.8876783847808838, "num_tokens": 695847470.0, "step": 18235 }, { "epoch": 2.3198066403765423, "ewc_loss": 0.032907210290431976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290720997028984e-05, "grad_norm": 18.917930603027344, "learning_rate": 1e-06, "loss": 0.3531, "mean_token_accuracy": 0.8884593844413757, "num_tokens": 695888082.0, "step": 18236 }, { "epoch": 2.319933850655133, "ewc_loss": 0.03295152261853218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2951524190139025e-05, "grad_norm": 19.028797149658203, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8798918128013611, "num_tokens": 695928904.0, "step": 18237 }, { "epoch": 2.3200610609337233, "ewc_loss": 0.03286726400256157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2867264962987974e-05, "grad_norm": 18.860319137573242, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8629677295684814, "num_tokens": 695967549.0, "step": 18238 }, { "epoch": 2.320188271212314, "ewc_loss": 0.03288637101650238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.288637162768282e-05, "grad_norm": 19.075407028198242, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.867256224155426, "num_tokens": 696005611.0, "step": 18239 }, { "epoch": 2.3203154814909044, "ewc_loss": 0.03294440731406212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2944408303592354e-05, "grad_norm": 18.979116439819336, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8754724264144897, "num_tokens": 696034864.0, "step": 18240 }, { "epoch": 2.320442691769495, "ewc_loss": 0.03287963196635246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287963045295328e-05, "grad_norm": 18.95956039428711, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.871367871761322, "num_tokens": 696072137.0, "step": 18241 }, { "epoch": 2.3205699020480854, "ewc_loss": 0.032910849899053574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291085158707574e-05, "grad_norm": 18.94155502319336, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.875824511051178, "num_tokens": 696115744.0, "step": 18242 }, { "epoch": 2.320697112326676, "ewc_loss": 0.032957110553979874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295711212558672e-05, "grad_norm": 18.950613021850586, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.869279682636261, "num_tokens": 696151293.0, "step": 18243 }, { "epoch": 2.3208243226052665, "ewc_loss": 0.03291573375463486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291573375463486e-05, "grad_norm": 18.941984176635742, "learning_rate": 1e-06, "loss": 0.3421, "mean_token_accuracy": 0.8903488516807556, "num_tokens": 696190742.0, "step": 18244 }, { "epoch": 2.320951532883857, "ewc_loss": 0.032939035445451736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293903500889428e-05, "grad_norm": 18.987327575683594, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8630872964859009, "num_tokens": 696232807.0, "step": 18245 }, { "epoch": 2.3210787431624476, "ewc_loss": 0.03291374444961548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291374378022738e-05, "grad_norm": 18.962459564208984, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8704178333282471, "num_tokens": 696269416.0, "step": 18246 }, { "epoch": 2.321205953441038, "ewc_loss": 0.032875917851924896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287591607659124e-05, "grad_norm": 18.921184539794922, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8742579221725464, "num_tokens": 696305144.0, "step": 18247 }, { "epoch": 2.3213331637196286, "ewc_loss": 0.0329340323805809, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293403278803453e-05, "grad_norm": 19.006160736083984, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8657757043838501, "num_tokens": 696344854.0, "step": 18248 }, { "epoch": 2.321460373998219, "ewc_loss": 0.03291723504662514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2917236239882186e-05, "grad_norm": 18.948205947875977, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.8911274075508118, "num_tokens": 696380154.0, "step": 18249 }, { "epoch": 2.3215875842768097, "ewc_loss": 0.032909102737903595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.290910171926953e-05, "grad_norm": 19.07647132873535, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8616018891334534, "num_tokens": 696416458.0, "step": 18250 }, { "epoch": 2.3217147945554, "ewc_loss": 0.03290417790412903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2904179533943534e-05, "grad_norm": 18.937402725219727, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8756605982780457, "num_tokens": 696453232.0, "step": 18251 }, { "epoch": 2.3218420048339907, "ewc_loss": 0.03284239396452904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2842395739862695e-05, "grad_norm": 19.06179428100586, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8533145189285278, "num_tokens": 696487930.0, "step": 18252 }, { "epoch": 2.3219692151125813, "ewc_loss": 0.03293713554739952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293713598395698e-05, "grad_norm": 18.93686866760254, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8642976880073547, "num_tokens": 696523824.0, "step": 18253 }, { "epoch": 2.322096425391172, "ewc_loss": 0.03287793695926666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287793879280798e-05, "grad_norm": 18.977750778198242, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8829651474952698, "num_tokens": 696567007.0, "step": 18254 }, { "epoch": 2.3222236356697623, "ewc_loss": 0.032986242324113846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29862414218951e-05, "grad_norm": 18.957866668701172, "learning_rate": 1e-06, "loss": 0.3521, "mean_token_accuracy": 0.8886788487434387, "num_tokens": 696610055.0, "step": 18255 }, { "epoch": 2.3223508459483524, "ewc_loss": 0.03287140280008316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287140134489164e-05, "grad_norm": 18.966360092163086, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8596577644348145, "num_tokens": 696644058.0, "step": 18256 }, { "epoch": 2.3224780562269434, "ewc_loss": 0.03286314755678177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2863146770978346e-05, "grad_norm": 18.864622116088867, "learning_rate": 1e-06, "loss": 0.3714, "mean_token_accuracy": 0.8804810047149658, "num_tokens": 696686381.0, "step": 18257 }, { "epoch": 2.3226052665055335, "ewc_loss": 0.03289266303181648, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289266169304028e-05, "grad_norm": 19.030921936035156, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.867331862449646, "num_tokens": 696727567.0, "step": 18258 }, { "epoch": 2.322732476784124, "ewc_loss": 0.03294619917869568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.294619818916544e-05, "grad_norm": 18.925968170166016, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8628063797950745, "num_tokens": 696767266.0, "step": 18259 }, { "epoch": 2.3228596870627145, "ewc_loss": 0.03291034325957298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291034227004275e-05, "grad_norm": 19.028913497924805, "learning_rate": 1e-06, "loss": 0.3499, "mean_token_accuracy": 0.8870917558670044, "num_tokens": 696806059.0, "step": 18260 }, { "epoch": 2.322986897341305, "ewc_loss": 0.03302747383713722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302747427369468e-05, "grad_norm": 18.957487106323242, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8720083236694336, "num_tokens": 696844232.0, "step": 18261 }, { "epoch": 2.3231141076198956, "ewc_loss": 0.03283282369375229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283282421762124e-05, "grad_norm": 19.02025604248047, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8757365942001343, "num_tokens": 696886223.0, "step": 18262 }, { "epoch": 2.323241317898486, "ewc_loss": 0.0329497829079628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.294978159829043e-05, "grad_norm": 18.923080444335938, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8589447736740112, "num_tokens": 696922625.0, "step": 18263 }, { "epoch": 2.3233685281770766, "ewc_loss": 0.03287295997142792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287295839982107e-05, "grad_norm": 19.015844345092773, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8625291585922241, "num_tokens": 696965160.0, "step": 18264 }, { "epoch": 2.323495738455667, "ewc_loss": 0.032957207411527634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29572067130357e-05, "grad_norm": 18.943546295166016, "learning_rate": 1e-06, "loss": 0.3692, "mean_token_accuracy": 0.8784704804420471, "num_tokens": 697000854.0, "step": 18265 }, { "epoch": 2.3236229487342577, "ewc_loss": 0.032866060733795166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.286606079200283e-05, "grad_norm": 19.00809097290039, "learning_rate": 1e-06, "loss": 0.3502, "mean_token_accuracy": 0.8876267671585083, "num_tokens": 697041264.0, "step": 18266 }, { "epoch": 2.323750159012848, "ewc_loss": 0.03294774889945984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2947747968137264e-05, "grad_norm": 18.966693878173828, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8707401752471924, "num_tokens": 697080079.0, "step": 18267 }, { "epoch": 2.3238773692914387, "ewc_loss": 0.03288264945149422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2882649975363165e-05, "grad_norm": 18.974773406982422, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8693529963493347, "num_tokens": 697123755.0, "step": 18268 }, { "epoch": 2.3240045795700293, "ewc_loss": 0.032861318439245224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.286131686763838e-05, "grad_norm": 19.08800506591797, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8451807498931885, "num_tokens": 697162012.0, "step": 18269 }, { "epoch": 2.32413178984862, "ewc_loss": 0.032849349081516266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.284934791736305e-05, "grad_norm": 18.92400550842285, "learning_rate": 1e-06, "loss": 0.3691, "mean_token_accuracy": 0.883171558380127, "num_tokens": 697200188.0, "step": 18270 }, { "epoch": 2.3242590001272103, "ewc_loss": 0.03282100334763527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2821004424476996e-05, "grad_norm": 19.0150089263916, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8758364915847778, "num_tokens": 697240536.0, "step": 18271 }, { "epoch": 2.324386210405801, "ewc_loss": 0.03293456882238388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293456757091917e-05, "grad_norm": 18.948848724365234, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8847256898880005, "num_tokens": 697279459.0, "step": 18272 }, { "epoch": 2.3245134206843914, "ewc_loss": 0.0328352153301239, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2835214369697496e-05, "grad_norm": 19.06103515625, "learning_rate": 1e-06, "loss": 0.3371, "mean_token_accuracy": 0.8904598355293274, "num_tokens": 697312594.0, "step": 18273 }, { "epoch": 2.324640630962982, "ewc_loss": 0.032865915447473526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.286591527285054e-05, "grad_norm": 18.89010238647461, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8603599667549133, "num_tokens": 697357834.0, "step": 18274 }, { "epoch": 2.3247678412415724, "ewc_loss": 0.03276371583342552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.276371717220172e-05, "grad_norm": 19.03424835205078, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8719816207885742, "num_tokens": 697401522.0, "step": 18275 }, { "epoch": 2.324895051520163, "ewc_loss": 0.0329604335129261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296043360023759e-05, "grad_norm": 18.999481201171875, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.870477557182312, "num_tokens": 697440415.0, "step": 18276 }, { "epoch": 2.3250222617987535, "ewc_loss": 0.03276132792234421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2761327020125464e-05, "grad_norm": 18.961193084716797, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8577214479446411, "num_tokens": 697477609.0, "step": 18277 }, { "epoch": 2.325149472077344, "ewc_loss": 0.032897740602493286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289774031145498e-05, "grad_norm": 19.014734268188477, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8698233962059021, "num_tokens": 697515566.0, "step": 18278 }, { "epoch": 2.3252766823559345, "ewc_loss": 0.03284994140267372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2849940907908604e-05, "grad_norm": 18.984039306640625, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.877703070640564, "num_tokens": 697549537.0, "step": 18279 }, { "epoch": 2.325403892634525, "ewc_loss": 0.03283650428056717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2836505852174014e-05, "grad_norm": 19.00409507751465, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8766914010047913, "num_tokens": 697582618.0, "step": 18280 }, { "epoch": 2.325531102913115, "ewc_loss": 0.03291365131735802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29136528307572e-05, "grad_norm": 19.091527938842773, "learning_rate": 1e-06, "loss": 0.3574, "mean_token_accuracy": 0.8859479427337646, "num_tokens": 697622234.0, "step": 18281 }, { "epoch": 2.325658313191706, "ewc_loss": 0.03288520500063896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2885203836485744e-05, "grad_norm": 18.959880828857422, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8652419447898865, "num_tokens": 697657504.0, "step": 18282 }, { "epoch": 2.325785523470296, "ewc_loss": 0.032793838530778885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.279383963672444e-05, "grad_norm": 18.995271682739258, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8609892129898071, "num_tokens": 697699496.0, "step": 18283 }, { "epoch": 2.3259127337488867, "ewc_loss": 0.03294810280203819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.294810449006036e-05, "grad_norm": 19.044336318969727, "learning_rate": 1e-06, "loss": 0.3561, "mean_token_accuracy": 0.8850525617599487, "num_tokens": 697739969.0, "step": 18284 }, { "epoch": 2.3260399440274773, "ewc_loss": 0.0328756645321846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287566505605355e-05, "grad_norm": 19.042776107788086, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.863773763179779, "num_tokens": 697774346.0, "step": 18285 }, { "epoch": 2.326167154306068, "ewc_loss": 0.03284686431288719, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2846863177837804e-05, "grad_norm": 19.058317184448242, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.859376072883606, "num_tokens": 697813013.0, "step": 18286 }, { "epoch": 2.3262943645846583, "ewc_loss": 0.03287442401051521, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287442450528033e-05, "grad_norm": 18.96280288696289, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8666269779205322, "num_tokens": 697854163.0, "step": 18287 }, { "epoch": 2.326421574863249, "ewc_loss": 0.03283124417066574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.283124533481896e-05, "grad_norm": 18.926342010498047, "learning_rate": 1e-06, "loss": 0.3741, "mean_token_accuracy": 0.8805997967720032, "num_tokens": 697898049.0, "step": 18288 }, { "epoch": 2.3265487851418394, "ewc_loss": 0.032887428998947144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2887430279515684e-05, "grad_norm": 18.957059860229492, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8767485618591309, "num_tokens": 697936086.0, "step": 18289 }, { "epoch": 2.32667599542043, "ewc_loss": 0.03288127854466438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.288127845735289e-05, "grad_norm": 18.974748611450195, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8498427271842957, "num_tokens": 697981529.0, "step": 18290 }, { "epoch": 2.3268032056990204, "ewc_loss": 0.03287223353981972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.287223444203846e-05, "grad_norm": 18.86298179626465, "learning_rate": 1e-06, "loss": 0.345, "mean_token_accuracy": 0.8905770778656006, "num_tokens": 698016047.0, "step": 18291 }, { "epoch": 2.326930415977611, "ewc_loss": 0.032954368740320206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295436908956617e-05, "grad_norm": 18.989227294921875, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.8850809335708618, "num_tokens": 698052408.0, "step": 18292 }, { "epoch": 2.3270576262562015, "ewc_loss": 0.03292437642812729, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.292437759228051e-05, "grad_norm": 18.91905403137207, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8666882514953613, "num_tokens": 698089751.0, "step": 18293 }, { "epoch": 2.327184836534792, "ewc_loss": 0.032900772988796234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2900774385780096e-05, "grad_norm": 18.98436164855957, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8710963726043701, "num_tokens": 698121274.0, "step": 18294 }, { "epoch": 2.3273120468133826, "ewc_loss": 0.032954711467027664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295471105957404e-05, "grad_norm": 18.851871490478516, "learning_rate": 1e-06, "loss": 0.368, "mean_token_accuracy": 0.8820705413818359, "num_tokens": 698155880.0, "step": 18295 }, { "epoch": 2.327439257091973, "ewc_loss": 0.03289554640650749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2895546610234305e-05, "grad_norm": 18.98808479309082, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8753395676612854, "num_tokens": 698191448.0, "step": 18296 }, { "epoch": 2.3275664673705636, "ewc_loss": 0.033003635704517365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.300363459857181e-05, "grad_norm": 18.901731491088867, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8683327436447144, "num_tokens": 698228437.0, "step": 18297 }, { "epoch": 2.327693677649154, "ewc_loss": 0.03293079510331154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293079498689622e-05, "grad_norm": 18.99468231201172, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.866288423538208, "num_tokens": 698268714.0, "step": 18298 }, { "epoch": 2.3278208879277447, "ewc_loss": 0.03297097608447075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2970976462820545e-05, "grad_norm": 18.918210983276367, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8741393089294434, "num_tokens": 698305575.0, "step": 18299 }, { "epoch": 2.327948098206335, "ewc_loss": 0.03296909108757973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296909198979847e-05, "grad_norm": 18.97506332397461, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8673555254936218, "num_tokens": 698347701.0, "step": 18300 }, { "epoch": 2.3280753084849257, "ewc_loss": 0.033090151846408844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309015301056206e-05, "grad_norm": 19.013402938842773, "learning_rate": 1e-06, "loss": 0.3594, "mean_token_accuracy": 0.8881695866584778, "num_tokens": 698385906.0, "step": 18301 }, { "epoch": 2.3282025187635162, "ewc_loss": 0.03294287249445915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.294287307653576e-05, "grad_norm": 18.983015060424805, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.8724012970924377, "num_tokens": 698415511.0, "step": 18302 }, { "epoch": 2.3283297290421068, "ewc_loss": 0.03306009620428085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.306009602965787e-05, "grad_norm": 19.029937744140625, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8692418932914734, "num_tokens": 698460528.0, "step": 18303 }, { "epoch": 2.3284569393206973, "ewc_loss": 0.03303759545087814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.303759513073601e-05, "grad_norm": 19.037912368774414, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8588720560073853, "num_tokens": 698500800.0, "step": 18304 }, { "epoch": 2.328584149599288, "ewc_loss": 0.032934077084064484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2934076443780214e-05, "grad_norm": 18.90996551513672, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.871619462966919, "num_tokens": 698539373.0, "step": 18305 }, { "epoch": 2.328711359877878, "ewc_loss": 0.03297262638807297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2972628105198964e-05, "grad_norm": 18.98628807067871, "learning_rate": 1e-06, "loss": 0.3413, "mean_token_accuracy": 0.895602822303772, "num_tokens": 698577067.0, "step": 18306 }, { "epoch": 2.328838570156469, "ewc_loss": 0.03300384804606438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3003849239321426e-05, "grad_norm": 18.92422103881836, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8726260662078857, "num_tokens": 698613515.0, "step": 18307 }, { "epoch": 2.328965780435059, "ewc_loss": 0.03293454647064209, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293454574304633e-05, "grad_norm": 18.97480583190918, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8752123713493347, "num_tokens": 698647278.0, "step": 18308 }, { "epoch": 2.3290929907136495, "ewc_loss": 0.03302040323615074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302040204289369e-05, "grad_norm": 18.999650955200195, "learning_rate": 1e-06, "loss": 0.372, "mean_token_accuracy": 0.8799231052398682, "num_tokens": 698688738.0, "step": 18309 }, { "epoch": 2.32922020099224, "ewc_loss": 0.03300737962126732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3007378078764305e-05, "grad_norm": 19.014524459838867, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8772667646408081, "num_tokens": 698725003.0, "step": 18310 }, { "epoch": 2.3293474112708306, "ewc_loss": 0.0329609140753746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296091381344013e-05, "grad_norm": 18.97608184814453, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.855933427810669, "num_tokens": 698768205.0, "step": 18311 }, { "epoch": 2.329474621549421, "ewc_loss": 0.0329311266541481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293112604296766e-05, "grad_norm": 19.003520965576172, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8544933795928955, "num_tokens": 698805352.0, "step": 18312 }, { "epoch": 2.3296018318280116, "ewc_loss": 0.03299897536635399, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.299897434771992e-05, "grad_norm": 18.956615447998047, "learning_rate": 1e-06, "loss": 0.3582, "mean_token_accuracy": 0.882502019405365, "num_tokens": 698836610.0, "step": 18313 }, { "epoch": 2.329729042106602, "ewc_loss": 0.03295044228434563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295044371043332e-05, "grad_norm": 18.974964141845703, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8815200328826904, "num_tokens": 698877514.0, "step": 18314 }, { "epoch": 2.3298562523851927, "ewc_loss": 0.032993510365486145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.299351010355167e-05, "grad_norm": 18.942520141601562, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.876207709312439, "num_tokens": 698916699.0, "step": 18315 }, { "epoch": 2.329983462663783, "ewc_loss": 0.032912660390138626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2912659662542865e-05, "grad_norm": 19.011348724365234, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8731218576431274, "num_tokens": 698953568.0, "step": 18316 }, { "epoch": 2.3301106729423737, "ewc_loss": 0.03296288475394249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296288559795357e-05, "grad_norm": 18.94154930114746, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8685895204544067, "num_tokens": 698990562.0, "step": 18317 }, { "epoch": 2.3302378832209643, "ewc_loss": 0.03296676650643349, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296676732134074e-05, "grad_norm": 19.05603790283203, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8757925033569336, "num_tokens": 699024095.0, "step": 18318 }, { "epoch": 2.330365093499555, "ewc_loss": 0.032933350652456284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29333524859976e-05, "grad_norm": 18.921478271484375, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8533691167831421, "num_tokens": 699062327.0, "step": 18319 }, { "epoch": 2.3304923037781453, "ewc_loss": 0.03293999284505844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2939991797320545e-05, "grad_norm": 19.008838653564453, "learning_rate": 1e-06, "loss": 0.3723, "mean_token_accuracy": 0.8785122632980347, "num_tokens": 699101591.0, "step": 18320 }, { "epoch": 2.330619514056736, "ewc_loss": 0.03292093425989151, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.292093242635019e-05, "grad_norm": 18.94124984741211, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8648840188980103, "num_tokens": 699134836.0, "step": 18321 }, { "epoch": 2.3307467243353264, "ewc_loss": 0.033007971942424774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.300797106930986e-05, "grad_norm": 18.999387741088867, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8642325401306152, "num_tokens": 699171218.0, "step": 18322 }, { "epoch": 2.330873934613917, "ewc_loss": 0.03304039686918259, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.304039637441747e-05, "grad_norm": 19.03693389892578, "learning_rate": 1e-06, "loss": 0.3562, "mean_token_accuracy": 0.8908858895301819, "num_tokens": 699200891.0, "step": 18323 }, { "epoch": 2.3310011448925074, "ewc_loss": 0.032934416085481644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293441477580927e-05, "grad_norm": 18.872106552124023, "learning_rate": 1e-06, "loss": 0.3545, "mean_token_accuracy": 0.8883708715438843, "num_tokens": 699238080.0, "step": 18324 }, { "epoch": 2.331128355171098, "ewc_loss": 0.03302251547574997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302251570858061e-05, "grad_norm": 19.079696655273438, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8611282706260681, "num_tokens": 699276051.0, "step": 18325 }, { "epoch": 2.3312555654496885, "ewc_loss": 0.03308756276965141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.308756276965141e-05, "grad_norm": 18.888904571533203, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.8797307014465332, "num_tokens": 699315721.0, "step": 18326 }, { "epoch": 2.331382775728279, "ewc_loss": 0.032927028834819794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.292702967883088e-05, "grad_norm": 19.026050567626953, "learning_rate": 1e-06, "loss": 0.3517, "mean_token_accuracy": 0.8849009871482849, "num_tokens": 699350342.0, "step": 18327 }, { "epoch": 2.3315099860068695, "ewc_loss": 0.03305808827280998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3058087865356356e-05, "grad_norm": 18.89723014831543, "learning_rate": 1e-06, "loss": 0.38, "mean_token_accuracy": 0.8788056373596191, "num_tokens": 699390301.0, "step": 18328 }, { "epoch": 2.3316371962854596, "ewc_loss": 0.03296912834048271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296912836958654e-05, "grad_norm": 19.03605079650879, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8593980073928833, "num_tokens": 699427135.0, "step": 18329 }, { "epoch": 2.3317644065640506, "ewc_loss": 0.03309686481952667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3096865081461146e-05, "grad_norm": 18.952547073364258, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.8792269229888916, "num_tokens": 699466461.0, "step": 18330 }, { "epoch": 2.3318916168426407, "ewc_loss": 0.032980211079120636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2980209653032944e-05, "grad_norm": 19.027509689331055, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8662600517272949, "num_tokens": 699502446.0, "step": 18331 }, { "epoch": 2.332018827121231, "ewc_loss": 0.033073797821998596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.307379665784538e-05, "grad_norm": 18.943361282348633, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8737257719039917, "num_tokens": 699546357.0, "step": 18332 }, { "epoch": 2.3321460373998217, "ewc_loss": 0.03301924839615822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3019248803611845e-05, "grad_norm": 18.964975357055664, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.872145414352417, "num_tokens": 699581879.0, "step": 18333 }, { "epoch": 2.3322732476784123, "ewc_loss": 0.03310009837150574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.310009924462065e-05, "grad_norm": 19.007835388183594, "learning_rate": 1e-06, "loss": 0.3487, "mean_token_accuracy": 0.8870817422866821, "num_tokens": 699616287.0, "step": 18334 }, { "epoch": 2.332400457957003, "ewc_loss": 0.03297828137874603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2978281524265185e-05, "grad_norm": 18.95814323425293, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8716986179351807, "num_tokens": 699655409.0, "step": 18335 }, { "epoch": 2.3325276682355933, "ewc_loss": 0.03304540365934372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.304540223325603e-05, "grad_norm": 19.0320987701416, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8478538990020752, "num_tokens": 699701420.0, "step": 18336 }, { "epoch": 2.332654878514184, "ewc_loss": 0.03301990404725075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.301990363979712e-05, "grad_norm": 19.049602508544922, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8750466108322144, "num_tokens": 699736151.0, "step": 18337 }, { "epoch": 2.3327820887927744, "ewc_loss": 0.03296471759676933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296471913927235e-05, "grad_norm": 19.010208129882812, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8662141561508179, "num_tokens": 699770111.0, "step": 18338 }, { "epoch": 2.332909299071365, "ewc_loss": 0.032960060983896255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296006252639927e-05, "grad_norm": 19.014633178710938, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8691744804382324, "num_tokens": 699807442.0, "step": 18339 }, { "epoch": 2.3330365093499554, "ewc_loss": 0.03295864909887314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295864735264331e-05, "grad_norm": 18.92837905883789, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8705359101295471, "num_tokens": 699849364.0, "step": 18340 }, { "epoch": 2.333163719628546, "ewc_loss": 0.032884374260902405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.288437437731773e-05, "grad_norm": 18.946857452392578, "learning_rate": 1e-06, "loss": 0.3496, "mean_token_accuracy": 0.8876209855079651, "num_tokens": 699886234.0, "step": 18341 }, { "epoch": 2.3332909299071365, "ewc_loss": 0.03298301249742508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2983010896714404e-05, "grad_norm": 18.989017486572266, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8565674424171448, "num_tokens": 699925660.0, "step": 18342 }, { "epoch": 2.333418140185727, "ewc_loss": 0.03292866423726082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.292866313131526e-05, "grad_norm": 18.98341178894043, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.8458433151245117, "num_tokens": 699967894.0, "step": 18343 }, { "epoch": 2.3335453504643175, "ewc_loss": 0.03298405185341835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298405135865323e-05, "grad_norm": 18.982892990112305, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8676548600196838, "num_tokens": 699999211.0, "step": 18344 }, { "epoch": 2.333672560742908, "ewc_loss": 0.032938092947006226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293809277238324e-05, "grad_norm": 18.96354866027832, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8811802864074707, "num_tokens": 700037639.0, "step": 18345 }, { "epoch": 2.3337997710214986, "ewc_loss": 0.03298303857445717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298304000054486e-05, "grad_norm": 18.995166778564453, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8667988181114197, "num_tokens": 700075677.0, "step": 18346 }, { "epoch": 2.333926981300089, "ewc_loss": 0.03293323889374733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293323970865458e-05, "grad_norm": 18.917997360229492, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.8792934417724609, "num_tokens": 700115697.0, "step": 18347 }, { "epoch": 2.3340541915786797, "ewc_loss": 0.03289606422185898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289606320322491e-05, "grad_norm": 18.93154525756836, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8560762405395508, "num_tokens": 700152553.0, "step": 18348 }, { "epoch": 2.33418140185727, "ewc_loss": 0.03293471410870552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2934713090071455e-05, "grad_norm": 18.86878204345703, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8799413442611694, "num_tokens": 700190958.0, "step": 18349 }, { "epoch": 2.3343086121358607, "ewc_loss": 0.03293575718998909, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293575718998909e-05, "grad_norm": 18.975553512573242, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8711715340614319, "num_tokens": 700224244.0, "step": 18350 }, { "epoch": 2.3344358224144512, "ewc_loss": 0.033003564924001694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.300356547697447e-05, "grad_norm": 19.009918212890625, "learning_rate": 1e-06, "loss": 0.3178, "mean_token_accuracy": 0.8990013003349304, "num_tokens": 700259736.0, "step": 18351 }, { "epoch": 2.3345630326930418, "ewc_loss": 0.033020343631505966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302034383523278e-05, "grad_norm": 18.97382354736328, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8732795715332031, "num_tokens": 700293624.0, "step": 18352 }, { "epoch": 2.3346902429716323, "ewc_loss": 0.032943859696388245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.294385896879248e-05, "grad_norm": 18.972888946533203, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.8775686025619507, "num_tokens": 700328052.0, "step": 18353 }, { "epoch": 2.3348174532502224, "ewc_loss": 0.03301321342587471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.301321339677088e-05, "grad_norm": 18.94710350036621, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8654841780662537, "num_tokens": 700364652.0, "step": 18354 }, { "epoch": 2.3349446635288134, "ewc_loss": 0.03301119804382324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.301119795651175e-05, "grad_norm": 18.98069190979004, "learning_rate": 1e-06, "loss": 0.3528, "mean_token_accuracy": 0.8853399157524109, "num_tokens": 700403698.0, "step": 18355 }, { "epoch": 2.3350718738074034, "ewc_loss": 0.0330282598733902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302826007711701e-05, "grad_norm": 18.91559600830078, "learning_rate": 1e-06, "loss": 0.3555, "mean_token_accuracy": 0.887625515460968, "num_tokens": 700439391.0, "step": 18356 }, { "epoch": 2.335199084085994, "ewc_loss": 0.0330062136054039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3006213925546035e-05, "grad_norm": 18.911998748779297, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8767165541648865, "num_tokens": 700475095.0, "step": 18357 }, { "epoch": 2.3353262943645845, "ewc_loss": 0.03303561732172966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.303561607026495e-05, "grad_norm": 19.010326385498047, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8771408796310425, "num_tokens": 700514167.0, "step": 18358 }, { "epoch": 2.335453504643175, "ewc_loss": 0.03305867686867714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3058677217923105e-05, "grad_norm": 18.959714889526367, "learning_rate": 1e-06, "loss": 0.3663, "mean_token_accuracy": 0.8818246126174927, "num_tokens": 700548333.0, "step": 18359 }, { "epoch": 2.3355807149217656, "ewc_loss": 0.03302282467484474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3022824936779216e-05, "grad_norm": 19.01645278930664, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8782652616500854, "num_tokens": 700594261.0, "step": 18360 }, { "epoch": 2.335707925200356, "ewc_loss": 0.03306451067328453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.306450889795087e-05, "grad_norm": 18.953535079956055, "learning_rate": 1e-06, "loss": 0.3789, "mean_token_accuracy": 0.8788146376609802, "num_tokens": 700632513.0, "step": 18361 }, { "epoch": 2.3358351354789466, "ewc_loss": 0.03295201063156128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2952011679299176e-05, "grad_norm": 18.9555721282959, "learning_rate": 1e-06, "loss": 0.379, "mean_token_accuracy": 0.8799718022346497, "num_tokens": 700672430.0, "step": 18362 }, { "epoch": 2.335962345757537, "ewc_loss": 0.03304484859108925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.304484926047735e-05, "grad_norm": 18.964269638061523, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.8792705535888672, "num_tokens": 700714321.0, "step": 18363 }, { "epoch": 2.3360895560361277, "ewc_loss": 0.03300632908940315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.300633034086786e-05, "grad_norm": 18.988615036010742, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8736677169799805, "num_tokens": 700751714.0, "step": 18364 }, { "epoch": 2.336216766314718, "ewc_loss": 0.0329691581428051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2969157473417e-05, "grad_norm": 18.96712875366211, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8704149723052979, "num_tokens": 700794087.0, "step": 18365 }, { "epoch": 2.3363439765933087, "ewc_loss": 0.032936543226242065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293654299341142e-05, "grad_norm": 18.932941436767578, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8682004809379578, "num_tokens": 700832031.0, "step": 18366 }, { "epoch": 2.3364711868718993, "ewc_loss": 0.03299688547849655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.299688614788465e-05, "grad_norm": 19.027685165405273, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8758042454719543, "num_tokens": 700867175.0, "step": 18367 }, { "epoch": 2.33659839715049, "ewc_loss": 0.03296973928809166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2969739550026134e-05, "grad_norm": 18.96967124938965, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.879801869392395, "num_tokens": 700906762.0, "step": 18368 }, { "epoch": 2.3367256074290803, "ewc_loss": 0.03298762068152428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298762021586299e-05, "grad_norm": 18.992216110229492, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8762985467910767, "num_tokens": 700944316.0, "step": 18369 }, { "epoch": 2.336852817707671, "ewc_loss": 0.03291665017604828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2916650525294244e-05, "grad_norm": 19.03107452392578, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8693054914474487, "num_tokens": 700979369.0, "step": 18370 }, { "epoch": 2.3369800279862614, "ewc_loss": 0.03299107775092125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2991076295729727e-05, "grad_norm": 19.01569938659668, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.8700252771377563, "num_tokens": 701016213.0, "step": 18371 }, { "epoch": 2.337107238264852, "ewc_loss": 0.032921552658081055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29215508827474e-05, "grad_norm": 18.958721160888672, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8714295625686646, "num_tokens": 701055270.0, "step": 18372 }, { "epoch": 2.3372344485434424, "ewc_loss": 0.03293297067284584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293297049822286e-05, "grad_norm": 18.902565002441406, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8716442584991455, "num_tokens": 701091058.0, "step": 18373 }, { "epoch": 2.337361658822033, "ewc_loss": 0.032999925315380096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2999923860188574e-05, "grad_norm": 19.093963623046875, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8811997175216675, "num_tokens": 701129446.0, "step": 18374 }, { "epoch": 2.3374888691006235, "ewc_loss": 0.03305929899215698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305929931229912e-05, "grad_norm": 18.972530364990234, "learning_rate": 1e-06, "loss": 0.3795, "mean_token_accuracy": 0.87710040807724, "num_tokens": 701168726.0, "step": 18375 }, { "epoch": 2.337616079379214, "ewc_loss": 0.03292010352015495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2920102967182174e-05, "grad_norm": 19.02529525756836, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8672029376029968, "num_tokens": 701204041.0, "step": 18376 }, { "epoch": 2.3377432896578045, "ewc_loss": 0.03303035721182823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3030355552909896e-05, "grad_norm": 19.026212692260742, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8522253036499023, "num_tokens": 701244660.0, "step": 18377 }, { "epoch": 2.337870499936395, "ewc_loss": 0.03291606530547142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29160648107063e-05, "grad_norm": 19.01205825805664, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.864782452583313, "num_tokens": 701284830.0, "step": 18378 }, { "epoch": 2.337997710214985, "ewc_loss": 0.0329347588121891, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293476038379595e-05, "grad_norm": 19.00175666809082, "learning_rate": 1e-06, "loss": 0.3655, "mean_token_accuracy": 0.8811755180358887, "num_tokens": 701319720.0, "step": 18379 }, { "epoch": 2.338124920493576, "ewc_loss": 0.03297794237732887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2977943192236125e-05, "grad_norm": 19.00696563720703, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.852980375289917, "num_tokens": 701361770.0, "step": 18380 }, { "epoch": 2.338252130772166, "ewc_loss": 0.03293187543749809, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293187546660192e-05, "grad_norm": 18.93920135498047, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.869547426700592, "num_tokens": 701399777.0, "step": 18381 }, { "epoch": 2.3383793410507567, "ewc_loss": 0.032980091869831085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298009323771112e-05, "grad_norm": 19.024272918701172, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8725205659866333, "num_tokens": 701432680.0, "step": 18382 }, { "epoch": 2.3385065513293473, "ewc_loss": 0.033022310584783554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3022311981767416e-05, "grad_norm": 18.977569580078125, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.8842093348503113, "num_tokens": 701474101.0, "step": 18383 }, { "epoch": 2.338633761607938, "ewc_loss": 0.03295040875673294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295040733064525e-05, "grad_norm": 19.044401168823242, "learning_rate": 1e-06, "loss": 0.3613, "mean_token_accuracy": 0.885657548904419, "num_tokens": 701517568.0, "step": 18384 }, { "epoch": 2.3387609718865283, "ewc_loss": 0.03299909085035324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.299909076304175e-05, "grad_norm": 19.017173767089844, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8666828870773315, "num_tokens": 701558466.0, "step": 18385 }, { "epoch": 2.338888182165119, "ewc_loss": 0.03289561718702316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289561573183164e-05, "grad_norm": 18.91623878479004, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8682752251625061, "num_tokens": 701591104.0, "step": 18386 }, { "epoch": 2.3390153924437094, "ewc_loss": 0.03297195956110954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.297195871709846e-05, "grad_norm": 19.01949691772461, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8664095997810364, "num_tokens": 701633242.0, "step": 18387 }, { "epoch": 2.3391426027223, "ewc_loss": 0.032980453222990036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298045339761302e-05, "grad_norm": 18.903587341308594, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8724508285522461, "num_tokens": 701668864.0, "step": 18388 }, { "epoch": 2.3392698130008904, "ewc_loss": 0.032966550439596176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2966549042612314e-05, "grad_norm": 19.019607543945312, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8777593374252319, "num_tokens": 701703648.0, "step": 18389 }, { "epoch": 2.339397023279481, "ewc_loss": 0.033027470111846924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302747063571587e-05, "grad_norm": 19.010204315185547, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8700003027915955, "num_tokens": 701744417.0, "step": 18390 }, { "epoch": 2.3395242335580715, "ewc_loss": 0.03299907222390175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2999072573147714e-05, "grad_norm": 18.95811653137207, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8422328233718872, "num_tokens": 701783541.0, "step": 18391 }, { "epoch": 2.339651443836662, "ewc_loss": 0.03302618861198425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3026190067175776e-05, "grad_norm": 19.033103942871094, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8688315153121948, "num_tokens": 701825054.0, "step": 18392 }, { "epoch": 2.3397786541152525, "ewc_loss": 0.03298403322696686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29840331687592e-05, "grad_norm": 18.967140197753906, "learning_rate": 1e-06, "loss": 0.3738, "mean_token_accuracy": 0.879722535610199, "num_tokens": 701866173.0, "step": 18393 }, { "epoch": 2.339905864393843, "ewc_loss": 0.03296074643731117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2960746466415e-05, "grad_norm": 18.959436416625977, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8695567846298218, "num_tokens": 701904812.0, "step": 18394 }, { "epoch": 2.3400330746724336, "ewc_loss": 0.03302295133471489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3022952266037464e-05, "grad_norm": 18.974599838256836, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8633257150650024, "num_tokens": 701942590.0, "step": 18395 }, { "epoch": 2.340160284951024, "ewc_loss": 0.03297992795705795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29799295286648e-05, "grad_norm": 19.055723190307617, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8762664794921875, "num_tokens": 701980773.0, "step": 18396 }, { "epoch": 2.3402874952296147, "ewc_loss": 0.03299342840909958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2993430068017915e-05, "grad_norm": 18.934545516967773, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.8866293430328369, "num_tokens": 702014508.0, "step": 18397 }, { "epoch": 2.340414705508205, "ewc_loss": 0.03297625482082367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2976255170069635e-05, "grad_norm": 19.004825592041016, "learning_rate": 1e-06, "loss": 0.3624, "mean_token_accuracy": 0.885755181312561, "num_tokens": 702055754.0, "step": 18398 }, { "epoch": 2.3405419157867957, "ewc_loss": 0.033043231815099716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.304323035990819e-05, "grad_norm": 18.930395126342773, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8715664148330688, "num_tokens": 702100022.0, "step": 18399 }, { "epoch": 2.3406691260653862, "ewc_loss": 0.032983213663101196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29832146235276e-05, "grad_norm": 18.981842041015625, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8633047938346863, "num_tokens": 702133217.0, "step": 18400 }, { "epoch": 2.3407963363439768, "ewc_loss": 0.03296113386750221, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2961132092168555e-05, "grad_norm": 18.88888168334961, "learning_rate": 1e-06, "loss": 0.3602, "mean_token_accuracy": 0.8858155012130737, "num_tokens": 702176998.0, "step": 18401 }, { "epoch": 2.3409235466225673, "ewc_loss": 0.03298282250761986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2982821721816435e-05, "grad_norm": 18.97261619567871, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.8793630599975586, "num_tokens": 702216219.0, "step": 18402 }, { "epoch": 2.341050756901158, "ewc_loss": 0.03307591378688812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3075913961511105e-05, "grad_norm": 19.050107955932617, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8679800033569336, "num_tokens": 702246502.0, "step": 18403 }, { "epoch": 2.341177967179748, "ewc_loss": 0.03301732614636421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.301732431282289e-05, "grad_norm": 19.068965911865234, "learning_rate": 1e-06, "loss": 0.3663, "mean_token_accuracy": 0.880403995513916, "num_tokens": 702277990.0, "step": 18404 }, { "epoch": 2.341305177458339, "ewc_loss": 0.033062081784009933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.306208236608654e-05, "grad_norm": 19.01759910583496, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8759219646453857, "num_tokens": 702312742.0, "step": 18405 }, { "epoch": 2.341432387736929, "ewc_loss": 0.0329800546169281, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298005321994424e-05, "grad_norm": 18.967214584350586, "learning_rate": 1e-06, "loss": 0.3504, "mean_token_accuracy": 0.8881239295005798, "num_tokens": 702348575.0, "step": 18406 }, { "epoch": 2.3415595980155195, "ewc_loss": 0.03303299471735954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.303299308754504e-05, "grad_norm": 19.017425537109375, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8623459339141846, "num_tokens": 702385956.0, "step": 18407 }, { "epoch": 2.34168680829411, "ewc_loss": 0.03303167223930359, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.303167250123806e-05, "grad_norm": 19.004276275634766, "learning_rate": 1e-06, "loss": 0.3738, "mean_token_accuracy": 0.8788678646087646, "num_tokens": 702423973.0, "step": 18408 }, { "epoch": 2.3418140185727006, "ewc_loss": 0.03305497020483017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305497011751868e-05, "grad_norm": 18.992589950561523, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.870023787021637, "num_tokens": 702455508.0, "step": 18409 }, { "epoch": 2.341941228851291, "ewc_loss": 0.033056143671274185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305614518467337e-05, "grad_norm": 18.993005752563477, "learning_rate": 1e-06, "loss": 0.3724, "mean_token_accuracy": 0.8835727572441101, "num_tokens": 702493365.0, "step": 18410 }, { "epoch": 2.3420684391298816, "ewc_loss": 0.03302584961056709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302584809716791e-05, "grad_norm": 18.946317672729492, "learning_rate": 1e-06, "loss": 0.3852, "mean_token_accuracy": 0.8753757476806641, "num_tokens": 702528036.0, "step": 18411 }, { "epoch": 2.342195649408472, "ewc_loss": 0.03304041922092438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.304041820229031e-05, "grad_norm": 19.001882553100586, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8679730296134949, "num_tokens": 702571238.0, "step": 18412 }, { "epoch": 2.3423228596870627, "ewc_loss": 0.03306721895933151, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3067219192162156e-05, "grad_norm": 18.985239028930664, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8688977956771851, "num_tokens": 702611247.0, "step": 18413 }, { "epoch": 2.342450069965653, "ewc_loss": 0.03304402530193329, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.304402343928814e-05, "grad_norm": 18.99140167236328, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8749099969863892, "num_tokens": 702649661.0, "step": 18414 }, { "epoch": 2.3425772802442437, "ewc_loss": 0.033066414296627045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.306641519884579e-05, "grad_norm": 19.03985023498535, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8667576313018799, "num_tokens": 702693573.0, "step": 18415 }, { "epoch": 2.3427044905228342, "ewc_loss": 0.033050261437892914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.30502625729423e-05, "grad_norm": 18.982589721679688, "learning_rate": 1e-06, "loss": 0.358, "mean_token_accuracy": 0.8845723271369934, "num_tokens": 702735351.0, "step": 18416 }, { "epoch": 2.3428317008014248, "ewc_loss": 0.03298923000693321, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2989228202495724e-05, "grad_norm": 19.015270233154297, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8791804313659668, "num_tokens": 702773406.0, "step": 18417 }, { "epoch": 2.3429589110800153, "ewc_loss": 0.03306049853563309, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.306049984530546e-05, "grad_norm": 18.970993041992188, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8793781995773315, "num_tokens": 702809939.0, "step": 18418 }, { "epoch": 2.343086121358606, "ewc_loss": 0.032963238656520844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296323848189786e-05, "grad_norm": 18.99716567993164, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8707316517829895, "num_tokens": 702851120.0, "step": 18419 }, { "epoch": 2.3432133316371964, "ewc_loss": 0.032942429184913635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2942429243121296e-05, "grad_norm": 18.929636001586914, "learning_rate": 1e-06, "loss": 0.3676, "mean_token_accuracy": 0.8813889026641846, "num_tokens": 702889492.0, "step": 18420 }, { "epoch": 2.343340541915787, "ewc_loss": 0.03302033245563507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302033292129636e-05, "grad_norm": 18.944902420043945, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8675107955932617, "num_tokens": 702930486.0, "step": 18421 }, { "epoch": 2.3434677521943774, "ewc_loss": 0.03296411409974098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296411523479037e-05, "grad_norm": 18.95418930053711, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8652897477149963, "num_tokens": 702969866.0, "step": 18422 }, { "epoch": 2.343594962472968, "ewc_loss": 0.032945871353149414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2945870771072805e-05, "grad_norm": 18.91206169128418, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8646016120910645, "num_tokens": 703013703.0, "step": 18423 }, { "epoch": 2.3437221727515585, "ewc_loss": 0.03297116234898567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2971161999739707e-05, "grad_norm": 18.99291229248047, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8628754615783691, "num_tokens": 703054849.0, "step": 18424 }, { "epoch": 2.343849383030149, "ewc_loss": 0.03299139812588692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.299139643786475e-05, "grad_norm": 18.944717407226562, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8685925006866455, "num_tokens": 703088250.0, "step": 18425 }, { "epoch": 2.3439765933087395, "ewc_loss": 0.03299909830093384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2999098038999364e-05, "grad_norm": 19.099721908569336, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8692675828933716, "num_tokens": 703124072.0, "step": 18426 }, { "epoch": 2.3441038035873296, "ewc_loss": 0.033017996698617935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3017997338902205e-05, "grad_norm": 18.941062927246094, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8569322824478149, "num_tokens": 703158869.0, "step": 18427 }, { "epoch": 2.3442310138659206, "ewc_loss": 0.032919496297836304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291949542472139e-05, "grad_norm": 18.950767517089844, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8750268816947937, "num_tokens": 703193342.0, "step": 18428 }, { "epoch": 2.3443582241445107, "ewc_loss": 0.03305153176188469, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305153222754598e-05, "grad_norm": 18.961395263671875, "learning_rate": 1e-06, "loss": 0.3549, "mean_token_accuracy": 0.8847348690032959, "num_tokens": 703229899.0, "step": 18429 }, { "epoch": 2.344485434423101, "ewc_loss": 0.03295551985502243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295551869086921e-05, "grad_norm": 18.904848098754883, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8719111680984497, "num_tokens": 703266735.0, "step": 18430 }, { "epoch": 2.3446126447016917, "ewc_loss": 0.03300391882658005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.300391836091876e-05, "grad_norm": 18.967517852783203, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8691956996917725, "num_tokens": 703307515.0, "step": 18431 }, { "epoch": 2.3447398549802823, "ewc_loss": 0.033014725893735886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.301472679595463e-05, "grad_norm": 18.954679489135742, "learning_rate": 1e-06, "loss": 0.3487, "mean_token_accuracy": 0.884207010269165, "num_tokens": 703341237.0, "step": 18432 }, { "epoch": 2.344867065258873, "ewc_loss": 0.033017296344041824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3017295208992437e-05, "grad_norm": 18.922636032104492, "learning_rate": 1e-06, "loss": 0.3627, "mean_token_accuracy": 0.8839884996414185, "num_tokens": 703378900.0, "step": 18433 }, { "epoch": 2.3449942755374633, "ewc_loss": 0.03302934020757675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302934055682272e-05, "grad_norm": 19.002838134765625, "learning_rate": 1e-06, "loss": 0.3679, "mean_token_accuracy": 0.8821344971656799, "num_tokens": 703415260.0, "step": 18434 }, { "epoch": 2.345121485816054, "ewc_loss": 0.03305618092417717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305618156446144e-05, "grad_norm": 18.969858169555664, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8783343434333801, "num_tokens": 703452399.0, "step": 18435 }, { "epoch": 2.3452486960946444, "ewc_loss": 0.03303790092468262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3037900720955804e-05, "grad_norm": 18.990297317504883, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8746225833892822, "num_tokens": 703492700.0, "step": 18436 }, { "epoch": 2.345375906373235, "ewc_loss": 0.033127933740615845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.312793342047371e-05, "grad_norm": 19.106008529663086, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.8851826190948486, "num_tokens": 703530805.0, "step": 18437 }, { "epoch": 2.3455031166518254, "ewc_loss": 0.033042363822460175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3042364520952106e-05, "grad_norm": 18.947162628173828, "learning_rate": 1e-06, "loss": 0.3738, "mean_token_accuracy": 0.8824812769889832, "num_tokens": 703571894.0, "step": 18438 }, { "epoch": 2.345630326930416, "ewc_loss": 0.03295871615409851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2958716474240646e-05, "grad_norm": 18.9981632232666, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.8793142437934875, "num_tokens": 703612583.0, "step": 18439 }, { "epoch": 2.3457575372090065, "ewc_loss": 0.033026497811079025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302649929537438e-05, "grad_norm": 18.956003189086914, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8671761751174927, "num_tokens": 703649100.0, "step": 18440 }, { "epoch": 2.345884747487597, "ewc_loss": 0.03301544860005379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3015447115758434e-05, "grad_norm": 19.00550651550293, "learning_rate": 1e-06, "loss": 0.3519, "mean_token_accuracy": 0.889015793800354, "num_tokens": 703680425.0, "step": 18441 }, { "epoch": 2.3460119577661875, "ewc_loss": 0.03301677852869034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.301677861600183e-05, "grad_norm": 18.95857048034668, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8802214860916138, "num_tokens": 703721819.0, "step": 18442 }, { "epoch": 2.346139168044778, "ewc_loss": 0.032994601875543594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29946014971938e-05, "grad_norm": 18.94070816040039, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8707860708236694, "num_tokens": 703755432.0, "step": 18443 }, { "epoch": 2.3462663783233686, "ewc_loss": 0.03298870101571083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2988700695568696e-05, "grad_norm": 18.869775772094727, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8725847005844116, "num_tokens": 703795903.0, "step": 18444 }, { "epoch": 2.346393588601959, "ewc_loss": 0.03304416313767433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.304416168248281e-05, "grad_norm": 18.96011734008789, "learning_rate": 1e-06, "loss": 0.3585, "mean_token_accuracy": 0.884716272354126, "num_tokens": 703836074.0, "step": 18445 }, { "epoch": 2.3465207988805497, "ewc_loss": 0.03303106874227524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3031068596756086e-05, "grad_norm": 18.99365234375, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.8780025839805603, "num_tokens": 703870609.0, "step": 18446 }, { "epoch": 2.34664800915914, "ewc_loss": 0.03301284834742546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.301284959889017e-05, "grad_norm": 18.92515754699707, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8789073824882507, "num_tokens": 703909034.0, "step": 18447 }, { "epoch": 2.3467752194377307, "ewc_loss": 0.03299787640571594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.299787567812018e-05, "grad_norm": 18.970359802246094, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8658128380775452, "num_tokens": 703946976.0, "step": 18448 }, { "epoch": 2.3469024297163212, "ewc_loss": 0.03309224918484688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309224848635495e-05, "grad_norm": 18.9655818939209, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8765826225280762, "num_tokens": 703986379.0, "step": 18449 }, { "epoch": 2.3470296399949118, "ewc_loss": 0.033040497452020645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.304049823782407e-05, "grad_norm": 18.966726303100586, "learning_rate": 1e-06, "loss": 0.336, "mean_token_accuracy": 0.8920589089393616, "num_tokens": 704019461.0, "step": 18450 }, { "epoch": 2.3471568502735023, "ewc_loss": 0.033049575984478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.304957499494776e-05, "grad_norm": 18.969430923461914, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8606094121932983, "num_tokens": 704053411.0, "step": 18451 }, { "epoch": 2.3472840605520924, "ewc_loss": 0.03305722773075104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305722930235788e-05, "grad_norm": 19.039438247680664, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.8785120844841003, "num_tokens": 704092633.0, "step": 18452 }, { "epoch": 2.3474112708306833, "ewc_loss": 0.03301437199115753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3014370274031535e-05, "grad_norm": 18.990034103393555, "learning_rate": 1e-06, "loss": 0.3634, "mean_token_accuracy": 0.8779857754707336, "num_tokens": 704126955.0, "step": 18453 }, { "epoch": 2.3475384811092734, "ewc_loss": 0.032981399446725845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298139927210286e-05, "grad_norm": 18.97315216064453, "learning_rate": 1e-06, "loss": 0.346, "mean_token_accuracy": 0.8905311226844788, "num_tokens": 704162222.0, "step": 18454 }, { "epoch": 2.347665691387864, "ewc_loss": 0.03299962356686592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2999621907947585e-05, "grad_norm": 19.01133155822754, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8745170831680298, "num_tokens": 704198949.0, "step": 18455 }, { "epoch": 2.3477929016664545, "ewc_loss": 0.03305676206946373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3056763641070575e-05, "grad_norm": 18.976457595825195, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8690587282180786, "num_tokens": 704235327.0, "step": 18456 }, { "epoch": 2.347920111945045, "ewc_loss": 0.033010076731443405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3010077459039167e-05, "grad_norm": 19.081335067749023, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8805468082427979, "num_tokens": 704272783.0, "step": 18457 }, { "epoch": 2.3480473222236355, "ewc_loss": 0.0330577977001667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305779682705179e-05, "grad_norm": 18.941160202026367, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8570133447647095, "num_tokens": 704314396.0, "step": 18458 }, { "epoch": 2.348174532502226, "ewc_loss": 0.03300674632191658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.300674507045187e-05, "grad_norm": 19.049583435058594, "learning_rate": 1e-06, "loss": 0.3428, "mean_token_accuracy": 0.8905352354049683, "num_tokens": 704346903.0, "step": 18459 }, { "epoch": 2.3483017427808166, "ewc_loss": 0.033098090440034866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3098091080319136e-05, "grad_norm": 18.968416213989258, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8812363147735596, "num_tokens": 704380456.0, "step": 18460 }, { "epoch": 2.348428953059407, "ewc_loss": 0.03294152021408081, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.294151974841952e-05, "grad_norm": 18.96489715576172, "learning_rate": 1e-06, "loss": 0.3544, "mean_token_accuracy": 0.8857587575912476, "num_tokens": 704416403.0, "step": 18461 }, { "epoch": 2.3485561633379977, "ewc_loss": 0.033024951815605164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3024953154381365e-05, "grad_norm": 18.90526580810547, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8756213188171387, "num_tokens": 704460431.0, "step": 18462 }, { "epoch": 2.348683373616588, "ewc_loss": 0.03295170143246651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295170245110057e-05, "grad_norm": 18.987918853759766, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.8821877837181091, "num_tokens": 704497872.0, "step": 18463 }, { "epoch": 2.3488105838951787, "ewc_loss": 0.03304441645741463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3044416340999305e-05, "grad_norm": 18.946941375732422, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8762282133102417, "num_tokens": 704538180.0, "step": 18464 }, { "epoch": 2.3489377941737692, "ewc_loss": 0.03306061774492264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3060616260627285e-05, "grad_norm": 19.061063766479492, "learning_rate": 1e-06, "loss": 0.3556, "mean_token_accuracy": 0.8858556151390076, "num_tokens": 704578342.0, "step": 18465 }, { "epoch": 2.3490650044523598, "ewc_loss": 0.03305397182703018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3053973311325535e-05, "grad_norm": 18.988000869750977, "learning_rate": 1e-06, "loss": 0.3513, "mean_token_accuracy": 0.8860228657722473, "num_tokens": 704611792.0, "step": 18466 }, { "epoch": 2.3491922147309503, "ewc_loss": 0.032988958060741425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2988958992064e-05, "grad_norm": 19.01094627380371, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8767963647842407, "num_tokens": 704649898.0, "step": 18467 }, { "epoch": 2.349319425009541, "ewc_loss": 0.03300432488322258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3004325814545155e-05, "grad_norm": 19.022417068481445, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.8829923868179321, "num_tokens": 704690632.0, "step": 18468 }, { "epoch": 2.3494466352881314, "ewc_loss": 0.03294391185045242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.294391353847459e-05, "grad_norm": 18.97014045715332, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.865301251411438, "num_tokens": 704722955.0, "step": 18469 }, { "epoch": 2.349573845566722, "ewc_loss": 0.03299985080957413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.299985110061243e-05, "grad_norm": 18.99015235900879, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8735024929046631, "num_tokens": 704762678.0, "step": 18470 }, { "epoch": 2.3497010558453124, "ewc_loss": 0.0330076701939106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.300766911706887e-05, "grad_norm": 18.984926223754883, "learning_rate": 1e-06, "loss": 0.3174, "mean_token_accuracy": 0.8994214534759521, "num_tokens": 704805440.0, "step": 18471 }, { "epoch": 2.349828266123903, "ewc_loss": 0.032977230846881866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2977230148389935e-05, "grad_norm": 19.01272964477539, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8552559614181519, "num_tokens": 704848153.0, "step": 18472 }, { "epoch": 2.3499554764024935, "ewc_loss": 0.03297358378767967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.297358489362523e-05, "grad_norm": 18.96683120727539, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8721218109130859, "num_tokens": 704886739.0, "step": 18473 }, { "epoch": 2.350082686681084, "ewc_loss": 0.032937075942754745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2937077776296064e-05, "grad_norm": 18.98407554626465, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8741005659103394, "num_tokens": 704926145.0, "step": 18474 }, { "epoch": 2.3502098969596745, "ewc_loss": 0.032938893884420395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29388931277208e-05, "grad_norm": 18.999053955078125, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8596789836883545, "num_tokens": 704964135.0, "step": 18475 }, { "epoch": 2.350337107238265, "ewc_loss": 0.03295876458287239, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295876376796514e-05, "grad_norm": 19.034862518310547, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.863369345664978, "num_tokens": 705002171.0, "step": 18476 }, { "epoch": 2.350464317516855, "ewc_loss": 0.033012114465236664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.301211472717114e-05, "grad_norm": 19.01598358154297, "learning_rate": 1e-06, "loss": 0.3295, "mean_token_accuracy": 0.8941648006439209, "num_tokens": 705040660.0, "step": 18477 }, { "epoch": 2.350591527795446, "ewc_loss": 0.03285197168588638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.285197090008296e-05, "grad_norm": 18.901668548583984, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.8806561231613159, "num_tokens": 705073655.0, "step": 18478 }, { "epoch": 2.350718738074036, "ewc_loss": 0.032950710505247116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295070928288624e-05, "grad_norm": 19.004297256469727, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8721187114715576, "num_tokens": 705115497.0, "step": 18479 }, { "epoch": 2.3508459483526267, "ewc_loss": 0.033005956560373306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.300595562905073e-05, "grad_norm": 18.98944854736328, "learning_rate": 1e-06, "loss": 0.3405, "mean_token_accuracy": 0.8919705748558044, "num_tokens": 705152642.0, "step": 18480 }, { "epoch": 2.3509731586312173, "ewc_loss": 0.03295677900314331, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2956777431536466e-05, "grad_norm": 19.0142822265625, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8599055409431458, "num_tokens": 705187282.0, "step": 18481 }, { "epoch": 2.351100368909808, "ewc_loss": 0.032967813313007355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296781505923718e-05, "grad_norm": 19.012683868408203, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.8782351613044739, "num_tokens": 705225635.0, "step": 18482 }, { "epoch": 2.3512275791883983, "ewc_loss": 0.03297516703605652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2975167414406314e-05, "grad_norm": 18.957883834838867, "learning_rate": 1e-06, "loss": 0.3529, "mean_token_accuracy": 0.8839722275733948, "num_tokens": 705260376.0, "step": 18483 }, { "epoch": 2.351354789466989, "ewc_loss": 0.03301132842898369, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3011328923748806e-05, "grad_norm": 19.090566635131836, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8739147186279297, "num_tokens": 705298799.0, "step": 18484 }, { "epoch": 2.3514819997455794, "ewc_loss": 0.03298237919807434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298237788840197e-05, "grad_norm": 18.932966232299805, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8570795059204102, "num_tokens": 705332570.0, "step": 18485 }, { "epoch": 2.35160921002417, "ewc_loss": 0.032946642488241196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.294664202257991e-05, "grad_norm": 19.102676391601562, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8762001991271973, "num_tokens": 705371214.0, "step": 18486 }, { "epoch": 2.3517364203027604, "ewc_loss": 0.03303117677569389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.30311777361203e-05, "grad_norm": 18.963655471801758, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8716028332710266, "num_tokens": 705405051.0, "step": 18487 }, { "epoch": 2.351863630581351, "ewc_loss": 0.032936133444309235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293613190180622e-05, "grad_norm": 19.010801315307617, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.873161256313324, "num_tokens": 705442460.0, "step": 18488 }, { "epoch": 2.3519908408599415, "ewc_loss": 0.03310222178697586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3102220186265185e-05, "grad_norm": 19.07901954650879, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8735263347625732, "num_tokens": 705482929.0, "step": 18489 }, { "epoch": 2.352118051138532, "ewc_loss": 0.03302852064371109, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302852201159112e-05, "grad_norm": 19.089393615722656, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8701304197311401, "num_tokens": 705525313.0, "step": 18490 }, { "epoch": 2.3522452614171225, "ewc_loss": 0.03295326232910156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2953263144008815e-05, "grad_norm": 18.9876651763916, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8688168525695801, "num_tokens": 705563730.0, "step": 18491 }, { "epoch": 2.352372471695713, "ewc_loss": 0.03300065919756889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3000658731907606e-05, "grad_norm": 19.05278778076172, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8773115277290344, "num_tokens": 705602459.0, "step": 18492 }, { "epoch": 2.3524996819743036, "ewc_loss": 0.03299138322472572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.299138188594952e-05, "grad_norm": 18.986164093017578, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8714936375617981, "num_tokens": 705636223.0, "step": 18493 }, { "epoch": 2.352626892252894, "ewc_loss": 0.033021166920661926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302116601844318e-05, "grad_norm": 19.0098819732666, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.873810887336731, "num_tokens": 705676178.0, "step": 18494 }, { "epoch": 2.3527541025314846, "ewc_loss": 0.033057939261198044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3057938708225265e-05, "grad_norm": 19.002906799316406, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8804795742034912, "num_tokens": 705716687.0, "step": 18495 }, { "epoch": 2.352881312810075, "ewc_loss": 0.03299398720264435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29939866787754e-05, "grad_norm": 18.948814392089844, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8812991380691528, "num_tokens": 705759163.0, "step": 18496 }, { "epoch": 2.3530085230886657, "ewc_loss": 0.03302977234125137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302977347630076e-05, "grad_norm": 19.08357810974121, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8764474391937256, "num_tokens": 705795982.0, "step": 18497 }, { "epoch": 2.3531357333672562, "ewc_loss": 0.03308100998401642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.308101076981984e-05, "grad_norm": 19.038911819458008, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8785914182662964, "num_tokens": 705835601.0, "step": 18498 }, { "epoch": 2.3532629436458468, "ewc_loss": 0.032946228981018066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29462272929959e-05, "grad_norm": 18.95277976989746, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8718820810317993, "num_tokens": 705876858.0, "step": 18499 }, { "epoch": 2.353390153924437, "ewc_loss": 0.03305082768201828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.30508264596574e-05, "grad_norm": 19.065664291381836, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8663395643234253, "num_tokens": 705921944.0, "step": 18500 }, { "epoch": 2.353517364203028, "ewc_loss": 0.032968778163194656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.296877912362106e-05, "grad_norm": 19.00714111328125, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8743684887886047, "num_tokens": 705967999.0, "step": 18501 }, { "epoch": 2.353644574481618, "ewc_loss": 0.03290816396474838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.29081631207373e-05, "grad_norm": 18.90785026550293, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8667842149734497, "num_tokens": 706003370.0, "step": 18502 }, { "epoch": 2.353771784760209, "ewc_loss": 0.032984327524900436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298432784504257e-05, "grad_norm": 19.08850860595703, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8636751770973206, "num_tokens": 706036551.0, "step": 18503 }, { "epoch": 2.353898995038799, "ewc_loss": 0.032992616295814514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2992615160765126e-05, "grad_norm": 18.88933753967285, "learning_rate": 1e-06, "loss": 0.3384, "mean_token_accuracy": 0.8915752172470093, "num_tokens": 706075965.0, "step": 18504 }, { "epoch": 2.3540262053173895, "ewc_loss": 0.03292911499738693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.292911424068734e-05, "grad_norm": 19.03333854675293, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8622186779975891, "num_tokens": 706117943.0, "step": 18505 }, { "epoch": 2.35415341559598, "ewc_loss": 0.033059995621442795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305999416625127e-05, "grad_norm": 18.94223403930664, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8782564997673035, "num_tokens": 706157183.0, "step": 18506 }, { "epoch": 2.3542806258745705, "ewc_loss": 0.032936934381723404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293693589512259e-05, "grad_norm": 19.01953887939453, "learning_rate": 1e-06, "loss": 0.3667, "mean_token_accuracy": 0.8831153512001038, "num_tokens": 706196878.0, "step": 18507 }, { "epoch": 2.354407836153161, "ewc_loss": 0.03308384492993355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3083844755310565e-05, "grad_norm": 19.073089599609375, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8614583611488342, "num_tokens": 706231102.0, "step": 18508 }, { "epoch": 2.3545350464317516, "ewc_loss": 0.032959118485450745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2959116651909426e-05, "grad_norm": 19.006103515625, "learning_rate": 1e-06, "loss": 0.3765, "mean_token_accuracy": 0.8821142911911011, "num_tokens": 706273372.0, "step": 18509 }, { "epoch": 2.354662256710342, "ewc_loss": 0.03298371285200119, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298371302662417e-05, "grad_norm": 19.01412010192871, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8678478598594666, "num_tokens": 706308106.0, "step": 18510 }, { "epoch": 2.3547894669889327, "ewc_loss": 0.03297774866223335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.297775037935935e-05, "grad_norm": 19.022422790527344, "learning_rate": 1e-06, "loss": 0.3629, "mean_token_accuracy": 0.882550060749054, "num_tokens": 706344267.0, "step": 18511 }, { "epoch": 2.354916677267523, "ewc_loss": 0.03296499699354172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2964995625661686e-05, "grad_norm": 18.974246978759766, "learning_rate": 1e-06, "loss": 0.3455, "mean_token_accuracy": 0.8928574919700623, "num_tokens": 706385019.0, "step": 18512 }, { "epoch": 2.3550438875461137, "ewc_loss": 0.03293126821517944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293126792414114e-05, "grad_norm": 18.98578643798828, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.873662531375885, "num_tokens": 706424913.0, "step": 18513 }, { "epoch": 2.3551710978247042, "ewc_loss": 0.0329878106713295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298780939076096e-05, "grad_norm": 18.9234561920166, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.875119686126709, "num_tokens": 706466064.0, "step": 18514 }, { "epoch": 2.3552983081032948, "ewc_loss": 0.0329754538834095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2975454814732075e-05, "grad_norm": 19.03772735595703, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8817230463027954, "num_tokens": 706504572.0, "step": 18515 }, { "epoch": 2.3554255183818853, "ewc_loss": 0.03300900012254715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.300900061731227e-05, "grad_norm": 19.04694938659668, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8683211803436279, "num_tokens": 706543901.0, "step": 18516 }, { "epoch": 2.355552728660476, "ewc_loss": 0.03297396004199982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.297395960544236e-05, "grad_norm": 19.03510093688965, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8681411147117615, "num_tokens": 706579095.0, "step": 18517 }, { "epoch": 2.3556799389390664, "ewc_loss": 0.03297329321503639, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.297329385532066e-05, "grad_norm": 18.94472885131836, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8671886920928955, "num_tokens": 706614078.0, "step": 18518 }, { "epoch": 2.355807149217657, "ewc_loss": 0.032938919961452484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293891859357245e-05, "grad_norm": 18.920827865600586, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8818185329437256, "num_tokens": 706652596.0, "step": 18519 }, { "epoch": 2.3559343594962474, "ewc_loss": 0.03307894989848137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.307895167381503e-05, "grad_norm": 19.051349639892578, "learning_rate": 1e-06, "loss": 0.3226, "mean_token_accuracy": 0.8978914618492126, "num_tokens": 706691297.0, "step": 18520 }, { "epoch": 2.356061569774838, "ewc_loss": 0.033036962151527405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3036962122423574e-05, "grad_norm": 18.932680130004883, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8631197810173035, "num_tokens": 706727344.0, "step": 18521 }, { "epoch": 2.3561887800534285, "ewc_loss": 0.03305714577436447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305714562884532e-05, "grad_norm": 19.04047966003418, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8777638077735901, "num_tokens": 706770411.0, "step": 18522 }, { "epoch": 2.356315990332019, "ewc_loss": 0.03308207914233208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3082080335589126e-05, "grad_norm": 19.005081176757812, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8773074746131897, "num_tokens": 706816215.0, "step": 18523 }, { "epoch": 2.3564432006106095, "ewc_loss": 0.03300367668271065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.300367825431749e-05, "grad_norm": 18.991607666015625, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8693690299987793, "num_tokens": 706857400.0, "step": 18524 }, { "epoch": 2.3565704108891996, "ewc_loss": 0.03305376321077347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3053762308554724e-05, "grad_norm": 19.099950790405273, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8665857911109924, "num_tokens": 706892715.0, "step": 18525 }, { "epoch": 2.3566976211677906, "ewc_loss": 0.0330432653427124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3043266739696264e-05, "grad_norm": 19.017900466918945, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8752288222312927, "num_tokens": 706934373.0, "step": 18526 }, { "epoch": 2.3568248314463807, "ewc_loss": 0.03294355422258377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.294355337857269e-05, "grad_norm": 19.024415969848633, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8750683665275574, "num_tokens": 706976885.0, "step": 18527 }, { "epoch": 2.356952041724971, "ewc_loss": 0.033033743500709534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.30337425111793e-05, "grad_norm": 19.0712890625, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.8798431158065796, "num_tokens": 707016957.0, "step": 18528 }, { "epoch": 2.3570792520035617, "ewc_loss": 0.03295798599720001, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295798524050042e-05, "grad_norm": 19.117563247680664, "learning_rate": 1e-06, "loss": 0.3607, "mean_token_accuracy": 0.8848526477813721, "num_tokens": 707053687.0, "step": 18529 }, { "epoch": 2.3572064622821522, "ewc_loss": 0.03291214257478714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291214306955226e-05, "grad_norm": 19.009307861328125, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.868745744228363, "num_tokens": 707087327.0, "step": 18530 }, { "epoch": 2.3573336725607428, "ewc_loss": 0.03295966982841492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2959669624688104e-05, "grad_norm": 19.0545711517334, "learning_rate": 1e-06, "loss": 0.3565, "mean_token_accuracy": 0.885977029800415, "num_tokens": 707124479.0, "step": 18531 }, { "epoch": 2.3574608828393333, "ewc_loss": 0.03291800618171692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.291800749138929e-05, "grad_norm": 19.0631103515625, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8785879015922546, "num_tokens": 707162452.0, "step": 18532 }, { "epoch": 2.357588093117924, "ewc_loss": 0.032931532710790634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2931533496594056e-05, "grad_norm": 19.00735092163086, "learning_rate": 1e-06, "loss": 0.358, "mean_token_accuracy": 0.8835249543190002, "num_tokens": 707196765.0, "step": 18533 }, { "epoch": 2.3577153033965144, "ewc_loss": 0.03289994224905968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.289994128863327e-05, "grad_norm": 19.008913040161133, "learning_rate": 1e-06, "loss": 0.3737, "mean_token_accuracy": 0.881892204284668, "num_tokens": 707230151.0, "step": 18534 }, { "epoch": 2.357842513675105, "ewc_loss": 0.03293614462018013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293614645372145e-05, "grad_norm": 19.044998168945312, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8569921255111694, "num_tokens": 707264373.0, "step": 18535 }, { "epoch": 2.3579697239536954, "ewc_loss": 0.03295041620731354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295041460660286e-05, "grad_norm": 19.020681381225586, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8698434233665466, "num_tokens": 707304844.0, "step": 18536 }, { "epoch": 2.358096934232286, "ewc_loss": 0.032923534512519836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.292353358119726e-05, "grad_norm": 19.020978927612305, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.8765669465065002, "num_tokens": 707347081.0, "step": 18537 }, { "epoch": 2.3582241445108765, "ewc_loss": 0.03302082046866417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302082041045651e-05, "grad_norm": 19.05706787109375, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8686362504959106, "num_tokens": 707388243.0, "step": 18538 }, { "epoch": 2.358351354789467, "ewc_loss": 0.032936207950115204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293620829936117e-05, "grad_norm": 19.01481056213379, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8625818490982056, "num_tokens": 707427109.0, "step": 18539 }, { "epoch": 2.3584785650680575, "ewc_loss": 0.032947391271591187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.294739144621417e-05, "grad_norm": 19.02118682861328, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.876488983631134, "num_tokens": 707462502.0, "step": 18540 }, { "epoch": 2.358605775346648, "ewc_loss": 0.032946955412626266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.294695488875732e-05, "grad_norm": 19.00827980041504, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8687350749969482, "num_tokens": 707502861.0, "step": 18541 }, { "epoch": 2.3587329856252386, "ewc_loss": 0.032995931804180145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2995932997437194e-05, "grad_norm": 19.05303192138672, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.854924201965332, "num_tokens": 707535868.0, "step": 18542 }, { "epoch": 2.358860195903829, "ewc_loss": 0.03299086540937424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2990865292958915e-05, "grad_norm": 19.043014526367188, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8782548904418945, "num_tokens": 707566128.0, "step": 18543 }, { "epoch": 2.3589874061824196, "ewc_loss": 0.03295515850186348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.295515853096731e-05, "grad_norm": 18.994081497192383, "learning_rate": 1e-06, "loss": 0.3655, "mean_token_accuracy": 0.8815630078315735, "num_tokens": 707605113.0, "step": 18544 }, { "epoch": 2.35911461646101, "ewc_loss": 0.03298623487353325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298623414593749e-05, "grad_norm": 18.996156692504883, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.8728547096252441, "num_tokens": 707647968.0, "step": 18545 }, { "epoch": 2.3592418267396007, "ewc_loss": 0.033013585954904556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.30135844706092e-05, "grad_norm": 19.045639038085938, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.8822358846664429, "num_tokens": 707682183.0, "step": 18546 }, { "epoch": 2.3593690370181912, "ewc_loss": 0.03299897536635399, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.299897434771992e-05, "grad_norm": 19.04077911376953, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8834388256072998, "num_tokens": 707720938.0, "step": 18547 }, { "epoch": 2.3594962472967818, "ewc_loss": 0.0329396054148674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293960617156699e-05, "grad_norm": 18.928224563598633, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8630565404891968, "num_tokens": 707762624.0, "step": 18548 }, { "epoch": 2.3596234575753723, "ewc_loss": 0.032935142517089844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.293514237157069e-05, "grad_norm": 19.03461456298828, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8669686317443848, "num_tokens": 707798406.0, "step": 18549 }, { "epoch": 2.3597506678539624, "ewc_loss": 0.03305815905332565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.30581606249325e-05, "grad_norm": 18.894012451171875, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8763411641120911, "num_tokens": 707835935.0, "step": 18550 }, { "epoch": 2.3598778781325533, "ewc_loss": 0.03297214210033417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2972140616038814e-05, "grad_norm": 19.043304443359375, "learning_rate": 1e-06, "loss": 0.354, "mean_token_accuracy": 0.8856655955314636, "num_tokens": 707872971.0, "step": 18551 }, { "epoch": 2.3600050884111434, "ewc_loss": 0.03309595212340355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3095951948780566e-05, "grad_norm": 18.95144271850586, "learning_rate": 1e-06, "loss": 0.3618, "mean_token_accuracy": 0.8854563236236572, "num_tokens": 707909798.0, "step": 18552 }, { "epoch": 2.360132298689734, "ewc_loss": 0.03300973027944565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.300973185105249e-05, "grad_norm": 19.008190155029297, "learning_rate": 1e-06, "loss": 0.3741, "mean_token_accuracy": 0.877746045589447, "num_tokens": 707951130.0, "step": 18553 }, { "epoch": 2.3602595089683245, "ewc_loss": 0.033090755343437195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309075691504404e-05, "grad_norm": 19.064138412475586, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8622056841850281, "num_tokens": 707989502.0, "step": 18554 }, { "epoch": 2.360386719246915, "ewc_loss": 0.03303234651684761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3032345527317375e-05, "grad_norm": 19.007421493530273, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8735332489013672, "num_tokens": 708023521.0, "step": 18555 }, { "epoch": 2.3605139295255055, "ewc_loss": 0.03307375684380531, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.30737566400785e-05, "grad_norm": 19.04865264892578, "learning_rate": 1e-06, "loss": 0.3615, "mean_token_accuracy": 0.8795230388641357, "num_tokens": 708057594.0, "step": 18556 }, { "epoch": 2.360641139804096, "ewc_loss": 0.03303307667374611, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.30330767610576e-05, "grad_norm": 18.900766372680664, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8721400499343872, "num_tokens": 708101393.0, "step": 18557 }, { "epoch": 2.3607683500826866, "ewc_loss": 0.03305623307824135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305623249616474e-05, "grad_norm": 19.061573028564453, "learning_rate": 1e-06, "loss": 0.3605, "mean_token_accuracy": 0.8849813938140869, "num_tokens": 708137141.0, "step": 18558 }, { "epoch": 2.360895560361277, "ewc_loss": 0.03310093283653259, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3100932341767475e-05, "grad_norm": 18.968891143798828, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8791015148162842, "num_tokens": 708173497.0, "step": 18559 }, { "epoch": 2.3610227706398677, "ewc_loss": 0.03305540606379509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305540667497553e-05, "grad_norm": 19.032852172851562, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8734279870986938, "num_tokens": 708209412.0, "step": 18560 }, { "epoch": 2.361149980918458, "ewc_loss": 0.03306679427623749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3066793548641726e-05, "grad_norm": 18.981595993041992, "learning_rate": 1e-06, "loss": 0.3626, "mean_token_accuracy": 0.8833620548248291, "num_tokens": 708239675.0, "step": 18561 }, { "epoch": 2.3612771911970487, "ewc_loss": 0.03307328745722771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3073287340812385e-05, "grad_norm": 19.038272857666016, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8689411878585815, "num_tokens": 708274918.0, "step": 18562 }, { "epoch": 2.3614044014756392, "ewc_loss": 0.03315321356058121, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.315321373520419e-05, "grad_norm": 19.043041229248047, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8588272333145142, "num_tokens": 708313645.0, "step": 18563 }, { "epoch": 2.3615316117542298, "ewc_loss": 0.03308174014091492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3081742003560066e-05, "grad_norm": 18.996719360351562, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8762325048446655, "num_tokens": 708357985.0, "step": 18564 }, { "epoch": 2.3616588220328203, "ewc_loss": 0.03312210738658905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3122109016403556e-05, "grad_norm": 19.030731201171875, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8544520139694214, "num_tokens": 708393606.0, "step": 18565 }, { "epoch": 2.361786032311411, "ewc_loss": 0.03315475210547447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.315475260023959e-05, "grad_norm": 19.10452651977539, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8794578909873962, "num_tokens": 708428941.0, "step": 18566 }, { "epoch": 2.3619132425900013, "ewc_loss": 0.033146779984235764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3146778150694445e-05, "grad_norm": 19.055335998535156, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8784100413322449, "num_tokens": 708467847.0, "step": 18567 }, { "epoch": 2.362040452868592, "ewc_loss": 0.0330749936401844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.307499355287291e-05, "grad_norm": 19.015640258789062, "learning_rate": 1e-06, "loss": 0.3354, "mean_token_accuracy": 0.8911423683166504, "num_tokens": 708503212.0, "step": 18568 }, { "epoch": 2.3621676631471824, "ewc_loss": 0.03312889114022255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3128890208899975e-05, "grad_norm": 19.043678283691406, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8757020235061646, "num_tokens": 708542897.0, "step": 18569 }, { "epoch": 2.362294873425773, "ewc_loss": 0.0331350713968277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3135071134893224e-05, "grad_norm": 19.153940200805664, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8637067079544067, "num_tokens": 708574725.0, "step": 18570 }, { "epoch": 2.3624220837043635, "ewc_loss": 0.03309134393930435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309134262963198e-05, "grad_norm": 18.96518325805664, "learning_rate": 1e-06, "loss": 0.3543, "mean_token_accuracy": 0.8847997188568115, "num_tokens": 708613557.0, "step": 18571 }, { "epoch": 2.362549293982954, "ewc_loss": 0.03308689966797829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.308690065750852e-05, "grad_norm": 19.064205169677734, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8668524622917175, "num_tokens": 708651872.0, "step": 18572 }, { "epoch": 2.3626765042615445, "ewc_loss": 0.033169668167829514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3169668313348666e-05, "grad_norm": 18.97846031188965, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8628536462783813, "num_tokens": 708688654.0, "step": 18573 }, { "epoch": 2.362803714540135, "ewc_loss": 0.03309394419193268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309394378447905e-05, "grad_norm": 19.004098892211914, "learning_rate": 1e-06, "loss": 0.3721, "mean_token_accuracy": 0.8810164928436279, "num_tokens": 708723285.0, "step": 18574 }, { "epoch": 2.362930924818725, "ewc_loss": 0.0331309549510479, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.31309565808624e-05, "grad_norm": 19.02391242980957, "learning_rate": 1e-06, "loss": 0.3581, "mean_token_accuracy": 0.883914589881897, "num_tokens": 708760617.0, "step": 18575 }, { "epoch": 2.363058135097316, "ewc_loss": 0.03312378376722336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.312378248665482e-05, "grad_norm": 19.038253784179688, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8790828585624695, "num_tokens": 708790792.0, "step": 18576 }, { "epoch": 2.363185345375906, "ewc_loss": 0.03309524804353714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309524618089199e-05, "grad_norm": 18.930559158325195, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8658323287963867, "num_tokens": 708829935.0, "step": 18577 }, { "epoch": 2.3633125556544967, "ewc_loss": 0.03312855213880539, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3128551876870915e-05, "grad_norm": 19.022626876831055, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8705899715423584, "num_tokens": 708870578.0, "step": 18578 }, { "epoch": 2.3634397659330872, "ewc_loss": 0.033199112862348557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3199114113813266e-05, "grad_norm": 19.084646224975586, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8728523254394531, "num_tokens": 708907044.0, "step": 18579 }, { "epoch": 2.3635669762116778, "ewc_loss": 0.03308696299791336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.308696250314824e-05, "grad_norm": 18.969453811645508, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8573133945465088, "num_tokens": 708947443.0, "step": 18580 }, { "epoch": 2.3636941864902683, "ewc_loss": 0.03306853398680687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3068532502511516e-05, "grad_norm": 19.033517837524414, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8763212561607361, "num_tokens": 708983174.0, "step": 18581 }, { "epoch": 2.363821396768859, "ewc_loss": 0.03317412734031677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.317412847536616e-05, "grad_norm": 19.00417709350586, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8783375024795532, "num_tokens": 709021950.0, "step": 18582 }, { "epoch": 2.3639486070474494, "ewc_loss": 0.033126600086688995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.312659828225151e-05, "grad_norm": 19.02081298828125, "learning_rate": 1e-06, "loss": 0.3631, "mean_token_accuracy": 0.8806881904602051, "num_tokens": 709057236.0, "step": 18583 }, { "epoch": 2.36407581732604, "ewc_loss": 0.03314918652176857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.314918649266474e-05, "grad_norm": 19.007423400878906, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8766415119171143, "num_tokens": 709093376.0, "step": 18584 }, { "epoch": 2.3642030276046304, "ewc_loss": 0.033150967210531235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.315096546430141e-05, "grad_norm": 19.038921356201172, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8698346614837646, "num_tokens": 709129502.0, "step": 18585 }, { "epoch": 2.364330237883221, "ewc_loss": 0.03313585743308067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3135856938315555e-05, "grad_norm": 18.991811752319336, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8748049139976501, "num_tokens": 709162538.0, "step": 18586 }, { "epoch": 2.3644574481618115, "ewc_loss": 0.033182598650455475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318259769002907e-05, "grad_norm": 19.054285049438477, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.864001452922821, "num_tokens": 709201690.0, "step": 18587 }, { "epoch": 2.364584658440402, "ewc_loss": 0.03315078839659691, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.315078720333986e-05, "grad_norm": 19.031755447387695, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8680809140205383, "num_tokens": 709240593.0, "step": 18588 }, { "epoch": 2.3647118687189925, "ewc_loss": 0.03313424065709114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.313424167572521e-05, "grad_norm": 19.021997451782227, "learning_rate": 1e-06, "loss": 0.3559, "mean_token_accuracy": 0.8854674100875854, "num_tokens": 709278192.0, "step": 18589 }, { "epoch": 2.364839078997583, "ewc_loss": 0.0331537090241909, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3153708500321954e-05, "grad_norm": 18.97488784790039, "learning_rate": 1e-06, "loss": 0.3301, "mean_token_accuracy": 0.8925573825836182, "num_tokens": 709311797.0, "step": 18590 }, { "epoch": 2.3649662892761736, "ewc_loss": 0.033176176249980927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.317617665743455e-05, "grad_norm": 19.137962341308594, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8726727962493896, "num_tokens": 709350379.0, "step": 18591 }, { "epoch": 2.365093499554764, "ewc_loss": 0.03318379446864128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.31837945850566e-05, "grad_norm": 19.01175880432129, "learning_rate": 1e-06, "loss": 0.3169, "mean_token_accuracy": 0.8993450403213501, "num_tokens": 709389030.0, "step": 18592 }, { "epoch": 2.3652207098333546, "ewc_loss": 0.03311806172132492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.311805994599126e-05, "grad_norm": 18.993501663208008, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8666853904724121, "num_tokens": 709433692.0, "step": 18593 }, { "epoch": 2.365347920111945, "ewc_loss": 0.033171575516462326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.317157461424358e-05, "grad_norm": 19.04007339477539, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8740403056144714, "num_tokens": 709470773.0, "step": 18594 }, { "epoch": 2.3654751303905357, "ewc_loss": 0.0330946110188961, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309460953460075e-05, "grad_norm": 18.971620559692383, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8702048659324646, "num_tokens": 709506764.0, "step": 18595 }, { "epoch": 2.3656023406691262, "ewc_loss": 0.0330938920378685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309389285277575e-05, "grad_norm": 19.0272216796875, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8739750385284424, "num_tokens": 709546931.0, "step": 18596 }, { "epoch": 2.3657295509477168, "ewc_loss": 0.033188432455062866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3188433008035645e-05, "grad_norm": 19.097820281982422, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8765362501144409, "num_tokens": 709582842.0, "step": 18597 }, { "epoch": 2.365856761226307, "ewc_loss": 0.033105138689279556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.310513784526847e-05, "grad_norm": 19.06406021118164, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8750818967819214, "num_tokens": 709621574.0, "step": 18598 }, { "epoch": 2.365983971504898, "ewc_loss": 0.03307991847395897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.307991937617771e-05, "grad_norm": 19.11199378967285, "learning_rate": 1e-06, "loss": 0.3364, "mean_token_accuracy": 0.8927547931671143, "num_tokens": 709662437.0, "step": 18599 }, { "epoch": 2.366111181783488, "ewc_loss": 0.03311635181307793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.311635009595193e-05, "grad_norm": 19.054115295410156, "learning_rate": 1e-06, "loss": 0.3476, "mean_token_accuracy": 0.8879588842391968, "num_tokens": 709701318.0, "step": 18600 }, { "epoch": 2.3662383920620784, "ewc_loss": 0.03298843652009964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298843512311578e-05, "grad_norm": 19.051942825317383, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8742222189903259, "num_tokens": 709745156.0, "step": 18601 }, { "epoch": 2.366365602340669, "ewc_loss": 0.03306803107261658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.306803046143614e-05, "grad_norm": 19.068862915039062, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.869850218296051, "num_tokens": 709785849.0, "step": 18602 }, { "epoch": 2.3664928126192595, "ewc_loss": 0.033053550869226456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305355130578391e-05, "grad_norm": 19.029743194580078, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8780756592750549, "num_tokens": 709830038.0, "step": 18603 }, { "epoch": 2.36662002289785, "ewc_loss": 0.03303347900509834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.303348057670519e-05, "grad_norm": 19.01747703552246, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8727332353591919, "num_tokens": 709871174.0, "step": 18604 }, { "epoch": 2.3667472331764405, "ewc_loss": 0.033047083765268326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3047082979464903e-05, "grad_norm": 19.049741744995117, "learning_rate": 1e-06, "loss": 0.3265, "mean_token_accuracy": 0.8956797122955322, "num_tokens": 709905921.0, "step": 18605 }, { "epoch": 2.366874443455031, "ewc_loss": 0.033046506345272064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3046504540834576e-05, "grad_norm": 19.013826370239258, "learning_rate": 1e-06, "loss": 0.3487, "mean_token_accuracy": 0.8910233974456787, "num_tokens": 709939856.0, "step": 18606 }, { "epoch": 2.3670016537336216, "ewc_loss": 0.03301041200757027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.301041215308942e-05, "grad_norm": 18.997901916503906, "learning_rate": 1e-06, "loss": 0.3495, "mean_token_accuracy": 0.8879410028457642, "num_tokens": 709978805.0, "step": 18607 }, { "epoch": 2.367128864012212, "ewc_loss": 0.033067625015974045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.306762664578855e-05, "grad_norm": 19.025400161743164, "learning_rate": 1e-06, "loss": 0.3521, "mean_token_accuracy": 0.8882683515548706, "num_tokens": 710015015.0, "step": 18608 }, { "epoch": 2.3672560742908026, "ewc_loss": 0.03308368846774101, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.308368832222186e-05, "grad_norm": 19.11470603942871, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8665497303009033, "num_tokens": 710054887.0, "step": 18609 }, { "epoch": 2.367383284569393, "ewc_loss": 0.03302758187055588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302758341305889e-05, "grad_norm": 19.072582244873047, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8754404783248901, "num_tokens": 710097012.0, "step": 18610 }, { "epoch": 2.3675104948479837, "ewc_loss": 0.03299190476536751, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2991905754897743e-05, "grad_norm": 19.04348373413086, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8590139150619507, "num_tokens": 710130912.0, "step": 18611 }, { "epoch": 2.3676377051265742, "ewc_loss": 0.03298770263791084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298770388937555e-05, "grad_norm": 18.89947509765625, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8717703819274902, "num_tokens": 710168880.0, "step": 18612 }, { "epoch": 2.3677649154051648, "ewc_loss": 0.03307117149233818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.307117003714666e-05, "grad_norm": 19.121150970458984, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.884188711643219, "num_tokens": 710205619.0, "step": 18613 }, { "epoch": 2.3678921256837553, "ewc_loss": 0.03311050683259964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.311050750198774e-05, "grad_norm": 18.916852951049805, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8808544874191284, "num_tokens": 710239475.0, "step": 18614 }, { "epoch": 2.368019335962346, "ewc_loss": 0.03301975503563881, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.301975448266603e-05, "grad_norm": 19.071292877197266, "learning_rate": 1e-06, "loss": 0.3771, "mean_token_accuracy": 0.8776339292526245, "num_tokens": 710276533.0, "step": 18615 }, { "epoch": 2.3681465462409363, "ewc_loss": 0.03321807458996773, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.321807525935583e-05, "grad_norm": 19.023761749267578, "learning_rate": 1e-06, "loss": 0.3428, "mean_token_accuracy": 0.8908527493476868, "num_tokens": 710316731.0, "step": 18616 }, { "epoch": 2.368273756519527, "ewc_loss": 0.03304368630051613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.304368510725908e-05, "grad_norm": 19.02242088317871, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8715109825134277, "num_tokens": 710352310.0, "step": 18617 }, { "epoch": 2.3684009667981174, "ewc_loss": 0.0330909825861454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309098246973008e-05, "grad_norm": 18.96293830871582, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8712711930274963, "num_tokens": 710400203.0, "step": 18618 }, { "epoch": 2.368528177076708, "ewc_loss": 0.033095426857471466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3095428079832345e-05, "grad_norm": 19.05663299560547, "learning_rate": 1e-06, "loss": 0.367, "mean_token_accuracy": 0.880807638168335, "num_tokens": 710433054.0, "step": 18619 }, { "epoch": 2.3686553873552985, "ewc_loss": 0.033243682235479355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324368299217895e-05, "grad_norm": 19.08098602294922, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8797107934951782, "num_tokens": 710466879.0, "step": 18620 }, { "epoch": 2.368782597633889, "ewc_loss": 0.033078789710998535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.307878796476871e-05, "grad_norm": 19.004899978637695, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8698717355728149, "num_tokens": 710502808.0, "step": 18621 }, { "epoch": 2.3689098079124795, "ewc_loss": 0.03307902812957764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3079028071369976e-05, "grad_norm": 19.087493896484375, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8644149303436279, "num_tokens": 710546190.0, "step": 18622 }, { "epoch": 2.3690370181910696, "ewc_loss": 0.03309391066431999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309391104266979e-05, "grad_norm": 18.983943939208984, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8772212266921997, "num_tokens": 710585556.0, "step": 18623 }, { "epoch": 2.3691642284696606, "ewc_loss": 0.03311217203736305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.311217369628139e-05, "grad_norm": 19.07005500793457, "learning_rate": 1e-06, "loss": 0.3799, "mean_token_accuracy": 0.8784494400024414, "num_tokens": 710621366.0, "step": 18624 }, { "epoch": 2.3692914387482507, "ewc_loss": 0.033122655004262924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.312265471322462e-05, "grad_norm": 19.0296573638916, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.878602921962738, "num_tokens": 710660217.0, "step": 18625 }, { "epoch": 2.369418649026841, "ewc_loss": 0.03312477096915245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.312477201689035e-05, "grad_norm": 18.994112014770508, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8724365234375, "num_tokens": 710695798.0, "step": 18626 }, { "epoch": 2.3695458593054317, "ewc_loss": 0.0331086628139019, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3108663046732545e-05, "grad_norm": 19.079435348510742, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8579875230789185, "num_tokens": 710732739.0, "step": 18627 }, { "epoch": 2.3696730695840222, "ewc_loss": 0.033135127276182175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.313512570457533e-05, "grad_norm": 19.030494689941406, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8679978847503662, "num_tokens": 710769651.0, "step": 18628 }, { "epoch": 2.3698002798626128, "ewc_loss": 0.033142559230327606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.314255809527822e-05, "grad_norm": 19.126161575317383, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8730034828186035, "num_tokens": 710809487.0, "step": 18629 }, { "epoch": 2.3699274901412033, "ewc_loss": 0.033066071569919586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.306607322883792e-05, "grad_norm": 18.943082809448242, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8787333965301514, "num_tokens": 710849152.0, "step": 18630 }, { "epoch": 2.370054700419794, "ewc_loss": 0.033098284155130386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309828389319591e-05, "grad_norm": 19.107484817504883, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8616488575935364, "num_tokens": 710890933.0, "step": 18631 }, { "epoch": 2.3701819106983844, "ewc_loss": 0.03317832574248314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3178326702909544e-05, "grad_norm": 18.962505340576172, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8780664801597595, "num_tokens": 710928262.0, "step": 18632 }, { "epoch": 2.370309120976975, "ewc_loss": 0.03306012228131294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.306012149550952e-05, "grad_norm": 19.03243064880371, "learning_rate": 1e-06, "loss": 0.3777, "mean_token_accuracy": 0.8800272941589355, "num_tokens": 710962969.0, "step": 18633 }, { "epoch": 2.3704363312555654, "ewc_loss": 0.03314310684800148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.314310743007809e-05, "grad_norm": 18.926361083984375, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8713840246200562, "num_tokens": 710998717.0, "step": 18634 }, { "epoch": 2.370563541534156, "ewc_loss": 0.03307659924030304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.307659790152684e-05, "grad_norm": 19.031322479248047, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.860954225063324, "num_tokens": 711032819.0, "step": 18635 }, { "epoch": 2.3706907518127465, "ewc_loss": 0.033245496451854706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324549470562488e-05, "grad_norm": 19.048398971557617, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8648719787597656, "num_tokens": 711069395.0, "step": 18636 }, { "epoch": 2.370817962091337, "ewc_loss": 0.0331016480922699, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.310164902359247e-05, "grad_norm": 18.900074005126953, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8771030902862549, "num_tokens": 711105754.0, "step": 18637 }, { "epoch": 2.3709451723699275, "ewc_loss": 0.03320785611867905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.320785617688671e-05, "grad_norm": 19.050548553466797, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8690064549446106, "num_tokens": 711145291.0, "step": 18638 }, { "epoch": 2.371072382648518, "ewc_loss": 0.03324006497859955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.32400668412447e-05, "grad_norm": 19.022401809692383, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.875641942024231, "num_tokens": 711183431.0, "step": 18639 }, { "epoch": 2.3711995929271086, "ewc_loss": 0.033179059624671936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.317905793664977e-05, "grad_norm": 19.018417358398438, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8608019351959229, "num_tokens": 711229754.0, "step": 18640 }, { "epoch": 2.371326803205699, "ewc_loss": 0.033179424703121185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3179425372509286e-05, "grad_norm": 18.9562931060791, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8762502670288086, "num_tokens": 711269071.0, "step": 18641 }, { "epoch": 2.3714540134842896, "ewc_loss": 0.03313826024532318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.313826164230704e-05, "grad_norm": 18.963367462158203, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8722971081733704, "num_tokens": 711312036.0, "step": 18642 }, { "epoch": 2.37158122376288, "ewc_loss": 0.03317568078637123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.317568189231679e-05, "grad_norm": 19.020286560058594, "learning_rate": 1e-06, "loss": 0.3568, "mean_token_accuracy": 0.8823136687278748, "num_tokens": 711347945.0, "step": 18643 }, { "epoch": 2.3717084340414707, "ewc_loss": 0.0331345833837986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.313458364573307e-05, "grad_norm": 19.059022903442383, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8539606928825378, "num_tokens": 711388433.0, "step": 18644 }, { "epoch": 2.371835644320061, "ewc_loss": 0.03313209116458893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3132091630250216e-05, "grad_norm": 19.016586303710938, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8783993721008301, "num_tokens": 711425821.0, "step": 18645 }, { "epoch": 2.3719628545986517, "ewc_loss": 0.03307340666651726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.307340739411302e-05, "grad_norm": 19.04342269897461, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8685611486434937, "num_tokens": 711462725.0, "step": 18646 }, { "epoch": 2.3720900648772423, "ewc_loss": 0.0331965871155262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319658571854234e-05, "grad_norm": 18.952363967895508, "learning_rate": 1e-06, "loss": 0.3517, "mean_token_accuracy": 0.8880188465118408, "num_tokens": 711499987.0, "step": 18647 }, { "epoch": 2.3722172751558324, "ewc_loss": 0.03313906490802765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.313906563562341e-05, "grad_norm": 19.04616928100586, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8602381944656372, "num_tokens": 711543658.0, "step": 18648 }, { "epoch": 2.3723444854344233, "ewc_loss": 0.033148832619190216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.314883360872045e-05, "grad_norm": 18.983102798461914, "learning_rate": 1e-06, "loss": 0.3506, "mean_token_accuracy": 0.8861713409423828, "num_tokens": 711583875.0, "step": 18649 }, { "epoch": 2.3724716957130134, "ewc_loss": 0.03307679295539856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.307679435238242e-05, "grad_norm": 19.015457153320312, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8582069873809814, "num_tokens": 711625543.0, "step": 18650 }, { "epoch": 2.372598905991604, "ewc_loss": 0.03318263962864876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3182641345774755e-05, "grad_norm": 19.022056579589844, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8781297206878662, "num_tokens": 711660528.0, "step": 18651 }, { "epoch": 2.3727261162701945, "ewc_loss": 0.03309106081724167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3091062505263835e-05, "grad_norm": 19.017465591430664, "learning_rate": 1e-06, "loss": 0.3453, "mean_token_accuracy": 0.890496551990509, "num_tokens": 711695646.0, "step": 18652 }, { "epoch": 2.372853326548785, "ewc_loss": 0.03316234424710274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.316234506200999e-05, "grad_norm": 19.0260009765625, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8592499494552612, "num_tokens": 711734691.0, "step": 18653 }, { "epoch": 2.3729805368273755, "ewc_loss": 0.03312528878450394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3125288609880954e-05, "grad_norm": 19.086210250854492, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8775631189346313, "num_tokens": 711771518.0, "step": 18654 }, { "epoch": 2.373107747105966, "ewc_loss": 0.03309858217835426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3098582207458094e-05, "grad_norm": 19.004165649414062, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8644441366195679, "num_tokens": 711808968.0, "step": 18655 }, { "epoch": 2.3732349573845566, "ewc_loss": 0.03308427333831787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.30842740368098e-05, "grad_norm": 19.085081100463867, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8823487758636475, "num_tokens": 711842944.0, "step": 18656 }, { "epoch": 2.373362167663147, "ewc_loss": 0.033187154680490494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3187156077474356e-05, "grad_norm": 19.013513565063477, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8715075850486755, "num_tokens": 711882740.0, "step": 18657 }, { "epoch": 2.3734893779417376, "ewc_loss": 0.03307546302676201, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3075462852139026e-05, "grad_norm": 18.999372482299805, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8668187856674194, "num_tokens": 711925362.0, "step": 18658 }, { "epoch": 2.373616588220328, "ewc_loss": 0.03310553729534149, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.310553802293725e-05, "grad_norm": 19.000534057617188, "learning_rate": 1e-06, "loss": 0.3781, "mean_token_accuracy": 0.8778477907180786, "num_tokens": 711971285.0, "step": 18659 }, { "epoch": 2.3737437984989187, "ewc_loss": 0.0331600047647953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.316000584163703e-05, "grad_norm": 19.100072860717773, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8699536323547363, "num_tokens": 712013306.0, "step": 18660 }, { "epoch": 2.3738710087775092, "ewc_loss": 0.033169664442539215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.316966467536986e-05, "grad_norm": 19.05467987060547, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8754634857177734, "num_tokens": 712050837.0, "step": 18661 }, { "epoch": 2.3739982190560998, "ewc_loss": 0.03309262543916702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3092626836150885e-05, "grad_norm": 19.03877067565918, "learning_rate": 1e-06, "loss": 0.3445, "mean_token_accuracy": 0.8918542861938477, "num_tokens": 712085140.0, "step": 18662 }, { "epoch": 2.3741254293346903, "ewc_loss": 0.0331822894513607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318228846183047e-05, "grad_norm": 18.979787826538086, "learning_rate": 1e-06, "loss": 0.3511, "mean_token_accuracy": 0.8878540992736816, "num_tokens": 712122495.0, "step": 18663 }, { "epoch": 2.374252639613281, "ewc_loss": 0.03305448591709137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3054486266337335e-05, "grad_norm": 19.024797439575195, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8684446811676025, "num_tokens": 712161312.0, "step": 18664 }, { "epoch": 2.3743798498918713, "ewc_loss": 0.033157553523778915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.315755384392105e-05, "grad_norm": 19.042510986328125, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8741931915283203, "num_tokens": 712197275.0, "step": 18665 }, { "epoch": 2.374507060170462, "ewc_loss": 0.033160727471113205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3160726161440834e-05, "grad_norm": 19.007774353027344, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8625423908233643, "num_tokens": 712231309.0, "step": 18666 }, { "epoch": 2.3746342704490524, "ewc_loss": 0.03313513845205307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.313514025649056e-05, "grad_norm": 19.017223358154297, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8702725172042847, "num_tokens": 712264864.0, "step": 18667 }, { "epoch": 2.374761480727643, "ewc_loss": 0.0331815741956234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318157541798428e-05, "grad_norm": 19.014263153076172, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8746514320373535, "num_tokens": 712303564.0, "step": 18668 }, { "epoch": 2.3748886910062335, "ewc_loss": 0.03313368931412697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.313368870294653e-05, "grad_norm": 18.937841415405273, "learning_rate": 1e-06, "loss": 0.3653, "mean_token_accuracy": 0.8854284882545471, "num_tokens": 712341872.0, "step": 18669 }, { "epoch": 2.375015901284824, "ewc_loss": 0.033175576478242874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.317557639093138e-05, "grad_norm": 19.071794509887695, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8638831377029419, "num_tokens": 712379835.0, "step": 18670 }, { "epoch": 2.3751431115634145, "ewc_loss": 0.03326297923922539, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.326297883177176e-05, "grad_norm": 18.98141860961914, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8710901141166687, "num_tokens": 712417462.0, "step": 18671 }, { "epoch": 2.375270321842005, "ewc_loss": 0.03315720707178116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.315720823593438e-05, "grad_norm": 19.068336486816406, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8500369787216187, "num_tokens": 712461581.0, "step": 18672 }, { "epoch": 2.375397532120595, "ewc_loss": 0.03324681147933006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3246811653953046e-05, "grad_norm": 18.992652893066406, "learning_rate": 1e-06, "loss": 0.3893, "mean_token_accuracy": 0.8744082450866699, "num_tokens": 712499615.0, "step": 18673 }, { "epoch": 2.375524742399186, "ewc_loss": 0.03320568427443504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3205684303538874e-05, "grad_norm": 19.008554458618164, "learning_rate": 1e-06, "loss": 0.36, "mean_token_accuracy": 0.885097086429596, "num_tokens": 712533264.0, "step": 18674 }, { "epoch": 2.375651952677776, "ewc_loss": 0.03326715528964996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.32671552314423e-05, "grad_norm": 19.01949119567871, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8663251399993896, "num_tokens": 712571496.0, "step": 18675 }, { "epoch": 2.3757791629563667, "ewc_loss": 0.03327614441514015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3276144677074626e-05, "grad_norm": 19.08845329284668, "learning_rate": 1e-06, "loss": 0.3672, "mean_token_accuracy": 0.8820103406906128, "num_tokens": 712606510.0, "step": 18676 }, { "epoch": 2.3759063732349572, "ewc_loss": 0.03325539827346802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325539728393778e-05, "grad_norm": 19.00669288635254, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.8838591575622559, "num_tokens": 712646218.0, "step": 18677 }, { "epoch": 2.3760335835135478, "ewc_loss": 0.033190637826919556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319063762319274e-05, "grad_norm": 19.00509262084961, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8437495231628418, "num_tokens": 712681905.0, "step": 18678 }, { "epoch": 2.3761607937921383, "ewc_loss": 0.03323890268802643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323890268802643e-05, "grad_norm": 18.98006248474121, "learning_rate": 1e-06, "loss": 0.3425, "mean_token_accuracy": 0.8870128393173218, "num_tokens": 712719916.0, "step": 18679 }, { "epoch": 2.376288004070729, "ewc_loss": 0.033276740461587906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.327674130559899e-05, "grad_norm": 19.045276641845703, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.8745384216308594, "num_tokens": 712760849.0, "step": 18680 }, { "epoch": 2.3764152143493193, "ewc_loss": 0.03324238955974579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324238787172362e-05, "grad_norm": 18.923171997070312, "learning_rate": 1e-06, "loss": 0.3546, "mean_token_accuracy": 0.8854917287826538, "num_tokens": 712801199.0, "step": 18681 }, { "epoch": 2.37654242462791, "ewc_loss": 0.033249225467443466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3249227271880955e-05, "grad_norm": 19.06053924560547, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8695909380912781, "num_tokens": 712843471.0, "step": 18682 }, { "epoch": 2.3766696349065004, "ewc_loss": 0.033312514424324036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.331251355120912e-05, "grad_norm": 18.97134780883789, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8761475682258606, "num_tokens": 712880649.0, "step": 18683 }, { "epoch": 2.376796845185091, "ewc_loss": 0.033209823071956635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3209824323421344e-05, "grad_norm": 19.004060745239258, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8516054749488831, "num_tokens": 712920574.0, "step": 18684 }, { "epoch": 2.3769240554636815, "ewc_loss": 0.033250387758016586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325038778712042e-05, "grad_norm": 18.959205627441406, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8828311562538147, "num_tokens": 712962544.0, "step": 18685 }, { "epoch": 2.377051265742272, "ewc_loss": 0.03325532749295235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325532816234045e-05, "grad_norm": 19.034147262573242, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8788834810256958, "num_tokens": 712999138.0, "step": 18686 }, { "epoch": 2.3771784760208625, "ewc_loss": 0.0332522839307785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325228317407891e-05, "grad_norm": 18.963788986206055, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8763192892074585, "num_tokens": 713041799.0, "step": 18687 }, { "epoch": 2.377305686299453, "ewc_loss": 0.03323002904653549, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323002965771593e-05, "grad_norm": 18.981426239013672, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.867872416973114, "num_tokens": 713088248.0, "step": 18688 }, { "epoch": 2.3774328965780436, "ewc_loss": 0.03325815498828888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325815487187356e-05, "grad_norm": 19.09791374206543, "learning_rate": 1e-06, "loss": 0.3634, "mean_token_accuracy": 0.8833082914352417, "num_tokens": 713127239.0, "step": 18689 }, { "epoch": 2.377560106856634, "ewc_loss": 0.03321085125207901, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3210850233444944e-05, "grad_norm": 19.04384422302246, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8775538802146912, "num_tokens": 713168389.0, "step": 18690 }, { "epoch": 2.3776873171352246, "ewc_loss": 0.03316346928477287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.316346919746138e-05, "grad_norm": 19.077556610107422, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.867395281791687, "num_tokens": 713208137.0, "step": 18691 }, { "epoch": 2.377814527413815, "ewc_loss": 0.033179301768541336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3179301681229845e-05, "grad_norm": 19.11882781982422, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8655844330787659, "num_tokens": 713249344.0, "step": 18692 }, { "epoch": 2.3779417376924057, "ewc_loss": 0.03311379253864288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3113792596850544e-05, "grad_norm": 19.023765563964844, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.870608925819397, "num_tokens": 713291376.0, "step": 18693 }, { "epoch": 2.378068947970996, "ewc_loss": 0.03311022371053696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.311022373964079e-05, "grad_norm": 19.040754318237305, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8727508187294006, "num_tokens": 713329925.0, "step": 18694 }, { "epoch": 2.3781961582495867, "ewc_loss": 0.033081430941820145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3081432775361463e-05, "grad_norm": 19.005748748779297, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8763830065727234, "num_tokens": 713372509.0, "step": 18695 }, { "epoch": 2.378323368528177, "ewc_loss": 0.03307608142495155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.307608130853623e-05, "grad_norm": 19.053741455078125, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8685886263847351, "num_tokens": 713410055.0, "step": 18696 }, { "epoch": 2.378450578806768, "ewc_loss": 0.03312017396092415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.312017361167818e-05, "grad_norm": 19.1114501953125, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8721402883529663, "num_tokens": 713453958.0, "step": 18697 }, { "epoch": 2.378577789085358, "ewc_loss": 0.033037975430488586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.303797711851075e-05, "grad_norm": 19.001672744750977, "learning_rate": 1e-06, "loss": 0.3883, "mean_token_accuracy": 0.8749244213104248, "num_tokens": 713485683.0, "step": 18698 }, { "epoch": 2.3787049993639484, "ewc_loss": 0.033024441450834274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3024440199369565e-05, "grad_norm": 18.93354034423828, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8766275644302368, "num_tokens": 713528565.0, "step": 18699 }, { "epoch": 2.378832209642539, "ewc_loss": 0.03309512510895729, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309512612759136e-05, "grad_norm": 19.084306716918945, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8709346055984497, "num_tokens": 713560932.0, "step": 18700 }, { "epoch": 2.3789594199211295, "ewc_loss": 0.033095572143793106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309557359898463e-05, "grad_norm": 19.07917022705078, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8713465332984924, "num_tokens": 713592891.0, "step": 18701 }, { "epoch": 2.37908663019972, "ewc_loss": 0.03310336917638779, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3103369787568226e-05, "grad_norm": 19.059810638427734, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8726884722709656, "num_tokens": 713631285.0, "step": 18702 }, { "epoch": 2.3792138404783105, "ewc_loss": 0.033137716352939606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.313771594548598e-05, "grad_norm": 19.074947357177734, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8563026189804077, "num_tokens": 713675143.0, "step": 18703 }, { "epoch": 2.379341050756901, "ewc_loss": 0.03313162177801132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.31316223309841e-05, "grad_norm": 19.08373260498047, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8786534667015076, "num_tokens": 713717556.0, "step": 18704 }, { "epoch": 2.3794682610354916, "ewc_loss": 0.03309279680252075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309279782115482e-05, "grad_norm": 19.053918838500977, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8673082590103149, "num_tokens": 713763515.0, "step": 18705 }, { "epoch": 2.379595471314082, "ewc_loss": 0.033119574189186096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.311957334517501e-05, "grad_norm": 19.057918548583984, "learning_rate": 1e-06, "loss": 0.3671, "mean_token_accuracy": 0.8794381618499756, "num_tokens": 713801064.0, "step": 18706 }, { "epoch": 2.3797226815926726, "ewc_loss": 0.03313944861292839, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3139447623398155e-05, "grad_norm": 19.068870544433594, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.871113657951355, "num_tokens": 713841662.0, "step": 18707 }, { "epoch": 2.379849891871263, "ewc_loss": 0.033190712332725525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319071402074769e-05, "grad_norm": 19.08014488220215, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8760608434677124, "num_tokens": 713883646.0, "step": 18708 }, { "epoch": 2.3799771021498537, "ewc_loss": 0.03309086337685585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3090862416429445e-05, "grad_norm": 19.054122924804688, "learning_rate": 1e-06, "loss": 0.3387, "mean_token_accuracy": 0.8948527574539185, "num_tokens": 713925365.0, "step": 18709 }, { "epoch": 2.3801043124284442, "ewc_loss": 0.033206600695848465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.320660107419826e-05, "grad_norm": 19.13214111328125, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8711686134338379, "num_tokens": 713961142.0, "step": 18710 }, { "epoch": 2.3802315227070348, "ewc_loss": 0.033096976578235626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3096977858804166e-05, "grad_norm": 19.02202796936035, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.8822073340415955, "num_tokens": 714001652.0, "step": 18711 }, { "epoch": 2.3803587329856253, "ewc_loss": 0.03314445540308952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.314445712021552e-05, "grad_norm": 19.19192886352539, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8765667676925659, "num_tokens": 714043023.0, "step": 18712 }, { "epoch": 2.380485943264216, "ewc_loss": 0.033130306750535965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3130305382655933e-05, "grad_norm": 19.04364013671875, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8741170167922974, "num_tokens": 714080389.0, "step": 18713 }, { "epoch": 2.3806131535428063, "ewc_loss": 0.033103086054325104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.310308602522127e-05, "grad_norm": 19.1158504486084, "learning_rate": 1e-06, "loss": 0.3737, "mean_token_accuracy": 0.8779081106185913, "num_tokens": 714117695.0, "step": 18714 }, { "epoch": 2.380740363821397, "ewc_loss": 0.033093180507421494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309317980892956e-05, "grad_norm": 19.03670883178711, "learning_rate": 1e-06, "loss": 0.3711, "mean_token_accuracy": 0.8813248872756958, "num_tokens": 714154204.0, "step": 18715 }, { "epoch": 2.3808675740999874, "ewc_loss": 0.03306584060192108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.306584039819427e-05, "grad_norm": 19.110565185546875, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8581667542457581, "num_tokens": 714194857.0, "step": 18716 }, { "epoch": 2.380994784378578, "ewc_loss": 0.033083561807870865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.308356099296361e-05, "grad_norm": 19.116283416748047, "learning_rate": 1e-06, "loss": 0.3559, "mean_token_accuracy": 0.8858057856559753, "num_tokens": 714228034.0, "step": 18717 }, { "epoch": 2.3811219946571685, "ewc_loss": 0.03305485099554062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3054850064218044e-05, "grad_norm": 19.10345458984375, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8805965781211853, "num_tokens": 714269688.0, "step": 18718 }, { "epoch": 2.381249204935759, "ewc_loss": 0.033026013523340225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302601180621423e-05, "grad_norm": 19.054641723632812, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8694813847541809, "num_tokens": 714309039.0, "step": 18719 }, { "epoch": 2.3813764152143495, "ewc_loss": 0.03303520008921623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.303520134068094e-05, "grad_norm": 19.145751953125, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8803931474685669, "num_tokens": 714347453.0, "step": 18720 }, { "epoch": 2.3815036254929396, "ewc_loss": 0.03308051452040672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3080516004702076e-05, "grad_norm": 19.071260452270508, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8724024295806885, "num_tokens": 714385850.0, "step": 18721 }, { "epoch": 2.3816308357715306, "ewc_loss": 0.03298770636320114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.298770752735436e-05, "grad_norm": 19.079713821411133, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8696061372756958, "num_tokens": 714426735.0, "step": 18722 }, { "epoch": 2.3817580460501206, "ewc_loss": 0.03304339572787285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.304339406895451e-05, "grad_norm": 19.11173439025879, "learning_rate": 1e-06, "loss": 0.3651, "mean_token_accuracy": 0.8869581818580627, "num_tokens": 714465279.0, "step": 18723 }, { "epoch": 2.381885256328711, "ewc_loss": 0.03305883705615997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305883728899062e-05, "grad_norm": 19.112533569335938, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8635717630386353, "num_tokens": 714505703.0, "step": 18724 }, { "epoch": 2.3820124666073017, "ewc_loss": 0.03301801532506943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.301801552879624e-05, "grad_norm": 19.026338577270508, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.868160605430603, "num_tokens": 714543642.0, "step": 18725 }, { "epoch": 2.3821396768858922, "ewc_loss": 0.032990291714668274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2990290492307395e-05, "grad_norm": 19.05330467224121, "learning_rate": 1e-06, "loss": 0.3429, "mean_token_accuracy": 0.8915042281150818, "num_tokens": 714583533.0, "step": 18726 }, { "epoch": 2.3822668871644828, "ewc_loss": 0.033130187541246414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3130188967334107e-05, "grad_norm": 19.10034942626953, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8797810077667236, "num_tokens": 714625577.0, "step": 18727 }, { "epoch": 2.3823940974430733, "ewc_loss": 0.03302060812711716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.30206094076857e-05, "grad_norm": 18.990659713745117, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8705800175666809, "num_tokens": 714664987.0, "step": 18728 }, { "epoch": 2.382521307721664, "ewc_loss": 0.03305670619010925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305670543340966e-05, "grad_norm": 19.112930297851562, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8685083389282227, "num_tokens": 714705881.0, "step": 18729 }, { "epoch": 2.3826485180002543, "ewc_loss": 0.033062681555747986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.306268263258971e-05, "grad_norm": 19.025056838989258, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8655234575271606, "num_tokens": 714746012.0, "step": 18730 }, { "epoch": 2.382775728278845, "ewc_loss": 0.033043425530195236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3043426810763776e-05, "grad_norm": 19.106250762939453, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8766473531723022, "num_tokens": 714790001.0, "step": 18731 }, { "epoch": 2.3829029385574354, "ewc_loss": 0.0330745130777359, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3074513339670375e-05, "grad_norm": 19.08429527282715, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8813480138778687, "num_tokens": 714824515.0, "step": 18732 }, { "epoch": 2.383030148836026, "ewc_loss": 0.03303991258144379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3039912523236126e-05, "grad_norm": 19.118104934692383, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8676542043685913, "num_tokens": 714860841.0, "step": 18733 }, { "epoch": 2.3831573591146165, "ewc_loss": 0.03302006423473358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302006371086463e-05, "grad_norm": 19.039918899536133, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.8766710162162781, "num_tokens": 714896373.0, "step": 18734 }, { "epoch": 2.383284569393207, "ewc_loss": 0.033031340688467026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.303134144516662e-05, "grad_norm": 19.13042640686035, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8773524761199951, "num_tokens": 714926684.0, "step": 18735 }, { "epoch": 2.3834117796717975, "ewc_loss": 0.03308834508061409, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.308834493509494e-05, "grad_norm": 19.089271545410156, "learning_rate": 1e-06, "loss": 0.3527, "mean_token_accuracy": 0.888202428817749, "num_tokens": 714961057.0, "step": 18736 }, { "epoch": 2.383538989950388, "ewc_loss": 0.033048320561647415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3048319892259315e-05, "grad_norm": 19.090131759643555, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8805496096611023, "num_tokens": 715001215.0, "step": 18737 }, { "epoch": 2.3836662002289786, "ewc_loss": 0.03301132470369339, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.301132528577e-05, "grad_norm": 19.092823028564453, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8556283712387085, "num_tokens": 715038252.0, "step": 18738 }, { "epoch": 2.383793410507569, "ewc_loss": 0.033033549785614014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.303354969830252e-05, "grad_norm": 19.093965530395508, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8737204074859619, "num_tokens": 715078320.0, "step": 18739 }, { "epoch": 2.3839206207861596, "ewc_loss": 0.03302154317498207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302154436823912e-05, "grad_norm": 19.07950210571289, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8695666193962097, "num_tokens": 715115628.0, "step": 18740 }, { "epoch": 2.38404783106475, "ewc_loss": 0.03300061449408531, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.300061507616192e-05, "grad_norm": 19.002758026123047, "learning_rate": 1e-06, "loss": 0.3629, "mean_token_accuracy": 0.8839019536972046, "num_tokens": 715153294.0, "step": 18741 }, { "epoch": 2.3841750413433407, "ewc_loss": 0.033016595989465714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3016596717061475e-05, "grad_norm": 19.038402557373047, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8751009702682495, "num_tokens": 715193736.0, "step": 18742 }, { "epoch": 2.384302251621931, "ewc_loss": 0.03308802843093872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.308802843093872e-05, "grad_norm": 19.01892852783203, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.8757649064064026, "num_tokens": 715234539.0, "step": 18743 }, { "epoch": 2.3844294619005217, "ewc_loss": 0.03305608406662941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305608333903365e-05, "grad_norm": 19.04166603088379, "learning_rate": 1e-06, "loss": 0.3461, "mean_token_accuracy": 0.8869438171386719, "num_tokens": 715271567.0, "step": 18744 }, { "epoch": 2.3845566721791123, "ewc_loss": 0.03317319229245186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.317319351481274e-05, "grad_norm": 19.127784729003906, "learning_rate": 1e-06, "loss": 0.3548, "mean_token_accuracy": 0.8867411613464355, "num_tokens": 715314850.0, "step": 18745 }, { "epoch": 2.3846838824577024, "ewc_loss": 0.03308165445923805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3081654692068696e-05, "grad_norm": 19.152969360351562, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.88080894947052, "num_tokens": 715353353.0, "step": 18746 }, { "epoch": 2.3848110927362933, "ewc_loss": 0.033090319484472275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309032035758719e-05, "grad_norm": 19.129911422729492, "learning_rate": 1e-06, "loss": 0.3583, "mean_token_accuracy": 0.8853366374969482, "num_tokens": 715387843.0, "step": 18747 }, { "epoch": 2.3849383030148834, "ewc_loss": 0.03305261954665184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.30526199832093e-05, "grad_norm": 19.119016647338867, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8627554774284363, "num_tokens": 715428037.0, "step": 18748 }, { "epoch": 2.385065513293474, "ewc_loss": 0.03306690603494644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3066906325984746e-05, "grad_norm": 19.058883666992188, "learning_rate": 1e-06, "loss": 0.3491, "mean_token_accuracy": 0.8882126808166504, "num_tokens": 715464403.0, "step": 18749 }, { "epoch": 2.3851927235720645, "ewc_loss": 0.03302161023020744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.302160985185765e-05, "grad_norm": 19.109254837036133, "learning_rate": 1e-06, "loss": 0.3553, "mean_token_accuracy": 0.8846054077148438, "num_tokens": 715501820.0, "step": 18750 }, { "epoch": 2.385319933850655, "ewc_loss": 0.03300362825393677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.300362732261419e-05, "grad_norm": 19.071285247802734, "learning_rate": 1e-06, "loss": 0.3597, "mean_token_accuracy": 0.8847261667251587, "num_tokens": 715543287.0, "step": 18751 }, { "epoch": 2.3854471441292455, "ewc_loss": 0.03305692598223686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3056927350116894e-05, "grad_norm": 19.141820907592773, "learning_rate": 1e-06, "loss": 0.356, "mean_token_accuracy": 0.8835418224334717, "num_tokens": 715578189.0, "step": 18752 }, { "epoch": 2.385574354407836, "ewc_loss": 0.03305999934673309, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.305999780423008e-05, "grad_norm": 19.10467529296875, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8652166724205017, "num_tokens": 715616447.0, "step": 18753 }, { "epoch": 2.3857015646864266, "ewc_loss": 0.03304130584001541, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.304130586911924e-05, "grad_norm": 19.13174057006836, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8794829249382019, "num_tokens": 715651692.0, "step": 18754 }, { "epoch": 2.385828774965017, "ewc_loss": 0.032995179295539856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.2995179935824126e-05, "grad_norm": 19.028249740600586, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8682770729064941, "num_tokens": 715687733.0, "step": 18755 }, { "epoch": 2.3859559852436076, "ewc_loss": 0.03306722640991211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.306722646811977e-05, "grad_norm": 19.165590286254883, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8539010286331177, "num_tokens": 715727481.0, "step": 18756 }, { "epoch": 2.386083195522198, "ewc_loss": 0.03305299952626228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3052998333005235e-05, "grad_norm": 19.111331939697266, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8570958375930786, "num_tokens": 715767549.0, "step": 18757 }, { "epoch": 2.3862104058007887, "ewc_loss": 0.0330108217895031, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.301082324469462e-05, "grad_norm": 19.111440658569336, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8684159517288208, "num_tokens": 715807251.0, "step": 18758 }, { "epoch": 2.386337616079379, "ewc_loss": 0.0330159030854702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.301590186310932e-05, "grad_norm": 19.021841049194336, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8591805696487427, "num_tokens": 715846893.0, "step": 18759 }, { "epoch": 2.3864648263579697, "ewc_loss": 0.0330469012260437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.304690108052455e-05, "grad_norm": 19.130634307861328, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8616466522216797, "num_tokens": 715883611.0, "step": 18760 }, { "epoch": 2.3865920366365603, "ewc_loss": 0.03305268660187721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3052685466827825e-05, "grad_norm": 19.01957130432129, "learning_rate": 1e-06, "loss": 0.3846, "mean_token_accuracy": 0.8760039806365967, "num_tokens": 715915598.0, "step": 18761 }, { "epoch": 2.386719246915151, "ewc_loss": 0.03309639170765877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3096392144216225e-05, "grad_norm": 19.14469337463379, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8813259601593018, "num_tokens": 715956647.0, "step": 18762 }, { "epoch": 2.3868464571937413, "ewc_loss": 0.0331960991024971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319609822938219e-05, "grad_norm": 19.022632598876953, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8812664747238159, "num_tokens": 715996627.0, "step": 18763 }, { "epoch": 2.386973667472332, "ewc_loss": 0.033155299723148346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3155298297060654e-05, "grad_norm": 19.16657829284668, "learning_rate": 1e-06, "loss": 0.325, "mean_token_accuracy": 0.8945587873458862, "num_tokens": 716027823.0, "step": 18764 }, { "epoch": 2.3871008777509224, "ewc_loss": 0.03319516032934189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319515963084996e-05, "grad_norm": 18.956754684448242, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8617751598358154, "num_tokens": 716067430.0, "step": 18765 }, { "epoch": 2.387228088029513, "ewc_loss": 0.03313099220395088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3130992960650474e-05, "grad_norm": 19.151378631591797, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8676239252090454, "num_tokens": 716109436.0, "step": 18766 }, { "epoch": 2.3873552983081034, "ewc_loss": 0.033279623836278915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3279622584814206e-05, "grad_norm": 19.055255889892578, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8708407878875732, "num_tokens": 716148157.0, "step": 18767 }, { "epoch": 2.387482508586694, "ewc_loss": 0.03314598649740219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.31459850713145e-05, "grad_norm": 19.01719856262207, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8760395050048828, "num_tokens": 716184662.0, "step": 18768 }, { "epoch": 2.3876097188652845, "ewc_loss": 0.03320934623479843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3209347748197615e-05, "grad_norm": 19.13044548034668, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8677011132240295, "num_tokens": 716227382.0, "step": 18769 }, { "epoch": 2.387736929143875, "ewc_loss": 0.03322887048125267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.322886914247647e-05, "grad_norm": 19.090620040893555, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.878273606300354, "num_tokens": 716262236.0, "step": 18770 }, { "epoch": 2.387864139422465, "ewc_loss": 0.03319830447435379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3198302844539285e-05, "grad_norm": 19.119571685791016, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.8822580575942993, "num_tokens": 716295541.0, "step": 18771 }, { "epoch": 2.387991349701056, "ewc_loss": 0.03320389986038208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.32039016939234e-05, "grad_norm": 19.128955841064453, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8766903877258301, "num_tokens": 716331796.0, "step": 18772 }, { "epoch": 2.388118559979646, "ewc_loss": 0.033175207674503326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3175208955071867e-05, "grad_norm": 19.109474182128906, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8651794195175171, "num_tokens": 716369539.0, "step": 18773 }, { "epoch": 2.3882457702582367, "ewc_loss": 0.033225156366825104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.322515476611443e-05, "grad_norm": 19.121267318725586, "learning_rate": 1e-06, "loss": 0.3459, "mean_token_accuracy": 0.8918527364730835, "num_tokens": 716409806.0, "step": 18774 }, { "epoch": 2.3883729805368272, "ewc_loss": 0.033134765923023224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.313476554467343e-05, "grad_norm": 19.025758743286133, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8678789138793945, "num_tokens": 716448191.0, "step": 18775 }, { "epoch": 2.3885001908154178, "ewc_loss": 0.033177513629198074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.317751543363556e-05, "grad_norm": 19.14616584777832, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8754227161407471, "num_tokens": 716486522.0, "step": 18776 }, { "epoch": 2.3886274010940083, "ewc_loss": 0.03318945690989494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318945528008044e-05, "grad_norm": 19.03681182861328, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8760961294174194, "num_tokens": 716519383.0, "step": 18777 }, { "epoch": 2.388754611372599, "ewc_loss": 0.03317953273653984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.31795345118735e-05, "grad_norm": 19.32457733154297, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8766851425170898, "num_tokens": 716556136.0, "step": 18778 }, { "epoch": 2.3888818216511893, "ewc_loss": 0.03320269659161568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.320269752293825e-05, "grad_norm": 19.051267623901367, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8681524991989136, "num_tokens": 716592346.0, "step": 18779 }, { "epoch": 2.38900903192978, "ewc_loss": 0.03309224918484688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.309224848635495e-05, "grad_norm": 19.120534896850586, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8677887916564941, "num_tokens": 716633996.0, "step": 18780 }, { "epoch": 2.3891362422083704, "ewc_loss": 0.03319701552391052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3197015000041574e-05, "grad_norm": 19.047685623168945, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8702989220619202, "num_tokens": 716669638.0, "step": 18781 }, { "epoch": 2.389263452486961, "ewc_loss": 0.03313068300485611, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.313068373245187e-05, "grad_norm": 19.07222557067871, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8657851219177246, "num_tokens": 716711194.0, "step": 18782 }, { "epoch": 2.3893906627655515, "ewc_loss": 0.03321932256221771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.321932308608666e-05, "grad_norm": 19.0728816986084, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.8820905685424805, "num_tokens": 716753402.0, "step": 18783 }, { "epoch": 2.389517873044142, "ewc_loss": 0.033208731561899185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.320873292977922e-05, "grad_norm": 19.108123779296875, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8765640258789062, "num_tokens": 716794638.0, "step": 18784 }, { "epoch": 2.3896450833227325, "ewc_loss": 0.03317238762974739, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.317238588351756e-05, "grad_norm": 19.065208435058594, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8465412259101868, "num_tokens": 716834003.0, "step": 18785 }, { "epoch": 2.389772293601323, "ewc_loss": 0.033136557787656784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3136559068225324e-05, "grad_norm": 19.04943084716797, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.8798705339431763, "num_tokens": 716867349.0, "step": 18786 }, { "epoch": 2.3898995038799136, "ewc_loss": 0.03317360207438469, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.317360096843913e-05, "grad_norm": 19.056509017944336, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8733540773391724, "num_tokens": 716904271.0, "step": 18787 }, { "epoch": 2.390026714158504, "ewc_loss": 0.03313783183693886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3137832360807806e-05, "grad_norm": 19.070466995239258, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.876764714717865, "num_tokens": 716936165.0, "step": 18788 }, { "epoch": 2.3901539244370946, "ewc_loss": 0.03323424607515335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323424607515335e-05, "grad_norm": 19.070728302001953, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8736908435821533, "num_tokens": 716974011.0, "step": 18789 }, { "epoch": 2.390281134715685, "ewc_loss": 0.03321542963385582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.321543044876307e-05, "grad_norm": 19.08176612854004, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.874701976776123, "num_tokens": 717012438.0, "step": 18790 }, { "epoch": 2.3904083449942757, "ewc_loss": 0.03325280174612999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3252803405048326e-05, "grad_norm": 19.02191734313965, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8658185601234436, "num_tokens": 717051755.0, "step": 18791 }, { "epoch": 2.390535555272866, "ewc_loss": 0.03323286771774292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323286728118546e-05, "grad_norm": 19.102994918823242, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.873999834060669, "num_tokens": 717090740.0, "step": 18792 }, { "epoch": 2.3906627655514567, "ewc_loss": 0.0332026369869709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.320263567729853e-05, "grad_norm": 19.106355667114258, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8494287133216858, "num_tokens": 717132033.0, "step": 18793 }, { "epoch": 2.390789975830047, "ewc_loss": 0.03318449482321739, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3184493076987565e-05, "grad_norm": 19.088396072387695, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8808014988899231, "num_tokens": 717168255.0, "step": 18794 }, { "epoch": 2.390917186108638, "ewc_loss": 0.03324757143855095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324757199152373e-05, "grad_norm": 19.104976654052734, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8729478120803833, "num_tokens": 717206863.0, "step": 18795 }, { "epoch": 2.391044396387228, "ewc_loss": 0.03321468457579613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.321468466310762e-05, "grad_norm": 19.049541473388672, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8543012142181396, "num_tokens": 717245374.0, "step": 18796 }, { "epoch": 2.3911716066658184, "ewc_loss": 0.03319469094276428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319469033158384e-05, "grad_norm": 19.094388961791992, "learning_rate": 1e-06, "loss": 0.339, "mean_token_accuracy": 0.8923632502555847, "num_tokens": 717279821.0, "step": 18797 }, { "epoch": 2.391298816944409, "ewc_loss": 0.03319748863577843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3197487937286496e-05, "grad_norm": 19.02016830444336, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8676499128341675, "num_tokens": 717318896.0, "step": 18798 }, { "epoch": 2.3914260272229995, "ewc_loss": 0.03317958489060402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.31795854435768e-05, "grad_norm": 19.057353973388672, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8647518754005432, "num_tokens": 717361493.0, "step": 18799 }, { "epoch": 2.39155323750159, "ewc_loss": 0.03327525407075882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.327525337226689e-05, "grad_norm": 19.05080795288086, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8662959933280945, "num_tokens": 717402801.0, "step": 18800 }, { "epoch": 2.3916804477801805, "ewc_loss": 0.03320948779582977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3209485991392285e-05, "grad_norm": 19.078733444213867, "learning_rate": 1e-06, "loss": 0.3424, "mean_token_accuracy": 0.891987681388855, "num_tokens": 717441275.0, "step": 18801 }, { "epoch": 2.391807658058771, "ewc_loss": 0.03321372717618942, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3213727874681354e-05, "grad_norm": 19.13308334350586, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8621068000793457, "num_tokens": 717477423.0, "step": 18802 }, { "epoch": 2.3919348683373616, "ewc_loss": 0.03319435939192772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.31943592755124e-05, "grad_norm": 19.029945373535156, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8676370978355408, "num_tokens": 717512978.0, "step": 18803 }, { "epoch": 2.392062078615952, "ewc_loss": 0.033207811415195465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.320781252114102e-05, "grad_norm": 19.11916732788086, "learning_rate": 1e-06, "loss": 0.3353, "mean_token_accuracy": 0.8932784199714661, "num_tokens": 717549710.0, "step": 18804 }, { "epoch": 2.3921892888945426, "ewc_loss": 0.03328727185726166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.328727325424552e-05, "grad_norm": 19.058822631835938, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8697834014892578, "num_tokens": 717586588.0, "step": 18805 }, { "epoch": 2.392316499173133, "ewc_loss": 0.033176813274621964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3176813303725794e-05, "grad_norm": 19.096609115600586, "learning_rate": 1e-06, "loss": 0.3602, "mean_token_accuracy": 0.8867282271385193, "num_tokens": 717621507.0, "step": 18806 }, { "epoch": 2.3924437094517237, "ewc_loss": 0.033260833472013474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.326083242427558e-05, "grad_norm": 19.06051254272461, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8759807348251343, "num_tokens": 717655915.0, "step": 18807 }, { "epoch": 2.392570919730314, "ewc_loss": 0.033206917345523834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.320691757835448e-05, "grad_norm": 19.144323348999023, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8708316683769226, "num_tokens": 717696357.0, "step": 18808 }, { "epoch": 2.3926981300089047, "ewc_loss": 0.033216916024684906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3216914744116366e-05, "grad_norm": 19.150854110717773, "learning_rate": 1e-06, "loss": 0.3572, "mean_token_accuracy": 0.8870639204978943, "num_tokens": 717735397.0, "step": 18809 }, { "epoch": 2.3928253402874953, "ewc_loss": 0.03319466486573219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319466486573219e-05, "grad_norm": 19.105682373046875, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.87839674949646, "num_tokens": 717770081.0, "step": 18810 }, { "epoch": 2.392952550566086, "ewc_loss": 0.033183325082063675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318332528579049e-05, "grad_norm": 19.110715866088867, "learning_rate": 1e-06, "loss": 0.3377, "mean_token_accuracy": 0.8909016847610474, "num_tokens": 717805262.0, "step": 18811 }, { "epoch": 2.3930797608446763, "ewc_loss": 0.033177051693201065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.317705341032706e-05, "grad_norm": 19.187335968017578, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8740718960762024, "num_tokens": 717841877.0, "step": 18812 }, { "epoch": 2.393206971123267, "ewc_loss": 0.033156245946884155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.31562441715505e-05, "grad_norm": 19.120702743530273, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8585206270217896, "num_tokens": 717881435.0, "step": 18813 }, { "epoch": 2.3933341814018574, "ewc_loss": 0.033166803419589996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3166805224027485e-05, "grad_norm": 19.094968795776367, "learning_rate": 1e-06, "loss": 0.3381, "mean_token_accuracy": 0.8937599658966064, "num_tokens": 717922240.0, "step": 18814 }, { "epoch": 2.393461391680448, "ewc_loss": 0.03316438943147659, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3164389606099576e-05, "grad_norm": 19.11545181274414, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8710950613021851, "num_tokens": 717958438.0, "step": 18815 }, { "epoch": 2.3935886019590384, "ewc_loss": 0.03318696469068527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318696326459758e-05, "grad_norm": 19.104843139648438, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8657448887825012, "num_tokens": 717994451.0, "step": 18816 }, { "epoch": 2.393715812237629, "ewc_loss": 0.033146120607852936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.314611967653036e-05, "grad_norm": 19.00544548034668, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8764709234237671, "num_tokens": 718034318.0, "step": 18817 }, { "epoch": 2.3938430225162195, "ewc_loss": 0.03315410390496254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.315410504001193e-05, "grad_norm": 19.103364944458008, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8735243082046509, "num_tokens": 718074994.0, "step": 18818 }, { "epoch": 2.3939702327948096, "ewc_loss": 0.03321043774485588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.321043914183974e-05, "grad_norm": 19.008602142333984, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8727326989173889, "num_tokens": 718118849.0, "step": 18819 }, { "epoch": 2.3940974430734006, "ewc_loss": 0.03315236419439316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.315236244816333e-05, "grad_norm": 19.187685012817383, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.8796252608299255, "num_tokens": 718152874.0, "step": 18820 }, { "epoch": 2.3942246533519906, "ewc_loss": 0.033252373337745667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325237412354909e-05, "grad_norm": 19.03943634033203, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8670550584793091, "num_tokens": 718194206.0, "step": 18821 }, { "epoch": 2.394351863630581, "ewc_loss": 0.0330372229218483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3037224056897685e-05, "grad_norm": 19.078269958496094, "learning_rate": 1e-06, "loss": 0.3799, "mean_token_accuracy": 0.87755286693573, "num_tokens": 718227408.0, "step": 18822 }, { "epoch": 2.3944790739091717, "ewc_loss": 0.03325382620096207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325382567709312e-05, "grad_norm": 19.1439151763916, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8689658641815186, "num_tokens": 718266962.0, "step": 18823 }, { "epoch": 2.3946062841877622, "ewc_loss": 0.03312375396490097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.312375338282436e-05, "grad_norm": 19.03420066833496, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8791182637214661, "num_tokens": 718307873.0, "step": 18824 }, { "epoch": 2.3947334944663528, "ewc_loss": 0.03313789516687393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.313789420644753e-05, "grad_norm": 19.032974243164062, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8803640604019165, "num_tokens": 718350296.0, "step": 18825 }, { "epoch": 2.3948607047449433, "ewc_loss": 0.033167775720357895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.316777656436898e-05, "grad_norm": 19.124418258666992, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.877251386642456, "num_tokens": 718390734.0, "step": 18826 }, { "epoch": 2.394987915023534, "ewc_loss": 0.033138345927000046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3138345315819606e-05, "grad_norm": 19.079578399658203, "learning_rate": 1e-06, "loss": 0.3448, "mean_token_accuracy": 0.890106201171875, "num_tokens": 718435630.0, "step": 18827 }, { "epoch": 2.3951151253021243, "ewc_loss": 0.033150505274534225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.315050344099291e-05, "grad_norm": 19.03524398803711, "learning_rate": 1e-06, "loss": 0.3625, "mean_token_accuracy": 0.8818687200546265, "num_tokens": 718474791.0, "step": 18828 }, { "epoch": 2.395242335580715, "ewc_loss": 0.033140454441308975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.314045534352772e-05, "grad_norm": 19.114484786987305, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8580573797225952, "num_tokens": 718514957.0, "step": 18829 }, { "epoch": 2.3953695458593054, "ewc_loss": 0.033117253333330154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.311725231469609e-05, "grad_norm": 19.10215950012207, "learning_rate": 1e-06, "loss": 0.3618, "mean_token_accuracy": 0.8857616782188416, "num_tokens": 718551761.0, "step": 18830 }, { "epoch": 2.395496756137896, "ewc_loss": 0.033104464411735535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.310446481918916e-05, "grad_norm": 19.029705047607422, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.873802661895752, "num_tokens": 718586445.0, "step": 18831 }, { "epoch": 2.3956239664164865, "ewc_loss": 0.033074844628572464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.307484439574182e-05, "grad_norm": 19.024686813354492, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8700525760650635, "num_tokens": 718631105.0, "step": 18832 }, { "epoch": 2.395751176695077, "ewc_loss": 0.033158350735902786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3158350561279804e-05, "grad_norm": 19.116291046142578, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8710484504699707, "num_tokens": 718672278.0, "step": 18833 }, { "epoch": 2.3958783869736675, "ewc_loss": 0.03311919793486595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.311919863335788e-05, "grad_norm": 18.975303649902344, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8806456923484802, "num_tokens": 718713191.0, "step": 18834 }, { "epoch": 2.396005597252258, "ewc_loss": 0.03312171250581741, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.312171247671358e-05, "grad_norm": 19.094213485717773, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8681480288505554, "num_tokens": 718745012.0, "step": 18835 }, { "epoch": 2.3961328075308486, "ewc_loss": 0.03315836936235428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.315836875117384e-05, "grad_norm": 19.061052322387695, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8739170432090759, "num_tokens": 718780149.0, "step": 18836 }, { "epoch": 2.396260017809439, "ewc_loss": 0.033149998635053635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.314999776193872e-05, "grad_norm": 19.12830352783203, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8706111907958984, "num_tokens": 718812831.0, "step": 18837 }, { "epoch": 2.3963872280880296, "ewc_loss": 0.033188652247190475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318865128676407e-05, "grad_norm": 18.998689651489258, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8844891786575317, "num_tokens": 718847650.0, "step": 18838 }, { "epoch": 2.39651443836662, "ewc_loss": 0.0331721268594265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.317212758702226e-05, "grad_norm": 19.09099769592285, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8766732811927795, "num_tokens": 718883417.0, "step": 18839 }, { "epoch": 2.3966416486452107, "ewc_loss": 0.03324876353144646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3248765248572454e-05, "grad_norm": 19.099946975708008, "learning_rate": 1e-06, "loss": 0.3119, "mean_token_accuracy": 0.8987380266189575, "num_tokens": 718915501.0, "step": 18840 }, { "epoch": 2.396768858923801, "ewc_loss": 0.03320469707250595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.320469841128215e-05, "grad_norm": 19.044832229614258, "learning_rate": 1e-06, "loss": 0.359, "mean_token_accuracy": 0.8890523910522461, "num_tokens": 718954485.0, "step": 18841 }, { "epoch": 2.3968960692023917, "ewc_loss": 0.03319431468844414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319431561976671e-05, "grad_norm": 19.011629104614258, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8603236675262451, "num_tokens": 718995815.0, "step": 18842 }, { "epoch": 2.3970232794809823, "ewc_loss": 0.03324994817376137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324994759168476e-05, "grad_norm": 19.127180099487305, "learning_rate": 1e-06, "loss": 0.3302, "mean_token_accuracy": 0.8887447714805603, "num_tokens": 719032820.0, "step": 18843 }, { "epoch": 2.3971504897595723, "ewc_loss": 0.0332488939166069, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.32488925778307e-05, "grad_norm": 19.020483016967773, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.873017430305481, "num_tokens": 719066966.0, "step": 18844 }, { "epoch": 2.3972777000381633, "ewc_loss": 0.03319297358393669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3192973205586895e-05, "grad_norm": 19.158687591552734, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8742480278015137, "num_tokens": 719108768.0, "step": 18845 }, { "epoch": 2.3974049103167534, "ewc_loss": 0.03328274190425873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.328274033265188e-05, "grad_norm": 19.037981033325195, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8718659281730652, "num_tokens": 719151451.0, "step": 18846 }, { "epoch": 2.397532120595344, "ewc_loss": 0.033109746873378754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.310974716441706e-05, "grad_norm": 19.06945037841797, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8671792149543762, "num_tokens": 719192103.0, "step": 18847 }, { "epoch": 2.3976593308739345, "ewc_loss": 0.033329807221889496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3329808502458036e-05, "grad_norm": 19.118961334228516, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8713409900665283, "num_tokens": 719226514.0, "step": 18848 }, { "epoch": 2.397786541152525, "ewc_loss": 0.033177025616168976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3177024306496605e-05, "grad_norm": 19.09752655029297, "learning_rate": 1e-06, "loss": 0.3182, "mean_token_accuracy": 0.8976331353187561, "num_tokens": 719262944.0, "step": 18849 }, { "epoch": 2.3979137514311155, "ewc_loss": 0.033259522169828415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3259522751905024e-05, "grad_norm": 19.166912078857422, "learning_rate": 1e-06, "loss": 0.3774, "mean_token_accuracy": 0.8801789283752441, "num_tokens": 719306707.0, "step": 18850 }, { "epoch": 2.398040961709706, "ewc_loss": 0.03324932977557182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324932913528755e-05, "grad_norm": 19.042898178100586, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.8801587820053101, "num_tokens": 719347523.0, "step": 18851 }, { "epoch": 2.3981681719882966, "ewc_loss": 0.03313245624303818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3132455428130925e-05, "grad_norm": 19.044048309326172, "learning_rate": 1e-06, "loss": 0.3237, "mean_token_accuracy": 0.8933107852935791, "num_tokens": 719379474.0, "step": 18852 }, { "epoch": 2.398295382266887, "ewc_loss": 0.03327959030866623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.327958984300494e-05, "grad_norm": 19.09407615661621, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8599674701690674, "num_tokens": 719421624.0, "step": 18853 }, { "epoch": 2.3984225925454776, "ewc_loss": 0.03325716033577919, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325716170365922e-05, "grad_norm": 19.184038162231445, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8709468841552734, "num_tokens": 719455570.0, "step": 18854 }, { "epoch": 2.398549802824068, "ewc_loss": 0.033247530460357666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324753197375685e-05, "grad_norm": 19.050081253051758, "learning_rate": 1e-06, "loss": 0.3473, "mean_token_accuracy": 0.8913962244987488, "num_tokens": 719500300.0, "step": 18855 }, { "epoch": 2.3986770131026587, "ewc_loss": 0.033189527690410614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318952803965658e-05, "grad_norm": 19.179370880126953, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8598160743713379, "num_tokens": 719538119.0, "step": 18856 }, { "epoch": 2.398804223381249, "ewc_loss": 0.033196885138750076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319688403280452e-05, "grad_norm": 19.03010368347168, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8656501770019531, "num_tokens": 719574012.0, "step": 18857 }, { "epoch": 2.3989314336598397, "ewc_loss": 0.033138614147901535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.313861452625133e-05, "grad_norm": 19.111770629882812, "learning_rate": 1e-06, "loss": 0.3574, "mean_token_accuracy": 0.8862962126731873, "num_tokens": 719609548.0, "step": 18858 }, { "epoch": 2.3990586439384303, "ewc_loss": 0.033213354647159576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.321335316286422e-05, "grad_norm": 19.043642044067383, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8691556453704834, "num_tokens": 719647280.0, "step": 18859 }, { "epoch": 2.399185854217021, "ewc_loss": 0.0331878736615181, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318787275929935e-05, "grad_norm": 19.140493392944336, "learning_rate": 1e-06, "loss": 0.3424, "mean_token_accuracy": 0.8893150687217712, "num_tokens": 719686987.0, "step": 18860 }, { "epoch": 2.3993130644956113, "ewc_loss": 0.033232707530260086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323270721011795e-05, "grad_norm": 19.153791427612305, "learning_rate": 1e-06, "loss": 0.3422, "mean_token_accuracy": 0.8934454917907715, "num_tokens": 719727974.0, "step": 18861 }, { "epoch": 2.399440274774202, "ewc_loss": 0.033182933926582336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318293238407932e-05, "grad_norm": 19.092191696166992, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8761041164398193, "num_tokens": 719764308.0, "step": 18862 }, { "epoch": 2.3995674850527924, "ewc_loss": 0.033164799213409424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.316480069770478e-05, "grad_norm": 19.08966827392578, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.864830732345581, "num_tokens": 719801872.0, "step": 18863 }, { "epoch": 2.399694695331383, "ewc_loss": 0.033201780170202255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3201780752278864e-05, "grad_norm": 19.115474700927734, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.876885712146759, "num_tokens": 719842504.0, "step": 18864 }, { "epoch": 2.3998219056099734, "ewc_loss": 0.033184800297021866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318479866720736e-05, "grad_norm": 19.129884719848633, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8691766262054443, "num_tokens": 719878885.0, "step": 18865 }, { "epoch": 2.399949115888564, "ewc_loss": 0.033236537128686905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323653800180182e-05, "grad_norm": 19.12958335876465, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8580226898193359, "num_tokens": 719922891.0, "step": 18866 }, { "epoch": 2.4000763261671545, "ewc_loss": 0.033237800002098083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323780038044788e-05, "grad_norm": 19.148563385009766, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8727588653564453, "num_tokens": 719960113.0, "step": 18867 }, { "epoch": 2.400203536445745, "ewc_loss": 0.03310931473970413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3109314244939014e-05, "grad_norm": 19.03388214111328, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8738052845001221, "num_tokens": 720006079.0, "step": 18868 }, { "epoch": 2.400330746724335, "ewc_loss": 0.03316624462604523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.316624497529119e-05, "grad_norm": 19.111852645874023, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8616591691970825, "num_tokens": 720042219.0, "step": 18869 }, { "epoch": 2.400457957002926, "ewc_loss": 0.033165737986564636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.316573929623701e-05, "grad_norm": 19.091808319091797, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8755018711090088, "num_tokens": 720082574.0, "step": 18870 }, { "epoch": 2.400585167281516, "ewc_loss": 0.033083416521549225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.308341547381133e-05, "grad_norm": 19.047014236450195, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8739538788795471, "num_tokens": 720115583.0, "step": 18871 }, { "epoch": 2.4007123775601067, "ewc_loss": 0.033184733241796494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318473318358883e-05, "grad_norm": 19.067453384399414, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8663674592971802, "num_tokens": 720151350.0, "step": 18872 }, { "epoch": 2.400839587838697, "ewc_loss": 0.0331927128136158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3192711271112785e-05, "grad_norm": 19.124513626098633, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.86688232421875, "num_tokens": 720187072.0, "step": 18873 }, { "epoch": 2.4009667981172877, "ewc_loss": 0.0331687405705452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.316874062875286e-05, "grad_norm": 19.046846389770508, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.877973198890686, "num_tokens": 720222249.0, "step": 18874 }, { "epoch": 2.4010940083958783, "ewc_loss": 0.033119481056928635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3119482395704836e-05, "grad_norm": 19.059354782104492, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8738707304000854, "num_tokens": 720264682.0, "step": 18875 }, { "epoch": 2.401221218674469, "ewc_loss": 0.0332321934401989, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323219425510615e-05, "grad_norm": 19.086902618408203, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8712917566299438, "num_tokens": 720309935.0, "step": 18876 }, { "epoch": 2.4013484289530593, "ewc_loss": 0.03321876749396324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3218766475329176e-05, "grad_norm": 19.152334213256836, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8694257736206055, "num_tokens": 720343992.0, "step": 18877 }, { "epoch": 2.40147563923165, "ewc_loss": 0.03317015990614891, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.317015944048762e-05, "grad_norm": 19.009723663330078, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8743268251419067, "num_tokens": 720379707.0, "step": 18878 }, { "epoch": 2.4016028495102404, "ewc_loss": 0.033159587532281876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3159587474074215e-05, "grad_norm": 19.15247917175293, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.871235728263855, "num_tokens": 720419796.0, "step": 18879 }, { "epoch": 2.401730059788831, "ewc_loss": 0.03322470188140869, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.322470365674235e-05, "grad_norm": 19.01138687133789, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8767102360725403, "num_tokens": 720460919.0, "step": 18880 }, { "epoch": 2.4018572700674214, "ewc_loss": 0.03317301720380783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.317301889183e-05, "grad_norm": 19.093887329101562, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8710809350013733, "num_tokens": 720498403.0, "step": 18881 }, { "epoch": 2.401984480346012, "ewc_loss": 0.033274825662374496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.327482409076765e-05, "grad_norm": 19.119739532470703, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8758653402328491, "num_tokens": 720534931.0, "step": 18882 }, { "epoch": 2.4021116906246025, "ewc_loss": 0.033196572214365005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319657116662711e-05, "grad_norm": 19.095989227294922, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8775319457054138, "num_tokens": 720574062.0, "step": 18883 }, { "epoch": 2.402238900903193, "ewc_loss": 0.033248044550418854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324804492876865e-05, "grad_norm": 19.09328842163086, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8664161562919617, "num_tokens": 720611552.0, "step": 18884 }, { "epoch": 2.4023661111817836, "ewc_loss": 0.03327270597219467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3272706787101924e-05, "grad_norm": 19.192529678344727, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8747348189353943, "num_tokens": 720647886.0, "step": 18885 }, { "epoch": 2.402493321460374, "ewc_loss": 0.03325142711400986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3251428249059245e-05, "grad_norm": 19.16401481628418, "learning_rate": 1e-06, "loss": 0.3662, "mean_token_accuracy": 0.879846453666687, "num_tokens": 720688034.0, "step": 18886 }, { "epoch": 2.4026205317389646, "ewc_loss": 0.033200185745954514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.320018731756136e-05, "grad_norm": 19.10623550415039, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8817360401153564, "num_tokens": 720727805.0, "step": 18887 }, { "epoch": 2.402747742017555, "ewc_loss": 0.033140406012535095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.314040441182442e-05, "grad_norm": 19.090774536132812, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.8779984712600708, "num_tokens": 720759717.0, "step": 18888 }, { "epoch": 2.4028749522961457, "ewc_loss": 0.03318788856267929, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318788731121458e-05, "grad_norm": 19.181493759155273, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8682328462600708, "num_tokens": 720790435.0, "step": 18889 }, { "epoch": 2.403002162574736, "ewc_loss": 0.033235736191272736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323573764646426e-05, "grad_norm": 19.111589431762695, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8706222772598267, "num_tokens": 720831715.0, "step": 18890 }, { "epoch": 2.4031293728533267, "ewc_loss": 0.033132344484329224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.313234628876671e-05, "grad_norm": 19.073701858520508, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8732994794845581, "num_tokens": 720869598.0, "step": 18891 }, { "epoch": 2.403256583131917, "ewc_loss": 0.03319929912686348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319929965073243e-05, "grad_norm": 19.06689453125, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.86994868516922, "num_tokens": 720911122.0, "step": 18892 }, { "epoch": 2.403383793410508, "ewc_loss": 0.0332123301923275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.321233089081943e-05, "grad_norm": 19.086835861206055, "learning_rate": 1e-06, "loss": 0.3789, "mean_token_accuracy": 0.8783156275749207, "num_tokens": 720952265.0, "step": 18893 }, { "epoch": 2.403511003689098, "ewc_loss": 0.03325580060482025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325580109958537e-05, "grad_norm": 19.150671005249023, "learning_rate": 1e-06, "loss": 0.3873, "mean_token_accuracy": 0.8783659934997559, "num_tokens": 720991191.0, "step": 18894 }, { "epoch": 2.4036382139676884, "ewc_loss": 0.033231545239686966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323154669487849e-05, "grad_norm": 19.10634994506836, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8841372728347778, "num_tokens": 721032524.0, "step": 18895 }, { "epoch": 2.403765424246279, "ewc_loss": 0.033210352063179016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.321035183034837e-05, "grad_norm": 19.15680503845215, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8745352029800415, "num_tokens": 721072699.0, "step": 18896 }, { "epoch": 2.4038926345248695, "ewc_loss": 0.03322458267211914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3224583603441715e-05, "grad_norm": 19.104312896728516, "learning_rate": 1e-06, "loss": 0.3479, "mean_token_accuracy": 0.8859754800796509, "num_tokens": 721109467.0, "step": 18897 }, { "epoch": 2.40401984480346, "ewc_loss": 0.03311796113848686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.311796172056347e-05, "grad_norm": 19.114614486694336, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8507952094078064, "num_tokens": 721142108.0, "step": 18898 }, { "epoch": 2.4041470550820505, "ewc_loss": 0.03318187966942787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318188100820407e-05, "grad_norm": 19.018146514892578, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8678728342056274, "num_tokens": 721179875.0, "step": 18899 }, { "epoch": 2.404274265360641, "ewc_loss": 0.03316274285316467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.316274160169996e-05, "grad_norm": 19.08019256591797, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8711614608764648, "num_tokens": 721217141.0, "step": 18900 }, { "epoch": 2.4044014756392316, "ewc_loss": 0.03328496217727661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3284963137703016e-05, "grad_norm": 19.071903228759766, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8686540126800537, "num_tokens": 721256357.0, "step": 18901 }, { "epoch": 2.404528685917822, "ewc_loss": 0.033221371471881866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.322137126815505e-05, "grad_norm": 19.10828971862793, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8745713233947754, "num_tokens": 721292610.0, "step": 18902 }, { "epoch": 2.4046558961964126, "ewc_loss": 0.03327168896794319, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.327168815303594e-05, "grad_norm": 19.11048698425293, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.880892813205719, "num_tokens": 721329048.0, "step": 18903 }, { "epoch": 2.404783106475003, "ewc_loss": 0.03320619463920593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.320619362057187e-05, "grad_norm": 19.094745635986328, "learning_rate": 1e-06, "loss": 0.3407, "mean_token_accuracy": 0.8894985318183899, "num_tokens": 721364249.0, "step": 18904 }, { "epoch": 2.4049103167535937, "ewc_loss": 0.033287856727838516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3287855330854654e-05, "grad_norm": 19.17451286315918, "learning_rate": 1e-06, "loss": 0.3668, "mean_token_accuracy": 0.8815792202949524, "num_tokens": 721391154.0, "step": 18905 }, { "epoch": 2.405037527032184, "ewc_loss": 0.033308226615190506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.330822801217437e-05, "grad_norm": 19.152687072753906, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.8844125270843506, "num_tokens": 721428443.0, "step": 18906 }, { "epoch": 2.4051647373107747, "ewc_loss": 0.03325692564249039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325692523503676e-05, "grad_norm": 19.049583435058594, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8648380041122437, "num_tokens": 721474545.0, "step": 18907 }, { "epoch": 2.4052919475893653, "ewc_loss": 0.03327008709311485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.327008744236082e-05, "grad_norm": 19.10008430480957, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8751149773597717, "num_tokens": 721513153.0, "step": 18908 }, { "epoch": 2.405419157867956, "ewc_loss": 0.033296842128038406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.329684113850817e-05, "grad_norm": 19.096071243286133, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8704551458358765, "num_tokens": 721556287.0, "step": 18909 }, { "epoch": 2.4055463681465463, "ewc_loss": 0.033245112746953964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3245112717850134e-05, "grad_norm": 19.062763214111328, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8809420466423035, "num_tokens": 721590219.0, "step": 18910 }, { "epoch": 2.405673578425137, "ewc_loss": 0.03325189650058746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325189754832536e-05, "grad_norm": 19.050912857055664, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8745007514953613, "num_tokens": 721627308.0, "step": 18911 }, { "epoch": 2.4058007887037274, "ewc_loss": 0.03330869972705841, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.330870094941929e-05, "grad_norm": 19.08847999572754, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8698879480361938, "num_tokens": 721664218.0, "step": 18912 }, { "epoch": 2.405927998982318, "ewc_loss": 0.0332847498357296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.32847484969534e-05, "grad_norm": 19.01365852355957, "learning_rate": 1e-06, "loss": 0.3654, "mean_token_accuracy": 0.8851112127304077, "num_tokens": 721703511.0, "step": 18913 }, { "epoch": 2.4060552092609084, "ewc_loss": 0.03336822986602783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.336822919663973e-05, "grad_norm": 19.16060447692871, "learning_rate": 1e-06, "loss": 0.3546, "mean_token_accuracy": 0.8866648077964783, "num_tokens": 721741512.0, "step": 18914 }, { "epoch": 2.406182419539499, "ewc_loss": 0.03330560028553009, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.330560139147565e-05, "grad_norm": 19.054874420166016, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8778406977653503, "num_tokens": 721778443.0, "step": 18915 }, { "epoch": 2.4063096298180895, "ewc_loss": 0.033226147294044495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3226147934328765e-05, "grad_norm": 19.039995193481445, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8715952634811401, "num_tokens": 721817226.0, "step": 18916 }, { "epoch": 2.4064368400966796, "ewc_loss": 0.03330419585108757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.330419713165611e-05, "grad_norm": 18.994159698486328, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8610222935676575, "num_tokens": 721859419.0, "step": 18917 }, { "epoch": 2.4065640503752705, "ewc_loss": 0.03329496830701828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.329496757942252e-05, "grad_norm": 19.06150245666504, "learning_rate": 1e-06, "loss": 0.3458, "mean_token_accuracy": 0.8875150680541992, "num_tokens": 721895183.0, "step": 18918 }, { "epoch": 2.4066912606538606, "ewc_loss": 0.03335365653038025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.335365545353852e-05, "grad_norm": 19.156789779663086, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8605730533599854, "num_tokens": 721933566.0, "step": 18919 }, { "epoch": 2.406818470932451, "ewc_loss": 0.03329450264573097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.329450191813521e-05, "grad_norm": 19.080947875976562, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.8779532313346863, "num_tokens": 721968827.0, "step": 18920 }, { "epoch": 2.4069456812110417, "ewc_loss": 0.03325667604804039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325667785247788e-05, "grad_norm": 19.084720611572266, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8768223524093628, "num_tokens": 722004989.0, "step": 18921 }, { "epoch": 2.407072891489632, "ewc_loss": 0.033310774713754654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.331077459733933e-05, "grad_norm": 19.026201248168945, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8775237202644348, "num_tokens": 722043681.0, "step": 18922 }, { "epoch": 2.4072001017682227, "ewc_loss": 0.033314298838377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3314299798803404e-05, "grad_norm": 19.09819221496582, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8739111423492432, "num_tokens": 722086541.0, "step": 18923 }, { "epoch": 2.4073273120468133, "ewc_loss": 0.03328069671988487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.32806957885623e-05, "grad_norm": 19.10031509399414, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8748136758804321, "num_tokens": 722125519.0, "step": 18924 }, { "epoch": 2.407454522325404, "ewc_loss": 0.033327143639326096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.332714186399244e-05, "grad_norm": 19.14151954650879, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8704010248184204, "num_tokens": 722157665.0, "step": 18925 }, { "epoch": 2.4075817326039943, "ewc_loss": 0.03326914459466934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.326914520584978e-05, "grad_norm": 19.092924118041992, "learning_rate": 1e-06, "loss": 0.3499, "mean_token_accuracy": 0.8877733945846558, "num_tokens": 722191819.0, "step": 18926 }, { "epoch": 2.407708942882585, "ewc_loss": 0.033286139369010925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.328613820485771e-05, "grad_norm": 19.171131134033203, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8640438318252563, "num_tokens": 722229755.0, "step": 18927 }, { "epoch": 2.4078361531611754, "ewc_loss": 0.033320605754852295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3320604416076094e-05, "grad_norm": 19.18425941467285, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8639811277389526, "num_tokens": 722267440.0, "step": 18928 }, { "epoch": 2.407963363439766, "ewc_loss": 0.033206403255462646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.320640462334268e-05, "grad_norm": 19.144695281982422, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8700714707374573, "num_tokens": 722310102.0, "step": 18929 }, { "epoch": 2.4080905737183564, "ewc_loss": 0.0332677848637104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.326778460177593e-05, "grad_norm": 19.116958618164062, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8679846525192261, "num_tokens": 722346713.0, "step": 18930 }, { "epoch": 2.408217783996947, "ewc_loss": 0.033195044845342636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319504321552813e-05, "grad_norm": 19.124149322509766, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8690060377120972, "num_tokens": 722387803.0, "step": 18931 }, { "epoch": 2.4083449942755375, "ewc_loss": 0.033238038420677185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323803684907034e-05, "grad_norm": 19.173559188842773, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8655714392662048, "num_tokens": 722428943.0, "step": 18932 }, { "epoch": 2.408472204554128, "ewc_loss": 0.03319060429930687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319060488138348e-05, "grad_norm": 19.019046783447266, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8678545951843262, "num_tokens": 722467860.0, "step": 18933 }, { "epoch": 2.4085994148327186, "ewc_loss": 0.033173009753227234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3173007977893576e-05, "grad_norm": 19.109540939331055, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8697909116744995, "num_tokens": 722503067.0, "step": 18934 }, { "epoch": 2.408726625111309, "ewc_loss": 0.03329408913850784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.329409082653001e-05, "grad_norm": 19.111106872558594, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8762467503547668, "num_tokens": 722537842.0, "step": 18935 }, { "epoch": 2.4088538353898996, "ewc_loss": 0.03320305794477463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3203057682840154e-05, "grad_norm": 19.084402084350586, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8800458908081055, "num_tokens": 722574036.0, "step": 18936 }, { "epoch": 2.40898104566849, "ewc_loss": 0.033246006816625595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324600766063668e-05, "grad_norm": 19.130847930908203, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8747565746307373, "num_tokens": 722606087.0, "step": 18937 }, { "epoch": 2.4091082559470807, "ewc_loss": 0.03319024667143822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3190248359460384e-05, "grad_norm": 19.045059204101562, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8734285235404968, "num_tokens": 722648745.0, "step": 18938 }, { "epoch": 2.409235466225671, "ewc_loss": 0.03328326344490051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3283264201600105e-05, "grad_norm": 19.11676597595215, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8663902282714844, "num_tokens": 722688589.0, "step": 18939 }, { "epoch": 2.4093626765042617, "ewc_loss": 0.03325599431991577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3255993912462145e-05, "grad_norm": 19.124895095825195, "learning_rate": 1e-06, "loss": 0.3426, "mean_token_accuracy": 0.8909188508987427, "num_tokens": 722723941.0, "step": 18940 }, { "epoch": 2.4094898867828523, "ewc_loss": 0.03319665789604187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319665847811848e-05, "grad_norm": 19.051830291748047, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8573029637336731, "num_tokens": 722768242.0, "step": 18941 }, { "epoch": 2.4096170970614423, "ewc_loss": 0.0332314595580101, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323145938338712e-05, "grad_norm": 19.253751754760742, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8757583498954773, "num_tokens": 722800970.0, "step": 18942 }, { "epoch": 2.4097443073400333, "ewc_loss": 0.03323864936828613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323864802950993e-05, "grad_norm": 19.0955867767334, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8655716180801392, "num_tokens": 722839168.0, "step": 18943 }, { "epoch": 2.4098715176186234, "ewc_loss": 0.03318348154425621, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318348171887919e-05, "grad_norm": 19.19989585876465, "learning_rate": 1e-06, "loss": 0.3581, "mean_token_accuracy": 0.8857022523880005, "num_tokens": 722883451.0, "step": 18944 }, { "epoch": 2.409998727897214, "ewc_loss": 0.03324417024850845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.32441704813391e-05, "grad_norm": 19.170734405517578, "learning_rate": 1e-06, "loss": 0.3873, "mean_token_accuracy": 0.8753026127815247, "num_tokens": 722928581.0, "step": 18945 }, { "epoch": 2.4101259381758044, "ewc_loss": 0.03316308930516243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3163090847665444e-05, "grad_norm": 19.139890670776367, "learning_rate": 1e-06, "loss": 0.3509, "mean_token_accuracy": 0.885821521282196, "num_tokens": 722967141.0, "step": 18946 }, { "epoch": 2.410253148454395, "ewc_loss": 0.033164381980895996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.316438233014196e-05, "grad_norm": 19.099239349365234, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8709723949432373, "num_tokens": 723006263.0, "step": 18947 }, { "epoch": 2.4103803587329855, "ewc_loss": 0.033169206231832504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3169206290040165e-05, "grad_norm": 19.248628616333008, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.877406895160675, "num_tokens": 723050843.0, "step": 18948 }, { "epoch": 2.410507569011576, "ewc_loss": 0.033218592405319214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3218591852346435e-05, "grad_norm": 19.107786178588867, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.874498724937439, "num_tokens": 723092124.0, "step": 18949 }, { "epoch": 2.4106347792901666, "ewc_loss": 0.03315148130059242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3151482057292014e-05, "grad_norm": 19.17369270324707, "learning_rate": 1e-06, "loss": 0.3596, "mean_token_accuracy": 0.885524332523346, "num_tokens": 723122478.0, "step": 18950 }, { "epoch": 2.410761989568757, "ewc_loss": 0.033253010362386703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325301076984033e-05, "grad_norm": 19.28253173828125, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8692022562026978, "num_tokens": 723163054.0, "step": 18951 }, { "epoch": 2.4108891998473476, "ewc_loss": 0.03315480425953865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.315480353194289e-05, "grad_norm": 19.15469741821289, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8741530179977417, "num_tokens": 723201518.0, "step": 18952 }, { "epoch": 2.411016410125938, "ewc_loss": 0.033102475106716156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.310247484478168e-05, "grad_norm": 19.160737991333008, "learning_rate": 1e-06, "loss": 0.3776, "mean_token_accuracy": 0.8747208118438721, "num_tokens": 723235685.0, "step": 18953 }, { "epoch": 2.4111436204045287, "ewc_loss": 0.03314509615302086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.314509740448557e-05, "grad_norm": 19.10789680480957, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8762491941452026, "num_tokens": 723266510.0, "step": 18954 }, { "epoch": 2.411270830683119, "ewc_loss": 0.03315718099474907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.315718277008273e-05, "grad_norm": 19.175966262817383, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8783506155014038, "num_tokens": 723302800.0, "step": 18955 }, { "epoch": 2.4113980409617097, "ewc_loss": 0.03317989408969879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.31798946717754e-05, "grad_norm": 19.021869659423828, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8771830797195435, "num_tokens": 723340669.0, "step": 18956 }, { "epoch": 2.4115252512403003, "ewc_loss": 0.03320462629199028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.320462565170601e-05, "grad_norm": 19.198144912719727, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8770270347595215, "num_tokens": 723377556.0, "step": 18957 }, { "epoch": 2.411652461518891, "ewc_loss": 0.033219337463378906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.321933763800189e-05, "grad_norm": 19.07798194885254, "learning_rate": 1e-06, "loss": 0.3707, "mean_token_accuracy": 0.8785006403923035, "num_tokens": 723410942.0, "step": 18958 }, { "epoch": 2.4117796717974813, "ewc_loss": 0.03314787521958351, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3147876820294186e-05, "grad_norm": 19.163251876831055, "learning_rate": 1e-06, "loss": 0.3511, "mean_token_accuracy": 0.8881561160087585, "num_tokens": 723451131.0, "step": 18959 }, { "epoch": 2.411906882076072, "ewc_loss": 0.033202774822711945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.32027739204932e-05, "grad_norm": 19.092039108276367, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8782886266708374, "num_tokens": 723485876.0, "step": 18960 }, { "epoch": 2.4120340923546624, "ewc_loss": 0.033234208822250366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.323420969536528e-05, "grad_norm": 19.1768741607666, "learning_rate": 1e-06, "loss": 0.3531, "mean_token_accuracy": 0.8831607699394226, "num_tokens": 723521252.0, "step": 18961 }, { "epoch": 2.412161302633253, "ewc_loss": 0.03325057029724121, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325056968606077e-05, "grad_norm": 19.187780380249023, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8595030307769775, "num_tokens": 723561094.0, "step": 18962 }, { "epoch": 2.4122885129118434, "ewc_loss": 0.03325958922505379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325958823552355e-05, "grad_norm": 19.137454986572266, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8729737401008606, "num_tokens": 723597281.0, "step": 18963 }, { "epoch": 2.412415723190434, "ewc_loss": 0.03325999528169632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3259995689149946e-05, "grad_norm": 19.189157485961914, "learning_rate": 1e-06, "loss": 0.3196, "mean_token_accuracy": 0.8974546194076538, "num_tokens": 723637690.0, "step": 18964 }, { "epoch": 2.4125429334690245, "ewc_loss": 0.03326704725623131, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.326704609207809e-05, "grad_norm": 19.1475830078125, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8762458562850952, "num_tokens": 723676250.0, "step": 18965 }, { "epoch": 2.412670143747615, "ewc_loss": 0.03319798782467842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319798634038307e-05, "grad_norm": 19.115310668945312, "learning_rate": 1e-06, "loss": 0.3616, "mean_token_accuracy": 0.8851941823959351, "num_tokens": 723713157.0, "step": 18966 }, { "epoch": 2.412797354026205, "ewc_loss": 0.033266548067331314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.326654768898152e-05, "grad_norm": 19.222183227539062, "learning_rate": 1e-06, "loss": 0.3776, "mean_token_accuracy": 0.8802344799041748, "num_tokens": 723745877.0, "step": 18967 }, { "epoch": 2.412924564304796, "ewc_loss": 0.03327740356326103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.327740341774188e-05, "grad_norm": 19.14385414123535, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8760412931442261, "num_tokens": 723784285.0, "step": 18968 }, { "epoch": 2.413051774583386, "ewc_loss": 0.033225689083337784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.322568954899907e-05, "grad_norm": 19.1051082611084, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8639189004898071, "num_tokens": 723817085.0, "step": 18969 }, { "epoch": 2.4131789848619767, "ewc_loss": 0.03327295184135437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3272950531682e-05, "grad_norm": 19.150911331176758, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8697662353515625, "num_tokens": 723858466.0, "step": 18970 }, { "epoch": 2.413306195140567, "ewc_loss": 0.03331073373556137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3310734579572454e-05, "grad_norm": 19.18902015686035, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8747595548629761, "num_tokens": 723896627.0, "step": 18971 }, { "epoch": 2.4134334054191577, "ewc_loss": 0.033244308084249496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3244308724533767e-05, "grad_norm": 19.086456298828125, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8702976107597351, "num_tokens": 723930727.0, "step": 18972 }, { "epoch": 2.4135606156977483, "ewc_loss": 0.033245932310819626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324593126308173e-05, "grad_norm": 19.12018585205078, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.864924967288971, "num_tokens": 723965996.0, "step": 18973 }, { "epoch": 2.413687825976339, "ewc_loss": 0.033276982605457306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.327698141220026e-05, "grad_norm": 19.1624755859375, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8697013258934021, "num_tokens": 724002441.0, "step": 18974 }, { "epoch": 2.4138150362549293, "ewc_loss": 0.03335060179233551, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.335060318931937e-05, "grad_norm": 19.158544540405273, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8792861700057983, "num_tokens": 724039238.0, "step": 18975 }, { "epoch": 2.41394224653352, "ewc_loss": 0.033269062638282776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.326906153233722e-05, "grad_norm": 19.06272315979004, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8799606561660767, "num_tokens": 724077430.0, "step": 18976 }, { "epoch": 2.4140694568121104, "ewc_loss": 0.03327031061053276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.327030935906805e-05, "grad_norm": 19.14434051513672, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8705512881278992, "num_tokens": 724113175.0, "step": 18977 }, { "epoch": 2.414196667090701, "ewc_loss": 0.03330035135149956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3300351788057014e-05, "grad_norm": 19.072370529174805, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.869629979133606, "num_tokens": 724156409.0, "step": 18978 }, { "epoch": 2.4143238773692914, "ewc_loss": 0.033311422914266586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3311422157566994e-05, "grad_norm": 19.099767684936523, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8748147487640381, "num_tokens": 724199879.0, "step": 18979 }, { "epoch": 2.414451087647882, "ewc_loss": 0.03331170976161957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3311709557892755e-05, "grad_norm": 19.19999122619629, "learning_rate": 1e-06, "loss": 0.346, "mean_token_accuracy": 0.8879765868186951, "num_tokens": 724238065.0, "step": 18980 }, { "epoch": 2.4145782979264725, "ewc_loss": 0.0333196297287941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.331962943775579e-05, "grad_norm": 19.13437843322754, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8687600493431091, "num_tokens": 724272336.0, "step": 18981 }, { "epoch": 2.414705508205063, "ewc_loss": 0.03324047103524208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3240470656892285e-05, "grad_norm": 19.169696807861328, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8661338090896606, "num_tokens": 724305200.0, "step": 18982 }, { "epoch": 2.4148327184836536, "ewc_loss": 0.03333209827542305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.33320967911277e-05, "grad_norm": 19.18776512145996, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8795197010040283, "num_tokens": 724341295.0, "step": 18983 }, { "epoch": 2.414959928762244, "ewc_loss": 0.033197738230228424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3197738957824185e-05, "grad_norm": 19.025548934936523, "learning_rate": 1e-06, "loss": 0.3844, "mean_token_accuracy": 0.8777168989181519, "num_tokens": 724384336.0, "step": 18984 }, { "epoch": 2.4150871390408346, "ewc_loss": 0.03328303247690201, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.328303137095645e-05, "grad_norm": 19.24262046813965, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8711000084877014, "num_tokens": 724417279.0, "step": 18985 }, { "epoch": 2.415214349319425, "ewc_loss": 0.03331673517823219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.331673360662535e-05, "grad_norm": 19.113216400146484, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8737040758132935, "num_tokens": 724456509.0, "step": 18986 }, { "epoch": 2.4153415595980157, "ewc_loss": 0.0332147553563118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3214753784704953e-05, "grad_norm": 19.16714096069336, "learning_rate": 1e-06, "loss": 0.3568, "mean_token_accuracy": 0.8852875232696533, "num_tokens": 724498812.0, "step": 18987 }, { "epoch": 2.415468769876606, "ewc_loss": 0.0333421565592289, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3342155802529305e-05, "grad_norm": 19.149858474731445, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8629698157310486, "num_tokens": 724538174.0, "step": 18988 }, { "epoch": 2.4155959801551967, "ewc_loss": 0.03326825797557831, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.326825753902085e-05, "grad_norm": 19.169513702392578, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.880128026008606, "num_tokens": 724577171.0, "step": 18989 }, { "epoch": 2.415723190433787, "ewc_loss": 0.03331819921731949, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3318199712084606e-05, "grad_norm": 19.164165496826172, "learning_rate": 1e-06, "loss": 0.3261, "mean_token_accuracy": 0.8965623378753662, "num_tokens": 724610626.0, "step": 18990 }, { "epoch": 2.4158504007123778, "ewc_loss": 0.033292144536972046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3292144507868215e-05, "grad_norm": 19.167144775390625, "learning_rate": 1e-06, "loss": 0.3545, "mean_token_accuracy": 0.8869222402572632, "num_tokens": 724647613.0, "step": 18991 }, { "epoch": 2.415977610990968, "ewc_loss": 0.03322478383779526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3224783692276105e-05, "grad_norm": 19.14388084411621, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8769490718841553, "num_tokens": 724684898.0, "step": 18992 }, { "epoch": 2.4161048212695584, "ewc_loss": 0.03328942507505417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.328942693769932e-05, "grad_norm": 19.09458351135254, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.87056964635849, "num_tokens": 724725566.0, "step": 18993 }, { "epoch": 2.416232031548149, "ewc_loss": 0.033248115330934525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3248114050365984e-05, "grad_norm": 19.103933334350586, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8655256032943726, "num_tokens": 724763701.0, "step": 18994 }, { "epoch": 2.4163592418267394, "ewc_loss": 0.03327651694417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.327651575091295e-05, "grad_norm": 19.150712966918945, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.874079704284668, "num_tokens": 724798108.0, "step": 18995 }, { "epoch": 2.41648645210533, "ewc_loss": 0.03328058868646622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.328059028717689e-05, "grad_norm": 19.12697410583496, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8548041582107544, "num_tokens": 724842430.0, "step": 18996 }, { "epoch": 2.4166136623839205, "ewc_loss": 0.03326351195573807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3263513614656404e-05, "grad_norm": 19.130355834960938, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8699382543563843, "num_tokens": 724884585.0, "step": 18997 }, { "epoch": 2.416740872662511, "ewc_loss": 0.033292096108198166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.329209721414372e-05, "grad_norm": 19.181699752807617, "learning_rate": 1e-06, "loss": 0.3562, "mean_token_accuracy": 0.8853042721748352, "num_tokens": 724919338.0, "step": 18998 }, { "epoch": 2.4168680829411016, "ewc_loss": 0.033214665949344635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.321466647321358e-05, "grad_norm": 19.1072998046875, "learning_rate": 1e-06, "loss": 0.3296, "mean_token_accuracy": 0.8962771892547607, "num_tokens": 724956007.0, "step": 18999 }, { "epoch": 2.416995293219692, "ewc_loss": 0.03327269107103348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3272692235186696e-05, "grad_norm": 19.145259857177734, "learning_rate": 1e-06, "loss": 0.3422, "mean_token_accuracy": 0.8912962675094604, "num_tokens": 724996994.0, "step": 19000 }, { "epoch": 2.4171225034982826, "ewc_loss": 0.03329912945628166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.329912942717783e-05, "grad_norm": 19.132762908935547, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8697108030319214, "num_tokens": 725035607.0, "step": 19001 }, { "epoch": 2.417249713776873, "ewc_loss": 0.033303402364254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3303404052276164e-05, "grad_norm": 19.133705139160156, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8783718347549438, "num_tokens": 725075370.0, "step": 19002 }, { "epoch": 2.4173769240554637, "ewc_loss": 0.03328413516283035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3284133678535e-05, "grad_norm": 19.069448471069336, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8729570508003235, "num_tokens": 725119536.0, "step": 19003 }, { "epoch": 2.417504134334054, "ewc_loss": 0.03319008648395538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3190084650414065e-05, "grad_norm": 18.957538604736328, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8741636872291565, "num_tokens": 725158048.0, "step": 19004 }, { "epoch": 2.4176313446126447, "ewc_loss": 0.03330645710229874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3306456316495314e-05, "grad_norm": 19.247194290161133, "learning_rate": 1e-06, "loss": 0.3654, "mean_token_accuracy": 0.8833191990852356, "num_tokens": 725196343.0, "step": 19005 }, { "epoch": 2.4177585548912353, "ewc_loss": 0.03332545608282089, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3325457479804754e-05, "grad_norm": 19.0833740234375, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8747571706771851, "num_tokens": 725234613.0, "step": 19006 }, { "epoch": 2.417885765169826, "ewc_loss": 0.03318313509225845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.318313611089252e-05, "grad_norm": 19.095401763916016, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8747738599777222, "num_tokens": 725272129.0, "step": 19007 }, { "epoch": 2.4180129754484163, "ewc_loss": 0.033320099115371704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.332009873702191e-05, "grad_norm": 19.190690994262695, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.8661634922027588, "num_tokens": 725305740.0, "step": 19008 }, { "epoch": 2.418140185727007, "ewc_loss": 0.033240966498851776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324096542201005e-05, "grad_norm": 19.07324981689453, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8715177774429321, "num_tokens": 725340579.0, "step": 19009 }, { "epoch": 2.4182673960055974, "ewc_loss": 0.033198196440935135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319819734315388e-05, "grad_norm": 19.098600387573242, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8778449296951294, "num_tokens": 725380492.0, "step": 19010 }, { "epoch": 2.418394606284188, "ewc_loss": 0.033323533833026886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.33235329890158e-05, "grad_norm": 19.123836517333984, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8773521780967712, "num_tokens": 725418043.0, "step": 19011 }, { "epoch": 2.4185218165627784, "ewc_loss": 0.03322553634643555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.322553675388917e-05, "grad_norm": 19.123443603515625, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8671064376831055, "num_tokens": 725456260.0, "step": 19012 }, { "epoch": 2.418649026841369, "ewc_loss": 0.03334258869290352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.334258872200735e-05, "grad_norm": 19.094797134399414, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8719874024391174, "num_tokens": 725496605.0, "step": 19013 }, { "epoch": 2.4187762371199595, "ewc_loss": 0.03321649506688118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.321649637655355e-05, "grad_norm": 19.062191009521484, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8778623342514038, "num_tokens": 725532395.0, "step": 19014 }, { "epoch": 2.4189034473985496, "ewc_loss": 0.03330669552087784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.330669642309658e-05, "grad_norm": 19.186885833740234, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8699938058853149, "num_tokens": 725575335.0, "step": 19015 }, { "epoch": 2.4190306576771405, "ewc_loss": 0.03327103704214096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.327103695482947e-05, "grad_norm": 19.12189483642578, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8813662528991699, "num_tokens": 725612345.0, "step": 19016 }, { "epoch": 2.4191578679557306, "ewc_loss": 0.033297546207904816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.329754690639675e-05, "grad_norm": 19.155790328979492, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8783435821533203, "num_tokens": 725645566.0, "step": 19017 }, { "epoch": 2.419285078234321, "ewc_loss": 0.03326502814888954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3265027013840154e-05, "grad_norm": 19.075544357299805, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8694111704826355, "num_tokens": 725684218.0, "step": 19018 }, { "epoch": 2.4194122885129117, "ewc_loss": 0.03325175866484642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325175930513069e-05, "grad_norm": 19.198713302612305, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8781318068504333, "num_tokens": 725718478.0, "step": 19019 }, { "epoch": 2.419539498791502, "ewc_loss": 0.033324722200632095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.332472260808572e-05, "grad_norm": 19.1008243560791, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8676629662513733, "num_tokens": 725756208.0, "step": 19020 }, { "epoch": 2.4196667090700927, "ewc_loss": 0.033203285187482834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3203286875505e-05, "grad_norm": 19.107654571533203, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8699164390563965, "num_tokens": 725791884.0, "step": 19021 }, { "epoch": 2.4197939193486833, "ewc_loss": 0.03330433741211891, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.330433901282959e-05, "grad_norm": 19.05608558654785, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.878518283367157, "num_tokens": 725829138.0, "step": 19022 }, { "epoch": 2.419921129627274, "ewc_loss": 0.03325818479061127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3258183975704014e-05, "grad_norm": 19.15175437927246, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8759644031524658, "num_tokens": 725868301.0, "step": 19023 }, { "epoch": 2.4200483399058643, "ewc_loss": 0.03330153226852417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.330153413116932e-05, "grad_norm": 19.063796997070312, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8652776479721069, "num_tokens": 725906186.0, "step": 19024 }, { "epoch": 2.420175550184455, "ewc_loss": 0.03324238210916519, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324238059576601e-05, "grad_norm": 19.023122787475586, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8632751703262329, "num_tokens": 725947006.0, "step": 19025 }, { "epoch": 2.4203027604630454, "ewc_loss": 0.03330959007143974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.330958861624822e-05, "grad_norm": 19.10796546936035, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8790764808654785, "num_tokens": 725987363.0, "step": 19026 }, { "epoch": 2.420429970741636, "ewc_loss": 0.0333934910595417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3393491321476176e-05, "grad_norm": 19.19129180908203, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8665288686752319, "num_tokens": 726026176.0, "step": 19027 }, { "epoch": 2.4205571810202264, "ewc_loss": 0.03333473578095436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.333473432576284e-05, "grad_norm": 19.09669303894043, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8730542659759521, "num_tokens": 726068124.0, "step": 19028 }, { "epoch": 2.420684391298817, "ewc_loss": 0.033295974135398865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.329597529955208e-05, "grad_norm": 19.164569854736328, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8659451007843018, "num_tokens": 726110270.0, "step": 19029 }, { "epoch": 2.4208116015774075, "ewc_loss": 0.03334151208400726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.334151188028045e-05, "grad_norm": 19.135438919067383, "learning_rate": 1e-06, "loss": 0.3402, "mean_token_accuracy": 0.8903699517250061, "num_tokens": 726143908.0, "step": 19030 }, { "epoch": 2.420938811855998, "ewc_loss": 0.03329089283943176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3290893043158576e-05, "grad_norm": 19.166635513305664, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.872812807559967, "num_tokens": 726179082.0, "step": 19031 }, { "epoch": 2.4210660221345885, "ewc_loss": 0.033371370285749435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.337136877235025e-05, "grad_norm": 19.170917510986328, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.869498610496521, "num_tokens": 726216387.0, "step": 19032 }, { "epoch": 2.421193232413179, "ewc_loss": 0.03329605981707573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3296058973064646e-05, "grad_norm": 19.199460983276367, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8841917514801025, "num_tokens": 726258831.0, "step": 19033 }, { "epoch": 2.4213204426917696, "ewc_loss": 0.03328213095664978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.328212915221229e-05, "grad_norm": 19.072935104370117, "learning_rate": 1e-06, "loss": 0.3799, "mean_token_accuracy": 0.8766241073608398, "num_tokens": 726301929.0, "step": 19034 }, { "epoch": 2.42144765297036, "ewc_loss": 0.03325163573026657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.325163561385125e-05, "grad_norm": 19.11501121520996, "learning_rate": 1e-06, "loss": 0.3761, "mean_token_accuracy": 0.8780386447906494, "num_tokens": 726339938.0, "step": 19035 }, { "epoch": 2.4215748632489507, "ewc_loss": 0.033340729773044586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3340729714836925e-05, "grad_norm": 19.186559677124023, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8760738372802734, "num_tokens": 726377197.0, "step": 19036 }, { "epoch": 2.421702073527541, "ewc_loss": 0.03326913341879845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.326913429191336e-05, "grad_norm": 19.116239547729492, "learning_rate": 1e-06, "loss": 0.3822, "mean_token_accuracy": 0.8790081143379211, "num_tokens": 726417591.0, "step": 19037 }, { "epoch": 2.4218292838061317, "ewc_loss": 0.03324934467673302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324934368720278e-05, "grad_norm": 19.12821388244629, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.865411102771759, "num_tokens": 726456094.0, "step": 19038 }, { "epoch": 2.4219564940847222, "ewc_loss": 0.03330305218696594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3303051168331876e-05, "grad_norm": 19.086502075195312, "learning_rate": 1e-06, "loss": 0.3642, "mean_token_accuracy": 0.8833786249160767, "num_tokens": 726491158.0, "step": 19039 }, { "epoch": 2.4220837043633123, "ewc_loss": 0.03330107405781746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3301075745839626e-05, "grad_norm": 19.123077392578125, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8616883754730225, "num_tokens": 726531334.0, "step": 19040 }, { "epoch": 2.4222109146419033, "ewc_loss": 0.033313799649477005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.331380139570683e-05, "grad_norm": 19.097082138061523, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8656662106513977, "num_tokens": 726571999.0, "step": 19041 }, { "epoch": 2.4223381249204934, "ewc_loss": 0.033297471702098846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.32974705088418e-05, "grad_norm": 19.199092864990234, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8655231595039368, "num_tokens": 726604945.0, "step": 19042 }, { "epoch": 2.422465335199084, "ewc_loss": 0.03326023370027542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.326023215777241e-05, "grad_norm": 19.084243774414062, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.866495668888092, "num_tokens": 726645023.0, "step": 19043 }, { "epoch": 2.4225925454776744, "ewc_loss": 0.03330300375819206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3303003874607384e-05, "grad_norm": 19.211137771606445, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8735471367835999, "num_tokens": 726684297.0, "step": 19044 }, { "epoch": 2.422719755756265, "ewc_loss": 0.03326960280537605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3269603591179475e-05, "grad_norm": 19.08260154724121, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8639748692512512, "num_tokens": 726726950.0, "step": 19045 }, { "epoch": 2.4228469660348555, "ewc_loss": 0.03325740993022919, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3257409086218104e-05, "grad_norm": 19.131052017211914, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8560404777526855, "num_tokens": 726761944.0, "step": 19046 }, { "epoch": 2.422974176313446, "ewc_loss": 0.03331653028726578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.331652987981215e-05, "grad_norm": 19.07909393310547, "learning_rate": 1e-06, "loss": 0.3557, "mean_token_accuracy": 0.8836400508880615, "num_tokens": 726799041.0, "step": 19047 }, { "epoch": 2.4231013865920366, "ewc_loss": 0.03330269455909729, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.330269464640878e-05, "grad_norm": 19.08956527709961, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8691016435623169, "num_tokens": 726838160.0, "step": 19048 }, { "epoch": 2.423228596870627, "ewc_loss": 0.033334989100694656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3334988984279335e-05, "grad_norm": 19.07732391357422, "learning_rate": 1e-06, "loss": 0.3615, "mean_token_accuracy": 0.8842840790748596, "num_tokens": 726873150.0, "step": 19049 }, { "epoch": 2.4233558071492176, "ewc_loss": 0.033338453620672226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3338452340103686e-05, "grad_norm": 19.10563087463379, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8775895237922668, "num_tokens": 726905253.0, "step": 19050 }, { "epoch": 2.423483017427808, "ewc_loss": 0.03336773067712784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.336773079354316e-05, "grad_norm": 19.136350631713867, "learning_rate": 1e-06, "loss": 0.3578, "mean_token_accuracy": 0.8846397995948792, "num_tokens": 726942842.0, "step": 19051 }, { "epoch": 2.4236102277063987, "ewc_loss": 0.03338003158569336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.338003079988994e-05, "grad_norm": 19.126527786254883, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.8790066242218018, "num_tokens": 726987974.0, "step": 19052 }, { "epoch": 2.423737437984989, "ewc_loss": 0.03329711779952049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.329711762489751e-05, "grad_norm": 19.111873626708984, "learning_rate": 1e-06, "loss": 0.3385, "mean_token_accuracy": 0.8927125930786133, "num_tokens": 727021614.0, "step": 19053 }, { "epoch": 2.4238646482635797, "ewc_loss": 0.03337204083800316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3372041798429564e-05, "grad_norm": 19.12097930908203, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8728130459785461, "num_tokens": 727061773.0, "step": 19054 }, { "epoch": 2.4239918585421703, "ewc_loss": 0.033309925347566605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.330992694827728e-05, "grad_norm": 19.146484375, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8695043325424194, "num_tokens": 727095877.0, "step": 19055 }, { "epoch": 2.424119068820761, "ewc_loss": 0.03339194133877754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3391941542504355e-05, "grad_norm": 19.167491912841797, "learning_rate": 1e-06, "loss": 0.3723, "mean_token_accuracy": 0.8781321048736572, "num_tokens": 727128676.0, "step": 19056 }, { "epoch": 2.4242462790993513, "ewc_loss": 0.03333672881126404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3336727938149124e-05, "grad_norm": 19.098241806030273, "learning_rate": 1e-06, "loss": 0.3527, "mean_token_accuracy": 0.8839716911315918, "num_tokens": 727164528.0, "step": 19057 }, { "epoch": 2.424373489377942, "ewc_loss": 0.03331902250647545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.331902189529501e-05, "grad_norm": 19.11564064025879, "learning_rate": 1e-06, "loss": 0.3602, "mean_token_accuracy": 0.8799729347229004, "num_tokens": 727202403.0, "step": 19058 }, { "epoch": 2.4245006996565324, "ewc_loss": 0.033330921083688736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3330921723973006e-05, "grad_norm": 19.071317672729492, "learning_rate": 1e-06, "loss": 0.3658, "mean_token_accuracy": 0.8777555227279663, "num_tokens": 727242231.0, "step": 19059 }, { "epoch": 2.424627909935123, "ewc_loss": 0.03333953768014908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.33395364577882e-05, "grad_norm": 19.149381637573242, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8625108003616333, "num_tokens": 727276371.0, "step": 19060 }, { "epoch": 2.4247551202137134, "ewc_loss": 0.033341605216264725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3341606467729434e-05, "grad_norm": 19.06415557861328, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8778952360153198, "num_tokens": 727320101.0, "step": 19061 }, { "epoch": 2.424882330492304, "ewc_loss": 0.03332318365573883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.332318374305032e-05, "grad_norm": 19.128894805908203, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.865851640701294, "num_tokens": 727363687.0, "step": 19062 }, { "epoch": 2.4250095407708945, "ewc_loss": 0.03334331139922142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.334331267978996e-05, "grad_norm": 19.127859115600586, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.8802029490470886, "num_tokens": 727399772.0, "step": 19063 }, { "epoch": 2.425136751049485, "ewc_loss": 0.03333733230829239, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.33373318426311e-05, "grad_norm": 19.114282608032227, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8823505640029907, "num_tokens": 727442944.0, "step": 19064 }, { "epoch": 2.425263961328075, "ewc_loss": 0.03333079814910889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3330798032693565e-05, "grad_norm": 19.092754364013672, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.88318932056427, "num_tokens": 727483283.0, "step": 19065 }, { "epoch": 2.425391171606666, "ewc_loss": 0.03330087661743164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3300875657005236e-05, "grad_norm": 19.138437271118164, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8731099367141724, "num_tokens": 727526316.0, "step": 19066 }, { "epoch": 2.425518381885256, "ewc_loss": 0.03329062461853027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.329062383272685e-05, "grad_norm": 19.087818145751953, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8817834258079529, "num_tokens": 727563218.0, "step": 19067 }, { "epoch": 2.4256455921638467, "ewc_loss": 0.03328773379325867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.328773527755402e-05, "grad_norm": 19.171297073364258, "learning_rate": 1e-06, "loss": 0.3581, "mean_token_accuracy": 0.887581467628479, "num_tokens": 727594877.0, "step": 19068 }, { "epoch": 2.425772802442437, "ewc_loss": 0.03334781154990196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.334781285957433e-05, "grad_norm": 19.18388557434082, "learning_rate": 1e-06, "loss": 0.3482, "mean_token_accuracy": 0.8882447481155396, "num_tokens": 727627561.0, "step": 19069 }, { "epoch": 2.4259000127210277, "ewc_loss": 0.03324674442410469, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324674617033452e-05, "grad_norm": 19.029651641845703, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8657031059265137, "num_tokens": 727664384.0, "step": 19070 }, { "epoch": 2.4260272229996183, "ewc_loss": 0.033338677138090134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3338677894789726e-05, "grad_norm": 19.232614517211914, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8713870048522949, "num_tokens": 727700866.0, "step": 19071 }, { "epoch": 2.426154433278209, "ewc_loss": 0.033367931842803955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.336793088237755e-05, "grad_norm": 19.163549423217773, "learning_rate": 1e-06, "loss": 0.3446, "mean_token_accuracy": 0.8917200565338135, "num_tokens": 727741539.0, "step": 19072 }, { "epoch": 2.4262816435567993, "ewc_loss": 0.03326643258333206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.326643127365969e-05, "grad_norm": 19.234392166137695, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8750492930412292, "num_tokens": 727777828.0, "step": 19073 }, { "epoch": 2.42640885383539, "ewc_loss": 0.0332670696079731, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.326706791995093e-05, "grad_norm": 19.043663024902344, "learning_rate": 1e-06, "loss": 0.3593, "mean_token_accuracy": 0.8826098442077637, "num_tokens": 727818979.0, "step": 19074 }, { "epoch": 2.4265360641139804, "ewc_loss": 0.03318240866065025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.31824085151311e-05, "grad_norm": 19.095989227294922, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8760936260223389, "num_tokens": 727856440.0, "step": 19075 }, { "epoch": 2.426663274392571, "ewc_loss": 0.033357083797454834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.33570824295748e-05, "grad_norm": 19.169878005981445, "learning_rate": 1e-06, "loss": 0.3624, "mean_token_accuracy": 0.8831526041030884, "num_tokens": 727895036.0, "step": 19076 }, { "epoch": 2.4267904846711614, "ewc_loss": 0.033270131796598434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3270131098106503e-05, "grad_norm": 19.108144760131836, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8648830056190491, "num_tokens": 727932292.0, "step": 19077 }, { "epoch": 2.426917694949752, "ewc_loss": 0.033320412039756775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.332041160319932e-05, "grad_norm": 19.072168350219727, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8755737543106079, "num_tokens": 727967005.0, "step": 19078 }, { "epoch": 2.4270449052283425, "ewc_loss": 0.03323950245976448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.32395029545296e-05, "grad_norm": 19.031959533691406, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8664731979370117, "num_tokens": 728004262.0, "step": 19079 }, { "epoch": 2.427172115506933, "ewc_loss": 0.03338209539651871, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3382097171852365e-05, "grad_norm": 19.19679069519043, "learning_rate": 1e-06, "loss": 0.3648, "mean_token_accuracy": 0.8854286670684814, "num_tokens": 728041451.0, "step": 19080 }, { "epoch": 2.4272993257855235, "ewc_loss": 0.03333401679992676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.333401764393784e-05, "grad_norm": 19.026065826416016, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8735339641571045, "num_tokens": 728079217.0, "step": 19081 }, { "epoch": 2.427426536064114, "ewc_loss": 0.03332551568746567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.332551568746567e-05, "grad_norm": 19.163192749023438, "learning_rate": 1e-06, "loss": 0.3305, "mean_token_accuracy": 0.8901723027229309, "num_tokens": 728116221.0, "step": 19082 }, { "epoch": 2.4275537463427046, "ewc_loss": 0.033417511731386185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.34175128955394e-05, "grad_norm": 19.082006454467773, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8675695657730103, "num_tokens": 728154861.0, "step": 19083 }, { "epoch": 2.427680956621295, "ewc_loss": 0.03329964354634285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.329964238218963e-05, "grad_norm": 19.183135986328125, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8644617795944214, "num_tokens": 728188276.0, "step": 19084 }, { "epoch": 2.4278081668998857, "ewc_loss": 0.03339259326457977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3392592740710825e-05, "grad_norm": 19.164945602416992, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.8861420154571533, "num_tokens": 728229313.0, "step": 19085 }, { "epoch": 2.427935377178476, "ewc_loss": 0.033339183777570724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.333918357384391e-05, "grad_norm": 19.184450149536133, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8644832372665405, "num_tokens": 728268706.0, "step": 19086 }, { "epoch": 2.4280625874570667, "ewc_loss": 0.03333849459886551, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.333849599584937e-05, "grad_norm": 19.133647918701172, "learning_rate": 1e-06, "loss": 0.3446, "mean_token_accuracy": 0.8886334300041199, "num_tokens": 728297490.0, "step": 19087 }, { "epoch": 2.428189797735657, "ewc_loss": 0.03328179568052292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.328179445816204e-05, "grad_norm": 19.118934631347656, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8672388792037964, "num_tokens": 728336041.0, "step": 19088 }, { "epoch": 2.4283170080142478, "ewc_loss": 0.033301904797554016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.330190520500764e-05, "grad_norm": 19.048847198486328, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8825933933258057, "num_tokens": 728372701.0, "step": 19089 }, { "epoch": 2.428444218292838, "ewc_loss": 0.03330633416771889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.330633262521587e-05, "grad_norm": 19.135543823242188, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8781846165657043, "num_tokens": 728408138.0, "step": 19090 }, { "epoch": 2.4285714285714284, "ewc_loss": 0.033375248312950134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.337524685775861e-05, "grad_norm": 19.111066818237305, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.882128119468689, "num_tokens": 728452649.0, "step": 19091 }, { "epoch": 2.428698638850019, "ewc_loss": 0.033326223492622375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.332622509333305e-05, "grad_norm": 19.06847381591797, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8655037879943848, "num_tokens": 728490551.0, "step": 19092 }, { "epoch": 2.4288258491286094, "ewc_loss": 0.03336197882890701, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.336197914904915e-05, "grad_norm": 19.19232177734375, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.882187008857727, "num_tokens": 728529558.0, "step": 19093 }, { "epoch": 2.4289530594072, "ewc_loss": 0.033382076770067215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.338207534397952e-05, "grad_norm": 19.050151824951172, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8516299724578857, "num_tokens": 728573495.0, "step": 19094 }, { "epoch": 2.4290802696857905, "ewc_loss": 0.03329090774059296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3290907595073804e-05, "grad_norm": 19.127153396606445, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8760784864425659, "num_tokens": 728609166.0, "step": 19095 }, { "epoch": 2.429207479964381, "ewc_loss": 0.033428311347961426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.342831041663885e-05, "grad_norm": 19.099706649780273, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8823164701461792, "num_tokens": 728653070.0, "step": 19096 }, { "epoch": 2.4293346902429716, "ewc_loss": 0.0333271324634552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3327130950056016e-05, "grad_norm": 19.150537490844727, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8779805898666382, "num_tokens": 728689142.0, "step": 19097 }, { "epoch": 2.429461900521562, "ewc_loss": 0.03335411474108696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3354113838868216e-05, "grad_norm": 19.156396865844727, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8800944089889526, "num_tokens": 728725615.0, "step": 19098 }, { "epoch": 2.4295891108001526, "ewc_loss": 0.033325113356113434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.332511187181808e-05, "grad_norm": 19.095455169677734, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8657578825950623, "num_tokens": 728762366.0, "step": 19099 }, { "epoch": 2.429716321078743, "ewc_loss": 0.03326920419931412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3269203413510695e-05, "grad_norm": 19.042253494262695, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8631574511528015, "num_tokens": 728801951.0, "step": 19100 }, { "epoch": 2.4298435313573337, "ewc_loss": 0.03334381431341171, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.334381472086534e-05, "grad_norm": 19.221216201782227, "learning_rate": 1e-06, "loss": 0.3609, "mean_token_accuracy": 0.8835709095001221, "num_tokens": 728836403.0, "step": 19101 }, { "epoch": 2.429970741635924, "ewc_loss": 0.0333293192088604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3329321013297886e-05, "grad_norm": 19.064838409423828, "learning_rate": 1e-06, "loss": 0.3738, "mean_token_accuracy": 0.882441520690918, "num_tokens": 728878485.0, "step": 19102 }, { "epoch": 2.4300979519145147, "ewc_loss": 0.033307995647192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3307995181530714e-05, "grad_norm": 19.184295654296875, "learning_rate": 1e-06, "loss": 0.3527, "mean_token_accuracy": 0.8836672306060791, "num_tokens": 728915935.0, "step": 19103 }, { "epoch": 2.4302251621931052, "ewc_loss": 0.0333629809319973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.33629795932211e-05, "grad_norm": 19.135623931884766, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8588482141494751, "num_tokens": 728956847.0, "step": 19104 }, { "epoch": 2.4303523724716958, "ewc_loss": 0.033296890556812286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.329689207021147e-05, "grad_norm": 19.112720489501953, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8807827234268188, "num_tokens": 728994865.0, "step": 19105 }, { "epoch": 2.4304795827502863, "ewc_loss": 0.033362098038196564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.336209920234978e-05, "grad_norm": 19.137290954589844, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8691073060035706, "num_tokens": 729033030.0, "step": 19106 }, { "epoch": 2.430606793028877, "ewc_loss": 0.03331770375370979, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.331770494696684e-05, "grad_norm": 19.103605270385742, "learning_rate": 1e-06, "loss": 0.3671, "mean_token_accuracy": 0.8848962187767029, "num_tokens": 729070430.0, "step": 19107 }, { "epoch": 2.4307340033074674, "ewc_loss": 0.03329882398247719, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3298823836958036e-05, "grad_norm": 19.154193878173828, "learning_rate": 1e-06, "loss": 0.3648, "mean_token_accuracy": 0.8850805163383484, "num_tokens": 729102127.0, "step": 19108 }, { "epoch": 2.430861213586058, "ewc_loss": 0.03336961939930916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.336961890454404e-05, "grad_norm": 19.108592987060547, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8783215880393982, "num_tokens": 729138128.0, "step": 19109 }, { "epoch": 2.4309884238646484, "ewc_loss": 0.0333297997713089, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.332980122650042e-05, "grad_norm": 19.137958526611328, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8723750114440918, "num_tokens": 729179756.0, "step": 19110 }, { "epoch": 2.431115634143239, "ewc_loss": 0.03334483504295349, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3344833354931325e-05, "grad_norm": 19.079774856567383, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.864054799079895, "num_tokens": 729221356.0, "step": 19111 }, { "epoch": 2.4312428444218295, "ewc_loss": 0.033369988203048706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3369989978382364e-05, "grad_norm": 19.1132755279541, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8794467449188232, "num_tokens": 729260456.0, "step": 19112 }, { "epoch": 2.4313700547004196, "ewc_loss": 0.033388786017894745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.338878741487861e-05, "grad_norm": 19.10964584350586, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8631608486175537, "num_tokens": 729299654.0, "step": 19113 }, { "epoch": 2.4314972649790105, "ewc_loss": 0.03334565460681915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.334565553814173e-05, "grad_norm": 19.164627075195312, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8780226111412048, "num_tokens": 729341296.0, "step": 19114 }, { "epoch": 2.4316244752576006, "ewc_loss": 0.033348508179187775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.334850771352649e-05, "grad_norm": 19.031349182128906, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8792178630828857, "num_tokens": 729379632.0, "step": 19115 }, { "epoch": 2.431751685536191, "ewc_loss": 0.033350635319948196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3350635931128636e-05, "grad_norm": 19.2095947265625, "learning_rate": 1e-06, "loss": 0.384, "mean_token_accuracy": 0.8776658177375793, "num_tokens": 729423202.0, "step": 19116 }, { "epoch": 2.4318788958147817, "ewc_loss": 0.03339381515979767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.339381510159001e-05, "grad_norm": 19.055423736572266, "learning_rate": 1e-06, "loss": 0.3272, "mean_token_accuracy": 0.8945345282554626, "num_tokens": 729457108.0, "step": 19117 }, { "epoch": 2.432006106093372, "ewc_loss": 0.033364102244377136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.336410372867249e-05, "grad_norm": 19.206756591796875, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8717039823532104, "num_tokens": 729494065.0, "step": 19118 }, { "epoch": 2.4321333163719627, "ewc_loss": 0.03346854820847511, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3468546462245286e-05, "grad_norm": 19.117046356201172, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.8794304132461548, "num_tokens": 729536621.0, "step": 19119 }, { "epoch": 2.4322605266505533, "ewc_loss": 0.0332786962389946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.32786949002184e-05, "grad_norm": 19.054365158081055, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8760548830032349, "num_tokens": 729574423.0, "step": 19120 }, { "epoch": 2.432387736929144, "ewc_loss": 0.03344682604074478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.344682772876695e-05, "grad_norm": 19.144784927368164, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8695560693740845, "num_tokens": 729614673.0, "step": 19121 }, { "epoch": 2.4325149472077343, "ewc_loss": 0.033351484686136246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.335148358019069e-05, "grad_norm": 19.066762924194336, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8779779672622681, "num_tokens": 729653049.0, "step": 19122 }, { "epoch": 2.432642157486325, "ewc_loss": 0.033329807221889496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3329808502458036e-05, "grad_norm": 19.074499130249023, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8765090703964233, "num_tokens": 729689667.0, "step": 19123 }, { "epoch": 2.4327693677649154, "ewc_loss": 0.03343632072210312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.343632124597207e-05, "grad_norm": 19.132335662841797, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8789099454879761, "num_tokens": 729734747.0, "step": 19124 }, { "epoch": 2.432896578043506, "ewc_loss": 0.03336292505264282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.336292502353899e-05, "grad_norm": 19.12517547607422, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8709989786148071, "num_tokens": 729770135.0, "step": 19125 }, { "epoch": 2.4330237883220964, "ewc_loss": 0.033369697630405426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3369698940077797e-05, "grad_norm": 19.098024368286133, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8734620809555054, "num_tokens": 729809203.0, "step": 19126 }, { "epoch": 2.433150998600687, "ewc_loss": 0.03342247009277344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.342247146065347e-05, "grad_norm": 19.25127601623535, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.8814351558685303, "num_tokens": 729850164.0, "step": 19127 }, { "epoch": 2.4332782088792775, "ewc_loss": 0.03343629464507103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.343629578012042e-05, "grad_norm": 19.137664794921875, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8648608922958374, "num_tokens": 729884337.0, "step": 19128 }, { "epoch": 2.433405419157868, "ewc_loss": 0.03329966962337494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.329967148602009e-05, "grad_norm": 19.135286331176758, "learning_rate": 1e-06, "loss": 0.3516, "mean_token_accuracy": 0.8877478837966919, "num_tokens": 729924098.0, "step": 19129 }, { "epoch": 2.4335326294364585, "ewc_loss": 0.03341495245695114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.341495175845921e-05, "grad_norm": 19.182363510131836, "learning_rate": 1e-06, "loss": 0.3425, "mean_token_accuracy": 0.8934247493743896, "num_tokens": 729960671.0, "step": 19130 }, { "epoch": 2.433659839715049, "ewc_loss": 0.03332588076591492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.332587948534638e-05, "grad_norm": 19.07151222229004, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8752303123474121, "num_tokens": 730000780.0, "step": 19131 }, { "epoch": 2.4337870499936396, "ewc_loss": 0.03333602473139763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3336025808239356e-05, "grad_norm": 19.195680618286133, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8728837966918945, "num_tokens": 730045198.0, "step": 19132 }, { "epoch": 2.43391426027223, "ewc_loss": 0.03340676426887512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.340676266816445e-05, "grad_norm": 19.141738891601562, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8691792488098145, "num_tokens": 730085010.0, "step": 19133 }, { "epoch": 2.4340414705508207, "ewc_loss": 0.03330639749765396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.33063981088344e-05, "grad_norm": 19.12885856628418, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8693236112594604, "num_tokens": 730121512.0, "step": 19134 }, { "epoch": 2.434168680829411, "ewc_loss": 0.033345162868499756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.334516441100277e-05, "grad_norm": 19.13568878173828, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8740214705467224, "num_tokens": 730164859.0, "step": 19135 }, { "epoch": 2.4342958911080017, "ewc_loss": 0.0333457812666893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3345782867399976e-05, "grad_norm": 19.17342758178711, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8756062984466553, "num_tokens": 730202273.0, "step": 19136 }, { "epoch": 2.4344231013865922, "ewc_loss": 0.03336736187338829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3367363357683644e-05, "grad_norm": 19.0978946685791, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8682523965835571, "num_tokens": 730248208.0, "step": 19137 }, { "epoch": 2.4345503116651823, "ewc_loss": 0.03334629908204079, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.334629946039058e-05, "grad_norm": 19.189855575561523, "learning_rate": 1e-06, "loss": 0.3616, "mean_token_accuracy": 0.8850620985031128, "num_tokens": 730291084.0, "step": 19138 }, { "epoch": 2.4346775219437733, "ewc_loss": 0.03330211713910103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3302116207778454e-05, "grad_norm": 19.05572509765625, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.8797739744186401, "num_tokens": 730334504.0, "step": 19139 }, { "epoch": 2.4348047322223634, "ewc_loss": 0.033302780240774155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.330278195790015e-05, "grad_norm": 19.192089080810547, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8785783648490906, "num_tokens": 730373356.0, "step": 19140 }, { "epoch": 2.434931942500954, "ewc_loss": 0.03336391597986221, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.336391455377452e-05, "grad_norm": 19.126380920410156, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8622326850891113, "num_tokens": 730416229.0, "step": 19141 }, { "epoch": 2.4350591527795444, "ewc_loss": 0.033242713660001755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.324271528981626e-05, "grad_norm": 19.166645050048828, "learning_rate": 1e-06, "loss": 0.3592, "mean_token_accuracy": 0.8864682912826538, "num_tokens": 730458922.0, "step": 19142 }, { "epoch": 2.435186363058135, "ewc_loss": 0.03336457535624504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.336457666591741e-05, "grad_norm": 19.185997009277344, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8737477660179138, "num_tokens": 730494583.0, "step": 19143 }, { "epoch": 2.4353135733367255, "ewc_loss": 0.0332656130194664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3265612728428096e-05, "grad_norm": 19.047767639160156, "learning_rate": 1e-06, "loss": 0.3691, "mean_token_accuracy": 0.8783829212188721, "num_tokens": 730529526.0, "step": 19144 }, { "epoch": 2.435440783615316, "ewc_loss": 0.033270858228206635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.327085869386792e-05, "grad_norm": 19.27450180053711, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8787550330162048, "num_tokens": 730558584.0, "step": 19145 }, { "epoch": 2.4355679938939065, "ewc_loss": 0.033355481922626495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.335548171889968e-05, "grad_norm": 19.20149040222168, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.8750483393669128, "num_tokens": 730598677.0, "step": 19146 }, { "epoch": 2.435695204172497, "ewc_loss": 0.03319935128092766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.319935058243573e-05, "grad_norm": 19.159883499145508, "learning_rate": 1e-06, "loss": 0.3593, "mean_token_accuracy": 0.8852051496505737, "num_tokens": 730636985.0, "step": 19147 }, { "epoch": 2.4358224144510876, "ewc_loss": 0.03332522511482239, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.33252246491611e-05, "grad_norm": 19.05838966369629, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8581907749176025, "num_tokens": 730675409.0, "step": 19148 }, { "epoch": 2.435949624729678, "ewc_loss": 0.03328123316168785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.328123420942575e-05, "grad_norm": 19.228124618530273, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8648186326026917, "num_tokens": 730714739.0, "step": 19149 }, { "epoch": 2.4360768350082687, "ewc_loss": 0.03333725035190582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3337251807097346e-05, "grad_norm": 19.16565704345703, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8737015724182129, "num_tokens": 730757059.0, "step": 19150 }, { "epoch": 2.436204045286859, "ewc_loss": 0.03328406810760498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.328406819491647e-05, "grad_norm": 19.124876022338867, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8742337822914124, "num_tokens": 730793469.0, "step": 19151 }, { "epoch": 2.4363312555654497, "ewc_loss": 0.033413659781217575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.341366027598269e-05, "grad_norm": 19.269996643066406, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8800996541976929, "num_tokens": 730829336.0, "step": 19152 }, { "epoch": 2.4364584658440402, "ewc_loss": 0.03335607051849365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.335607107146643e-05, "grad_norm": 19.15605354309082, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8741011619567871, "num_tokens": 730869304.0, "step": 19153 }, { "epoch": 2.4365856761226308, "ewc_loss": 0.033313628286123276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.331362677272409e-05, "grad_norm": 19.208139419555664, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8630154728889465, "num_tokens": 730905315.0, "step": 19154 }, { "epoch": 2.4367128864012213, "ewc_loss": 0.03339003771543503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3390038879588246e-05, "grad_norm": 19.203073501586914, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8667630553245544, "num_tokens": 730947346.0, "step": 19155 }, { "epoch": 2.436840096679812, "ewc_loss": 0.03329375758767128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.329375613247976e-05, "grad_norm": 19.197723388671875, "learning_rate": 1e-06, "loss": 0.3707, "mean_token_accuracy": 0.880088210105896, "num_tokens": 730976907.0, "step": 19156 }, { "epoch": 2.4369673069584024, "ewc_loss": 0.03336428478360176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3364285627612844e-05, "grad_norm": 19.2052001953125, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8714171051979065, "num_tokens": 731015940.0, "step": 19157 }, { "epoch": 2.437094517236993, "ewc_loss": 0.03332658112049103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3326581615256146e-05, "grad_norm": 19.12836265563965, "learning_rate": 1e-06, "loss": 0.3294, "mean_token_accuracy": 0.89098060131073, "num_tokens": 731051345.0, "step": 19158 }, { "epoch": 2.4372217275155834, "ewc_loss": 0.03335203975439072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3352040190948173e-05, "grad_norm": 19.17754554748535, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8443665504455566, "num_tokens": 731089642.0, "step": 19159 }, { "epoch": 2.437348937794174, "ewc_loss": 0.033398665487766266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.339866452733986e-05, "grad_norm": 19.135547637939453, "learning_rate": 1e-06, "loss": 0.3415, "mean_token_accuracy": 0.8922527432441711, "num_tokens": 731133196.0, "step": 19160 }, { "epoch": 2.4374761480727645, "ewc_loss": 0.033341292291879654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3341293601552024e-05, "grad_norm": 19.13358497619629, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8639283180236816, "num_tokens": 731175510.0, "step": 19161 }, { "epoch": 2.437603358351355, "ewc_loss": 0.03340977057814598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.340977127663791e-05, "grad_norm": 19.1323184967041, "learning_rate": 1e-06, "loss": 0.3727, "mean_token_accuracy": 0.8834786415100098, "num_tokens": 731217526.0, "step": 19162 }, { "epoch": 2.437730568629945, "ewc_loss": 0.0334465354681015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.344653669046238e-05, "grad_norm": 19.234682083129883, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8630989193916321, "num_tokens": 731259525.0, "step": 19163 }, { "epoch": 2.437857778908536, "ewc_loss": 0.03341378644108772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.341378760524094e-05, "grad_norm": 19.176172256469727, "learning_rate": 1e-06, "loss": 0.3795, "mean_token_accuracy": 0.8784167170524597, "num_tokens": 731301480.0, "step": 19164 }, { "epoch": 2.437984989187126, "ewc_loss": 0.033379532396793365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3379532396793365e-05, "grad_norm": 19.073389053344727, "learning_rate": 1e-06, "loss": 0.3684, "mean_token_accuracy": 0.8842151761054993, "num_tokens": 731347385.0, "step": 19165 }, { "epoch": 2.4381121994657167, "ewc_loss": 0.033409763127565384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.34097640006803e-05, "grad_norm": 19.189908981323242, "learning_rate": 1e-06, "loss": 0.3619, "mean_token_accuracy": 0.8799718022346497, "num_tokens": 731383666.0, "step": 19166 }, { "epoch": 2.438239409744307, "ewc_loss": 0.03344237804412842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3442378480685875e-05, "grad_norm": 19.139860153198242, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.875698983669281, "num_tokens": 731420297.0, "step": 19167 }, { "epoch": 2.4383666200228977, "ewc_loss": 0.033279139548540115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.327913873363286e-05, "grad_norm": 19.106212615966797, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8718479871749878, "num_tokens": 731455083.0, "step": 19168 }, { "epoch": 2.4384938303014883, "ewc_loss": 0.0334114208817482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3411419281037524e-05, "grad_norm": 19.186769485473633, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.8825899362564087, "num_tokens": 731482555.0, "step": 19169 }, { "epoch": 2.438621040580079, "ewc_loss": 0.03335821256041527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.335821384098381e-05, "grad_norm": 19.124576568603516, "learning_rate": 1e-06, "loss": 0.3611, "mean_token_accuracy": 0.8876163959503174, "num_tokens": 731519459.0, "step": 19170 }, { "epoch": 2.4387482508586693, "ewc_loss": 0.033394791185855865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.339479007991031e-05, "grad_norm": 19.143075942993164, "learning_rate": 1e-06, "loss": 0.3718, "mean_token_accuracy": 0.8805463910102844, "num_tokens": 731554577.0, "step": 19171 }, { "epoch": 2.43887546113726, "ewc_loss": 0.03338318690657616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.338318856549449e-05, "grad_norm": 19.12099838256836, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8666917085647583, "num_tokens": 731594993.0, "step": 19172 }, { "epoch": 2.4390026714158504, "ewc_loss": 0.03339248523116112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.339248360134661e-05, "grad_norm": 19.18240737915039, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.879962146282196, "num_tokens": 731629182.0, "step": 19173 }, { "epoch": 2.439129881694441, "ewc_loss": 0.03345536068081856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3455362427048385e-05, "grad_norm": 19.172086715698242, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8745623826980591, "num_tokens": 731667661.0, "step": 19174 }, { "epoch": 2.4392570919730314, "ewc_loss": 0.03340063989162445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.340063994983211e-05, "grad_norm": 19.184568405151367, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.875819206237793, "num_tokens": 731704911.0, "step": 19175 }, { "epoch": 2.439384302251622, "ewc_loss": 0.0334654375910759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.346543599036522e-05, "grad_norm": 19.175708770751953, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8597190380096436, "num_tokens": 731744990.0, "step": 19176 }, { "epoch": 2.4395115125302125, "ewc_loss": 0.033412154763936996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.341215415275656e-05, "grad_norm": 19.164627075195312, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8766262531280518, "num_tokens": 731783335.0, "step": 19177 }, { "epoch": 2.439638722808803, "ewc_loss": 0.03344108909368515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3441090636188164e-05, "grad_norm": 19.157766342163086, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8780673742294312, "num_tokens": 731819206.0, "step": 19178 }, { "epoch": 2.4397659330873935, "ewc_loss": 0.03341634199023247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.341634146636352e-05, "grad_norm": 19.180665969848633, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8736914396286011, "num_tokens": 731855291.0, "step": 19179 }, { "epoch": 2.439893143365984, "ewc_loss": 0.03342900797724724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.342900890856981e-05, "grad_norm": 19.081113815307617, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.880082368850708, "num_tokens": 731899392.0, "step": 19180 }, { "epoch": 2.4400203536445746, "ewc_loss": 0.033435627818107605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.343562639201991e-05, "grad_norm": 19.17085075378418, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8740370273590088, "num_tokens": 731937502.0, "step": 19181 }, { "epoch": 2.440147563923165, "ewc_loss": 0.0334831178188324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.348311656736769e-05, "grad_norm": 19.176633834838867, "learning_rate": 1e-06, "loss": 0.3777, "mean_token_accuracy": 0.878070056438446, "num_tokens": 731974946.0, "step": 19182 }, { "epoch": 2.4402747742017556, "ewc_loss": 0.03342840075492859, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.342840136610903e-05, "grad_norm": 19.172283172607422, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8636122345924377, "num_tokens": 732018732.0, "step": 19183 }, { "epoch": 2.440401984480346, "ewc_loss": 0.03340594097971916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3405940484954044e-05, "grad_norm": 19.135204315185547, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8746947646141052, "num_tokens": 732052036.0, "step": 19184 }, { "epoch": 2.4405291947589367, "ewc_loss": 0.03342928737401962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.342928903293796e-05, "grad_norm": 19.171424865722656, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8705955147743225, "num_tokens": 732090944.0, "step": 19185 }, { "epoch": 2.440656405037527, "ewc_loss": 0.033451635390520096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.345163713674992e-05, "grad_norm": 19.185701370239258, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8774977922439575, "num_tokens": 732131406.0, "step": 19186 }, { "epoch": 2.4407836153161178, "ewc_loss": 0.03339577093720436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.339577233418822e-05, "grad_norm": 19.107284545898438, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8739063739776611, "num_tokens": 732171701.0, "step": 19187 }, { "epoch": 2.440910825594708, "ewc_loss": 0.033458031713962555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.345803270349279e-05, "grad_norm": 19.160083770751953, "learning_rate": 1e-06, "loss": 0.3578, "mean_token_accuracy": 0.8832552433013916, "num_tokens": 732209658.0, "step": 19188 }, { "epoch": 2.4410380358732984, "ewc_loss": 0.03344334289431572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3443342545069754e-05, "grad_norm": 19.15656852722168, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8630693554878235, "num_tokens": 732251395.0, "step": 19189 }, { "epoch": 2.441165246151889, "ewc_loss": 0.03338415175676346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.338415262987837e-05, "grad_norm": 19.188302993774414, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8658246994018555, "num_tokens": 732289306.0, "step": 19190 }, { "epoch": 2.4412924564304794, "ewc_loss": 0.03331281617283821, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.331281550345011e-05, "grad_norm": 19.12897300720215, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8615341782569885, "num_tokens": 732337916.0, "step": 19191 }, { "epoch": 2.44141966670907, "ewc_loss": 0.033403292298316956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.340329203638248e-05, "grad_norm": 19.233854293823242, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.861295223236084, "num_tokens": 732380358.0, "step": 19192 }, { "epoch": 2.4415468769876605, "ewc_loss": 0.033359091728925705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.335909059387632e-05, "grad_norm": 19.048887252807617, "learning_rate": 1e-06, "loss": 0.3538, "mean_token_accuracy": 0.8852663040161133, "num_tokens": 732416534.0, "step": 19193 }, { "epoch": 2.441674087266251, "ewc_loss": 0.03331940248608589, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3319403883069754e-05, "grad_norm": 19.18370819091797, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8539026379585266, "num_tokens": 732458927.0, "step": 19194 }, { "epoch": 2.4418012975448415, "ewc_loss": 0.033443186432123184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.344318611198105e-05, "grad_norm": 19.125593185424805, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8659887313842773, "num_tokens": 732501261.0, "step": 19195 }, { "epoch": 2.441928507823432, "ewc_loss": 0.033301014453172684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3301013900199905e-05, "grad_norm": 19.202987670898438, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8830698728561401, "num_tokens": 732541162.0, "step": 19196 }, { "epoch": 2.4420557181020226, "ewc_loss": 0.0333855077624321, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.338550959597342e-05, "grad_norm": 19.107131958007812, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8711622357368469, "num_tokens": 732578237.0, "step": 19197 }, { "epoch": 2.442182928380613, "ewc_loss": 0.03336110711097717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3361106034135446e-05, "grad_norm": 19.239349365234375, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8646184206008911, "num_tokens": 732618624.0, "step": 19198 }, { "epoch": 2.4423101386592037, "ewc_loss": 0.033432915806770325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.343291609780863e-05, "grad_norm": 19.114255905151367, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8717483878135681, "num_tokens": 732661747.0, "step": 19199 }, { "epoch": 2.442437348937794, "ewc_loss": 0.03326375037431717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3263750083278865e-05, "grad_norm": 19.065109252929688, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8545277714729309, "num_tokens": 732703511.0, "step": 19200 }, { "epoch": 2.4425645592163847, "ewc_loss": 0.033423129469156265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.342312993481755e-05, "grad_norm": 19.27064323425293, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8550748229026794, "num_tokens": 732746749.0, "step": 19201 }, { "epoch": 2.4426917694949752, "ewc_loss": 0.03338859975337982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.338859823998064e-05, "grad_norm": 19.12272834777832, "learning_rate": 1e-06, "loss": 0.3822, "mean_token_accuracy": 0.8770128488540649, "num_tokens": 732783184.0, "step": 19202 }, { "epoch": 2.4428189797735658, "ewc_loss": 0.03332614153623581, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.332614141982049e-05, "grad_norm": 19.230514526367188, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.870047926902771, "num_tokens": 732818664.0, "step": 19203 }, { "epoch": 2.4429461900521563, "ewc_loss": 0.0334346741437912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3434673241572455e-05, "grad_norm": 19.155580520629883, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8650026321411133, "num_tokens": 732854286.0, "step": 19204 }, { "epoch": 2.443073400330747, "ewc_loss": 0.03335007652640343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3350075682392344e-05, "grad_norm": 19.199932098388672, "learning_rate": 1e-06, "loss": 0.3716, "mean_token_accuracy": 0.8819229602813721, "num_tokens": 732893032.0, "step": 19205 }, { "epoch": 2.4432006106093374, "ewc_loss": 0.03340819105505943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3408192393835634e-05, "grad_norm": 19.161846160888672, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8747111558914185, "num_tokens": 732929942.0, "step": 19206 }, { "epoch": 2.443327820887928, "ewc_loss": 0.03334468975663185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.334469147375785e-05, "grad_norm": 19.277673721313477, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8706981539726257, "num_tokens": 732968218.0, "step": 19207 }, { "epoch": 2.4434550311665184, "ewc_loss": 0.033440299332141876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.344029755680822e-05, "grad_norm": 19.111988067626953, "learning_rate": 1e-06, "loss": 0.3174, "mean_token_accuracy": 0.8984112739562988, "num_tokens": 733001464.0, "step": 19208 }, { "epoch": 2.443582241445109, "ewc_loss": 0.03331410884857178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3314110623905435e-05, "grad_norm": 19.153806686401367, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8662433624267578, "num_tokens": 733036311.0, "step": 19209 }, { "epoch": 2.4437094517236995, "ewc_loss": 0.03340451791882515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.340451803524047e-05, "grad_norm": 19.143712997436523, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8808112144470215, "num_tokens": 733070487.0, "step": 19210 }, { "epoch": 2.4438366620022896, "ewc_loss": 0.03339071199297905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.339071190566756e-05, "grad_norm": 19.1534481048584, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8728357553482056, "num_tokens": 733110357.0, "step": 19211 }, { "epoch": 2.4439638722808805, "ewc_loss": 0.0334475077688694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3447508030803874e-05, "grad_norm": 19.197832107543945, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8668904304504395, "num_tokens": 733149070.0, "step": 19212 }, { "epoch": 2.4440910825594706, "ewc_loss": 0.033419057726860046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.341905903653242e-05, "grad_norm": 19.11830711364746, "learning_rate": 1e-06, "loss": 0.3466, "mean_token_accuracy": 0.8898881077766418, "num_tokens": 733180848.0, "step": 19213 }, { "epoch": 2.444218292838061, "ewc_loss": 0.03342664614319801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.342664786032401e-05, "grad_norm": 19.224895477294922, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8767991662025452, "num_tokens": 733219204.0, "step": 19214 }, { "epoch": 2.4443455031166517, "ewc_loss": 0.033459216356277466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3459215046605095e-05, "grad_norm": 19.259164810180664, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8645746111869812, "num_tokens": 733255237.0, "step": 19215 }, { "epoch": 2.444472713395242, "ewc_loss": 0.03344122692942619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.344122524140403e-05, "grad_norm": 19.16954231262207, "learning_rate": 1e-06, "loss": 0.3515, "mean_token_accuracy": 0.8857123851776123, "num_tokens": 733293243.0, "step": 19216 }, { "epoch": 2.4445999236738327, "ewc_loss": 0.03340567648410797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3405674912501127e-05, "grad_norm": 19.175119400024414, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.877747654914856, "num_tokens": 733333530.0, "step": 19217 }, { "epoch": 2.4447271339524232, "ewc_loss": 0.03345739468932152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.345739605720155e-05, "grad_norm": 19.14822006225586, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.857207715511322, "num_tokens": 733373790.0, "step": 19218 }, { "epoch": 2.4448543442310138, "ewc_loss": 0.03343673050403595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.343673233757727e-05, "grad_norm": 19.205753326416016, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8781435489654541, "num_tokens": 733416037.0, "step": 19219 }, { "epoch": 2.4449815545096043, "ewc_loss": 0.033493515104055405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3493513910798356e-05, "grad_norm": 19.189918518066406, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8719010353088379, "num_tokens": 733453014.0, "step": 19220 }, { "epoch": 2.445108764788195, "ewc_loss": 0.03339257836341858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3392578188795596e-05, "grad_norm": 19.09966278076172, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8596522808074951, "num_tokens": 733491197.0, "step": 19221 }, { "epoch": 2.4452359750667854, "ewc_loss": 0.0334191769361496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3419175451854244e-05, "grad_norm": 19.23952293395996, "learning_rate": 1e-06, "loss": 0.3559, "mean_token_accuracy": 0.8874571323394775, "num_tokens": 733531142.0, "step": 19222 }, { "epoch": 2.445363185345376, "ewc_loss": 0.0334775447845459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.347754318383522e-05, "grad_norm": 19.124433517456055, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8697655200958252, "num_tokens": 733573916.0, "step": 19223 }, { "epoch": 2.4454903956239664, "ewc_loss": 0.033425331115722656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.342533091199584e-05, "grad_norm": 19.237998962402344, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8686829805374146, "num_tokens": 733613204.0, "step": 19224 }, { "epoch": 2.445617605902557, "ewc_loss": 0.03352264314889908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.352264320710674e-05, "grad_norm": 19.19847297668457, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8696014881134033, "num_tokens": 733653618.0, "step": 19225 }, { "epoch": 2.4457448161811475, "ewc_loss": 0.03338633477687836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.338633541716263e-05, "grad_norm": 19.193735122680664, "learning_rate": 1e-06, "loss": 0.347, "mean_token_accuracy": 0.8903416395187378, "num_tokens": 733687795.0, "step": 19226 }, { "epoch": 2.445872026459738, "ewc_loss": 0.033391695469617844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.339169415994547e-05, "grad_norm": 19.106412887573242, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8744664192199707, "num_tokens": 733723574.0, "step": 19227 }, { "epoch": 2.4459992367383285, "ewc_loss": 0.03338058665394783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.338058741064742e-05, "grad_norm": 19.16326904296875, "learning_rate": 1e-06, "loss": 0.3646, "mean_token_accuracy": 0.8820856809616089, "num_tokens": 733762051.0, "step": 19228 }, { "epoch": 2.446126447016919, "ewc_loss": 0.033428605645895004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3428605092922226e-05, "grad_norm": 19.03965950012207, "learning_rate": 1e-06, "loss": 0.3389, "mean_token_accuracy": 0.8904107809066772, "num_tokens": 733800640.0, "step": 19229 }, { "epoch": 2.4462536572955096, "ewc_loss": 0.03335515409708023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3355154300807044e-05, "grad_norm": 19.222627639770508, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.864884078502655, "num_tokens": 733841321.0, "step": 19230 }, { "epoch": 2.4463808675741, "ewc_loss": 0.033502332866191864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3502332371426746e-05, "grad_norm": 19.104677200317383, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.877437949180603, "num_tokens": 733874104.0, "step": 19231 }, { "epoch": 2.4465080778526906, "ewc_loss": 0.033351387828588486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3351388992741704e-05, "grad_norm": 19.23171615600586, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8791368007659912, "num_tokens": 733913127.0, "step": 19232 }, { "epoch": 2.446635288131281, "ewc_loss": 0.03351965546607971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.351965642650612e-05, "grad_norm": 19.161725997924805, "learning_rate": 1e-06, "loss": 0.3392, "mean_token_accuracy": 0.8908519148826599, "num_tokens": 733951900.0, "step": 19233 }, { "epoch": 2.4467624984098717, "ewc_loss": 0.033373404294252396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.337340604048222e-05, "grad_norm": 19.297929763793945, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8770179748535156, "num_tokens": 733992866.0, "step": 19234 }, { "epoch": 2.4468897086884622, "ewc_loss": 0.03343250975012779, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3432508644182235e-05, "grad_norm": 19.145679473876953, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8693162202835083, "num_tokens": 734029895.0, "step": 19235 }, { "epoch": 2.4470169189670523, "ewc_loss": 0.033306438475847244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.330643812660128e-05, "grad_norm": 19.182214736938477, "learning_rate": 1e-06, "loss": 0.3483, "mean_token_accuracy": 0.8910390734672546, "num_tokens": 734066954.0, "step": 19236 }, { "epoch": 2.4471441292456433, "ewc_loss": 0.03344806283712387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.344806100358255e-05, "grad_norm": 19.153291702270508, "learning_rate": 1e-06, "loss": 0.327, "mean_token_accuracy": 0.8965268135070801, "num_tokens": 734102412.0, "step": 19237 }, { "epoch": 2.4472713395242334, "ewc_loss": 0.03337469324469566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3374693884979934e-05, "grad_norm": 19.151643753051758, "learning_rate": 1e-06, "loss": 0.3387, "mean_token_accuracy": 0.89070063829422, "num_tokens": 734141400.0, "step": 19238 }, { "epoch": 2.447398549802824, "ewc_loss": 0.033476099371910095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.347609890624881e-05, "grad_norm": 19.3240909576416, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8554363250732422, "num_tokens": 734176628.0, "step": 19239 }, { "epoch": 2.4475257600814144, "ewc_loss": 0.03343462571501732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.343462594784796e-05, "grad_norm": 19.120281219482422, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8808563947677612, "num_tokens": 734219756.0, "step": 19240 }, { "epoch": 2.447652970360005, "ewc_loss": 0.033377598971128464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.33776006300468e-05, "grad_norm": 19.224302291870117, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8742688894271851, "num_tokens": 734250509.0, "step": 19241 }, { "epoch": 2.4477801806385955, "ewc_loss": 0.033412352204322815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.341235060361214e-05, "grad_norm": 19.035676956176758, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.881858229637146, "num_tokens": 734287887.0, "step": 19242 }, { "epoch": 2.447907390917186, "ewc_loss": 0.03335391730070114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.335391738801263e-05, "grad_norm": 19.161945343017578, "learning_rate": 1e-06, "loss": 0.3771, "mean_token_accuracy": 0.8786828517913818, "num_tokens": 734330446.0, "step": 19243 }, { "epoch": 2.4480346011957765, "ewc_loss": 0.03350129723548889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3501295547466725e-05, "grad_norm": 19.10004234313965, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8804762363433838, "num_tokens": 734368427.0, "step": 19244 }, { "epoch": 2.448161811474367, "ewc_loss": 0.033402975648641586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3402975532226264e-05, "grad_norm": 19.076656341552734, "learning_rate": 1e-06, "loss": 0.3301, "mean_token_accuracy": 0.8933404684066772, "num_tokens": 734405103.0, "step": 19245 }, { "epoch": 2.4482890217529576, "ewc_loss": 0.033449187874794006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.344918877701275e-05, "grad_norm": 19.162559509277344, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8716616630554199, "num_tokens": 734448069.0, "step": 19246 }, { "epoch": 2.448416232031548, "ewc_loss": 0.03350212797522545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.350212864461355e-05, "grad_norm": 19.070615768432617, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8618007898330688, "num_tokens": 734486016.0, "step": 19247 }, { "epoch": 2.4485434423101387, "ewc_loss": 0.033392418175935745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3392418117728084e-05, "grad_norm": 19.13047218322754, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8643764853477478, "num_tokens": 734526943.0, "step": 19248 }, { "epoch": 2.448670652588729, "ewc_loss": 0.03352741524577141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3527416235301644e-05, "grad_norm": 19.1411190032959, "learning_rate": 1e-06, "loss": 0.3531, "mean_token_accuracy": 0.8885294795036316, "num_tokens": 734566137.0, "step": 19249 }, { "epoch": 2.4487978628673197, "ewc_loss": 0.03336412459611893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.336412555654533e-05, "grad_norm": 19.143089294433594, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8736271858215332, "num_tokens": 734602366.0, "step": 19250 }, { "epoch": 2.4489250731459102, "ewc_loss": 0.03342362865805626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3423628337914124e-05, "grad_norm": 19.107017517089844, "learning_rate": 1e-06, "loss": 0.3664, "mean_token_accuracy": 0.883516788482666, "num_tokens": 734644251.0, "step": 19251 }, { "epoch": 2.4490522834245008, "ewc_loss": 0.03345852717757225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3458527468610555e-05, "grad_norm": 19.171186447143555, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8715512752532959, "num_tokens": 734682938.0, "step": 19252 }, { "epoch": 2.4491794937030913, "ewc_loss": 0.033429402858018875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.342940181028098e-05, "grad_norm": 19.142908096313477, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8761955499649048, "num_tokens": 734722975.0, "step": 19253 }, { "epoch": 2.449306703981682, "ewc_loss": 0.03345268592238426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3452684874646366e-05, "grad_norm": 19.17281723022461, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8774416446685791, "num_tokens": 734761361.0, "step": 19254 }, { "epoch": 2.4494339142602723, "ewc_loss": 0.033410780131816864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3410778996767476e-05, "grad_norm": 19.07332420349121, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.873626708984375, "num_tokens": 734800597.0, "step": 19255 }, { "epoch": 2.449561124538863, "ewc_loss": 0.03339579701423645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.339579780003987e-05, "grad_norm": 19.2047119140625, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8643091917037964, "num_tokens": 734840575.0, "step": 19256 }, { "epoch": 2.4496883348174534, "ewc_loss": 0.033498216420412064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3498217817395926e-05, "grad_norm": 19.126953125, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.86634361743927, "num_tokens": 734876943.0, "step": 19257 }, { "epoch": 2.449815545096044, "ewc_loss": 0.03342360258102417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3423602872062474e-05, "grad_norm": 19.214954376220703, "learning_rate": 1e-06, "loss": 0.3426, "mean_token_accuracy": 0.8903706073760986, "num_tokens": 734916436.0, "step": 19258 }, { "epoch": 2.4499427553746345, "ewc_loss": 0.033423613756895065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3423613785998896e-05, "grad_norm": 19.163785934448242, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8735129237174988, "num_tokens": 734956336.0, "step": 19259 }, { "epoch": 2.450069965653225, "ewc_loss": 0.033363472670316696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.336347435833886e-05, "grad_norm": 19.115650177001953, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8740371465682983, "num_tokens": 734997132.0, "step": 19260 }, { "epoch": 2.450197175931815, "ewc_loss": 0.03338171914219856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.338171882205643e-05, "grad_norm": 19.11709976196289, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8632010221481323, "num_tokens": 735036192.0, "step": 19261 }, { "epoch": 2.450324386210406, "ewc_loss": 0.03343518078327179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.343517892062664e-05, "grad_norm": 19.267581939697266, "learning_rate": 1e-06, "loss": 0.3475, "mean_token_accuracy": 0.8882926106452942, "num_tokens": 735079368.0, "step": 19262 }, { "epoch": 2.450451596488996, "ewc_loss": 0.033370960503816605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.337096131872386e-05, "grad_norm": 19.115888595581055, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.8809190988540649, "num_tokens": 735109714.0, "step": 19263 }, { "epoch": 2.4505788067675867, "ewc_loss": 0.03335750848054886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.335750807309523e-05, "grad_norm": 19.17167091369629, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8726135492324829, "num_tokens": 735152712.0, "step": 19264 }, { "epoch": 2.450706017046177, "ewc_loss": 0.03345397114753723, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3453972719144076e-05, "grad_norm": 19.219940185546875, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8706637620925903, "num_tokens": 735193313.0, "step": 19265 }, { "epoch": 2.4508332273247677, "ewc_loss": 0.03336097300052643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3360971428919584e-05, "grad_norm": 19.113744735717773, "learning_rate": 1e-06, "loss": 0.35, "mean_token_accuracy": 0.8898210525512695, "num_tokens": 735235413.0, "step": 19266 }, { "epoch": 2.4509604376033582, "ewc_loss": 0.033421922475099564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.34219221258536e-05, "grad_norm": 19.278783798217773, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8757115006446838, "num_tokens": 735270809.0, "step": 19267 }, { "epoch": 2.4510876478819488, "ewc_loss": 0.03336237743496895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.336237568873912e-05, "grad_norm": 19.092775344848633, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8679015636444092, "num_tokens": 735309790.0, "step": 19268 }, { "epoch": 2.4512148581605393, "ewc_loss": 0.0333515964448452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.335159635753371e-05, "grad_norm": 19.211074829101562, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8758090734481812, "num_tokens": 735341084.0, "step": 19269 }, { "epoch": 2.45134206843913, "ewc_loss": 0.03342701122164726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.342701165820472e-05, "grad_norm": 19.15812110900879, "learning_rate": 1e-06, "loss": 0.3391, "mean_token_accuracy": 0.8918126821517944, "num_tokens": 735376964.0, "step": 19270 }, { "epoch": 2.4514692787177204, "ewc_loss": 0.033386167138814926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.33861680701375e-05, "grad_norm": 19.179611206054688, "learning_rate": 1e-06, "loss": 0.3652, "mean_token_accuracy": 0.8850539326667786, "num_tokens": 735413296.0, "step": 19271 }, { "epoch": 2.451596488996311, "ewc_loss": 0.03344053775072098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3440537663409486e-05, "grad_norm": 19.163637161254883, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8801544904708862, "num_tokens": 735454992.0, "step": 19272 }, { "epoch": 2.4517236992749014, "ewc_loss": 0.033316224813461304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3316224289592355e-05, "grad_norm": 19.158918380737305, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8771024942398071, "num_tokens": 735493315.0, "step": 19273 }, { "epoch": 2.451850909553492, "ewc_loss": 0.03345395252108574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3453950891271234e-05, "grad_norm": 19.15184211730957, "learning_rate": 1e-06, "loss": 0.3623, "mean_token_accuracy": 0.8835799694061279, "num_tokens": 735532666.0, "step": 19274 }, { "epoch": 2.4519781198320825, "ewc_loss": 0.03339773789048195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.339773684274405e-05, "grad_norm": 19.118881225585938, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8737574219703674, "num_tokens": 735571987.0, "step": 19275 }, { "epoch": 2.452105330110673, "ewc_loss": 0.03344317525625229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.344317519804463e-05, "grad_norm": 19.19305992126465, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.874060869216919, "num_tokens": 735609409.0, "step": 19276 }, { "epoch": 2.4522325403892635, "ewc_loss": 0.03348664194345474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.348664176883176e-05, "grad_norm": 19.1359920501709, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8498886823654175, "num_tokens": 735647299.0, "step": 19277 }, { "epoch": 2.452359750667854, "ewc_loss": 0.0334014967083931, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3401498512830585e-05, "grad_norm": 19.209718704223633, "learning_rate": 1e-06, "loss": 0.3597, "mean_token_accuracy": 0.8836254477500916, "num_tokens": 735677757.0, "step": 19278 }, { "epoch": 2.4524869609464446, "ewc_loss": 0.03353222459554672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353222564328462e-05, "grad_norm": 19.18626594543457, "learning_rate": 1e-06, "loss": 0.379, "mean_token_accuracy": 0.8803137540817261, "num_tokens": 735723924.0, "step": 19279 }, { "epoch": 2.452614171225035, "ewc_loss": 0.03340483456850052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.340483453939669e-05, "grad_norm": 19.11782455444336, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8614001274108887, "num_tokens": 735758105.0, "step": 19280 }, { "epoch": 2.4527413815036256, "ewc_loss": 0.033443283289670944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.344328433740884e-05, "grad_norm": 19.15776252746582, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8710744380950928, "num_tokens": 735797603.0, "step": 19281 }, { "epoch": 2.452868591782216, "ewc_loss": 0.033465538173913956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.346553785377182e-05, "grad_norm": 19.091110229492188, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8495100736618042, "num_tokens": 735838958.0, "step": 19282 }, { "epoch": 2.4529958020608067, "ewc_loss": 0.03351380676031113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3513806556584314e-05, "grad_norm": 19.163646697998047, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8666349649429321, "num_tokens": 735878894.0, "step": 19283 }, { "epoch": 2.453123012339397, "ewc_loss": 0.03355575352907181, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.355575245223008e-05, "grad_norm": 19.138031005859375, "learning_rate": 1e-06, "loss": 0.3014, "mean_token_accuracy": 0.9045689105987549, "num_tokens": 735913222.0, "step": 19284 }, { "epoch": 2.4532502226179878, "ewc_loss": 0.03346764296293259, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.346764424350113e-05, "grad_norm": 19.09331512451172, "learning_rate": 1e-06, "loss": 0.364, "mean_token_accuracy": 0.8822822570800781, "num_tokens": 735951884.0, "step": 19285 }, { "epoch": 2.453377432896578, "ewc_loss": 0.03349912166595459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3499120036140084e-05, "grad_norm": 19.195335388183594, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.870453953742981, "num_tokens": 735998955.0, "step": 19286 }, { "epoch": 2.4535046431751684, "ewc_loss": 0.03352673351764679, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.352673229528591e-05, "grad_norm": 19.138044357299805, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8736839294433594, "num_tokens": 736039631.0, "step": 19287 }, { "epoch": 2.453631853453759, "ewc_loss": 0.03347183018922806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.347183155710809e-05, "grad_norm": 19.182973861694336, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8714379668235779, "num_tokens": 736085100.0, "step": 19288 }, { "epoch": 2.4537590637323494, "ewc_loss": 0.03354181349277496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3541811717441306e-05, "grad_norm": 19.21446418762207, "learning_rate": 1e-06, "loss": 0.3418, "mean_token_accuracy": 0.8902737498283386, "num_tokens": 736119680.0, "step": 19289 }, { "epoch": 2.45388627401094, "ewc_loss": 0.033506058156490326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.350605766172521e-05, "grad_norm": 19.260570526123047, "learning_rate": 1e-06, "loss": 0.3575, "mean_token_accuracy": 0.8875528573989868, "num_tokens": 736159769.0, "step": 19290 }, { "epoch": 2.4540134842895305, "ewc_loss": 0.03337157517671585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3371576137142256e-05, "grad_norm": 19.10295867919922, "learning_rate": 1e-06, "loss": 0.3659, "mean_token_accuracy": 0.8808562159538269, "num_tokens": 736199566.0, "step": 19291 }, { "epoch": 2.454140694568121, "ewc_loss": 0.0334555059671402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.345550430822186e-05, "grad_norm": 19.301605224609375, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.879776656627655, "num_tokens": 736239238.0, "step": 19292 }, { "epoch": 2.4542679048467115, "ewc_loss": 0.033460721373558044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.346072116983123e-05, "grad_norm": 19.213687896728516, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8790041208267212, "num_tokens": 736274576.0, "step": 19293 }, { "epoch": 2.454395115125302, "ewc_loss": 0.03339185193181038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.339185059303418e-05, "grad_norm": 19.31298828125, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8691157102584839, "num_tokens": 736307509.0, "step": 19294 }, { "epoch": 2.4545223254038926, "ewc_loss": 0.03339458629488945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.339458635309711e-05, "grad_norm": 19.170555114746094, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8715813159942627, "num_tokens": 736348313.0, "step": 19295 }, { "epoch": 2.454649535682483, "ewc_loss": 0.03336918354034424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.336918234708719e-05, "grad_norm": 19.283666610717773, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8712573051452637, "num_tokens": 736380716.0, "step": 19296 }, { "epoch": 2.4547767459610736, "ewc_loss": 0.033415522426366806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3415522921131924e-05, "grad_norm": 19.20296859741211, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.854600191116333, "num_tokens": 736422669.0, "step": 19297 }, { "epoch": 2.454903956239664, "ewc_loss": 0.03334220126271248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.334219945827499e-05, "grad_norm": 19.23828887939453, "learning_rate": 1e-06, "loss": 0.3735, "mean_token_accuracy": 0.8823601603507996, "num_tokens": 736452929.0, "step": 19298 }, { "epoch": 2.4550311665182547, "ewc_loss": 0.03340335935354233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.340335752000101e-05, "grad_norm": 19.2353515625, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8766910433769226, "num_tokens": 736486755.0, "step": 19299 }, { "epoch": 2.4551583767968452, "ewc_loss": 0.03342175856232643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.342175841680728e-05, "grad_norm": 19.22309684753418, "learning_rate": 1e-06, "loss": 0.3626, "mean_token_accuracy": 0.8835224509239197, "num_tokens": 736521542.0, "step": 19300 }, { "epoch": 2.4552855870754358, "ewc_loss": 0.033400457352399826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.340045805089176e-05, "grad_norm": 19.152240753173828, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.876891553401947, "num_tokens": 736557312.0, "step": 19301 }, { "epoch": 2.4554127973540263, "ewc_loss": 0.03340322524309158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3403226552763954e-05, "grad_norm": 19.260799407958984, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8713695406913757, "num_tokens": 736596469.0, "step": 19302 }, { "epoch": 2.455540007632617, "ewc_loss": 0.03345038369297981, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3450382034061477e-05, "grad_norm": 19.140527725219727, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8602657914161682, "num_tokens": 736636078.0, "step": 19303 }, { "epoch": 2.4556672179112073, "ewc_loss": 0.03339598700404167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.339598697493784e-05, "grad_norm": 19.243623733520508, "learning_rate": 1e-06, "loss": 0.3426, "mean_token_accuracy": 0.8866082429885864, "num_tokens": 736674117.0, "step": 19304 }, { "epoch": 2.455794428189798, "ewc_loss": 0.033493462949991226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.349346297909506e-05, "grad_norm": 19.181303024291992, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8720664978027344, "num_tokens": 736709116.0, "step": 19305 }, { "epoch": 2.4559216384683884, "ewc_loss": 0.033379942178726196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.337994348839857e-05, "grad_norm": 19.177701950073242, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8672448396682739, "num_tokens": 736750383.0, "step": 19306 }, { "epoch": 2.456048848746979, "ewc_loss": 0.03350691497325897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.350691622472368e-05, "grad_norm": 19.249794006347656, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8643051385879517, "num_tokens": 736790836.0, "step": 19307 }, { "epoch": 2.4561760590255695, "ewc_loss": 0.033460237085819244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.346023731864989e-05, "grad_norm": 19.198415756225586, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8741500377655029, "num_tokens": 736826085.0, "step": 19308 }, { "epoch": 2.4563032693041595, "ewc_loss": 0.033474694937467575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.347469464642927e-05, "grad_norm": 19.20005226135254, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.874559760093689, "num_tokens": 736860599.0, "step": 19309 }, { "epoch": 2.4564304795827505, "ewc_loss": 0.033451441675424576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.345144068589434e-05, "grad_norm": 19.209625244140625, "learning_rate": 1e-06, "loss": 0.3287, "mean_token_accuracy": 0.8924694657325745, "num_tokens": 736893286.0, "step": 19310 }, { "epoch": 2.4565576898613406, "ewc_loss": 0.03348005190491676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3480053389212117e-05, "grad_norm": 19.14404296875, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.885055661201477, "num_tokens": 736931597.0, "step": 19311 }, { "epoch": 2.456684900139931, "ewc_loss": 0.033482085913419724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.348208701936528e-05, "grad_norm": 19.298240661621094, "learning_rate": 1e-06, "loss": 0.358, "mean_token_accuracy": 0.8829103708267212, "num_tokens": 736972183.0, "step": 19312 }, { "epoch": 2.4568121104185217, "ewc_loss": 0.0336211733520031, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.362117422511801e-05, "grad_norm": 19.150196075439453, "learning_rate": 1e-06, "loss": 0.364, "mean_token_accuracy": 0.8812181949615479, "num_tokens": 737004184.0, "step": 19313 }, { "epoch": 2.456939320697112, "ewc_loss": 0.033472344279289246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.347234451211989e-05, "grad_norm": 19.257522583007812, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8755135536193848, "num_tokens": 737042644.0, "step": 19314 }, { "epoch": 2.4570665309757027, "ewc_loss": 0.03356540948152542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.356541128596291e-05, "grad_norm": 19.176971435546875, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8668383359909058, "num_tokens": 737074830.0, "step": 19315 }, { "epoch": 2.4571937412542932, "ewc_loss": 0.03349224478006363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.349224425619468e-05, "grad_norm": 19.275503158569336, "learning_rate": 1e-06, "loss": 0.3672, "mean_token_accuracy": 0.8821991682052612, "num_tokens": 737107756.0, "step": 19316 }, { "epoch": 2.4573209515328838, "ewc_loss": 0.03357798233628273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357798414072022e-05, "grad_norm": 19.194414138793945, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8840850591659546, "num_tokens": 737143457.0, "step": 19317 }, { "epoch": 2.4574481618114743, "ewc_loss": 0.03353029862046242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353029751451686e-05, "grad_norm": 19.237873077392578, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8705072999000549, "num_tokens": 737177005.0, "step": 19318 }, { "epoch": 2.457575372090065, "ewc_loss": 0.03353964537382126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3539647120051086e-05, "grad_norm": 19.16638946533203, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.866688072681427, "num_tokens": 737218265.0, "step": 19319 }, { "epoch": 2.4577025823686554, "ewc_loss": 0.033504657447338104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.350465703988448e-05, "grad_norm": 19.139419555664062, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8667113780975342, "num_tokens": 737259495.0, "step": 19320 }, { "epoch": 2.457829792647246, "ewc_loss": 0.033623166382312775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.36231678375043e-05, "grad_norm": 19.23112678527832, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8674497008323669, "num_tokens": 737300073.0, "step": 19321 }, { "epoch": 2.4579570029258364, "ewc_loss": 0.03353063389658928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353063220856711e-05, "grad_norm": 19.152559280395508, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8768554329872131, "num_tokens": 737334716.0, "step": 19322 }, { "epoch": 2.458084213204427, "ewc_loss": 0.03358029946684837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.358029789524153e-05, "grad_norm": 19.213472366333008, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.879837691783905, "num_tokens": 737377086.0, "step": 19323 }, { "epoch": 2.4582114234830175, "ewc_loss": 0.033576756715774536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357675814186223e-05, "grad_norm": 19.177440643310547, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8652772307395935, "num_tokens": 737418259.0, "step": 19324 }, { "epoch": 2.458338633761608, "ewc_loss": 0.03359242528676987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.359242691658437e-05, "grad_norm": 19.200817108154297, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8654472231864929, "num_tokens": 737453150.0, "step": 19325 }, { "epoch": 2.4584658440401985, "ewc_loss": 0.03359878063201904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.359878246556036e-05, "grad_norm": 19.160795211791992, "learning_rate": 1e-06, "loss": 0.3466, "mean_token_accuracy": 0.8901102542877197, "num_tokens": 737495552.0, "step": 19326 }, { "epoch": 2.458593054318789, "ewc_loss": 0.03357071429491043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357071545906365e-05, "grad_norm": 19.248825073242188, "learning_rate": 1e-06, "loss": 0.3691, "mean_token_accuracy": 0.8816787004470825, "num_tokens": 737537518.0, "step": 19327 }, { "epoch": 2.4587202645973796, "ewc_loss": 0.033574178814888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3574178814888e-05, "grad_norm": 19.165061950683594, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8723021745681763, "num_tokens": 737577472.0, "step": 19328 }, { "epoch": 2.45884747487597, "ewc_loss": 0.03351366147398949, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.351366103743203e-05, "grad_norm": 19.13992691040039, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.865786075592041, "num_tokens": 737617250.0, "step": 19329 }, { "epoch": 2.4589746851545606, "ewc_loss": 0.033583927899599075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3583928598091006e-05, "grad_norm": 19.181819915771484, "learning_rate": 1e-06, "loss": 0.3777, "mean_token_accuracy": 0.8793582916259766, "num_tokens": 737655544.0, "step": 19330 }, { "epoch": 2.459101895433151, "ewc_loss": 0.03353893384337425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3538934076204896e-05, "grad_norm": 19.13300132751465, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8790183067321777, "num_tokens": 737688644.0, "step": 19331 }, { "epoch": 2.4592291057117417, "ewc_loss": 0.033458318561315536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.345832010381855e-05, "grad_norm": 19.12210464477539, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8791320323944092, "num_tokens": 737730264.0, "step": 19332 }, { "epoch": 2.459356315990332, "ewc_loss": 0.03356139361858368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.356139495735988e-05, "grad_norm": 19.21204376220703, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.870176374912262, "num_tokens": 737763578.0, "step": 19333 }, { "epoch": 2.4594835262689223, "ewc_loss": 0.03357778117060661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357778041390702e-05, "grad_norm": 19.177921295166016, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8766401410102844, "num_tokens": 737804062.0, "step": 19334 }, { "epoch": 2.4596107365475133, "ewc_loss": 0.033546071499586105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.354607179062441e-05, "grad_norm": 19.18482780456543, "learning_rate": 1e-06, "loss": 0.3515, "mean_token_accuracy": 0.8891977071762085, "num_tokens": 737845608.0, "step": 19335 }, { "epoch": 2.4597379468261034, "ewc_loss": 0.033505961298942566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3505963074276224e-05, "grad_norm": 19.266353607177734, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8573033809661865, "num_tokens": 737881366.0, "step": 19336 }, { "epoch": 2.459865157104694, "ewc_loss": 0.033549435436725616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.354943692102097e-05, "grad_norm": 19.184003829956055, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.867896556854248, "num_tokens": 737922595.0, "step": 19337 }, { "epoch": 2.4599923673832844, "ewc_loss": 0.03346177563071251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.346177618368529e-05, "grad_norm": 19.198265075683594, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.882393479347229, "num_tokens": 737956368.0, "step": 19338 }, { "epoch": 2.460119577661875, "ewc_loss": 0.033483341336250305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.348334212205373e-05, "grad_norm": 19.102256774902344, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.8802347779273987, "num_tokens": 738000167.0, "step": 19339 }, { "epoch": 2.4602467879404655, "ewc_loss": 0.033463891595602036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.346388984937221e-05, "grad_norm": 19.241867065429688, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8681697845458984, "num_tokens": 738040850.0, "step": 19340 }, { "epoch": 2.460373998219056, "ewc_loss": 0.03353322669863701, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353322608745657e-05, "grad_norm": 19.072498321533203, "learning_rate": 1e-06, "loss": 0.3873, "mean_token_accuracy": 0.8741936683654785, "num_tokens": 738080370.0, "step": 19341 }, { "epoch": 2.4605012084976465, "ewc_loss": 0.03337228298187256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.337228190503083e-05, "grad_norm": 19.17589569091797, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.880160927772522, "num_tokens": 738116751.0, "step": 19342 }, { "epoch": 2.460628418776237, "ewc_loss": 0.033503174781799316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.350317638250999e-05, "grad_norm": 19.09357261657715, "learning_rate": 1e-06, "loss": 0.3181, "mean_token_accuracy": 0.8929123282432556, "num_tokens": 738154597.0, "step": 19343 }, { "epoch": 2.4607556290548276, "ewc_loss": 0.03352668508887291, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.352668500156142e-05, "grad_norm": 19.221866607666016, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8819446563720703, "num_tokens": 738186484.0, "step": 19344 }, { "epoch": 2.460882839333418, "ewc_loss": 0.03350815549492836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.35081567754969e-05, "grad_norm": 19.08217430114746, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8791627883911133, "num_tokens": 738217722.0, "step": 19345 }, { "epoch": 2.4610100496120086, "ewc_loss": 0.03339173272252083, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.339173417771235e-05, "grad_norm": 19.16556739807129, "learning_rate": 1e-06, "loss": 0.3483, "mean_token_accuracy": 0.8896921873092651, "num_tokens": 738255304.0, "step": 19346 }, { "epoch": 2.461137259890599, "ewc_loss": 0.033553849905729294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.355384978931397e-05, "grad_norm": 19.127079010009766, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8704941272735596, "num_tokens": 738292162.0, "step": 19347 }, { "epoch": 2.4612644701691897, "ewc_loss": 0.03351154923439026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.351155100972392e-05, "grad_norm": 19.180835723876953, "learning_rate": 1e-06, "loss": 0.3513, "mean_token_accuracy": 0.8879748582839966, "num_tokens": 738332851.0, "step": 19348 }, { "epoch": 2.4613916804477802, "ewc_loss": 0.033591751009225845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3591750252526253e-05, "grad_norm": 19.20695686340332, "learning_rate": 1e-06, "loss": 0.3564, "mean_token_accuracy": 0.8853754997253418, "num_tokens": 738375120.0, "step": 19349 }, { "epoch": 2.4615188907263708, "ewc_loss": 0.033569831401109695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3569831430213526e-05, "grad_norm": 19.25419807434082, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8672734498977661, "num_tokens": 738413857.0, "step": 19350 }, { "epoch": 2.4616461010049613, "ewc_loss": 0.03348884731531143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.348884638398886e-05, "grad_norm": 19.229148864746094, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.86957848072052, "num_tokens": 738450097.0, "step": 19351 }, { "epoch": 2.461773311283552, "ewc_loss": 0.03348688408732414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3486885513411835e-05, "grad_norm": 19.256135940551758, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8566447496414185, "num_tokens": 738483784.0, "step": 19352 }, { "epoch": 2.4619005215621423, "ewc_loss": 0.03343537449836731, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.343537537148222e-05, "grad_norm": 19.150249481201172, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8568007946014404, "num_tokens": 738526820.0, "step": 19353 }, { "epoch": 2.462027731840733, "ewc_loss": 0.033479638397693634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.347963865962811e-05, "grad_norm": 19.239482879638672, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8616282343864441, "num_tokens": 738569049.0, "step": 19354 }, { "epoch": 2.4621549421193234, "ewc_loss": 0.03351834788918495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3518346754135564e-05, "grad_norm": 19.1287784576416, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8727714419364929, "num_tokens": 738608320.0, "step": 19355 }, { "epoch": 2.462282152397914, "ewc_loss": 0.03345959261059761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.345959339640103e-05, "grad_norm": 19.20218849182129, "learning_rate": 1e-06, "loss": 0.3464, "mean_token_accuracy": 0.8903911113739014, "num_tokens": 738645566.0, "step": 19356 }, { "epoch": 2.4624093626765045, "ewc_loss": 0.03348661586642265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.348661630298011e-05, "grad_norm": 19.186513900756836, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8527019023895264, "num_tokens": 738686184.0, "step": 19357 }, { "epoch": 2.462536572955095, "ewc_loss": 0.03342137113213539, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.342137279105373e-05, "grad_norm": 19.110973358154297, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8620188236236572, "num_tokens": 738728820.0, "step": 19358 }, { "epoch": 2.462663783233685, "ewc_loss": 0.033494919538497925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.349491817061789e-05, "grad_norm": 19.21097755432129, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8464192152023315, "num_tokens": 738760180.0, "step": 19359 }, { "epoch": 2.462790993512276, "ewc_loss": 0.03355192393064499, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.355192529852502e-05, "grad_norm": 19.099363327026367, "learning_rate": 1e-06, "loss": 0.3578, "mean_token_accuracy": 0.8829136490821838, "num_tokens": 738795501.0, "step": 19360 }, { "epoch": 2.462918203790866, "ewc_loss": 0.03344191610813141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3441916457377374e-05, "grad_norm": 19.15638542175293, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8658864498138428, "num_tokens": 738831027.0, "step": 19361 }, { "epoch": 2.4630454140694567, "ewc_loss": 0.03359618037939072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.359618131071329e-05, "grad_norm": 19.165687561035156, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.857909083366394, "num_tokens": 738870383.0, "step": 19362 }, { "epoch": 2.463172624348047, "ewc_loss": 0.03347841277718544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.347841266077012e-05, "grad_norm": 19.153182983398438, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8791107535362244, "num_tokens": 738908231.0, "step": 19363 }, { "epoch": 2.4632998346266377, "ewc_loss": 0.03359026461839676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.359026595717296e-05, "grad_norm": 19.243040084838867, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8662523031234741, "num_tokens": 738950927.0, "step": 19364 }, { "epoch": 2.4634270449052282, "ewc_loss": 0.033573925495147705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3573924156371504e-05, "grad_norm": 19.22887420654297, "learning_rate": 1e-06, "loss": 0.3579, "mean_token_accuracy": 0.8867230415344238, "num_tokens": 738989857.0, "step": 19365 }, { "epoch": 2.4635542551838188, "ewc_loss": 0.033517856150865555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3517855626996607e-05, "grad_norm": 19.23261260986328, "learning_rate": 1e-06, "loss": 0.3508, "mean_token_accuracy": 0.887852668762207, "num_tokens": 739021281.0, "step": 19366 }, { "epoch": 2.4636814654624093, "ewc_loss": 0.03350137546658516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.350137558300048e-05, "grad_norm": 19.215709686279297, "learning_rate": 1e-06, "loss": 0.3672, "mean_token_accuracy": 0.8822095394134521, "num_tokens": 739063475.0, "step": 19367 }, { "epoch": 2.463808675741, "ewc_loss": 0.03353777900338173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353777719894424e-05, "grad_norm": 19.164531707763672, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.870589017868042, "num_tokens": 739107083.0, "step": 19368 }, { "epoch": 2.4639358860195903, "ewc_loss": 0.03344055265188217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3440552215324715e-05, "grad_norm": 19.21402359008789, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8540627956390381, "num_tokens": 739152002.0, "step": 19369 }, { "epoch": 2.464063096298181, "ewc_loss": 0.03357870504260063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357870446052402e-05, "grad_norm": 19.215885162353516, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8721033334732056, "num_tokens": 739191781.0, "step": 19370 }, { "epoch": 2.4641903065767714, "ewc_loss": 0.033434346318244934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3434345823479816e-05, "grad_norm": 19.133499145507812, "learning_rate": 1e-06, "loss": 0.3317, "mean_token_accuracy": 0.894767165184021, "num_tokens": 739230374.0, "step": 19371 }, { "epoch": 2.464317516855362, "ewc_loss": 0.033441055566072464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3441054256400093e-05, "grad_norm": 19.23116111755371, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8717702031135559, "num_tokens": 739267310.0, "step": 19372 }, { "epoch": 2.4644447271339525, "ewc_loss": 0.03347886726260185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3478867408121005e-05, "grad_norm": 19.16916275024414, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8775453567504883, "num_tokens": 739305074.0, "step": 19373 }, { "epoch": 2.464571937412543, "ewc_loss": 0.03343988582491875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.343988646520302e-05, "grad_norm": 19.172149658203125, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8672058582305908, "num_tokens": 739343028.0, "step": 19374 }, { "epoch": 2.4646991476911335, "ewc_loss": 0.03351648524403572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.351648410898633e-05, "grad_norm": 19.151517868041992, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8740352392196655, "num_tokens": 739384364.0, "step": 19375 }, { "epoch": 2.464826357969724, "ewc_loss": 0.033491235226392746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3491236536065117e-05, "grad_norm": 19.261117935180664, "learning_rate": 1e-06, "loss": 0.3421, "mean_token_accuracy": 0.8911957740783691, "num_tokens": 739423564.0, "step": 19376 }, { "epoch": 2.4649535682483146, "ewc_loss": 0.033451300114393234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3451298804720864e-05, "grad_norm": 19.19674301147461, "learning_rate": 1e-06, "loss": 0.3531, "mean_token_accuracy": 0.8839004039764404, "num_tokens": 739457609.0, "step": 19377 }, { "epoch": 2.465080778526905, "ewc_loss": 0.0333854965865612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.338549504405819e-05, "grad_norm": 19.138200759887695, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8763676285743713, "num_tokens": 739497204.0, "step": 19378 }, { "epoch": 2.4652079888054956, "ewc_loss": 0.033395785838365555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.339578688610345e-05, "grad_norm": 19.153776168823242, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8696049451828003, "num_tokens": 739543688.0, "step": 19379 }, { "epoch": 2.465335199084086, "ewc_loss": 0.03347250074148178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3472500945208594e-05, "grad_norm": 19.179126739501953, "learning_rate": 1e-06, "loss": 0.3615, "mean_token_accuracy": 0.8869219422340393, "num_tokens": 739578221.0, "step": 19380 }, { "epoch": 2.4654624093626767, "ewc_loss": 0.033449381589889526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3449381589889526e-05, "grad_norm": 19.12163543701172, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8809781074523926, "num_tokens": 739615892.0, "step": 19381 }, { "epoch": 2.4655896196412668, "ewc_loss": 0.0335116870701313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.351168561493978e-05, "grad_norm": 19.161375045776367, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8606964349746704, "num_tokens": 739657697.0, "step": 19382 }, { "epoch": 2.4657168299198577, "ewc_loss": 0.03354353830218315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3543539757374674e-05, "grad_norm": 19.211915969848633, "learning_rate": 1e-06, "loss": 0.3542, "mean_token_accuracy": 0.8829619288444519, "num_tokens": 739694173.0, "step": 19383 }, { "epoch": 2.465844040198448, "ewc_loss": 0.03355870395898819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.355870285304263e-05, "grad_norm": 19.172168731689453, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8723898530006409, "num_tokens": 739734111.0, "step": 19384 }, { "epoch": 2.4659712504770384, "ewc_loss": 0.0335114486515522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.351144914631732e-05, "grad_norm": 19.245210647583008, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8775002360343933, "num_tokens": 739771409.0, "step": 19385 }, { "epoch": 2.466098460755629, "ewc_loss": 0.0335363894701004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353639112901874e-05, "grad_norm": 19.21360206604004, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8627604842185974, "num_tokens": 739807185.0, "step": 19386 }, { "epoch": 2.4662256710342194, "ewc_loss": 0.03351631015539169, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.351630948600359e-05, "grad_norm": 19.20645523071289, "learning_rate": 1e-06, "loss": 0.3551, "mean_token_accuracy": 0.8872479200363159, "num_tokens": 739837750.0, "step": 19387 }, { "epoch": 2.46635288131281, "ewc_loss": 0.033510174602270126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3510175853734836e-05, "grad_norm": 19.184133529663086, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8743515014648438, "num_tokens": 739879516.0, "step": 19388 }, { "epoch": 2.4664800915914005, "ewc_loss": 0.03354577720165253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3545777114341035e-05, "grad_norm": 19.200998306274414, "learning_rate": 1e-06, "loss": 0.3765, "mean_token_accuracy": 0.8815287351608276, "num_tokens": 739918688.0, "step": 19389 }, { "epoch": 2.466607301869991, "ewc_loss": 0.03356853872537613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.356853994773701e-05, "grad_norm": 19.224092483520508, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8788201808929443, "num_tokens": 739958978.0, "step": 19390 }, { "epoch": 2.4667345121485815, "ewc_loss": 0.033539656549692154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353965803398751e-05, "grad_norm": 19.23483657836914, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8797528743743896, "num_tokens": 740002428.0, "step": 19391 }, { "epoch": 2.466861722427172, "ewc_loss": 0.03351765125989914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.351765190018341e-05, "grad_norm": 19.142047882080078, "learning_rate": 1e-06, "loss": 0.3356, "mean_token_accuracy": 0.8961844444274902, "num_tokens": 740045856.0, "step": 19392 }, { "epoch": 2.4669889327057626, "ewc_loss": 0.033521268516778946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.352126805111766e-05, "grad_norm": 19.316307067871094, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8723225593566895, "num_tokens": 740084512.0, "step": 19393 }, { "epoch": 2.467116142984353, "ewc_loss": 0.03355192765593529, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.355192893650383e-05, "grad_norm": 19.19822883605957, "learning_rate": 1e-06, "loss": 0.3551, "mean_token_accuracy": 0.8858228921890259, "num_tokens": 740129624.0, "step": 19394 }, { "epoch": 2.4672433532629436, "ewc_loss": 0.033444736152887344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.344473589095287e-05, "grad_norm": 19.223722457885742, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8596463203430176, "num_tokens": 740168368.0, "step": 19395 }, { "epoch": 2.467370563541534, "ewc_loss": 0.03354461491107941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.354461659910157e-05, "grad_norm": 19.265520095825195, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8686144948005676, "num_tokens": 740206670.0, "step": 19396 }, { "epoch": 2.4674977738201247, "ewc_loss": 0.03341066464781761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3410666219424456e-05, "grad_norm": 19.19450569152832, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8533508777618408, "num_tokens": 740246422.0, "step": 19397 }, { "epoch": 2.4676249840987152, "ewc_loss": 0.033512137830257416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.351213672431186e-05, "grad_norm": 19.211383819580078, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8730647563934326, "num_tokens": 740284450.0, "step": 19398 }, { "epoch": 2.4677521943773058, "ewc_loss": 0.03344914689660072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3449145121267065e-05, "grad_norm": 19.284561157226562, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8533778190612793, "num_tokens": 740319062.0, "step": 19399 }, { "epoch": 2.4678794046558963, "ewc_loss": 0.03348391130566597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3483909646747634e-05, "grad_norm": 19.174314498901367, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.8767651915550232, "num_tokens": 740353996.0, "step": 19400 }, { "epoch": 2.468006614934487, "ewc_loss": 0.03345194086432457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.345193908899091e-05, "grad_norm": 19.24836540222168, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8741719126701355, "num_tokens": 740394694.0, "step": 19401 }, { "epoch": 2.4681338252130773, "ewc_loss": 0.03350929170846939, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.350929182488471e-05, "grad_norm": 19.16584587097168, "learning_rate": 1e-06, "loss": 0.3338, "mean_token_accuracy": 0.8951873779296875, "num_tokens": 740436567.0, "step": 19402 }, { "epoch": 2.468261035491668, "ewc_loss": 0.03349657729268074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.349657708895393e-05, "grad_norm": 19.28555679321289, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8671383857727051, "num_tokens": 740476458.0, "step": 19403 }, { "epoch": 2.4683882457702584, "ewc_loss": 0.03349725157022476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.349725011503324e-05, "grad_norm": 19.19456672668457, "learning_rate": 1e-06, "loss": 0.354, "mean_token_accuracy": 0.883299708366394, "num_tokens": 740514703.0, "step": 19404 }, { "epoch": 2.468515456048849, "ewc_loss": 0.03338729217648506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.338729220558889e-05, "grad_norm": 19.178178787231445, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.8856984376907349, "num_tokens": 740552328.0, "step": 19405 }, { "epoch": 2.4686426663274394, "ewc_loss": 0.03357500582933426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357500463607721e-05, "grad_norm": 19.283123016357422, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8572540283203125, "num_tokens": 740593141.0, "step": 19406 }, { "epoch": 2.4687698766060295, "ewc_loss": 0.033450376242399216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.345037475810386e-05, "grad_norm": 19.148006439208984, "learning_rate": 1e-06, "loss": 0.3589, "mean_token_accuracy": 0.8856872916221619, "num_tokens": 740634097.0, "step": 19407 }, { "epoch": 2.4688970868846205, "ewc_loss": 0.033478498458862305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.347849997226149e-05, "grad_norm": 19.229963302612305, "learning_rate": 1e-06, "loss": 0.3753, "mean_token_accuracy": 0.87999027967453, "num_tokens": 740672917.0, "step": 19408 }, { "epoch": 2.4690242971632106, "ewc_loss": 0.0334930419921875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3493040973553434e-05, "grad_norm": 19.16826057434082, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8698595762252808, "num_tokens": 740709351.0, "step": 19409 }, { "epoch": 2.469151507441801, "ewc_loss": 0.03347193822264671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3471937058493495e-05, "grad_norm": 19.213014602661133, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8660046458244324, "num_tokens": 740744049.0, "step": 19410 }, { "epoch": 2.4692787177203916, "ewc_loss": 0.03353424742817879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353424835950136e-05, "grad_norm": 19.176349639892578, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8781312704086304, "num_tokens": 740784765.0, "step": 19411 }, { "epoch": 2.469405927998982, "ewc_loss": 0.033456869423389435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.345686855027452e-05, "grad_norm": 19.179243087768555, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8687546253204346, "num_tokens": 740815859.0, "step": 19412 }, { "epoch": 2.4695331382775727, "ewc_loss": 0.033568281680345535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3568281651241705e-05, "grad_norm": 19.261077880859375, "learning_rate": 1e-06, "loss": 0.3296, "mean_token_accuracy": 0.8928945064544678, "num_tokens": 740854300.0, "step": 19413 }, { "epoch": 2.4696603485561632, "ewc_loss": 0.033490147441625595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3490148780401796e-05, "grad_norm": 19.187767028808594, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8588991165161133, "num_tokens": 740891836.0, "step": 19414 }, { "epoch": 2.4697875588347538, "ewc_loss": 0.03344137594103813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.344137439853512e-05, "grad_norm": 19.162796020507812, "learning_rate": 1e-06, "loss": 0.3648, "mean_token_accuracy": 0.8826714754104614, "num_tokens": 740927434.0, "step": 19415 }, { "epoch": 2.4699147691133443, "ewc_loss": 0.03355444595217705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3554446417838335e-05, "grad_norm": 19.19095802307129, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8776618242263794, "num_tokens": 740973707.0, "step": 19416 }, { "epoch": 2.470041979391935, "ewc_loss": 0.03347661346197128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.347661186126061e-05, "grad_norm": 19.1273193359375, "learning_rate": 1e-06, "loss": 0.3727, "mean_token_accuracy": 0.8822131156921387, "num_tokens": 741015953.0, "step": 19417 }, { "epoch": 2.4701691896705253, "ewc_loss": 0.03350702300667763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.350702172610909e-05, "grad_norm": 19.216596603393555, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.8820148706436157, "num_tokens": 741049634.0, "step": 19418 }, { "epoch": 2.470296399949116, "ewc_loss": 0.03355945274233818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.355945227667689e-05, "grad_norm": 19.179222106933594, "learning_rate": 1e-06, "loss": 0.3616, "mean_token_accuracy": 0.8860918283462524, "num_tokens": 741084267.0, "step": 19419 }, { "epoch": 2.4704236102277064, "ewc_loss": 0.033489566296339035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.348956670379266e-05, "grad_norm": 19.16234588623047, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8809086084365845, "num_tokens": 741122237.0, "step": 19420 }, { "epoch": 2.470550820506297, "ewc_loss": 0.03352205455303192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.352205385453999e-05, "grad_norm": 19.190710067749023, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8630151748657227, "num_tokens": 741163593.0, "step": 19421 }, { "epoch": 2.4706780307848875, "ewc_loss": 0.03353536128997803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353536158101633e-05, "grad_norm": 19.23036766052246, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8646930456161499, "num_tokens": 741202328.0, "step": 19422 }, { "epoch": 2.470805241063478, "ewc_loss": 0.03350219130516052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.350219049025327e-05, "grad_norm": 19.139389038085938, "learning_rate": 1e-06, "loss": 0.3281, "mean_token_accuracy": 0.8946473598480225, "num_tokens": 741240095.0, "step": 19423 }, { "epoch": 2.4709324513420685, "ewc_loss": 0.03353186324238777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353186184540391e-05, "grad_norm": 19.189090728759766, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8769996762275696, "num_tokens": 741278509.0, "step": 19424 }, { "epoch": 2.471059661620659, "ewc_loss": 0.033444564789533615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.344456490594894e-05, "grad_norm": 19.119932174682617, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.8717391490936279, "num_tokens": 741321446.0, "step": 19425 }, { "epoch": 2.4711868718992496, "ewc_loss": 0.03346189111471176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3461892599007115e-05, "grad_norm": 19.23828887939453, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.871335506439209, "num_tokens": 741360469.0, "step": 19426 }, { "epoch": 2.47131408217784, "ewc_loss": 0.033532384783029556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353238571435213e-05, "grad_norm": 19.19552230834961, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8658403158187866, "num_tokens": 741393833.0, "step": 19427 }, { "epoch": 2.4714412924564306, "ewc_loss": 0.0334390252828598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3439024264225736e-05, "grad_norm": 19.18769645690918, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8807079792022705, "num_tokens": 741426956.0, "step": 19428 }, { "epoch": 2.471568502735021, "ewc_loss": 0.03354983404278755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.354983346071094e-05, "grad_norm": 19.187734603881836, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8750073313713074, "num_tokens": 741464820.0, "step": 19429 }, { "epoch": 2.4716957130136117, "ewc_loss": 0.03351260721683502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.351260602357797e-05, "grad_norm": 19.1909236907959, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.8810542225837708, "num_tokens": 741499519.0, "step": 19430 }, { "epoch": 2.471822923292202, "ewc_loss": 0.03347046300768852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.347046367707662e-05, "grad_norm": 19.14997673034668, "learning_rate": 1e-06, "loss": 0.3642, "mean_token_accuracy": 0.8822558522224426, "num_tokens": 741533443.0, "step": 19431 }, { "epoch": 2.4719501335707923, "ewc_loss": 0.03351513668894768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.351513805682771e-05, "grad_norm": 19.150774002075195, "learning_rate": 1e-06, "loss": 0.3995, "mean_token_accuracy": 0.8722707629203796, "num_tokens": 741576849.0, "step": 19432 }, { "epoch": 2.4720773438493833, "ewc_loss": 0.03352310135960579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.352310159243643e-05, "grad_norm": 19.133609771728516, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8799454569816589, "num_tokens": 741614653.0, "step": 19433 }, { "epoch": 2.4722045541279734, "ewc_loss": 0.03352055698633194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3520558645250276e-05, "grad_norm": 19.132675170898438, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.869735836982727, "num_tokens": 741649354.0, "step": 19434 }, { "epoch": 2.472331764406564, "ewc_loss": 0.03356843441724777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.35684344463516e-05, "grad_norm": 19.195213317871094, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8682211637496948, "num_tokens": 741692257.0, "step": 19435 }, { "epoch": 2.4724589746851544, "ewc_loss": 0.03357655182480812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3576550777070224e-05, "grad_norm": 19.18243980407715, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8734973669052124, "num_tokens": 741729739.0, "step": 19436 }, { "epoch": 2.472586184963745, "ewc_loss": 0.03354766219854355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.354766158736311e-05, "grad_norm": 19.189701080322266, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8656670451164246, "num_tokens": 741763409.0, "step": 19437 }, { "epoch": 2.4727133952423355, "ewc_loss": 0.03359834477305412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.359834590810351e-05, "grad_norm": 19.259017944335938, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8726877570152283, "num_tokens": 741798746.0, "step": 19438 }, { "epoch": 2.472840605520926, "ewc_loss": 0.033607810735702515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.360781192895956e-05, "grad_norm": 19.15207862854004, "learning_rate": 1e-06, "loss": 0.3607, "mean_token_accuracy": 0.8836630582809448, "num_tokens": 741840488.0, "step": 19439 }, { "epoch": 2.4729678157995165, "ewc_loss": 0.03356053680181503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3560536394361407e-05, "grad_norm": 19.229915618896484, "learning_rate": 1e-06, "loss": 0.3396, "mean_token_accuracy": 0.8914434909820557, "num_tokens": 741880839.0, "step": 19440 }, { "epoch": 2.473095026078107, "ewc_loss": 0.033616404980421066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3616404834901914e-05, "grad_norm": 19.13185691833496, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8691686391830444, "num_tokens": 741919317.0, "step": 19441 }, { "epoch": 2.4732222363566976, "ewc_loss": 0.033574290573596954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357429159223102e-05, "grad_norm": 19.241914749145508, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.855537474155426, "num_tokens": 741955782.0, "step": 19442 }, { "epoch": 2.473349446635288, "ewc_loss": 0.03360988572239876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3609885576879606e-05, "grad_norm": 19.232467651367188, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.868445634841919, "num_tokens": 741990410.0, "step": 19443 }, { "epoch": 2.4734766569138786, "ewc_loss": 0.03355882316827774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3558822906343266e-05, "grad_norm": 19.18729591369629, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8709465861320496, "num_tokens": 742031536.0, "step": 19444 }, { "epoch": 2.473603867192469, "ewc_loss": 0.03354350104928017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3543499739607796e-05, "grad_norm": 19.241899490356445, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8646336793899536, "num_tokens": 742067222.0, "step": 19445 }, { "epoch": 2.4737310774710597, "ewc_loss": 0.033574409782886505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357440800755285e-05, "grad_norm": 19.11861801147461, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.8859721422195435, "num_tokens": 742114740.0, "step": 19446 }, { "epoch": 2.47385828774965, "ewc_loss": 0.03347337618470192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.34733776981011e-05, "grad_norm": 19.148588180541992, "learning_rate": 1e-06, "loss": 0.3632, "mean_token_accuracy": 0.8839302062988281, "num_tokens": 742149344.0, "step": 19447 }, { "epoch": 2.4739854980282407, "ewc_loss": 0.03366946801543236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3669468393782154e-05, "grad_norm": 19.237781524658203, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8773785829544067, "num_tokens": 742191089.0, "step": 19448 }, { "epoch": 2.4741127083068313, "ewc_loss": 0.03365188464522362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365188604220748e-05, "grad_norm": 19.282196044921875, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.875164806842804, "num_tokens": 742224944.0, "step": 19449 }, { "epoch": 2.474239918585422, "ewc_loss": 0.033577609807252884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357760942890309e-05, "grad_norm": 19.19038963317871, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8647958040237427, "num_tokens": 742263674.0, "step": 19450 }, { "epoch": 2.4743671288640123, "ewc_loss": 0.03361351415514946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3613512641750276e-05, "grad_norm": 19.290847778320312, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8694052696228027, "num_tokens": 742300424.0, "step": 19451 }, { "epoch": 2.474494339142603, "ewc_loss": 0.03356179594993591, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.356179513502866e-05, "grad_norm": 19.146106719970703, "learning_rate": 1e-06, "loss": 0.3639, "mean_token_accuracy": 0.8852071166038513, "num_tokens": 742333854.0, "step": 19452 }, { "epoch": 2.4746215494211934, "ewc_loss": 0.03356156870722771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.356156958034262e-05, "grad_norm": 19.162729263305664, "learning_rate": 1e-06, "loss": 0.3641, "mean_token_accuracy": 0.883293867111206, "num_tokens": 742377126.0, "step": 19453 }, { "epoch": 2.474748759699784, "ewc_loss": 0.033593401312828064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.359340189490467e-05, "grad_norm": 19.206701278686523, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8731595277786255, "num_tokens": 742421284.0, "step": 19454 }, { "epoch": 2.4748759699783744, "ewc_loss": 0.033575937151908875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3575935958651826e-05, "grad_norm": 19.182315826416016, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8584402203559875, "num_tokens": 742459124.0, "step": 19455 }, { "epoch": 2.475003180256965, "ewc_loss": 0.03359435126185417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3594351407373324e-05, "grad_norm": 19.184795379638672, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8663474321365356, "num_tokens": 742493137.0, "step": 19456 }, { "epoch": 2.475130390535555, "ewc_loss": 0.03357992321252823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.35799231834244e-05, "grad_norm": 19.191686630249023, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8785064816474915, "num_tokens": 742529805.0, "step": 19457 }, { "epoch": 2.475257600814146, "ewc_loss": 0.033547092229127884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3547090424690396e-05, "grad_norm": 19.147768020629883, "learning_rate": 1e-06, "loss": 0.3497, "mean_token_accuracy": 0.8881402015686035, "num_tokens": 742560859.0, "step": 19458 }, { "epoch": 2.475384811092736, "ewc_loss": 0.033603884279727936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3603882911847904e-05, "grad_norm": 19.33579444885254, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.8867760896682739, "num_tokens": 742601465.0, "step": 19459 }, { "epoch": 2.4755120213713266, "ewc_loss": 0.03357527777552605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357527748448774e-05, "grad_norm": 19.151973724365234, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8692474365234375, "num_tokens": 742639242.0, "step": 19460 }, { "epoch": 2.475639231649917, "ewc_loss": 0.03356005996465683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.356005981913768e-05, "grad_norm": 19.291000366210938, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8747339844703674, "num_tokens": 742676574.0, "step": 19461 }, { "epoch": 2.4757664419285077, "ewc_loss": 0.033609747886657715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3609747333684936e-05, "grad_norm": 19.14324378967285, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.864654541015625, "num_tokens": 742713326.0, "step": 19462 }, { "epoch": 2.4758936522070982, "ewc_loss": 0.033557768911123276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.355776789248921e-05, "grad_norm": 19.228282928466797, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8519325256347656, "num_tokens": 742756883.0, "step": 19463 }, { "epoch": 2.4760208624856888, "ewc_loss": 0.03361606225371361, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.361606286489405e-05, "grad_norm": 19.231292724609375, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8711252212524414, "num_tokens": 742794301.0, "step": 19464 }, { "epoch": 2.4761480727642793, "ewc_loss": 0.033586278557777405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.358627873240039e-05, "grad_norm": 19.185136795043945, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8774425983428955, "num_tokens": 742832966.0, "step": 19465 }, { "epoch": 2.47627528304287, "ewc_loss": 0.03356817737221718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.35681761498563e-05, "grad_norm": 19.251644134521484, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8714426755905151, "num_tokens": 742872926.0, "step": 19466 }, { "epoch": 2.4764024933214603, "ewc_loss": 0.03353181481361389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353181455167942e-05, "grad_norm": 19.154844284057617, "learning_rate": 1e-06, "loss": 0.366, "mean_token_accuracy": 0.8820201754570007, "num_tokens": 742910838.0, "step": 19467 }, { "epoch": 2.476529703600051, "ewc_loss": 0.03362714499235153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.362714414834045e-05, "grad_norm": 19.33169937133789, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.8835382461547852, "num_tokens": 742950098.0, "step": 19468 }, { "epoch": 2.4766569138786414, "ewc_loss": 0.03360495716333389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3604956115595996e-05, "grad_norm": 19.23275375366211, "learning_rate": 1e-06, "loss": 0.3509, "mean_token_accuracy": 0.8856076002120972, "num_tokens": 742983872.0, "step": 19469 }, { "epoch": 2.476784124157232, "ewc_loss": 0.03355450555682182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.355450462549925e-05, "grad_norm": 19.20919418334961, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8633658289909363, "num_tokens": 743029816.0, "step": 19470 }, { "epoch": 2.4769113344358225, "ewc_loss": 0.03353220596909523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3532207453390583e-05, "grad_norm": 19.175939559936523, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8669541478157043, "num_tokens": 743068565.0, "step": 19471 }, { "epoch": 2.477038544714413, "ewc_loss": 0.03357406705617905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357406603754498e-05, "grad_norm": 19.17803192138672, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8763105273246765, "num_tokens": 743112137.0, "step": 19472 }, { "epoch": 2.4771657549930035, "ewc_loss": 0.03356976434588432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3569765946595e-05, "grad_norm": 19.24715805053711, "learning_rate": 1e-06, "loss": 0.3488, "mean_token_accuracy": 0.889130711555481, "num_tokens": 743145067.0, "step": 19473 }, { "epoch": 2.477292965271594, "ewc_loss": 0.03348716348409653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3487161999801174e-05, "grad_norm": 19.108545303344727, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8790300488471985, "num_tokens": 743177017.0, "step": 19474 }, { "epoch": 2.4774201755501846, "ewc_loss": 0.03353766351938248, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353766442160122e-05, "grad_norm": 19.298805236816406, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8746031522750854, "num_tokens": 743218121.0, "step": 19475 }, { "epoch": 2.477547385828775, "ewc_loss": 0.03359917923808098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3599179005250335e-05, "grad_norm": 19.117427825927734, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8809833526611328, "num_tokens": 743256635.0, "step": 19476 }, { "epoch": 2.4776745961073656, "ewc_loss": 0.033544715493917465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3544714824529365e-05, "grad_norm": 19.295316696166992, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8732735514640808, "num_tokens": 743293801.0, "step": 19477 }, { "epoch": 2.477801806385956, "ewc_loss": 0.033634766936302185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3634765713941306e-05, "grad_norm": 19.17824363708496, "learning_rate": 1e-06, "loss": 0.3478, "mean_token_accuracy": 0.8886408805847168, "num_tokens": 743331582.0, "step": 19478 }, { "epoch": 2.4779290166645467, "ewc_loss": 0.03354102745652199, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3541025914018974e-05, "grad_norm": 19.345373153686523, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8783406615257263, "num_tokens": 743369901.0, "step": 19479 }, { "epoch": 2.4780562269431368, "ewc_loss": 0.03362393379211426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3623935451032594e-05, "grad_norm": 19.199527740478516, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8706632852554321, "num_tokens": 743405808.0, "step": 19480 }, { "epoch": 2.4781834372217277, "ewc_loss": 0.03347127139568329, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.34712713083718e-05, "grad_norm": 19.220333099365234, "learning_rate": 1e-06, "loss": 0.3798, "mean_token_accuracy": 0.8774400353431702, "num_tokens": 743447236.0, "step": 19481 }, { "epoch": 2.478310647500318, "ewc_loss": 0.033598918467760086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3598917070776224e-05, "grad_norm": 19.163694381713867, "learning_rate": 1e-06, "loss": 0.3639, "mean_token_accuracy": 0.8822939395904541, "num_tokens": 743486494.0, "step": 19482 }, { "epoch": 2.4784378577789083, "ewc_loss": 0.03352494537830353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.352494604769163e-05, "grad_norm": 19.210182189941406, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8716802597045898, "num_tokens": 743523941.0, "step": 19483 }, { "epoch": 2.478565068057499, "ewc_loss": 0.033576883375644684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357688183314167e-05, "grad_norm": 19.214189529418945, "learning_rate": 1e-06, "loss": 0.3852, "mean_token_accuracy": 0.8794276714324951, "num_tokens": 743554169.0, "step": 19484 }, { "epoch": 2.4786922783360894, "ewc_loss": 0.033573996275663376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3573996915947646e-05, "grad_norm": 19.205219268798828, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8609263896942139, "num_tokens": 743594656.0, "step": 19485 }, { "epoch": 2.47881948861468, "ewc_loss": 0.033530090004205704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3530090149724856e-05, "grad_norm": 19.16007423400879, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.8813107013702393, "num_tokens": 743628638.0, "step": 19486 }, { "epoch": 2.4789466988932705, "ewc_loss": 0.03361855074763298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.36185512423981e-05, "grad_norm": 19.241010665893555, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8678247332572937, "num_tokens": 743665382.0, "step": 19487 }, { "epoch": 2.479073909171861, "ewc_loss": 0.03360851854085922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.360851769684814e-05, "grad_norm": 19.108901977539062, "learning_rate": 1e-06, "loss": 0.351, "mean_token_accuracy": 0.8878895044326782, "num_tokens": 743701712.0, "step": 19488 }, { "epoch": 2.4792011194504515, "ewc_loss": 0.0335589200258255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.355892113177106e-05, "grad_norm": 19.222990036010742, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8757225275039673, "num_tokens": 743740347.0, "step": 19489 }, { "epoch": 2.479328329729042, "ewc_loss": 0.03364601731300354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.364601798239164e-05, "grad_norm": 19.167760848999023, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.8781917095184326, "num_tokens": 743775434.0, "step": 19490 }, { "epoch": 2.4794555400076326, "ewc_loss": 0.03359460085630417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.359460242791101e-05, "grad_norm": 19.255088806152344, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8743283748626709, "num_tokens": 743812475.0, "step": 19491 }, { "epoch": 2.479582750286223, "ewc_loss": 0.03369974344968796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.369974365341477e-05, "grad_norm": 19.199872970581055, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8666291832923889, "num_tokens": 743851350.0, "step": 19492 }, { "epoch": 2.4797099605648136, "ewc_loss": 0.03361118212342262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.361118069733493e-05, "grad_norm": 19.255807876586914, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8731261491775513, "num_tokens": 743885086.0, "step": 19493 }, { "epoch": 2.479837170843404, "ewc_loss": 0.03366933763027191, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.36693374265451e-05, "grad_norm": 19.20022201538086, "learning_rate": 1e-06, "loss": 0.3774, "mean_token_accuracy": 0.877930223941803, "num_tokens": 743931747.0, "step": 19494 }, { "epoch": 2.4799643811219947, "ewc_loss": 0.03360375016927719, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.360375194461085e-05, "grad_norm": 19.234474182128906, "learning_rate": 1e-06, "loss": 0.3685, "mean_token_accuracy": 0.8876854777336121, "num_tokens": 743971806.0, "step": 19495 }, { "epoch": 2.480091591400585, "ewc_loss": 0.033623334020376205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3623335184529424e-05, "grad_norm": 19.205181121826172, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.8853752017021179, "num_tokens": 744005304.0, "step": 19496 }, { "epoch": 2.4802188016791757, "ewc_loss": 0.033529918640851974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.352991916472092e-05, "grad_norm": 19.111831665039062, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8670941591262817, "num_tokens": 744051159.0, "step": 19497 }, { "epoch": 2.4803460119577663, "ewc_loss": 0.03363325074315071, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3633252314757556e-05, "grad_norm": 19.275556564331055, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8571622967720032, "num_tokens": 744089856.0, "step": 19498 }, { "epoch": 2.480473222236357, "ewc_loss": 0.03363315761089325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363315772730857e-05, "grad_norm": 19.186573028564453, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8564286231994629, "num_tokens": 744136355.0, "step": 19499 }, { "epoch": 2.4806004325149473, "ewc_loss": 0.03356623277068138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.356623346917331e-05, "grad_norm": 19.243053436279297, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8649129271507263, "num_tokens": 744177682.0, "step": 19500 }, { "epoch": 2.480727642793538, "ewc_loss": 0.03362280875444412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3622807677602395e-05, "grad_norm": 19.16189956665039, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8761072158813477, "num_tokens": 744214209.0, "step": 19501 }, { "epoch": 2.4808548530721284, "ewc_loss": 0.033611755818128586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.361175549798645e-05, "grad_norm": 19.269359588623047, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8587669134140015, "num_tokens": 744257923.0, "step": 19502 }, { "epoch": 2.480982063350719, "ewc_loss": 0.03366886451840401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3668864489300177e-05, "grad_norm": 19.30459976196289, "learning_rate": 1e-06, "loss": 0.3645, "mean_token_accuracy": 0.8815107345581055, "num_tokens": 744293370.0, "step": 19503 }, { "epoch": 2.4811092736293094, "ewc_loss": 0.0335419587790966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.354195723659359e-05, "grad_norm": 19.251577377319336, "learning_rate": 1e-06, "loss": 0.3626, "mean_token_accuracy": 0.880850076675415, "num_tokens": 744328572.0, "step": 19504 }, { "epoch": 2.4812364839078995, "ewc_loss": 0.0335155688226223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3515567338326946e-05, "grad_norm": 19.154455184936523, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8583308458328247, "num_tokens": 744364124.0, "step": 19505 }, { "epoch": 2.4813636941864905, "ewc_loss": 0.033561430871486664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.356143133714795e-05, "grad_norm": 19.243040084838867, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8777072429656982, "num_tokens": 744403679.0, "step": 19506 }, { "epoch": 2.4814909044650806, "ewc_loss": 0.03362107649445534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.362107599969022e-05, "grad_norm": 19.239290237426758, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8741530776023865, "num_tokens": 744440922.0, "step": 19507 }, { "epoch": 2.481618114743671, "ewc_loss": 0.03353993222117424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353993088239804e-05, "grad_norm": 19.217941284179688, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8720430135726929, "num_tokens": 744479025.0, "step": 19508 }, { "epoch": 2.4817453250222616, "ewc_loss": 0.03359530121088028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3595300919841975e-05, "grad_norm": 19.25289535522461, "learning_rate": 1e-06, "loss": 0.378, "mean_token_accuracy": 0.8769128322601318, "num_tokens": 744514165.0, "step": 19509 }, { "epoch": 2.481872535300852, "ewc_loss": 0.033465124666690826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.346512312418781e-05, "grad_norm": 19.266063690185547, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8697471618652344, "num_tokens": 744547210.0, "step": 19510 }, { "epoch": 2.4819997455794427, "ewc_loss": 0.03358759358525276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.358759204274975e-05, "grad_norm": 19.23016929626465, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8744932413101196, "num_tokens": 744585132.0, "step": 19511 }, { "epoch": 2.4821269558580332, "ewc_loss": 0.03358368203043938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3583681215532124e-05, "grad_norm": 19.30660057067871, "learning_rate": 1e-06, "loss": 0.3551, "mean_token_accuracy": 0.8855366110801697, "num_tokens": 744617921.0, "step": 19512 }, { "epoch": 2.4822541661366238, "ewc_loss": 0.03362630680203438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.362630741321482e-05, "grad_norm": 19.264772415161133, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8698077201843262, "num_tokens": 744653095.0, "step": 19513 }, { "epoch": 2.4823813764152143, "ewc_loss": 0.03349078446626663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.349078542669304e-05, "grad_norm": 19.199337005615234, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8815515041351318, "num_tokens": 744688258.0, "step": 19514 }, { "epoch": 2.482508586693805, "ewc_loss": 0.03360861912369728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.360861956025474e-05, "grad_norm": 19.20891571044922, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8757966756820679, "num_tokens": 744723500.0, "step": 19515 }, { "epoch": 2.4826357969723953, "ewc_loss": 0.03360942751169205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.360942719154991e-05, "grad_norm": 19.273479461669922, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8749760985374451, "num_tokens": 744757133.0, "step": 19516 }, { "epoch": 2.482763007250986, "ewc_loss": 0.03364425525069237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.36442535626702e-05, "grad_norm": 19.194250106811523, "learning_rate": 1e-06, "loss": 0.3549, "mean_token_accuracy": 0.8878157138824463, "num_tokens": 744793814.0, "step": 19517 }, { "epoch": 2.4828902175295764, "ewc_loss": 0.0336357019841671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363570067449473e-05, "grad_norm": 19.243566513061523, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8781381249427795, "num_tokens": 744828680.0, "step": 19518 }, { "epoch": 2.483017427808167, "ewc_loss": 0.03361515328288078, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3615153370192274e-05, "grad_norm": 19.205230712890625, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8696522116661072, "num_tokens": 744861642.0, "step": 19519 }, { "epoch": 2.4831446380867574, "ewc_loss": 0.033631063997745514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3631062251515687e-05, "grad_norm": 19.277103424072266, "learning_rate": 1e-06, "loss": 0.3562, "mean_token_accuracy": 0.8862003087997437, "num_tokens": 744900345.0, "step": 19520 }, { "epoch": 2.483271848365348, "ewc_loss": 0.03369997814297676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3699976484058425e-05, "grad_norm": 19.19256591796875, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.8811848759651184, "num_tokens": 744938589.0, "step": 19521 }, { "epoch": 2.4833990586439385, "ewc_loss": 0.03365899249911308, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365899101481773e-05, "grad_norm": 19.27703094482422, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8662482500076294, "num_tokens": 744975385.0, "step": 19522 }, { "epoch": 2.483526268922529, "ewc_loss": 0.03369970619678497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.36997072736267e-05, "grad_norm": 19.163898468017578, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8646886348724365, "num_tokens": 745010860.0, "step": 19523 }, { "epoch": 2.4836534792011196, "ewc_loss": 0.03363329544663429, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363329597050324e-05, "grad_norm": 19.303403854370117, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8670049905776978, "num_tokens": 745047183.0, "step": 19524 }, { "epoch": 2.48378068947971, "ewc_loss": 0.03373410925269127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.373410800122656e-05, "grad_norm": 19.1507568359375, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8721145391464233, "num_tokens": 745081906.0, "step": 19525 }, { "epoch": 2.4839078997583006, "ewc_loss": 0.033725492656230927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3725493267411366e-05, "grad_norm": 19.301374435424805, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8802685737609863, "num_tokens": 745127285.0, "step": 19526 }, { "epoch": 2.484035110036891, "ewc_loss": 0.03375118970870972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3751188311725855e-05, "grad_norm": 19.136613845825195, "learning_rate": 1e-06, "loss": 0.3454, "mean_token_accuracy": 0.8904044032096863, "num_tokens": 745171564.0, "step": 19527 }, { "epoch": 2.4841623203154817, "ewc_loss": 0.03367132321000099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367132376297377e-05, "grad_norm": 19.280488967895508, "learning_rate": 1e-06, "loss": 0.3655, "mean_token_accuracy": 0.884372353553772, "num_tokens": 745214252.0, "step": 19528 }, { "epoch": 2.484289530594072, "ewc_loss": 0.033668387681245804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366838791407645e-05, "grad_norm": 19.151470184326172, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8772302865982056, "num_tokens": 745251983.0, "step": 19529 }, { "epoch": 2.4844167408726623, "ewc_loss": 0.0335875041782856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.358750473125838e-05, "grad_norm": 19.218570709228516, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8768324851989746, "num_tokens": 745287037.0, "step": 19530 }, { "epoch": 2.4845439511512533, "ewc_loss": 0.033769845962524414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376984750502743e-05, "grad_norm": 19.18419075012207, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8730100393295288, "num_tokens": 745327854.0, "step": 19531 }, { "epoch": 2.4846711614298433, "ewc_loss": 0.033635202795267105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363520227139816e-05, "grad_norm": 19.195344924926758, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8643270134925842, "num_tokens": 745370227.0, "step": 19532 }, { "epoch": 2.484798371708434, "ewc_loss": 0.033737558871507645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.373756044311449e-05, "grad_norm": 19.182994842529297, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.87337327003479, "num_tokens": 745409416.0, "step": 19533 }, { "epoch": 2.4849255819870244, "ewc_loss": 0.03366992622613907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366992677911185e-05, "grad_norm": 19.250165939331055, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8578697443008423, "num_tokens": 745449908.0, "step": 19534 }, { "epoch": 2.485052792265615, "ewc_loss": 0.03371617943048477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.371618004166521e-05, "grad_norm": 19.24137306213379, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8646368384361267, "num_tokens": 745486384.0, "step": 19535 }, { "epoch": 2.4851800025442055, "ewc_loss": 0.03365897014737129, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365896918694489e-05, "grad_norm": 19.219240188598633, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8640565872192383, "num_tokens": 745523058.0, "step": 19536 }, { "epoch": 2.485307212822796, "ewc_loss": 0.033668000251054764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366799865034409e-05, "grad_norm": 19.244333267211914, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8578433394432068, "num_tokens": 745559276.0, "step": 19537 }, { "epoch": 2.4854344231013865, "ewc_loss": 0.03366726636886597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366726741660386e-05, "grad_norm": 19.20322608947754, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8728524446487427, "num_tokens": 745592726.0, "step": 19538 }, { "epoch": 2.485561633379977, "ewc_loss": 0.0336766317486763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367663157405332e-05, "grad_norm": 19.278696060180664, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8767798542976379, "num_tokens": 745630117.0, "step": 19539 }, { "epoch": 2.4856888436585676, "ewc_loss": 0.03366966173052788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366966120665893e-05, "grad_norm": 19.173757553100586, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.861538290977478, "num_tokens": 745664968.0, "step": 19540 }, { "epoch": 2.485816053937158, "ewc_loss": 0.03360157087445259, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3601569157326594e-05, "grad_norm": 19.175615310668945, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8761260509490967, "num_tokens": 745701893.0, "step": 19541 }, { "epoch": 2.4859432642157486, "ewc_loss": 0.033752769231796265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375277083250694e-05, "grad_norm": 19.26651382446289, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8726862668991089, "num_tokens": 745736884.0, "step": 19542 }, { "epoch": 2.486070474494339, "ewc_loss": 0.03372713923454285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.372713763383217e-05, "grad_norm": 19.229713439941406, "learning_rate": 1e-06, "loss": 0.3546, "mean_token_accuracy": 0.8862205743789673, "num_tokens": 745782381.0, "step": 19543 }, { "epoch": 2.4861976847729297, "ewc_loss": 0.03366086632013321, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366086457390338e-05, "grad_norm": 19.221677780151367, "learning_rate": 1e-06, "loss": 0.341, "mean_token_accuracy": 0.8909227848052979, "num_tokens": 745827630.0, "step": 19544 }, { "epoch": 2.48632489505152, "ewc_loss": 0.03369748964905739, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3697488106554374e-05, "grad_norm": 19.199363708496094, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8818756341934204, "num_tokens": 745862223.0, "step": 19545 }, { "epoch": 2.4864521053301107, "ewc_loss": 0.03363089635968208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363089490449056e-05, "grad_norm": 19.232023239135742, "learning_rate": 1e-06, "loss": 0.347, "mean_token_accuracy": 0.8899626135826111, "num_tokens": 745903077.0, "step": 19546 }, { "epoch": 2.4865793156087013, "ewc_loss": 0.033655062317848206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365506199770607e-05, "grad_norm": 19.19672966003418, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.8749979138374329, "num_tokens": 745945534.0, "step": 19547 }, { "epoch": 2.486706525887292, "ewc_loss": 0.03359697014093399, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.359697075211443e-05, "grad_norm": 19.274600982666016, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8579934239387512, "num_tokens": 745986308.0, "step": 19548 }, { "epoch": 2.4868337361658823, "ewc_loss": 0.03370409458875656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.370409467606805e-05, "grad_norm": 19.213239669799805, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8671789169311523, "num_tokens": 746022206.0, "step": 19549 }, { "epoch": 2.486960946444473, "ewc_loss": 0.03359545022249222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3595450076973066e-05, "grad_norm": 19.241580963134766, "learning_rate": 1e-06, "loss": 0.3639, "mean_token_accuracy": 0.8816648721694946, "num_tokens": 746059854.0, "step": 19550 }, { "epoch": 2.4870881567230634, "ewc_loss": 0.033711761236190796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3711759897414595e-05, "grad_norm": 19.272647857666016, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.8826429843902588, "num_tokens": 746103994.0, "step": 19551 }, { "epoch": 2.487215367001654, "ewc_loss": 0.03354061767458916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.354061846039258e-05, "grad_norm": 19.212818145751953, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8650538325309753, "num_tokens": 746138491.0, "step": 19552 }, { "epoch": 2.4873425772802444, "ewc_loss": 0.033574189990758896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357418972882442e-05, "grad_norm": 19.25472068786621, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8724015355110168, "num_tokens": 746172598.0, "step": 19553 }, { "epoch": 2.487469787558835, "ewc_loss": 0.03359974920749664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.359975016792305e-05, "grad_norm": 19.205787658691406, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.8813806772232056, "num_tokens": 746210385.0, "step": 19554 }, { "epoch": 2.487596997837425, "ewc_loss": 0.03346308693289757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.346308585605584e-05, "grad_norm": 19.170284271240234, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8771011829376221, "num_tokens": 746252563.0, "step": 19555 }, { "epoch": 2.487724208116016, "ewc_loss": 0.03364333510398865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3643336792010814e-05, "grad_norm": 19.244766235351562, "learning_rate": 1e-06, "loss": 0.3721, "mean_token_accuracy": 0.8809619545936584, "num_tokens": 746292356.0, "step": 19556 }, { "epoch": 2.487851418394606, "ewc_loss": 0.03360099717974663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.360099799465388e-05, "grad_norm": 19.140838623046875, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8633359670639038, "num_tokens": 746325467.0, "step": 19557 }, { "epoch": 2.4879786286731966, "ewc_loss": 0.033574894070625305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3574895496713e-05, "grad_norm": 19.225914001464844, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8704894185066223, "num_tokens": 746362285.0, "step": 19558 }, { "epoch": 2.488105838951787, "ewc_loss": 0.033673468977212906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3673470170469955e-05, "grad_norm": 19.16098403930664, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8817112445831299, "num_tokens": 746401540.0, "step": 19559 }, { "epoch": 2.4882330492303777, "ewc_loss": 0.03361010178923607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3610100217629224e-05, "grad_norm": 19.1978702545166, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8767107129096985, "num_tokens": 746439282.0, "step": 19560 }, { "epoch": 2.488360259508968, "ewc_loss": 0.03366096690297127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366096643730998e-05, "grad_norm": 19.17696762084961, "learning_rate": 1e-06, "loss": 0.359, "mean_token_accuracy": 0.8866389393806458, "num_tokens": 746480993.0, "step": 19561 }, { "epoch": 2.4884874697875587, "ewc_loss": 0.0336611308157444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.36611301463563e-05, "grad_norm": 19.23265266418457, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8715882301330566, "num_tokens": 746519318.0, "step": 19562 }, { "epoch": 2.4886146800661493, "ewc_loss": 0.03365916386246681, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365916563780047e-05, "grad_norm": 19.155454635620117, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8623158931732178, "num_tokens": 746559761.0, "step": 19563 }, { "epoch": 2.48874189034474, "ewc_loss": 0.03363151103258133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363150972290896e-05, "grad_norm": 19.30562400817871, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.8833214640617371, "num_tokens": 746591622.0, "step": 19564 }, { "epoch": 2.4888691006233303, "ewc_loss": 0.033630914986133575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.36309167323634e-05, "grad_norm": 19.15041160583496, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8736057281494141, "num_tokens": 746627709.0, "step": 19565 }, { "epoch": 2.488996310901921, "ewc_loss": 0.0336180105805397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.361800918355584e-05, "grad_norm": 19.28337860107422, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8720893859863281, "num_tokens": 746663445.0, "step": 19566 }, { "epoch": 2.4891235211805114, "ewc_loss": 0.0336570106446743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365701195434667e-05, "grad_norm": 19.201833724975586, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8624494075775146, "num_tokens": 746706869.0, "step": 19567 }, { "epoch": 2.489250731459102, "ewc_loss": 0.03360643982887268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.360644041094929e-05, "grad_norm": 19.260847091674805, "learning_rate": 1e-06, "loss": 0.3627, "mean_token_accuracy": 0.8865001201629639, "num_tokens": 746749502.0, "step": 19568 }, { "epoch": 2.4893779417376924, "ewc_loss": 0.033671632409095764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367163299117237e-05, "grad_norm": 19.219314575195312, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8769178986549377, "num_tokens": 746786510.0, "step": 19569 }, { "epoch": 2.489505152016283, "ewc_loss": 0.033639393746852875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3639393222983927e-05, "grad_norm": 19.224462509155273, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8804368376731873, "num_tokens": 746819430.0, "step": 19570 }, { "epoch": 2.4896323622948735, "ewc_loss": 0.03357844054698944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3578438888071105e-05, "grad_norm": 19.164445877075195, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.8775452971458435, "num_tokens": 746856018.0, "step": 19571 }, { "epoch": 2.489759572573464, "ewc_loss": 0.03366284817457199, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366284727235325e-05, "grad_norm": 19.221057891845703, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8690288066864014, "num_tokens": 746897350.0, "step": 19572 }, { "epoch": 2.4898867828520546, "ewc_loss": 0.03363878279924393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3638782042544335e-05, "grad_norm": 19.210487365722656, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8693143129348755, "num_tokens": 746937163.0, "step": 19573 }, { "epoch": 2.490013993130645, "ewc_loss": 0.0336511991918087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365119846421294e-05, "grad_norm": 19.192134857177734, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.884132981300354, "num_tokens": 746976140.0, "step": 19574 }, { "epoch": 2.4901412034092356, "ewc_loss": 0.03354833275079727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.354833461344242e-05, "grad_norm": 19.181217193603516, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8756139278411865, "num_tokens": 747012576.0, "step": 19575 }, { "epoch": 2.490268413687826, "ewc_loss": 0.03362084925174713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.362085044500418e-05, "grad_norm": 19.28835105895996, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8735679388046265, "num_tokens": 747048179.0, "step": 19576 }, { "epoch": 2.4903956239664167, "ewc_loss": 0.03362841159105301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.362841016496532e-05, "grad_norm": 19.22396469116211, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.856469988822937, "num_tokens": 747085291.0, "step": 19577 }, { "epoch": 2.4905228342450068, "ewc_loss": 0.03359673172235489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.359673064551316e-05, "grad_norm": 19.27593994140625, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8818393349647522, "num_tokens": 747119085.0, "step": 19578 }, { "epoch": 2.4906500445235977, "ewc_loss": 0.03365666791796684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365666634636e-05, "grad_norm": 19.23382568359375, "learning_rate": 1e-06, "loss": 0.3811, "mean_token_accuracy": 0.8737754225730896, "num_tokens": 747153463.0, "step": 19579 }, { "epoch": 2.490777254802188, "ewc_loss": 0.033599622547626495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.35996228386648e-05, "grad_norm": 19.19426727294922, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8501535654067993, "num_tokens": 747194781.0, "step": 19580 }, { "epoch": 2.4909044650807783, "ewc_loss": 0.033623725175857544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.362372444826178e-05, "grad_norm": 19.14267349243164, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8754478693008423, "num_tokens": 747235328.0, "step": 19581 }, { "epoch": 2.491031675359369, "ewc_loss": 0.03365413472056389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365413431311026e-05, "grad_norm": 19.313358306884766, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8731798529624939, "num_tokens": 747278481.0, "step": 19582 }, { "epoch": 2.4911588856379594, "ewc_loss": 0.03370289504528046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.370289414306171e-05, "grad_norm": 19.20334243774414, "learning_rate": 1e-06, "loss": 0.3442, "mean_token_accuracy": 0.8887174725532532, "num_tokens": 747313077.0, "step": 19583 }, { "epoch": 2.49128609591655, "ewc_loss": 0.033579181879758835, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357918103574775e-05, "grad_norm": 19.280656814575195, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8817540407180786, "num_tokens": 747352693.0, "step": 19584 }, { "epoch": 2.4914133061951405, "ewc_loss": 0.03365165367722511, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3651653211563826e-05, "grad_norm": 19.219947814941406, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8672922849655151, "num_tokens": 747397717.0, "step": 19585 }, { "epoch": 2.491540516473731, "ewc_loss": 0.03360578045248985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3605781936785206e-05, "grad_norm": 19.282230377197266, "learning_rate": 1e-06, "loss": 0.3721, "mean_token_accuracy": 0.8787024021148682, "num_tokens": 747435457.0, "step": 19586 }, { "epoch": 2.4916677267523215, "ewc_loss": 0.03364495187997818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.364495205460116e-05, "grad_norm": 19.19007682800293, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8791501522064209, "num_tokens": 747479017.0, "step": 19587 }, { "epoch": 2.491794937030912, "ewc_loss": 0.03359736129641533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.359736001584679e-05, "grad_norm": 19.230161666870117, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8741883635520935, "num_tokens": 747520247.0, "step": 19588 }, { "epoch": 2.4919221473095026, "ewc_loss": 0.0336320586502552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363205905770883e-05, "grad_norm": 19.21874237060547, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8610202670097351, "num_tokens": 747559983.0, "step": 19589 }, { "epoch": 2.492049357588093, "ewc_loss": 0.033541515469551086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.354151704115793e-05, "grad_norm": 19.261056900024414, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.870459794998169, "num_tokens": 747595746.0, "step": 19590 }, { "epoch": 2.4921765678666836, "ewc_loss": 0.03361334279179573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.361334165674634e-05, "grad_norm": 19.214181900024414, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8729552626609802, "num_tokens": 747639261.0, "step": 19591 }, { "epoch": 2.492303778145274, "ewc_loss": 0.03351864591240883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3518645068397745e-05, "grad_norm": 19.211313247680664, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8559126257896423, "num_tokens": 747680665.0, "step": 19592 }, { "epoch": 2.4924309884238647, "ewc_loss": 0.033616114407777786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.361611379659735e-05, "grad_norm": 19.2884521484375, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8781722187995911, "num_tokens": 747713320.0, "step": 19593 }, { "epoch": 2.492558198702455, "ewc_loss": 0.03362595662474632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.362595816724934e-05, "grad_norm": 19.207927703857422, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8490139842033386, "num_tokens": 747752167.0, "step": 19594 }, { "epoch": 2.4926854089810457, "ewc_loss": 0.03356388583779335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.356388697284274e-05, "grad_norm": 19.267250061035156, "learning_rate": 1e-06, "loss": 0.3711, "mean_token_accuracy": 0.8790875673294067, "num_tokens": 747787274.0, "step": 19595 }, { "epoch": 2.4928126192596363, "ewc_loss": 0.033658549189567566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365855081938207e-05, "grad_norm": 19.273025512695312, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8690709471702576, "num_tokens": 747826647.0, "step": 19596 }, { "epoch": 2.492939829538227, "ewc_loss": 0.033567048609256744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.35670483764261e-05, "grad_norm": 19.23320960998535, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8706108331680298, "num_tokens": 747866797.0, "step": 19597 }, { "epoch": 2.4930670398168173, "ewc_loss": 0.03362010791897774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.362010829732753e-05, "grad_norm": 19.219818115234375, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8767445683479309, "num_tokens": 747907589.0, "step": 19598 }, { "epoch": 2.493194250095408, "ewc_loss": 0.0335947722196579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.359477341291495e-05, "grad_norm": 19.26120376586914, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8669209480285645, "num_tokens": 747939865.0, "step": 19599 }, { "epoch": 2.4933214603739984, "ewc_loss": 0.033638887107372284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363888754392974e-05, "grad_norm": 19.218061447143555, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8711853623390198, "num_tokens": 747976238.0, "step": 19600 }, { "epoch": 2.493448670652589, "ewc_loss": 0.03360695764422417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.36069570039399e-05, "grad_norm": 19.210309982299805, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.8724238872528076, "num_tokens": 748012076.0, "step": 19601 }, { "epoch": 2.4935758809311794, "ewc_loss": 0.03371585160493851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.371585262357257e-05, "grad_norm": 19.32245445251465, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8684574365615845, "num_tokens": 748042518.0, "step": 19602 }, { "epoch": 2.4937030912097695, "ewc_loss": 0.03366387262940407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366387318237685e-05, "grad_norm": 19.29706382751465, "learning_rate": 1e-06, "loss": 0.3727, "mean_token_accuracy": 0.882099986076355, "num_tokens": 748077509.0, "step": 19603 }, { "epoch": 2.4938303014883605, "ewc_loss": 0.03361627086997032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.361627022968605e-05, "grad_norm": 19.231170654296875, "learning_rate": 1e-06, "loss": 0.3795, "mean_token_accuracy": 0.8792906999588013, "num_tokens": 748118164.0, "step": 19604 }, { "epoch": 2.4939575117669506, "ewc_loss": 0.033632367849349976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363236828590743e-05, "grad_norm": 19.206262588500977, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8676670789718628, "num_tokens": 748153609.0, "step": 19605 }, { "epoch": 2.494084722045541, "ewc_loss": 0.03361162543296814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3611624530749395e-05, "grad_norm": 19.199207305908203, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.8720616698265076, "num_tokens": 748189573.0, "step": 19606 }, { "epoch": 2.4942119323241316, "ewc_loss": 0.033623043447732925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3623044146224856e-05, "grad_norm": 19.174907684326172, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8690776824951172, "num_tokens": 748231121.0, "step": 19607 }, { "epoch": 2.494339142602722, "ewc_loss": 0.033641666173934937, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.364166695973836e-05, "grad_norm": 19.16509437561035, "learning_rate": 1e-06, "loss": 0.3526, "mean_token_accuracy": 0.8857063055038452, "num_tokens": 748268744.0, "step": 19608 }, { "epoch": 2.4944663528813127, "ewc_loss": 0.03365985304117203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365985321579501e-05, "grad_norm": 19.2127628326416, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8779889941215515, "num_tokens": 748307441.0, "step": 19609 }, { "epoch": 2.494593563159903, "ewc_loss": 0.03373190760612488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.373190702404827e-05, "grad_norm": 19.258052825927734, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8745315074920654, "num_tokens": 748343506.0, "step": 19610 }, { "epoch": 2.4947207734384937, "ewc_loss": 0.033672355115413666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3672356948954985e-05, "grad_norm": 19.16888427734375, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8750585317611694, "num_tokens": 748382209.0, "step": 19611 }, { "epoch": 2.4948479837170843, "ewc_loss": 0.03367000073194504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366999953868799e-05, "grad_norm": 19.256927490234375, "learning_rate": 1e-06, "loss": 0.3265, "mean_token_accuracy": 0.8933976888656616, "num_tokens": 748413660.0, "step": 19612 }, { "epoch": 2.494975193995675, "ewc_loss": 0.033731233328580856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.373123399796896e-05, "grad_norm": 19.193668365478516, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8753523230552673, "num_tokens": 748457037.0, "step": 19613 }, { "epoch": 2.4951024042742653, "ewc_loss": 0.03362380713224411, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3623808121774346e-05, "grad_norm": 19.19727897644043, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.853789210319519, "num_tokens": 748497227.0, "step": 19614 }, { "epoch": 2.495229614552856, "ewc_loss": 0.03368779271841049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3687792893033475e-05, "grad_norm": 19.223203659057617, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8719564080238342, "num_tokens": 748529901.0, "step": 19615 }, { "epoch": 2.4953568248314464, "ewc_loss": 0.03367741033434868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3677410101518035e-05, "grad_norm": 19.240455627441406, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8618258237838745, "num_tokens": 748568238.0, "step": 19616 }, { "epoch": 2.495484035110037, "ewc_loss": 0.033717185258865356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.371718412381597e-05, "grad_norm": 19.179481506347656, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8639978170394897, "num_tokens": 748608153.0, "step": 19617 }, { "epoch": 2.4956112453886274, "ewc_loss": 0.033666931092739105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366693272255361e-05, "grad_norm": 19.211483001708984, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8700346946716309, "num_tokens": 748646977.0, "step": 19618 }, { "epoch": 2.495738455667218, "ewc_loss": 0.033689916133880615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3689917472656816e-05, "grad_norm": 19.13938331604004, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.865272581577301, "num_tokens": 748689586.0, "step": 19619 }, { "epoch": 2.4958656659458085, "ewc_loss": 0.03367521986365318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3675220038276166e-05, "grad_norm": 19.160926818847656, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8736105561256409, "num_tokens": 748730915.0, "step": 19620 }, { "epoch": 2.495992876224399, "ewc_loss": 0.03372031822800636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.372032006154768e-05, "grad_norm": 19.183813095092773, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8703005909919739, "num_tokens": 748766996.0, "step": 19621 }, { "epoch": 2.4961200865029896, "ewc_loss": 0.033762376755476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376237509655766e-05, "grad_norm": 19.23699951171875, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8674567937850952, "num_tokens": 748801225.0, "step": 19622 }, { "epoch": 2.49624729678158, "ewc_loss": 0.033778779208660126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.377877874299884e-05, "grad_norm": 19.252965927124023, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8692030310630798, "num_tokens": 748841355.0, "step": 19623 }, { "epoch": 2.4963745070601706, "ewc_loss": 0.03366828337311745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366828241269104e-05, "grad_norm": 19.213623046875, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8716745376586914, "num_tokens": 748879636.0, "step": 19624 }, { "epoch": 2.496501717338761, "ewc_loss": 0.03371432423591614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3714324672473595e-05, "grad_norm": 19.26066017150879, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8705995082855225, "num_tokens": 748917795.0, "step": 19625 }, { "epoch": 2.4966289276173517, "ewc_loss": 0.03370527923107147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3705280657159165e-05, "grad_norm": 19.250234603881836, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8481436967849731, "num_tokens": 748956512.0, "step": 19626 }, { "epoch": 2.496756137895942, "ewc_loss": 0.033646032214164734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.364603253430687e-05, "grad_norm": 19.166534423828125, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8780860900878906, "num_tokens": 748995577.0, "step": 19627 }, { "epoch": 2.4968833481745323, "ewc_loss": 0.033723775297403336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.372377614141442e-05, "grad_norm": 19.29616928100586, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8701165914535522, "num_tokens": 749027672.0, "step": 19628 }, { "epoch": 2.4970105584531233, "ewc_loss": 0.033759310841560364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375931191840209e-05, "grad_norm": 19.225040435791016, "learning_rate": 1e-06, "loss": 0.3656, "mean_token_accuracy": 0.8840280771255493, "num_tokens": 749059101.0, "step": 19629 }, { "epoch": 2.4971377687317133, "ewc_loss": 0.03366199880838394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3661999623291194e-05, "grad_norm": 19.26101303100586, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8587741851806641, "num_tokens": 749097034.0, "step": 19630 }, { "epoch": 2.497264979010304, "ewc_loss": 0.033687394112348557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3687392715364695e-05, "grad_norm": 19.165393829345703, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8708354234695435, "num_tokens": 749137536.0, "step": 19631 }, { "epoch": 2.4973921892888944, "ewc_loss": 0.03369172662496567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.369172554812394e-05, "grad_norm": 19.263965606689453, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8739399909973145, "num_tokens": 749171487.0, "step": 19632 }, { "epoch": 2.497519399567485, "ewc_loss": 0.033751342445611954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375134110683575e-05, "grad_norm": 19.248863220214844, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8661424517631531, "num_tokens": 749213234.0, "step": 19633 }, { "epoch": 2.4976466098460754, "ewc_loss": 0.03369660675525665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.369660771568306e-05, "grad_norm": 19.183349609375, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8620898723602295, "num_tokens": 749256184.0, "step": 19634 }, { "epoch": 2.497773820124666, "ewc_loss": 0.03369509428739548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.369509431649931e-05, "grad_norm": 19.306245803833008, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8717939853668213, "num_tokens": 749292909.0, "step": 19635 }, { "epoch": 2.4979010304032565, "ewc_loss": 0.03372402861714363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.372402716195211e-05, "grad_norm": 19.168743133544922, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8605707287788391, "num_tokens": 749327947.0, "step": 19636 }, { "epoch": 2.498028240681847, "ewc_loss": 0.033599454909563065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3599455491639674e-05, "grad_norm": 19.320068359375, "learning_rate": 1e-06, "loss": 0.3738, "mean_token_accuracy": 0.8827394247055054, "num_tokens": 749364107.0, "step": 19637 }, { "epoch": 2.4981554509604376, "ewc_loss": 0.03374313935637474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.374314110260457e-05, "grad_norm": 19.257278442382812, "learning_rate": 1e-06, "loss": 0.3461, "mean_token_accuracy": 0.8886074423789978, "num_tokens": 749398055.0, "step": 19638 }, { "epoch": 2.498282661239028, "ewc_loss": 0.033599626272916794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.359962647664361e-05, "grad_norm": 19.24072265625, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8745030164718628, "num_tokens": 749434626.0, "step": 19639 }, { "epoch": 2.4984098715176186, "ewc_loss": 0.03373173996806145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.373173967702314e-05, "grad_norm": 19.28413200378418, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8668961524963379, "num_tokens": 749470616.0, "step": 19640 }, { "epoch": 2.498537081796209, "ewc_loss": 0.03366107493638992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366107557667419e-05, "grad_norm": 19.27419662475586, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8755383491516113, "num_tokens": 749513072.0, "step": 19641 }, { "epoch": 2.4986642920747997, "ewc_loss": 0.033633869141340256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363387077115476e-05, "grad_norm": 19.22125244140625, "learning_rate": 1e-06, "loss": 0.3509, "mean_token_accuracy": 0.8870732188224792, "num_tokens": 749546988.0, "step": 19642 }, { "epoch": 2.49879150235339, "ewc_loss": 0.0336533859372139, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3653384889476e-05, "grad_norm": 19.25408935546875, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8682574033737183, "num_tokens": 749585851.0, "step": 19643 }, { "epoch": 2.4989187126319807, "ewc_loss": 0.033658936619758606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365893644513562e-05, "grad_norm": 19.252384185791016, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8719227313995361, "num_tokens": 749619510.0, "step": 19644 }, { "epoch": 2.4990459229105713, "ewc_loss": 0.03365173935890198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3651740523055196e-05, "grad_norm": 19.209457397460938, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8810637593269348, "num_tokens": 749653660.0, "step": 19645 }, { "epoch": 2.499173133189162, "ewc_loss": 0.03366157039999962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366157034179196e-05, "grad_norm": 19.22821044921875, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8736522793769836, "num_tokens": 749691321.0, "step": 19646 }, { "epoch": 2.4993003434677523, "ewc_loss": 0.03367649018764496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367648969287984e-05, "grad_norm": 19.162092208862305, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8744489550590515, "num_tokens": 749729305.0, "step": 19647 }, { "epoch": 2.499427553746343, "ewc_loss": 0.033685363829135895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.368536272319034e-05, "grad_norm": 19.21193504333496, "learning_rate": 1e-06, "loss": 0.3777, "mean_token_accuracy": 0.8794540166854858, "num_tokens": 749766134.0, "step": 19648 }, { "epoch": 2.4995547640249334, "ewc_loss": 0.03376206383109093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376206223038025e-05, "grad_norm": 19.24018669128418, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8701550960540771, "num_tokens": 749807715.0, "step": 19649 }, { "epoch": 2.499681974303524, "ewc_loss": 0.033754002302885056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375400046934374e-05, "grad_norm": 19.193679809570312, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8746215105056763, "num_tokens": 749838799.0, "step": 19650 }, { "epoch": 2.499809184582114, "ewc_loss": 0.03368917480111122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.368917532498017e-05, "grad_norm": 19.246549606323242, "learning_rate": 1e-06, "loss": 0.3676, "mean_token_accuracy": 0.8857085108757019, "num_tokens": 749874283.0, "step": 19651 }, { "epoch": 2.499936394860705, "ewc_loss": 0.03374483436346054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3744832762749866e-05, "grad_norm": 19.239486694335938, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.871082067489624, "num_tokens": 749915912.0, "step": 19652 }, { "epoch": 2.500063605139295, "ewc_loss": 0.03371737152338028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.371736966073513e-05, "grad_norm": 19.294597625732422, "learning_rate": 1e-06, "loss": 0.3743, "mean_token_accuracy": 0.8797440528869629, "num_tokens": 749959133.0, "step": 19653 }, { "epoch": 2.500190815417886, "ewc_loss": 0.03377106040716171, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.377105895197019e-05, "grad_norm": 19.28849983215332, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.870313286781311, "num_tokens": 749997846.0, "step": 19654 }, { "epoch": 2.500318025696476, "ewc_loss": 0.03369377180933952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3693770092213526e-05, "grad_norm": 19.195749282836914, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8768128156661987, "num_tokens": 750040973.0, "step": 19655 }, { "epoch": 2.5004452359750666, "ewc_loss": 0.03369170054793358, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.369170008227229e-05, "grad_norm": 19.26497459411621, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8678429126739502, "num_tokens": 750082171.0, "step": 19656 }, { "epoch": 2.500572446253657, "ewc_loss": 0.0337420217692852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3742020605131984e-05, "grad_norm": 19.151836395263672, "learning_rate": 1e-06, "loss": 0.3546, "mean_token_accuracy": 0.8852378129959106, "num_tokens": 750123083.0, "step": 19657 }, { "epoch": 2.5006996565322477, "ewc_loss": 0.03364857658743858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3648575481493026e-05, "grad_norm": 19.25938606262207, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8598716855049133, "num_tokens": 750162235.0, "step": 19658 }, { "epoch": 2.500826866810838, "ewc_loss": 0.03373560309410095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3735603210516274e-05, "grad_norm": 19.228849411010742, "learning_rate": 1e-06, "loss": 0.3525, "mean_token_accuracy": 0.8856481313705444, "num_tokens": 750196501.0, "step": 19659 }, { "epoch": 2.5009540770894287, "ewc_loss": 0.03370416536927223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.370416379766539e-05, "grad_norm": 19.302589416503906, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8456190824508667, "num_tokens": 750241178.0, "step": 19660 }, { "epoch": 2.5010812873680193, "ewc_loss": 0.03376152366399765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37615238095168e-05, "grad_norm": 19.32799530029297, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8780055046081543, "num_tokens": 750284104.0, "step": 19661 }, { "epoch": 2.50120849764661, "ewc_loss": 0.0336592011153698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365920201758854e-05, "grad_norm": 19.292320251464844, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8790271282196045, "num_tokens": 750318525.0, "step": 19662 }, { "epoch": 2.5013357079252003, "ewc_loss": 0.03362566977739334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3625670766923577e-05, "grad_norm": 19.286378860473633, "learning_rate": 1e-06, "loss": 0.3781, "mean_token_accuracy": 0.8819934129714966, "num_tokens": 750358892.0, "step": 19663 }, { "epoch": 2.501462918203791, "ewc_loss": 0.033622633665800095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3622633054619655e-05, "grad_norm": 19.19666290283203, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8606467247009277, "num_tokens": 750399325.0, "step": 19664 }, { "epoch": 2.5015901284823814, "ewc_loss": 0.033632270991802216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363227006047964e-05, "grad_norm": 19.318161010742188, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8672082424163818, "num_tokens": 750437493.0, "step": 19665 }, { "epoch": 2.501717338760972, "ewc_loss": 0.03364783152937889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.364783333381638e-05, "grad_norm": 19.298486709594727, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8812289237976074, "num_tokens": 750472381.0, "step": 19666 }, { "epoch": 2.5018445490395624, "ewc_loss": 0.0335383415222168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353834108565934e-05, "grad_norm": 19.19478988647461, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8573672771453857, "num_tokens": 750511786.0, "step": 19667 }, { "epoch": 2.501971759318153, "ewc_loss": 0.03358681499958038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.358681351528503e-05, "grad_norm": 19.262479782104492, "learning_rate": 1e-06, "loss": 0.3668, "mean_token_accuracy": 0.8809014558792114, "num_tokens": 750550596.0, "step": 19668 }, { "epoch": 2.5020989695967435, "ewc_loss": 0.03358263894915581, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3582640753593296e-05, "grad_norm": 19.17799949645996, "learning_rate": 1e-06, "loss": 0.379, "mean_token_accuracy": 0.877997875213623, "num_tokens": 750594790.0, "step": 19669 }, { "epoch": 2.502226179875334, "ewc_loss": 0.033567123115062714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.356712477398105e-05, "grad_norm": 19.29234504699707, "learning_rate": 1e-06, "loss": 0.359, "mean_token_accuracy": 0.8859457969665527, "num_tokens": 750636738.0, "step": 19670 }, { "epoch": 2.5023533901539246, "ewc_loss": 0.03361506760120392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3615066058700904e-05, "grad_norm": 19.179903030395508, "learning_rate": 1e-06, "loss": 0.3521, "mean_token_accuracy": 0.8853796720504761, "num_tokens": 750672338.0, "step": 19671 }, { "epoch": 2.502480600432515, "ewc_loss": 0.03359692171216011, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.359692345838994e-05, "grad_norm": 19.326068878173828, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8684289455413818, "num_tokens": 750709149.0, "step": 19672 }, { "epoch": 2.5026078107111056, "ewc_loss": 0.03365704417228699, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3657044696155936e-05, "grad_norm": 19.242855072021484, "learning_rate": 1e-06, "loss": 0.3459, "mean_token_accuracy": 0.8856300115585327, "num_tokens": 750741727.0, "step": 19673 }, { "epoch": 2.5027350209896957, "ewc_loss": 0.033524997532367706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.352499697939493e-05, "grad_norm": 19.261417388916016, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8611661195755005, "num_tokens": 750775903.0, "step": 19674 }, { "epoch": 2.5028622312682867, "ewc_loss": 0.033614322543144226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.361432391102426e-05, "grad_norm": 19.325645446777344, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.870080828666687, "num_tokens": 750810502.0, "step": 19675 }, { "epoch": 2.5029894415468767, "ewc_loss": 0.03359925001859665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3599251764826477e-05, "grad_norm": 19.217782974243164, "learning_rate": 1e-06, "loss": 0.3691, "mean_token_accuracy": 0.881737232208252, "num_tokens": 750847762.0, "step": 19676 }, { "epoch": 2.5031166518254677, "ewc_loss": 0.0335150808095932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3515079849166796e-05, "grad_norm": 19.27552032470703, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8749784231185913, "num_tokens": 750887201.0, "step": 19677 }, { "epoch": 2.503243862104058, "ewc_loss": 0.03363307565450668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363307405379601e-05, "grad_norm": 19.320629119873047, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8720368146896362, "num_tokens": 750927323.0, "step": 19678 }, { "epoch": 2.5033710723826488, "ewc_loss": 0.03354223817586899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3542237360961735e-05, "grad_norm": 19.190317153930664, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8658655285835266, "num_tokens": 750972501.0, "step": 19679 }, { "epoch": 2.503498282661239, "ewc_loss": 0.03354635462164879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.354635555297136e-05, "grad_norm": 19.16718292236328, "learning_rate": 1e-06, "loss": 0.3619, "mean_token_accuracy": 0.8837511539459229, "num_tokens": 751010743.0, "step": 19680 }, { "epoch": 2.5036254929398294, "ewc_loss": 0.03357937932014465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357938112458214e-05, "grad_norm": 19.30219841003418, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8691434860229492, "num_tokens": 751050440.0, "step": 19681 }, { "epoch": 2.50375270321842, "ewc_loss": 0.03358453884720802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.35845397785306e-05, "grad_norm": 19.183408737182617, "learning_rate": 1e-06, "loss": 0.378, "mean_token_accuracy": 0.8774791955947876, "num_tokens": 751088319.0, "step": 19682 }, { "epoch": 2.5038799134970104, "ewc_loss": 0.03355731815099716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.355731678311713e-05, "grad_norm": 19.312273025512695, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.8732394576072693, "num_tokens": 751123766.0, "step": 19683 }, { "epoch": 2.504007123775601, "ewc_loss": 0.03367506340146065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367506360518746e-05, "grad_norm": 19.172637939453125, "learning_rate": 1e-06, "loss": 0.3544, "mean_token_accuracy": 0.8899022936820984, "num_tokens": 751163375.0, "step": 19684 }, { "epoch": 2.5041343340541915, "ewc_loss": 0.03357906639575958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357906825840473e-05, "grad_norm": 19.28978729248047, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8687953948974609, "num_tokens": 751204526.0, "step": 19685 }, { "epoch": 2.504261544332782, "ewc_loss": 0.03365594521164894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3655946026556194e-05, "grad_norm": 19.233108520507812, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8787647485733032, "num_tokens": 751245133.0, "step": 19686 }, { "epoch": 2.5043887546113726, "ewc_loss": 0.03353014588356018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.353014471940696e-05, "grad_norm": 19.185285568237305, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8639583587646484, "num_tokens": 751284578.0, "step": 19687 }, { "epoch": 2.504515964889963, "ewc_loss": 0.03360442444682121, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.360442497069016e-05, "grad_norm": 19.199634552001953, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8654027581214905, "num_tokens": 751325794.0, "step": 19688 }, { "epoch": 2.5046431751685536, "ewc_loss": 0.03358043357729912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.358043250045739e-05, "grad_norm": 19.20627784729004, "learning_rate": 1e-06, "loss": 0.3654, "mean_token_accuracy": 0.8804362416267395, "num_tokens": 751363777.0, "step": 19689 }, { "epoch": 2.504770385447144, "ewc_loss": 0.03372857719659805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.372857827343978e-05, "grad_norm": 19.27778434753418, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8653038740158081, "num_tokens": 751400499.0, "step": 19690 }, { "epoch": 2.5048975957257347, "ewc_loss": 0.033641256392002106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.364125586813316e-05, "grad_norm": 19.191112518310547, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8725640773773193, "num_tokens": 751434895.0, "step": 19691 }, { "epoch": 2.505024806004325, "ewc_loss": 0.03364478796720505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3644788345554844e-05, "grad_norm": 19.239477157592773, "learning_rate": 1e-06, "loss": 0.3576, "mean_token_accuracy": 0.8831682205200195, "num_tokens": 751473984.0, "step": 19692 }, { "epoch": 2.5051520162829157, "ewc_loss": 0.03366082161664963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.36608209181577e-05, "grad_norm": 19.191099166870117, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8751665949821472, "num_tokens": 751515030.0, "step": 19693 }, { "epoch": 2.5052792265615063, "ewc_loss": 0.03358917683362961, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.358917820150964e-05, "grad_norm": 19.233903884887695, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8716572523117065, "num_tokens": 751550023.0, "step": 19694 }, { "epoch": 2.505406436840097, "ewc_loss": 0.033670563250780106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367056342540309e-05, "grad_norm": 19.24650001525879, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8750245571136475, "num_tokens": 751584793.0, "step": 19695 }, { "epoch": 2.5055336471186873, "ewc_loss": 0.033712681382894516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.371268030605279e-05, "grad_norm": 19.298629760742188, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8565529584884644, "num_tokens": 751619817.0, "step": 19696 }, { "epoch": 2.505660857397278, "ewc_loss": 0.03365205600857735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365205702721141e-05, "grad_norm": 19.28424835205078, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8808197975158691, "num_tokens": 751657097.0, "step": 19697 }, { "epoch": 2.5057880676758684, "ewc_loss": 0.03367661312222481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367661338415928e-05, "grad_norm": 19.202144622802734, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8652976751327515, "num_tokens": 751687042.0, "step": 19698 }, { "epoch": 2.5059152779544585, "ewc_loss": 0.03372478485107422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.372478386154398e-05, "grad_norm": 19.351741790771484, "learning_rate": 1e-06, "loss": 0.3616, "mean_token_accuracy": 0.8820004463195801, "num_tokens": 751724281.0, "step": 19699 }, { "epoch": 2.5060424882330494, "ewc_loss": 0.033740170300006866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3740168873919174e-05, "grad_norm": 19.220664978027344, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8542566299438477, "num_tokens": 751765144.0, "step": 19700 }, { "epoch": 2.5061696985116395, "ewc_loss": 0.03365739807486534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3657397580100223e-05, "grad_norm": 19.249181747436523, "learning_rate": 1e-06, "loss": 0.369, "mean_token_accuracy": 0.8806751370429993, "num_tokens": 751799602.0, "step": 19701 }, { "epoch": 2.5062969087902305, "ewc_loss": 0.03370504453778267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3705044188536704e-05, "grad_norm": 19.175615310668945, "learning_rate": 1e-06, "loss": 0.3682, "mean_token_accuracy": 0.8822935819625854, "num_tokens": 751836972.0, "step": 19702 }, { "epoch": 2.5064241190688206, "ewc_loss": 0.03372367098927498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.372367064002901e-05, "grad_norm": 19.277381896972656, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8750872611999512, "num_tokens": 751869627.0, "step": 19703 }, { "epoch": 2.5065513293474115, "ewc_loss": 0.033831819891929626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3831820474006236e-05, "grad_norm": 19.326831817626953, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8658086061477661, "num_tokens": 751911103.0, "step": 19704 }, { "epoch": 2.5066785396260016, "ewc_loss": 0.033741142600774765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3741143852239475e-05, "grad_norm": 19.19685173034668, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8690672516822815, "num_tokens": 751948956.0, "step": 19705 }, { "epoch": 2.506805749904592, "ewc_loss": 0.033709816634655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.370981721673161e-05, "grad_norm": 19.297914505004883, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.877769410610199, "num_tokens": 751992185.0, "step": 19706 }, { "epoch": 2.5069329601831827, "ewc_loss": 0.033809687942266464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.380968701094389e-05, "grad_norm": 19.290828704833984, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8700916171073914, "num_tokens": 752032661.0, "step": 19707 }, { "epoch": 2.507060170461773, "ewc_loss": 0.0337013304233551, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.370132981217466e-05, "grad_norm": 19.233144760131836, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.8781063556671143, "num_tokens": 752073946.0, "step": 19708 }, { "epoch": 2.5071873807403637, "ewc_loss": 0.03374642878770828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.374642983544618e-05, "grad_norm": 19.25836181640625, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8593276739120483, "num_tokens": 752118766.0, "step": 19709 }, { "epoch": 2.5073145910189543, "ewc_loss": 0.03375174105167389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375174128450453e-05, "grad_norm": 19.27753448486328, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8765451908111572, "num_tokens": 752159669.0, "step": 19710 }, { "epoch": 2.507441801297545, "ewc_loss": 0.033764470368623734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376447057235055e-05, "grad_norm": 19.271310806274414, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8782262802124023, "num_tokens": 752198139.0, "step": 19711 }, { "epoch": 2.5075690115761353, "ewc_loss": 0.03367505967617035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3675059967208654e-05, "grad_norm": 19.201622009277344, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8762170672416687, "num_tokens": 752239686.0, "step": 19712 }, { "epoch": 2.507696221854726, "ewc_loss": 0.03368907794356346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.368907709955238e-05, "grad_norm": 19.316959381103516, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8574764132499695, "num_tokens": 752275740.0, "step": 19713 }, { "epoch": 2.5078234321333164, "ewc_loss": 0.03375881165266037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375881351530552e-05, "grad_norm": 19.265300750732422, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8612334728240967, "num_tokens": 752310554.0, "step": 19714 }, { "epoch": 2.507950642411907, "ewc_loss": 0.03364071995019913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3640721085248515e-05, "grad_norm": 19.239765167236328, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8788492679595947, "num_tokens": 752347761.0, "step": 19715 }, { "epoch": 2.5080778526904974, "ewc_loss": 0.033678896725177765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3678898034850135e-05, "grad_norm": 19.204811096191406, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8636307120323181, "num_tokens": 752387128.0, "step": 19716 }, { "epoch": 2.508205062969088, "ewc_loss": 0.033659327775239944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365932934684679e-05, "grad_norm": 19.223339080810547, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.872887909412384, "num_tokens": 752428722.0, "step": 19717 }, { "epoch": 2.5083322732476785, "ewc_loss": 0.03375381976366043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375381857040338e-05, "grad_norm": 19.337060928344727, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.873390793800354, "num_tokens": 752468920.0, "step": 19718 }, { "epoch": 2.508459483526269, "ewc_loss": 0.03371675685048103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.371675848029554e-05, "grad_norm": 19.16143035888672, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8646701574325562, "num_tokens": 752505226.0, "step": 19719 }, { "epoch": 2.5085866938048595, "ewc_loss": 0.03364181146025658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.364181247889064e-05, "grad_norm": 19.29446792602539, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8750014305114746, "num_tokens": 752543693.0, "step": 19720 }, { "epoch": 2.50871390408345, "ewc_loss": 0.03377992659807205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.377992834430188e-05, "grad_norm": 19.249408721923828, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8588247299194336, "num_tokens": 752578758.0, "step": 19721 }, { "epoch": 2.5088411143620406, "ewc_loss": 0.0337412990629673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.374130028532818e-05, "grad_norm": 19.319507598876953, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.8835511207580566, "num_tokens": 752612115.0, "step": 19722 }, { "epoch": 2.508968324640631, "ewc_loss": 0.03374052420258522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.374052539584227e-05, "grad_norm": 19.174352645874023, "learning_rate": 1e-06, "loss": 0.3757, "mean_token_accuracy": 0.8791912198066711, "num_tokens": 752644601.0, "step": 19723 }, { "epoch": 2.509095534919221, "ewc_loss": 0.03371771052479744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3717711630743e-05, "grad_norm": 19.297183990478516, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8769636750221252, "num_tokens": 752682333.0, "step": 19724 }, { "epoch": 2.509222745197812, "ewc_loss": 0.03385601192712784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3856013033073395e-05, "grad_norm": 19.249048233032227, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8677554130554199, "num_tokens": 752721802.0, "step": 19725 }, { "epoch": 2.5093499554764023, "ewc_loss": 0.03376098722219467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376098538865335e-05, "grad_norm": 19.321372985839844, "learning_rate": 1e-06, "loss": 0.3442, "mean_token_accuracy": 0.8885759115219116, "num_tokens": 752755673.0, "step": 19726 }, { "epoch": 2.5094771657549932, "ewc_loss": 0.03379300609230995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3793006878113374e-05, "grad_norm": 19.257068634033203, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.882673978805542, "num_tokens": 752795575.0, "step": 19727 }, { "epoch": 2.5096043760335833, "ewc_loss": 0.033742427825927734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.374242805875838e-05, "grad_norm": 19.28290367126465, "learning_rate": 1e-06, "loss": 0.3476, "mean_token_accuracy": 0.8894760608673096, "num_tokens": 752833172.0, "step": 19728 }, { "epoch": 2.5097315863121743, "ewc_loss": 0.033736199140548706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.373619983904064e-05, "grad_norm": 19.18185043334961, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.858437716960907, "num_tokens": 752872857.0, "step": 19729 }, { "epoch": 2.5098587965907644, "ewc_loss": 0.03369814530014992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.369814658071846e-05, "grad_norm": 19.264209747314453, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8831812143325806, "num_tokens": 752908553.0, "step": 19730 }, { "epoch": 2.509986006869355, "ewc_loss": 0.033814895898103714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3814896596595645e-05, "grad_norm": 19.298465728759766, "learning_rate": 1e-06, "loss": 0.3727, "mean_token_accuracy": 0.8791004419326782, "num_tokens": 752937762.0, "step": 19731 }, { "epoch": 2.5101132171479454, "ewc_loss": 0.033768169581890106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376817039679736e-05, "grad_norm": 19.31829833984375, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.860150933265686, "num_tokens": 752977184.0, "step": 19732 }, { "epoch": 2.510240427426536, "ewc_loss": 0.03379453346133232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379453482921235e-05, "grad_norm": 19.302377700805664, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8699998259544373, "num_tokens": 753021071.0, "step": 19733 }, { "epoch": 2.5103676377051265, "ewc_loss": 0.03370663523674011, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37066339852754e-05, "grad_norm": 19.3081111907959, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8478356003761292, "num_tokens": 753056597.0, "step": 19734 }, { "epoch": 2.510494847983717, "ewc_loss": 0.0336807556450367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.368075704202056e-05, "grad_norm": 19.23997688293457, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8775135278701782, "num_tokens": 753096178.0, "step": 19735 }, { "epoch": 2.5106220582623076, "ewc_loss": 0.03367403894662857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367403769516386e-05, "grad_norm": 19.23016357421875, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8762019872665405, "num_tokens": 753130388.0, "step": 19736 }, { "epoch": 2.510749268540898, "ewc_loss": 0.03373749181628227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3737491321517155e-05, "grad_norm": 19.220643997192383, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8808895349502563, "num_tokens": 753166585.0, "step": 19737 }, { "epoch": 2.5108764788194886, "ewc_loss": 0.03374968096613884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.374968218849972e-05, "grad_norm": 19.194055557250977, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8831467032432556, "num_tokens": 753203403.0, "step": 19738 }, { "epoch": 2.511003689098079, "ewc_loss": 0.033750638365745544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3750638976925984e-05, "grad_norm": 19.3327579498291, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8784145712852478, "num_tokens": 753241632.0, "step": 19739 }, { "epoch": 2.5111308993766697, "ewc_loss": 0.03378112241625786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3781121601350605e-05, "grad_norm": 19.172191619873047, "learning_rate": 1e-06, "loss": 0.3551, "mean_token_accuracy": 0.8866376876831055, "num_tokens": 753279292.0, "step": 19740 }, { "epoch": 2.51125810965526, "ewc_loss": 0.03366125375032425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366125383763574e-05, "grad_norm": 19.27631187438965, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8652548789978027, "num_tokens": 753319121.0, "step": 19741 }, { "epoch": 2.5113853199338507, "ewc_loss": 0.033737603574991226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3737604098860174e-05, "grad_norm": 19.316511154174805, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8581951856613159, "num_tokens": 753353074.0, "step": 19742 }, { "epoch": 2.5115125302124413, "ewc_loss": 0.03364931792020798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.364931762916967e-05, "grad_norm": 19.15644073486328, "learning_rate": 1e-06, "loss": 0.3407, "mean_token_accuracy": 0.8921146988868713, "num_tokens": 753388994.0, "step": 19743 }, { "epoch": 2.511639740491032, "ewc_loss": 0.033773213624954224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.377321263542399e-05, "grad_norm": 19.328611373901367, "learning_rate": 1e-06, "loss": 0.364, "mean_token_accuracy": 0.8800633549690247, "num_tokens": 753430271.0, "step": 19744 }, { "epoch": 2.5117669507696223, "ewc_loss": 0.03374603018164635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37460296577774e-05, "grad_norm": 19.230609893798828, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8657848238945007, "num_tokens": 753473820.0, "step": 19745 }, { "epoch": 2.511894161048213, "ewc_loss": 0.033684153109788895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3684151276247576e-05, "grad_norm": 19.209571838378906, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.869991660118103, "num_tokens": 753516397.0, "step": 19746 }, { "epoch": 2.5120213713268034, "ewc_loss": 0.033692069351673126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.369206751813181e-05, "grad_norm": 19.209651947021484, "learning_rate": 1e-06, "loss": 0.3609, "mean_token_accuracy": 0.8840410113334656, "num_tokens": 753557589.0, "step": 19747 }, { "epoch": 2.512148581605394, "ewc_loss": 0.03364434093236923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.364434087416157e-05, "grad_norm": 19.215499877929688, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.8813998699188232, "num_tokens": 753593764.0, "step": 19748 }, { "epoch": 2.512275791883984, "ewc_loss": 0.03375968709588051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375968663021922e-05, "grad_norm": 19.269039154052734, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8619745969772339, "num_tokens": 753635967.0, "step": 19749 }, { "epoch": 2.512403002162575, "ewc_loss": 0.033653195947408676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365319571457803e-05, "grad_norm": 19.1999454498291, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.8818962574005127, "num_tokens": 753674595.0, "step": 19750 }, { "epoch": 2.512530212441165, "ewc_loss": 0.03369557857513428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.369557816768065e-05, "grad_norm": 19.256624221801758, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8580913543701172, "num_tokens": 753714118.0, "step": 19751 }, { "epoch": 2.512657422719756, "ewc_loss": 0.033676840364933014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367683893884532e-05, "grad_norm": 19.258228302001953, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8600497245788574, "num_tokens": 753758139.0, "step": 19752 }, { "epoch": 2.512784632998346, "ewc_loss": 0.03361009806394577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.361009657965042e-05, "grad_norm": 19.228153228759766, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8623731732368469, "num_tokens": 753800373.0, "step": 19753 }, { "epoch": 2.5129118432769366, "ewc_loss": 0.03366833180189133, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366833334439434e-05, "grad_norm": 19.257143020629883, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8681925535202026, "num_tokens": 753832366.0, "step": 19754 }, { "epoch": 2.513039053555527, "ewc_loss": 0.03363649174571037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363649011589587e-05, "grad_norm": 19.204456329345703, "learning_rate": 1e-06, "loss": 0.3417, "mean_token_accuracy": 0.8924728035926819, "num_tokens": 753866176.0, "step": 19755 }, { "epoch": 2.5131662638341177, "ewc_loss": 0.03365495428442955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365495285834186e-05, "grad_norm": 19.25347900390625, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8734164237976074, "num_tokens": 753911470.0, "step": 19756 }, { "epoch": 2.513293474112708, "ewc_loss": 0.033717334270477295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.371733328094706e-05, "grad_norm": 19.2940731048584, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8711954355239868, "num_tokens": 753953027.0, "step": 19757 }, { "epoch": 2.5134206843912987, "ewc_loss": 0.03365166112780571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365166048752144e-05, "grad_norm": 19.232572555541992, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8676483631134033, "num_tokens": 753987992.0, "step": 19758 }, { "epoch": 2.5135478946698893, "ewc_loss": 0.03363720700144768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3637206797720864e-05, "grad_norm": 19.271400451660156, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8785991668701172, "num_tokens": 754026671.0, "step": 19759 }, { "epoch": 2.51367510494848, "ewc_loss": 0.0336821973323822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.368219768162817e-05, "grad_norm": 19.267196655273438, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8748941421508789, "num_tokens": 754063526.0, "step": 19760 }, { "epoch": 2.5138023152270703, "ewc_loss": 0.033636100590229034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363610085216351e-05, "grad_norm": 19.231712341308594, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8570244908332825, "num_tokens": 754103361.0, "step": 19761 }, { "epoch": 2.513929525505661, "ewc_loss": 0.03369274362921715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3692744182189927e-05, "grad_norm": 19.2857723236084, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8768287301063538, "num_tokens": 754141969.0, "step": 19762 }, { "epoch": 2.5140567357842514, "ewc_loss": 0.03367132321000099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367132376297377e-05, "grad_norm": 19.23355484008789, "learning_rate": 1e-06, "loss": 0.3472, "mean_token_accuracy": 0.8898370862007141, "num_tokens": 754182691.0, "step": 19763 }, { "epoch": 2.514183946062842, "ewc_loss": 0.033662423491477966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3662425266811624e-05, "grad_norm": 19.227190017700195, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8729465007781982, "num_tokens": 754219956.0, "step": 19764 }, { "epoch": 2.5143111563414324, "ewc_loss": 0.03371986001729965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.371986167621799e-05, "grad_norm": 19.228862762451172, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8694325685501099, "num_tokens": 754259467.0, "step": 19765 }, { "epoch": 2.514438366620023, "ewc_loss": 0.03363875299692154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363875293871388e-05, "grad_norm": 19.223661422729492, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8768202662467957, "num_tokens": 754300344.0, "step": 19766 }, { "epoch": 2.5145655768986135, "ewc_loss": 0.033633239567279816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3633241400821134e-05, "grad_norm": 19.199853897094727, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8728535175323486, "num_tokens": 754342606.0, "step": 19767 }, { "epoch": 2.514692787177204, "ewc_loss": 0.03371690586209297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.371690763742663e-05, "grad_norm": 19.28354835510254, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8699631094932556, "num_tokens": 754379946.0, "step": 19768 }, { "epoch": 2.5148199974557945, "ewc_loss": 0.03376029431819916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3760294172680005e-05, "grad_norm": 19.2529354095459, "learning_rate": 1e-06, "loss": 0.3523, "mean_token_accuracy": 0.8874791860580444, "num_tokens": 754414353.0, "step": 19769 }, { "epoch": 2.514947207734385, "ewc_loss": 0.03367255628108978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3672557037789375e-05, "grad_norm": 19.277952194213867, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8611940741539001, "num_tokens": 754455506.0, "step": 19770 }, { "epoch": 2.5150744180129756, "ewc_loss": 0.03372473642230034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.372473656781949e-05, "grad_norm": 19.23577308654785, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8571711778640747, "num_tokens": 754498316.0, "step": 19771 }, { "epoch": 2.5152016282915657, "ewc_loss": 0.03372256085276604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.372256105649285e-05, "grad_norm": 19.332691192626953, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8630000948905945, "num_tokens": 754534943.0, "step": 19772 }, { "epoch": 2.5153288385701567, "ewc_loss": 0.03375189006328583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3751890441635624e-05, "grad_norm": 19.30389976501465, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8711158633232117, "num_tokens": 754572775.0, "step": 19773 }, { "epoch": 2.5154560488487467, "ewc_loss": 0.033710725605487823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.371072671143338e-05, "grad_norm": 19.295772552490234, "learning_rate": 1e-06, "loss": 0.3502, "mean_token_accuracy": 0.8835079669952393, "num_tokens": 754610473.0, "step": 19774 }, { "epoch": 2.5155832591273377, "ewc_loss": 0.03367733582854271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367733734194189e-05, "grad_norm": 19.302736282348633, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8746368885040283, "num_tokens": 754649527.0, "step": 19775 }, { "epoch": 2.515710469405928, "ewc_loss": 0.0336245633661747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.362456482136622e-05, "grad_norm": 19.248037338256836, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8749122619628906, "num_tokens": 754691513.0, "step": 19776 }, { "epoch": 2.5158376796845188, "ewc_loss": 0.03367107734084129, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367107638041489e-05, "grad_norm": 19.213420867919922, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8614170551300049, "num_tokens": 754727932.0, "step": 19777 }, { "epoch": 2.515964889963109, "ewc_loss": 0.03372105956077576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.372105857124552e-05, "grad_norm": 19.37242317199707, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.8787887096405029, "num_tokens": 754764571.0, "step": 19778 }, { "epoch": 2.5160921002416994, "ewc_loss": 0.03373953327536583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.373953222762793e-05, "grad_norm": 19.231184005737305, "learning_rate": 1e-06, "loss": 0.344, "mean_token_accuracy": 0.887803316116333, "num_tokens": 754805846.0, "step": 19779 }, { "epoch": 2.51621931052029, "ewc_loss": 0.033649053424596786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3649052056716755e-05, "grad_norm": 19.29947280883789, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.8801695108413696, "num_tokens": 754844016.0, "step": 19780 }, { "epoch": 2.5163465207988804, "ewc_loss": 0.03375276178121567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3752763556549326e-05, "grad_norm": 19.2872314453125, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8736645579338074, "num_tokens": 754886493.0, "step": 19781 }, { "epoch": 2.516473731077471, "ewc_loss": 0.03366272523999214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3662723581073806e-05, "grad_norm": 19.262712478637695, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8759270906448364, "num_tokens": 754928195.0, "step": 19782 }, { "epoch": 2.5166009413560615, "ewc_loss": 0.033677536994218826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367753743077628e-05, "grad_norm": 19.26628303527832, "learning_rate": 1e-06, "loss": 0.3443, "mean_token_accuracy": 0.8833210468292236, "num_tokens": 754958399.0, "step": 19783 }, { "epoch": 2.516728151634652, "ewc_loss": 0.03370433673262596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.370433842064813e-05, "grad_norm": 19.291358947753906, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8782213926315308, "num_tokens": 754997926.0, "step": 19784 }, { "epoch": 2.5168553619132426, "ewc_loss": 0.03375720605254173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3757205528672785e-05, "grad_norm": 19.278900146484375, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8799858093261719, "num_tokens": 755040577.0, "step": 19785 }, { "epoch": 2.516982572191833, "ewc_loss": 0.03368932381272316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.368932448211126e-05, "grad_norm": 19.30508804321289, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8720043897628784, "num_tokens": 755070458.0, "step": 19786 }, { "epoch": 2.5171097824704236, "ewc_loss": 0.0337279736995697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37279743689578e-05, "grad_norm": 19.261728286743164, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8532241582870483, "num_tokens": 755117205.0, "step": 19787 }, { "epoch": 2.517236992749014, "ewc_loss": 0.03363175317645073, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3631753467489034e-05, "grad_norm": 19.307538986206055, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8757028579711914, "num_tokens": 755162098.0, "step": 19788 }, { "epoch": 2.5173642030276047, "ewc_loss": 0.033717989921569824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3717988117132336e-05, "grad_norm": 19.21537971496582, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8729974627494812, "num_tokens": 755199655.0, "step": 19789 }, { "epoch": 2.517491413306195, "ewc_loss": 0.03362905979156494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3629061363171786e-05, "grad_norm": 19.330581665039062, "learning_rate": 1e-06, "loss": 0.3575, "mean_token_accuracy": 0.8870648741722107, "num_tokens": 755234165.0, "step": 19790 }, { "epoch": 2.5176186235847857, "ewc_loss": 0.033765945583581924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376594395376742e-05, "grad_norm": 19.267759323120117, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8754028081893921, "num_tokens": 755271933.0, "step": 19791 }, { "epoch": 2.5177458338633762, "ewc_loss": 0.03357362002134323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.357361856615171e-05, "grad_norm": 19.24941062927246, "learning_rate": 1e-06, "loss": 0.3622, "mean_token_accuracy": 0.882367730140686, "num_tokens": 755306903.0, "step": 19792 }, { "epoch": 2.5178730441419668, "ewc_loss": 0.033736634999513626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.373663639649749e-05, "grad_norm": 19.206161499023438, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.8821623921394348, "num_tokens": 755347613.0, "step": 19793 }, { "epoch": 2.5180002544205573, "ewc_loss": 0.03364722803235054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.36472294293344e-05, "grad_norm": 19.24937629699707, "learning_rate": 1e-06, "loss": 0.3394, "mean_token_accuracy": 0.8924705982208252, "num_tokens": 755380541.0, "step": 19794 }, { "epoch": 2.518127464699148, "ewc_loss": 0.033707309514284134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.370731064933352e-05, "grad_norm": 19.177032470703125, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8712557554244995, "num_tokens": 755422502.0, "step": 19795 }, { "epoch": 2.5182546749777384, "ewc_loss": 0.0337095707654953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3709569834172726e-05, "grad_norm": 19.278244018554688, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8753036260604858, "num_tokens": 755463975.0, "step": 19796 }, { "epoch": 2.5183818852563284, "ewc_loss": 0.033737342804670334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3737342164386064e-05, "grad_norm": 19.222440719604492, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8755179643630981, "num_tokens": 755504873.0, "step": 19797 }, { "epoch": 2.5185090955349194, "ewc_loss": 0.033697545528411865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.369754631421529e-05, "grad_norm": 19.25799560546875, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.849838376045227, "num_tokens": 755545646.0, "step": 19798 }, { "epoch": 2.5186363058135095, "ewc_loss": 0.03374449536204338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3744494430720806e-05, "grad_norm": 19.304677963256836, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8622838854789734, "num_tokens": 755577228.0, "step": 19799 }, { "epoch": 2.5187635160921005, "ewc_loss": 0.033709894865751266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.370989361428656e-05, "grad_norm": 19.300865173339844, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8828302621841431, "num_tokens": 755621437.0, "step": 19800 }, { "epoch": 2.5188907263706906, "ewc_loss": 0.0336783342063427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3678334148135036e-05, "grad_norm": 19.26551628112793, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8716084957122803, "num_tokens": 755658266.0, "step": 19801 }, { "epoch": 2.5190179366492815, "ewc_loss": 0.03367198631167412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.367198587511666e-05, "grad_norm": 19.30408477783203, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8624545335769653, "num_tokens": 755696755.0, "step": 19802 }, { "epoch": 2.5191451469278716, "ewc_loss": 0.03376030549407005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376030508661643e-05, "grad_norm": 19.300325393676758, "learning_rate": 1e-06, "loss": 0.346, "mean_token_accuracy": 0.8875497579574585, "num_tokens": 755732434.0, "step": 19803 }, { "epoch": 2.519272357206462, "ewc_loss": 0.03359774127602577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3597742003621534e-05, "grad_norm": 19.233285903930664, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8788763284683228, "num_tokens": 755768488.0, "step": 19804 }, { "epoch": 2.5193995674850527, "ewc_loss": 0.03367679938673973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3676798921078444e-05, "grad_norm": 19.288585662841797, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.8808467388153076, "num_tokens": 755799637.0, "step": 19805 }, { "epoch": 2.519526777763643, "ewc_loss": 0.03375094383955002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375094456714578e-05, "grad_norm": 19.346351623535156, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.858960747718811, "num_tokens": 755834229.0, "step": 19806 }, { "epoch": 2.5196539880422337, "ewc_loss": 0.0336654856801033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366548480698839e-05, "grad_norm": 19.255985260009766, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8777675628662109, "num_tokens": 755871776.0, "step": 19807 }, { "epoch": 2.5197811983208243, "ewc_loss": 0.03365486115217209, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365486190887168e-05, "grad_norm": 19.28880500793457, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8846509456634521, "num_tokens": 755911125.0, "step": 19808 }, { "epoch": 2.519908408599415, "ewc_loss": 0.03377167880535126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3771677408367395e-05, "grad_norm": 19.275821685791016, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8826041221618652, "num_tokens": 755952204.0, "step": 19809 }, { "epoch": 2.5200356188780053, "ewc_loss": 0.03372661769390106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.372661740286276e-05, "grad_norm": 19.27838897705078, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8633441925048828, "num_tokens": 755996702.0, "step": 19810 }, { "epoch": 2.520162829156596, "ewc_loss": 0.03371965512633324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3719654311425984e-05, "grad_norm": 19.224319458007812, "learning_rate": 1e-06, "loss": 0.3459, "mean_token_accuracy": 0.8899198174476624, "num_tokens": 756033285.0, "step": 19811 }, { "epoch": 2.5202900394351864, "ewc_loss": 0.03363795951008797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.363795985933393e-05, "grad_norm": 19.253067016601562, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8743641972541809, "num_tokens": 756068851.0, "step": 19812 }, { "epoch": 2.520417249713777, "ewc_loss": 0.03374951705336571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37495184794534e-05, "grad_norm": 19.21933937072754, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8652646541595459, "num_tokens": 756111676.0, "step": 19813 }, { "epoch": 2.5205444599923674, "ewc_loss": 0.03366959095001221, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3669592085061595e-05, "grad_norm": 19.32097625732422, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.859013020992279, "num_tokens": 756146575.0, "step": 19814 }, { "epoch": 2.520671670270958, "ewc_loss": 0.03374391049146652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3743908716132864e-05, "grad_norm": 19.1850643157959, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8646096587181091, "num_tokens": 756190006.0, "step": 19815 }, { "epoch": 2.5207988805495485, "ewc_loss": 0.033707842230796814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.370784179423936e-05, "grad_norm": 19.267744064331055, "learning_rate": 1e-06, "loss": 0.3552, "mean_token_accuracy": 0.885180652141571, "num_tokens": 756232828.0, "step": 19816 }, { "epoch": 2.520926090828139, "ewc_loss": 0.03383258730173111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.383258808753453e-05, "grad_norm": 19.27347183227539, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8807694911956787, "num_tokens": 756273077.0, "step": 19817 }, { "epoch": 2.5210533011067295, "ewc_loss": 0.033703695982694626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.370369449839927e-05, "grad_norm": 19.256502151489258, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8590257167816162, "num_tokens": 756313421.0, "step": 19818 }, { "epoch": 2.52118051138532, "ewc_loss": 0.03373925760388374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37392593792174e-05, "grad_norm": 19.307809829711914, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8692923188209534, "num_tokens": 756356461.0, "step": 19819 }, { "epoch": 2.5213077216639106, "ewc_loss": 0.03375333920121193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3753338357200846e-05, "grad_norm": 19.270919799804688, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8710595369338989, "num_tokens": 756388309.0, "step": 19820 }, { "epoch": 2.521434931942501, "ewc_loss": 0.033684175461530685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3684176742099226e-05, "grad_norm": 19.32503318786621, "learning_rate": 1e-06, "loss": 0.3577, "mean_token_accuracy": 0.8844418525695801, "num_tokens": 756426655.0, "step": 19821 }, { "epoch": 2.521562142221091, "ewc_loss": 0.03383611515164375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.383611692697741e-05, "grad_norm": 19.320205688476562, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.871186375617981, "num_tokens": 756464961.0, "step": 19822 }, { "epoch": 2.521689352499682, "ewc_loss": 0.033720433712005615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37204328388907e-05, "grad_norm": 19.308380126953125, "learning_rate": 1e-06, "loss": 0.3388, "mean_token_accuracy": 0.8892785906791687, "num_tokens": 756497338.0, "step": 19823 }, { "epoch": 2.5218165627782723, "ewc_loss": 0.03375951945781708, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3759519283194095e-05, "grad_norm": 19.272113800048828, "learning_rate": 1e-06, "loss": 0.3765, "mean_token_accuracy": 0.8805031180381775, "num_tokens": 756535324.0, "step": 19824 }, { "epoch": 2.5219437730568632, "ewc_loss": 0.0336911678314209, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3691168937366456e-05, "grad_norm": 19.288572311401367, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.8820942640304565, "num_tokens": 756574138.0, "step": 19825 }, { "epoch": 2.5220709833354533, "ewc_loss": 0.03375647962093353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3756477932911366e-05, "grad_norm": 19.267194747924805, "learning_rate": 1e-06, "loss": 0.3643, "mean_token_accuracy": 0.8843444585800171, "num_tokens": 756608967.0, "step": 19826 }, { "epoch": 2.522198193614044, "ewc_loss": 0.0337354838848114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.373548315721564e-05, "grad_norm": 19.2943058013916, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8538618683815002, "num_tokens": 756653882.0, "step": 19827 }, { "epoch": 2.5223254038926344, "ewc_loss": 0.03380759432911873, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.380759517312981e-05, "grad_norm": 19.236581802368164, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8688094615936279, "num_tokens": 756695638.0, "step": 19828 }, { "epoch": 2.522452614171225, "ewc_loss": 0.03370298072695732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.370298145455308e-05, "grad_norm": 19.217378616333008, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8679043650627136, "num_tokens": 756734964.0, "step": 19829 }, { "epoch": 2.5225798244498154, "ewc_loss": 0.033807266503572464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3807267755037174e-05, "grad_norm": 19.336320877075195, "learning_rate": 1e-06, "loss": 0.3716, "mean_token_accuracy": 0.878484845161438, "num_tokens": 756768454.0, "step": 19830 }, { "epoch": 2.522707034728406, "ewc_loss": 0.033770207315683365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.377020766492933e-05, "grad_norm": 19.3177433013916, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8640023469924927, "num_tokens": 756801696.0, "step": 19831 }, { "epoch": 2.5228342450069965, "ewc_loss": 0.0337885320186615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.378853216418065e-05, "grad_norm": 19.22916603088379, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8747519254684448, "num_tokens": 756839874.0, "step": 19832 }, { "epoch": 2.522961455285587, "ewc_loss": 0.033774081617593765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.377408211235888e-05, "grad_norm": 19.252193450927734, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8631734848022461, "num_tokens": 756884179.0, "step": 19833 }, { "epoch": 2.5230886655641775, "ewc_loss": 0.033793795853853226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379379631951451e-05, "grad_norm": 19.2788028717041, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8760149478912354, "num_tokens": 756922725.0, "step": 19834 }, { "epoch": 2.523215875842768, "ewc_loss": 0.03376918286085129, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376918175490573e-05, "grad_norm": 19.329648971557617, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8699630498886108, "num_tokens": 756956080.0, "step": 19835 }, { "epoch": 2.5233430861213586, "ewc_loss": 0.033788036555051804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.378803739906289e-05, "grad_norm": 19.286609649658203, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8653850555419922, "num_tokens": 756995137.0, "step": 19836 }, { "epoch": 2.523470296399949, "ewc_loss": 0.03378577530384064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3785774576244876e-05, "grad_norm": 19.222232818603516, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8805301189422607, "num_tokens": 757035300.0, "step": 19837 }, { "epoch": 2.5235975066785397, "ewc_loss": 0.033800892531871796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.380089401616715e-05, "grad_norm": 19.34846305847168, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8508933782577515, "num_tokens": 757068789.0, "step": 19838 }, { "epoch": 2.52372471695713, "ewc_loss": 0.03376922383904457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3769225410651416e-05, "grad_norm": 19.159955978393555, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8685711622238159, "num_tokens": 757102913.0, "step": 19839 }, { "epoch": 2.5238519272357207, "ewc_loss": 0.03378896415233612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3788965083658695e-05, "grad_norm": 19.268779754638672, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8752204179763794, "num_tokens": 757138932.0, "step": 19840 }, { "epoch": 2.5239791375143112, "ewc_loss": 0.03386536240577698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386536263860762e-05, "grad_norm": 19.216197967529297, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8774056434631348, "num_tokens": 757179181.0, "step": 19841 }, { "epoch": 2.5241063477929018, "ewc_loss": 0.03378648683428764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3786487620091066e-05, "grad_norm": 19.25910758972168, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8437010049819946, "num_tokens": 757224209.0, "step": 19842 }, { "epoch": 2.5242335580714923, "ewc_loss": 0.03381825610995293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.381825445103459e-05, "grad_norm": 19.219751358032227, "learning_rate": 1e-06, "loss": 0.3502, "mean_token_accuracy": 0.886563777923584, "num_tokens": 757262398.0, "step": 19843 }, { "epoch": 2.524360768350083, "ewc_loss": 0.03375069797039032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37506971845869e-05, "grad_norm": 19.21820068359375, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8643466234207153, "num_tokens": 757301744.0, "step": 19844 }, { "epoch": 2.5244879786286734, "ewc_loss": 0.03381696715950966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.381696660653688e-05, "grad_norm": 19.269309997558594, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8790481686592102, "num_tokens": 757338273.0, "step": 19845 }, { "epoch": 2.524615188907264, "ewc_loss": 0.0338323712348938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3832369808806106e-05, "grad_norm": 19.23168182373047, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8659657835960388, "num_tokens": 757383169.0, "step": 19846 }, { "epoch": 2.524742399185854, "ewc_loss": 0.03379098325967789, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379098416189663e-05, "grad_norm": 19.298946380615234, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8632035851478577, "num_tokens": 757420998.0, "step": 19847 }, { "epoch": 2.524869609464445, "ewc_loss": 0.03379874303936958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379874397069216e-05, "grad_norm": 19.226058959960938, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.877984881401062, "num_tokens": 757463281.0, "step": 19848 }, { "epoch": 2.524996819743035, "ewc_loss": 0.033773016184568405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3773016184568405e-05, "grad_norm": 19.42022705078125, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8687523603439331, "num_tokens": 757495022.0, "step": 19849 }, { "epoch": 2.525124030021626, "ewc_loss": 0.0338076688349247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3807667932705954e-05, "grad_norm": 19.20842933654785, "learning_rate": 1e-06, "loss": 0.3753, "mean_token_accuracy": 0.8792186975479126, "num_tokens": 757536801.0, "step": 19850 }, { "epoch": 2.525251240300216, "ewc_loss": 0.03369641304016113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3696411264827475e-05, "grad_norm": 19.313955307006836, "learning_rate": 1e-06, "loss": 0.3479, "mean_token_accuracy": 0.8889428377151489, "num_tokens": 757573419.0, "step": 19851 }, { "epoch": 2.5253784505788066, "ewc_loss": 0.033885106444358826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388510594959371e-05, "grad_norm": 19.325416564941406, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8688610792160034, "num_tokens": 757610773.0, "step": 19852 }, { "epoch": 2.525505660857397, "ewc_loss": 0.033756546676158905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37565470545087e-05, "grad_norm": 19.35179328918457, "learning_rate": 1e-06, "loss": 0.3522, "mean_token_accuracy": 0.8840562105178833, "num_tokens": 757648067.0, "step": 19853 }, { "epoch": 2.5256328711359877, "ewc_loss": 0.03377169743180275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.377169559826143e-05, "grad_norm": 19.29567527770996, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8798406720161438, "num_tokens": 757681598.0, "step": 19854 }, { "epoch": 2.525760081414578, "ewc_loss": 0.033739447593688965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.373944855411537e-05, "grad_norm": 19.28955841064453, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8555998802185059, "num_tokens": 757724752.0, "step": 19855 }, { "epoch": 2.5258872916931687, "ewc_loss": 0.03376910462975502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376910535735078e-05, "grad_norm": 19.294002532958984, "learning_rate": 1e-06, "loss": 0.3561, "mean_token_accuracy": 0.885657548904419, "num_tokens": 757763801.0, "step": 19856 }, { "epoch": 2.5260145019717593, "ewc_loss": 0.03374602273106575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3746022381819785e-05, "grad_norm": 19.302648544311523, "learning_rate": 1e-06, "loss": 0.3893, "mean_token_accuracy": 0.872496485710144, "num_tokens": 757799235.0, "step": 19857 }, { "epoch": 2.52614171225035, "ewc_loss": 0.03377293795347214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.377293614903465e-05, "grad_norm": 19.23472023010254, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8784646987915039, "num_tokens": 757839695.0, "step": 19858 }, { "epoch": 2.5262689225289403, "ewc_loss": 0.03371681272983551, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3716813049977645e-05, "grad_norm": 19.24040412902832, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8603858947753906, "num_tokens": 757877152.0, "step": 19859 }, { "epoch": 2.526396132807531, "ewc_loss": 0.03376005217432976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376005042809993e-05, "grad_norm": 19.258960723876953, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8726719617843628, "num_tokens": 757916173.0, "step": 19860 }, { "epoch": 2.5265233430861214, "ewc_loss": 0.03381175547838211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3811757020885125e-05, "grad_norm": 19.285554885864258, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8785357475280762, "num_tokens": 757957271.0, "step": 19861 }, { "epoch": 2.526650553364712, "ewc_loss": 0.033839818090200424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.383981675142422e-05, "grad_norm": 19.26970672607422, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8677303791046143, "num_tokens": 757995553.0, "step": 19862 }, { "epoch": 2.5267777636433024, "ewc_loss": 0.03376759961247444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3767599234124646e-05, "grad_norm": 19.340864181518555, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8632375001907349, "num_tokens": 758035095.0, "step": 19863 }, { "epoch": 2.526904973921893, "ewc_loss": 0.03377855196595192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37785531883128e-05, "grad_norm": 19.282747268676758, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8721509575843811, "num_tokens": 758071185.0, "step": 19864 }, { "epoch": 2.5270321842004835, "ewc_loss": 0.03373798355460167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.373798244865611e-05, "grad_norm": 19.325166702270508, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8743912577629089, "num_tokens": 758110251.0, "step": 19865 }, { "epoch": 2.527159394479074, "ewc_loss": 0.03376531973481178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37653182214126e-05, "grad_norm": 19.315471649169922, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8593722581863403, "num_tokens": 758144385.0, "step": 19866 }, { "epoch": 2.5272866047576645, "ewc_loss": 0.033748526126146317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3748525311239064e-05, "grad_norm": 19.355154037475586, "learning_rate": 1e-06, "loss": 0.3526, "mean_token_accuracy": 0.8843542337417603, "num_tokens": 758185935.0, "step": 19867 }, { "epoch": 2.527413815036255, "ewc_loss": 0.03373418003320694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37341807608027e-05, "grad_norm": 19.268613815307617, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8771913051605225, "num_tokens": 758226688.0, "step": 19868 }, { "epoch": 2.5275410253148456, "ewc_loss": 0.033715762197971344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3715761674102396e-05, "grad_norm": 19.31275749206543, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.8790934681892395, "num_tokens": 758263866.0, "step": 19869 }, { "epoch": 2.5276682355934357, "ewc_loss": 0.033692751079797745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.369275145814754e-05, "grad_norm": 19.228710174560547, "learning_rate": 1e-06, "loss": 0.3765, "mean_token_accuracy": 0.8789334893226624, "num_tokens": 758303607.0, "step": 19870 }, { "epoch": 2.5277954458720266, "ewc_loss": 0.0336679108440876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.366791133885272e-05, "grad_norm": 19.324052810668945, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8773161768913269, "num_tokens": 758340120.0, "step": 19871 }, { "epoch": 2.5279226561506167, "ewc_loss": 0.033681634813547134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.368163379491307e-05, "grad_norm": 19.190153121948242, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8732461333274841, "num_tokens": 758373792.0, "step": 19872 }, { "epoch": 2.5280498664292077, "ewc_loss": 0.033676836639642715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3676835300866514e-05, "grad_norm": 19.282194137573242, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8619787693023682, "num_tokens": 758418157.0, "step": 19873 }, { "epoch": 2.528177076707798, "ewc_loss": 0.03379841521382332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379841655259952e-05, "grad_norm": 19.346956253051758, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8664150834083557, "num_tokens": 758454042.0, "step": 19874 }, { "epoch": 2.5283042869863888, "ewc_loss": 0.03370131552219391, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3701315260259435e-05, "grad_norm": 19.246566772460938, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8747000098228455, "num_tokens": 758495305.0, "step": 19875 }, { "epoch": 2.528431497264979, "ewc_loss": 0.033628299832344055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3628301025601104e-05, "grad_norm": 19.31513786315918, "learning_rate": 1e-06, "loss": 0.3648, "mean_token_accuracy": 0.8812851309776306, "num_tokens": 758532646.0, "step": 19876 }, { "epoch": 2.5285587075435694, "ewc_loss": 0.03376956284046173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3769563742680475e-05, "grad_norm": 19.34266471862793, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8693138957023621, "num_tokens": 758568329.0, "step": 19877 }, { "epoch": 2.52868591782216, "ewc_loss": 0.03368659317493439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3686592360027134e-05, "grad_norm": 19.224485397338867, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8766177892684937, "num_tokens": 758608246.0, "step": 19878 }, { "epoch": 2.5288131281007504, "ewc_loss": 0.033653952181339264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.365395241416991e-05, "grad_norm": 19.236204147338867, "learning_rate": 1e-06, "loss": 0.3631, "mean_token_accuracy": 0.8837767243385315, "num_tokens": 758645356.0, "step": 19879 }, { "epoch": 2.528940338379341, "ewc_loss": 0.03371782600879669, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.371782440808602e-05, "grad_norm": 19.289255142211914, "learning_rate": 1e-06, "loss": 0.3429, "mean_token_accuracy": 0.8887380957603455, "num_tokens": 758685657.0, "step": 19880 }, { "epoch": 2.5290675486579315, "ewc_loss": 0.033796750009059906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379675035830587e-05, "grad_norm": 19.33226203918457, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8782130479812622, "num_tokens": 758718843.0, "step": 19881 }, { "epoch": 2.529194758936522, "ewc_loss": 0.03377489745616913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.377489701961167e-05, "grad_norm": 19.378320693969727, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8781521320343018, "num_tokens": 758757946.0, "step": 19882 }, { "epoch": 2.5293219692151125, "ewc_loss": 0.03374534845352173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.374534935574047e-05, "grad_norm": 19.350387573242188, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8555454015731812, "num_tokens": 758792832.0, "step": 19883 }, { "epoch": 2.529449179493703, "ewc_loss": 0.03373531252145767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.373531217221171e-05, "grad_norm": 19.316438674926758, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.86684650182724, "num_tokens": 758826062.0, "step": 19884 }, { "epoch": 2.5295763897722936, "ewc_loss": 0.03376663476228714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3766635169740766e-05, "grad_norm": 19.337108612060547, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.8836930990219116, "num_tokens": 758860940.0, "step": 19885 }, { "epoch": 2.529703600050884, "ewc_loss": 0.03374462202191353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3744621759979054e-05, "grad_norm": 19.283784866333008, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8721270561218262, "num_tokens": 758900740.0, "step": 19886 }, { "epoch": 2.5298308103294747, "ewc_loss": 0.03379541635513306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379541522008367e-05, "grad_norm": 19.334104537963867, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8705117106437683, "num_tokens": 758942786.0, "step": 19887 }, { "epoch": 2.529958020608065, "ewc_loss": 0.033776044845581055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.377604662091471e-05, "grad_norm": 19.28565788269043, "learning_rate": 1e-06, "loss": 0.3723, "mean_token_accuracy": 0.8826993107795715, "num_tokens": 758983376.0, "step": 19888 }, { "epoch": 2.5300852308866557, "ewc_loss": 0.03374624252319336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.374624066054821e-05, "grad_norm": 19.261932373046875, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8626683950424194, "num_tokens": 759024007.0, "step": 19889 }, { "epoch": 2.5302124411652462, "ewc_loss": 0.03376647084951401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376647146069445e-05, "grad_norm": 19.332199096679688, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8809235095977783, "num_tokens": 759061611.0, "step": 19890 }, { "epoch": 2.5303396514438368, "ewc_loss": 0.03382844850420952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.382844806765206e-05, "grad_norm": 19.336965560913086, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8585739135742188, "num_tokens": 759101942.0, "step": 19891 }, { "epoch": 2.5304668617224273, "ewc_loss": 0.03372801095247269, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3728010748745874e-05, "grad_norm": 19.226213455200195, "learning_rate": 1e-06, "loss": 0.3407, "mean_token_accuracy": 0.8906487226486206, "num_tokens": 759140281.0, "step": 19892 }, { "epoch": 2.530594072001018, "ewc_loss": 0.03378332778811455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37833262165077e-05, "grad_norm": 19.437297821044922, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.8736229538917542, "num_tokens": 759174319.0, "step": 19893 }, { "epoch": 2.5307212822796084, "ewc_loss": 0.033837638795375824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3837637602118775e-05, "grad_norm": 19.263965606689453, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8806533813476562, "num_tokens": 759208981.0, "step": 19894 }, { "epoch": 2.5308484925581984, "ewc_loss": 0.033658359199762344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3658358006505296e-05, "grad_norm": 19.367544174194336, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.875428318977356, "num_tokens": 759240559.0, "step": 19895 }, { "epoch": 2.5309757028367894, "ewc_loss": 0.03377427160739899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.377427128725685e-05, "grad_norm": 19.252593994140625, "learning_rate": 1e-06, "loss": 0.3718, "mean_token_accuracy": 0.8857961893081665, "num_tokens": 759280550.0, "step": 19896 }, { "epoch": 2.5311029131153795, "ewc_loss": 0.033750344067811966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375034430064261e-05, "grad_norm": 19.342145919799805, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8680371642112732, "num_tokens": 759321298.0, "step": 19897 }, { "epoch": 2.5312301233939705, "ewc_loss": 0.03379916399717331, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379916233825497e-05, "grad_norm": 19.284475326538086, "learning_rate": 1e-06, "loss": 0.3543, "mean_token_accuracy": 0.8861067295074463, "num_tokens": 759354627.0, "step": 19898 }, { "epoch": 2.5313573336725606, "ewc_loss": 0.033728018403053284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.372801802470349e-05, "grad_norm": 19.40305519104004, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8697218298912048, "num_tokens": 759388129.0, "step": 19899 }, { "epoch": 2.5314845439511515, "ewc_loss": 0.03374112769961357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3741129300324246e-05, "grad_norm": 19.239437103271484, "learning_rate": 1e-06, "loss": 0.3604, "mean_token_accuracy": 0.8829714059829712, "num_tokens": 759423667.0, "step": 19900 }, { "epoch": 2.5316117542297416, "ewc_loss": 0.033654794096946716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3654792787274346e-05, "grad_norm": 19.286251068115234, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8647340536117554, "num_tokens": 759466586.0, "step": 19901 }, { "epoch": 2.531738964508332, "ewc_loss": 0.03382211551070213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3822114346548915e-05, "grad_norm": 19.325416564941406, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8683472275733948, "num_tokens": 759504214.0, "step": 19902 }, { "epoch": 2.5318661747869227, "ewc_loss": 0.03373249992728233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3732500014593825e-05, "grad_norm": 19.20808982849121, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8604314923286438, "num_tokens": 759543865.0, "step": 19903 }, { "epoch": 2.531993385065513, "ewc_loss": 0.03375990316271782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375990490894765e-05, "grad_norm": 19.353239059448242, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.869823157787323, "num_tokens": 759581917.0, "step": 19904 }, { "epoch": 2.5321205953441037, "ewc_loss": 0.033852074295282364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3852073102025315e-05, "grad_norm": 19.14267349243164, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8761789202690125, "num_tokens": 759617641.0, "step": 19905 }, { "epoch": 2.5322478056226942, "ewc_loss": 0.03376857191324234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376857057446614e-05, "grad_norm": 19.36192512512207, "learning_rate": 1e-06, "loss": 0.3571, "mean_token_accuracy": 0.8848774433135986, "num_tokens": 759649097.0, "step": 19906 }, { "epoch": 2.5323750159012848, "ewc_loss": 0.03394552692770958, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394552550162189e-05, "grad_norm": 19.266036987304688, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8697287440299988, "num_tokens": 759688802.0, "step": 19907 }, { "epoch": 2.5325022261798753, "ewc_loss": 0.03378130868077278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.378130713826977e-05, "grad_norm": 19.24491310119629, "learning_rate": 1e-06, "loss": 0.3544, "mean_token_accuracy": 0.8830140829086304, "num_tokens": 759724243.0, "step": 19908 }, { "epoch": 2.532629436458466, "ewc_loss": 0.033908724784851074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.390872370800935e-05, "grad_norm": 19.35540008544922, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8668091297149658, "num_tokens": 759764646.0, "step": 19909 }, { "epoch": 2.5327566467370564, "ewc_loss": 0.033869147300720215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3869146136566997e-05, "grad_norm": 19.22653579711914, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8763704299926758, "num_tokens": 759806939.0, "step": 19910 }, { "epoch": 2.532883857015647, "ewc_loss": 0.033820584416389465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.382058275747113e-05, "grad_norm": 19.282426834106445, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8764898777008057, "num_tokens": 759848138.0, "step": 19911 }, { "epoch": 2.5330110672942374, "ewc_loss": 0.03383771330118179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3837713999673724e-05, "grad_norm": 19.268184661865234, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8707753419876099, "num_tokens": 759881877.0, "step": 19912 }, { "epoch": 2.533138277572828, "ewc_loss": 0.03383595123887062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3835949579952285e-05, "grad_norm": 19.276844024658203, "learning_rate": 1e-06, "loss": 0.3629, "mean_token_accuracy": 0.8845041394233704, "num_tokens": 759922625.0, "step": 19913 }, { "epoch": 2.5332654878514185, "ewc_loss": 0.0338294580578804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.382945942576043e-05, "grad_norm": 19.251489639282227, "learning_rate": 1e-06, "loss": 0.3538, "mean_token_accuracy": 0.8850456476211548, "num_tokens": 759952429.0, "step": 19914 }, { "epoch": 2.533392698130009, "ewc_loss": 0.033903226256370544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.390322672203183e-05, "grad_norm": 19.254070281982422, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8766096830368042, "num_tokens": 759987678.0, "step": 19915 }, { "epoch": 2.5335199084085995, "ewc_loss": 0.03384656831622124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3846568840090185e-05, "grad_norm": 19.243051528930664, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8601672649383545, "num_tokens": 760023943.0, "step": 19916 }, { "epoch": 2.53364711868719, "ewc_loss": 0.03387967124581337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387967080925591e-05, "grad_norm": 19.200044631958008, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8777873516082764, "num_tokens": 760068229.0, "step": 19917 }, { "epoch": 2.5337743289657806, "ewc_loss": 0.03390849754214287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.390849815332331e-05, "grad_norm": 19.286161422729492, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8610090017318726, "num_tokens": 760109440.0, "step": 19918 }, { "epoch": 2.533901539244371, "ewc_loss": 0.034013837575912476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401383582968265e-05, "grad_norm": 19.33096694946289, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8760707378387451, "num_tokens": 760146616.0, "step": 19919 }, { "epoch": 2.534028749522961, "ewc_loss": 0.03398178890347481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.398178887437098e-05, "grad_norm": 19.34773063659668, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8568682670593262, "num_tokens": 760188708.0, "step": 19920 }, { "epoch": 2.534155959801552, "ewc_loss": 0.03387436270713806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3874362998176366e-05, "grad_norm": 19.29411506652832, "learning_rate": 1e-06, "loss": 0.3531, "mean_token_accuracy": 0.8906503319740295, "num_tokens": 760231355.0, "step": 19921 }, { "epoch": 2.5342831700801423, "ewc_loss": 0.03388803079724312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388803088455461e-05, "grad_norm": 19.331687927246094, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8617284297943115, "num_tokens": 760268316.0, "step": 19922 }, { "epoch": 2.5344103803587332, "ewc_loss": 0.03392793610692024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39279358740896e-05, "grad_norm": 19.40425682067871, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8658096194267273, "num_tokens": 760305420.0, "step": 19923 }, { "epoch": 2.5345375906373233, "ewc_loss": 0.033883657306432724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3883658034028485e-05, "grad_norm": 19.265302658081055, "learning_rate": 1e-06, "loss": 0.3357, "mean_token_accuracy": 0.893211841583252, "num_tokens": 760342195.0, "step": 19924 }, { "epoch": 2.534664800915914, "ewc_loss": 0.03380288928747177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3802887628553435e-05, "grad_norm": 19.32162094116211, "learning_rate": 1e-06, "loss": 0.3533, "mean_token_accuracy": 0.8872150182723999, "num_tokens": 760375394.0, "step": 19925 }, { "epoch": 2.5347920111945044, "ewc_loss": 0.033871036022901535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387103424756788e-05, "grad_norm": 19.213401794433594, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.863042950630188, "num_tokens": 760411629.0, "step": 19926 }, { "epoch": 2.534919221473095, "ewc_loss": 0.033809978514909744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.380997804924846e-05, "grad_norm": 19.30466079711914, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8765956163406372, "num_tokens": 760456212.0, "step": 19927 }, { "epoch": 2.5350464317516854, "ewc_loss": 0.03391115739941597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391115751583129e-05, "grad_norm": 19.258319854736328, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8646718859672546, "num_tokens": 760496385.0, "step": 19928 }, { "epoch": 2.535173642030276, "ewc_loss": 0.03385777398943901, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385777381481603e-05, "grad_norm": 19.30526351928711, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8734659552574158, "num_tokens": 760530426.0, "step": 19929 }, { "epoch": 2.5353008523088665, "ewc_loss": 0.03388569504022598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388569530216046e-05, "grad_norm": 19.3033390045166, "learning_rate": 1e-06, "loss": 0.3671, "mean_token_accuracy": 0.8850122690200806, "num_tokens": 760568174.0, "step": 19930 }, { "epoch": 2.535428062587457, "ewc_loss": 0.03387577086687088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387577089597471e-05, "grad_norm": 19.31092643737793, "learning_rate": 1e-06, "loss": 0.353, "mean_token_accuracy": 0.8916749954223633, "num_tokens": 760608097.0, "step": 19931 }, { "epoch": 2.5355552728660475, "ewc_loss": 0.03387439623475075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387439573998563e-05, "grad_norm": 19.364856719970703, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8745998740196228, "num_tokens": 760643018.0, "step": 19932 }, { "epoch": 2.535682483144638, "ewc_loss": 0.033793237060308456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379323607077822e-05, "grad_norm": 19.294139862060547, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8824502229690552, "num_tokens": 760676553.0, "step": 19933 }, { "epoch": 2.5358096934232286, "ewc_loss": 0.033822838217020035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3822838304331526e-05, "grad_norm": 19.335603713989258, "learning_rate": 1e-06, "loss": 0.3486, "mean_token_accuracy": 0.8881195783615112, "num_tokens": 760708850.0, "step": 19934 }, { "epoch": 2.535936903701819, "ewc_loss": 0.033868081867694855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386808020877652e-05, "grad_norm": 19.36139678955078, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8706874847412109, "num_tokens": 760746088.0, "step": 19935 }, { "epoch": 2.5360641139804097, "ewc_loss": 0.033863365650177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386336538824253e-05, "grad_norm": 19.350446701049805, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.879828929901123, "num_tokens": 760785716.0, "step": 19936 }, { "epoch": 2.536191324259, "ewc_loss": 0.03382837772369385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.382837894605473e-05, "grad_norm": 19.32694435119629, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8828116655349731, "num_tokens": 760821301.0, "step": 19937 }, { "epoch": 2.5363185345375907, "ewc_loss": 0.03380092605948448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.380092675797641e-05, "grad_norm": 19.32952308654785, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.864793062210083, "num_tokens": 760861200.0, "step": 19938 }, { "epoch": 2.5364457448161812, "ewc_loss": 0.03374399244785309, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.374399238964543e-05, "grad_norm": 19.282386779785156, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8736218214035034, "num_tokens": 760896398.0, "step": 19939 }, { "epoch": 2.5365729550947718, "ewc_loss": 0.03381490334868431, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.381490387255326e-05, "grad_norm": 19.339336395263672, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8556414246559143, "num_tokens": 760937300.0, "step": 19940 }, { "epoch": 2.5367001653733623, "ewc_loss": 0.03382332995533943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3823329431470484e-05, "grad_norm": 19.294374465942383, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8691701889038086, "num_tokens": 760973726.0, "step": 19941 }, { "epoch": 2.536827375651953, "ewc_loss": 0.03379517048597336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3795171475503594e-05, "grad_norm": 19.354082107543945, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8718690276145935, "num_tokens": 761009048.0, "step": 19942 }, { "epoch": 2.5369545859305433, "ewc_loss": 0.03387492150068283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387492324691266e-05, "grad_norm": 19.236839294433594, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8718534708023071, "num_tokens": 761050836.0, "step": 19943 }, { "epoch": 2.537081796209134, "ewc_loss": 0.0337962806224823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379628105903976e-05, "grad_norm": 19.309595108032227, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8669098615646362, "num_tokens": 761087517.0, "step": 19944 }, { "epoch": 2.537209006487724, "ewc_loss": 0.03388327732682228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388327604625374e-05, "grad_norm": 19.283031463623047, "learning_rate": 1e-06, "loss": 0.3513, "mean_token_accuracy": 0.8875718116760254, "num_tokens": 761121401.0, "step": 19945 }, { "epoch": 2.537336216766315, "ewc_loss": 0.03382273018360138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3822729164967313e-05, "grad_norm": 19.382125854492188, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8629879355430603, "num_tokens": 761154973.0, "step": 19946 }, { "epoch": 2.537463427044905, "ewc_loss": 0.0338512547314167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385125455679372e-05, "grad_norm": 19.315380096435547, "learning_rate": 1e-06, "loss": 0.3662, "mean_token_accuracy": 0.8821783065795898, "num_tokens": 761194211.0, "step": 19947 }, { "epoch": 2.537590637323496, "ewc_loss": 0.03379335254430771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379335248610005e-05, "grad_norm": 19.36477279663086, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8677798509597778, "num_tokens": 761233486.0, "step": 19948 }, { "epoch": 2.537717847602086, "ewc_loss": 0.03371894732117653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.371894854353741e-05, "grad_norm": 19.194072723388672, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8751010894775391, "num_tokens": 761275846.0, "step": 19949 }, { "epoch": 2.5378450578806766, "ewc_loss": 0.03380940109491348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.380939961061813e-05, "grad_norm": 19.38679313659668, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8650825023651123, "num_tokens": 761311210.0, "step": 19950 }, { "epoch": 2.537972268159267, "ewc_loss": 0.0338483527302742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384835144970566e-05, "grad_norm": 19.277446746826172, "learning_rate": 1e-06, "loss": 0.3914, "mean_token_accuracy": 0.8735748529434204, "num_tokens": 761349961.0, "step": 19951 }, { "epoch": 2.5380994784378577, "ewc_loss": 0.033799707889556885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379970803507604e-05, "grad_norm": 19.285669326782227, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8806356191635132, "num_tokens": 761386993.0, "step": 19952 }, { "epoch": 2.538226688716448, "ewc_loss": 0.03376016020774841, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376015956746414e-05, "grad_norm": 19.206172943115234, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8640551567077637, "num_tokens": 761426910.0, "step": 19953 }, { "epoch": 2.5383538989950387, "ewc_loss": 0.03384849801659584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3848496968857944e-05, "grad_norm": 19.26067543029785, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8759175539016724, "num_tokens": 761466633.0, "step": 19954 }, { "epoch": 2.5384811092736292, "ewc_loss": 0.033853448927402496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3853448258014396e-05, "grad_norm": 19.375904083251953, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8615965843200684, "num_tokens": 761497670.0, "step": 19955 }, { "epoch": 2.5386083195522198, "ewc_loss": 0.033905744552612305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.390574420336634e-05, "grad_norm": 19.31608772277832, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8702029585838318, "num_tokens": 761534626.0, "step": 19956 }, { "epoch": 2.5387355298308103, "ewc_loss": 0.03379128500819206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379128611413762e-05, "grad_norm": 19.31011199951172, "learning_rate": 1e-06, "loss": 0.38, "mean_token_accuracy": 0.8809998035430908, "num_tokens": 761581125.0, "step": 19957 }, { "epoch": 2.538862740109401, "ewc_loss": 0.0338180810213089, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.381807982805185e-05, "grad_norm": 19.26191520690918, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8748920559883118, "num_tokens": 761616533.0, "step": 19958 }, { "epoch": 2.5389899503879914, "ewc_loss": 0.033770497888326645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37704987032339e-05, "grad_norm": 19.395599365234375, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8635669946670532, "num_tokens": 761650615.0, "step": 19959 }, { "epoch": 2.539117160666582, "ewc_loss": 0.03382796794176102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3827967854449525e-05, "grad_norm": 19.237335205078125, "learning_rate": 1e-06, "loss": 0.3619, "mean_token_accuracy": 0.8849084377288818, "num_tokens": 761687020.0, "step": 19960 }, { "epoch": 2.5392443709451724, "ewc_loss": 0.03372308611869812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.372308492544107e-05, "grad_norm": 19.355854034423828, "learning_rate": 1e-06, "loss": 0.3505, "mean_token_accuracy": 0.8899987936019897, "num_tokens": 761721653.0, "step": 19961 }, { "epoch": 2.539371581223763, "ewc_loss": 0.03388499841094017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3884996810229495e-05, "grad_norm": 19.361482620239258, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8814153671264648, "num_tokens": 761758377.0, "step": 19962 }, { "epoch": 2.5394987915023535, "ewc_loss": 0.03374345973134041, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.374346124473959e-05, "grad_norm": 19.316530227661133, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8747102618217468, "num_tokens": 761795263.0, "step": 19963 }, { "epoch": 2.539626001780944, "ewc_loss": 0.03380351513624191, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3803513360908255e-05, "grad_norm": 19.307994842529297, "learning_rate": 1e-06, "loss": 0.3563, "mean_token_accuracy": 0.8838136792182922, "num_tokens": 761834758.0, "step": 19964 }, { "epoch": 2.5397532120595345, "ewc_loss": 0.03378935158252716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.378935070941225e-05, "grad_norm": 19.298925399780273, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8669652938842773, "num_tokens": 761880482.0, "step": 19965 }, { "epoch": 2.539880422338125, "ewc_loss": 0.03380590304732323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3805903512984514e-05, "grad_norm": 19.296472549438477, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8729789853096008, "num_tokens": 761922494.0, "step": 19966 }, { "epoch": 2.5400076326167156, "ewc_loss": 0.03374423086643219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3744232496246696e-05, "grad_norm": 19.26946449279785, "learning_rate": 1e-06, "loss": 0.3385, "mean_token_accuracy": 0.8917332887649536, "num_tokens": 761959462.0, "step": 19967 }, { "epoch": 2.5401348428953057, "ewc_loss": 0.03379928693175316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3799286029534414e-05, "grad_norm": 19.399154663085938, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8653467893600464, "num_tokens": 761990316.0, "step": 19968 }, { "epoch": 2.5402620531738966, "ewc_loss": 0.03378169611096382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3781696402002126e-05, "grad_norm": 19.35507583618164, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8685480356216431, "num_tokens": 762023671.0, "step": 19969 }, { "epoch": 2.5403892634524867, "ewc_loss": 0.03373073413968086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3730735594872385e-05, "grad_norm": 19.28754234313965, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8799470067024231, "num_tokens": 762056719.0, "step": 19970 }, { "epoch": 2.5405164737310777, "ewc_loss": 0.03377586975693703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3775868359953165e-05, "grad_norm": 19.331562042236328, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8750368356704712, "num_tokens": 762098802.0, "step": 19971 }, { "epoch": 2.540643684009668, "ewc_loss": 0.03380037471652031, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3800373785197735e-05, "grad_norm": 19.24833106994629, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8781381845474243, "num_tokens": 762134470.0, "step": 19972 }, { "epoch": 2.5407708942882588, "ewc_loss": 0.03375648707151413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375648884684779e-05, "grad_norm": 19.255735397338867, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.877293586730957, "num_tokens": 762177163.0, "step": 19973 }, { "epoch": 2.540898104566849, "ewc_loss": 0.0338137224316597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3813721529440954e-05, "grad_norm": 19.293184280395508, "learning_rate": 1e-06, "loss": 0.3663, "mean_token_accuracy": 0.8822132349014282, "num_tokens": 762217323.0, "step": 19974 }, { "epoch": 2.5410253148454394, "ewc_loss": 0.03379201889038086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3792017347877845e-05, "grad_norm": 19.27592658996582, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8704404234886169, "num_tokens": 762252144.0, "step": 19975 }, { "epoch": 2.54115252512403, "ewc_loss": 0.0338076688349247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3807667932705954e-05, "grad_norm": 19.28290557861328, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8683031797409058, "num_tokens": 762282108.0, "step": 19976 }, { "epoch": 2.5412797354026204, "ewc_loss": 0.03382590040564537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.38259014824871e-05, "grad_norm": 19.24054718017578, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.8775933384895325, "num_tokens": 762318154.0, "step": 19977 }, { "epoch": 2.541406945681211, "ewc_loss": 0.033872928470373154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387292963452637e-05, "grad_norm": 19.37086296081543, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8716102242469788, "num_tokens": 762354704.0, "step": 19978 }, { "epoch": 2.5415341559598015, "ewc_loss": 0.03390483930706978, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.390483834664337e-05, "grad_norm": 19.30512237548828, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8770660758018494, "num_tokens": 762389943.0, "step": 19979 }, { "epoch": 2.541661366238392, "ewc_loss": 0.03385159373283386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385159288882278e-05, "grad_norm": 19.387067794799805, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.876840353012085, "num_tokens": 762421560.0, "step": 19980 }, { "epoch": 2.5417885765169825, "ewc_loss": 0.033854082226753235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385408126632683e-05, "grad_norm": 19.30173110961914, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.880579948425293, "num_tokens": 762458509.0, "step": 19981 }, { "epoch": 2.541915786795573, "ewc_loss": 0.033823080360889435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.38230820489116e-05, "grad_norm": 19.351076126098633, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8743540048599243, "num_tokens": 762502382.0, "step": 19982 }, { "epoch": 2.5420429970741636, "ewc_loss": 0.033881109207868576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3881107810884714e-05, "grad_norm": 19.295244216918945, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8703994750976562, "num_tokens": 762536611.0, "step": 19983 }, { "epoch": 2.542170207352754, "ewc_loss": 0.03385953977704048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385953823453747e-05, "grad_norm": 19.35735321044922, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8745909929275513, "num_tokens": 762577801.0, "step": 19984 }, { "epoch": 2.5422974176313446, "ewc_loss": 0.033874791115522385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.38747922796756e-05, "grad_norm": 19.255203247070312, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8755639791488647, "num_tokens": 762617413.0, "step": 19985 }, { "epoch": 2.542424627909935, "ewc_loss": 0.03383845463395119, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.383845614735037e-05, "grad_norm": 19.291454315185547, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.8831338882446289, "num_tokens": 762654811.0, "step": 19986 }, { "epoch": 2.5425518381885257, "ewc_loss": 0.033874526619911194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3874526707222685e-05, "grad_norm": 19.267301559448242, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.8790125250816345, "num_tokens": 762692902.0, "step": 19987 }, { "epoch": 2.5426790484671162, "ewc_loss": 0.03382105380296707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3821052056737244e-05, "grad_norm": 19.312780380249023, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8747260570526123, "num_tokens": 762731984.0, "step": 19988 }, { "epoch": 2.5428062587457068, "ewc_loss": 0.033859942108392715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3859942050185055e-05, "grad_norm": 19.34221076965332, "learning_rate": 1e-06, "loss": 0.3607, "mean_token_accuracy": 0.8829061388969421, "num_tokens": 762767104.0, "step": 19989 }, { "epoch": 2.5429334690242973, "ewc_loss": 0.03388135880231857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3881358831422403e-05, "grad_norm": 19.335098266601562, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8626856803894043, "num_tokens": 762804982.0, "step": 19990 }, { "epoch": 2.543060679302888, "ewc_loss": 0.03386073186993599, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3860731491586193e-05, "grad_norm": 19.30299186706543, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8641607165336609, "num_tokens": 762842956.0, "step": 19991 }, { "epoch": 2.5431878895814783, "ewc_loss": 0.03382337838411331, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3823376725194976e-05, "grad_norm": 19.356481552124023, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.8798297643661499, "num_tokens": 762875445.0, "step": 19992 }, { "epoch": 2.5433150998600684, "ewc_loss": 0.03389246389269829, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3892465580720454e-05, "grad_norm": 19.31006622314453, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.8881877660751343, "num_tokens": 762906061.0, "step": 19993 }, { "epoch": 2.5434423101386594, "ewc_loss": 0.03376465663313866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376465610926971e-05, "grad_norm": 19.30421257019043, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8820479512214661, "num_tokens": 762945550.0, "step": 19994 }, { "epoch": 2.5435695204172495, "ewc_loss": 0.03384389728307724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384389856364578e-05, "grad_norm": 19.282289505004883, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8747098445892334, "num_tokens": 762982586.0, "step": 19995 }, { "epoch": 2.5436967306958405, "ewc_loss": 0.03386200591921806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3862004784168676e-05, "grad_norm": 19.34351921081543, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8767083883285522, "num_tokens": 763013527.0, "step": 19996 }, { "epoch": 2.5438239409744305, "ewc_loss": 0.03373444452881813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.373444269527681e-05, "grad_norm": 19.199153900146484, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8694168329238892, "num_tokens": 763054363.0, "step": 19997 }, { "epoch": 2.5439511512530215, "ewc_loss": 0.03380100801587105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.380100679351017e-05, "grad_norm": 19.378311157226562, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8697235584259033, "num_tokens": 763096834.0, "step": 19998 }, { "epoch": 2.5440783615316116, "ewc_loss": 0.033904917538166046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.390491838217713e-05, "grad_norm": 19.297456741333008, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8743172883987427, "num_tokens": 763139611.0, "step": 19999 }, { "epoch": 2.544205571810202, "ewc_loss": 0.033778268843889236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3778269425965846e-05, "grad_norm": 19.342288970947266, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8755549192428589, "num_tokens": 763181085.0, "step": 20000 }, { "epoch": 2.5443327820887927, "ewc_loss": 0.033825211226940155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.382521026651375e-05, "grad_norm": 19.163597106933594, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8720769286155701, "num_tokens": 763219811.0, "step": 20001 }, { "epoch": 2.544459992367383, "ewc_loss": 0.0338563546538353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385635500308126e-05, "grad_norm": 19.402549743652344, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8631624579429626, "num_tokens": 763257745.0, "step": 20002 }, { "epoch": 2.5445872026459737, "ewc_loss": 0.033867478370666504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386747994227335e-05, "grad_norm": 19.224822998046875, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8583847880363464, "num_tokens": 763299475.0, "step": 20003 }, { "epoch": 2.5447144129245642, "ewc_loss": 0.033755358308553696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375535743543878e-05, "grad_norm": 19.23386001586914, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.852900505065918, "num_tokens": 763333960.0, "step": 20004 }, { "epoch": 2.5448416232031548, "ewc_loss": 0.033855173736810684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385517265996896e-05, "grad_norm": 19.261075973510742, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8653159141540527, "num_tokens": 763378646.0, "step": 20005 }, { "epoch": 2.5449688334817453, "ewc_loss": 0.03389540687203407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389540870557539e-05, "grad_norm": 19.320600509643555, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8783690333366394, "num_tokens": 763412441.0, "step": 20006 }, { "epoch": 2.545096043760336, "ewc_loss": 0.03384363278746605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384363299119286e-05, "grad_norm": 19.362285614013672, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8783594965934753, "num_tokens": 763446713.0, "step": 20007 }, { "epoch": 2.5452232540389264, "ewc_loss": 0.033911846578121185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391184509382583e-05, "grad_norm": 19.33169174194336, "learning_rate": 1e-06, "loss": 0.3592, "mean_token_accuracy": 0.8839472532272339, "num_tokens": 763478686.0, "step": 20008 }, { "epoch": 2.545350464317517, "ewc_loss": 0.03377968445420265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3779684599721804e-05, "grad_norm": 19.252901077270508, "learning_rate": 1e-06, "loss": 0.3489, "mean_token_accuracy": 0.8886362314224243, "num_tokens": 763517258.0, "step": 20009 }, { "epoch": 2.5454776745961074, "ewc_loss": 0.03385503217577934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385503077879548e-05, "grad_norm": 19.29422950744629, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.869192898273468, "num_tokens": 763553229.0, "step": 20010 }, { "epoch": 2.545604884874698, "ewc_loss": 0.033791329711675644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3791329769883305e-05, "grad_norm": 19.18680191040039, "learning_rate": 1e-06, "loss": 0.3731, "mean_token_accuracy": 0.8820330500602722, "num_tokens": 763594274.0, "step": 20011 }, { "epoch": 2.5457320951532885, "ewc_loss": 0.03385863080620766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.38586323778145e-05, "grad_norm": 19.23832893371582, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8671242594718933, "num_tokens": 763632162.0, "step": 20012 }, { "epoch": 2.545859305431879, "ewc_loss": 0.03388489782810211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3884898584801704e-05, "grad_norm": 19.421295166015625, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8751355409622192, "num_tokens": 763666797.0, "step": 20013 }, { "epoch": 2.5459865157104695, "ewc_loss": 0.03390681371092796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3906813769135624e-05, "grad_norm": 19.27370262145996, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8726852536201477, "num_tokens": 763705475.0, "step": 20014 }, { "epoch": 2.54611372598906, "ewc_loss": 0.03378773853182793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3787739084800705e-05, "grad_norm": 19.247888565063477, "learning_rate": 1e-06, "loss": 0.3426, "mean_token_accuracy": 0.8901565670967102, "num_tokens": 763739434.0, "step": 20015 }, { "epoch": 2.5462409362676506, "ewc_loss": 0.03390561044216156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.390560959815048e-05, "grad_norm": 19.33156967163086, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8627643585205078, "num_tokens": 763773262.0, "step": 20016 }, { "epoch": 2.546368146546241, "ewc_loss": 0.03388059884309769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388059849385172e-05, "grad_norm": 19.27465057373047, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8634233474731445, "num_tokens": 763812573.0, "step": 20017 }, { "epoch": 2.546495356824831, "ewc_loss": 0.03382543474435806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.382543582119979e-05, "grad_norm": 19.262615203857422, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8774285316467285, "num_tokens": 763848555.0, "step": 20018 }, { "epoch": 2.546622567103422, "ewc_loss": 0.033864717930555344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3864718716358766e-05, "grad_norm": 19.271039962768555, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8724740743637085, "num_tokens": 763884608.0, "step": 20019 }, { "epoch": 2.5467497773820122, "ewc_loss": 0.03390691056847572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3906911994563416e-05, "grad_norm": 19.321819305419922, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8703716993331909, "num_tokens": 763922038.0, "step": 20020 }, { "epoch": 2.546876987660603, "ewc_loss": 0.033921174705028534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3921172871487215e-05, "grad_norm": 19.262475967407227, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8714386224746704, "num_tokens": 763958089.0, "step": 20021 }, { "epoch": 2.5470041979391933, "ewc_loss": 0.03391372412443161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39137259288691e-05, "grad_norm": 19.30633544921875, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8750537633895874, "num_tokens": 764000802.0, "step": 20022 }, { "epoch": 2.547131408217784, "ewc_loss": 0.033910784870386124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391078644199297e-05, "grad_norm": 19.286378860473633, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8791744709014893, "num_tokens": 764036628.0, "step": 20023 }, { "epoch": 2.5472586184963744, "ewc_loss": 0.033920180052518845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.392017970327288e-05, "grad_norm": 19.281282424926758, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.877036452293396, "num_tokens": 764073643.0, "step": 20024 }, { "epoch": 2.547385828774965, "ewc_loss": 0.03393932804465294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39393263857346e-05, "grad_norm": 19.35478973388672, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.8803204894065857, "num_tokens": 764110473.0, "step": 20025 }, { "epoch": 2.5475130390535554, "ewc_loss": 0.03384801745414734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384801675565541e-05, "grad_norm": 19.2820987701416, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8733019828796387, "num_tokens": 764142941.0, "step": 20026 }, { "epoch": 2.547640249332146, "ewc_loss": 0.03387007117271423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3870070183184e-05, "grad_norm": 19.24750518798828, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8768914937973022, "num_tokens": 764178752.0, "step": 20027 }, { "epoch": 2.5477674596107365, "ewc_loss": 0.033886075019836426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3886073651956394e-05, "grad_norm": 19.256345748901367, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8710291385650635, "num_tokens": 764217440.0, "step": 20028 }, { "epoch": 2.547894669889327, "ewc_loss": 0.033955227583646774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39552279911004e-05, "grad_norm": 19.318864822387695, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8656356930732727, "num_tokens": 764257524.0, "step": 20029 }, { "epoch": 2.5480218801679175, "ewc_loss": 0.033913858234882355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3913856896106154e-05, "grad_norm": 19.344457626342773, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8699194192886353, "num_tokens": 764293645.0, "step": 20030 }, { "epoch": 2.548149090446508, "ewc_loss": 0.033936113119125366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393611405044794e-05, "grad_norm": 19.251171112060547, "learning_rate": 1e-06, "loss": 0.3655, "mean_token_accuracy": 0.8818913698196411, "num_tokens": 764331660.0, "step": 20031 }, { "epoch": 2.5482763007250986, "ewc_loss": 0.033835519105196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.383552029845305e-05, "grad_norm": 19.216354370117188, "learning_rate": 1e-06, "loss": 0.3377, "mean_token_accuracy": 0.8912393450737, "num_tokens": 764376577.0, "step": 20032 }, { "epoch": 2.548403511003689, "ewc_loss": 0.03391560912132263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391561040189117e-05, "grad_norm": 19.337831497192383, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8722214698791504, "num_tokens": 764415492.0, "step": 20033 }, { "epoch": 2.5485307212822796, "ewc_loss": 0.033986687660217285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.398668923182413e-05, "grad_norm": 19.291215896606445, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8656564950942993, "num_tokens": 764453948.0, "step": 20034 }, { "epoch": 2.54865793156087, "ewc_loss": 0.03385565057396889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3855649235192686e-05, "grad_norm": 19.257110595703125, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8788391351699829, "num_tokens": 764493906.0, "step": 20035 }, { "epoch": 2.5487851418394607, "ewc_loss": 0.033905208110809326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39052094204817e-05, "grad_norm": 19.361722946166992, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8686597943305969, "num_tokens": 764532379.0, "step": 20036 }, { "epoch": 2.5489123521180512, "ewc_loss": 0.03393140807747841, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3931406505871564e-05, "grad_norm": 19.294200897216797, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8570706844329834, "num_tokens": 764567726.0, "step": 20037 }, { "epoch": 2.5490395623966418, "ewc_loss": 0.03388379141688347, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388379263924435e-05, "grad_norm": 19.39922523498535, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8751325607299805, "num_tokens": 764606229.0, "step": 20038 }, { "epoch": 2.5491667726752323, "ewc_loss": 0.03391244634985924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3912445360329e-05, "grad_norm": 19.279375076293945, "learning_rate": 1e-06, "loss": 0.3534, "mean_token_accuracy": 0.8846783638000488, "num_tokens": 764641346.0, "step": 20039 }, { "epoch": 2.549293982953823, "ewc_loss": 0.03390410915017128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3904110750881955e-05, "grad_norm": 19.42661476135254, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8755922317504883, "num_tokens": 764687286.0, "step": 20040 }, { "epoch": 2.5494211932324133, "ewc_loss": 0.03395802155137062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.395802195882425e-05, "grad_norm": 19.26042366027832, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8776295781135559, "num_tokens": 764729235.0, "step": 20041 }, { "epoch": 2.549548403511004, "ewc_loss": 0.033812280744314194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3812280889833346e-05, "grad_norm": 19.318864822387695, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8676459193229675, "num_tokens": 764771834.0, "step": 20042 }, { "epoch": 2.549675613789594, "ewc_loss": 0.03393787145614624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3937871194211766e-05, "grad_norm": 19.349388122558594, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8742892742156982, "num_tokens": 764808307.0, "step": 20043 }, { "epoch": 2.549802824068185, "ewc_loss": 0.03389293700456619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3892938517965376e-05, "grad_norm": 19.380491256713867, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8668125867843628, "num_tokens": 764848373.0, "step": 20044 }, { "epoch": 2.549930034346775, "ewc_loss": 0.033878620713949203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3878619433380663e-05, "grad_norm": 19.36135482788086, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8705936670303345, "num_tokens": 764886468.0, "step": 20045 }, { "epoch": 2.550057244625366, "ewc_loss": 0.0339045450091362, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.390454367036e-05, "grad_norm": 19.33248519897461, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8758512139320374, "num_tokens": 764928690.0, "step": 20046 }, { "epoch": 2.550184454903956, "ewc_loss": 0.0338088758289814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.380887574166991e-05, "grad_norm": 19.37945556640625, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8406932950019836, "num_tokens": 764969148.0, "step": 20047 }, { "epoch": 2.5503116651825466, "ewc_loss": 0.0337994284927845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379942791070789e-05, "grad_norm": 19.31830596923828, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8675304055213928, "num_tokens": 765010754.0, "step": 20048 }, { "epoch": 2.550438875461137, "ewc_loss": 0.03380083292722702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.380083217052743e-05, "grad_norm": 19.35439109802246, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8775815367698669, "num_tokens": 765043562.0, "step": 20049 }, { "epoch": 2.5505660857397277, "ewc_loss": 0.033758923411369324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375892265466973e-05, "grad_norm": 19.25861358642578, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8565675020217896, "num_tokens": 765085638.0, "step": 20050 }, { "epoch": 2.550693296018318, "ewc_loss": 0.033809807151556015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3809807064244524e-05, "grad_norm": 19.367164611816406, "learning_rate": 1e-06, "loss": 0.3723, "mean_token_accuracy": 0.880682110786438, "num_tokens": 765115847.0, "step": 20051 }, { "epoch": 2.5508205062969087, "ewc_loss": 0.033827900886535645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3827902370831e-05, "grad_norm": 19.326051712036133, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8725217580795288, "num_tokens": 765146839.0, "step": 20052 }, { "epoch": 2.5509477165754992, "ewc_loss": 0.033774543553590775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3774544135667384e-05, "grad_norm": 19.31625747680664, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8751587271690369, "num_tokens": 765185921.0, "step": 20053 }, { "epoch": 2.5510749268540898, "ewc_loss": 0.03379851207137108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3798511140048504e-05, "grad_norm": 19.280580520629883, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8749830722808838, "num_tokens": 765224534.0, "step": 20054 }, { "epoch": 2.5512021371326803, "ewc_loss": 0.03385689854621887, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385689706192352e-05, "grad_norm": 19.33139419555664, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8828647136688232, "num_tokens": 765267160.0, "step": 20055 }, { "epoch": 2.551329347411271, "ewc_loss": 0.03378865495324135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.378865585546009e-05, "grad_norm": 19.276460647583008, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.874965250492096, "num_tokens": 765304770.0, "step": 20056 }, { "epoch": 2.5514565576898613, "ewc_loss": 0.03381839394569397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.381839269422926e-05, "grad_norm": 19.31753921508789, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8714352250099182, "num_tokens": 765341966.0, "step": 20057 }, { "epoch": 2.551583767968452, "ewc_loss": 0.033955566585063934, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.395556632312946e-05, "grad_norm": 19.371068954467773, "learning_rate": 1e-06, "loss": 0.3509, "mean_token_accuracy": 0.8887935876846313, "num_tokens": 765382653.0, "step": 20058 }, { "epoch": 2.5517109782470424, "ewc_loss": 0.03379320353269577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379320332896896e-05, "grad_norm": 19.296499252319336, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8731036186218262, "num_tokens": 765414940.0, "step": 20059 }, { "epoch": 2.551838188525633, "ewc_loss": 0.033929307013750076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.392930739209987e-05, "grad_norm": 19.37973976135254, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.876641035079956, "num_tokens": 765445549.0, "step": 20060 }, { "epoch": 2.5519653988042235, "ewc_loss": 0.03390752896666527, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.390753045096062e-05, "grad_norm": 19.36436653137207, "learning_rate": 1e-06, "loss": 0.3738, "mean_token_accuracy": 0.8819167017936707, "num_tokens": 765484474.0, "step": 20061 }, { "epoch": 2.552092609082814, "ewc_loss": 0.033801041543483734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.380104317329824e-05, "grad_norm": 19.273786544799805, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8742351531982422, "num_tokens": 765522651.0, "step": 20062 }, { "epoch": 2.5522198193614045, "ewc_loss": 0.03389568626880646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389568519196473e-05, "grad_norm": 19.326040267944336, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8691314458847046, "num_tokens": 765565983.0, "step": 20063 }, { "epoch": 2.552347029639995, "ewc_loss": 0.03389805927872658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389806079212576e-05, "grad_norm": 19.311315536499023, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8753068447113037, "num_tokens": 765604056.0, "step": 20064 }, { "epoch": 2.5524742399185856, "ewc_loss": 0.033848293125629425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384829324204475e-05, "grad_norm": 19.32784652709961, "learning_rate": 1e-06, "loss": 0.3643, "mean_token_accuracy": 0.8840056657791138, "num_tokens": 765646062.0, "step": 20065 }, { "epoch": 2.5526014501971757, "ewc_loss": 0.03389577940106392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3895779779413715e-05, "grad_norm": 19.340591430664062, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.873887836933136, "num_tokens": 765687284.0, "step": 20066 }, { "epoch": 2.5527286604757666, "ewc_loss": 0.03380456566810608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3804564736783504e-05, "grad_norm": 19.28790855407715, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8605920076370239, "num_tokens": 765723743.0, "step": 20067 }, { "epoch": 2.5528558707543567, "ewc_loss": 0.03382117301225662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.382117211003788e-05, "grad_norm": 19.318452835083008, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8710612058639526, "num_tokens": 765768472.0, "step": 20068 }, { "epoch": 2.5529830810329477, "ewc_loss": 0.03383635729551315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.383635703357868e-05, "grad_norm": 19.280370712280273, "learning_rate": 1e-06, "loss": 0.347, "mean_token_accuracy": 0.8891887664794922, "num_tokens": 765809040.0, "step": 20069 }, { "epoch": 2.5531102913115378, "ewc_loss": 0.0338616706430912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386167009011842e-05, "grad_norm": 19.3615665435791, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8748186230659485, "num_tokens": 765845968.0, "step": 20070 }, { "epoch": 2.5532375015901287, "ewc_loss": 0.03385782241821289, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385782110854052e-05, "grad_norm": 19.259159088134766, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8791184425354004, "num_tokens": 765881037.0, "step": 20071 }, { "epoch": 2.553364711868719, "ewc_loss": 0.033864494413137436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3864493161672726e-05, "grad_norm": 19.408220291137695, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8785068988800049, "num_tokens": 765918945.0, "step": 20072 }, { "epoch": 2.5534919221473094, "ewc_loss": 0.033873919397592545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.38739191647619e-05, "grad_norm": 19.328174591064453, "learning_rate": 1e-06, "loss": 0.3445, "mean_token_accuracy": 0.8867601156234741, "num_tokens": 765956128.0, "step": 20073 }, { "epoch": 2.5536191324259, "ewc_loss": 0.03380463644862175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3804637496359646e-05, "grad_norm": 19.377269744873047, "learning_rate": 1e-06, "loss": 0.348, "mean_token_accuracy": 0.8876112103462219, "num_tokens": 765997109.0, "step": 20074 }, { "epoch": 2.5537463427044904, "ewc_loss": 0.03377152979373932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3771528251236305e-05, "grad_norm": 19.30816078186035, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.872569739818573, "num_tokens": 766032557.0, "step": 20075 }, { "epoch": 2.553873552983081, "ewc_loss": 0.033738963305950165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.373896470293403e-05, "grad_norm": 19.297266006469727, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8701362013816833, "num_tokens": 766075285.0, "step": 20076 }, { "epoch": 2.5540007632616715, "ewc_loss": 0.033803533762693405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.38035351887811e-05, "grad_norm": 19.392051696777344, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8723254203796387, "num_tokens": 766110600.0, "step": 20077 }, { "epoch": 2.554127973540262, "ewc_loss": 0.03385577350854874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385577292647213e-05, "grad_norm": 19.29145622253418, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8637939691543579, "num_tokens": 766146877.0, "step": 20078 }, { "epoch": 2.5542551838188525, "ewc_loss": 0.033784471452236176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.378447217983194e-05, "grad_norm": 19.398473739624023, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8691741824150085, "num_tokens": 766185164.0, "step": 20079 }, { "epoch": 2.554382394097443, "ewc_loss": 0.03387363255023956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387363176443614e-05, "grad_norm": 19.33148765563965, "learning_rate": 1e-06, "loss": 0.3374, "mean_token_accuracy": 0.8932528495788574, "num_tokens": 766221442.0, "step": 20080 }, { "epoch": 2.5545096043760336, "ewc_loss": 0.03378986939787865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.378987094038166e-05, "grad_norm": 19.436948776245117, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8526078462600708, "num_tokens": 766255258.0, "step": 20081 }, { "epoch": 2.554636814654624, "ewc_loss": 0.033855170011520386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385516902199015e-05, "grad_norm": 19.367773056030273, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8726344704627991, "num_tokens": 766292392.0, "step": 20082 }, { "epoch": 2.5547640249332146, "ewc_loss": 0.033749863505363464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3749864087440073e-05, "grad_norm": 19.355518341064453, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8652878999710083, "num_tokens": 766335312.0, "step": 20083 }, { "epoch": 2.554891235211805, "ewc_loss": 0.03377841040492058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.377841130713932e-05, "grad_norm": 19.32887077331543, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8752654194831848, "num_tokens": 766372189.0, "step": 20084 }, { "epoch": 2.5550184454903957, "ewc_loss": 0.03380518779158592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.380518683115952e-05, "grad_norm": 19.44908332824707, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8633676767349243, "num_tokens": 766414021.0, "step": 20085 }, { "epoch": 2.5551456557689862, "ewc_loss": 0.03379932790994644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379932604730129e-05, "grad_norm": 19.414392471313477, "learning_rate": 1e-06, "loss": 0.3494, "mean_token_accuracy": 0.8875241279602051, "num_tokens": 766452005.0, "step": 20086 }, { "epoch": 2.5552728660475768, "ewc_loss": 0.03370857611298561, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.370857666595839e-05, "grad_norm": 19.36324119567871, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8684477806091309, "num_tokens": 766492150.0, "step": 20087 }, { "epoch": 2.5554000763261673, "ewc_loss": 0.033761147409677505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376114909769967e-05, "grad_norm": 19.371883392333984, "learning_rate": 1e-06, "loss": 0.3604, "mean_token_accuracy": 0.8847960233688354, "num_tokens": 766533889.0, "step": 20088 }, { "epoch": 2.555527286604758, "ewc_loss": 0.0337284691631794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.372846913407557e-05, "grad_norm": 19.3619384765625, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8739805817604065, "num_tokens": 766565008.0, "step": 20089 }, { "epoch": 2.5556544968833483, "ewc_loss": 0.03376128897070885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376128734089434e-05, "grad_norm": 19.430252075195312, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8749529719352722, "num_tokens": 766605744.0, "step": 20090 }, { "epoch": 2.5557817071619384, "ewc_loss": 0.03368402272462845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.368402394698933e-05, "grad_norm": 19.28317642211914, "learning_rate": 1e-06, "loss": 0.365, "mean_token_accuracy": 0.88241046667099, "num_tokens": 766645782.0, "step": 20091 }, { "epoch": 2.5559089174405294, "ewc_loss": 0.0337497852742672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.374978405190632e-05, "grad_norm": 19.391029357910156, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8648242354393005, "num_tokens": 766683135.0, "step": 20092 }, { "epoch": 2.5560361277191195, "ewc_loss": 0.03384602814912796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384602678124793e-05, "grad_norm": 19.356704711914062, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8748816251754761, "num_tokens": 766720691.0, "step": 20093 }, { "epoch": 2.5561633379977104, "ewc_loss": 0.033732037991285324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3732037991285324e-05, "grad_norm": 19.28649139404297, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8693643808364868, "num_tokens": 766755426.0, "step": 20094 }, { "epoch": 2.5562905482763005, "ewc_loss": 0.03380754217505455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3807540603447706e-05, "grad_norm": 19.41620635986328, "learning_rate": 1e-06, "loss": 0.3647, "mean_token_accuracy": 0.8798917531967163, "num_tokens": 766788315.0, "step": 20095 }, { "epoch": 2.5564177585548915, "ewc_loss": 0.03382362052798271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.382362046977505e-05, "grad_norm": 19.267303466796875, "learning_rate": 1e-06, "loss": 0.3457, "mean_token_accuracy": 0.8897042870521545, "num_tokens": 766825664.0, "step": 20096 }, { "epoch": 2.5565449688334816, "ewc_loss": 0.03375202789902687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.375202868483029e-05, "grad_norm": 19.44183349609375, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8865585327148438, "num_tokens": 766868145.0, "step": 20097 }, { "epoch": 2.556672179112072, "ewc_loss": 0.033892519772052765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389252015040256e-05, "grad_norm": 19.370243072509766, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8627731800079346, "num_tokens": 766907278.0, "step": 20098 }, { "epoch": 2.5567993893906626, "ewc_loss": 0.03381049633026123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3810494642239064e-05, "grad_norm": 19.45140838623047, "learning_rate": 1e-06, "loss": 0.3534, "mean_token_accuracy": 0.8867940902709961, "num_tokens": 766941662.0, "step": 20099 }, { "epoch": 2.556926599669253, "ewc_loss": 0.03386324644088745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3863245334941894e-05, "grad_norm": 19.368297576904297, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8572795391082764, "num_tokens": 766977725.0, "step": 20100 }, { "epoch": 2.5570538099478437, "ewc_loss": 0.033812567591667175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3812568290159106e-05, "grad_norm": 19.411338806152344, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8704885244369507, "num_tokens": 767011011.0, "step": 20101 }, { "epoch": 2.5571810202264342, "ewc_loss": 0.033889997750520706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3889999031089246e-05, "grad_norm": 19.374574661254883, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8718134164810181, "num_tokens": 767047078.0, "step": 20102 }, { "epoch": 2.5573082305050248, "ewc_loss": 0.03379758819937706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37975870934315e-05, "grad_norm": 19.39556884765625, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8703306913375854, "num_tokens": 767089208.0, "step": 20103 }, { "epoch": 2.5574354407836153, "ewc_loss": 0.03386800363659859, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386800381122157e-05, "grad_norm": 19.263442993164062, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8590703010559082, "num_tokens": 767131225.0, "step": 20104 }, { "epoch": 2.557562651062206, "ewc_loss": 0.03382500633597374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.382500653970055e-05, "grad_norm": 19.457630157470703, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8637128472328186, "num_tokens": 767165222.0, "step": 20105 }, { "epoch": 2.5576898613407963, "ewc_loss": 0.0339333713054657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3933371014427394e-05, "grad_norm": 19.380998611450195, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8749713897705078, "num_tokens": 767200870.0, "step": 20106 }, { "epoch": 2.557817071619387, "ewc_loss": 0.03382093831896782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3820939279394224e-05, "grad_norm": 19.36669158935547, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8732556104660034, "num_tokens": 767236881.0, "step": 20107 }, { "epoch": 2.5579442818979774, "ewc_loss": 0.03389130160212517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3891301427502185e-05, "grad_norm": 19.394596099853516, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.87975013256073, "num_tokens": 767269716.0, "step": 20108 }, { "epoch": 2.558071492176568, "ewc_loss": 0.0338764488697052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387644756003283e-05, "grad_norm": 19.2916202545166, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.865281879901886, "num_tokens": 767306840.0, "step": 20109 }, { "epoch": 2.5581987024551585, "ewc_loss": 0.03390715271234512, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3907152101164684e-05, "grad_norm": 19.342763900756836, "learning_rate": 1e-06, "loss": 0.3624, "mean_token_accuracy": 0.8857729434967041, "num_tokens": 767346966.0, "step": 20110 }, { "epoch": 2.558325912733749, "ewc_loss": 0.0339670293033123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3967029594350606e-05, "grad_norm": 19.39019203186035, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8699043989181519, "num_tokens": 767387084.0, "step": 20111 }, { "epoch": 2.5584531230123395, "ewc_loss": 0.03389640897512436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389640914974734e-05, "grad_norm": 19.3665714263916, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8562573790550232, "num_tokens": 767417857.0, "step": 20112 }, { "epoch": 2.55858033329093, "ewc_loss": 0.0339110791683197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3911077480297536e-05, "grad_norm": 19.423480987548828, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8761910200119019, "num_tokens": 767455771.0, "step": 20113 }, { "epoch": 2.5587075435695206, "ewc_loss": 0.033949241042137146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394923987798393e-05, "grad_norm": 19.437814712524414, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.860898494720459, "num_tokens": 767493187.0, "step": 20114 }, { "epoch": 2.558834753848111, "ewc_loss": 0.03386472165584564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386472235433757e-05, "grad_norm": 19.40053939819336, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.866939902305603, "num_tokens": 767534526.0, "step": 20115 }, { "epoch": 2.558961964126701, "ewc_loss": 0.033941540867090225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394154191482812e-05, "grad_norm": 19.385080337524414, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8723396062850952, "num_tokens": 767569297.0, "step": 20116 }, { "epoch": 2.559089174405292, "ewc_loss": 0.03387760370969772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3877604437293485e-05, "grad_norm": 19.407007217407227, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8709414005279541, "num_tokens": 767605860.0, "step": 20117 }, { "epoch": 2.5592163846838822, "ewc_loss": 0.033971287310123444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3971286029554904e-05, "grad_norm": 19.410146713256836, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.876702070236206, "num_tokens": 767645252.0, "step": 20118 }, { "epoch": 2.559343594962473, "ewc_loss": 0.03386978060007095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386977914487943e-05, "grad_norm": 19.321256637573242, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8743488788604736, "num_tokens": 767681942.0, "step": 20119 }, { "epoch": 2.5594708052410633, "ewc_loss": 0.03388013318181038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3880132832564414e-05, "grad_norm": 19.362815856933594, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8728188872337341, "num_tokens": 767720631.0, "step": 20120 }, { "epoch": 2.559598015519654, "ewc_loss": 0.03396058827638626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.396058673388325e-05, "grad_norm": 19.38275718688965, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8781677484512329, "num_tokens": 767757018.0, "step": 20121 }, { "epoch": 2.5597252257982444, "ewc_loss": 0.033869512379169464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386951357242651e-05, "grad_norm": 19.489608764648438, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8773519992828369, "num_tokens": 767796046.0, "step": 20122 }, { "epoch": 2.559852436076835, "ewc_loss": 0.03391794115304947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391794234630652e-05, "grad_norm": 19.322986602783203, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8840161561965942, "num_tokens": 767835678.0, "step": 20123 }, { "epoch": 2.5599796463554254, "ewc_loss": 0.03382469341158867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.382469367352314e-05, "grad_norm": 19.405038833618164, "learning_rate": 1e-06, "loss": 0.3524, "mean_token_accuracy": 0.8887515068054199, "num_tokens": 767870515.0, "step": 20124 }, { "epoch": 2.560106856634016, "ewc_loss": 0.03392516449093819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3925163734238595e-05, "grad_norm": 19.333723068237305, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8585972189903259, "num_tokens": 767911573.0, "step": 20125 }, { "epoch": 2.5602340669126065, "ewc_loss": 0.03385893255472183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385893069207668e-05, "grad_norm": 19.399776458740234, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8730001449584961, "num_tokens": 767950161.0, "step": 20126 }, { "epoch": 2.560361277191197, "ewc_loss": 0.03393632173538208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3936321415239945e-05, "grad_norm": 19.45812225341797, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8722161650657654, "num_tokens": 767989431.0, "step": 20127 }, { "epoch": 2.5604884874697875, "ewc_loss": 0.03386521711945534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386521711945534e-05, "grad_norm": 19.378246307373047, "learning_rate": 1e-06, "loss": 0.3602, "mean_token_accuracy": 0.8839956521987915, "num_tokens": 768023551.0, "step": 20128 }, { "epoch": 2.560615697748378, "ewc_loss": 0.03378988429903984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.378988549229689e-05, "grad_norm": 19.288171768188477, "learning_rate": 1e-06, "loss": 0.368, "mean_token_accuracy": 0.8828071355819702, "num_tokens": 768055317.0, "step": 20129 }, { "epoch": 2.5607429080269686, "ewc_loss": 0.03393079340457916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3930791687453166e-05, "grad_norm": 19.41324234008789, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8683570623397827, "num_tokens": 768089575.0, "step": 20130 }, { "epoch": 2.560870118305559, "ewc_loss": 0.03389909118413925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389909034012817e-05, "grad_norm": 19.33319091796875, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8651408553123474, "num_tokens": 768131795.0, "step": 20131 }, { "epoch": 2.5609973285841496, "ewc_loss": 0.03385326266288757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3853262721095234e-05, "grad_norm": 19.3803768157959, "learning_rate": 1e-06, "loss": 0.3753, "mean_token_accuracy": 0.8780632019042969, "num_tokens": 768165714.0, "step": 20132 }, { "epoch": 2.56112453886274, "ewc_loss": 0.03387720510363579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3877204259624705e-05, "grad_norm": 19.33370590209961, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8822289705276489, "num_tokens": 768203475.0, "step": 20133 }, { "epoch": 2.5612517491413307, "ewc_loss": 0.03388182073831558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3881820854730904e-05, "grad_norm": 19.428375244140625, "learning_rate": 1e-06, "loss": 0.3542, "mean_token_accuracy": 0.8898870944976807, "num_tokens": 768244841.0, "step": 20134 }, { "epoch": 2.561378959419921, "ewc_loss": 0.03392820060253143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3928201446542516e-05, "grad_norm": 19.361454010009766, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8689850568771362, "num_tokens": 768286502.0, "step": 20135 }, { "epoch": 2.5615061696985117, "ewc_loss": 0.033777959644794464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.377796019776724e-05, "grad_norm": 19.306320190429688, "learning_rate": 1e-06, "loss": 0.3651, "mean_token_accuracy": 0.8819976449012756, "num_tokens": 768320628.0, "step": 20136 }, { "epoch": 2.5616333799771023, "ewc_loss": 0.033897045999765396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389704579603858e-05, "grad_norm": 19.43456268310547, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8683750033378601, "num_tokens": 768353630.0, "step": 20137 }, { "epoch": 2.561760590255693, "ewc_loss": 0.03385145217180252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.38514510076493e-05, "grad_norm": 19.418323516845703, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8749032020568848, "num_tokens": 768388170.0, "step": 20138 }, { "epoch": 2.5618878005342833, "ewc_loss": 0.03381090238690376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.381090209586546e-05, "grad_norm": 19.45439910888672, "learning_rate": 1e-06, "loss": 0.3427, "mean_token_accuracy": 0.8911360502243042, "num_tokens": 768428238.0, "step": 20139 }, { "epoch": 2.562015010812874, "ewc_loss": 0.03388058394193649, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388058394193649e-05, "grad_norm": 19.40195655822754, "learning_rate": 1e-06, "loss": 0.3516, "mean_token_accuracy": 0.88720703125, "num_tokens": 768460075.0, "step": 20140 }, { "epoch": 2.562142221091464, "ewc_loss": 0.033800460398197174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3800461096689105e-05, "grad_norm": 19.46293067932129, "learning_rate": 1e-06, "loss": 0.3322, "mean_token_accuracy": 0.8918837904930115, "num_tokens": 768493982.0, "step": 20141 }, { "epoch": 2.562269431370055, "ewc_loss": 0.03384861350059509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384861338417977e-05, "grad_norm": 19.30959701538086, "learning_rate": 1e-06, "loss": 0.3763, "mean_token_accuracy": 0.8787449598312378, "num_tokens": 768529121.0, "step": 20142 }, { "epoch": 2.562396641648645, "ewc_loss": 0.033794376999139786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379437839612365e-05, "grad_norm": 19.509159088134766, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8752837181091309, "num_tokens": 768560730.0, "step": 20143 }, { "epoch": 2.562523851927236, "ewc_loss": 0.03393629565834999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3936295949388295e-05, "grad_norm": 19.32550621032715, "learning_rate": 1e-06, "loss": 0.3558, "mean_token_accuracy": 0.885757565498352, "num_tokens": 768603015.0, "step": 20144 }, { "epoch": 2.562651062205826, "ewc_loss": 0.033729664981365204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37296660291031e-05, "grad_norm": 19.372220993041992, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8735942840576172, "num_tokens": 768639847.0, "step": 20145 }, { "epoch": 2.5627782724844166, "ewc_loss": 0.03391309827566147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391309655853547e-05, "grad_norm": 19.333471298217773, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8719133734703064, "num_tokens": 768681614.0, "step": 20146 }, { "epoch": 2.562905482763007, "ewc_loss": 0.03381029888987541, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.381029819138348e-05, "grad_norm": 19.363269805908203, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8788689970970154, "num_tokens": 768722052.0, "step": 20147 }, { "epoch": 2.5630326930415976, "ewc_loss": 0.03387029096484184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387029209989123e-05, "grad_norm": 19.415569305419922, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8681529760360718, "num_tokens": 768754191.0, "step": 20148 }, { "epoch": 2.563159903320188, "ewc_loss": 0.03394194692373276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394194573047571e-05, "grad_norm": 19.43739891052246, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8645567893981934, "num_tokens": 768792981.0, "step": 20149 }, { "epoch": 2.5632871135987787, "ewc_loss": 0.03384644165635109, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384644151083194e-05, "grad_norm": 19.336774826049805, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.865778923034668, "num_tokens": 768827040.0, "step": 20150 }, { "epoch": 2.5634143238773692, "ewc_loss": 0.033876895904541016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.38768950314261e-05, "grad_norm": 19.412582397460938, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8729617595672607, "num_tokens": 768865669.0, "step": 20151 }, { "epoch": 2.5635415341559598, "ewc_loss": 0.0338587686419487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3858766983030364e-05, "grad_norm": 19.294063568115234, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8742614984512329, "num_tokens": 768903306.0, "step": 20152 }, { "epoch": 2.5636687444345503, "ewc_loss": 0.03387262672185898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387262768228538e-05, "grad_norm": 19.34625244140625, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8797855377197266, "num_tokens": 768946322.0, "step": 20153 }, { "epoch": 2.563795954713141, "ewc_loss": 0.03390013799071312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.390013807802461e-05, "grad_norm": 19.310834884643555, "learning_rate": 1e-06, "loss": 0.3325, "mean_token_accuracy": 0.8910695910453796, "num_tokens": 768987350.0, "step": 20154 }, { "epoch": 2.5639231649917313, "ewc_loss": 0.033868011087179184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3868011087179184e-05, "grad_norm": 19.37908363342285, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8826382756233215, "num_tokens": 769030762.0, "step": 20155 }, { "epoch": 2.564050375270322, "ewc_loss": 0.03388582915067673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388582990737632e-05, "grad_norm": 19.275442123413086, "learning_rate": 1e-06, "loss": 0.3578, "mean_token_accuracy": 0.8852194547653198, "num_tokens": 769069827.0, "step": 20156 }, { "epoch": 2.5641775855489124, "ewc_loss": 0.03386785462498665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386785465409048e-05, "grad_norm": 19.392210006713867, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8748664259910583, "num_tokens": 769107742.0, "step": 20157 }, { "epoch": 2.564304795827503, "ewc_loss": 0.03393895551562309, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393895531189628e-05, "grad_norm": 19.369464874267578, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8733354210853577, "num_tokens": 769153286.0, "step": 20158 }, { "epoch": 2.5644320061060935, "ewc_loss": 0.03383873030543327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.38387289957609e-05, "grad_norm": 19.444284439086914, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.876231849193573, "num_tokens": 769192646.0, "step": 20159 }, { "epoch": 2.564559216384684, "ewc_loss": 0.03386131674051285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3861317206174135e-05, "grad_norm": 19.35532569885254, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8532042503356934, "num_tokens": 769230360.0, "step": 20160 }, { "epoch": 2.5646864266632745, "ewc_loss": 0.03379516676068306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3795167837524787e-05, "grad_norm": 19.41292381286621, "learning_rate": 1e-06, "loss": 0.3508, "mean_token_accuracy": 0.8904653191566467, "num_tokens": 769270754.0, "step": 20161 }, { "epoch": 2.564813636941865, "ewc_loss": 0.03382742404937744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.382742579560727e-05, "grad_norm": 19.452348709106445, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8521524667739868, "num_tokens": 769310109.0, "step": 20162 }, { "epoch": 2.5649408472204556, "ewc_loss": 0.0338108129799366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.381081478437409e-05, "grad_norm": 19.31572914123535, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8733526468276978, "num_tokens": 769352101.0, "step": 20163 }, { "epoch": 2.5650680574990457, "ewc_loss": 0.03376030921936035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3760308724595234e-05, "grad_norm": 19.410484313964844, "learning_rate": 1e-06, "loss": 0.3508, "mean_token_accuracy": 0.8881192803382874, "num_tokens": 769386570.0, "step": 20164 }, { "epoch": 2.5651952677776366, "ewc_loss": 0.03384944424033165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384944284334779e-05, "grad_norm": 19.321765899658203, "learning_rate": 1e-06, "loss": 0.3609, "mean_token_accuracy": 0.8862703442573547, "num_tokens": 769427167.0, "step": 20165 }, { "epoch": 2.5653224780562267, "ewc_loss": 0.03376030921936035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3760308724595234e-05, "grad_norm": 19.398984909057617, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8654725551605225, "num_tokens": 769465456.0, "step": 20166 }, { "epoch": 2.5654496883348177, "ewc_loss": 0.033834390342235565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.383438888704404e-05, "grad_norm": 19.359174728393555, "learning_rate": 1e-06, "loss": 0.349, "mean_token_accuracy": 0.8867843151092529, "num_tokens": 769497135.0, "step": 20167 }, { "epoch": 2.5655768986134078, "ewc_loss": 0.033782701939344406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.378270048415288e-05, "grad_norm": 19.408977508544922, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8700039386749268, "num_tokens": 769533706.0, "step": 20168 }, { "epoch": 2.5657041088919987, "ewc_loss": 0.033818382769823074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.381838178029284e-05, "grad_norm": 19.36353302001953, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8693677186965942, "num_tokens": 769578698.0, "step": 20169 }, { "epoch": 2.565831319170589, "ewc_loss": 0.03377200663089752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3772004826460034e-05, "grad_norm": 19.363929748535156, "learning_rate": 1e-06, "loss": 0.3763, "mean_token_accuracy": 0.887068510055542, "num_tokens": 769619799.0, "step": 20170 }, { "epoch": 2.5659585294491793, "ewc_loss": 0.03380472958087921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3804728445829824e-05, "grad_norm": 19.378190994262695, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8748769164085388, "num_tokens": 769655973.0, "step": 20171 }, { "epoch": 2.56608573972777, "ewc_loss": 0.0338122583925724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3812259061960503e-05, "grad_norm": 19.297582626342773, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8712754249572754, "num_tokens": 769692333.0, "step": 20172 }, { "epoch": 2.5662129500063604, "ewc_loss": 0.03382250294089317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3822503610281274e-05, "grad_norm": 19.42485237121582, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8794889450073242, "num_tokens": 769731804.0, "step": 20173 }, { "epoch": 2.566340160284951, "ewc_loss": 0.03386804088950157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386804019100964e-05, "grad_norm": 19.299030303955078, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8807470798492432, "num_tokens": 769767409.0, "step": 20174 }, { "epoch": 2.5664673705635415, "ewc_loss": 0.03377009555697441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.377009488758631e-05, "grad_norm": 19.42629051208496, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8724923133850098, "num_tokens": 769806468.0, "step": 20175 }, { "epoch": 2.566594580842132, "ewc_loss": 0.0338796004652977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387960168765858e-05, "grad_norm": 19.356355667114258, "learning_rate": 1e-06, "loss": 0.3581, "mean_token_accuracy": 0.8832772374153137, "num_tokens": 769842025.0, "step": 20176 }, { "epoch": 2.5667217911207225, "ewc_loss": 0.03378306329250336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.378306428203359e-05, "grad_norm": 19.420949935913086, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8703985214233398, "num_tokens": 769884125.0, "step": 20177 }, { "epoch": 2.566849001399313, "ewc_loss": 0.03384273126721382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3842730772448704e-05, "grad_norm": 19.403711318969727, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8649563193321228, "num_tokens": 769920523.0, "step": 20178 }, { "epoch": 2.5669762116779036, "ewc_loss": 0.03375199809670448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3751999580999836e-05, "grad_norm": 19.38727378845215, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8730425834655762, "num_tokens": 769960141.0, "step": 20179 }, { "epoch": 2.567103421956494, "ewc_loss": 0.033857595175504684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385759555385448e-05, "grad_norm": 19.410009384155273, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8764460682868958, "num_tokens": 769995281.0, "step": 20180 }, { "epoch": 2.5672306322350846, "ewc_loss": 0.033746395260095596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.374639345565811e-05, "grad_norm": 19.37018585205078, "learning_rate": 1e-06, "loss": 0.3782, "mean_token_accuracy": 0.877893328666687, "num_tokens": 770028230.0, "step": 20181 }, { "epoch": 2.567357842513675, "ewc_loss": 0.03383788838982582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3837888622656465e-05, "grad_norm": 19.392650604248047, "learning_rate": 1e-06, "loss": 0.3634, "mean_token_accuracy": 0.8810803890228271, "num_tokens": 770068996.0, "step": 20182 }, { "epoch": 2.5674850527922657, "ewc_loss": 0.033811867237091064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.381186616024934e-05, "grad_norm": 19.254173278808594, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8643442392349243, "num_tokens": 770107018.0, "step": 20183 }, { "epoch": 2.567612263070856, "ewc_loss": 0.03376217558979988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.376217500772327e-05, "grad_norm": 19.399686813354492, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.873680591583252, "num_tokens": 770140251.0, "step": 20184 }, { "epoch": 2.5677394733494467, "ewc_loss": 0.03383929282426834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3839292882476e-05, "grad_norm": 19.258501052856445, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8667014837265015, "num_tokens": 770182241.0, "step": 20185 }, { "epoch": 2.5678666836280373, "ewc_loss": 0.03374767675995827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.374767766217701e-05, "grad_norm": 19.398155212402344, "learning_rate": 1e-06, "loss": 0.3482, "mean_token_accuracy": 0.8883812427520752, "num_tokens": 770223267.0, "step": 20186 }, { "epoch": 2.567993893906628, "ewc_loss": 0.03395381569862366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.395381645532325e-05, "grad_norm": 19.32686996459961, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8644449710845947, "num_tokens": 770261856.0, "step": 20187 }, { "epoch": 2.5681211041852183, "ewc_loss": 0.03386613726615906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386613752809353e-05, "grad_norm": 19.329057693481445, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8644614219665527, "num_tokens": 770299762.0, "step": 20188 }, { "epoch": 2.5682483144638084, "ewc_loss": 0.03386029228568077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3860291296150535e-05, "grad_norm": 19.321550369262695, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8698572516441345, "num_tokens": 770333690.0, "step": 20189 }, { "epoch": 2.5683755247423994, "ewc_loss": 0.03388546034693718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3885458833537996e-05, "grad_norm": 19.355640411376953, "learning_rate": 1e-06, "loss": 0.3675, "mean_token_accuracy": 0.8807373046875, "num_tokens": 770375755.0, "step": 20190 }, { "epoch": 2.5685027350209895, "ewc_loss": 0.033949341624975204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3949341741390526e-05, "grad_norm": 19.273815155029297, "learning_rate": 1e-06, "loss": 0.3672, "mean_token_accuracy": 0.8835786581039429, "num_tokens": 770417805.0, "step": 20191 }, { "epoch": 2.5686299452995804, "ewc_loss": 0.033887431025505066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388743061805144e-05, "grad_norm": 19.32537269592285, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8733499050140381, "num_tokens": 770458183.0, "step": 20192 }, { "epoch": 2.5687571555781705, "ewc_loss": 0.033939626067876816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3939624699996784e-05, "grad_norm": 19.40009307861328, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8779265880584717, "num_tokens": 770488973.0, "step": 20193 }, { "epoch": 2.5688843658567615, "ewc_loss": 0.03383679315447807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.383679359103553e-05, "grad_norm": 19.20033836364746, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8693154454231262, "num_tokens": 770529030.0, "step": 20194 }, { "epoch": 2.5690115761353516, "ewc_loss": 0.033876631408929825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387663309695199e-05, "grad_norm": 19.381086349487305, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8639770746231079, "num_tokens": 770574697.0, "step": 20195 }, { "epoch": 2.569138786413942, "ewc_loss": 0.033997487276792526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.399748675292358e-05, "grad_norm": 19.334665298461914, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8802992105484009, "num_tokens": 770615417.0, "step": 20196 }, { "epoch": 2.5692659966925326, "ewc_loss": 0.03384992852807045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384993033250794e-05, "grad_norm": 19.32600975036621, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8615027666091919, "num_tokens": 770653228.0, "step": 20197 }, { "epoch": 2.569393206971123, "ewc_loss": 0.03396298363804817, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.396298416191712e-05, "grad_norm": 19.433584213256836, "learning_rate": 1e-06, "loss": 0.3539, "mean_token_accuracy": 0.8889954090118408, "num_tokens": 770693876.0, "step": 20198 }, { "epoch": 2.5695204172497137, "ewc_loss": 0.03383645415306091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.383645525900647e-05, "grad_norm": 19.290260314941406, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8745476007461548, "num_tokens": 770728754.0, "step": 20199 }, { "epoch": 2.5696476275283042, "ewc_loss": 0.03389252722263336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3892527426360175e-05, "grad_norm": 19.380725860595703, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8726391196250916, "num_tokens": 770767961.0, "step": 20200 }, { "epoch": 2.5697748378068948, "ewc_loss": 0.03390902280807495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.390902202227153e-05, "grad_norm": 19.39444923400879, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8486271500587463, "num_tokens": 770808880.0, "step": 20201 }, { "epoch": 2.5699020480854853, "ewc_loss": 0.03389664366841316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3896641980390996e-05, "grad_norm": 19.409095764160156, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8688688278198242, "num_tokens": 770846483.0, "step": 20202 }, { "epoch": 2.570029258364076, "ewc_loss": 0.033867835998535156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3867836464196444e-05, "grad_norm": 19.308650970458984, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8546273708343506, "num_tokens": 770884234.0, "step": 20203 }, { "epoch": 2.5701564686426663, "ewc_loss": 0.03389522805809975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389522680663504e-05, "grad_norm": 19.444604873657227, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8604200482368469, "num_tokens": 770918785.0, "step": 20204 }, { "epoch": 2.570283678921257, "ewc_loss": 0.03391581028699875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391581049072556e-05, "grad_norm": 19.387104034423828, "learning_rate": 1e-06, "loss": 0.3271, "mean_token_accuracy": 0.8953967690467834, "num_tokens": 770956889.0, "step": 20205 }, { "epoch": 2.5704108891998474, "ewc_loss": 0.0338483564555645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384835508768447e-05, "grad_norm": 19.341856002807617, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8609377145767212, "num_tokens": 770994937.0, "step": 20206 }, { "epoch": 2.570538099478438, "ewc_loss": 0.03394022956490517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394022860447876e-05, "grad_norm": 19.400638580322266, "learning_rate": 1e-06, "loss": 0.3667, "mean_token_accuracy": 0.8826315402984619, "num_tokens": 771031451.0, "step": 20207 }, { "epoch": 2.5706653097570284, "ewc_loss": 0.03394171595573425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394171653781086e-05, "grad_norm": 19.403139114379883, "learning_rate": 1e-06, "loss": 0.3684, "mean_token_accuracy": 0.881669282913208, "num_tokens": 771061575.0, "step": 20208 }, { "epoch": 2.570792520035619, "ewc_loss": 0.033864956349134445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386495518498123e-05, "grad_norm": 19.366592407226562, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8738289475440979, "num_tokens": 771103366.0, "step": 20209 }, { "epoch": 2.5709197303142095, "ewc_loss": 0.03392753005027771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3927528420463204e-05, "grad_norm": 19.385229110717773, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.8785421252250671, "num_tokens": 771142178.0, "step": 20210 }, { "epoch": 2.5710469405928, "ewc_loss": 0.03389909118413925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389909034012817e-05, "grad_norm": 19.335294723510742, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8761743903160095, "num_tokens": 771179296.0, "step": 20211 }, { "epoch": 2.5711741508713906, "ewc_loss": 0.03392409533262253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.392409416846931e-05, "grad_norm": 19.37296485900879, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8680307865142822, "num_tokens": 771219084.0, "step": 20212 }, { "epoch": 2.571301361149981, "ewc_loss": 0.03393523767590523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393523729755543e-05, "grad_norm": 19.379074096679688, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8818830251693726, "num_tokens": 771250609.0, "step": 20213 }, { "epoch": 2.571428571428571, "ewc_loss": 0.03397202119231224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3972020901273936e-05, "grad_norm": 19.40285873413086, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8725138306617737, "num_tokens": 771286191.0, "step": 20214 }, { "epoch": 2.571555781707162, "ewc_loss": 0.03394714370369911, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394714440219104e-05, "grad_norm": 19.37801170349121, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8728112578392029, "num_tokens": 771320898.0, "step": 20215 }, { "epoch": 2.5716829919857522, "ewc_loss": 0.033970847725868225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.397084947209805e-05, "grad_norm": 19.46138572692871, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8663507699966431, "num_tokens": 771353977.0, "step": 20216 }, { "epoch": 2.571810202264343, "ewc_loss": 0.03392142802476883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.392142753000371e-05, "grad_norm": 19.335508346557617, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.8796949982643127, "num_tokens": 771391960.0, "step": 20217 }, { "epoch": 2.5719374125429333, "ewc_loss": 0.0338500514626503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385005038580857e-05, "grad_norm": 19.35601234436035, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8667325973510742, "num_tokens": 771427764.0, "step": 20218 }, { "epoch": 2.572064622821524, "ewc_loss": 0.033987753093242645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39877515216358e-05, "grad_norm": 19.367877960205078, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8746373653411865, "num_tokens": 771469706.0, "step": 20219 }, { "epoch": 2.5721918331001143, "ewc_loss": 0.0339273102581501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.392731014173478e-05, "grad_norm": 19.385318756103516, "learning_rate": 1e-06, "loss": 0.328, "mean_token_accuracy": 0.8941340446472168, "num_tokens": 771504296.0, "step": 20220 }, { "epoch": 2.572319043378705, "ewc_loss": 0.033874377608299255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3874377550091594e-05, "grad_norm": 19.259748458862305, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8746631145477295, "num_tokens": 771544496.0, "step": 20221 }, { "epoch": 2.5724462536572954, "ewc_loss": 0.03393875062465668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393875158508308e-05, "grad_norm": 19.39739227294922, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.8829967379570007, "num_tokens": 771584322.0, "step": 20222 }, { "epoch": 2.572573463935886, "ewc_loss": 0.0339779369533062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.397793625481427e-05, "grad_norm": 19.315032958984375, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8824682235717773, "num_tokens": 771621424.0, "step": 20223 }, { "epoch": 2.5727006742144765, "ewc_loss": 0.033942367881536484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394236773601733e-05, "grad_norm": 19.377517700195312, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8769596815109253, "num_tokens": 771658173.0, "step": 20224 }, { "epoch": 2.572827884493067, "ewc_loss": 0.03397246077656746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3972461096709594e-05, "grad_norm": 19.32051658630371, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8661397695541382, "num_tokens": 771698182.0, "step": 20225 }, { "epoch": 2.5729550947716575, "ewc_loss": 0.03391704335808754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3917043765541166e-05, "grad_norm": 19.38363265991211, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.86002117395401, "num_tokens": 771731559.0, "step": 20226 }, { "epoch": 2.573082305050248, "ewc_loss": 0.033939432352781296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393943188712001e-05, "grad_norm": 19.30718994140625, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8701702952384949, "num_tokens": 771774362.0, "step": 20227 }, { "epoch": 2.5732095153288386, "ewc_loss": 0.03388407453894615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3884072763612494e-05, "grad_norm": 19.369632720947266, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8638777732849121, "num_tokens": 771810540.0, "step": 20228 }, { "epoch": 2.573336725607429, "ewc_loss": 0.03399357944726944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.399357956368476e-05, "grad_norm": 19.3981990814209, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8783937692642212, "num_tokens": 771846221.0, "step": 20229 }, { "epoch": 2.5734639358860196, "ewc_loss": 0.03394761681556702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3947617339435965e-05, "grad_norm": 19.451847076416016, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.884814977645874, "num_tokens": 771882268.0, "step": 20230 }, { "epoch": 2.57359114616461, "ewc_loss": 0.03385830670595169, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385830495972186e-05, "grad_norm": 19.351764678955078, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8751499652862549, "num_tokens": 771913176.0, "step": 20231 }, { "epoch": 2.5737183564432007, "ewc_loss": 0.03393009677529335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393009683350101e-05, "grad_norm": 19.463041305541992, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8689658045768738, "num_tokens": 771948163.0, "step": 20232 }, { "epoch": 2.573845566721791, "ewc_loss": 0.033936962485313416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393696169950999e-05, "grad_norm": 19.425622940063477, "learning_rate": 1e-06, "loss": 0.3508, "mean_token_accuracy": 0.8875477313995361, "num_tokens": 771988296.0, "step": 20233 }, { "epoch": 2.5739727770003817, "ewc_loss": 0.03389138728380203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3891388738993555e-05, "grad_norm": 19.245962142944336, "learning_rate": 1e-06, "loss": 0.3634, "mean_token_accuracy": 0.8817609548568726, "num_tokens": 772024330.0, "step": 20234 }, { "epoch": 2.5740999872789723, "ewc_loss": 0.03389597684144974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.38959762302693e-05, "grad_norm": 19.44642448425293, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8630207180976868, "num_tokens": 772062291.0, "step": 20235 }, { "epoch": 2.574227197557563, "ewc_loss": 0.03400040417909622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4000404411926866e-05, "grad_norm": 19.252811431884766, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8753787875175476, "num_tokens": 772110239.0, "step": 20236 }, { "epoch": 2.5743544078361533, "ewc_loss": 0.03387395292520523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3873951906571165e-05, "grad_norm": 19.33607292175293, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8703116774559021, "num_tokens": 772153201.0, "step": 20237 }, { "epoch": 2.574481618114744, "ewc_loss": 0.03403593599796295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.403593655093573e-05, "grad_norm": 19.412513732910156, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8764134645462036, "num_tokens": 772186816.0, "step": 20238 }, { "epoch": 2.574608828393334, "ewc_loss": 0.03390367329120636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3903674193425104e-05, "grad_norm": 19.365867614746094, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8837839365005493, "num_tokens": 772224238.0, "step": 20239 }, { "epoch": 2.574736038671925, "ewc_loss": 0.03393939882516861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3939399145310745e-05, "grad_norm": 19.35426902770996, "learning_rate": 1e-06, "loss": 0.3671, "mean_token_accuracy": 0.8828933238983154, "num_tokens": 772263505.0, "step": 20240 }, { "epoch": 2.574863248950515, "ewc_loss": 0.03388524800539017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3885247830767184e-05, "grad_norm": 19.339750289916992, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8828746676445007, "num_tokens": 772303205.0, "step": 20241 }, { "epoch": 2.574990459229106, "ewc_loss": 0.033917203545570374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391720383660868e-05, "grad_norm": 19.310182571411133, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8797173500061035, "num_tokens": 772344495.0, "step": 20242 }, { "epoch": 2.575117669507696, "ewc_loss": 0.033958304673433304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39583057211712e-05, "grad_norm": 19.380054473876953, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8689886331558228, "num_tokens": 772380296.0, "step": 20243 }, { "epoch": 2.5752448797862866, "ewc_loss": 0.03397931903600693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.397931868676096e-05, "grad_norm": 19.3176326751709, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.8879834413528442, "num_tokens": 772423933.0, "step": 20244 }, { "epoch": 2.575372090064877, "ewc_loss": 0.03392733260989189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.392733196960762e-05, "grad_norm": 19.39976692199707, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8615133166313171, "num_tokens": 772464514.0, "step": 20245 }, { "epoch": 2.5754993003434676, "ewc_loss": 0.03397173061966896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.397172986296937e-05, "grad_norm": 19.370010375976562, "learning_rate": 1e-06, "loss": 0.3278, "mean_token_accuracy": 0.893295407295227, "num_tokens": 772498501.0, "step": 20246 }, { "epoch": 2.575626510622058, "ewc_loss": 0.03384995087981224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384995216038078e-05, "grad_norm": 19.387304306030273, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8708382248878479, "num_tokens": 772538072.0, "step": 20247 }, { "epoch": 2.5757537209006487, "ewc_loss": 0.03388974070549011, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388974073459394e-05, "grad_norm": 19.370563507080078, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8754736185073853, "num_tokens": 772577434.0, "step": 20248 }, { "epoch": 2.575880931179239, "ewc_loss": 0.033926721662282944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.392672078916803e-05, "grad_norm": 19.427888870239258, "learning_rate": 1e-06, "loss": 0.3419, "mean_token_accuracy": 0.8883410692214966, "num_tokens": 772612917.0, "step": 20249 }, { "epoch": 2.5760081414578297, "ewc_loss": 0.03391565382480621, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391565405763686e-05, "grad_norm": 19.349206924438477, "learning_rate": 1e-06, "loss": 0.3602, "mean_token_accuracy": 0.886568009853363, "num_tokens": 772652939.0, "step": 20250 }, { "epoch": 2.5761353517364203, "ewc_loss": 0.033874258399009705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387425749679096e-05, "grad_norm": 19.346965789794922, "learning_rate": 1e-06, "loss": 0.3649, "mean_token_accuracy": 0.8813960552215576, "num_tokens": 772688018.0, "step": 20251 }, { "epoch": 2.576262562015011, "ewc_loss": 0.03389960154891014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389960329513997e-05, "grad_norm": 19.462373733520508, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8652887940406799, "num_tokens": 772728071.0, "step": 20252 }, { "epoch": 2.5763897722936013, "ewc_loss": 0.033838145434856415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.383814691915177e-05, "grad_norm": 19.328683853149414, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.876855194568634, "num_tokens": 772762730.0, "step": 20253 }, { "epoch": 2.576516982572192, "ewc_loss": 0.03385286405682564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3852862543426454e-05, "grad_norm": 19.336524963378906, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8776645660400391, "num_tokens": 772799125.0, "step": 20254 }, { "epoch": 2.5766441928507824, "ewc_loss": 0.03385692834854126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385692980373278e-05, "grad_norm": 19.347801208496094, "learning_rate": 1e-06, "loss": 0.3513, "mean_token_accuracy": 0.8869966268539429, "num_tokens": 772838657.0, "step": 20255 }, { "epoch": 2.576771403129373, "ewc_loss": 0.03388164937496185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388164986972697e-05, "grad_norm": 19.334272384643555, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8833409547805786, "num_tokens": 772876470.0, "step": 20256 }, { "epoch": 2.5768986134079634, "ewc_loss": 0.03392532840371132, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3925327443284914e-05, "grad_norm": 19.475067138671875, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8685328364372253, "num_tokens": 772910369.0, "step": 20257 }, { "epoch": 2.577025823686554, "ewc_loss": 0.033907413482666016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3907414035638794e-05, "grad_norm": 19.277191162109375, "learning_rate": 1e-06, "loss": 0.3686, "mean_token_accuracy": 0.8811869621276855, "num_tokens": 772948129.0, "step": 20258 }, { "epoch": 2.5771530339651445, "ewc_loss": 0.03380972146987915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3809719752753153e-05, "grad_norm": 19.391563415527344, "learning_rate": 1e-06, "loss": 0.3128, "mean_token_accuracy": 0.9012365937232971, "num_tokens": 772985359.0, "step": 20259 }, { "epoch": 2.577280244243735, "ewc_loss": 0.03396318107843399, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39631806127727e-05, "grad_norm": 19.340322494506836, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8771787285804749, "num_tokens": 773019402.0, "step": 20260 }, { "epoch": 2.5774074545223256, "ewc_loss": 0.0338117852807045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.381178612471558e-05, "grad_norm": 19.32879638671875, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8744887113571167, "num_tokens": 773059176.0, "step": 20261 }, { "epoch": 2.5775346648009156, "ewc_loss": 0.03401686251163483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401686262805015e-05, "grad_norm": 19.442068099975586, "learning_rate": 1e-06, "loss": 0.3777, "mean_token_accuracy": 0.8769108653068542, "num_tokens": 773094274.0, "step": 20262 }, { "epoch": 2.5776618750795066, "ewc_loss": 0.033927835524082184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3927834010683e-05, "grad_norm": 19.286155700683594, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8738921284675598, "num_tokens": 773135679.0, "step": 20263 }, { "epoch": 2.5777890853580967, "ewc_loss": 0.03389686718583107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3896867535077035e-05, "grad_norm": 19.34933090209961, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8717875480651855, "num_tokens": 773174270.0, "step": 20264 }, { "epoch": 2.5779162956366877, "ewc_loss": 0.03394807502627373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394807572476566e-05, "grad_norm": 19.24355697631836, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8757869601249695, "num_tokens": 773213582.0, "step": 20265 }, { "epoch": 2.5780435059152778, "ewc_loss": 0.03394836187362671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394836312509142e-05, "grad_norm": 19.358642578125, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8699442744255066, "num_tokens": 773244980.0, "step": 20266 }, { "epoch": 2.5781707161938687, "ewc_loss": 0.03405594453215599, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4055945434374735e-05, "grad_norm": 19.395151138305664, "learning_rate": 1e-06, "loss": 0.3476, "mean_token_accuracy": 0.8891026973724365, "num_tokens": 773287395.0, "step": 20267 }, { "epoch": 2.578297926472459, "ewc_loss": 0.033880773931741714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388077311683446e-05, "grad_norm": 19.384559631347656, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8716639280319214, "num_tokens": 773324593.0, "step": 20268 }, { "epoch": 2.5784251367510493, "ewc_loss": 0.03399823233485222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.399823253857903e-05, "grad_norm": 19.430788040161133, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8710106015205383, "num_tokens": 773361802.0, "step": 20269 }, { "epoch": 2.57855234702964, "ewc_loss": 0.033916812390089035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391681093489751e-05, "grad_norm": 19.3406982421875, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8790715932846069, "num_tokens": 773401348.0, "step": 20270 }, { "epoch": 2.5786795573082304, "ewc_loss": 0.03395182639360428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.395182648091577e-05, "grad_norm": 19.35411262512207, "learning_rate": 1e-06, "loss": 0.3469, "mean_token_accuracy": 0.8911671042442322, "num_tokens": 773439040.0, "step": 20271 }, { "epoch": 2.578806767586821, "ewc_loss": 0.03392898663878441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.392898724996485e-05, "grad_norm": 19.274606704711914, "learning_rate": 1e-06, "loss": 0.3424, "mean_token_accuracy": 0.8879013061523438, "num_tokens": 773471382.0, "step": 20272 }, { "epoch": 2.5789339778654115, "ewc_loss": 0.03397750109434128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3977499697357416e-05, "grad_norm": 19.407445907592773, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8648497462272644, "num_tokens": 773508924.0, "step": 20273 }, { "epoch": 2.579061188144002, "ewc_loss": 0.03394780308008194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3947802876355127e-05, "grad_norm": 19.3105411529541, "learning_rate": 1e-06, "loss": 0.3401, "mean_token_accuracy": 0.889214038848877, "num_tokens": 773550219.0, "step": 20274 }, { "epoch": 2.5791883984225925, "ewc_loss": 0.033899374306201935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389937410247512e-05, "grad_norm": 19.378175735473633, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8707934617996216, "num_tokens": 773589363.0, "step": 20275 }, { "epoch": 2.579315608701183, "ewc_loss": 0.03401174396276474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401174399186857e-05, "grad_norm": 19.35015869140625, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8740971088409424, "num_tokens": 773630289.0, "step": 20276 }, { "epoch": 2.5794428189797736, "ewc_loss": 0.033968206495046616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39682046615053e-05, "grad_norm": 19.403432846069336, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8769212365150452, "num_tokens": 773668660.0, "step": 20277 }, { "epoch": 2.579570029258364, "ewc_loss": 0.033984847366809845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.398484841454774e-05, "grad_norm": 19.4416561126709, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.872660219669342, "num_tokens": 773704318.0, "step": 20278 }, { "epoch": 2.5796972395369546, "ewc_loss": 0.033986564725637436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.398656554054469e-05, "grad_norm": 19.355548858642578, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8610160946846008, "num_tokens": 773745821.0, "step": 20279 }, { "epoch": 2.579824449815545, "ewc_loss": 0.03388907015323639, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388907134649344e-05, "grad_norm": 19.407926559448242, "learning_rate": 1e-06, "loss": 0.3552, "mean_token_accuracy": 0.8847533464431763, "num_tokens": 773782880.0, "step": 20280 }, { "epoch": 2.5799516600941357, "ewc_loss": 0.0339328870177269, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393288716324605e-05, "grad_norm": 19.295764923095703, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8624536395072937, "num_tokens": 773824880.0, "step": 20281 }, { "epoch": 2.580078870372726, "ewc_loss": 0.03391842171549797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3918422559509054e-05, "grad_norm": 19.49582290649414, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8788981437683105, "num_tokens": 773863210.0, "step": 20282 }, { "epoch": 2.5802060806513167, "ewc_loss": 0.033904798328876495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3904798328876495e-05, "grad_norm": 19.287250518798828, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.8829247951507568, "num_tokens": 773903432.0, "step": 20283 }, { "epoch": 2.5803332909299073, "ewc_loss": 0.03384793922305107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384794035810046e-05, "grad_norm": 19.448087692260742, "learning_rate": 1e-06, "loss": 0.3652, "mean_token_accuracy": 0.8832067847251892, "num_tokens": 773942424.0, "step": 20284 }, { "epoch": 2.580460501208498, "ewc_loss": 0.033955059945583344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3955060644075274e-05, "grad_norm": 19.316810607910156, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8743159174919128, "num_tokens": 773976598.0, "step": 20285 }, { "epoch": 2.5805877114870883, "ewc_loss": 0.03387901559472084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3879015973070636e-05, "grad_norm": 19.363609313964844, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8695440888404846, "num_tokens": 774016696.0, "step": 20286 }, { "epoch": 2.5807149217656784, "ewc_loss": 0.03393819183111191, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393819133634679e-05, "grad_norm": 19.347213745117188, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.869472324848175, "num_tokens": 774059046.0, "step": 20287 }, { "epoch": 2.5808421320442694, "ewc_loss": 0.033834461122751236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3834461646620184e-05, "grad_norm": 19.39523696899414, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8713821172714233, "num_tokens": 774095322.0, "step": 20288 }, { "epoch": 2.5809693423228595, "ewc_loss": 0.03391136974096298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39113685186021e-05, "grad_norm": 19.36354637145996, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8638238906860352, "num_tokens": 774134506.0, "step": 20289 }, { "epoch": 2.5810965526014504, "ewc_loss": 0.0338544137775898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3854412322398275e-05, "grad_norm": 19.44199562072754, "learning_rate": 1e-06, "loss": 0.3486, "mean_token_accuracy": 0.890532374382019, "num_tokens": 774176997.0, "step": 20290 }, { "epoch": 2.5812237628800405, "ewc_loss": 0.0339105948805809, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391059362911619e-05, "grad_norm": 19.375459671020508, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8709812164306641, "num_tokens": 774213900.0, "step": 20291 }, { "epoch": 2.5813509731586315, "ewc_loss": 0.03382298722863197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.382298746146262e-05, "grad_norm": 19.441749572753906, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8680696487426758, "num_tokens": 774259687.0, "step": 20292 }, { "epoch": 2.5814781834372216, "ewc_loss": 0.0338425375521183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384253795957193e-05, "grad_norm": 19.43462562561035, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.8786322474479675, "num_tokens": 774297200.0, "step": 20293 }, { "epoch": 2.581605393715812, "ewc_loss": 0.03382798656821251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.382798604434356e-05, "grad_norm": 19.36331558227539, "learning_rate": 1e-06, "loss": 0.3496, "mean_token_accuracy": 0.8873739242553711, "num_tokens": 774336642.0, "step": 20294 }, { "epoch": 2.5817326039944026, "ewc_loss": 0.033825572580099106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.382557406439446e-05, "grad_norm": 19.3217830657959, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8601386547088623, "num_tokens": 774376762.0, "step": 20295 }, { "epoch": 2.581859814272993, "ewc_loss": 0.033846333622932434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3846332371467724e-05, "grad_norm": 19.52750587463379, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.8835842609405518, "num_tokens": 774421360.0, "step": 20296 }, { "epoch": 2.5819870245515837, "ewc_loss": 0.033901702612638474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.390170240891166e-05, "grad_norm": 19.360021591186523, "learning_rate": 1e-06, "loss": 0.365, "mean_token_accuracy": 0.8824759125709534, "num_tokens": 774460850.0, "step": 20297 }, { "epoch": 2.582114234830174, "ewc_loss": 0.033724840730428696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.37248420692049e-05, "grad_norm": 19.34880828857422, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8730197548866272, "num_tokens": 774497542.0, "step": 20298 }, { "epoch": 2.5822414451087647, "ewc_loss": 0.03391236066818237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391236168681644e-05, "grad_norm": 19.435270309448242, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8666063547134399, "num_tokens": 774538164.0, "step": 20299 }, { "epoch": 2.5823686553873553, "ewc_loss": 0.03379201516509056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379201370989904e-05, "grad_norm": 19.442171096801758, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8698855042457581, "num_tokens": 774576118.0, "step": 20300 }, { "epoch": 2.582495865665946, "ewc_loss": 0.033851657062768936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385165837244131e-05, "grad_norm": 19.41445541381836, "learning_rate": 1e-06, "loss": 0.3307, "mean_token_accuracy": 0.8959104418754578, "num_tokens": 774617506.0, "step": 20301 }, { "epoch": 2.5826230759445363, "ewc_loss": 0.0338050052523613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.380500493221916e-05, "grad_norm": 19.44995880126953, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8518878221511841, "num_tokens": 774653720.0, "step": 20302 }, { "epoch": 2.582750286223127, "ewc_loss": 0.03380335867404938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.380335692781955e-05, "grad_norm": 19.410369873046875, "learning_rate": 1e-06, "loss": 0.3242, "mean_token_accuracy": 0.8949088454246521, "num_tokens": 774691056.0, "step": 20303 }, { "epoch": 2.5828774965017174, "ewc_loss": 0.03381120413541794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.381120404810645e-05, "grad_norm": 19.389434814453125, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8691532611846924, "num_tokens": 774726514.0, "step": 20304 }, { "epoch": 2.583004706780308, "ewc_loss": 0.033804938197135925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3804939448600635e-05, "grad_norm": 19.479936599731445, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8673661351203918, "num_tokens": 774760831.0, "step": 20305 }, { "epoch": 2.5831319170588984, "ewc_loss": 0.03385351225733757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3853513741632923e-05, "grad_norm": 19.320810317993164, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8842602968215942, "num_tokens": 774799399.0, "step": 20306 }, { "epoch": 2.583259127337489, "ewc_loss": 0.03378268703818321, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3782685932237655e-05, "grad_norm": 19.444501876831055, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8674612641334534, "num_tokens": 774839910.0, "step": 20307 }, { "epoch": 2.5833863376160795, "ewc_loss": 0.03388208523392677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388208642718382e-05, "grad_norm": 19.3606014251709, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8764917850494385, "num_tokens": 774875694.0, "step": 20308 }, { "epoch": 2.58351354789467, "ewc_loss": 0.03380897641181946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.380897760507651e-05, "grad_norm": 19.464807510375977, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.876620888710022, "num_tokens": 774908020.0, "step": 20309 }, { "epoch": 2.5836407581732606, "ewc_loss": 0.033911969512701035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391196878510527e-05, "grad_norm": 19.304561614990234, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8647727966308594, "num_tokens": 774946716.0, "step": 20310 }, { "epoch": 2.583767968451851, "ewc_loss": 0.033842530101537704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3842530683614314e-05, "grad_norm": 19.439376831054688, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8695645332336426, "num_tokens": 774981732.0, "step": 20311 }, { "epoch": 2.583895178730441, "ewc_loss": 0.03396415337920189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39641519531142e-05, "grad_norm": 19.384971618652344, "learning_rate": 1e-06, "loss": 0.368, "mean_token_accuracy": 0.8802810907363892, "num_tokens": 775015280.0, "step": 20312 }, { "epoch": 2.584022389009032, "ewc_loss": 0.03391554579138756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3915544918272644e-05, "grad_norm": 19.38829231262207, "learning_rate": 1e-06, "loss": 0.3603, "mean_token_accuracy": 0.8831116557121277, "num_tokens": 775054518.0, "step": 20313 }, { "epoch": 2.5841495992876222, "ewc_loss": 0.03395172581076622, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.395172461750917e-05, "grad_norm": 19.412851333618164, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.881401538848877, "num_tokens": 775091548.0, "step": 20314 }, { "epoch": 2.584276809566213, "ewc_loss": 0.03394705802202225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394705709069967e-05, "grad_norm": 19.348485946655273, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8627212047576904, "num_tokens": 775130916.0, "step": 20315 }, { "epoch": 2.5844040198448033, "ewc_loss": 0.03399667516350746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3996675483649597e-05, "grad_norm": 19.423675537109375, "learning_rate": 1e-06, "loss": 0.3554, "mean_token_accuracy": 0.8879703283309937, "num_tokens": 775165875.0, "step": 20316 }, { "epoch": 2.584531230123394, "ewc_loss": 0.033897604793310165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3897606044774875e-05, "grad_norm": 19.36966323852539, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8614460229873657, "num_tokens": 775211463.0, "step": 20317 }, { "epoch": 2.5846584404019843, "ewc_loss": 0.03393789380788803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393789302208461e-05, "grad_norm": 19.359539031982422, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8836324214935303, "num_tokens": 775248338.0, "step": 20318 }, { "epoch": 2.584785650680575, "ewc_loss": 0.03389707952737808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389707853784785e-05, "grad_norm": 19.391775131225586, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8637003898620605, "num_tokens": 775285707.0, "step": 20319 }, { "epoch": 2.5849128609591654, "ewc_loss": 0.03396734967827797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.396734973648563e-05, "grad_norm": 19.424152374267578, "learning_rate": 1e-06, "loss": 0.359, "mean_token_accuracy": 0.8837926387786865, "num_tokens": 775322682.0, "step": 20320 }, { "epoch": 2.585040071237756, "ewc_loss": 0.03396602347493172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.396602187422104e-05, "grad_norm": 19.433380126953125, "learning_rate": 1e-06, "loss": 0.3586, "mean_token_accuracy": 0.8850661516189575, "num_tokens": 775358274.0, "step": 20321 }, { "epoch": 2.5851672815163464, "ewc_loss": 0.03392496332526207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3924963645404205e-05, "grad_norm": 19.45011329650879, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8710416555404663, "num_tokens": 775394472.0, "step": 20322 }, { "epoch": 2.585294491794937, "ewc_loss": 0.033948201686143875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3948203054023907e-05, "grad_norm": 19.404460906982422, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.8847430348396301, "num_tokens": 775433647.0, "step": 20323 }, { "epoch": 2.5854217020735275, "ewc_loss": 0.03389500081539154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3895001251949e-05, "grad_norm": 19.37997817993164, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8668352961540222, "num_tokens": 775469159.0, "step": 20324 }, { "epoch": 2.585548912352118, "ewc_loss": 0.033951032906770706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3951033401535824e-05, "grad_norm": 19.41797637939453, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.8835524320602417, "num_tokens": 775507329.0, "step": 20325 }, { "epoch": 2.5856761226307086, "ewc_loss": 0.0339788943529129, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.397889304324053e-05, "grad_norm": 19.343894958496094, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8730852007865906, "num_tokens": 775544638.0, "step": 20326 }, { "epoch": 2.585803332909299, "ewc_loss": 0.03390602394938469, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3906024327734485e-05, "grad_norm": 19.313940048217773, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8777141571044922, "num_tokens": 775584918.0, "step": 20327 }, { "epoch": 2.5859305431878896, "ewc_loss": 0.03395992890000343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.395992825971916e-05, "grad_norm": 19.384767532348633, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8710419535636902, "num_tokens": 775624214.0, "step": 20328 }, { "epoch": 2.58605775346648, "ewc_loss": 0.03401847556233406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401847425266169e-05, "grad_norm": 19.359031677246094, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8634867668151855, "num_tokens": 775668372.0, "step": 20329 }, { "epoch": 2.5861849637450707, "ewc_loss": 0.033970627933740616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.397062755539082e-05, "grad_norm": 19.37922477722168, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8487700819969177, "num_tokens": 775700578.0, "step": 20330 }, { "epoch": 2.586312174023661, "ewc_loss": 0.03403470665216446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.403470691409893e-05, "grad_norm": 19.429424285888672, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8811408877372742, "num_tokens": 775746980.0, "step": 20331 }, { "epoch": 2.5864393843022517, "ewc_loss": 0.033980000764131546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.397999898879789e-05, "grad_norm": 19.40460205078125, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8707653284072876, "num_tokens": 775780762.0, "step": 20332 }, { "epoch": 2.5865665945808423, "ewc_loss": 0.03399098291993141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39909820468165e-05, "grad_norm": 19.382835388183594, "learning_rate": 1e-06, "loss": 0.3774, "mean_token_accuracy": 0.8819187879562378, "num_tokens": 775815591.0, "step": 20333 }, { "epoch": 2.586693804859433, "ewc_loss": 0.03402204439043999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.402204310987145e-05, "grad_norm": 19.45457649230957, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8663237690925598, "num_tokens": 775856414.0, "step": 20334 }, { "epoch": 2.5868210151380233, "ewc_loss": 0.033988531678915024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.398853004910052e-05, "grad_norm": 19.394439697265625, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.8757591247558594, "num_tokens": 775896939.0, "step": 20335 }, { "epoch": 2.586948225416614, "ewc_loss": 0.033936455845832825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393645602045581e-05, "grad_norm": 19.32261085510254, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8697555065155029, "num_tokens": 775941778.0, "step": 20336 }, { "epoch": 2.587075435695204, "ewc_loss": 0.03400781750679016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.400781861273572e-05, "grad_norm": 19.4355411529541, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8740745782852173, "num_tokens": 775979346.0, "step": 20337 }, { "epoch": 2.587202645973795, "ewc_loss": 0.03401999920606613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401999856578186e-05, "grad_norm": 19.445329666137695, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8726388812065125, "num_tokens": 776013793.0, "step": 20338 }, { "epoch": 2.587329856252385, "ewc_loss": 0.03397675231099129, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.397675391170196e-05, "grad_norm": 19.398591995239258, "learning_rate": 1e-06, "loss": 0.354, "mean_token_accuracy": 0.8869503736495972, "num_tokens": 776058513.0, "step": 20339 }, { "epoch": 2.587457066530976, "ewc_loss": 0.033930566161870956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3930566132767126e-05, "grad_norm": 19.311813354492188, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8751129508018494, "num_tokens": 776102790.0, "step": 20340 }, { "epoch": 2.587584276809566, "ewc_loss": 0.03397165611386299, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.397165710339323e-05, "grad_norm": 19.441621780395508, "learning_rate": 1e-06, "loss": 0.3397, "mean_token_accuracy": 0.8906335830688477, "num_tokens": 776133898.0, "step": 20341 }, { "epoch": 2.5877114870881566, "ewc_loss": 0.0339810810983181, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3981079468503594e-05, "grad_norm": 19.3781681060791, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8737078309059143, "num_tokens": 776167590.0, "step": 20342 }, { "epoch": 2.587838697366747, "ewc_loss": 0.033931270241737366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39312719006557e-05, "grad_norm": 19.338102340698242, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8693769574165344, "num_tokens": 776207966.0, "step": 20343 }, { "epoch": 2.5879659076453376, "ewc_loss": 0.03398602455854416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.398602348170243e-05, "grad_norm": 19.47295379638672, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8756113052368164, "num_tokens": 776245601.0, "step": 20344 }, { "epoch": 2.588093117923928, "ewc_loss": 0.033983781933784485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3983782486757264e-05, "grad_norm": 19.28693389892578, "learning_rate": 1e-06, "loss": 0.3642, "mean_token_accuracy": 0.883141040802002, "num_tokens": 776285377.0, "step": 20345 }, { "epoch": 2.5882203282025187, "ewc_loss": 0.03391095995903015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391096106497571e-05, "grad_norm": 19.418256759643555, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8655955195426941, "num_tokens": 776328116.0, "step": 20346 }, { "epoch": 2.588347538481109, "ewc_loss": 0.034014567732810974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4014567063422874e-05, "grad_norm": 19.437774658203125, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8608416318893433, "num_tokens": 776369008.0, "step": 20347 }, { "epoch": 2.5884747487596997, "ewc_loss": 0.03391740471124649, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391740392544307e-05, "grad_norm": 19.37899398803711, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8663067817687988, "num_tokens": 776410151.0, "step": 20348 }, { "epoch": 2.5886019590382903, "ewc_loss": 0.033898673951625824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389867561054416e-05, "grad_norm": 19.38585090637207, "learning_rate": 1e-06, "loss": 0.3324, "mean_token_accuracy": 0.8990606069564819, "num_tokens": 776445944.0, "step": 20349 }, { "epoch": 2.588729169316881, "ewc_loss": 0.033973414450883865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.397341424715705e-05, "grad_norm": 19.407848358154297, "learning_rate": 1e-06, "loss": 0.3789, "mean_token_accuracy": 0.8787307143211365, "num_tokens": 776485388.0, "step": 20350 }, { "epoch": 2.5888563795954713, "ewc_loss": 0.03392341732978821, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.392341750441119e-05, "grad_norm": 19.41143035888672, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8571429252624512, "num_tokens": 776529135.0, "step": 20351 }, { "epoch": 2.588983589874062, "ewc_loss": 0.033944591879844666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3944590541068465e-05, "grad_norm": 19.442955017089844, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8679373264312744, "num_tokens": 776564310.0, "step": 20352 }, { "epoch": 2.5891108001526524, "ewc_loss": 0.033936042338609695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39360412908718e-05, "grad_norm": 19.419567108154297, "learning_rate": 1e-06, "loss": 0.3496, "mean_token_accuracy": 0.8877664804458618, "num_tokens": 776609004.0, "step": 20353 }, { "epoch": 2.589238010431243, "ewc_loss": 0.03385766223073006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385766103747301e-05, "grad_norm": 19.391141891479492, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8810261487960815, "num_tokens": 776645796.0, "step": 20354 }, { "epoch": 2.5893652207098334, "ewc_loss": 0.03396321088075638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.396320971660316e-05, "grad_norm": 19.42735481262207, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8657786846160889, "num_tokens": 776684248.0, "step": 20355 }, { "epoch": 2.589492430988424, "ewc_loss": 0.033912308514118195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391230711713433e-05, "grad_norm": 19.361064910888672, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8779085874557495, "num_tokens": 776722684.0, "step": 20356 }, { "epoch": 2.5896196412670145, "ewc_loss": 0.03386210277676582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386210300959647e-05, "grad_norm": 19.401817321777344, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.8789960145950317, "num_tokens": 776764635.0, "step": 20357 }, { "epoch": 2.589746851545605, "ewc_loss": 0.03391923010349274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391923019080423e-05, "grad_norm": 19.442041397094727, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8641338348388672, "num_tokens": 776805683.0, "step": 20358 }, { "epoch": 2.5898740618241956, "ewc_loss": 0.03390581160783768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3905813324963674e-05, "grad_norm": 19.412187576293945, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8780906200408936, "num_tokens": 776841229.0, "step": 20359 }, { "epoch": 2.5900012721027856, "ewc_loss": 0.033947110176086426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394711166038178e-05, "grad_norm": 19.454051971435547, "learning_rate": 1e-06, "loss": 0.3718, "mean_token_accuracy": 0.8819533586502075, "num_tokens": 776881319.0, "step": 20360 }, { "epoch": 2.5901284823813766, "ewc_loss": 0.03384505584836006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3845055440906435e-05, "grad_norm": 19.412878036499023, "learning_rate": 1e-06, "loss": 0.3718, "mean_token_accuracy": 0.8796457052230835, "num_tokens": 776919436.0, "step": 20361 }, { "epoch": 2.5902556926599667, "ewc_loss": 0.033923130482435226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.392313010408543e-05, "grad_norm": 19.463603973388672, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8581051826477051, "num_tokens": 776958644.0, "step": 20362 }, { "epoch": 2.5903829029385577, "ewc_loss": 0.033821988850831985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3821990655269474e-05, "grad_norm": 19.430511474609375, "learning_rate": 1e-06, "loss": 0.3378, "mean_token_accuracy": 0.8937886357307434, "num_tokens": 776994509.0, "step": 20363 }, { "epoch": 2.5905101132171477, "ewc_loss": 0.03387390077114105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3873900974867865e-05, "grad_norm": 19.448822021484375, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8810282349586487, "num_tokens": 777037557.0, "step": 20364 }, { "epoch": 2.5906373234957387, "ewc_loss": 0.03391604125499725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391603968339041e-05, "grad_norm": 19.411121368408203, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.855971097946167, "num_tokens": 777077237.0, "step": 20365 }, { "epoch": 2.590764533774329, "ewc_loss": 0.03387937694787979, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387937613297254e-05, "grad_norm": 19.403932571411133, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.882041335105896, "num_tokens": 777120813.0, "step": 20366 }, { "epoch": 2.5908917440529193, "ewc_loss": 0.03390726447105408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3907264878507704e-05, "grad_norm": 19.4888973236084, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8624029755592346, "num_tokens": 777155809.0, "step": 20367 }, { "epoch": 2.59101895433151, "ewc_loss": 0.033931855112314224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3931853977264836e-05, "grad_norm": 19.4682559967041, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8749165534973145, "num_tokens": 777195868.0, "step": 20368 }, { "epoch": 2.5911461646101004, "ewc_loss": 0.0338732972741127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387329707038589e-05, "grad_norm": 19.44887924194336, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8800379633903503, "num_tokens": 777233406.0, "step": 20369 }, { "epoch": 2.591273374888691, "ewc_loss": 0.03383821249008179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3838212402770296e-05, "grad_norm": 19.412263870239258, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8776870369911194, "num_tokens": 777272817.0, "step": 20370 }, { "epoch": 2.5914005851672814, "ewc_loss": 0.03386269509792328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386269600014202e-05, "grad_norm": 19.47260284423828, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.871158242225647, "num_tokens": 777312398.0, "step": 20371 }, { "epoch": 2.591527795445872, "ewc_loss": 0.03393221274018288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393221413716674e-05, "grad_norm": 19.38738250732422, "learning_rate": 1e-06, "loss": 0.3577, "mean_token_accuracy": 0.8854469060897827, "num_tokens": 777349375.0, "step": 20372 }, { "epoch": 2.5916550057244625, "ewc_loss": 0.03384510055184364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384509909665212e-05, "grad_norm": 19.452898025512695, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.8849929571151733, "num_tokens": 777394806.0, "step": 20373 }, { "epoch": 2.591782216003053, "ewc_loss": 0.033918265253305435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391826612642035e-05, "grad_norm": 19.40283966064453, "learning_rate": 1e-06, "loss": 0.3735, "mean_token_accuracy": 0.8799185752868652, "num_tokens": 777434350.0, "step": 20374 }, { "epoch": 2.5919094262816436, "ewc_loss": 0.03385026380419731, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385026502655819e-05, "grad_norm": 19.422819137573242, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.849694013595581, "num_tokens": 777469690.0, "step": 20375 }, { "epoch": 2.592036636560234, "ewc_loss": 0.033933158963918686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393316001165658e-05, "grad_norm": 19.34139633178711, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8695678114891052, "num_tokens": 777502095.0, "step": 20376 }, { "epoch": 2.5921638468388246, "ewc_loss": 0.03387882933020592, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3878830436151475e-05, "grad_norm": 19.4580078125, "learning_rate": 1e-06, "loss": 0.3582, "mean_token_accuracy": 0.8869435787200928, "num_tokens": 777540387.0, "step": 20377 }, { "epoch": 2.592291057117415, "ewc_loss": 0.03394553065299988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3945529139600694e-05, "grad_norm": 19.396137237548828, "learning_rate": 1e-06, "loss": 0.3628, "mean_token_accuracy": 0.8813619613647461, "num_tokens": 777581855.0, "step": 20378 }, { "epoch": 2.5924182673960057, "ewc_loss": 0.03387971594929695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.38797144650016e-05, "grad_norm": 19.390769958496094, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8763120174407959, "num_tokens": 777619852.0, "step": 20379 }, { "epoch": 2.592545477674596, "ewc_loss": 0.033963561058044434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.396356260054745e-05, "grad_norm": 19.478830337524414, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.868618905544281, "num_tokens": 777662772.0, "step": 20380 }, { "epoch": 2.5926726879531867, "ewc_loss": 0.033992644399404526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.399264460313134e-05, "grad_norm": 19.42788314819336, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.858328104019165, "num_tokens": 777701171.0, "step": 20381 }, { "epoch": 2.5927998982317773, "ewc_loss": 0.033918075263500214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391807695152238e-05, "grad_norm": 19.45619010925293, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8614058494567871, "num_tokens": 777738245.0, "step": 20382 }, { "epoch": 2.592927108510368, "ewc_loss": 0.033942583948373795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394258237676695e-05, "grad_norm": 19.388629913330078, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8756942749023438, "num_tokens": 777775732.0, "step": 20383 }, { "epoch": 2.5930543187889583, "ewc_loss": 0.033851005136966705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385100353625603e-05, "grad_norm": 19.507183074951172, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8760676383972168, "num_tokens": 777811077.0, "step": 20384 }, { "epoch": 2.5931815290675484, "ewc_loss": 0.03400319814682007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.400319837965071e-05, "grad_norm": 19.46098518371582, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8737531900405884, "num_tokens": 777854107.0, "step": 20385 }, { "epoch": 2.5933087393461394, "ewc_loss": 0.03379034250974655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.379034387762658e-05, "grad_norm": 19.419658660888672, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8742218613624573, "num_tokens": 777891950.0, "step": 20386 }, { "epoch": 2.5934359496247295, "ewc_loss": 0.033890996128320694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389099583728239e-05, "grad_norm": 19.458847045898438, "learning_rate": 1e-06, "loss": 0.3351, "mean_token_accuracy": 0.8932094573974609, "num_tokens": 777932638.0, "step": 20387 }, { "epoch": 2.5935631599033204, "ewc_loss": 0.03389543294906616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389543417142704e-05, "grad_norm": 19.426237106323242, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8736380934715271, "num_tokens": 777977733.0, "step": 20388 }, { "epoch": 2.5936903701819105, "ewc_loss": 0.0338703915476799, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387039032531902e-05, "grad_norm": 19.44615364074707, "learning_rate": 1e-06, "loss": 0.3753, "mean_token_accuracy": 0.8794510960578918, "num_tokens": 778020451.0, "step": 20389 }, { "epoch": 2.5938175804605015, "ewc_loss": 0.033902738243341446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.390273923287168e-05, "grad_norm": 19.417329788208008, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8809947967529297, "num_tokens": 778059594.0, "step": 20390 }, { "epoch": 2.5939447907390916, "ewc_loss": 0.0338774174451828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387741890037432e-05, "grad_norm": 19.426475524902344, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8786863088607788, "num_tokens": 778099235.0, "step": 20391 }, { "epoch": 2.594072001017682, "ewc_loss": 0.03384819254279137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384819137863815e-05, "grad_norm": 19.396814346313477, "learning_rate": 1e-06, "loss": 0.3592, "mean_token_accuracy": 0.8864254951477051, "num_tokens": 778135430.0, "step": 20392 }, { "epoch": 2.5941992112962726, "ewc_loss": 0.03384144604206085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.38414465659298e-05, "grad_norm": 19.36709976196289, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8683023452758789, "num_tokens": 778175111.0, "step": 20393 }, { "epoch": 2.594326421574863, "ewc_loss": 0.033857155591249466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.385715535841882e-05, "grad_norm": 19.44915008544922, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8660121560096741, "num_tokens": 778211954.0, "step": 20394 }, { "epoch": 2.5944536318534537, "ewc_loss": 0.033926501870155334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3926502510439605e-05, "grad_norm": 19.420360565185547, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8596102595329285, "num_tokens": 778246580.0, "step": 20395 }, { "epoch": 2.594580842132044, "ewc_loss": 0.03387387841939926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.387387914699502e-05, "grad_norm": 19.410036087036133, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8816467523574829, "num_tokens": 778288659.0, "step": 20396 }, { "epoch": 2.5947080524106347, "ewc_loss": 0.03390425071120262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3904248994076625e-05, "grad_norm": 19.40505599975586, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8751569986343384, "num_tokens": 778325365.0, "step": 20397 }, { "epoch": 2.5948352626892253, "ewc_loss": 0.03391609713435173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391609789105132e-05, "grad_norm": 19.43515396118164, "learning_rate": 1e-06, "loss": 0.3676, "mean_token_accuracy": 0.882743775844574, "num_tokens": 778362982.0, "step": 20398 }, { "epoch": 2.594962472967816, "ewc_loss": 0.03398292884230614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39829275617376e-05, "grad_norm": 19.381031036376953, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8621852397918701, "num_tokens": 778394230.0, "step": 20399 }, { "epoch": 2.5950896832464063, "ewc_loss": 0.03384806215763092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.384806041140109e-05, "grad_norm": 19.235607147216797, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8647685050964355, "num_tokens": 778434345.0, "step": 20400 }, { "epoch": 2.595216893524997, "ewc_loss": 0.033940911293029785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3940912544494495e-05, "grad_norm": 19.44171142578125, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8703796863555908, "num_tokens": 778478303.0, "step": 20401 }, { "epoch": 2.5953441038035874, "ewc_loss": 0.03401830047369003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401829962967895e-05, "grad_norm": 19.274982452392578, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.880057692527771, "num_tokens": 778513915.0, "step": 20402 }, { "epoch": 2.595471314082178, "ewc_loss": 0.033920466899871826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.392046710359864e-05, "grad_norm": 19.34332847595215, "learning_rate": 1e-06, "loss": 0.3392, "mean_token_accuracy": 0.8908026814460754, "num_tokens": 778554294.0, "step": 20403 }, { "epoch": 2.5955985243607684, "ewc_loss": 0.03406880423426628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4068805689457804e-05, "grad_norm": 19.333799362182617, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8731327652931213, "num_tokens": 778592836.0, "step": 20404 }, { "epoch": 2.595725734639359, "ewc_loss": 0.03401748836040497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401748836040497e-05, "grad_norm": 19.413881301879883, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8663300275802612, "num_tokens": 778629316.0, "step": 20405 }, { "epoch": 2.5958529449179495, "ewc_loss": 0.034050725400447845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.405072493478656e-05, "grad_norm": 19.359394073486328, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8716952800750732, "num_tokens": 778659065.0, "step": 20406 }, { "epoch": 2.59598015519654, "ewc_loss": 0.034001853317022324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.400185232749209e-05, "grad_norm": 19.377317428588867, "learning_rate": 1e-06, "loss": 0.342, "mean_token_accuracy": 0.8904325366020203, "num_tokens": 778693949.0, "step": 20407 }, { "epoch": 2.5961073654751305, "ewc_loss": 0.03403720632195473, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4037206205539405e-05, "grad_norm": 19.36726951599121, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8795771598815918, "num_tokens": 778738045.0, "step": 20408 }, { "epoch": 2.596234575753721, "ewc_loss": 0.034067392349243164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4067390515701845e-05, "grad_norm": 19.353456497192383, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8336584568023682, "num_tokens": 778769578.0, "step": 20409 }, { "epoch": 2.596361786032311, "ewc_loss": 0.03413846716284752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4138465707655996e-05, "grad_norm": 19.513158798217773, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8705283403396606, "num_tokens": 778810052.0, "step": 20410 }, { "epoch": 2.596488996310902, "ewc_loss": 0.03403828293085098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4038283047266304e-05, "grad_norm": 19.261737823486328, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8693416118621826, "num_tokens": 778850868.0, "step": 20411 }, { "epoch": 2.596616206589492, "ewc_loss": 0.0340217687189579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4021770261460915e-05, "grad_norm": 19.4699649810791, "learning_rate": 1e-06, "loss": 0.3684, "mean_token_accuracy": 0.8808923959732056, "num_tokens": 778890129.0, "step": 20412 }, { "epoch": 2.596743416868083, "ewc_loss": 0.03408079221844673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.408079282962717e-05, "grad_norm": 19.280797958374023, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8792787790298462, "num_tokens": 778923181.0, "step": 20413 }, { "epoch": 2.5968706271466733, "ewc_loss": 0.03404451161623001, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4044511266984046e-05, "grad_norm": 19.462764739990234, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8669458031654358, "num_tokens": 778963328.0, "step": 20414 }, { "epoch": 2.596997837425264, "ewc_loss": 0.03423420339822769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423420275794342e-05, "grad_norm": 19.41554069519043, "learning_rate": 1e-06, "loss": 0.3613, "mean_token_accuracy": 0.8850963711738586, "num_tokens": 779003932.0, "step": 20415 }, { "epoch": 2.5971250477038543, "ewc_loss": 0.03404996916651726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.404996823519468e-05, "grad_norm": 19.36789894104004, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8651001453399658, "num_tokens": 779043913.0, "step": 20416 }, { "epoch": 2.597252257982445, "ewc_loss": 0.0341305285692215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.413052763789892e-05, "grad_norm": 19.377763748168945, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8719597458839417, "num_tokens": 779077126.0, "step": 20417 }, { "epoch": 2.5973794682610354, "ewc_loss": 0.03414054214954376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4140542993554845e-05, "grad_norm": 19.514177322387695, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8798362016677856, "num_tokens": 779111544.0, "step": 20418 }, { "epoch": 2.597506678539626, "ewc_loss": 0.03418134152889252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.418134292587638e-05, "grad_norm": 19.396318435668945, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8732204437255859, "num_tokens": 779145493.0, "step": 20419 }, { "epoch": 2.5976338888182164, "ewc_loss": 0.034082114696502686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4082113415934145e-05, "grad_norm": 19.43800163269043, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8679694533348083, "num_tokens": 779189216.0, "step": 20420 }, { "epoch": 2.597761099096807, "ewc_loss": 0.03415442630648613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.415442552068271e-05, "grad_norm": 19.44558334350586, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.8824911117553711, "num_tokens": 779225914.0, "step": 20421 }, { "epoch": 2.5978883093753975, "ewc_loss": 0.03403746336698532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.403746450203471e-05, "grad_norm": 19.427814483642578, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8792879581451416, "num_tokens": 779264118.0, "step": 20422 }, { "epoch": 2.598015519653988, "ewc_loss": 0.03402743116021156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.402743095648475e-05, "grad_norm": 19.400714874267578, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.87860107421875, "num_tokens": 779301694.0, "step": 20423 }, { "epoch": 2.5981427299325786, "ewc_loss": 0.034027379006147385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.402738002478145e-05, "grad_norm": 19.4687442779541, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8744176626205444, "num_tokens": 779339012.0, "step": 20424 }, { "epoch": 2.598269940211169, "ewc_loss": 0.034039195626974106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4039196179946885e-05, "grad_norm": 19.358001708984375, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8797625303268433, "num_tokens": 779376358.0, "step": 20425 }, { "epoch": 2.5983971504897596, "ewc_loss": 0.03405557945370674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.405557799851522e-05, "grad_norm": 19.43378257751465, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8749047517776489, "num_tokens": 779416344.0, "step": 20426 }, { "epoch": 2.59852436076835, "ewc_loss": 0.034078288823366165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.407828990020789e-05, "grad_norm": 19.432924270629883, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8644928336143494, "num_tokens": 779452189.0, "step": 20427 }, { "epoch": 2.5986515710469407, "ewc_loss": 0.034047991037368774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.404799281270243e-05, "grad_norm": 19.346670150756836, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8738316297531128, "num_tokens": 779493513.0, "step": 20428 }, { "epoch": 2.598778781325531, "ewc_loss": 0.034118108451366425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.411810757825151e-05, "grad_norm": 19.468090057373047, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8809263706207275, "num_tokens": 779533061.0, "step": 20429 }, { "epoch": 2.5989059916041217, "ewc_loss": 0.03407646343111992, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.407646363484673e-05, "grad_norm": 19.364734649658203, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8677992224693298, "num_tokens": 779570159.0, "step": 20430 }, { "epoch": 2.5990332018827123, "ewc_loss": 0.03402870520949364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.402870424906723e-05, "grad_norm": 19.34860610961914, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8723971843719482, "num_tokens": 779609066.0, "step": 20431 }, { "epoch": 2.599160412161303, "ewc_loss": 0.03401299938559532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401299909455702e-05, "grad_norm": 19.374425888061523, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8743780851364136, "num_tokens": 779651087.0, "step": 20432 }, { "epoch": 2.5992876224398933, "ewc_loss": 0.03402281925082207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.402281799935736e-05, "grad_norm": 19.357990264892578, "learning_rate": 1e-06, "loss": 0.3559, "mean_token_accuracy": 0.8883347511291504, "num_tokens": 779689531.0, "step": 20433 }, { "epoch": 2.599414832718484, "ewc_loss": 0.03410257026553154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.410256977076642e-05, "grad_norm": 19.513898849487305, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8699089288711548, "num_tokens": 779724062.0, "step": 20434 }, { "epoch": 2.599542042997074, "ewc_loss": 0.034070901572704315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.407090116525069e-05, "grad_norm": 19.372087478637695, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8623814582824707, "num_tokens": 779766029.0, "step": 20435 }, { "epoch": 2.599669253275665, "ewc_loss": 0.033985257148742676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3985255868174136e-05, "grad_norm": 19.45209503173828, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8598778247833252, "num_tokens": 779811065.0, "step": 20436 }, { "epoch": 2.599796463554255, "ewc_loss": 0.03409939259290695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.409939381526783e-05, "grad_norm": 19.459897994995117, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8618607521057129, "num_tokens": 779852442.0, "step": 20437 }, { "epoch": 2.599923673832846, "ewc_loss": 0.03391718119382858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3917182008735836e-05, "grad_norm": 19.37559700012207, "learning_rate": 1e-06, "loss": 0.3631, "mean_token_accuracy": 0.8782464265823364, "num_tokens": 779885993.0, "step": 20438 }, { "epoch": 2.600050884111436, "ewc_loss": 0.0340428464114666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.40428450726904e-05, "grad_norm": 19.467731475830078, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8792456984519958, "num_tokens": 779927791.0, "step": 20439 }, { "epoch": 2.6001780943900266, "ewc_loss": 0.033989738672971725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.398973785806447e-05, "grad_norm": 19.39639663696289, "learning_rate": 1e-06, "loss": 0.3753, "mean_token_accuracy": 0.878563642501831, "num_tokens": 779961517.0, "step": 20440 }, { "epoch": 2.600305304668617, "ewc_loss": 0.033932656049728394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3932654332602397e-05, "grad_norm": 19.450870513916016, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8620501756668091, "num_tokens": 779992902.0, "step": 20441 }, { "epoch": 2.6004325149472076, "ewc_loss": 0.03403906896710396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4039068850688636e-05, "grad_norm": 19.437257766723633, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.8749542236328125, "num_tokens": 780032849.0, "step": 20442 }, { "epoch": 2.600559725225798, "ewc_loss": 0.03399508446455002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39950856869109e-05, "grad_norm": 19.376558303833008, "learning_rate": 1e-06, "loss": 0.3642, "mean_token_accuracy": 0.8868691921234131, "num_tokens": 780072563.0, "step": 20443 }, { "epoch": 2.6006869355043887, "ewc_loss": 0.0340389758348465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.403897426323965e-05, "grad_norm": 19.577089309692383, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8749144077301025, "num_tokens": 780115299.0, "step": 20444 }, { "epoch": 2.600814145782979, "ewc_loss": 0.03406848758459091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.406848918530159e-05, "grad_norm": 19.41072654724121, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8736670017242432, "num_tokens": 780155792.0, "step": 20445 }, { "epoch": 2.6009413560615697, "ewc_loss": 0.03386523574590683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.386523530934937e-05, "grad_norm": 19.472822189331055, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.870376706123352, "num_tokens": 780189040.0, "step": 20446 }, { "epoch": 2.6010685663401603, "ewc_loss": 0.034026023000478745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4026023058686405e-05, "grad_norm": 19.480363845825195, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8732670545578003, "num_tokens": 780229463.0, "step": 20447 }, { "epoch": 2.601195776618751, "ewc_loss": 0.03388950601220131, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388950426597148e-05, "grad_norm": 19.4250545501709, "learning_rate": 1e-06, "loss": 0.3517, "mean_token_accuracy": 0.8869910836219788, "num_tokens": 780265961.0, "step": 20448 }, { "epoch": 2.6013229868973413, "ewc_loss": 0.03401487320661545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401487265364267e-05, "grad_norm": 19.45209503173828, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8691275119781494, "num_tokens": 780303220.0, "step": 20449 }, { "epoch": 2.601450197175932, "ewc_loss": 0.033967986702919006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.396798638277687e-05, "grad_norm": 19.48275375366211, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8716632723808289, "num_tokens": 780340456.0, "step": 20450 }, { "epoch": 2.6015774074545224, "ewc_loss": 0.03394554555416107, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394554369151592e-05, "grad_norm": 19.366819381713867, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8725423216819763, "num_tokens": 780381402.0, "step": 20451 }, { "epoch": 2.601704617733113, "ewc_loss": 0.0340176559984684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4017655707430094e-05, "grad_norm": 19.481605529785156, "learning_rate": 1e-06, "loss": 0.3782, "mean_token_accuracy": 0.8789629340171814, "num_tokens": 780422571.0, "step": 20452 }, { "epoch": 2.6018318280117034, "ewc_loss": 0.03396478295326233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3964781323447824e-05, "grad_norm": 19.46124267578125, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8701469898223877, "num_tokens": 780462447.0, "step": 20453 }, { "epoch": 2.601959038290294, "ewc_loss": 0.03395022079348564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.395022213226184e-05, "grad_norm": 19.47313690185547, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8744690418243408, "num_tokens": 780499183.0, "step": 20454 }, { "epoch": 2.6020862485688845, "ewc_loss": 0.03395117446780205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39511752827093e-05, "grad_norm": 19.4219970703125, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8634105324745178, "num_tokens": 780539216.0, "step": 20455 }, { "epoch": 2.602213458847475, "ewc_loss": 0.03391684219241142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3916843676706776e-05, "grad_norm": 19.450048446655273, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8808536529541016, "num_tokens": 780578725.0, "step": 20456 }, { "epoch": 2.6023406691260655, "ewc_loss": 0.03403428941965103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.403428854653612e-05, "grad_norm": 19.506072998046875, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8753060102462769, "num_tokens": 780615175.0, "step": 20457 }, { "epoch": 2.6024678794046556, "ewc_loss": 0.03391676023602486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391676000319421e-05, "grad_norm": 19.4333438873291, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8778298497200012, "num_tokens": 780651186.0, "step": 20458 }, { "epoch": 2.6025950896832466, "ewc_loss": 0.03394678980112076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3946791518246755e-05, "grad_norm": 19.422531127929688, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.867362916469574, "num_tokens": 780685550.0, "step": 20459 }, { "epoch": 2.6027222999618367, "ewc_loss": 0.03401130437850952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401130379643291e-05, "grad_norm": 19.440128326416016, "learning_rate": 1e-06, "loss": 0.3239, "mean_token_accuracy": 0.8953717947006226, "num_tokens": 780725477.0, "step": 20460 }, { "epoch": 2.6028495102404277, "ewc_loss": 0.03401024267077446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401024150662124e-05, "grad_norm": 19.460834503173828, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8763514757156372, "num_tokens": 780762481.0, "step": 20461 }, { "epoch": 2.6029767205190177, "ewc_loss": 0.033978600054979324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.397859836695716e-05, "grad_norm": 19.40517807006836, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8548763990402222, "num_tokens": 780800305.0, "step": 20462 }, { "epoch": 2.6031039307976087, "ewc_loss": 0.03401840105652809, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401840149308555e-05, "grad_norm": 19.421035766601562, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8758612275123596, "num_tokens": 780836271.0, "step": 20463 }, { "epoch": 2.603231141076199, "ewc_loss": 0.033976323902606964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3976324630202726e-05, "grad_norm": 19.48056411743164, "learning_rate": 1e-06, "loss": 0.3501, "mean_token_accuracy": 0.8865946531295776, "num_tokens": 780877363.0, "step": 20464 }, { "epoch": 2.6033583513547893, "ewc_loss": 0.03401290252804756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4012900869129226e-05, "grad_norm": 19.349966049194336, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.86312335729599, "num_tokens": 780922982.0, "step": 20465 }, { "epoch": 2.60348556163338, "ewc_loss": 0.03393932431936264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3939322747755796e-05, "grad_norm": 19.53146743774414, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8734586238861084, "num_tokens": 780957925.0, "step": 20466 }, { "epoch": 2.6036127719119704, "ewc_loss": 0.03408518061041832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4085180232068524e-05, "grad_norm": 19.427167892456055, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8566957712173462, "num_tokens": 780996922.0, "step": 20467 }, { "epoch": 2.603739982190561, "ewc_loss": 0.03388278931379318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388278855709359e-05, "grad_norm": 19.48110580444336, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.857720136642456, "num_tokens": 781037209.0, "step": 20468 }, { "epoch": 2.6038671924691514, "ewc_loss": 0.0340384878218174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.40384867740795e-05, "grad_norm": 19.40935707092285, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.867843508720398, "num_tokens": 781078077.0, "step": 20469 }, { "epoch": 2.603994402747742, "ewc_loss": 0.033948417752981186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3948417694773525e-05, "grad_norm": 19.451623916625977, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8732016086578369, "num_tokens": 781122516.0, "step": 20470 }, { "epoch": 2.6041216130263325, "ewc_loss": 0.03404626622796059, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4046264772769064e-05, "grad_norm": 19.422832489013672, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8664520382881165, "num_tokens": 781156290.0, "step": 20471 }, { "epoch": 2.604248823304923, "ewc_loss": 0.03397364914417267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.397365071577951e-05, "grad_norm": 19.43643569946289, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8822889924049377, "num_tokens": 781192769.0, "step": 20472 }, { "epoch": 2.6043760335835135, "ewc_loss": 0.03403883054852486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4038832382066175e-05, "grad_norm": 19.516624450683594, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8786354660987854, "num_tokens": 781227594.0, "step": 20473 }, { "epoch": 2.604503243862104, "ewc_loss": 0.033942028880119324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394202940398827e-05, "grad_norm": 19.38971519470215, "learning_rate": 1e-06, "loss": 0.3476, "mean_token_accuracy": 0.8855606317520142, "num_tokens": 781259369.0, "step": 20474 }, { "epoch": 2.6046304541406946, "ewc_loss": 0.033967774361371994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.396777538000606e-05, "grad_norm": 19.470142364501953, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8751298189163208, "num_tokens": 781296788.0, "step": 20475 }, { "epoch": 2.604757664419285, "ewc_loss": 0.03396353870630264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39635371346958e-05, "grad_norm": 19.42464256286621, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.876416802406311, "num_tokens": 781333103.0, "step": 20476 }, { "epoch": 2.6048848746978757, "ewc_loss": 0.033975739032030106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3975738915614784e-05, "grad_norm": 19.41754722595215, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.8832912445068359, "num_tokens": 781372691.0, "step": 20477 }, { "epoch": 2.605012084976466, "ewc_loss": 0.034055616706609726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4055618016282097e-05, "grad_norm": 19.514631271362305, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8703000545501709, "num_tokens": 781410480.0, "step": 20478 }, { "epoch": 2.6051392952550567, "ewc_loss": 0.03402377665042877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.402377478778362e-05, "grad_norm": 19.428930282592773, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8655880689620972, "num_tokens": 781454110.0, "step": 20479 }, { "epoch": 2.6052665055336472, "ewc_loss": 0.03403564915060997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.403564915060997e-05, "grad_norm": 19.468198776245117, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8707221746444702, "num_tokens": 781494034.0, "step": 20480 }, { "epoch": 2.6053937158122378, "ewc_loss": 0.03404513746500015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4045136999338865e-05, "grad_norm": 19.370637893676758, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8720787763595581, "num_tokens": 781536225.0, "step": 20481 }, { "epoch": 2.6055209260908283, "ewc_loss": 0.034079767763614655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.407976691960357e-05, "grad_norm": 19.51880645751953, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.8808305263519287, "num_tokens": 781573987.0, "step": 20482 }, { "epoch": 2.6056481363694184, "ewc_loss": 0.034138333052396774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.413833474041894e-05, "grad_norm": 19.448816299438477, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.8790618181228638, "num_tokens": 781608987.0, "step": 20483 }, { "epoch": 2.6057753466480094, "ewc_loss": 0.03397906571626663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3979064028244466e-05, "grad_norm": 19.4796085357666, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.8812386989593506, "num_tokens": 781646185.0, "step": 20484 }, { "epoch": 2.6059025569265994, "ewc_loss": 0.03411528468132019, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.411528450669721e-05, "grad_norm": 19.449174880981445, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8644231557846069, "num_tokens": 781682999.0, "step": 20485 }, { "epoch": 2.6060297672051904, "ewc_loss": 0.034019019454717636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4019019949482754e-05, "grad_norm": 19.46933364868164, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8719040751457214, "num_tokens": 781726052.0, "step": 20486 }, { "epoch": 2.6061569774837805, "ewc_loss": 0.03398239240050316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3982392778852955e-05, "grad_norm": 19.479251861572266, "learning_rate": 1e-06, "loss": 0.3455, "mean_token_accuracy": 0.8882140517234802, "num_tokens": 781760562.0, "step": 20487 }, { "epoch": 2.6062841877623715, "ewc_loss": 0.033966559916734695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.396656029508449e-05, "grad_norm": 19.36837387084961, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8633058667182922, "num_tokens": 781803765.0, "step": 20488 }, { "epoch": 2.6064113980409616, "ewc_loss": 0.03395885229110718, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3958851417992264e-05, "grad_norm": 19.491214752197266, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8756001591682434, "num_tokens": 781845419.0, "step": 20489 }, { "epoch": 2.606538608319552, "ewc_loss": 0.03402711823582649, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.402711809030734e-05, "grad_norm": 19.43460464477539, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8831303119659424, "num_tokens": 781883371.0, "step": 20490 }, { "epoch": 2.6066658185981426, "ewc_loss": 0.033970288932323456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.397028922336176e-05, "grad_norm": 19.411142349243164, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8735880851745605, "num_tokens": 781919669.0, "step": 20491 }, { "epoch": 2.606793028876733, "ewc_loss": 0.03408270701766014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.40827064064797e-05, "grad_norm": 19.538286209106445, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8806344270706177, "num_tokens": 781953212.0, "step": 20492 }, { "epoch": 2.6069202391553237, "ewc_loss": 0.034037988632917404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.403798837098293e-05, "grad_norm": 19.41549301147461, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.87871253490448, "num_tokens": 781996824.0, "step": 20493 }, { "epoch": 2.607047449433914, "ewc_loss": 0.03393446281552315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393446240806952e-05, "grad_norm": 19.40883445739746, "learning_rate": 1e-06, "loss": 0.3284, "mean_token_accuracy": 0.8934604525566101, "num_tokens": 782035350.0, "step": 20494 }, { "epoch": 2.6071746597125047, "ewc_loss": 0.034077100455760956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.407710028113797e-05, "grad_norm": 19.588829040527344, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8691404461860657, "num_tokens": 782073217.0, "step": 20495 }, { "epoch": 2.6073018699910953, "ewc_loss": 0.03396536409854889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.396536340005696e-05, "grad_norm": 19.441360473632812, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8683647513389587, "num_tokens": 782110777.0, "step": 20496 }, { "epoch": 2.607429080269686, "ewc_loss": 0.0339619405567646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3961940061999485e-05, "grad_norm": 19.491683959960938, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8686212301254272, "num_tokens": 782153300.0, "step": 20497 }, { "epoch": 2.6075562905482763, "ewc_loss": 0.034047842025756836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.404784365557134e-05, "grad_norm": 19.52678871154785, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.879862904548645, "num_tokens": 782191945.0, "step": 20498 }, { "epoch": 2.607683500826867, "ewc_loss": 0.03401797264814377, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401797221158631e-05, "grad_norm": 19.5699520111084, "learning_rate": 1e-06, "loss": 0.3587, "mean_token_accuracy": 0.8847899436950684, "num_tokens": 782230472.0, "step": 20499 }, { "epoch": 2.6078107111054574, "ewc_loss": 0.0339992381632328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.399923662072979e-05, "grad_norm": 19.472829818725586, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8652883768081665, "num_tokens": 782263868.0, "step": 20500 }, { "epoch": 2.607937921384048, "ewc_loss": 0.033884089440107346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.388409095350653e-05, "grad_norm": 19.528156280517578, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8772092461585999, "num_tokens": 782299781.0, "step": 20501 }, { "epoch": 2.6080651316626384, "ewc_loss": 0.0340106226503849, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4010623494395986e-05, "grad_norm": 19.519084930419922, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8684377670288086, "num_tokens": 782343376.0, "step": 20502 }, { "epoch": 2.608192341941229, "ewc_loss": 0.03390857204794884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.390857091289945e-05, "grad_norm": 19.49153709411621, "learning_rate": 1e-06, "loss": 0.3603, "mean_token_accuracy": 0.8856430053710938, "num_tokens": 782389894.0, "step": 20503 }, { "epoch": 2.6083195522198195, "ewc_loss": 0.033967528492212296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.396752799744718e-05, "grad_norm": 19.40708351135254, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.879112720489502, "num_tokens": 782430055.0, "step": 20504 }, { "epoch": 2.60844676249841, "ewc_loss": 0.03391098603606224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.391098653082736e-05, "grad_norm": 19.48409652709961, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8761364817619324, "num_tokens": 782467211.0, "step": 20505 }, { "epoch": 2.6085739727770005, "ewc_loss": 0.03400062024593353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4000619052676484e-05, "grad_norm": 19.48613166809082, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8662490844726562, "num_tokens": 782505913.0, "step": 20506 }, { "epoch": 2.608701183055591, "ewc_loss": 0.03397186100482941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3971860830206424e-05, "grad_norm": 19.47979736328125, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8734129071235657, "num_tokens": 782541858.0, "step": 20507 }, { "epoch": 2.608828393334181, "ewc_loss": 0.03395625203847885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3956250263145193e-05, "grad_norm": 19.499652862548828, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8748451471328735, "num_tokens": 782575854.0, "step": 20508 }, { "epoch": 2.608955603612772, "ewc_loss": 0.03399216756224632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.399216802790761e-05, "grad_norm": 19.414175033569336, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8766618371009827, "num_tokens": 782614005.0, "step": 20509 }, { "epoch": 2.609082813891362, "ewc_loss": 0.03391526639461517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39152647939045e-05, "grad_norm": 19.512178421020508, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8817121982574463, "num_tokens": 782650147.0, "step": 20510 }, { "epoch": 2.609210024169953, "ewc_loss": 0.03399788588285446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.399788693059236e-05, "grad_norm": 19.45592498779297, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8748131990432739, "num_tokens": 782683215.0, "step": 20511 }, { "epoch": 2.6093372344485433, "ewc_loss": 0.03397154062986374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39715406880714e-05, "grad_norm": 19.523799896240234, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.8798090219497681, "num_tokens": 782721184.0, "step": 20512 }, { "epoch": 2.609464444727134, "ewc_loss": 0.03400871530175209, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.400871355552226e-05, "grad_norm": 19.45247459411621, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8695869445800781, "num_tokens": 782759891.0, "step": 20513 }, { "epoch": 2.6095916550057243, "ewc_loss": 0.03396093100309372, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.396093234186992e-05, "grad_norm": 19.46942138671875, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8650067448616028, "num_tokens": 782796398.0, "step": 20514 }, { "epoch": 2.609718865284315, "ewc_loss": 0.03401012718677521, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401012872927822e-05, "grad_norm": 19.452816009521484, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.8841580748558044, "num_tokens": 782835968.0, "step": 20515 }, { "epoch": 2.6098460755629054, "ewc_loss": 0.03396047651767731, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3960477594519034e-05, "grad_norm": 19.46194839477539, "learning_rate": 1e-06, "loss": 0.3531, "mean_token_accuracy": 0.8862443566322327, "num_tokens": 782871367.0, "step": 20516 }, { "epoch": 2.609973285841496, "ewc_loss": 0.03399602696299553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3996027923421934e-05, "grad_norm": 19.398448944091797, "learning_rate": 1e-06, "loss": 0.3375, "mean_token_accuracy": 0.8947685956954956, "num_tokens": 782905766.0, "step": 20517 }, { "epoch": 2.6101004961200864, "ewc_loss": 0.033958468586206436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.395846943021752e-05, "grad_norm": 19.478769302368164, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.8792297840118408, "num_tokens": 782942256.0, "step": 20518 }, { "epoch": 2.610227706398677, "ewc_loss": 0.03401629254221916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4016291465377435e-05, "grad_norm": 19.402158737182617, "learning_rate": 1e-06, "loss": 0.3654, "mean_token_accuracy": 0.8853474855422974, "num_tokens": 782976156.0, "step": 20519 }, { "epoch": 2.6103549166772675, "ewc_loss": 0.03398871049284935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.398871194804087e-05, "grad_norm": 19.487342834472656, "learning_rate": 1e-06, "loss": 0.3682, "mean_token_accuracy": 0.8839987516403198, "num_tokens": 783011450.0, "step": 20520 }, { "epoch": 2.610482126955858, "ewc_loss": 0.03410952165722847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4109521948266774e-05, "grad_norm": 19.506046295166016, "learning_rate": 1e-06, "loss": 0.3675, "mean_token_accuracy": 0.880275309085846, "num_tokens": 783050355.0, "step": 20521 }, { "epoch": 2.6106093372344485, "ewc_loss": 0.033996596932411194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.399659544811584e-05, "grad_norm": 19.368452072143555, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.8860641121864319, "num_tokens": 783091621.0, "step": 20522 }, { "epoch": 2.610736547513039, "ewc_loss": 0.034025873988866806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4025873901555315e-05, "grad_norm": 19.54315948486328, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8646451830863953, "num_tokens": 783130471.0, "step": 20523 }, { "epoch": 2.6108637577916296, "ewc_loss": 0.034092072397470474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.409207420190796e-05, "grad_norm": 19.457284927368164, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.879942774772644, "num_tokens": 783168340.0, "step": 20524 }, { "epoch": 2.61099096807022, "ewc_loss": 0.033943936228752136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.394393570488319e-05, "grad_norm": 19.43659019470215, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8564632534980774, "num_tokens": 783207279.0, "step": 20525 }, { "epoch": 2.6111181783488107, "ewc_loss": 0.03404604271054268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.404604285606183e-05, "grad_norm": 19.52218246459961, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8683449029922485, "num_tokens": 783243010.0, "step": 20526 }, { "epoch": 2.611245388627401, "ewc_loss": 0.034005679190158844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.400567948119715e-05, "grad_norm": 19.42243766784668, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8647639155387878, "num_tokens": 783287548.0, "step": 20527 }, { "epoch": 2.6113725989059917, "ewc_loss": 0.03398411348462105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.398411354282871e-05, "grad_norm": 19.38315773010254, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.872771680355072, "num_tokens": 783331549.0, "step": 20528 }, { "epoch": 2.6114998091845822, "ewc_loss": 0.034112147986888885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4112148568965495e-05, "grad_norm": 19.412263870239258, "learning_rate": 1e-06, "loss": 0.3651, "mean_token_accuracy": 0.8844432830810547, "num_tokens": 783369445.0, "step": 20529 }, { "epoch": 2.6116270194631728, "ewc_loss": 0.0341055803000927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4105582017218694e-05, "grad_norm": 19.473814010620117, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8710856437683105, "num_tokens": 783413218.0, "step": 20530 }, { "epoch": 2.6117542297417633, "ewc_loss": 0.03410061448812485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4100616176147014e-05, "grad_norm": 19.54024887084961, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8745782375335693, "num_tokens": 783449944.0, "step": 20531 }, { "epoch": 2.611881440020354, "ewc_loss": 0.034007370471954346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4007371141342446e-05, "grad_norm": 19.380550384521484, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.875617504119873, "num_tokens": 783486142.0, "step": 20532 }, { "epoch": 2.612008650298944, "ewc_loss": 0.03405367583036423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.405367533559911e-05, "grad_norm": 19.401264190673828, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8813726902008057, "num_tokens": 783521170.0, "step": 20533 }, { "epoch": 2.612135860577535, "ewc_loss": 0.034096188843250275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.409618875593878e-05, "grad_norm": 19.372371673583984, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8735494613647461, "num_tokens": 783563902.0, "step": 20534 }, { "epoch": 2.612263070856125, "ewc_loss": 0.034082937985658646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4082939237123355e-05, "grad_norm": 19.443424224853516, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8789016008377075, "num_tokens": 783597655.0, "step": 20535 }, { "epoch": 2.612390281134716, "ewc_loss": 0.03408421576023102, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4084216167684644e-05, "grad_norm": 19.365253448486328, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8653046488761902, "num_tokens": 783635233.0, "step": 20536 }, { "epoch": 2.612517491413306, "ewc_loss": 0.03412759304046631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.41275917890016e-05, "grad_norm": 19.437721252441406, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8763048052787781, "num_tokens": 783679677.0, "step": 20537 }, { "epoch": 2.6126447016918966, "ewc_loss": 0.034142352640628815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.414235106902197e-05, "grad_norm": 19.413114547729492, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.8818331956863403, "num_tokens": 783720608.0, "step": 20538 }, { "epoch": 2.612771911970487, "ewc_loss": 0.0340854786336422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4085478546330705e-05, "grad_norm": 19.495595932006836, "learning_rate": 1e-06, "loss": 0.3602, "mean_token_accuracy": 0.8853211998939514, "num_tokens": 783753198.0, "step": 20539 }, { "epoch": 2.6128991222490776, "ewc_loss": 0.0341339074075222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.413390732021071e-05, "grad_norm": 19.407184600830078, "learning_rate": 1e-06, "loss": 0.3846, "mean_token_accuracy": 0.8749939203262329, "num_tokens": 783791068.0, "step": 20540 }, { "epoch": 2.613026332527668, "ewc_loss": 0.03405790030956268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.405789902899414e-05, "grad_norm": 19.4178409576416, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8749352693557739, "num_tokens": 783831319.0, "step": 20541 }, { "epoch": 2.6131535428062587, "ewc_loss": 0.03410721942782402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4107219107681885e-05, "grad_norm": 19.33875846862793, "learning_rate": 1e-06, "loss": 0.3691, "mean_token_accuracy": 0.8788750767707825, "num_tokens": 783867537.0, "step": 20542 }, { "epoch": 2.613280753084849, "ewc_loss": 0.03409182280302048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.409182318137027e-05, "grad_norm": 19.458471298217773, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8704802989959717, "num_tokens": 783907787.0, "step": 20543 }, { "epoch": 2.6134079633634397, "ewc_loss": 0.034136202186346054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4136202884837985e-05, "grad_norm": 19.38519859313965, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8687623143196106, "num_tokens": 783948558.0, "step": 20544 }, { "epoch": 2.6135351736420303, "ewc_loss": 0.03407523036003113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.407523036003113e-05, "grad_norm": 19.502824783325195, "learning_rate": 1e-06, "loss": 0.3399, "mean_token_accuracy": 0.8930745720863342, "num_tokens": 783986726.0, "step": 20545 }, { "epoch": 2.613662383920621, "ewc_loss": 0.034084878861904144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4084878279827535e-05, "grad_norm": 19.327905654907227, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8580746650695801, "num_tokens": 784023449.0, "step": 20546 }, { "epoch": 2.6137895941992113, "ewc_loss": 0.034049127250909805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4049127862090245e-05, "grad_norm": 19.427621841430664, "learning_rate": 1e-06, "loss": 0.3394, "mean_token_accuracy": 0.8915539383888245, "num_tokens": 784059529.0, "step": 20547 }, { "epoch": 2.613916804477802, "ewc_loss": 0.03418594226241112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.418594133108854e-05, "grad_norm": 19.471839904785156, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8761457204818726, "num_tokens": 784103530.0, "step": 20548 }, { "epoch": 2.6140440147563924, "ewc_loss": 0.03407822549343109, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4078224416589364e-05, "grad_norm": 19.454116821289062, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8702050447463989, "num_tokens": 784141721.0, "step": 20549 }, { "epoch": 2.614171225034983, "ewc_loss": 0.03415822982788086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4158230846514925e-05, "grad_norm": 19.4698543548584, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8763349652290344, "num_tokens": 784175868.0, "step": 20550 }, { "epoch": 2.6142984353135734, "ewc_loss": 0.034018415957689285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401841604500078e-05, "grad_norm": 19.43985939025879, "learning_rate": 1e-06, "loss": 0.3543, "mean_token_accuracy": 0.8862743377685547, "num_tokens": 784210646.0, "step": 20551 }, { "epoch": 2.614425645592164, "ewc_loss": 0.03411329537630081, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.411329453228973e-05, "grad_norm": 19.509532928466797, "learning_rate": 1e-06, "loss": 0.378, "mean_token_accuracy": 0.8814371228218079, "num_tokens": 784255588.0, "step": 20552 }, { "epoch": 2.6145528558707545, "ewc_loss": 0.03406204283237457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.406204268685542e-05, "grad_norm": 19.460460662841797, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8774107694625854, "num_tokens": 784292877.0, "step": 20553 }, { "epoch": 2.614680066149345, "ewc_loss": 0.03405885398387909, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.40588521794416e-05, "grad_norm": 19.542377471923828, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.879779577255249, "num_tokens": 784336150.0, "step": 20554 }, { "epoch": 2.6148072764279355, "ewc_loss": 0.03401288390159607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.401288267923519e-05, "grad_norm": 19.412137985229492, "learning_rate": 1e-06, "loss": 0.3447, "mean_token_accuracy": 0.8877878189086914, "num_tokens": 784371547.0, "step": 20555 }, { "epoch": 2.6149344867065256, "ewc_loss": 0.03406035155057907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.406035102671012e-05, "grad_norm": 19.562110900878906, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8778340816497803, "num_tokens": 784411473.0, "step": 20556 }, { "epoch": 2.6150616969851166, "ewc_loss": 0.03411281853914261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4112817957066e-05, "grad_norm": 19.477718353271484, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8662463426589966, "num_tokens": 784456713.0, "step": 20557 }, { "epoch": 2.6151889072637067, "ewc_loss": 0.03393658623099327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393658698769286e-05, "grad_norm": 19.48001480102539, "learning_rate": 1e-06, "loss": 0.3449, "mean_token_accuracy": 0.8895833492279053, "num_tokens": 784493685.0, "step": 20558 }, { "epoch": 2.6153161175422976, "ewc_loss": 0.034000542014837265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4000542655121535e-05, "grad_norm": 19.43218231201172, "learning_rate": 1e-06, "loss": 0.3581, "mean_token_accuracy": 0.8872461318969727, "num_tokens": 784534641.0, "step": 20559 }, { "epoch": 2.6154433278208877, "ewc_loss": 0.03399273008108139, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.399273191462271e-05, "grad_norm": 19.537532806396484, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8529782891273499, "num_tokens": 784576184.0, "step": 20560 }, { "epoch": 2.6155705380994787, "ewc_loss": 0.03401478752493858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4014788980130106e-05, "grad_norm": 19.45130157470703, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8803333044052124, "num_tokens": 784613851.0, "step": 20561 }, { "epoch": 2.615697748378069, "ewc_loss": 0.033929578959941864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3929580240510404e-05, "grad_norm": 19.483314514160156, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8775933980941772, "num_tokens": 784654133.0, "step": 20562 }, { "epoch": 2.6158249586566593, "ewc_loss": 0.033997539430856705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.399753768462688e-05, "grad_norm": 19.443723678588867, "learning_rate": 1e-06, "loss": 0.3789, "mean_token_accuracy": 0.8760665655136108, "num_tokens": 784690529.0, "step": 20563 }, { "epoch": 2.61595216893525, "ewc_loss": 0.033966731280088425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3966731280088425e-05, "grad_norm": 19.415163040161133, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8746694326400757, "num_tokens": 784731972.0, "step": 20564 }, { "epoch": 2.6160793792138404, "ewc_loss": 0.034030161798000336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4030163078568876e-05, "grad_norm": 19.435611724853516, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.871404767036438, "num_tokens": 784773275.0, "step": 20565 }, { "epoch": 2.616206589492431, "ewc_loss": 0.03396354988217354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.396354804863222e-05, "grad_norm": 19.521772384643555, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8607758283615112, "num_tokens": 784811468.0, "step": 20566 }, { "epoch": 2.6163337997710214, "ewc_loss": 0.034000080078840256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4000080631813034e-05, "grad_norm": 19.43553352355957, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8776533007621765, "num_tokens": 784850611.0, "step": 20567 }, { "epoch": 2.616461010049612, "ewc_loss": 0.03389480710029602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.389480843907222e-05, "grad_norm": 19.464767456054688, "learning_rate": 1e-06, "loss": 0.3577, "mean_token_accuracy": 0.8815695643424988, "num_tokens": 784886502.0, "step": 20568 }, { "epoch": 2.6165882203282025, "ewc_loss": 0.03396539017558098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.396538886590861e-05, "grad_norm": 19.499242782592773, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8670157790184021, "num_tokens": 784924489.0, "step": 20569 }, { "epoch": 2.616715430606793, "ewc_loss": 0.03394363820552826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3943637390621006e-05, "grad_norm": 19.488386154174805, "learning_rate": 1e-06, "loss": 0.3741, "mean_token_accuracy": 0.8802223205566406, "num_tokens": 784957224.0, "step": 20570 }, { "epoch": 2.6168426408853835, "ewc_loss": 0.03397878631949425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.397878754185513e-05, "grad_norm": 19.521028518676758, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8520140647888184, "num_tokens": 784997754.0, "step": 20571 }, { "epoch": 2.616969851163974, "ewc_loss": 0.033951535820961, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39515354426112e-05, "grad_norm": 19.46855354309082, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8608345985412598, "num_tokens": 785033932.0, "step": 20572 }, { "epoch": 2.6170970614425646, "ewc_loss": 0.03393954038619995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.393954102648422e-05, "grad_norm": 19.48725128173828, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.872757077217102, "num_tokens": 785073190.0, "step": 20573 }, { "epoch": 2.617224271721155, "ewc_loss": 0.03397964686155319, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.39796461048536e-05, "grad_norm": 19.499568939208984, "learning_rate": 1e-06, "loss": 0.3372, "mean_token_accuracy": 0.8916193246841431, "num_tokens": 785108397.0, "step": 20574 }, { "epoch": 2.6173514819997457, "ewc_loss": 0.033953309059143066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.395331077626906e-05, "grad_norm": 19.41256332397461, "learning_rate": 1e-06, "loss": 0.3167, "mean_token_accuracy": 0.9015390872955322, "num_tokens": 785142671.0, "step": 20575 }, { "epoch": 2.617478692278336, "ewc_loss": 0.03395316004753113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.395316161913797e-05, "grad_norm": 19.513700485229492, "learning_rate": 1e-06, "loss": 0.3883, "mean_token_accuracy": 0.8747549057006836, "num_tokens": 785183467.0, "step": 20576 }, { "epoch": 2.6176059025569267, "ewc_loss": 0.03399664908647537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.399665001779795e-05, "grad_norm": 19.420166015625, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8649211525917053, "num_tokens": 785222072.0, "step": 20577 }, { "epoch": 2.6177331128355172, "ewc_loss": 0.03395162895321846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.395163003006019e-05, "grad_norm": 19.46896743774414, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8644263744354248, "num_tokens": 785260458.0, "step": 20578 }, { "epoch": 2.6178603231141078, "ewc_loss": 0.03405512496829033, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.405512325116433e-05, "grad_norm": 19.482276916503906, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.862899661064148, "num_tokens": 785302434.0, "step": 20579 }, { "epoch": 2.6179875333926983, "ewc_loss": 0.033963218331336975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.3963216992560774e-05, "grad_norm": 19.478252410888672, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8614979982376099, "num_tokens": 785338251.0, "step": 20580 }, { "epoch": 2.6181147436712884, "ewc_loss": 0.03399747610092163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.399747583898716e-05, "grad_norm": 19.471681594848633, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8712711334228516, "num_tokens": 785384789.0, "step": 20581 }, { "epoch": 2.6182419539498794, "ewc_loss": 0.03395744785666466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.395744715817273e-05, "grad_norm": 19.377826690673828, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8753020763397217, "num_tokens": 785424978.0, "step": 20582 }, { "epoch": 2.6183691642284694, "ewc_loss": 0.03402632474899292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4026325010927394e-05, "grad_norm": 19.474828720092773, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8671650886535645, "num_tokens": 785464421.0, "step": 20583 }, { "epoch": 2.6184963745070604, "ewc_loss": 0.034099701792001724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4099703043466434e-05, "grad_norm": 19.466114044189453, "learning_rate": 1e-06, "loss": 0.3267, "mean_token_accuracy": 0.8936970829963684, "num_tokens": 785496166.0, "step": 20584 }, { "epoch": 2.6186235847856505, "ewc_loss": 0.03400130569934845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4001306630671024e-05, "grad_norm": 19.422225952148438, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8605140447616577, "num_tokens": 785528504.0, "step": 20585 }, { "epoch": 2.6187507950642415, "ewc_loss": 0.034013427793979645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4013428376056254e-05, "grad_norm": 19.47391128540039, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.879456639289856, "num_tokens": 785570896.0, "step": 20586 }, { "epoch": 2.6188780053428315, "ewc_loss": 0.03405633196234703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4056331060128286e-05, "grad_norm": 19.44846534729004, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8722835183143616, "num_tokens": 785612464.0, "step": 20587 }, { "epoch": 2.619005215621422, "ewc_loss": 0.03403227776288986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4032276744255796e-05, "grad_norm": 19.489561080932617, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8743418455123901, "num_tokens": 785655500.0, "step": 20588 }, { "epoch": 2.6191324259000126, "ewc_loss": 0.034025970846414566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4025972126983106e-05, "grad_norm": 19.410865783691406, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8779851198196411, "num_tokens": 785691186.0, "step": 20589 }, { "epoch": 2.619259636178603, "ewc_loss": 0.03405241668224335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4052416594931856e-05, "grad_norm": 19.496723175048828, "learning_rate": 1e-06, "loss": 0.3546, "mean_token_accuracy": 0.8875479698181152, "num_tokens": 785728575.0, "step": 20590 }, { "epoch": 2.6193868464571937, "ewc_loss": 0.03403649106621742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.403648952371441e-05, "grad_norm": 19.421005249023438, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8733339309692383, "num_tokens": 785765782.0, "step": 20591 }, { "epoch": 2.619514056735784, "ewc_loss": 0.03405735269188881, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.405735333217308e-05, "grad_norm": 19.447708129882812, "learning_rate": 1e-06, "loss": 0.3609, "mean_token_accuracy": 0.8804089426994324, "num_tokens": 785800130.0, "step": 20592 }, { "epoch": 2.6196412670143747, "ewc_loss": 0.03410135209560394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4101351047866046e-05, "grad_norm": 19.382888793945312, "learning_rate": 1e-06, "loss": 0.3062, "mean_token_accuracy": 0.9029037952423096, "num_tokens": 785836647.0, "step": 20593 }, { "epoch": 2.6197684772929652, "ewc_loss": 0.034116365015506744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4116364986402914e-05, "grad_norm": 19.488082885742188, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8696001172065735, "num_tokens": 785873897.0, "step": 20594 }, { "epoch": 2.6198956875715558, "ewc_loss": 0.03414272889494896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.414272941881791e-05, "grad_norm": 19.439620971679688, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8712543845176697, "num_tokens": 785914296.0, "step": 20595 }, { "epoch": 2.6200228978501463, "ewc_loss": 0.03405771777033806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.405771713005379e-05, "grad_norm": 19.453550338745117, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8631885051727295, "num_tokens": 785951426.0, "step": 20596 }, { "epoch": 2.620150108128737, "ewc_loss": 0.03414294123649597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.414294042158872e-05, "grad_norm": 19.42298698425293, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8780462741851807, "num_tokens": 785989906.0, "step": 20597 }, { "epoch": 2.6202773184073274, "ewc_loss": 0.03413263335824013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.413263402762823e-05, "grad_norm": 19.458330154418945, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8699410557746887, "num_tokens": 786024763.0, "step": 20598 }, { "epoch": 2.620404528685918, "ewc_loss": 0.03407823294401169, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.407823169254698e-05, "grad_norm": 19.416183471679688, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8711789846420288, "num_tokens": 786063055.0, "step": 20599 }, { "epoch": 2.6205317389645084, "ewc_loss": 0.03410803899168968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.410803765291348e-05, "grad_norm": 19.441791534423828, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8505971431732178, "num_tokens": 786097675.0, "step": 20600 }, { "epoch": 2.620658949243099, "ewc_loss": 0.03415215387940407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.415215542190708e-05, "grad_norm": 19.409711837768555, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.8876873850822449, "num_tokens": 786133721.0, "step": 20601 }, { "epoch": 2.6207861595216895, "ewc_loss": 0.03413413465023041, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.413413287489675e-05, "grad_norm": 19.548564910888672, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8570401668548584, "num_tokens": 786179615.0, "step": 20602 }, { "epoch": 2.62091336980028, "ewc_loss": 0.03415431082248688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.415430910536088e-05, "grad_norm": 19.39867401123047, "learning_rate": 1e-06, "loss": 0.35, "mean_token_accuracy": 0.888749361038208, "num_tokens": 786214945.0, "step": 20603 }, { "epoch": 2.6210405800788705, "ewc_loss": 0.034132394939661026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.413239392102696e-05, "grad_norm": 19.507675170898438, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8779207468032837, "num_tokens": 786250880.0, "step": 20604 }, { "epoch": 2.621167790357461, "ewc_loss": 0.03417295590043068, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.417295738472603e-05, "grad_norm": 19.42167091369629, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8755131363868713, "num_tokens": 786286095.0, "step": 20605 }, { "epoch": 2.621295000636051, "ewc_loss": 0.03414576128125191, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4145759855164215e-05, "grad_norm": 19.482879638671875, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8804616332054138, "num_tokens": 786319547.0, "step": 20606 }, { "epoch": 2.621422210914642, "ewc_loss": 0.034154780209064484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4154778404626995e-05, "grad_norm": 19.431011199951172, "learning_rate": 1e-06, "loss": 0.3883, "mean_token_accuracy": 0.874683141708374, "num_tokens": 786358699.0, "step": 20607 }, { "epoch": 2.621549421193232, "ewc_loss": 0.034126169979572296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4126169339288026e-05, "grad_norm": 19.39729881286621, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8788686990737915, "num_tokens": 786399732.0, "step": 20608 }, { "epoch": 2.621676631471823, "ewc_loss": 0.03416677564382553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4166776458732784e-05, "grad_norm": 19.421741485595703, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8661835193634033, "num_tokens": 786437375.0, "step": 20609 }, { "epoch": 2.6218038417504133, "ewc_loss": 0.03414108604192734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.41410850523971e-05, "grad_norm": 19.394981384277344, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8650637269020081, "num_tokens": 786475538.0, "step": 20610 }, { "epoch": 2.621931052029004, "ewc_loss": 0.034265145659446716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4265147405676544e-05, "grad_norm": 19.492013931274414, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8773167133331299, "num_tokens": 786516802.0, "step": 20611 }, { "epoch": 2.6220582623075943, "ewc_loss": 0.03413998335599899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.413998274481855e-05, "grad_norm": 19.4279842376709, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8759522438049316, "num_tokens": 786558470.0, "step": 20612 }, { "epoch": 2.622185472586185, "ewc_loss": 0.03416552022099495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.416552135604434e-05, "grad_norm": 19.503917694091797, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8705558776855469, "num_tokens": 786601163.0, "step": 20613 }, { "epoch": 2.6223126828647754, "ewc_loss": 0.03409937769174576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.40993792633526e-05, "grad_norm": 19.3878116607666, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8818317651748657, "num_tokens": 786636741.0, "step": 20614 }, { "epoch": 2.622439893143366, "ewc_loss": 0.0340825654566288, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4082564525306225e-05, "grad_norm": 19.425485610961914, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8642148971557617, "num_tokens": 786672931.0, "step": 20615 }, { "epoch": 2.6225671034219564, "ewc_loss": 0.03419511392712593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.419511267566122e-05, "grad_norm": 19.443811416625977, "learning_rate": 1e-06, "loss": 0.3551, "mean_token_accuracy": 0.8881697654724121, "num_tokens": 786709310.0, "step": 20616 }, { "epoch": 2.622694313700547, "ewc_loss": 0.03411460667848587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.411460784263909e-05, "grad_norm": 19.428367614746094, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8812012076377869, "num_tokens": 786747113.0, "step": 20617 }, { "epoch": 2.6228215239791375, "ewc_loss": 0.03411781042814255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.411780926398933e-05, "grad_norm": 19.487060546875, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8738204836845398, "num_tokens": 786782033.0, "step": 20618 }, { "epoch": 2.622948734257728, "ewc_loss": 0.03412189707159996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.41218983521685e-05, "grad_norm": 19.36119842529297, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.876032292842865, "num_tokens": 786827229.0, "step": 20619 }, { "epoch": 2.6230759445363185, "ewc_loss": 0.034129466861486435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.412946534808725e-05, "grad_norm": 19.485687255859375, "learning_rate": 1e-06, "loss": 0.3584, "mean_token_accuracy": 0.8828046321868896, "num_tokens": 786861722.0, "step": 20620 }, { "epoch": 2.623203154814909, "ewc_loss": 0.03411290794610977, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.411290890653618e-05, "grad_norm": 19.40469741821289, "learning_rate": 1e-06, "loss": 0.3757, "mean_token_accuracy": 0.878176748752594, "num_tokens": 786896202.0, "step": 20621 }, { "epoch": 2.6233303650934996, "ewc_loss": 0.034156378358602524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4156379115302116e-05, "grad_norm": 19.40979766845703, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8767185211181641, "num_tokens": 786932154.0, "step": 20622 }, { "epoch": 2.62345757537209, "ewc_loss": 0.034145135432481766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4145134122809395e-05, "grad_norm": 19.45583152770996, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8660709261894226, "num_tokens": 786972184.0, "step": 20623 }, { "epoch": 2.6235847856506807, "ewc_loss": 0.0341796912252903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.417969128349796e-05, "grad_norm": 19.40988540649414, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8692293763160706, "num_tokens": 787015514.0, "step": 20624 }, { "epoch": 2.623711995929271, "ewc_loss": 0.0341433621942997, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.414336242713034e-05, "grad_norm": 19.47827911376953, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.875994086265564, "num_tokens": 787054023.0, "step": 20625 }, { "epoch": 2.6238392062078617, "ewc_loss": 0.03415614739060402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.415614628465846e-05, "grad_norm": 19.463703155517578, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8721977472305298, "num_tokens": 787088858.0, "step": 20626 }, { "epoch": 2.6239664164864522, "ewc_loss": 0.03415154665708542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.41515478794463e-05, "grad_norm": 19.46198272705078, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8783901929855347, "num_tokens": 787124805.0, "step": 20627 }, { "epoch": 2.6240936267650428, "ewc_loss": 0.034163299947977066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4163298550993204e-05, "grad_norm": 19.452600479125977, "learning_rate": 1e-06, "loss": 0.357, "mean_token_accuracy": 0.8867157101631165, "num_tokens": 787164922.0, "step": 20628 }, { "epoch": 2.6242208370436333, "ewc_loss": 0.03414808213710785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.414808088564314e-05, "grad_norm": 19.40644073486328, "learning_rate": 1e-06, "loss": 0.3512, "mean_token_accuracy": 0.8867520689964294, "num_tokens": 787204936.0, "step": 20629 }, { "epoch": 2.624348047322224, "ewc_loss": 0.03414067625999451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.414067759877071e-05, "grad_norm": 19.472929000854492, "learning_rate": 1e-06, "loss": 0.3547, "mean_token_accuracy": 0.8859597444534302, "num_tokens": 787238292.0, "step": 20630 }, { "epoch": 2.624475257600814, "ewc_loss": 0.0342065766453743, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420657594688237e-05, "grad_norm": 19.512975692749023, "learning_rate": 1e-06, "loss": 0.3291, "mean_token_accuracy": 0.8950871229171753, "num_tokens": 787269054.0, "step": 20631 }, { "epoch": 2.624602467879405, "ewc_loss": 0.034122567623853683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4122567740269005e-05, "grad_norm": 19.386457443237305, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8742933869361877, "num_tokens": 787310022.0, "step": 20632 }, { "epoch": 2.624729678157995, "ewc_loss": 0.03408297523856163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4082975616911426e-05, "grad_norm": 19.394927978515625, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8654152750968933, "num_tokens": 787350917.0, "step": 20633 }, { "epoch": 2.624856888436586, "ewc_loss": 0.03417622670531273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.417622792767361e-05, "grad_norm": 19.411075592041016, "learning_rate": 1e-06, "loss": 0.3679, "mean_token_accuracy": 0.8841924667358398, "num_tokens": 787388187.0, "step": 20634 }, { "epoch": 2.624984098715176, "ewc_loss": 0.03418520465493202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.418520645936951e-05, "grad_norm": 19.468868255615234, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.8819409012794495, "num_tokens": 787424998.0, "step": 20635 }, { "epoch": 2.6251113089937665, "ewc_loss": 0.03424501046538353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.424501119297929e-05, "grad_norm": 19.50261116027832, "learning_rate": 1e-06, "loss": 0.3522, "mean_token_accuracy": 0.8872522115707397, "num_tokens": 787465456.0, "step": 20636 }, { "epoch": 2.625238519272357, "ewc_loss": 0.03414447605609894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.414447564864531e-05, "grad_norm": 19.37770652770996, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8736992478370667, "num_tokens": 787504797.0, "step": 20637 }, { "epoch": 2.6253657295509476, "ewc_loss": 0.03414369374513626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.414369348320179e-05, "grad_norm": 19.404586791992188, "learning_rate": 1e-06, "loss": 0.3611, "mean_token_accuracy": 0.8853927850723267, "num_tokens": 787542911.0, "step": 20638 }, { "epoch": 2.625492939829538, "ewc_loss": 0.03415655344724655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4156553738284856e-05, "grad_norm": 19.37403678894043, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8666315078735352, "num_tokens": 787576143.0, "step": 20639 }, { "epoch": 2.6256201501081287, "ewc_loss": 0.034179624170064926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.417962579987943e-05, "grad_norm": 19.450313568115234, "learning_rate": 1e-06, "loss": 0.3844, "mean_token_accuracy": 0.8775299191474915, "num_tokens": 787619912.0, "step": 20640 }, { "epoch": 2.625747360386719, "ewc_loss": 0.03422367200255394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422367080929689e-05, "grad_norm": 19.481903076171875, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.866182804107666, "num_tokens": 787658181.0, "step": 20641 }, { "epoch": 2.6258745706653097, "ewc_loss": 0.03416037932038307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4160380891989917e-05, "grad_norm": 19.46310806274414, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8666880130767822, "num_tokens": 787694889.0, "step": 20642 }, { "epoch": 2.6260017809439002, "ewc_loss": 0.03415751829743385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4157517802668735e-05, "grad_norm": 19.404502868652344, "learning_rate": 1e-06, "loss": 0.3479, "mean_token_accuracy": 0.8887655735015869, "num_tokens": 787737633.0, "step": 20643 }, { "epoch": 2.6261289912224908, "ewc_loss": 0.03414706140756607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4147062251577154e-05, "grad_norm": 19.46841812133789, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8631876707077026, "num_tokens": 787779069.0, "step": 20644 }, { "epoch": 2.6262562015010813, "ewc_loss": 0.03416665270924568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.416665276745334e-05, "grad_norm": 19.468658447265625, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8634392619132996, "num_tokens": 787815544.0, "step": 20645 }, { "epoch": 2.626383411779672, "ewc_loss": 0.034061603248119354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.406160249141976e-05, "grad_norm": 19.455480575561523, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8704119920730591, "num_tokens": 787852684.0, "step": 20646 }, { "epoch": 2.6265106220582624, "ewc_loss": 0.034162238240242004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.416223989916034e-05, "grad_norm": 19.456039428710938, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8713274002075195, "num_tokens": 787893452.0, "step": 20647 }, { "epoch": 2.626637832336853, "ewc_loss": 0.03412708267569542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4127082471968606e-05, "grad_norm": 19.44200897216797, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8749593496322632, "num_tokens": 787929430.0, "step": 20648 }, { "epoch": 2.6267650426154434, "ewc_loss": 0.03411000967025757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4110009437426925e-05, "grad_norm": 19.475997924804688, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8724507093429565, "num_tokens": 787969768.0, "step": 20649 }, { "epoch": 2.626892252894034, "ewc_loss": 0.03415268287062645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.415268292883411e-05, "grad_norm": 19.445693969726562, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8748082518577576, "num_tokens": 788009605.0, "step": 20650 }, { "epoch": 2.6270194631726245, "ewc_loss": 0.03411063179373741, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.411063153180294e-05, "grad_norm": 19.440839767456055, "learning_rate": 1e-06, "loss": 0.3501, "mean_token_accuracy": 0.8865746259689331, "num_tokens": 788044652.0, "step": 20651 }, { "epoch": 2.627146673451215, "ewc_loss": 0.03407377004623413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4073771530529484e-05, "grad_norm": 19.492124557495117, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8502556085586548, "num_tokens": 788085981.0, "step": 20652 }, { "epoch": 2.6272738837298055, "ewc_loss": 0.03419328108429909, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4193282772321254e-05, "grad_norm": 19.448951721191406, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8723850846290588, "num_tokens": 788132196.0, "step": 20653 }, { "epoch": 2.6274010940083956, "ewc_loss": 0.0340387225151062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.403872324270196e-05, "grad_norm": 19.410104751586914, "learning_rate": 1e-06, "loss": 0.3439, "mean_token_accuracy": 0.8882811665534973, "num_tokens": 788160099.0, "step": 20654 }, { "epoch": 2.6275283042869866, "ewc_loss": 0.034124165773391724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.412416481296532e-05, "grad_norm": 19.4544677734375, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.873892068862915, "num_tokens": 788197264.0, "step": 20655 }, { "epoch": 2.6276555145655767, "ewc_loss": 0.03409948572516441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.409948476473801e-05, "grad_norm": 19.41915512084961, "learning_rate": 1e-06, "loss": 0.3516, "mean_token_accuracy": 0.8866838216781616, "num_tokens": 788232900.0, "step": 20656 }, { "epoch": 2.6277827248441676, "ewc_loss": 0.0340755358338356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4075535950250924e-05, "grad_norm": 19.396371841430664, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8700597286224365, "num_tokens": 788273720.0, "step": 20657 }, { "epoch": 2.6279099351227577, "ewc_loss": 0.03414735570549965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.414735692786053e-05, "grad_norm": 19.49853515625, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8776580095291138, "num_tokens": 788308425.0, "step": 20658 }, { "epoch": 2.6280371454013487, "ewc_loss": 0.03410612791776657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.410612771403976e-05, "grad_norm": 19.348705291748047, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8677198886871338, "num_tokens": 788343701.0, "step": 20659 }, { "epoch": 2.628164355679939, "ewc_loss": 0.03404705598950386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.40470542141702e-05, "grad_norm": 19.364253997802734, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8603086471557617, "num_tokens": 788372789.0, "step": 20660 }, { "epoch": 2.6282915659585293, "ewc_loss": 0.03424946591258049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.424946771701798e-05, "grad_norm": 19.37363624572754, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.879326343536377, "num_tokens": 788408422.0, "step": 20661 }, { "epoch": 2.62841877623712, "ewc_loss": 0.034205112606287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4205113479401916e-05, "grad_norm": 19.463947296142578, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8792442083358765, "num_tokens": 788446113.0, "step": 20662 }, { "epoch": 2.6285459865157104, "ewc_loss": 0.03426380455493927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426380499149673e-05, "grad_norm": 19.37322425842285, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8701744079589844, "num_tokens": 788485009.0, "step": 20663 }, { "epoch": 2.628673196794301, "ewc_loss": 0.03419129177927971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4191292797913775e-05, "grad_norm": 19.387025833129883, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8783882856369019, "num_tokens": 788523446.0, "step": 20664 }, { "epoch": 2.6288004070728914, "ewc_loss": 0.03426147252321243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426147304708138e-05, "grad_norm": 19.381515502929688, "learning_rate": 1e-06, "loss": 0.3609, "mean_token_accuracy": 0.8842326998710632, "num_tokens": 788557423.0, "step": 20665 }, { "epoch": 2.628927617351482, "ewc_loss": 0.03433747589588165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4337474062340334e-05, "grad_norm": 19.500608444213867, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.872089147567749, "num_tokens": 788588177.0, "step": 20666 }, { "epoch": 2.6290548276300725, "ewc_loss": 0.034298356622457504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.429835487622768e-05, "grad_norm": 19.41172218322754, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8766826391220093, "num_tokens": 788624877.0, "step": 20667 }, { "epoch": 2.629182037908663, "ewc_loss": 0.03427887707948685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.427887713769451e-05, "grad_norm": 19.455350875854492, "learning_rate": 1e-06, "loss": 0.3596, "mean_token_accuracy": 0.8852882385253906, "num_tokens": 788662532.0, "step": 20668 }, { "epoch": 2.6293092481872535, "ewc_loss": 0.03427913039922714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4279131796211004e-05, "grad_norm": 19.47069549560547, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8760250210762024, "num_tokens": 788702734.0, "step": 20669 }, { "epoch": 2.629436458465844, "ewc_loss": 0.034237831830978394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42378334607929e-05, "grad_norm": 19.415281295776367, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.875981867313385, "num_tokens": 788738041.0, "step": 20670 }, { "epoch": 2.6295636687444346, "ewc_loss": 0.03425385430455208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425385511945933e-05, "grad_norm": 19.48043441772461, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.876634955406189, "num_tokens": 788772998.0, "step": 20671 }, { "epoch": 2.629690879023025, "ewc_loss": 0.03432784974575043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432784797041677e-05, "grad_norm": 19.445072174072266, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8590224981307983, "num_tokens": 788814310.0, "step": 20672 }, { "epoch": 2.6298180893016156, "ewc_loss": 0.034295450896024704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.429545176913962e-05, "grad_norm": 19.44740104675293, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.878952145576477, "num_tokens": 788852811.0, "step": 20673 }, { "epoch": 2.629945299580206, "ewc_loss": 0.03427545726299286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.427545743761584e-05, "grad_norm": 19.451719284057617, "learning_rate": 1e-06, "loss": 0.3649, "mean_token_accuracy": 0.8838388919830322, "num_tokens": 788888756.0, "step": 20674 }, { "epoch": 2.6300725098587967, "ewc_loss": 0.03423821181058884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423821181058884e-05, "grad_norm": 19.41261863708496, "learning_rate": 1e-06, "loss": 0.3743, "mean_token_accuracy": 0.8794884085655212, "num_tokens": 788926292.0, "step": 20675 }, { "epoch": 2.6301997201373872, "ewc_loss": 0.03420509770512581, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420509892748669e-05, "grad_norm": 19.403865814208984, "learning_rate": 1e-06, "loss": 0.3535, "mean_token_accuracy": 0.8856659531593323, "num_tokens": 788965625.0, "step": 20676 }, { "epoch": 2.6303269304159778, "ewc_loss": 0.03421798348426819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.421798464842141e-05, "grad_norm": 19.50053596496582, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8794831037521362, "num_tokens": 789007390.0, "step": 20677 }, { "epoch": 2.6304541406945683, "ewc_loss": 0.03423596918582916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423597081564367e-05, "grad_norm": 19.484947204589844, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8797740936279297, "num_tokens": 789047940.0, "step": 20678 }, { "epoch": 2.6305813509731584, "ewc_loss": 0.034218162298202515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4218162909382954e-05, "grad_norm": 19.500871658325195, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8747633695602417, "num_tokens": 789086321.0, "step": 20679 }, { "epoch": 2.6307085612517493, "ewc_loss": 0.03421946242451668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4219461667817086e-05, "grad_norm": 19.44404411315918, "learning_rate": 1e-06, "loss": 0.3507, "mean_token_accuracy": 0.8866119980812073, "num_tokens": 789121757.0, "step": 20680 }, { "epoch": 2.6308357715303394, "ewc_loss": 0.03414876013994217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4148761187680066e-05, "grad_norm": 19.504810333251953, "learning_rate": 1e-06, "loss": 0.3478, "mean_token_accuracy": 0.8892456293106079, "num_tokens": 789160076.0, "step": 20681 }, { "epoch": 2.6309629818089304, "ewc_loss": 0.03420634567737579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420634675421752e-05, "grad_norm": 19.472484588623047, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8660739064216614, "num_tokens": 789197080.0, "step": 20682 }, { "epoch": 2.6310901920875205, "ewc_loss": 0.03409730643033981, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.409730561543256e-05, "grad_norm": 19.409263610839844, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8706966638565063, "num_tokens": 789249876.0, "step": 20683 }, { "epoch": 2.6312174023661115, "ewc_loss": 0.03417610004544258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.417610059841536e-05, "grad_norm": 19.489770889282227, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.873966634273529, "num_tokens": 789290146.0, "step": 20684 }, { "epoch": 2.6313446126447015, "ewc_loss": 0.03420698270201683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420698340050876e-05, "grad_norm": 19.471059799194336, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8785891532897949, "num_tokens": 789335610.0, "step": 20685 }, { "epoch": 2.631471822923292, "ewc_loss": 0.034184351563453674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4184351534349844e-05, "grad_norm": 19.46338653564453, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8714243769645691, "num_tokens": 789371956.0, "step": 20686 }, { "epoch": 2.6315990332018826, "ewc_loss": 0.03409711644053459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.409711644053459e-05, "grad_norm": 19.39637565612793, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8707849979400635, "num_tokens": 789415257.0, "step": 20687 }, { "epoch": 2.631726243480473, "ewc_loss": 0.034131087362766266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4131087886635214e-05, "grad_norm": 19.429407119750977, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8759409785270691, "num_tokens": 789452533.0, "step": 20688 }, { "epoch": 2.6318534537590637, "ewc_loss": 0.03416184335947037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.416184335947037e-05, "grad_norm": 19.412641525268555, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8720917701721191, "num_tokens": 789488780.0, "step": 20689 }, { "epoch": 2.631980664037654, "ewc_loss": 0.03410734236240387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4107342798961326e-05, "grad_norm": 19.357467651367188, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8733761310577393, "num_tokens": 789528169.0, "step": 20690 }, { "epoch": 2.6321078743162447, "ewc_loss": 0.03416254371404648, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.416254185140133e-05, "grad_norm": 19.506946563720703, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8687134981155396, "num_tokens": 789568382.0, "step": 20691 }, { "epoch": 2.6322350845948352, "ewc_loss": 0.03419990465044975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.419990389375016e-05, "grad_norm": 19.514488220214844, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8734236359596252, "num_tokens": 789606895.0, "step": 20692 }, { "epoch": 2.6323622948734258, "ewc_loss": 0.03415758162736893, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.415758328628726e-05, "grad_norm": 19.527631759643555, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8612523078918457, "num_tokens": 789649106.0, "step": 20693 }, { "epoch": 2.6324895051520163, "ewc_loss": 0.03404194861650467, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.404195012990385e-05, "grad_norm": 19.39940643310547, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8785045742988586, "num_tokens": 789689449.0, "step": 20694 }, { "epoch": 2.632616715430607, "ewc_loss": 0.03401893004775047, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4018929000012577e-05, "grad_norm": 19.453561782836914, "learning_rate": 1e-06, "loss": 0.3558, "mean_token_accuracy": 0.8859832882881165, "num_tokens": 789726179.0, "step": 20695 }, { "epoch": 2.6327439257091974, "ewc_loss": 0.03410593420267105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.410593490116298e-05, "grad_norm": 19.521812438964844, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.859054684638977, "num_tokens": 789764858.0, "step": 20696 }, { "epoch": 2.632871135987788, "ewc_loss": 0.03406607732176781, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4066077205352485e-05, "grad_norm": 19.409093856811523, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8628281950950623, "num_tokens": 789799196.0, "step": 20697 }, { "epoch": 2.6329983462663784, "ewc_loss": 0.034140896052122116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.414089587749913e-05, "grad_norm": 19.545801162719727, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.8736535310745239, "num_tokens": 789840498.0, "step": 20698 }, { "epoch": 2.633125556544969, "ewc_loss": 0.03405974432826042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.405974348424934e-05, "grad_norm": 19.404930114746094, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8791870474815369, "num_tokens": 789874269.0, "step": 20699 }, { "epoch": 2.6332527668235595, "ewc_loss": 0.03399457037448883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.399456909392029e-05, "grad_norm": 19.47607421875, "learning_rate": 1e-06, "loss": 0.3873, "mean_token_accuracy": 0.8763590455055237, "num_tokens": 789914938.0, "step": 20700 }, { "epoch": 2.63337997710215, "ewc_loss": 0.03409525379538536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.409525379538536e-05, "grad_norm": 19.48766326904297, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8749684691429138, "num_tokens": 789959219.0, "step": 20701 }, { "epoch": 2.6335071873807405, "ewc_loss": 0.03401549160480499, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4015491110039875e-05, "grad_norm": 19.479400634765625, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8689640164375305, "num_tokens": 789993707.0, "step": 20702 }, { "epoch": 2.633634397659331, "ewc_loss": 0.03404819965362549, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4048200177494437e-05, "grad_norm": 19.442174911499023, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8556204438209534, "num_tokens": 790034285.0, "step": 20703 }, { "epoch": 2.633761607937921, "ewc_loss": 0.0340774767100811, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4077474992955104e-05, "grad_norm": 19.459468841552734, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8784009218215942, "num_tokens": 790069714.0, "step": 20704 }, { "epoch": 2.633888818216512, "ewc_loss": 0.03406317159533501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.406317046028562e-05, "grad_norm": 19.42351531982422, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8736332654953003, "num_tokens": 790110733.0, "step": 20705 }, { "epoch": 2.634016028495102, "ewc_loss": 0.03406056761741638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.406056930543855e-05, "grad_norm": 19.472991943359375, "learning_rate": 1e-06, "loss": 0.3724, "mean_token_accuracy": 0.8791613578796387, "num_tokens": 790144468.0, "step": 20706 }, { "epoch": 2.634143238773693, "ewc_loss": 0.03406495973467827, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.406496034585871e-05, "grad_norm": 19.493610382080078, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8732084035873413, "num_tokens": 790186881.0, "step": 20707 }, { "epoch": 2.6342704490522832, "ewc_loss": 0.03403070569038391, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.403070513741113e-05, "grad_norm": 19.48591423034668, "learning_rate": 1e-06, "loss": 0.368, "mean_token_accuracy": 0.882138729095459, "num_tokens": 790225083.0, "step": 20708 }, { "epoch": 2.6343976593308738, "ewc_loss": 0.03402763232588768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.402763104531914e-05, "grad_norm": 19.499574661254883, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8745050430297852, "num_tokens": 790266822.0, "step": 20709 }, { "epoch": 2.6345248696094643, "ewc_loss": 0.03402290865778923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4022908948827535e-05, "grad_norm": 19.472713470458984, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8715124130249023, "num_tokens": 790305948.0, "step": 20710 }, { "epoch": 2.634652079888055, "ewc_loss": 0.034036874771118164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.403687514946796e-05, "grad_norm": 19.450658798217773, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8767933249473572, "num_tokens": 790341639.0, "step": 20711 }, { "epoch": 2.6347792901666454, "ewc_loss": 0.03409149497747421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4091495763277635e-05, "grad_norm": 19.432369232177734, "learning_rate": 1e-06, "loss": 0.3549, "mean_token_accuracy": 0.8845583200454712, "num_tokens": 790377442.0, "step": 20712 }, { "epoch": 2.634906500445236, "ewc_loss": 0.03404650837182999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.404650851734914e-05, "grad_norm": 19.500850677490234, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8759732246398926, "num_tokens": 790414911.0, "step": 20713 }, { "epoch": 2.6350337107238264, "ewc_loss": 0.03411580249667168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4115801099687815e-05, "grad_norm": 19.484033584594727, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.873971700668335, "num_tokens": 790453968.0, "step": 20714 }, { "epoch": 2.635160921002417, "ewc_loss": 0.0340098962187767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.400989589863457e-05, "grad_norm": 19.410198211669922, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8709147572517395, "num_tokens": 790493216.0, "step": 20715 }, { "epoch": 2.6352881312810075, "ewc_loss": 0.03411555662751198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.411555735510774e-05, "grad_norm": 19.50065040588379, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.871701717376709, "num_tokens": 790529380.0, "step": 20716 }, { "epoch": 2.635415341559598, "ewc_loss": 0.034030210226774216, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.403021037229337e-05, "grad_norm": 19.407346725463867, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8702180981636047, "num_tokens": 790569568.0, "step": 20717 }, { "epoch": 2.6355425518381885, "ewc_loss": 0.03408138081431389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.408138218219392e-05, "grad_norm": 19.518672943115234, "learning_rate": 1e-06, "loss": 0.3615, "mean_token_accuracy": 0.8817698955535889, "num_tokens": 790602269.0, "step": 20718 }, { "epoch": 2.635669762116779, "ewc_loss": 0.034136585891246796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.413658487261273e-05, "grad_norm": 19.560588836669922, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8630980253219604, "num_tokens": 790639723.0, "step": 20719 }, { "epoch": 2.6357969723953696, "ewc_loss": 0.03409375250339508, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.409375131013803e-05, "grad_norm": 19.501611709594727, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8619867563247681, "num_tokens": 790676103.0, "step": 20720 }, { "epoch": 2.63592418267396, "ewc_loss": 0.034065693616867065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.406569521757774e-05, "grad_norm": 19.50411033630371, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.877764880657196, "num_tokens": 790712118.0, "step": 20721 }, { "epoch": 2.6360513929525506, "ewc_loss": 0.034085243940353394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.408524571568705e-05, "grad_norm": 19.45121955871582, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8745666742324829, "num_tokens": 790747079.0, "step": 20722 }, { "epoch": 2.636178603231141, "ewc_loss": 0.03408358246088028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.408358315937221e-05, "grad_norm": 19.492355346679688, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8651355504989624, "num_tokens": 790781741.0, "step": 20723 }, { "epoch": 2.6363058135097317, "ewc_loss": 0.03408403322100639, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.408403426874429e-05, "grad_norm": 19.422866821289062, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8738963007926941, "num_tokens": 790820243.0, "step": 20724 }, { "epoch": 2.6364330237883222, "ewc_loss": 0.03409431502223015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.409431519685313e-05, "grad_norm": 19.460222244262695, "learning_rate": 1e-06, "loss": 0.3461, "mean_token_accuracy": 0.8897528648376465, "num_tokens": 790851492.0, "step": 20725 }, { "epoch": 2.6365602340669128, "ewc_loss": 0.034163735806941986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4163735108450055e-05, "grad_norm": 19.451793670654297, "learning_rate": 1e-06, "loss": 0.3756, "mean_token_accuracy": 0.8813643455505371, "num_tokens": 790882293.0, "step": 20726 }, { "epoch": 2.636687444345503, "ewc_loss": 0.03418108820915222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.418108826735988e-05, "grad_norm": 19.450056076049805, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8715593814849854, "num_tokens": 790922188.0, "step": 20727 }, { "epoch": 2.636814654624094, "ewc_loss": 0.03417212516069412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.417212428757921e-05, "grad_norm": 19.476730346679688, "learning_rate": 1e-06, "loss": 0.3556, "mean_token_accuracy": 0.8852627277374268, "num_tokens": 790960675.0, "step": 20728 }, { "epoch": 2.636941864902684, "ewc_loss": 0.034234512597322464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4234511986142024e-05, "grad_norm": 19.4537410736084, "learning_rate": 1e-06, "loss": 0.3646, "mean_token_accuracy": 0.8840745091438293, "num_tokens": 790999352.0, "step": 20729 }, { "epoch": 2.637069075181275, "ewc_loss": 0.03420335799455643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420335633563809e-05, "grad_norm": 19.440126419067383, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8756399750709534, "num_tokens": 791035898.0, "step": 20730 }, { "epoch": 2.637196285459865, "ewc_loss": 0.034263286739587784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426328839850612e-05, "grad_norm": 19.472801208496094, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8514386415481567, "num_tokens": 791079731.0, "step": 20731 }, { "epoch": 2.637323495738456, "ewc_loss": 0.034196559339761734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4196560591226444e-05, "grad_norm": 19.492467880249023, "learning_rate": 1e-06, "loss": 0.3609, "mean_token_accuracy": 0.8879775404930115, "num_tokens": 791118565.0, "step": 20732 }, { "epoch": 2.637450706017046, "ewc_loss": 0.03419653698801994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.41965387633536e-05, "grad_norm": 19.39029312133789, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.876052975654602, "num_tokens": 791154876.0, "step": 20733 }, { "epoch": 2.6375779162956365, "ewc_loss": 0.03418490290641785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.418490450712852e-05, "grad_norm": 19.419130325317383, "learning_rate": 1e-06, "loss": 0.3893, "mean_token_accuracy": 0.8756395578384399, "num_tokens": 791196003.0, "step": 20734 }, { "epoch": 2.637705126574227, "ewc_loss": 0.03421381860971451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.421381916268729e-05, "grad_norm": 19.4290714263916, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8727211952209473, "num_tokens": 791236462.0, "step": 20735 }, { "epoch": 2.6378323368528176, "ewc_loss": 0.0342683419585228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426834155106917e-05, "grad_norm": 19.46565055847168, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.861655592918396, "num_tokens": 791271918.0, "step": 20736 }, { "epoch": 2.637959547131408, "ewc_loss": 0.03421568125486374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.421568180783652e-05, "grad_norm": 19.436899185180664, "learning_rate": 1e-06, "loss": 0.3503, "mean_token_accuracy": 0.8865998983383179, "num_tokens": 791309813.0, "step": 20737 }, { "epoch": 2.6380867574099987, "ewc_loss": 0.03422727435827255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422727604629472e-05, "grad_norm": 19.45307731628418, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8844686150550842, "num_tokens": 791338854.0, "step": 20738 }, { "epoch": 2.638213967688589, "ewc_loss": 0.03433666378259659, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.433666279306635e-05, "grad_norm": 19.57589340209961, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.8802745938301086, "num_tokens": 791376580.0, "step": 20739 }, { "epoch": 2.6383411779671797, "ewc_loss": 0.03419223055243492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4192231396446005e-05, "grad_norm": 19.434762954711914, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8710172176361084, "num_tokens": 791415931.0, "step": 20740 }, { "epoch": 2.6384683882457702, "ewc_loss": 0.034146398305892944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.414640013943426e-05, "grad_norm": 19.479656219482422, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.87068110704422, "num_tokens": 791448201.0, "step": 20741 }, { "epoch": 2.6385955985243608, "ewc_loss": 0.03420679643750191, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42067978635896e-05, "grad_norm": 19.39916229248047, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.868280827999115, "num_tokens": 791489183.0, "step": 20742 }, { "epoch": 2.6387228088029513, "ewc_loss": 0.034153051674366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.415305036469363e-05, "grad_norm": 19.419282913208008, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8757094740867615, "num_tokens": 791528414.0, "step": 20743 }, { "epoch": 2.638850019081542, "ewc_loss": 0.03424215689301491, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4242155379615724e-05, "grad_norm": 19.36606788635254, "learning_rate": 1e-06, "loss": 0.3325, "mean_token_accuracy": 0.8942118883132935, "num_tokens": 791566379.0, "step": 20744 }, { "epoch": 2.6389772293601323, "ewc_loss": 0.0341825857758522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4182587114628404e-05, "grad_norm": 19.39988899230957, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8776021003723145, "num_tokens": 791599789.0, "step": 20745 }, { "epoch": 2.639104439638723, "ewc_loss": 0.03427126631140709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.427126648603007e-05, "grad_norm": 19.43792152404785, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8623517155647278, "num_tokens": 791642333.0, "step": 20746 }, { "epoch": 2.6392316499173134, "ewc_loss": 0.03425971418619156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4259715903317556e-05, "grad_norm": 19.392528533935547, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8759816884994507, "num_tokens": 791687190.0, "step": 20747 }, { "epoch": 2.639358860195904, "ewc_loss": 0.034249067306518555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42490675393492e-05, "grad_norm": 19.425901412963867, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8602526187896729, "num_tokens": 791722806.0, "step": 20748 }, { "epoch": 2.6394860704744945, "ewc_loss": 0.03426766395568848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426766488701105e-05, "grad_norm": 19.465784072875977, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8602585792541504, "num_tokens": 791757726.0, "step": 20749 }, { "epoch": 2.639613280753085, "ewc_loss": 0.03422890231013298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422890222282149e-05, "grad_norm": 19.346590042114258, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.8827897906303406, "num_tokens": 791795160.0, "step": 20750 }, { "epoch": 2.6397404910316755, "ewc_loss": 0.03421403095126152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42140301654581e-05, "grad_norm": 19.46671485900879, "learning_rate": 1e-06, "loss": 0.3643, "mean_token_accuracy": 0.8869864344596863, "num_tokens": 791831682.0, "step": 20751 }, { "epoch": 2.6398677013102656, "ewc_loss": 0.03428433835506439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.428433774388395e-05, "grad_norm": 19.341785430908203, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.8847512602806091, "num_tokens": 791863230.0, "step": 20752 }, { "epoch": 2.6399949115888566, "ewc_loss": 0.03423156961798668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423156886128709e-05, "grad_norm": 19.471879959106445, "learning_rate": 1e-06, "loss": 0.3438, "mean_token_accuracy": 0.8868857622146606, "num_tokens": 791904323.0, "step": 20753 }, { "epoch": 2.6401221218674467, "ewc_loss": 0.034380704164505005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4380704164505005e-05, "grad_norm": 19.37493896484375, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8683419227600098, "num_tokens": 791951836.0, "step": 20754 }, { "epoch": 2.6402493321460376, "ewc_loss": 0.034237321466207504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42373205057811e-05, "grad_norm": 19.52275276184082, "learning_rate": 1e-06, "loss": 0.3716, "mean_token_accuracy": 0.8774714469909668, "num_tokens": 791989782.0, "step": 20755 }, { "epoch": 2.6403765424246277, "ewc_loss": 0.03432397171854973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432396988500841e-05, "grad_norm": 19.370676040649414, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8763155341148376, "num_tokens": 792024038.0, "step": 20756 }, { "epoch": 2.6405037527032187, "ewc_loss": 0.034229960292577744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422996087465435e-05, "grad_norm": 19.474332809448242, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8753477334976196, "num_tokens": 792067893.0, "step": 20757 }, { "epoch": 2.6406309629818088, "ewc_loss": 0.034355808049440384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4355809475528076e-05, "grad_norm": 19.422042846679688, "learning_rate": 1e-06, "loss": 0.3549, "mean_token_accuracy": 0.8842365145683289, "num_tokens": 792108593.0, "step": 20758 }, { "epoch": 2.6407581732603993, "ewc_loss": 0.03423861041665077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423861198825762e-05, "grad_norm": 19.433347702026367, "learning_rate": 1e-06, "loss": 0.3495, "mean_token_accuracy": 0.8878065347671509, "num_tokens": 792141201.0, "step": 20759 }, { "epoch": 2.64088538353899, "ewc_loss": 0.03425722196698189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425722024985589e-05, "grad_norm": 19.453231811523438, "learning_rate": 1e-06, "loss": 0.3347, "mean_token_accuracy": 0.8945812582969666, "num_tokens": 792182908.0, "step": 20760 }, { "epoch": 2.6410125938175804, "ewc_loss": 0.03426350653171539, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4263506677234545e-05, "grad_norm": 19.448732376098633, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8757270574569702, "num_tokens": 792225223.0, "step": 20761 }, { "epoch": 2.641139804096171, "ewc_loss": 0.03424039110541344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4240390959894285e-05, "grad_norm": 19.440916061401367, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8588372468948364, "num_tokens": 792258131.0, "step": 20762 }, { "epoch": 2.6412670143747614, "ewc_loss": 0.034206897020339966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420689608901739e-05, "grad_norm": 19.406246185302734, "learning_rate": 1e-06, "loss": 0.3624, "mean_token_accuracy": 0.8810266256332397, "num_tokens": 792291429.0, "step": 20763 }, { "epoch": 2.641394224653352, "ewc_loss": 0.03420472517609596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420472421566956e-05, "grad_norm": 19.40654754638672, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8716260194778442, "num_tokens": 792327777.0, "step": 20764 }, { "epoch": 2.6415214349319425, "ewc_loss": 0.03428338095545769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.428338095545769e-05, "grad_norm": 19.458942413330078, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.8770731687545776, "num_tokens": 792367918.0, "step": 20765 }, { "epoch": 2.641648645210533, "ewc_loss": 0.034205760806798935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420576103962958e-05, "grad_norm": 19.4144287109375, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8789281845092773, "num_tokens": 792405550.0, "step": 20766 }, { "epoch": 2.6417758554891235, "ewc_loss": 0.0342143252491951, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.421432484174147e-05, "grad_norm": 19.403995513916016, "learning_rate": 1e-06, "loss": 0.3731, "mean_token_accuracy": 0.8774529695510864, "num_tokens": 792440948.0, "step": 20767 }, { "epoch": 2.641903065767714, "ewc_loss": 0.03428024798631668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.428024865570478e-05, "grad_norm": 19.36339569091797, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8826432824134827, "num_tokens": 792488065.0, "step": 20768 }, { "epoch": 2.6420302760463046, "ewc_loss": 0.03426402062177658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4264019632246345e-05, "grad_norm": 19.506248474121094, "learning_rate": 1e-06, "loss": 0.3602, "mean_token_accuracy": 0.8828642964363098, "num_tokens": 792522845.0, "step": 20769 }, { "epoch": 2.642157486324895, "ewc_loss": 0.03427676856517792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4276767109986395e-05, "grad_norm": 19.436159133911133, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8857467174530029, "num_tokens": 792561118.0, "step": 20770 }, { "epoch": 2.6422846966034856, "ewc_loss": 0.034164782613515854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.41647828463465e-05, "grad_norm": 19.36896324157715, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8744169473648071, "num_tokens": 792600125.0, "step": 20771 }, { "epoch": 2.642411906882076, "ewc_loss": 0.03426733985543251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426734110689722e-05, "grad_norm": 19.582782745361328, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.8817050457000732, "num_tokens": 792633856.0, "step": 20772 }, { "epoch": 2.6425391171606667, "ewc_loss": 0.03420007973909378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42000785167329e-05, "grad_norm": 19.382253646850586, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8724879026412964, "num_tokens": 792672427.0, "step": 20773 }, { "epoch": 2.6426663274392572, "ewc_loss": 0.03416796028614044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.416795880184509e-05, "grad_norm": 19.501699447631836, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.854313313961029, "num_tokens": 792713856.0, "step": 20774 }, { "epoch": 2.6427935377178478, "ewc_loss": 0.03424680233001709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.424680107855238e-05, "grad_norm": 19.434144973754883, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.869373619556427, "num_tokens": 792757602.0, "step": 20775 }, { "epoch": 2.6429207479964383, "ewc_loss": 0.03420380875468254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420380744501017e-05, "grad_norm": 19.457914352416992, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8674825429916382, "num_tokens": 792797067.0, "step": 20776 }, { "epoch": 2.6430479582750284, "ewc_loss": 0.03424666076898575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4246659197378904e-05, "grad_norm": 19.47345542907715, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8796817064285278, "num_tokens": 792835748.0, "step": 20777 }, { "epoch": 2.6431751685536193, "ewc_loss": 0.03415185958147049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.415186074562371e-05, "grad_norm": 19.44275665283203, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8777261972427368, "num_tokens": 792875039.0, "step": 20778 }, { "epoch": 2.6433023788322094, "ewc_loss": 0.03422338515520096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422338340897113e-05, "grad_norm": 19.487062454223633, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8813705444335938, "num_tokens": 792916580.0, "step": 20779 }, { "epoch": 2.6434295891108004, "ewc_loss": 0.034164708107709885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.416470644879155e-05, "grad_norm": 19.44481086730957, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.877814769744873, "num_tokens": 792952507.0, "step": 20780 }, { "epoch": 2.6435567993893905, "ewc_loss": 0.03417336195707321, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.417336120037362e-05, "grad_norm": 19.48539161682129, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8789446949958801, "num_tokens": 792994046.0, "step": 20781 }, { "epoch": 2.6436840096679814, "ewc_loss": 0.03417294844985008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.417295010876842e-05, "grad_norm": 19.468013763427734, "learning_rate": 1e-06, "loss": 0.3378, "mean_token_accuracy": 0.8942185044288635, "num_tokens": 793033855.0, "step": 20782 }, { "epoch": 2.6438112199465715, "ewc_loss": 0.03414524346590042, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.414524326217361e-05, "grad_norm": 19.4226131439209, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8792024850845337, "num_tokens": 793072226.0, "step": 20783 }, { "epoch": 2.643938430225162, "ewc_loss": 0.034072376787662506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.407237818464637e-05, "grad_norm": 19.403175354003906, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8798410892486572, "num_tokens": 793110884.0, "step": 20784 }, { "epoch": 2.6440656405037526, "ewc_loss": 0.034205030649900436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420502980588935e-05, "grad_norm": 19.527294158935547, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8583865165710449, "num_tokens": 793151885.0, "step": 20785 }, { "epoch": 2.644192850782343, "ewc_loss": 0.03415709361433983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4157092159148306e-05, "grad_norm": 19.485166549682617, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8541607856750488, "num_tokens": 793187253.0, "step": 20786 }, { "epoch": 2.6443200610609336, "ewc_loss": 0.03412900120019913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4128999686799943e-05, "grad_norm": 19.50181007385254, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.8812433481216431, "num_tokens": 793231249.0, "step": 20787 }, { "epoch": 2.644447271339524, "ewc_loss": 0.03414744883775711, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4147447877330706e-05, "grad_norm": 19.45981216430664, "learning_rate": 1e-06, "loss": 0.3601, "mean_token_accuracy": 0.8869947195053101, "num_tokens": 793269918.0, "step": 20788 }, { "epoch": 2.6445744816181147, "ewc_loss": 0.03404638543725014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.40463848260697e-05, "grad_norm": 19.497058868408203, "learning_rate": 1e-06, "loss": 0.3659, "mean_token_accuracy": 0.8857625722885132, "num_tokens": 793306856.0, "step": 20789 }, { "epoch": 2.6447016918967052, "ewc_loss": 0.03413242846727371, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.413243030081503e-05, "grad_norm": 19.421663284301758, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8669885396957397, "num_tokens": 793347866.0, "step": 20790 }, { "epoch": 2.6448289021752958, "ewc_loss": 0.03410978615283966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.410978752071969e-05, "grad_norm": 19.46190643310547, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8742785453796387, "num_tokens": 793391037.0, "step": 20791 }, { "epoch": 2.6449561124538863, "ewc_loss": 0.03416329622268677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.41632949130144e-05, "grad_norm": 19.41750717163086, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8630181550979614, "num_tokens": 793423560.0, "step": 20792 }, { "epoch": 2.645083322732477, "ewc_loss": 0.03418787941336632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.418788037379272e-05, "grad_norm": 19.510162353515625, "learning_rate": 1e-06, "loss": 0.366, "mean_token_accuracy": 0.8831848502159119, "num_tokens": 793460215.0, "step": 20793 }, { "epoch": 2.6452105330110673, "ewc_loss": 0.03417100012302399, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.417100015212782e-05, "grad_norm": 19.44768714904785, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8635053634643555, "num_tokens": 793492671.0, "step": 20794 }, { "epoch": 2.645337743289658, "ewc_loss": 0.03415250405669212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.415250466787256e-05, "grad_norm": 19.489818572998047, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8749549984931946, "num_tokens": 793529526.0, "step": 20795 }, { "epoch": 2.6454649535682484, "ewc_loss": 0.034253351390361786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425335307838395e-05, "grad_norm": 19.5123348236084, "learning_rate": 1e-06, "loss": 0.357, "mean_token_accuracy": 0.884026288986206, "num_tokens": 793566748.0, "step": 20796 }, { "epoch": 2.645592163846839, "ewc_loss": 0.034157633781433105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.415763421799056e-05, "grad_norm": 19.408336639404297, "learning_rate": 1e-06, "loss": 0.342, "mean_token_accuracy": 0.8907985687255859, "num_tokens": 793600703.0, "step": 20797 }, { "epoch": 2.6457193741254295, "ewc_loss": 0.034164804965257645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.416480467421934e-05, "grad_norm": 19.54939079284668, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8754294514656067, "num_tokens": 793639226.0, "step": 20798 }, { "epoch": 2.64584658440402, "ewc_loss": 0.03420255705714226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420255598030053e-05, "grad_norm": 19.377025604248047, "learning_rate": 1e-06, "loss": 0.3415, "mean_token_accuracy": 0.8883665800094604, "num_tokens": 793676147.0, "step": 20799 }, { "epoch": 2.6459737946826105, "ewc_loss": 0.03408847004175186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4088468964910135e-05, "grad_norm": 19.462339401245117, "learning_rate": 1e-06, "loss": 0.3621, "mean_token_accuracy": 0.8854162693023682, "num_tokens": 793715778.0, "step": 20800 }, { "epoch": 2.646101004961201, "ewc_loss": 0.03427954390645027, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4279542887816206e-05, "grad_norm": 19.41253662109375, "learning_rate": 1e-06, "loss": 0.3523, "mean_token_accuracy": 0.8912119269371033, "num_tokens": 793752096.0, "step": 20801 }, { "epoch": 2.646228215239791, "ewc_loss": 0.034178394824266434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4178396163042635e-05, "grad_norm": 19.436674118041992, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8715417385101318, "num_tokens": 793789809.0, "step": 20802 }, { "epoch": 2.646355425518382, "ewc_loss": 0.03422105684876442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422105510253459e-05, "grad_norm": 19.392614364624023, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.872565746307373, "num_tokens": 793826505.0, "step": 20803 }, { "epoch": 2.646482635796972, "ewc_loss": 0.0341939739882946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.41939739882946e-05, "grad_norm": 19.382713317871094, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8756393194198608, "num_tokens": 793868360.0, "step": 20804 }, { "epoch": 2.646609846075563, "ewc_loss": 0.03425084054470062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425083923502825e-05, "grad_norm": 19.39063835144043, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8786089420318604, "num_tokens": 793906566.0, "step": 20805 }, { "epoch": 2.6467370563541532, "ewc_loss": 0.03422699496150017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422699592192657e-05, "grad_norm": 19.431177139282227, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8734453320503235, "num_tokens": 793947814.0, "step": 20806 }, { "epoch": 2.6468642666327438, "ewc_loss": 0.03431837633252144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43183746736031e-05, "grad_norm": 19.481502532958984, "learning_rate": 1e-06, "loss": 0.3671, "mean_token_accuracy": 0.8810160756111145, "num_tokens": 793987391.0, "step": 20807 }, { "epoch": 2.6469914769113343, "ewc_loss": 0.034331098198890686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43310966854915e-05, "grad_norm": 19.46402359008789, "learning_rate": 1e-06, "loss": 0.3556, "mean_token_accuracy": 0.8868288993835449, "num_tokens": 794024045.0, "step": 20808 }, { "epoch": 2.647118687189925, "ewc_loss": 0.03422842547297478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422842564759776e-05, "grad_norm": 19.461503982543945, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8808010220527649, "num_tokens": 794063405.0, "step": 20809 }, { "epoch": 2.6472458974685154, "ewc_loss": 0.03423496335744858, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4234963095514104e-05, "grad_norm": 19.459896087646484, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8528950810432434, "num_tokens": 794099729.0, "step": 20810 }, { "epoch": 2.647373107747106, "ewc_loss": 0.034228384494781494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422838562983088e-05, "grad_norm": 19.429487228393555, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8681701421737671, "num_tokens": 794139669.0, "step": 20811 }, { "epoch": 2.6475003180256964, "ewc_loss": 0.03427528962492943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4275290090590715e-05, "grad_norm": 19.456186294555664, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8775750994682312, "num_tokens": 794176906.0, "step": 20812 }, { "epoch": 2.647627528304287, "ewc_loss": 0.03420306369662285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4203065297333524e-05, "grad_norm": 19.433622360229492, "learning_rate": 1e-06, "loss": 0.3782, "mean_token_accuracy": 0.878695547580719, "num_tokens": 794216594.0, "step": 20813 }, { "epoch": 2.6477547385828775, "ewc_loss": 0.034298334270715714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4298333048354834e-05, "grad_norm": 19.4991455078125, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.8788248896598816, "num_tokens": 794248379.0, "step": 20814 }, { "epoch": 2.647881948861468, "ewc_loss": 0.03423547372221947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4235472412547097e-05, "grad_norm": 19.460657119750977, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8543580174446106, "num_tokens": 794293604.0, "step": 20815 }, { "epoch": 2.6480091591400585, "ewc_loss": 0.034167397767305374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4167398553108796e-05, "grad_norm": 19.414531707763672, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8666408658027649, "num_tokens": 794332084.0, "step": 20816 }, { "epoch": 2.648136369418649, "ewc_loss": 0.034282878041267395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.428287891438231e-05, "grad_norm": 19.438241958618164, "learning_rate": 1e-06, "loss": 0.3428, "mean_token_accuracy": 0.8904896974563599, "num_tokens": 794369158.0, "step": 20817 }, { "epoch": 2.6482635796972396, "ewc_loss": 0.03424517437815666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.424517490202561e-05, "grad_norm": 19.45223045349121, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.879322350025177, "num_tokens": 794410531.0, "step": 20818 }, { "epoch": 2.64839078997583, "ewc_loss": 0.03420884907245636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42088496836368e-05, "grad_norm": 19.418659210205078, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8761563897132874, "num_tokens": 794457556.0, "step": 20819 }, { "epoch": 2.6485180002544206, "ewc_loss": 0.03424413874745369, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.424413807806559e-05, "grad_norm": 19.445510864257812, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8647133111953735, "num_tokens": 794492737.0, "step": 20820 }, { "epoch": 2.648645210533011, "ewc_loss": 0.034299224615097046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.429922435316257e-05, "grad_norm": 19.504735946655273, "learning_rate": 1e-06, "loss": 0.361, "mean_token_accuracy": 0.8866416811943054, "num_tokens": 794532589.0, "step": 20821 }, { "epoch": 2.6487724208116017, "ewc_loss": 0.03422116860747337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422116787987761e-05, "grad_norm": 19.430858612060547, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8650193214416504, "num_tokens": 794566982.0, "step": 20822 }, { "epoch": 2.648899631090192, "ewc_loss": 0.03423481807112694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423481757636182e-05, "grad_norm": 19.45084571838379, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8598361611366272, "num_tokens": 794604278.0, "step": 20823 }, { "epoch": 2.6490268413687827, "ewc_loss": 0.03426935896277428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4269360185135156e-05, "grad_norm": 19.448081970214844, "learning_rate": 1e-06, "loss": 0.3598, "mean_token_accuracy": 0.8858656883239746, "num_tokens": 794643785.0, "step": 20824 }, { "epoch": 2.649154051647373, "ewc_loss": 0.03427717089653015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.427717092563398e-05, "grad_norm": 19.393512725830078, "learning_rate": 1e-06, "loss": 0.3568, "mean_token_accuracy": 0.8860219120979309, "num_tokens": 794678302.0, "step": 20825 }, { "epoch": 2.649281261925964, "ewc_loss": 0.034256450831890106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425644899834879e-05, "grad_norm": 19.481090545654297, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8657922744750977, "num_tokens": 794715732.0, "step": 20826 }, { "epoch": 2.649408472204554, "ewc_loss": 0.034293580800294876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.429358184803277e-05, "grad_norm": 19.40976905822754, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8805584907531738, "num_tokens": 794754359.0, "step": 20827 }, { "epoch": 2.649535682483145, "ewc_loss": 0.03426216542720795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426216426305473e-05, "grad_norm": 19.459543228149414, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8722363114356995, "num_tokens": 794795770.0, "step": 20828 }, { "epoch": 2.649662892761735, "ewc_loss": 0.03433491662144661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.433491656323895e-05, "grad_norm": 19.445178985595703, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8693224191665649, "num_tokens": 794835654.0, "step": 20829 }, { "epoch": 2.649790103040326, "ewc_loss": 0.03426650911569595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4266508009750396e-05, "grad_norm": 19.497032165527344, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8776806592941284, "num_tokens": 794872527.0, "step": 20830 }, { "epoch": 2.649917313318916, "ewc_loss": 0.03425045683979988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425045724725351e-05, "grad_norm": 19.3927001953125, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.8798719644546509, "num_tokens": 794918577.0, "step": 20831 }, { "epoch": 2.6500445235975065, "ewc_loss": 0.03427719697356224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.427719639148563e-05, "grad_norm": 19.5090274810791, "learning_rate": 1e-06, "loss": 0.3643, "mean_token_accuracy": 0.8857625722885132, "num_tokens": 794953566.0, "step": 20832 }, { "epoch": 2.650171733876097, "ewc_loss": 0.034326016902923584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43260180670768e-05, "grad_norm": 19.43838119506836, "learning_rate": 1e-06, "loss": 0.362, "mean_token_accuracy": 0.8826237916946411, "num_tokens": 794990547.0, "step": 20833 }, { "epoch": 2.6502989441546876, "ewc_loss": 0.03423438221216202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423438101890497e-05, "grad_norm": 19.469125747680664, "learning_rate": 1e-06, "loss": 0.3375, "mean_token_accuracy": 0.8922640085220337, "num_tokens": 795028419.0, "step": 20834 }, { "epoch": 2.650426154433278, "ewc_loss": 0.03432030230760574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432030280237086e-05, "grad_norm": 19.493175506591797, "learning_rate": 1e-06, "loss": 0.3405, "mean_token_accuracy": 0.8921387791633606, "num_tokens": 795064303.0, "step": 20835 }, { "epoch": 2.6505533647118686, "ewc_loss": 0.03423534706234932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4235348721267655e-05, "grad_norm": 19.410842895507812, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8549972772598267, "num_tokens": 795104280.0, "step": 20836 }, { "epoch": 2.650680574990459, "ewc_loss": 0.034241192042827606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4241191315231845e-05, "grad_norm": 19.507884979248047, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8667783737182617, "num_tokens": 795142547.0, "step": 20837 }, { "epoch": 2.6508077852690497, "ewc_loss": 0.03425343707203865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4253436751896515e-05, "grad_norm": 19.392736434936523, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8741598725318909, "num_tokens": 795186193.0, "step": 20838 }, { "epoch": 2.6509349955476402, "ewc_loss": 0.034214768558740616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.421476867515594e-05, "grad_norm": 19.45302391052246, "learning_rate": 1e-06, "loss": 0.3338, "mean_token_accuracy": 0.8921055197715759, "num_tokens": 795215453.0, "step": 20839 }, { "epoch": 2.6510622058262308, "ewc_loss": 0.034370940178632736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437093982938677e-05, "grad_norm": 19.52576446533203, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.870879054069519, "num_tokens": 795252820.0, "step": 20840 }, { "epoch": 2.6511894161048213, "ewc_loss": 0.03423331677913666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423331509111449e-05, "grad_norm": 19.422122955322266, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8740159273147583, "num_tokens": 795290000.0, "step": 20841 }, { "epoch": 2.651316626383412, "ewc_loss": 0.034203119575977325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420311986701563e-05, "grad_norm": 19.41668701171875, "learning_rate": 1e-06, "loss": 0.3334, "mean_token_accuracy": 0.8936058878898621, "num_tokens": 795330389.0, "step": 20842 }, { "epoch": 2.6514438366620023, "ewc_loss": 0.03429432958364487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4294331271667033e-05, "grad_norm": 19.459699630737305, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8739458322525024, "num_tokens": 795366693.0, "step": 20843 }, { "epoch": 2.651571046940593, "ewc_loss": 0.03431704640388489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4317046811338514e-05, "grad_norm": 19.514049530029297, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8776601552963257, "num_tokens": 795405748.0, "step": 20844 }, { "epoch": 2.6516982572191834, "ewc_loss": 0.03427263721823692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4272638004040346e-05, "grad_norm": 19.48153305053711, "learning_rate": 1e-06, "loss": 0.3426, "mean_token_accuracy": 0.8873693943023682, "num_tokens": 795446179.0, "step": 20845 }, { "epoch": 2.651825467497774, "ewc_loss": 0.034194931387901306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4194930776720867e-05, "grad_norm": 19.42347526550293, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8713961839675903, "num_tokens": 795488234.0, "step": 20846 }, { "epoch": 2.6519526777763645, "ewc_loss": 0.034211091697216034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.421109067858197e-05, "grad_norm": 19.43990135192871, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.873641312122345, "num_tokens": 795526508.0, "step": 20847 }, { "epoch": 2.652079888054955, "ewc_loss": 0.0342383086681366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423831003601663e-05, "grad_norm": 19.413602828979492, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8828238248825073, "num_tokens": 795566299.0, "step": 20848 }, { "epoch": 2.6522070983335455, "ewc_loss": 0.034201428294181824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420142820687033e-05, "grad_norm": 19.496932983398438, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8713803887367249, "num_tokens": 795603892.0, "step": 20849 }, { "epoch": 2.6523343086121356, "ewc_loss": 0.034283027052879333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42830280715134e-05, "grad_norm": 19.495241165161133, "learning_rate": 1e-06, "loss": 0.3649, "mean_token_accuracy": 0.8835627436637878, "num_tokens": 795639622.0, "step": 20850 }, { "epoch": 2.6524615188907266, "ewc_loss": 0.034188978374004364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4188979043392465e-05, "grad_norm": 19.44513511657715, "learning_rate": 1e-06, "loss": 0.3225, "mean_token_accuracy": 0.8972562551498413, "num_tokens": 795677299.0, "step": 20851 }, { "epoch": 2.6525887291693167, "ewc_loss": 0.03419387713074684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.419387576286681e-05, "grad_norm": 19.444252014160156, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.876602292060852, "num_tokens": 795714304.0, "step": 20852 }, { "epoch": 2.6527159394479076, "ewc_loss": 0.03417728841304779, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.417729021748528e-05, "grad_norm": 19.43070411682129, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8718225359916687, "num_tokens": 795752330.0, "step": 20853 }, { "epoch": 2.6528431497264977, "ewc_loss": 0.03424513339996338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.424513488425873e-05, "grad_norm": 19.505035400390625, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8554831743240356, "num_tokens": 795789503.0, "step": 20854 }, { "epoch": 2.6529703600050887, "ewc_loss": 0.03422421216964722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422421286813915e-05, "grad_norm": 19.479509353637695, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8758407235145569, "num_tokens": 795828449.0, "step": 20855 }, { "epoch": 2.6530975702836788, "ewc_loss": 0.03418131545186043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.418131382204592e-05, "grad_norm": 19.456008911132812, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.875839114189148, "num_tokens": 795868454.0, "step": 20856 }, { "epoch": 2.6532247805622693, "ewc_loss": 0.03422209620475769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422209556447342e-05, "grad_norm": 19.461360931396484, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8627997040748596, "num_tokens": 795904629.0, "step": 20857 }, { "epoch": 2.65335199084086, "ewc_loss": 0.03418635204434395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4186352422693744e-05, "grad_norm": 19.425416946411133, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8624787330627441, "num_tokens": 795943197.0, "step": 20858 }, { "epoch": 2.6534792011194503, "ewc_loss": 0.034239284694194794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423928501433693e-05, "grad_norm": 19.53691864013672, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8716691136360168, "num_tokens": 795984672.0, "step": 20859 }, { "epoch": 2.653606411398041, "ewc_loss": 0.034268543124198914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426854163990356e-05, "grad_norm": 19.51115608215332, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8577623963356018, "num_tokens": 796022599.0, "step": 20860 }, { "epoch": 2.6537336216766314, "ewc_loss": 0.03422592952847481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4225929994136095e-05, "grad_norm": 19.527240753173828, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8731494545936584, "num_tokens": 796057798.0, "step": 20861 }, { "epoch": 2.653860831955222, "ewc_loss": 0.03426271304488182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42627135978546e-05, "grad_norm": 19.468650817871094, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8758768439292908, "num_tokens": 796094928.0, "step": 20862 }, { "epoch": 2.6539880422338125, "ewc_loss": 0.03418594226241112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.418594133108854e-05, "grad_norm": 19.52276039123535, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8782950639724731, "num_tokens": 796132614.0, "step": 20863 }, { "epoch": 2.654115252512403, "ewc_loss": 0.03426547721028328, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426547846174799e-05, "grad_norm": 19.51397132873535, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8750590085983276, "num_tokens": 796171653.0, "step": 20864 }, { "epoch": 2.6542424627909935, "ewc_loss": 0.034144170582294464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4144170058425516e-05, "grad_norm": 19.467100143432617, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.870826005935669, "num_tokens": 796215218.0, "step": 20865 }, { "epoch": 2.654369673069584, "ewc_loss": 0.034189775586128235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.418977576075122e-05, "grad_norm": 19.435768127441406, "learning_rate": 1e-06, "loss": 0.3523, "mean_token_accuracy": 0.8871539831161499, "num_tokens": 796249165.0, "step": 20866 }, { "epoch": 2.6544968833481746, "ewc_loss": 0.0342060886323452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420608845772222e-05, "grad_norm": 19.514963150024414, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.872848629951477, "num_tokens": 796283878.0, "step": 20867 }, { "epoch": 2.654624093626765, "ewc_loss": 0.03427061438560486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42706152878236e-05, "grad_norm": 19.416786193847656, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.862946093082428, "num_tokens": 796330760.0, "step": 20868 }, { "epoch": 2.6547513039053556, "ewc_loss": 0.034189704805612564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4189703001175076e-05, "grad_norm": 19.393962860107422, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8651499152183533, "num_tokens": 796371547.0, "step": 20869 }, { "epoch": 2.654878514183946, "ewc_loss": 0.03425856679677963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4258566302014515e-05, "grad_norm": 19.463512420654297, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8600994944572449, "num_tokens": 796409893.0, "step": 20870 }, { "epoch": 2.6550057244625367, "ewc_loss": 0.03426992520689964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4269924071850255e-05, "grad_norm": 19.507658004760742, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8607044219970703, "num_tokens": 796442493.0, "step": 20871 }, { "epoch": 2.655132934741127, "ewc_loss": 0.034245580434799194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42455787176732e-05, "grad_norm": 19.44428062438965, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8776177763938904, "num_tokens": 796480853.0, "step": 20872 }, { "epoch": 2.6552601450197177, "ewc_loss": 0.03427534177899361, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4275341022294015e-05, "grad_norm": 19.420169830322266, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8821049332618713, "num_tokens": 796520057.0, "step": 20873 }, { "epoch": 2.6553873552983083, "ewc_loss": 0.03426801785826683, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426801777095534e-05, "grad_norm": 19.530515670776367, "learning_rate": 1e-06, "loss": 0.3707, "mean_token_accuracy": 0.8794066905975342, "num_tokens": 796549779.0, "step": 20874 }, { "epoch": 2.6555145655768984, "ewc_loss": 0.03430362045764923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.430361903156154e-05, "grad_norm": 19.40916633605957, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.8807185888290405, "num_tokens": 796590923.0, "step": 20875 }, { "epoch": 2.6556417758554893, "ewc_loss": 0.0342283695936203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4228371077915654e-05, "grad_norm": 19.41957664489746, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8706322908401489, "num_tokens": 796629455.0, "step": 20876 }, { "epoch": 2.6557689861340794, "ewc_loss": 0.03435221686959267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435221515246667e-05, "grad_norm": 19.53752899169922, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8719795942306519, "num_tokens": 796669950.0, "step": 20877 }, { "epoch": 2.6558961964126704, "ewc_loss": 0.03424648940563202, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.424648821237497e-05, "grad_norm": 19.361968994140625, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8652755618095398, "num_tokens": 796712263.0, "step": 20878 }, { "epoch": 2.6560234066912605, "ewc_loss": 0.03426918014883995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426918192417361e-05, "grad_norm": 19.48110008239746, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8756604790687561, "num_tokens": 796753025.0, "step": 20879 }, { "epoch": 2.6561506169698514, "ewc_loss": 0.034329552203416824, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432955054449849e-05, "grad_norm": 19.47857093811035, "learning_rate": 1e-06, "loss": 0.345, "mean_token_accuracy": 0.8872165083885193, "num_tokens": 796788810.0, "step": 20880 }, { "epoch": 2.6562778272484415, "ewc_loss": 0.03427554666996002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.427554838708602e-05, "grad_norm": 19.429094314575195, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8776273727416992, "num_tokens": 796826275.0, "step": 20881 }, { "epoch": 2.656405037527032, "ewc_loss": 0.03428245708346367, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4282456908840686e-05, "grad_norm": 19.466733932495117, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8709561228752136, "num_tokens": 796859587.0, "step": 20882 }, { "epoch": 2.6565322478056226, "ewc_loss": 0.034317102283239365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431710138102062e-05, "grad_norm": 19.464387893676758, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.881206750869751, "num_tokens": 796899440.0, "step": 20883 }, { "epoch": 2.656659458084213, "ewc_loss": 0.03427598997950554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.427598858252168e-05, "grad_norm": 19.45923614501953, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.865656852722168, "num_tokens": 796938703.0, "step": 20884 }, { "epoch": 2.6567866683628036, "ewc_loss": 0.034345317631959915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4345317544648424e-05, "grad_norm": 19.486631393432617, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.861947238445282, "num_tokens": 796982287.0, "step": 20885 }, { "epoch": 2.656913878641394, "ewc_loss": 0.03431321308016777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431321238167584e-05, "grad_norm": 19.511186599731445, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8769856691360474, "num_tokens": 797017640.0, "step": 20886 }, { "epoch": 2.6570410889199847, "ewc_loss": 0.03428656980395317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.428657146287151e-05, "grad_norm": 19.475927352905273, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8601094484329224, "num_tokens": 797054050.0, "step": 20887 }, { "epoch": 2.6571682991985752, "ewc_loss": 0.0342792384326458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.427923729759641e-05, "grad_norm": 19.469751358032227, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8792207837104797, "num_tokens": 797092429.0, "step": 20888 }, { "epoch": 2.6572955094771658, "ewc_loss": 0.03429115191102028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4291151678189635e-05, "grad_norm": 19.557817459106445, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8728193044662476, "num_tokens": 797125646.0, "step": 20889 }, { "epoch": 2.6574227197557563, "ewc_loss": 0.03431035578250885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431035656831227e-05, "grad_norm": 19.448558807373047, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8779227137565613, "num_tokens": 797164560.0, "step": 20890 }, { "epoch": 2.657549930034347, "ewc_loss": 0.034253623336553574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425362228881568e-05, "grad_norm": 19.4763240814209, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8732994794845581, "num_tokens": 797194168.0, "step": 20891 }, { "epoch": 2.6576771403129373, "ewc_loss": 0.03434596210718155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434596146689728e-05, "grad_norm": 19.502910614013672, "learning_rate": 1e-06, "loss": 0.3353, "mean_token_accuracy": 0.8940030932426453, "num_tokens": 797227045.0, "step": 20892 }, { "epoch": 2.657804350591528, "ewc_loss": 0.03428814932703972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4288150345673785e-05, "grad_norm": 19.359636306762695, "learning_rate": 1e-06, "loss": 0.3496, "mean_token_accuracy": 0.8890389204025269, "num_tokens": 797267607.0, "step": 20893 }, { "epoch": 2.6579315608701184, "ewc_loss": 0.03430287167429924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4302873245906085e-05, "grad_norm": 19.469696044921875, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8646277785301208, "num_tokens": 797309320.0, "step": 20894 }, { "epoch": 2.658058771148709, "ewc_loss": 0.03437555953860283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437556006247178e-05, "grad_norm": 19.413043975830078, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8726072311401367, "num_tokens": 797348258.0, "step": 20895 }, { "epoch": 2.6581859814272994, "ewc_loss": 0.03428183123469353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4281831176485866e-05, "grad_norm": 19.45616340637207, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.8749426603317261, "num_tokens": 797391563.0, "step": 20896 }, { "epoch": 2.65831319170589, "ewc_loss": 0.03445008024573326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4450080420356244e-05, "grad_norm": 19.535886764526367, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8837830424308777, "num_tokens": 797428314.0, "step": 20897 }, { "epoch": 2.6584404019844805, "ewc_loss": 0.03431950509548187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431950608501211e-05, "grad_norm": 19.506166458129883, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8557090759277344, "num_tokens": 797464480.0, "step": 20898 }, { "epoch": 2.658567612263071, "ewc_loss": 0.03438230976462364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438230851315893e-05, "grad_norm": 19.50588035583496, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8765287399291992, "num_tokens": 797504135.0, "step": 20899 }, { "epoch": 2.658694822541661, "ewc_loss": 0.034376244992017746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437624400248751e-05, "grad_norm": 19.533416748046875, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8777916431427002, "num_tokens": 797544222.0, "step": 20900 }, { "epoch": 2.658822032820252, "ewc_loss": 0.03425212576985359, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425212707952596e-05, "grad_norm": 19.439464569091797, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8814654350280762, "num_tokens": 797583695.0, "step": 20901 }, { "epoch": 2.658949243098842, "ewc_loss": 0.03434737026691437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4347369364695624e-05, "grad_norm": 19.54317855834961, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.862317681312561, "num_tokens": 797625660.0, "step": 20902 }, { "epoch": 2.659076453377433, "ewc_loss": 0.03431606665253639, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4316068195039406e-05, "grad_norm": 19.417776107788086, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8747859001159668, "num_tokens": 797665807.0, "step": 20903 }, { "epoch": 2.6592036636560232, "ewc_loss": 0.0341891273856163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4189128200523555e-05, "grad_norm": 19.495573043823242, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8597862124443054, "num_tokens": 797707935.0, "step": 20904 }, { "epoch": 2.6593308739346138, "ewc_loss": 0.03435589745640755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4355896787019446e-05, "grad_norm": 19.482465744018555, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8684431314468384, "num_tokens": 797745502.0, "step": 20905 }, { "epoch": 2.6594580842132043, "ewc_loss": 0.034230850636959076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423085217946209e-05, "grad_norm": 19.469440460205078, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8726816177368164, "num_tokens": 797779412.0, "step": 20906 }, { "epoch": 2.659585294491795, "ewc_loss": 0.034348417073488235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4348417102592066e-05, "grad_norm": 19.491430282592773, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8779543042182922, "num_tokens": 797816677.0, "step": 20907 }, { "epoch": 2.6597125047703853, "ewc_loss": 0.034240540117025375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4240540117025375e-05, "grad_norm": 19.480714797973633, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8572632670402527, "num_tokens": 797854260.0, "step": 20908 }, { "epoch": 2.659839715048976, "ewc_loss": 0.03426747024059296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4267472074134275e-05, "grad_norm": 19.46957015991211, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8624842166900635, "num_tokens": 797892349.0, "step": 20909 }, { "epoch": 2.6599669253275664, "ewc_loss": 0.03426159545779228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426159673836082e-05, "grad_norm": 19.510875701904297, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8792351484298706, "num_tokens": 797928028.0, "step": 20910 }, { "epoch": 2.660094135606157, "ewc_loss": 0.03430428355932236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4304284781683236e-05, "grad_norm": 19.452253341674805, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8611098527908325, "num_tokens": 797963885.0, "step": 20911 }, { "epoch": 2.6602213458847475, "ewc_loss": 0.034223586320877075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422358713578433e-05, "grad_norm": 19.499683380126953, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8623342514038086, "num_tokens": 798009842.0, "step": 20912 }, { "epoch": 2.660348556163338, "ewc_loss": 0.034294936805963516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.429493517614901e-05, "grad_norm": 19.47093963623047, "learning_rate": 1e-06, "loss": 0.3458, "mean_token_accuracy": 0.8879088163375854, "num_tokens": 798043207.0, "step": 20913 }, { "epoch": 2.6604757664419285, "ewc_loss": 0.03421402722597122, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.421402652747929e-05, "grad_norm": 19.499691009521484, "learning_rate": 1e-06, "loss": 0.3579, "mean_token_accuracy": 0.8912301063537598, "num_tokens": 798080887.0, "step": 20914 }, { "epoch": 2.660602976720519, "ewc_loss": 0.03427715227007866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4277152735739946e-05, "grad_norm": 19.538593292236328, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.884562611579895, "num_tokens": 798126393.0, "step": 20915 }, { "epoch": 2.6607301869991096, "ewc_loss": 0.034256525337696075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4256525395903736e-05, "grad_norm": 19.481630325317383, "learning_rate": 1e-06, "loss": 0.3472, "mean_token_accuracy": 0.8906854391098022, "num_tokens": 798166099.0, "step": 20916 }, { "epoch": 2.6608573972777, "ewc_loss": 0.03428798168897629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.428798299864866e-05, "grad_norm": 19.469852447509766, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8693046569824219, "num_tokens": 798202285.0, "step": 20917 }, { "epoch": 2.6609846075562906, "ewc_loss": 0.03427460789680481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.427460615057498e-05, "grad_norm": 19.56379508972168, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8822449445724487, "num_tokens": 798243463.0, "step": 20918 }, { "epoch": 2.661111817834881, "ewc_loss": 0.034254446625709534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425444811000489e-05, "grad_norm": 19.413755416870117, "learning_rate": 1e-06, "loss": 0.3753, "mean_token_accuracy": 0.8777931332588196, "num_tokens": 798275507.0, "step": 20919 }, { "epoch": 2.6612390281134717, "ewc_loss": 0.03419935330748558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.419935455895029e-05, "grad_norm": 19.474849700927734, "learning_rate": 1e-06, "loss": 0.3329, "mean_token_accuracy": 0.8928561806678772, "num_tokens": 798310536.0, "step": 20920 }, { "epoch": 2.661366238392062, "ewc_loss": 0.03430917486548424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.430917422519997e-05, "grad_norm": 19.46881675720215, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8698225021362305, "num_tokens": 798347874.0, "step": 20921 }, { "epoch": 2.6614934486706527, "ewc_loss": 0.03418876230716705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.418876076466404e-05, "grad_norm": 19.42625617980957, "learning_rate": 1e-06, "loss": 0.3174, "mean_token_accuracy": 0.8990182876586914, "num_tokens": 798381865.0, "step": 20922 }, { "epoch": 2.661620658949243, "ewc_loss": 0.03433447703719139, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.433447636780329e-05, "grad_norm": 19.519012451171875, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8743540644645691, "num_tokens": 798423378.0, "step": 20923 }, { "epoch": 2.661747869227834, "ewc_loss": 0.03429361805319786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4293618227820843e-05, "grad_norm": 19.476728439331055, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.8817150592803955, "num_tokens": 798460278.0, "step": 20924 }, { "epoch": 2.661875079506424, "ewc_loss": 0.03426443412899971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4264434361830354e-05, "grad_norm": 19.52723503112793, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8763110041618347, "num_tokens": 798496622.0, "step": 20925 }, { "epoch": 2.662002289785015, "ewc_loss": 0.03432837873697281, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4328379115322605e-05, "grad_norm": 19.558441162109375, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.871187686920166, "num_tokens": 798533061.0, "step": 20926 }, { "epoch": 2.662129500063605, "ewc_loss": 0.03422423079609871, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422423105803318e-05, "grad_norm": 19.54092025756836, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8676874041557312, "num_tokens": 798573689.0, "step": 20927 }, { "epoch": 2.662256710342196, "ewc_loss": 0.034214965999126434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.421496512601152e-05, "grad_norm": 19.51351547241211, "learning_rate": 1e-06, "loss": 0.3597, "mean_token_accuracy": 0.8849546909332275, "num_tokens": 798614058.0, "step": 20928 }, { "epoch": 2.662383920620786, "ewc_loss": 0.0341818630695343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.418186315684579e-05, "grad_norm": 19.446247100830078, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8648619651794434, "num_tokens": 798657769.0, "step": 20929 }, { "epoch": 2.6625111308993765, "ewc_loss": 0.034221988171339035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422198642510921e-05, "grad_norm": 19.51352310180664, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8790960311889648, "num_tokens": 798701084.0, "step": 20930 }, { "epoch": 2.662638341177967, "ewc_loss": 0.03431817516684532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431817458476871e-05, "grad_norm": 19.540531158447266, "learning_rate": 1e-06, "loss": 0.3618, "mean_token_accuracy": 0.8808392286300659, "num_tokens": 798738029.0, "step": 20931 }, { "epoch": 2.6627655514565576, "ewc_loss": 0.0342165008187294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.421650035306811e-05, "grad_norm": 19.43172836303711, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8710943460464478, "num_tokens": 798773537.0, "step": 20932 }, { "epoch": 2.662892761735148, "ewc_loss": 0.03418450057506561, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4184500691480935e-05, "grad_norm": 19.459653854370117, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.871881365776062, "num_tokens": 798810397.0, "step": 20933 }, { "epoch": 2.6630199720137386, "ewc_loss": 0.034267082810401917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4267082810401917e-05, "grad_norm": 19.507366180419922, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8714457750320435, "num_tokens": 798850796.0, "step": 20934 }, { "epoch": 2.663147182292329, "ewc_loss": 0.0342264287173748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4226428397232667e-05, "grad_norm": 19.49629020690918, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8742387890815735, "num_tokens": 798892379.0, "step": 20935 }, { "epoch": 2.6632743925709197, "ewc_loss": 0.03426903113722801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426903276704252e-05, "grad_norm": 19.547748565673828, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8640706539154053, "num_tokens": 798929444.0, "step": 20936 }, { "epoch": 2.66340160284951, "ewc_loss": 0.0341976061463356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.419760469114408e-05, "grad_norm": 19.49374771118164, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.8815897107124329, "num_tokens": 798964086.0, "step": 20937 }, { "epoch": 2.6635288131281007, "ewc_loss": 0.034185007214546204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.418500637053512e-05, "grad_norm": 19.49492835998535, "learning_rate": 1e-06, "loss": 0.366, "mean_token_accuracy": 0.882815957069397, "num_tokens": 799007408.0, "step": 20938 }, { "epoch": 2.6636560234066913, "ewc_loss": 0.034166157245635986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.416615800233558e-05, "grad_norm": 19.486774444580078, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.8804870843887329, "num_tokens": 799045930.0, "step": 20939 }, { "epoch": 2.663783233685282, "ewc_loss": 0.03420007973909378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42000785167329e-05, "grad_norm": 19.490938186645508, "learning_rate": 1e-06, "loss": 0.3636, "mean_token_accuracy": 0.8850304484367371, "num_tokens": 799083568.0, "step": 20940 }, { "epoch": 2.6639104439638723, "ewc_loss": 0.034259624779224396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425962495384738e-05, "grad_norm": 19.477602005004883, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8718696236610413, "num_tokens": 799123681.0, "step": 20941 }, { "epoch": 2.664037654242463, "ewc_loss": 0.03419141843914986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.419142012717202e-05, "grad_norm": 19.518903732299805, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8715842962265015, "num_tokens": 799164623.0, "step": 20942 }, { "epoch": 2.6641648645210534, "ewc_loss": 0.03418637439608574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.418637425056659e-05, "grad_norm": 19.438438415527344, "learning_rate": 1e-06, "loss": 0.3844, "mean_token_accuracy": 0.8752878904342651, "num_tokens": 799200408.0, "step": 20943 }, { "epoch": 2.664292074799644, "ewc_loss": 0.034215476363897324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.421547808102332e-05, "grad_norm": 19.460163116455078, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8631654977798462, "num_tokens": 799245106.0, "step": 20944 }, { "epoch": 2.6644192850782344, "ewc_loss": 0.03429540619254112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4295404475415125e-05, "grad_norm": 19.520296096801758, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8823405504226685, "num_tokens": 799290429.0, "step": 20945 }, { "epoch": 2.664546495356825, "ewc_loss": 0.034199297428131104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.419929635128938e-05, "grad_norm": 19.435455322265625, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8729812502861023, "num_tokens": 799322951.0, "step": 20946 }, { "epoch": 2.6646737056354155, "ewc_loss": 0.034215621650218964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42156199621968e-05, "grad_norm": 19.494930267333984, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8715215921401978, "num_tokens": 799365498.0, "step": 20947 }, { "epoch": 2.6648009159140056, "ewc_loss": 0.034138746559619904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.413874583202414e-05, "grad_norm": 19.398845672607422, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8694908618927002, "num_tokens": 799405894.0, "step": 20948 }, { "epoch": 2.6649281261925966, "ewc_loss": 0.03419078513979912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.419078348088078e-05, "grad_norm": 19.545597076416016, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8755723237991333, "num_tokens": 799439750.0, "step": 20949 }, { "epoch": 2.6650553364711866, "ewc_loss": 0.03423771262168884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4237713407492265e-05, "grad_norm": 19.47803497314453, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8671491146087646, "num_tokens": 799475170.0, "step": 20950 }, { "epoch": 2.6651825467497776, "ewc_loss": 0.03418317064642906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.418316919123754e-05, "grad_norm": 19.489660263061523, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8619443774223328, "num_tokens": 799514231.0, "step": 20951 }, { "epoch": 2.6653097570283677, "ewc_loss": 0.03426843136548996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426843250053935e-05, "grad_norm": 19.494136810302734, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8762708306312561, "num_tokens": 799549512.0, "step": 20952 }, { "epoch": 2.6654369673069587, "ewc_loss": 0.034188076853752136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4188076824648306e-05, "grad_norm": 19.39714813232422, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8705741167068481, "num_tokens": 799592144.0, "step": 20953 }, { "epoch": 2.6655641775855488, "ewc_loss": 0.034320276230573654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432027733651921e-05, "grad_norm": 19.51890754699707, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8608576655387878, "num_tokens": 799634779.0, "step": 20954 }, { "epoch": 2.6656913878641393, "ewc_loss": 0.034316856414079666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4316857636440545e-05, "grad_norm": 19.518760681152344, "learning_rate": 1e-06, "loss": 0.3562, "mean_token_accuracy": 0.8869323134422302, "num_tokens": 799677548.0, "step": 20955 }, { "epoch": 2.66581859814273, "ewc_loss": 0.03420817479491234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420817301957868e-05, "grad_norm": 19.446897506713867, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8578174114227295, "num_tokens": 799715679.0, "step": 20956 }, { "epoch": 2.6659458084213203, "ewc_loss": 0.03426774963736534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4267748560523614e-05, "grad_norm": 19.444412231445312, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8676119446754456, "num_tokens": 799755701.0, "step": 20957 }, { "epoch": 2.666073018699911, "ewc_loss": 0.03428554907441139, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4285549190826714e-05, "grad_norm": 19.490943908691406, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8695331811904907, "num_tokens": 799795151.0, "step": 20958 }, { "epoch": 2.6662002289785014, "ewc_loss": 0.03431783616542816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431783625273965e-05, "grad_norm": 19.497941970825195, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8812235593795776, "num_tokens": 799832249.0, "step": 20959 }, { "epoch": 2.666327439257092, "ewc_loss": 0.03425613045692444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425612885621376e-05, "grad_norm": 19.4329833984375, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8551532030105591, "num_tokens": 799867762.0, "step": 20960 }, { "epoch": 2.6664546495356825, "ewc_loss": 0.03432666137814522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432666198932566e-05, "grad_norm": 19.475460052490234, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8707472085952759, "num_tokens": 799909012.0, "step": 20961 }, { "epoch": 2.666581859814273, "ewc_loss": 0.03430335968732834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4303360735066235e-05, "grad_norm": 19.480730056762695, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8718440532684326, "num_tokens": 799950316.0, "step": 20962 }, { "epoch": 2.6667090700928635, "ewc_loss": 0.03433210775256157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4332108043599874e-05, "grad_norm": 19.470279693603516, "learning_rate": 1e-06, "loss": 0.3515, "mean_token_accuracy": 0.8893594741821289, "num_tokens": 799990037.0, "step": 20963 }, { "epoch": 2.666836280371454, "ewc_loss": 0.03425050899386406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425050817895681e-05, "grad_norm": 19.5582275390625, "learning_rate": 1e-06, "loss": 0.3682, "mean_token_accuracy": 0.8838144540786743, "num_tokens": 800023935.0, "step": 20964 }, { "epoch": 2.6669634906500446, "ewc_loss": 0.03432278707623482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4322787541896105e-05, "grad_norm": 19.409276962280273, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8799415826797485, "num_tokens": 800066357.0, "step": 20965 }, { "epoch": 2.667090700928635, "ewc_loss": 0.03425408527255058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425408431212418e-05, "grad_norm": 19.57179069519043, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8617156744003296, "num_tokens": 800100667.0, "step": 20966 }, { "epoch": 2.6672179112072256, "ewc_loss": 0.03434666618704796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4346667234785855e-05, "grad_norm": 19.52808952331543, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.8783982992172241, "num_tokens": 800135101.0, "step": 20967 }, { "epoch": 2.667345121485816, "ewc_loss": 0.034198861569166183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4198859793832526e-05, "grad_norm": 19.424087524414062, "learning_rate": 1e-06, "loss": 0.3502, "mean_token_accuracy": 0.888969898223877, "num_tokens": 800176308.0, "step": 20968 }, { "epoch": 2.6674723317644067, "ewc_loss": 0.034278154373168945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4278153179911897e-05, "grad_norm": 19.47374153137207, "learning_rate": 1e-06, "loss": 0.3626, "mean_token_accuracy": 0.885918378829956, "num_tokens": 800205585.0, "step": 20969 }, { "epoch": 2.667599542042997, "ewc_loss": 0.03426670283079147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426670446060598e-05, "grad_norm": 19.463876724243164, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8749053478240967, "num_tokens": 800241400.0, "step": 20970 }, { "epoch": 2.6677267523215877, "ewc_loss": 0.0343133918941021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431339064263739e-05, "grad_norm": 19.556276321411133, "learning_rate": 1e-06, "loss": 0.3636, "mean_token_accuracy": 0.885438084602356, "num_tokens": 800279060.0, "step": 20971 }, { "epoch": 2.6678539626001783, "ewc_loss": 0.034308284521102905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.430828292039223e-05, "grad_norm": 19.53233528137207, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.883348822593689, "num_tokens": 800318606.0, "step": 20972 }, { "epoch": 2.6679811728787683, "ewc_loss": 0.03425353392958641, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425353497732431e-05, "grad_norm": 19.41493034362793, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8736843466758728, "num_tokens": 800356795.0, "step": 20973 }, { "epoch": 2.6681083831573593, "ewc_loss": 0.03429970145225525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42997009283863e-05, "grad_norm": 19.561925888061523, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8831919431686401, "num_tokens": 800393786.0, "step": 20974 }, { "epoch": 2.6682355934359494, "ewc_loss": 0.03428799659013748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.428799755056389e-05, "grad_norm": 19.395782470703125, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8731126189231873, "num_tokens": 800440153.0, "step": 20975 }, { "epoch": 2.6683628037145404, "ewc_loss": 0.03423643112182617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423643283895217e-05, "grad_norm": 19.5817813873291, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8581333160400391, "num_tokens": 800476343.0, "step": 20976 }, { "epoch": 2.6684900139931305, "ewc_loss": 0.034375421702861786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437542181927711e-05, "grad_norm": 19.446332931518555, "learning_rate": 1e-06, "loss": 0.3536, "mean_token_accuracy": 0.8873679041862488, "num_tokens": 800514227.0, "step": 20977 }, { "epoch": 2.668617224271721, "ewc_loss": 0.034243494272232056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4243494155816734e-05, "grad_norm": 19.431127548217773, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8730082511901855, "num_tokens": 800552357.0, "step": 20978 }, { "epoch": 2.6687444345503115, "ewc_loss": 0.03436575084924698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.436575207160786e-05, "grad_norm": 19.512283325195312, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.8827404975891113, "num_tokens": 800587624.0, "step": 20979 }, { "epoch": 2.668871644828902, "ewc_loss": 0.03428801894187927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.428801937843673e-05, "grad_norm": 19.418615341186523, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8697208166122437, "num_tokens": 800631183.0, "step": 20980 }, { "epoch": 2.6689988551074926, "ewc_loss": 0.03435134515166283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4351345675531775e-05, "grad_norm": 19.510602951049805, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.8799883127212524, "num_tokens": 800670334.0, "step": 20981 }, { "epoch": 2.669126065386083, "ewc_loss": 0.03435320407152176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43532046827022e-05, "grad_norm": 19.545991897583008, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8717927932739258, "num_tokens": 800703500.0, "step": 20982 }, { "epoch": 2.6692532756646736, "ewc_loss": 0.03433667868375778, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.433667734498158e-05, "grad_norm": 19.484935760498047, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.869286060333252, "num_tokens": 800742904.0, "step": 20983 }, { "epoch": 2.669380485943264, "ewc_loss": 0.03427909314632416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4279091778444126e-05, "grad_norm": 19.464271545410156, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8720858693122864, "num_tokens": 800777556.0, "step": 20984 }, { "epoch": 2.6695076962218547, "ewc_loss": 0.034346286207437515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434628524701111e-05, "grad_norm": 19.495214462280273, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8670275211334229, "num_tokens": 800817532.0, "step": 20985 }, { "epoch": 2.669634906500445, "ewc_loss": 0.0343501977622509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4350196074228734e-05, "grad_norm": 19.42074203491211, "learning_rate": 1e-06, "loss": 0.3593, "mean_token_accuracy": 0.8841896653175354, "num_tokens": 800856567.0, "step": 20986 }, { "epoch": 2.6697621167790357, "ewc_loss": 0.03435342386364937, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4353422961430624e-05, "grad_norm": 19.543840408325195, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8522772789001465, "num_tokens": 800901234.0, "step": 20987 }, { "epoch": 2.6698893270576263, "ewc_loss": 0.034459806978702545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445980837568641e-05, "grad_norm": 19.541841506958008, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8725868463516235, "num_tokens": 800942647.0, "step": 20988 }, { "epoch": 2.670016537336217, "ewc_loss": 0.03431599587202072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4315995435463265e-05, "grad_norm": 19.587818145751953, "learning_rate": 1e-06, "loss": 0.3739, "mean_token_accuracy": 0.8821057081222534, "num_tokens": 800980194.0, "step": 20989 }, { "epoch": 2.6701437476148073, "ewc_loss": 0.03434126824140549, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434126847423613e-05, "grad_norm": 19.46234893798828, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.879401683807373, "num_tokens": 801016978.0, "step": 20990 }, { "epoch": 2.670270957893398, "ewc_loss": 0.0343133881688118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431338700465858e-05, "grad_norm": 19.589101791381836, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8617463707923889, "num_tokens": 801051921.0, "step": 20991 }, { "epoch": 2.6703981681719884, "ewc_loss": 0.03437760844826698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437760824454017e-05, "grad_norm": 19.48748207092285, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8501607775688171, "num_tokens": 801087189.0, "step": 20992 }, { "epoch": 2.670525378450579, "ewc_loss": 0.03428558260202408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.428558193263598e-05, "grad_norm": 19.52408790588379, "learning_rate": 1e-06, "loss": 0.3737, "mean_token_accuracy": 0.8786804676055908, "num_tokens": 801121574.0, "step": 20993 }, { "epoch": 2.6706525887291694, "ewc_loss": 0.03441677242517471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441677108639851e-05, "grad_norm": 19.48282814025879, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8801825642585754, "num_tokens": 801160497.0, "step": 20994 }, { "epoch": 2.67077979900776, "ewc_loss": 0.03430060297250748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.430060314713046e-05, "grad_norm": 19.522367477416992, "learning_rate": 1e-06, "loss": 0.3774, "mean_token_accuracy": 0.8755226135253906, "num_tokens": 801198556.0, "step": 20995 }, { "epoch": 2.6709070092863505, "ewc_loss": 0.03434542566537857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4345426684012637e-05, "grad_norm": 19.47445297241211, "learning_rate": 1e-06, "loss": 0.3126, "mean_token_accuracy": 0.8984419107437134, "num_tokens": 801230616.0, "step": 20996 }, { "epoch": 2.671034219564941, "ewc_loss": 0.03422074019908905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4220738598378375e-05, "grad_norm": 19.432710647583008, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8731878995895386, "num_tokens": 801266582.0, "step": 20997 }, { "epoch": 2.671161429843531, "ewc_loss": 0.03436674922704697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4366748877801e-05, "grad_norm": 19.48609733581543, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.8860083222389221, "num_tokens": 801305127.0, "step": 20998 }, { "epoch": 2.671288640122122, "ewc_loss": 0.03431505337357521, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431505319895223e-05, "grad_norm": 19.493677139282227, "learning_rate": 1e-06, "loss": 0.3757, "mean_token_accuracy": 0.8797222971916199, "num_tokens": 801348284.0, "step": 20999 }, { "epoch": 2.671415850400712, "ewc_loss": 0.034319277852773666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431927689234726e-05, "grad_norm": 19.5251522064209, "learning_rate": 1e-06, "loss": 0.3288, "mean_token_accuracy": 0.8941371440887451, "num_tokens": 801385298.0, "step": 21000 }, { "epoch": 2.671543060679303, "ewc_loss": 0.03429849445819855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4298493119422346e-05, "grad_norm": 19.508716583251953, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8666606545448303, "num_tokens": 801417020.0, "step": 21001 }, { "epoch": 2.6716702709578932, "ewc_loss": 0.03435168415307999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4351684007560834e-05, "grad_norm": 19.567102432250977, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8679369688034058, "num_tokens": 801455328.0, "step": 21002 }, { "epoch": 2.6717974812364838, "ewc_loss": 0.03426116704940796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4261167456861585e-05, "grad_norm": 19.461091995239258, "learning_rate": 1e-06, "loss": 0.3589, "mean_token_accuracy": 0.884621798992157, "num_tokens": 801494342.0, "step": 21003 }, { "epoch": 2.6719246915150743, "ewc_loss": 0.03426985815167427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426985858823173e-05, "grad_norm": 19.498340606689453, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8773984909057617, "num_tokens": 801528182.0, "step": 21004 }, { "epoch": 2.672051901793665, "ewc_loss": 0.03436204418540001, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.436204497120343e-05, "grad_norm": 19.55745506286621, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8754034042358398, "num_tokens": 801569424.0, "step": 21005 }, { "epoch": 2.6721791120722553, "ewc_loss": 0.03428453952074051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.428453783271834e-05, "grad_norm": 19.409997940063477, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8727689981460571, "num_tokens": 801608809.0, "step": 21006 }, { "epoch": 2.672306322350846, "ewc_loss": 0.03423601761460304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423601810936816e-05, "grad_norm": 19.441638946533203, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8669548034667969, "num_tokens": 801651910.0, "step": 21007 }, { "epoch": 2.6724335326294364, "ewc_loss": 0.03434744477272034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434744576225057e-05, "grad_norm": 19.49910545349121, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8732255697250366, "num_tokens": 801692961.0, "step": 21008 }, { "epoch": 2.672560742908027, "ewc_loss": 0.03431033343076706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431033474043943e-05, "grad_norm": 19.521514892578125, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8631088733673096, "num_tokens": 801731278.0, "step": 21009 }, { "epoch": 2.6726879531866174, "ewc_loss": 0.034296829253435135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4296830563107505e-05, "grad_norm": 19.455455780029297, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8708503246307373, "num_tokens": 801769562.0, "step": 21010 }, { "epoch": 2.672815163465208, "ewc_loss": 0.03425632789731026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4256328945048153e-05, "grad_norm": 19.484506607055664, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8700921535491943, "num_tokens": 801811046.0, "step": 21011 }, { "epoch": 2.6729423737437985, "ewc_loss": 0.03433258831501007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.433258825680241e-05, "grad_norm": 19.518617630004883, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8524352312088013, "num_tokens": 801847833.0, "step": 21012 }, { "epoch": 2.673069584022389, "ewc_loss": 0.03430650010704994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4306500310776755e-05, "grad_norm": 19.49285316467285, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8699519038200378, "num_tokens": 801887819.0, "step": 21013 }, { "epoch": 2.6731967943009796, "ewc_loss": 0.03427761793136597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4277618397027254e-05, "grad_norm": 19.47943687438965, "learning_rate": 1e-06, "loss": 0.357, "mean_token_accuracy": 0.8860979676246643, "num_tokens": 801922535.0, "step": 21014 }, { "epoch": 2.67332400457957, "ewc_loss": 0.03429143503308296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.429143544053659e-05, "grad_norm": 19.497846603393555, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.8800060749053955, "num_tokens": 801958025.0, "step": 21015 }, { "epoch": 2.6734512148581606, "ewc_loss": 0.03432194143533707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432193989283405e-05, "grad_norm": 19.47408676147461, "learning_rate": 1e-06, "loss": 0.3761, "mean_token_accuracy": 0.8804538249969482, "num_tokens": 801991291.0, "step": 21016 }, { "epoch": 2.673578425136751, "ewc_loss": 0.03427286818623543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.427286719670519e-05, "grad_norm": 19.497629165649414, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8765865564346313, "num_tokens": 802024952.0, "step": 21017 }, { "epoch": 2.6737056354153417, "ewc_loss": 0.03437517210841179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437517079873942e-05, "grad_norm": 19.514822006225586, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.873694658279419, "num_tokens": 802065741.0, "step": 21018 }, { "epoch": 2.673832845693932, "ewc_loss": 0.03433147072792053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.433147139730863e-05, "grad_norm": 19.52104949951172, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8744556903839111, "num_tokens": 802107153.0, "step": 21019 }, { "epoch": 2.6739600559725227, "ewc_loss": 0.034292031079530716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.429203206906095e-05, "grad_norm": 19.407930374145508, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.8802036643028259, "num_tokens": 802148177.0, "step": 21020 }, { "epoch": 2.674087266251113, "ewc_loss": 0.034318551421165466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431855293456465e-05, "grad_norm": 19.56484031677246, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8763799071311951, "num_tokens": 802186144.0, "step": 21021 }, { "epoch": 2.674214476529704, "ewc_loss": 0.03440555930137634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4405558835715055e-05, "grad_norm": 19.45789909362793, "learning_rate": 1e-06, "loss": 0.3503, "mean_token_accuracy": 0.8885009288787842, "num_tokens": 802224054.0, "step": 21022 }, { "epoch": 2.674341686808294, "ewc_loss": 0.03428908437490463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.428908530622721e-05, "grad_norm": 19.548152923583984, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8586069345474243, "num_tokens": 802264483.0, "step": 21023 }, { "epoch": 2.674468897086885, "ewc_loss": 0.03437959775328636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437959821894765e-05, "grad_norm": 19.442949295043945, "learning_rate": 1e-06, "loss": 0.3682, "mean_token_accuracy": 0.8827874064445496, "num_tokens": 802302933.0, "step": 21024 }, { "epoch": 2.674596107365475, "ewc_loss": 0.03429229184985161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4292290365556255e-05, "grad_norm": 19.548925399780273, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8681844472885132, "num_tokens": 802347354.0, "step": 21025 }, { "epoch": 2.674723317644066, "ewc_loss": 0.03437086194753647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437086343183182e-05, "grad_norm": 19.505573272705078, "learning_rate": 1e-06, "loss": 0.3616, "mean_token_accuracy": 0.8837023973464966, "num_tokens": 802392159.0, "step": 21026 }, { "epoch": 2.674850527922656, "ewc_loss": 0.03431744873523712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4317446989007294e-05, "grad_norm": 19.519886016845703, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.8789687156677246, "num_tokens": 802432629.0, "step": 21027 }, { "epoch": 2.6749777382012465, "ewc_loss": 0.03432837501168251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43283754773438e-05, "grad_norm": 19.449052810668945, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.8830240964889526, "num_tokens": 802471776.0, "step": 21028 }, { "epoch": 2.675104948479837, "ewc_loss": 0.03430410474538803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.430410652072169e-05, "grad_norm": 19.502290725708008, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.869010865688324, "num_tokens": 802507314.0, "step": 21029 }, { "epoch": 2.6752321587584276, "ewc_loss": 0.03436359018087387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4363591112196445e-05, "grad_norm": 19.519845962524414, "learning_rate": 1e-06, "loss": 0.3611, "mean_token_accuracy": 0.8829087018966675, "num_tokens": 802550587.0, "step": 21030 }, { "epoch": 2.675359369037018, "ewc_loss": 0.034251805394887924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425180693739094e-05, "grad_norm": 19.44927406311035, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8755305409431458, "num_tokens": 802591544.0, "step": 21031 }, { "epoch": 2.6754865793156086, "ewc_loss": 0.03430170938372612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4301709092687815e-05, "grad_norm": 19.51449203491211, "learning_rate": 1e-06, "loss": 0.3599, "mean_token_accuracy": 0.8829728364944458, "num_tokens": 802627336.0, "step": 21032 }, { "epoch": 2.675613789594199, "ewc_loss": 0.034368496388196945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4368495107628405e-05, "grad_norm": 19.560970306396484, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8739185333251953, "num_tokens": 802668358.0, "step": 21033 }, { "epoch": 2.6757409998727897, "ewc_loss": 0.03421081602573395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.421081419219263e-05, "grad_norm": 19.488576889038086, "learning_rate": 1e-06, "loss": 0.3737, "mean_token_accuracy": 0.8820362091064453, "num_tokens": 802708023.0, "step": 21034 }, { "epoch": 2.67586821015138, "ewc_loss": 0.034312281757593155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4312281059101224e-05, "grad_norm": 19.530662536621094, "learning_rate": 1e-06, "loss": 0.3394, "mean_token_accuracy": 0.8953618407249451, "num_tokens": 802745856.0, "step": 21035 }, { "epoch": 2.6759954204299707, "ewc_loss": 0.03420822694897652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.420822758926079e-05, "grad_norm": 19.390235900878906, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.8870325684547424, "num_tokens": 802779032.0, "step": 21036 }, { "epoch": 2.6761226307085613, "ewc_loss": 0.03416713327169418, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.416713298065588e-05, "grad_norm": 19.47823715209961, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8762302994728088, "num_tokens": 802815736.0, "step": 21037 }, { "epoch": 2.676249840987152, "ewc_loss": 0.0343160517513752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431605000514537e-05, "grad_norm": 19.47736167907715, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8722273707389832, "num_tokens": 802854133.0, "step": 21038 }, { "epoch": 2.6763770512657423, "ewc_loss": 0.0342494435608387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.424944225116633e-05, "grad_norm": 19.467288970947266, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8823872804641724, "num_tokens": 802892969.0, "step": 21039 }, { "epoch": 2.676504261544333, "ewc_loss": 0.03423761948943138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.423761882004328e-05, "grad_norm": 19.42207908630371, "learning_rate": 1e-06, "loss": 0.3477, "mean_token_accuracy": 0.8891712427139282, "num_tokens": 802936998.0, "step": 21040 }, { "epoch": 2.6766314718229234, "ewc_loss": 0.03428666293621063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4286662412341684e-05, "grad_norm": 19.450756072998047, "learning_rate": 1e-06, "loss": 0.3659, "mean_token_accuracy": 0.8845182657241821, "num_tokens": 802979040.0, "step": 21041 }, { "epoch": 2.676758682101514, "ewc_loss": 0.034271854907274246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.427185583859682e-05, "grad_norm": 19.478759765625, "learning_rate": 1e-06, "loss": 0.3579, "mean_token_accuracy": 0.8867263793945312, "num_tokens": 803016516.0, "step": 21042 }, { "epoch": 2.6768858923801044, "ewc_loss": 0.03424511477351189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.424511305638589e-05, "grad_norm": 19.49142074584961, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8735047578811646, "num_tokens": 803055566.0, "step": 21043 }, { "epoch": 2.677013102658695, "ewc_loss": 0.03425227850675583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425227987463586e-05, "grad_norm": 19.519332885742188, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.87703937292099, "num_tokens": 803093683.0, "step": 21044 }, { "epoch": 2.6771403129372855, "ewc_loss": 0.03429929167032242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42992898367811e-05, "grad_norm": 19.486865997314453, "learning_rate": 1e-06, "loss": 0.3727, "mean_token_accuracy": 0.8844119310379028, "num_tokens": 803133691.0, "step": 21045 }, { "epoch": 2.6772675232158756, "ewc_loss": 0.03426462039351463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4264619898749515e-05, "grad_norm": 19.570068359375, "learning_rate": 1e-06, "loss": 0.3522, "mean_token_accuracy": 0.884113609790802, "num_tokens": 803169870.0, "step": 21046 }, { "epoch": 2.6773947334944665, "ewc_loss": 0.03429291024804115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.429290882195346e-05, "grad_norm": 19.537500381469727, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8687071800231934, "num_tokens": 803209038.0, "step": 21047 }, { "epoch": 2.6775219437730566, "ewc_loss": 0.03422732278704643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422732334001921e-05, "grad_norm": 19.53199005126953, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8614858388900757, "num_tokens": 803239725.0, "step": 21048 }, { "epoch": 2.6776491540516476, "ewc_loss": 0.03429929167032242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42992898367811e-05, "grad_norm": 19.537771224975586, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8738672733306885, "num_tokens": 803273580.0, "step": 21049 }, { "epoch": 2.6777763643302377, "ewc_loss": 0.034251611679792404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4251610486535355e-05, "grad_norm": 19.382125854492188, "learning_rate": 1e-06, "loss": 0.3686, "mean_token_accuracy": 0.8814849853515625, "num_tokens": 803313583.0, "step": 21050 }, { "epoch": 2.6779035746088287, "ewc_loss": 0.03423942252993584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42394232575316e-05, "grad_norm": 19.569427490234375, "learning_rate": 1e-06, "loss": 0.3626, "mean_token_accuracy": 0.8836628198623657, "num_tokens": 803347517.0, "step": 21051 }, { "epoch": 2.6780307848874187, "ewc_loss": 0.03441115468740463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441115404712036e-05, "grad_norm": 19.46959114074707, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8620216250419617, "num_tokens": 803388183.0, "step": 21052 }, { "epoch": 2.6781579951660093, "ewc_loss": 0.03422292321920395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.422292502364144e-05, "grad_norm": 19.457717895507812, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8758103847503662, "num_tokens": 803428377.0, "step": 21053 }, { "epoch": 2.6782852054446, "ewc_loss": 0.034346118569374084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4346117899985984e-05, "grad_norm": 19.537813186645508, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.8842790126800537, "num_tokens": 803460930.0, "step": 21054 }, { "epoch": 2.6784124157231903, "ewc_loss": 0.03433994576334953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.433994424995035e-05, "grad_norm": 19.470773696899414, "learning_rate": 1e-06, "loss": 0.3654, "mean_token_accuracy": 0.8839901089668274, "num_tokens": 803497971.0, "step": 21055 }, { "epoch": 2.678539626001781, "ewc_loss": 0.03425150364637375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425150498514995e-05, "grad_norm": 19.46044921875, "learning_rate": 1e-06, "loss": 0.3436, "mean_token_accuracy": 0.890690803527832, "num_tokens": 803535514.0, "step": 21056 }, { "epoch": 2.6786668362803714, "ewc_loss": 0.03435147553682327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435147664276883e-05, "grad_norm": 19.43560028076172, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8722630143165588, "num_tokens": 803580767.0, "step": 21057 }, { "epoch": 2.678794046558962, "ewc_loss": 0.034406498074531555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4406497434247285e-05, "grad_norm": 19.523883819580078, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8789172768592834, "num_tokens": 803619168.0, "step": 21058 }, { "epoch": 2.6789212568375524, "ewc_loss": 0.03434944152832031, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4349443012615666e-05, "grad_norm": 19.43147087097168, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8627834320068359, "num_tokens": 803654785.0, "step": 21059 }, { "epoch": 2.679048467116143, "ewc_loss": 0.034340959042310715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434095924603753e-05, "grad_norm": 19.52975845336914, "learning_rate": 1e-06, "loss": 0.3572, "mean_token_accuracy": 0.8840552568435669, "num_tokens": 803692505.0, "step": 21060 }, { "epoch": 2.6791756773947335, "ewc_loss": 0.03438282012939453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438282146817073e-05, "grad_norm": 19.4052791595459, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.883222222328186, "num_tokens": 803732592.0, "step": 21061 }, { "epoch": 2.679302887673324, "ewc_loss": 0.03436386212706566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.436386396060698e-05, "grad_norm": 19.51400375366211, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8643012046813965, "num_tokens": 803767601.0, "step": 21062 }, { "epoch": 2.6794300979519146, "ewc_loss": 0.034407325088977814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.44073268934153e-05, "grad_norm": 19.495580673217773, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8696460723876953, "num_tokens": 803804329.0, "step": 21063 }, { "epoch": 2.679557308230505, "ewc_loss": 0.03435874730348587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4358748962404206e-05, "grad_norm": 19.43491554260254, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.871474027633667, "num_tokens": 803846182.0, "step": 21064 }, { "epoch": 2.6796845185090956, "ewc_loss": 0.03436980023980141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.436980114202015e-05, "grad_norm": 19.50368309020996, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8718893527984619, "num_tokens": 803883561.0, "step": 21065 }, { "epoch": 2.679811728787686, "ewc_loss": 0.03443918377161026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4439184673829004e-05, "grad_norm": 19.588459014892578, "learning_rate": 1e-06, "loss": 0.3338, "mean_token_accuracy": 0.8940049409866333, "num_tokens": 803916860.0, "step": 21066 }, { "epoch": 2.6799389390662767, "ewc_loss": 0.034359533339738846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435953476582654e-05, "grad_norm": 19.428815841674805, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8762167692184448, "num_tokens": 803951949.0, "step": 21067 }, { "epoch": 2.680066149344867, "ewc_loss": 0.0344109907746315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4410990338074043e-05, "grad_norm": 19.53216552734375, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8726131319999695, "num_tokens": 803992094.0, "step": 21068 }, { "epoch": 2.6801933596234577, "ewc_loss": 0.03439255431294441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43925530614797e-05, "grad_norm": 19.43037223815918, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.876307487487793, "num_tokens": 804034058.0, "step": 21069 }, { "epoch": 2.6803205699020483, "ewc_loss": 0.03430714085698128, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43071405950468e-05, "grad_norm": 19.499265670776367, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8679113388061523, "num_tokens": 804072722.0, "step": 21070 }, { "epoch": 2.6804477801806383, "ewc_loss": 0.034487053751945496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448705319897272e-05, "grad_norm": 19.509727478027344, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8817480206489563, "num_tokens": 804114253.0, "step": 21071 }, { "epoch": 2.6805749904592293, "ewc_loss": 0.034309737384319305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.430973811191507e-05, "grad_norm": 19.47045135498047, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8781161308288574, "num_tokens": 804154877.0, "step": 21072 }, { "epoch": 2.6807022007378194, "ewc_loss": 0.03437188267707825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437188206589781e-05, "grad_norm": 19.537673950195312, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8703927993774414, "num_tokens": 804191060.0, "step": 21073 }, { "epoch": 2.6808294110164104, "ewc_loss": 0.03435959666967392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435959661146626e-05, "grad_norm": 19.495920181274414, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8681195974349976, "num_tokens": 804231574.0, "step": 21074 }, { "epoch": 2.6809566212950005, "ewc_loss": 0.034318238496780396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431824006838724e-05, "grad_norm": 19.539039611816406, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.854381263256073, "num_tokens": 804264027.0, "step": 21075 }, { "epoch": 2.681083831573591, "ewc_loss": 0.03436293825507164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4362939913989976e-05, "grad_norm": 19.476150512695312, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.877672553062439, "num_tokens": 804302906.0, "step": 21076 }, { "epoch": 2.6812110418521815, "ewc_loss": 0.034271709620952606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.427171031944454e-05, "grad_norm": 19.529150009155273, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8713006377220154, "num_tokens": 804339107.0, "step": 21077 }, { "epoch": 2.681338252130772, "ewc_loss": 0.034385453909635544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438545536482707e-05, "grad_norm": 19.52577018737793, "learning_rate": 1e-06, "loss": 0.3777, "mean_token_accuracy": 0.8805458545684814, "num_tokens": 804374292.0, "step": 21078 }, { "epoch": 2.6814654624093626, "ewc_loss": 0.03437270224094391, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43727006111294e-05, "grad_norm": 19.566240310668945, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8768669366836548, "num_tokens": 804420287.0, "step": 21079 }, { "epoch": 2.681592672687953, "ewc_loss": 0.03429368883371353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.429368734941818e-05, "grad_norm": 19.460227966308594, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.8812099695205688, "num_tokens": 804453977.0, "step": 21080 }, { "epoch": 2.6817198829665436, "ewc_loss": 0.034309666603803635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4309665352338925e-05, "grad_norm": 19.568387985229492, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8603554368019104, "num_tokens": 804494958.0, "step": 21081 }, { "epoch": 2.681847093245134, "ewc_loss": 0.03438330814242363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438330895733088e-05, "grad_norm": 19.53250503540039, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8571179509162903, "num_tokens": 804532209.0, "step": 21082 }, { "epoch": 2.6819743035237247, "ewc_loss": 0.034262217581272125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4262218832736835e-05, "grad_norm": 19.52735137939453, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8777718544006348, "num_tokens": 804572937.0, "step": 21083 }, { "epoch": 2.682101513802315, "ewc_loss": 0.03432226926088333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43222709489055e-05, "grad_norm": 19.512617111206055, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8754667639732361, "num_tokens": 804610263.0, "step": 21084 }, { "epoch": 2.6822287240809057, "ewc_loss": 0.03426622226834297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4266220609424636e-05, "grad_norm": 19.541889190673828, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8791838884353638, "num_tokens": 804645516.0, "step": 21085 }, { "epoch": 2.6823559343594963, "ewc_loss": 0.03436049446463585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.436049519223161e-05, "grad_norm": 19.530431747436523, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8861068487167358, "num_tokens": 804684540.0, "step": 21086 }, { "epoch": 2.682483144638087, "ewc_loss": 0.03426222875714302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4262229746673256e-05, "grad_norm": 19.506624221801758, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8767213821411133, "num_tokens": 804723030.0, "step": 21087 }, { "epoch": 2.6826103549166773, "ewc_loss": 0.034299351274967194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.429935168242082e-05, "grad_norm": 19.598976135253906, "learning_rate": 1e-06, "loss": 0.356, "mean_token_accuracy": 0.8847554922103882, "num_tokens": 804759202.0, "step": 21088 }, { "epoch": 2.682737565195268, "ewc_loss": 0.034286823123693466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4286822483409196e-05, "grad_norm": 19.553367614746094, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8723791837692261, "num_tokens": 804803199.0, "step": 21089 }, { "epoch": 2.6828647754738584, "ewc_loss": 0.034163009375333786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4163011150667444e-05, "grad_norm": 19.498125076293945, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8731807470321655, "num_tokens": 804835675.0, "step": 21090 }, { "epoch": 2.682991985752449, "ewc_loss": 0.03428808972239494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4288088500034064e-05, "grad_norm": 19.584243774414062, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8631675243377686, "num_tokens": 804876583.0, "step": 21091 }, { "epoch": 2.6831191960310394, "ewc_loss": 0.034275539219379425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42755374731496e-05, "grad_norm": 19.54527473449707, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8667562007904053, "num_tokens": 804906110.0, "step": 21092 }, { "epoch": 2.68324640630963, "ewc_loss": 0.03429882973432541, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4298831451451406e-05, "grad_norm": 19.523958206176758, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8671901822090149, "num_tokens": 804942196.0, "step": 21093 }, { "epoch": 2.6833736165882205, "ewc_loss": 0.03426852822303772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426852708798833e-05, "grad_norm": 19.56474494934082, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8751966953277588, "num_tokens": 804978760.0, "step": 21094 }, { "epoch": 2.683500826866811, "ewc_loss": 0.03429882228374481, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4298820537514985e-05, "grad_norm": 19.564579010009766, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8680077791213989, "num_tokens": 805014212.0, "step": 21095 }, { "epoch": 2.683628037145401, "ewc_loss": 0.03432441130280495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432441008044407e-05, "grad_norm": 19.611846923828125, "learning_rate": 1e-06, "loss": 0.3197, "mean_token_accuracy": 0.8977323770523071, "num_tokens": 805050528.0, "step": 21096 }, { "epoch": 2.683755247423992, "ewc_loss": 0.03427181392908096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4271812182851136e-05, "grad_norm": 19.407737731933594, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8756588101387024, "num_tokens": 805087355.0, "step": 21097 }, { "epoch": 2.683882457702582, "ewc_loss": 0.034255459904670715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.425545946811326e-05, "grad_norm": 19.512882232666016, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8613520860671997, "num_tokens": 805117837.0, "step": 21098 }, { "epoch": 2.684009667981173, "ewc_loss": 0.03440788388252258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440788350417279e-05, "grad_norm": 19.50916290283203, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8745192289352417, "num_tokens": 805157734.0, "step": 21099 }, { "epoch": 2.684136878259763, "ewc_loss": 0.03432523459196091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432523590163328e-05, "grad_norm": 19.582782745361328, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8818684816360474, "num_tokens": 805190657.0, "step": 21100 }, { "epoch": 2.6842640885383537, "ewc_loss": 0.034332238137722015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.433223901083693e-05, "grad_norm": 19.42679786682129, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.869024395942688, "num_tokens": 805229654.0, "step": 21101 }, { "epoch": 2.6843912988169443, "ewc_loss": 0.03438493609428406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438493513385765e-05, "grad_norm": 19.460975646972656, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8749282360076904, "num_tokens": 805269199.0, "step": 21102 }, { "epoch": 2.684518509095535, "ewc_loss": 0.03441235423088074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.44123545801267e-05, "grad_norm": 19.525251388549805, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8707227110862732, "num_tokens": 805308918.0, "step": 21103 }, { "epoch": 2.6846457193741253, "ewc_loss": 0.03439686447381973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4396864066366106e-05, "grad_norm": 19.454641342163086, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8725876808166504, "num_tokens": 805345014.0, "step": 21104 }, { "epoch": 2.684772929652716, "ewc_loss": 0.03440672159194946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4406722988933325e-05, "grad_norm": 19.491649627685547, "learning_rate": 1e-06, "loss": 0.3536, "mean_token_accuracy": 0.8878488540649414, "num_tokens": 805383674.0, "step": 21105 }, { "epoch": 2.6849001399313064, "ewc_loss": 0.03443208709359169, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443208697717637e-05, "grad_norm": 19.51551055908203, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.879747748374939, "num_tokens": 805419765.0, "step": 21106 }, { "epoch": 2.685027350209897, "ewc_loss": 0.03435300663113594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4353008231846616e-05, "grad_norm": 19.532672882080078, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8761777877807617, "num_tokens": 805461346.0, "step": 21107 }, { "epoch": 2.6851545604884874, "ewc_loss": 0.03439316153526306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439316060394049e-05, "grad_norm": 19.48647689819336, "learning_rate": 1e-06, "loss": 0.3712, "mean_token_accuracy": 0.8820388317108154, "num_tokens": 805494519.0, "step": 21108 }, { "epoch": 2.685281770767078, "ewc_loss": 0.034307654947042465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43076535500586e-05, "grad_norm": 19.451364517211914, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8659729361534119, "num_tokens": 805530542.0, "step": 21109 }, { "epoch": 2.6854089810456685, "ewc_loss": 0.03438465669751167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4384655009489506e-05, "grad_norm": 19.477951049804688, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8740121722221375, "num_tokens": 805564531.0, "step": 21110 }, { "epoch": 2.685536191324259, "ewc_loss": 0.03442570939660072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4425709600327536e-05, "grad_norm": 19.510934829711914, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8718445301055908, "num_tokens": 805606011.0, "step": 21111 }, { "epoch": 2.6856634016028496, "ewc_loss": 0.03440752625465393, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440752698224969e-05, "grad_norm": 19.4787654876709, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8689273595809937, "num_tokens": 805643078.0, "step": 21112 }, { "epoch": 2.68579061188144, "ewc_loss": 0.03440389409661293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440389264142141e-05, "grad_norm": 19.47992515563965, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8787446618080139, "num_tokens": 805678554.0, "step": 21113 }, { "epoch": 2.6859178221600306, "ewc_loss": 0.03439933806657791, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439933789195493e-05, "grad_norm": 19.477266311645508, "learning_rate": 1e-06, "loss": 0.3554, "mean_token_accuracy": 0.8852421045303345, "num_tokens": 805713736.0, "step": 21114 }, { "epoch": 2.686045032438621, "ewc_loss": 0.03443063423037529, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443063542363234e-05, "grad_norm": 19.491666793823242, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8686254024505615, "num_tokens": 805751798.0, "step": 21115 }, { "epoch": 2.6861722427172117, "ewc_loss": 0.03446280211210251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4462802432244644e-05, "grad_norm": 19.560199737548828, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.882488489151001, "num_tokens": 805789205.0, "step": 21116 }, { "epoch": 2.686299452995802, "ewc_loss": 0.03438226133584976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438226121943444e-05, "grad_norm": 19.398483276367188, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.8782360553741455, "num_tokens": 805824910.0, "step": 21117 }, { "epoch": 2.6864266632743927, "ewc_loss": 0.03439890965819359, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439890861045569e-05, "grad_norm": 19.50005340576172, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8728170394897461, "num_tokens": 805861335.0, "step": 21118 }, { "epoch": 2.686553873552983, "ewc_loss": 0.03448377922177315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4483779018046334e-05, "grad_norm": 19.51793098449707, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8717881441116333, "num_tokens": 805901488.0, "step": 21119 }, { "epoch": 2.686681083831574, "ewc_loss": 0.034439049661159515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443905006861314e-05, "grad_norm": 19.64826202392578, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.8390399217605591, "num_tokens": 805941336.0, "step": 21120 }, { "epoch": 2.686808294110164, "ewc_loss": 0.03449240326881409, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.449240466579795e-05, "grad_norm": 19.433809280395508, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8837366104125977, "num_tokens": 805974222.0, "step": 21121 }, { "epoch": 2.686935504388755, "ewc_loss": 0.03434872254729271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434872269281186e-05, "grad_norm": 19.514076232910156, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8717539310455322, "num_tokens": 806014942.0, "step": 21122 }, { "epoch": 2.687062714667345, "ewc_loss": 0.03446148335933685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446148184593767e-05, "grad_norm": 19.47708511352539, "learning_rate": 1e-06, "loss": 0.3521, "mean_token_accuracy": 0.885205090045929, "num_tokens": 806048894.0, "step": 21123 }, { "epoch": 2.687189924945936, "ewc_loss": 0.03434499725699425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43449974025134e-05, "grad_norm": 19.44095802307129, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8733440041542053, "num_tokens": 806089714.0, "step": 21124 }, { "epoch": 2.687317135224526, "ewc_loss": 0.034419454634189606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4419455914758146e-05, "grad_norm": 19.450382232666016, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8634162545204163, "num_tokens": 806128974.0, "step": 21125 }, { "epoch": 2.6874443455031165, "ewc_loss": 0.0344141460955143, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441414446569979e-05, "grad_norm": 19.49612045288086, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8745869398117065, "num_tokens": 806169857.0, "step": 21126 }, { "epoch": 2.687571555781707, "ewc_loss": 0.034427400678396225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4427401260472834e-05, "grad_norm": 19.52443504333496, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.87619948387146, "num_tokens": 806202826.0, "step": 21127 }, { "epoch": 2.6876987660602976, "ewc_loss": 0.03446989879012108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446990012889728e-05, "grad_norm": 19.502670288085938, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.87278151512146, "num_tokens": 806242850.0, "step": 21128 }, { "epoch": 2.687825976338888, "ewc_loss": 0.03439176827669144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439176725805737e-05, "grad_norm": 19.544836044311523, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8762060403823853, "num_tokens": 806279024.0, "step": 21129 }, { "epoch": 2.6879531866174786, "ewc_loss": 0.034381356090307236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4381355362711474e-05, "grad_norm": 19.459575653076172, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8617963790893555, "num_tokens": 806313669.0, "step": 21130 }, { "epoch": 2.688080396896069, "ewc_loss": 0.03437548503279686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437548366491683e-05, "grad_norm": 19.5502986907959, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8781599998474121, "num_tokens": 806353128.0, "step": 21131 }, { "epoch": 2.6882076071746597, "ewc_loss": 0.03445436805486679, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4454369597369805e-05, "grad_norm": 19.484508514404297, "learning_rate": 1e-06, "loss": 0.3844, "mean_token_accuracy": 0.8754885792732239, "num_tokens": 806385044.0, "step": 21132 }, { "epoch": 2.68833481745325, "ewc_loss": 0.03439238667488098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4392385714454576e-05, "grad_norm": 19.51346778869629, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8674976825714111, "num_tokens": 806422157.0, "step": 21133 }, { "epoch": 2.6884620277318407, "ewc_loss": 0.0345175564289093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.451755765127018e-05, "grad_norm": 19.56023597717285, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8745718002319336, "num_tokens": 806463180.0, "step": 21134 }, { "epoch": 2.6885892380104313, "ewc_loss": 0.03439467400312424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439467400312424e-05, "grad_norm": 19.452974319458008, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8719731569290161, "num_tokens": 806502031.0, "step": 21135 }, { "epoch": 2.688716448289022, "ewc_loss": 0.03440496325492859, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440496220719069e-05, "grad_norm": 19.49168586730957, "learning_rate": 1e-06, "loss": 0.3883, "mean_token_accuracy": 0.87661212682724, "num_tokens": 806539585.0, "step": 21136 }, { "epoch": 2.6888436585676123, "ewc_loss": 0.03443066030740738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443066088948399e-05, "grad_norm": 19.47955322265625, "learning_rate": 1e-06, "loss": 0.3315, "mean_token_accuracy": 0.8951599597930908, "num_tokens": 806573147.0, "step": 21137 }, { "epoch": 2.688970868846203, "ewc_loss": 0.034412235021591187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441223452682607e-05, "grad_norm": 19.509557723999023, "learning_rate": 1e-06, "loss": 0.3539, "mean_token_accuracy": 0.8882347345352173, "num_tokens": 806609785.0, "step": 21138 }, { "epoch": 2.6890980791247934, "ewc_loss": 0.03438493236899376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4384931495878845e-05, "grad_norm": 19.530235290527344, "learning_rate": 1e-06, "loss": 0.3616, "mean_token_accuracy": 0.8835035562515259, "num_tokens": 806647709.0, "step": 21139 }, { "epoch": 2.689225289403384, "ewc_loss": 0.03442123159766197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4421231248416007e-05, "grad_norm": 19.52631378173828, "learning_rate": 1e-06, "loss": 0.3774, "mean_token_accuracy": 0.8785086274147034, "num_tokens": 806684448.0, "step": 21140 }, { "epoch": 2.6893524996819744, "ewc_loss": 0.03442027047276497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4420270822010934e-05, "grad_norm": 19.55175018310547, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8782895803451538, "num_tokens": 806725326.0, "step": 21141 }, { "epoch": 2.689479709960565, "ewc_loss": 0.03436378389596939, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.436378392507322e-05, "grad_norm": 19.458721160888672, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8752585649490356, "num_tokens": 806759797.0, "step": 21142 }, { "epoch": 2.6896069202391555, "ewc_loss": 0.03437909483909607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437909617787227e-05, "grad_norm": 19.492109298706055, "learning_rate": 1e-06, "loss": 0.3645, "mean_token_accuracy": 0.8819865584373474, "num_tokens": 806799986.0, "step": 21143 }, { "epoch": 2.6897341305177456, "ewc_loss": 0.0344109870493412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4410986700095236e-05, "grad_norm": 19.43338966369629, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8695453405380249, "num_tokens": 806843055.0, "step": 21144 }, { "epoch": 2.6898613407963365, "ewc_loss": 0.034443244338035583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.444324465817772e-05, "grad_norm": 19.546260833740234, "learning_rate": 1e-06, "loss": 0.353, "mean_token_accuracy": 0.889285683631897, "num_tokens": 806875897.0, "step": 21145 }, { "epoch": 2.6899885510749266, "ewc_loss": 0.03443571925163269, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4435717680025846e-05, "grad_norm": 19.58572769165039, "learning_rate": 1e-06, "loss": 0.3727, "mean_token_accuracy": 0.883355438709259, "num_tokens": 806915583.0, "step": 21146 }, { "epoch": 2.6901157613535176, "ewc_loss": 0.03441889211535454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4418892028043047e-05, "grad_norm": 19.477294921875, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8779356479644775, "num_tokens": 806953851.0, "step": 21147 }, { "epoch": 2.6902429716321077, "ewc_loss": 0.034381985664367676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43819847330451e-05, "grad_norm": 19.64254379272461, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8720443844795227, "num_tokens": 806988519.0, "step": 21148 }, { "epoch": 2.6903701819106987, "ewc_loss": 0.034390889108181, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439089050516486e-05, "grad_norm": 19.41817283630371, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8788813352584839, "num_tokens": 807029612.0, "step": 21149 }, { "epoch": 2.6904973921892887, "ewc_loss": 0.03435272350907326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435272446949966e-05, "grad_norm": 19.526241302490234, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8603284358978271, "num_tokens": 807064009.0, "step": 21150 }, { "epoch": 2.6906246024678793, "ewc_loss": 0.0344667062163353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446670598350465e-05, "grad_norm": 19.46929931640625, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8725509643554688, "num_tokens": 807104694.0, "step": 21151 }, { "epoch": 2.69075181274647, "ewc_loss": 0.034319281578063965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431928053032607e-05, "grad_norm": 19.585412979125977, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8544063568115234, "num_tokens": 807142500.0, "step": 21152 }, { "epoch": 2.6908790230250603, "ewc_loss": 0.03446630388498306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4466302167857066e-05, "grad_norm": 19.489192962646484, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8718791007995605, "num_tokens": 807172123.0, "step": 21153 }, { "epoch": 2.691006233303651, "ewc_loss": 0.03437032550573349, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437032501096837e-05, "grad_norm": 19.52654266357422, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8772373199462891, "num_tokens": 807206422.0, "step": 21154 }, { "epoch": 2.6911334435822414, "ewc_loss": 0.03451291099190712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4512911952333525e-05, "grad_norm": 19.605100631713867, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8609962463378906, "num_tokens": 807240867.0, "step": 21155 }, { "epoch": 2.691260653860832, "ewc_loss": 0.03441299498081207, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441299486439675e-05, "grad_norm": 19.4491024017334, "learning_rate": 1e-06, "loss": 0.3555, "mean_token_accuracy": 0.8847885131835938, "num_tokens": 807275838.0, "step": 21156 }, { "epoch": 2.6913878641394224, "ewc_loss": 0.034407906234264374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440790533204563e-05, "grad_norm": 19.632993698120117, "learning_rate": 1e-06, "loss": 0.3573, "mean_token_accuracy": 0.8846486210823059, "num_tokens": 807315164.0, "step": 21157 }, { "epoch": 2.691515074418013, "ewc_loss": 0.03441757336258888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441757507971488e-05, "grad_norm": 19.43000030517578, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8664098978042603, "num_tokens": 807358016.0, "step": 21158 }, { "epoch": 2.6916422846966035, "ewc_loss": 0.034359320998191833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435932012507692e-05, "grad_norm": 19.535707473754883, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8738064765930176, "num_tokens": 807402751.0, "step": 21159 }, { "epoch": 2.691769494975194, "ewc_loss": 0.03444182500243187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.444182584644295e-05, "grad_norm": 19.559329986572266, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8795596361160278, "num_tokens": 807442761.0, "step": 21160 }, { "epoch": 2.6918967052537845, "ewc_loss": 0.03438593074679375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4385931940050796e-05, "grad_norm": 19.551687240600586, "learning_rate": 1e-06, "loss": 0.3623, "mean_token_accuracy": 0.8831577301025391, "num_tokens": 807478550.0, "step": 21161 }, { "epoch": 2.692023915532375, "ewc_loss": 0.03438456729054451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4384567697998136e-05, "grad_norm": 19.426258087158203, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.8837243914604187, "num_tokens": 807522756.0, "step": 21162 }, { "epoch": 2.6921511258109656, "ewc_loss": 0.03435920551419258, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43592073477339e-05, "grad_norm": 19.529483795166016, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8803849816322327, "num_tokens": 807561701.0, "step": 21163 }, { "epoch": 2.692278336089556, "ewc_loss": 0.03441229462623596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441229637246579e-05, "grad_norm": 19.471506118774414, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8741134405136108, "num_tokens": 807604058.0, "step": 21164 }, { "epoch": 2.6924055463681467, "ewc_loss": 0.034365784376859665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.436578481341712e-05, "grad_norm": 19.571271896362305, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.8804824948310852, "num_tokens": 807637344.0, "step": 21165 }, { "epoch": 2.692532756646737, "ewc_loss": 0.034420400857925415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.442040178924799e-05, "grad_norm": 19.472436904907227, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.868722677230835, "num_tokens": 807676867.0, "step": 21166 }, { "epoch": 2.6926599669253277, "ewc_loss": 0.03426636755466461, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426636612857692e-05, "grad_norm": 19.48885154724121, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8780105113983154, "num_tokens": 807707183.0, "step": 21167 }, { "epoch": 2.6927871772039182, "ewc_loss": 0.0344732291996479, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.447322887950577e-05, "grad_norm": 19.62232208251953, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8672319650650024, "num_tokens": 807737617.0, "step": 21168 }, { "epoch": 2.6929143874825083, "ewc_loss": 0.03437965735793114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437965642660856e-05, "grad_norm": 19.471601486206055, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8683956861495972, "num_tokens": 807774141.0, "step": 21169 }, { "epoch": 2.6930415977610993, "ewc_loss": 0.03438609093427658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438609201111831e-05, "grad_norm": 19.55453872680664, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8799954652786255, "num_tokens": 807813375.0, "step": 21170 }, { "epoch": 2.6931688080396894, "ewc_loss": 0.034419264644384384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441926310188137e-05, "grad_norm": 19.529870986938477, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8620770573616028, "num_tokens": 807852317.0, "step": 21171 }, { "epoch": 2.6932960183182804, "ewc_loss": 0.034373264759778976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43732644978445e-05, "grad_norm": 19.603336334228516, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8775652647018433, "num_tokens": 807895266.0, "step": 21172 }, { "epoch": 2.6934232285968704, "ewc_loss": 0.03440612182021141, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4406122722430155e-05, "grad_norm": 19.462194442749023, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8771047592163086, "num_tokens": 807938600.0, "step": 21173 }, { "epoch": 2.693550438875461, "ewc_loss": 0.03432691469788551, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432691300986335e-05, "grad_norm": 19.6129207611084, "learning_rate": 1e-06, "loss": 0.3293, "mean_token_accuracy": 0.8945937156677246, "num_tokens": 807974329.0, "step": 21174 }, { "epoch": 2.6936776491540515, "ewc_loss": 0.0344722718000412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.44722720910795e-05, "grad_norm": 19.551862716674805, "learning_rate": 1e-06, "loss": 0.355, "mean_token_accuracy": 0.8869538307189941, "num_tokens": 808015666.0, "step": 21175 }, { "epoch": 2.693804859432642, "ewc_loss": 0.034325335174798965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432533412706107e-05, "grad_norm": 19.560972213745117, "learning_rate": 1e-06, "loss": 0.361, "mean_token_accuracy": 0.8857488632202148, "num_tokens": 808050176.0, "step": 21176 }, { "epoch": 2.6939320697112326, "ewc_loss": 0.0342959426343441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4295942896278575e-05, "grad_norm": 19.510988235473633, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8682640790939331, "num_tokens": 808091910.0, "step": 21177 }, { "epoch": 2.694059279989823, "ewc_loss": 0.03435998409986496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435998587519862e-05, "grad_norm": 19.542325973510742, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8707149624824524, "num_tokens": 808128646.0, "step": 21178 }, { "epoch": 2.6941864902684136, "ewc_loss": 0.03438933193683624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4389333450235426e-05, "grad_norm": 19.493370056152344, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.878933310508728, "num_tokens": 808166544.0, "step": 21179 }, { "epoch": 2.694313700547004, "ewc_loss": 0.03433627262711525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4336273529333994e-05, "grad_norm": 19.503223419189453, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8652465343475342, "num_tokens": 808200051.0, "step": 21180 }, { "epoch": 2.6944409108255947, "ewc_loss": 0.03441976383328438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441976514295675e-05, "grad_norm": 19.538801193237305, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8775845766067505, "num_tokens": 808239251.0, "step": 21181 }, { "epoch": 2.694568121104185, "ewc_loss": 0.03433742746710777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.433742676861584e-05, "grad_norm": 19.471385955810547, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8695578575134277, "num_tokens": 808280639.0, "step": 21182 }, { "epoch": 2.6946953313827757, "ewc_loss": 0.034324780106544495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432478115428239e-05, "grad_norm": 19.540321350097656, "learning_rate": 1e-06, "loss": 0.3694, "mean_token_accuracy": 0.8812519311904907, "num_tokens": 808316141.0, "step": 21183 }, { "epoch": 2.6948225416613663, "ewc_loss": 0.03440862521529198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4408625651849434e-05, "grad_norm": 19.543901443481445, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.874364972114563, "num_tokens": 808359172.0, "step": 21184 }, { "epoch": 2.694949751939957, "ewc_loss": 0.03432455286383629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4324551961617544e-05, "grad_norm": 19.539073944091797, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8485810160636902, "num_tokens": 808397823.0, "step": 21185 }, { "epoch": 2.6950769622185473, "ewc_loss": 0.034341104328632355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434110476518981e-05, "grad_norm": 19.542892456054688, "learning_rate": 1e-06, "loss": 0.364, "mean_token_accuracy": 0.8851194977760315, "num_tokens": 808435355.0, "step": 21186 }, { "epoch": 2.695204172497138, "ewc_loss": 0.034311216324567795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4311215131310746e-05, "grad_norm": 19.493192672729492, "learning_rate": 1e-06, "loss": 0.3663, "mean_token_accuracy": 0.8825225234031677, "num_tokens": 808470792.0, "step": 21187 }, { "epoch": 2.6953313827757284, "ewc_loss": 0.03435412049293518, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4354121453361586e-05, "grad_norm": 19.63755226135254, "learning_rate": 1e-06, "loss": 0.337, "mean_token_accuracy": 0.8924562931060791, "num_tokens": 808504591.0, "step": 21188 }, { "epoch": 2.695458593054319, "ewc_loss": 0.034392520785331726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439252031967044e-05, "grad_norm": 19.542194366455078, "learning_rate": 1e-06, "loss": 0.3459, "mean_token_accuracy": 0.886364758014679, "num_tokens": 808537167.0, "step": 21189 }, { "epoch": 2.6955858033329094, "ewc_loss": 0.03426190838217735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.426190960453823e-05, "grad_norm": 19.559152603149414, "learning_rate": 1e-06, "loss": 0.3442, "mean_token_accuracy": 0.8909536600112915, "num_tokens": 808572055.0, "step": 21190 }, { "epoch": 2.6957130136115, "ewc_loss": 0.034300003200769424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.430000288062729e-05, "grad_norm": 19.45522689819336, "learning_rate": 1e-06, "loss": 0.3429, "mean_token_accuracy": 0.8877542614936829, "num_tokens": 808605239.0, "step": 21191 }, { "epoch": 2.6958402238900905, "ewc_loss": 0.03433670848608017, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4336710086790845e-05, "grad_norm": 19.56867790222168, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8538089394569397, "num_tokens": 808643227.0, "step": 21192 }, { "epoch": 2.695967434168681, "ewc_loss": 0.034386590123176575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438659041421488e-05, "grad_norm": 19.527236938476562, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.870256781578064, "num_tokens": 808683855.0, "step": 21193 }, { "epoch": 2.696094644447271, "ewc_loss": 0.03437292203307152, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4372922527836636e-05, "grad_norm": 19.602991104125977, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8816581964492798, "num_tokens": 808723467.0, "step": 21194 }, { "epoch": 2.696221854725862, "ewc_loss": 0.03434181585907936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4341814171057194e-05, "grad_norm": 19.452495574951172, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8653182983398438, "num_tokens": 808759805.0, "step": 21195 }, { "epoch": 2.696349065004452, "ewc_loss": 0.0342855304479599, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.428553100093268e-05, "grad_norm": 19.53407096862793, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8586454391479492, "num_tokens": 808802024.0, "step": 21196 }, { "epoch": 2.696476275283043, "ewc_loss": 0.034391045570373535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439104693825357e-05, "grad_norm": 19.552738189697266, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8678566813468933, "num_tokens": 808837086.0, "step": 21197 }, { "epoch": 2.696603485561633, "ewc_loss": 0.034353990107774734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435399048612453e-05, "grad_norm": 19.551729202270508, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8745384216308594, "num_tokens": 808875261.0, "step": 21198 }, { "epoch": 2.6967306958402237, "ewc_loss": 0.03438396751880646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4383967431494966e-05, "grad_norm": 19.4882869720459, "learning_rate": 1e-06, "loss": 0.3629, "mean_token_accuracy": 0.8818647265434265, "num_tokens": 808913044.0, "step": 21199 }, { "epoch": 2.6968579061188143, "ewc_loss": 0.034406017512083054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440601722104475e-05, "grad_norm": 19.517791748046875, "learning_rate": 1e-06, "loss": 0.3691, "mean_token_accuracy": 0.8837519884109497, "num_tokens": 808952374.0, "step": 21200 }, { "epoch": 2.696985116397405, "ewc_loss": 0.03441702201962471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.44170221069362e-05, "grad_norm": 19.573848724365234, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8768059015274048, "num_tokens": 808994397.0, "step": 21201 }, { "epoch": 2.6971123266759953, "ewc_loss": 0.034384820610284805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438482235651463e-05, "grad_norm": 19.516551971435547, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.878531813621521, "num_tokens": 809036401.0, "step": 21202 }, { "epoch": 2.697239536954586, "ewc_loss": 0.03441566601395607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4415665140841156e-05, "grad_norm": 19.576419830322266, "learning_rate": 1e-06, "loss": 0.3757, "mean_token_accuracy": 0.8810982704162598, "num_tokens": 809073073.0, "step": 21203 }, { "epoch": 2.6973667472331764, "ewc_loss": 0.034406598657369614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4406599297653884e-05, "grad_norm": 19.5335750579834, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8751040697097778, "num_tokens": 809117348.0, "step": 21204 }, { "epoch": 2.697493957511767, "ewc_loss": 0.0343034490942955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4303448046557605e-05, "grad_norm": 19.536754608154297, "learning_rate": 1e-06, "loss": 0.3554, "mean_token_accuracy": 0.8879091739654541, "num_tokens": 809151357.0, "step": 21205 }, { "epoch": 2.6976211677903574, "ewc_loss": 0.034365102648735046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.436510087340139e-05, "grad_norm": 19.600357055664062, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8701573014259338, "num_tokens": 809189000.0, "step": 21206 }, { "epoch": 2.697748378068948, "ewc_loss": 0.034322746098041534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432274752412923e-05, "grad_norm": 19.442584991455078, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8788312673568726, "num_tokens": 809227380.0, "step": 21207 }, { "epoch": 2.6978755883475385, "ewc_loss": 0.034366246312856674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.436624683672562e-05, "grad_norm": 19.575138092041016, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8752785921096802, "num_tokens": 809265131.0, "step": 21208 }, { "epoch": 2.698002798626129, "ewc_loss": 0.03443238511681557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443238529143855e-05, "grad_norm": 19.50242805480957, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8749433755874634, "num_tokens": 809297190.0, "step": 21209 }, { "epoch": 2.6981300089047195, "ewc_loss": 0.03432981297373772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43298124789726e-05, "grad_norm": 19.514135360717773, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.884598970413208, "num_tokens": 809336774.0, "step": 21210 }, { "epoch": 2.69825721918331, "ewc_loss": 0.03448358550667763, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448358620516956e-05, "grad_norm": 19.545473098754883, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8600056767463684, "num_tokens": 809376612.0, "step": 21211 }, { "epoch": 2.6983844294619006, "ewc_loss": 0.03435812145471573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435812323004939e-05, "grad_norm": 19.50240707397461, "learning_rate": 1e-06, "loss": 0.3149, "mean_token_accuracy": 0.8980851769447327, "num_tokens": 809416058.0, "step": 21212 }, { "epoch": 2.698511639740491, "ewc_loss": 0.03438933566212654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4389337088214234e-05, "grad_norm": 19.53218650817871, "learning_rate": 1e-06, "loss": 0.353, "mean_token_accuracy": 0.8860448598861694, "num_tokens": 809449580.0, "step": 21213 }, { "epoch": 2.6986388500190817, "ewc_loss": 0.034321129322052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432112862356007e-05, "grad_norm": 19.49164581298828, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8788192868232727, "num_tokens": 809484983.0, "step": 21214 }, { "epoch": 2.698766060297672, "ewc_loss": 0.0344272218644619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.442722299951129e-05, "grad_norm": 19.603099822998047, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8780533075332642, "num_tokens": 809521086.0, "step": 21215 }, { "epoch": 2.6988932705762627, "ewc_loss": 0.034364596009254456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43645951943472e-05, "grad_norm": 19.482864379882812, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8731228113174438, "num_tokens": 809562475.0, "step": 21216 }, { "epoch": 2.699020480854853, "ewc_loss": 0.034403640776872635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440364162088372e-05, "grad_norm": 19.57956886291504, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.862832248210907, "num_tokens": 809596686.0, "step": 21217 }, { "epoch": 2.6991476911334438, "ewc_loss": 0.034389302134513855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438930070842616e-05, "grad_norm": 19.451833724975586, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8766729831695557, "num_tokens": 809636614.0, "step": 21218 }, { "epoch": 2.699274901412034, "ewc_loss": 0.034411728382110596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441172884777188e-05, "grad_norm": 19.64139747619629, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8646103143692017, "num_tokens": 809674542.0, "step": 21219 }, { "epoch": 2.699402111690625, "ewc_loss": 0.03452793136239052, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4527933166828007e-05, "grad_norm": 19.47141456604004, "learning_rate": 1e-06, "loss": 0.3457, "mean_token_accuracy": 0.8907204866409302, "num_tokens": 809716726.0, "step": 21220 }, { "epoch": 2.699529321969215, "ewc_loss": 0.03435635566711426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435635517234914e-05, "grad_norm": 19.620769500732422, "learning_rate": 1e-06, "loss": 0.3598, "mean_token_accuracy": 0.8848930597305298, "num_tokens": 809756790.0, "step": 21221 }, { "epoch": 2.699656532247806, "ewc_loss": 0.03450418636202812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.450418807915412e-05, "grad_norm": 19.46731948852539, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8730863332748413, "num_tokens": 809794616.0, "step": 21222 }, { "epoch": 2.699783742526396, "ewc_loss": 0.03439628332853317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439628198975697e-05, "grad_norm": 19.554485321044922, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8817681074142456, "num_tokens": 809836662.0, "step": 21223 }, { "epoch": 2.6999109528049865, "ewc_loss": 0.034470219165086746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4470220271032304e-05, "grad_norm": 19.520339965820312, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8736966848373413, "num_tokens": 809873812.0, "step": 21224 }, { "epoch": 2.700038163083577, "ewc_loss": 0.03437411040067673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4374112146906555e-05, "grad_norm": 19.544963836669922, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.875639796257019, "num_tokens": 809913139.0, "step": 21225 }, { "epoch": 2.7001653733621676, "ewc_loss": 0.034442074596881866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4442073229001835e-05, "grad_norm": 19.57413101196289, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8772726058959961, "num_tokens": 809947741.0, "step": 21226 }, { "epoch": 2.700292583640758, "ewc_loss": 0.03439747169613838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439747160882689e-05, "grad_norm": 19.511754989624023, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8690722584724426, "num_tokens": 809987120.0, "step": 21227 }, { "epoch": 2.7004197939193486, "ewc_loss": 0.03440707549452782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440707587287761e-05, "grad_norm": 19.588621139526367, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8700506687164307, "num_tokens": 810026972.0, "step": 21228 }, { "epoch": 2.700547004197939, "ewc_loss": 0.03447641432285309, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.447641574894078e-05, "grad_norm": 19.56199836730957, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8754386305809021, "num_tokens": 810062767.0, "step": 21229 }, { "epoch": 2.7006742144765297, "ewc_loss": 0.03439582884311676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4395827242406085e-05, "grad_norm": 19.562786102294922, "learning_rate": 1e-06, "loss": 0.3554, "mean_token_accuracy": 0.8832780122756958, "num_tokens": 810096930.0, "step": 21230 }, { "epoch": 2.70080142475512, "ewc_loss": 0.034399572759866714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439957436057739e-05, "grad_norm": 19.507400512695312, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8633044362068176, "num_tokens": 810134600.0, "step": 21231 }, { "epoch": 2.7009286350337107, "ewc_loss": 0.034388769418001175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438876956352033e-05, "grad_norm": 19.505155563354492, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.87725830078125, "num_tokens": 810169535.0, "step": 21232 }, { "epoch": 2.7010558453123013, "ewc_loss": 0.03449380770325661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4493808925617486e-05, "grad_norm": 19.56678009033203, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8654454946517944, "num_tokens": 810205288.0, "step": 21233 }, { "epoch": 2.701183055590892, "ewc_loss": 0.03440305218100548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440305226831697e-05, "grad_norm": 19.509334564208984, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8729531168937683, "num_tokens": 810242868.0, "step": 21234 }, { "epoch": 2.7013102658694823, "ewc_loss": 0.03441737964749336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4417378628859296e-05, "grad_norm": 19.60565185546875, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8636972904205322, "num_tokens": 810286539.0, "step": 21235 }, { "epoch": 2.701437476148073, "ewc_loss": 0.034422267228364944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.442226807237603e-05, "grad_norm": 19.42046356201172, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8796699047088623, "num_tokens": 810331598.0, "step": 21236 }, { "epoch": 2.7015646864266634, "ewc_loss": 0.03439559414982796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439559441176243e-05, "grad_norm": 19.62051010131836, "learning_rate": 1e-06, "loss": 0.3691, "mean_token_accuracy": 0.8837810754776001, "num_tokens": 810369501.0, "step": 21237 }, { "epoch": 2.701691896705254, "ewc_loss": 0.0345117524266243, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.451175143709406e-05, "grad_norm": 19.572275161743164, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8726094961166382, "num_tokens": 810410727.0, "step": 21238 }, { "epoch": 2.7018191069838444, "ewc_loss": 0.03430239483714104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4302396670682356e-05, "grad_norm": 19.528310775756836, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8670181632041931, "num_tokens": 810450315.0, "step": 21239 }, { "epoch": 2.701946317262435, "ewc_loss": 0.03443990275263786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4439901355654e-05, "grad_norm": 19.588130950927734, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8681168556213379, "num_tokens": 810493972.0, "step": 21240 }, { "epoch": 2.7020735275410255, "ewc_loss": 0.03438081219792366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438081330386922e-05, "grad_norm": 19.52454376220703, "learning_rate": 1e-06, "loss": 0.3535, "mean_token_accuracy": 0.8869478702545166, "num_tokens": 810526038.0, "step": 21241 }, { "epoch": 2.7022007378196156, "ewc_loss": 0.03444444015622139, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.444444155320525e-05, "grad_norm": 19.626943588256836, "learning_rate": 1e-06, "loss": 0.3707, "mean_token_accuracy": 0.8795117735862732, "num_tokens": 810560772.0, "step": 21242 }, { "epoch": 2.7023279480982065, "ewc_loss": 0.03442402556538582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.442402521613985e-05, "grad_norm": 19.571308135986328, "learning_rate": 1e-06, "loss": 0.3562, "mean_token_accuracy": 0.8832541108131409, "num_tokens": 810598322.0, "step": 21243 }, { "epoch": 2.7024551583767966, "ewc_loss": 0.034348003566265106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434800237300806e-05, "grad_norm": 19.5693302154541, "learning_rate": 1e-06, "loss": 0.335, "mean_token_accuracy": 0.8948674201965332, "num_tokens": 810638652.0, "step": 21244 }, { "epoch": 2.7025823686553876, "ewc_loss": 0.03441053628921509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441053559072316e-05, "grad_norm": 19.546371459960938, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8749011158943176, "num_tokens": 810680318.0, "step": 21245 }, { "epoch": 2.7027095789339777, "ewc_loss": 0.034372229129076004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437222767388448e-05, "grad_norm": 19.579803466796875, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8805385828018188, "num_tokens": 810715001.0, "step": 21246 }, { "epoch": 2.7028367892125686, "ewc_loss": 0.03438948094844818, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438948260736652e-05, "grad_norm": 19.581642150878906, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8685286045074463, "num_tokens": 810748897.0, "step": 21247 }, { "epoch": 2.7029639994911587, "ewc_loss": 0.03431843966245651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431844015722163e-05, "grad_norm": 19.543926239013672, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8695620894432068, "num_tokens": 810788954.0, "step": 21248 }, { "epoch": 2.7030912097697493, "ewc_loss": 0.03437142074108124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437142004258931e-05, "grad_norm": 19.543703079223633, "learning_rate": 1e-06, "loss": 0.3735, "mean_token_accuracy": 0.8786593079566956, "num_tokens": 810828897.0, "step": 21249 }, { "epoch": 2.70321842004834, "ewc_loss": 0.03438989445567131, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438989369897172e-05, "grad_norm": 19.55756378173828, "learning_rate": 1e-06, "loss": 0.3359, "mean_token_accuracy": 0.8913601636886597, "num_tokens": 810866698.0, "step": 21250 }, { "epoch": 2.7033456303269303, "ewc_loss": 0.03435389697551727, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4353895898675546e-05, "grad_norm": 19.542200088500977, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8686994314193726, "num_tokens": 810903140.0, "step": 21251 }, { "epoch": 2.703472840605521, "ewc_loss": 0.03435513749718666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4355136449448764e-05, "grad_norm": 19.517860412597656, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8803303241729736, "num_tokens": 810940664.0, "step": 21252 }, { "epoch": 2.7036000508841114, "ewc_loss": 0.0343550369143486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435503822402097e-05, "grad_norm": 19.592857360839844, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8736675977706909, "num_tokens": 810981818.0, "step": 21253 }, { "epoch": 2.703727261162702, "ewc_loss": 0.03445850685238838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445850597927347e-05, "grad_norm": 19.524770736694336, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8751857280731201, "num_tokens": 811019897.0, "step": 21254 }, { "epoch": 2.7038544714412924, "ewc_loss": 0.03435065224766731, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435065082157962e-05, "grad_norm": 19.521604537963867, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8793913125991821, "num_tokens": 811056310.0, "step": 21255 }, { "epoch": 2.703981681719883, "ewc_loss": 0.034361355006694794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4361353755230084e-05, "grad_norm": 19.551191329956055, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8647743463516235, "num_tokens": 811091505.0, "step": 21256 }, { "epoch": 2.7041088919984735, "ewc_loss": 0.03439735993742943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439735883148387e-05, "grad_norm": 19.548526763916016, "learning_rate": 1e-06, "loss": 0.3521, "mean_token_accuracy": 0.8867433071136475, "num_tokens": 811134861.0, "step": 21257 }, { "epoch": 2.704236102277064, "ewc_loss": 0.03442982956767082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4429827792337164e-05, "grad_norm": 19.596397399902344, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8680965304374695, "num_tokens": 811170001.0, "step": 21258 }, { "epoch": 2.7043633125556545, "ewc_loss": 0.03438617289066315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4386172046652064e-05, "grad_norm": 19.5456485748291, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8803210258483887, "num_tokens": 811204203.0, "step": 21259 }, { "epoch": 2.704490522834245, "ewc_loss": 0.034339211881160736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4339213016210124e-05, "grad_norm": 19.579118728637695, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8795397877693176, "num_tokens": 811244598.0, "step": 21260 }, { "epoch": 2.7046177331128356, "ewc_loss": 0.034430041909217834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443004243308678e-05, "grad_norm": 19.582799911499023, "learning_rate": 1e-06, "loss": 0.3589, "mean_token_accuracy": 0.886267900466919, "num_tokens": 811283352.0, "step": 21261 }, { "epoch": 2.704744943391426, "ewc_loss": 0.034386590123176575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438659041421488e-05, "grad_norm": 19.64422607421875, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8641816973686218, "num_tokens": 811317605.0, "step": 21262 }, { "epoch": 2.7048721536700167, "ewc_loss": 0.03445225954055786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445225956966169e-05, "grad_norm": 19.56182098388672, "learning_rate": 1e-06, "loss": 0.342, "mean_token_accuracy": 0.8920277953147888, "num_tokens": 811352117.0, "step": 21263 }, { "epoch": 2.704999363948607, "ewc_loss": 0.034341029822826385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434102836763486e-05, "grad_norm": 19.611480712890625, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8679442405700684, "num_tokens": 811386118.0, "step": 21264 }, { "epoch": 2.7051265742271977, "ewc_loss": 0.03446382284164429, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446382106631063e-05, "grad_norm": 19.523242950439453, "learning_rate": 1e-06, "loss": 0.3621, "mean_token_accuracy": 0.8858470916748047, "num_tokens": 811424479.0, "step": 21265 }, { "epoch": 2.7052537845057882, "ewc_loss": 0.03441532328724861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441532317083329e-05, "grad_norm": 19.63907814025879, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8623860478401184, "num_tokens": 811464683.0, "step": 21266 }, { "epoch": 2.7053809947843783, "ewc_loss": 0.03438267111778259, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438267231103964e-05, "grad_norm": 19.423032760620117, "learning_rate": 1e-06, "loss": 0.3466, "mean_token_accuracy": 0.8907033801078796, "num_tokens": 811504801.0, "step": 21267 }, { "epoch": 2.7055082050629693, "ewc_loss": 0.03438633680343628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438633575569838e-05, "grad_norm": 19.679744720458984, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8675807118415833, "num_tokens": 811546278.0, "step": 21268 }, { "epoch": 2.7056354153415594, "ewc_loss": 0.0344666913151741, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4466691431589425e-05, "grad_norm": 19.493173599243164, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8627128005027771, "num_tokens": 811587019.0, "step": 21269 }, { "epoch": 2.7057626256201504, "ewc_loss": 0.03434198349714279, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434198515606113e-05, "grad_norm": 19.63584327697754, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8516536951065063, "num_tokens": 811619843.0, "step": 21270 }, { "epoch": 2.7058898358987404, "ewc_loss": 0.034492820501327515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.449281939538196e-05, "grad_norm": 19.570154190063477, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8816161155700684, "num_tokens": 811655797.0, "step": 21271 }, { "epoch": 2.706017046177331, "ewc_loss": 0.034416671842336655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441667286097072e-05, "grad_norm": 19.576711654663086, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8677555322647095, "num_tokens": 811691626.0, "step": 21272 }, { "epoch": 2.7061442564559215, "ewc_loss": 0.034502290189266205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.450228905421682e-05, "grad_norm": 19.66537857055664, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8771913051605225, "num_tokens": 811726046.0, "step": 21273 }, { "epoch": 2.706271466734512, "ewc_loss": 0.03439071401953697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4390712244203314e-05, "grad_norm": 19.538301467895508, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8658697605133057, "num_tokens": 811765686.0, "step": 21274 }, { "epoch": 2.7063986770131025, "ewc_loss": 0.034393489360809326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4393488022033125e-05, "grad_norm": 19.60681915283203, "learning_rate": 1e-06, "loss": 0.3646, "mean_token_accuracy": 0.8823615312576294, "num_tokens": 811801143.0, "step": 21275 }, { "epoch": 2.706525887291693, "ewc_loss": 0.03440352529287338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440352520556189e-05, "grad_norm": 19.57187271118164, "learning_rate": 1e-06, "loss": 0.3607, "mean_token_accuracy": 0.8828272819519043, "num_tokens": 811834807.0, "step": 21276 }, { "epoch": 2.7066530975702836, "ewc_loss": 0.034430816769599915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443081732257269e-05, "grad_norm": 19.616554260253906, "learning_rate": 1e-06, "loss": 0.3358, "mean_token_accuracy": 0.894798755645752, "num_tokens": 811865676.0, "step": 21277 }, { "epoch": 2.706780307848874, "ewc_loss": 0.03448255732655525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448255665716715e-05, "grad_norm": 19.58102798461914, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8738383054733276, "num_tokens": 811901259.0, "step": 21278 }, { "epoch": 2.7069075181274647, "ewc_loss": 0.03436312451958656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.436312545090914e-05, "grad_norm": 19.57115364074707, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8624519109725952, "num_tokens": 811937768.0, "step": 21279 }, { "epoch": 2.707034728406055, "ewc_loss": 0.034497711807489395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4497712476877496e-05, "grad_norm": 19.67422866821289, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8757320046424866, "num_tokens": 811967805.0, "step": 21280 }, { "epoch": 2.7071619386846457, "ewc_loss": 0.03446582332253456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446582195465453e-05, "grad_norm": 19.548070907592773, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8712095022201538, "num_tokens": 812007468.0, "step": 21281 }, { "epoch": 2.7072891489632362, "ewc_loss": 0.03444936126470566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.444936010055244e-05, "grad_norm": 19.66138458251953, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8773790001869202, "num_tokens": 812048127.0, "step": 21282 }, { "epoch": 2.7074163592418268, "ewc_loss": 0.0344543531537056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445435140747577e-05, "grad_norm": 19.485097885131836, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8765944838523865, "num_tokens": 812083904.0, "step": 21283 }, { "epoch": 2.7075435695204173, "ewc_loss": 0.03439374268054962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439374268054962e-05, "grad_norm": 19.612613677978516, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8812018632888794, "num_tokens": 812119643.0, "step": 21284 }, { "epoch": 2.707670779799008, "ewc_loss": 0.034454621374607086, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4454620617907494e-05, "grad_norm": 19.607004165649414, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8742737770080566, "num_tokens": 812155338.0, "step": 21285 }, { "epoch": 2.7077979900775984, "ewc_loss": 0.034414879977703094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4414879337418824e-05, "grad_norm": 19.562362670898438, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8758515119552612, "num_tokens": 812193075.0, "step": 21286 }, { "epoch": 2.707925200356189, "ewc_loss": 0.034426912665367126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4426913771312684e-05, "grad_norm": 19.48170280456543, "learning_rate": 1e-06, "loss": 0.3547, "mean_token_accuracy": 0.884192168712616, "num_tokens": 812228163.0, "step": 21287 }, { "epoch": 2.7080524106347794, "ewc_loss": 0.03447965905070305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4479660826036707e-05, "grad_norm": 19.651336669921875, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8701454401016235, "num_tokens": 812268565.0, "step": 21288 }, { "epoch": 2.70817962091337, "ewc_loss": 0.03445739299058914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.44573927577585e-05, "grad_norm": 19.51896858215332, "learning_rate": 1e-06, "loss": 0.3811, "mean_token_accuracy": 0.8810513615608215, "num_tokens": 812307292.0, "step": 21289 }, { "epoch": 2.7083068311919605, "ewc_loss": 0.034402668476104736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440266664256342e-05, "grad_norm": 19.55760955810547, "learning_rate": 1e-06, "loss": 0.3587, "mean_token_accuracy": 0.8860036134719849, "num_tokens": 812346180.0, "step": 21290 }, { "epoch": 2.708434041470551, "ewc_loss": 0.03449697047472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.449697032920085e-05, "grad_norm": 19.5513858795166, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8726568222045898, "num_tokens": 812376033.0, "step": 21291 }, { "epoch": 2.708561251749141, "ewc_loss": 0.034418411552906036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441841181484051e-05, "grad_norm": 19.59070587158203, "learning_rate": 1e-06, "loss": 0.3389, "mean_token_accuracy": 0.8932061791419983, "num_tokens": 812407925.0, "step": 21292 }, { "epoch": 2.708688462027732, "ewc_loss": 0.03448222950100899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448222923907451e-05, "grad_norm": 19.471471786499023, "learning_rate": 1e-06, "loss": 0.3781, "mean_token_accuracy": 0.8813961744308472, "num_tokens": 812452354.0, "step": 21293 }, { "epoch": 2.708815672306322, "ewc_loss": 0.03437488153576851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437487976043485e-05, "grad_norm": 19.593652725219727, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8755207061767578, "num_tokens": 812490015.0, "step": 21294 }, { "epoch": 2.708942882584913, "ewc_loss": 0.0345466248691082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4546625101938844e-05, "grad_norm": 19.58432388305664, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8726464509963989, "num_tokens": 812529919.0, "step": 21295 }, { "epoch": 2.709070092863503, "ewc_loss": 0.03445559740066528, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4455595596227795e-05, "grad_norm": 19.708688735961914, "learning_rate": 1e-06, "loss": 0.3554, "mean_token_accuracy": 0.887503445148468, "num_tokens": 812568541.0, "step": 21296 }, { "epoch": 2.7091973031420937, "ewc_loss": 0.03445417433977127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445417314651422e-05, "grad_norm": 19.535873413085938, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8679481744766235, "num_tokens": 812612098.0, "step": 21297 }, { "epoch": 2.7093245134206843, "ewc_loss": 0.034349702298641205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434970130911097e-05, "grad_norm": 19.586118698120117, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8712116479873657, "num_tokens": 812648843.0, "step": 21298 }, { "epoch": 2.709451723699275, "ewc_loss": 0.03449154272675514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.449154246482067e-05, "grad_norm": 19.527694702148438, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8756643533706665, "num_tokens": 812692080.0, "step": 21299 }, { "epoch": 2.7095789339778653, "ewc_loss": 0.034372735768556595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4372736990917474e-05, "grad_norm": 19.626033782958984, "learning_rate": 1e-06, "loss": 0.358, "mean_token_accuracy": 0.8900965452194214, "num_tokens": 812729441.0, "step": 21300 }, { "epoch": 2.709706144256456, "ewc_loss": 0.03442662954330444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.442663000896573e-05, "grad_norm": 19.530546188354492, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.869066059589386, "num_tokens": 812768908.0, "step": 21301 }, { "epoch": 2.7098333545350464, "ewc_loss": 0.03438218683004379, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43821884598583e-05, "grad_norm": 19.60011863708496, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8760915994644165, "num_tokens": 812808384.0, "step": 21302 }, { "epoch": 2.709960564813637, "ewc_loss": 0.03441861271858215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.44186119036749e-05, "grad_norm": 19.592533111572266, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.868327260017395, "num_tokens": 812847658.0, "step": 21303 }, { "epoch": 2.7100877750922274, "ewc_loss": 0.034407153725624084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440715227043256e-05, "grad_norm": 19.570167541503906, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.8765855431556702, "num_tokens": 812889394.0, "step": 21304 }, { "epoch": 2.710214985370818, "ewc_loss": 0.034421321004629135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4421322197886184e-05, "grad_norm": 19.667272567749023, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8645461797714233, "num_tokens": 812924202.0, "step": 21305 }, { "epoch": 2.7103421956494085, "ewc_loss": 0.03439681977033615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439682041062042e-05, "grad_norm": 19.53253173828125, "learning_rate": 1e-06, "loss": 0.3271, "mean_token_accuracy": 0.8950759172439575, "num_tokens": 812954391.0, "step": 21306 }, { "epoch": 2.710469405927999, "ewc_loss": 0.034298110753297806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.42981111316476e-05, "grad_norm": 19.59742546081543, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8678779602050781, "num_tokens": 812989498.0, "step": 21307 }, { "epoch": 2.7105966162065895, "ewc_loss": 0.034397248178720474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439724969211966e-05, "grad_norm": 19.564088821411133, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8834373950958252, "num_tokens": 813030968.0, "step": 21308 }, { "epoch": 2.71072382648518, "ewc_loss": 0.034350428730249405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435042890487239e-05, "grad_norm": 19.55305290222168, "learning_rate": 1e-06, "loss": 0.3211, "mean_token_accuracy": 0.8968765735626221, "num_tokens": 813065843.0, "step": 21309 }, { "epoch": 2.7108510367637706, "ewc_loss": 0.03440141677856445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440141517785378e-05, "grad_norm": 19.582042694091797, "learning_rate": 1e-06, "loss": 0.369, "mean_token_accuracy": 0.8819379806518555, "num_tokens": 813107410.0, "step": 21310 }, { "epoch": 2.710978247042361, "ewc_loss": 0.03446086868643761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446086702751927e-05, "grad_norm": 19.677148818969727, "learning_rate": 1e-06, "loss": 0.3735, "mean_token_accuracy": 0.8826532363891602, "num_tokens": 813155959.0, "step": 21311 }, { "epoch": 2.7111054573209517, "ewc_loss": 0.034384630620479584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438462954363786e-05, "grad_norm": 19.546302795410156, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.8825143575668335, "num_tokens": 813194133.0, "step": 21312 }, { "epoch": 2.711232667599542, "ewc_loss": 0.03434876725077629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434876634855755e-05, "grad_norm": 19.64799690246582, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8636461496353149, "num_tokens": 813231015.0, "step": 21313 }, { "epoch": 2.7113598778781327, "ewc_loss": 0.03445901349186897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445901529630646e-05, "grad_norm": 19.594493865966797, "learning_rate": 1e-06, "loss": 0.3347, "mean_token_accuracy": 0.8911164402961731, "num_tokens": 813271497.0, "step": 21314 }, { "epoch": 2.711487088156723, "ewc_loss": 0.03434368222951889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434368409216404e-05, "grad_norm": 19.5666561126709, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8783647418022156, "num_tokens": 813305758.0, "step": 21315 }, { "epoch": 2.7116142984353138, "ewc_loss": 0.03440222144126892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440222280914895e-05, "grad_norm": 19.56652069091797, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8751162886619568, "num_tokens": 813345958.0, "step": 21316 }, { "epoch": 2.711741508713904, "ewc_loss": 0.03445008024573326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4450080420356244e-05, "grad_norm": 19.67251968383789, "learning_rate": 1e-06, "loss": 0.3656, "mean_token_accuracy": 0.8798487186431885, "num_tokens": 813381630.0, "step": 21317 }, { "epoch": 2.711868718992495, "ewc_loss": 0.034385982900857925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4385982871754095e-05, "grad_norm": 19.67521095275879, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.877111554145813, "num_tokens": 813421320.0, "step": 21318 }, { "epoch": 2.711995929271085, "ewc_loss": 0.034294236451387405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.429423668421805e-05, "grad_norm": 19.576435089111328, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8701006770133972, "num_tokens": 813458323.0, "step": 21319 }, { "epoch": 2.712123139549676, "ewc_loss": 0.034370310604572296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4370310459053144e-05, "grad_norm": 19.60378646850586, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8575121760368347, "num_tokens": 813501436.0, "step": 21320 }, { "epoch": 2.712250349828266, "ewc_loss": 0.03431093320250511, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431093136896379e-05, "grad_norm": 19.595874786376953, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8760722279548645, "num_tokens": 813545942.0, "step": 21321 }, { "epoch": 2.7123775601068565, "ewc_loss": 0.034351374953985214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435137477936223e-05, "grad_norm": 19.64411163330078, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.871232807636261, "num_tokens": 813582202.0, "step": 21322 }, { "epoch": 2.712504770385447, "ewc_loss": 0.034391872584819794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4391872759442776e-05, "grad_norm": 19.634553909301758, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8622462749481201, "num_tokens": 813621711.0, "step": 21323 }, { "epoch": 2.7126319806640375, "ewc_loss": 0.03428512066602707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.428511990932748e-05, "grad_norm": 19.586536407470703, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8707234859466553, "num_tokens": 813665404.0, "step": 21324 }, { "epoch": 2.712759190942628, "ewc_loss": 0.03436977043747902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4369772038189694e-05, "grad_norm": 19.648109436035156, "learning_rate": 1e-06, "loss": 0.3622, "mean_token_accuracy": 0.8832690715789795, "num_tokens": 813697470.0, "step": 21325 }, { "epoch": 2.7128864012212186, "ewc_loss": 0.034286633133888245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.428663330851123e-05, "grad_norm": 19.59541130065918, "learning_rate": 1e-06, "loss": 0.3662, "mean_token_accuracy": 0.8822137117385864, "num_tokens": 813733308.0, "step": 21326 }, { "epoch": 2.713013611499809, "ewc_loss": 0.03432406857609749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43240681104362e-05, "grad_norm": 19.597789764404297, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8620851039886475, "num_tokens": 813770881.0, "step": 21327 }, { "epoch": 2.7131408217783997, "ewc_loss": 0.03429257869720459, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4292577765882015e-05, "grad_norm": 19.57666015625, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.8817082643508911, "num_tokens": 813806495.0, "step": 21328 }, { "epoch": 2.71326803205699, "ewc_loss": 0.03433772921562195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.433772872085683e-05, "grad_norm": 19.663854598999023, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.881303608417511, "num_tokens": 813842408.0, "step": 21329 }, { "epoch": 2.7133952423355807, "ewc_loss": 0.03433392569422722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.433392703300342e-05, "grad_norm": 19.592897415161133, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8724555969238281, "num_tokens": 813883041.0, "step": 21330 }, { "epoch": 2.7135224526141712, "ewc_loss": 0.034340646117925644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434064637986012e-05, "grad_norm": 19.58002281188965, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8725993633270264, "num_tokens": 813922901.0, "step": 21331 }, { "epoch": 2.7136496628927618, "ewc_loss": 0.03430471569299698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.430471406318247e-05, "grad_norm": 19.493270874023438, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.87869793176651, "num_tokens": 813961948.0, "step": 21332 }, { "epoch": 2.7137768731713523, "ewc_loss": 0.03443356603384018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4433567634550855e-05, "grad_norm": 19.589048385620117, "learning_rate": 1e-06, "loss": 0.3461, "mean_token_accuracy": 0.8895641565322876, "num_tokens": 813999194.0, "step": 21333 }, { "epoch": 2.713904083449943, "ewc_loss": 0.034419938921928406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441993976593949e-05, "grad_norm": 19.55626106262207, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8648977875709534, "num_tokens": 814038986.0, "step": 21334 }, { "epoch": 2.7140312937285334, "ewc_loss": 0.03439117223024368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439117062953301e-05, "grad_norm": 19.58641242980957, "learning_rate": 1e-06, "loss": 0.3486, "mean_token_accuracy": 0.8893495798110962, "num_tokens": 814074351.0, "step": 21335 }, { "epoch": 2.714158504007124, "ewc_loss": 0.03440338000655174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440337968640961e-05, "grad_norm": 19.560791015625, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8761190176010132, "num_tokens": 814109255.0, "step": 21336 }, { "epoch": 2.7142857142857144, "ewc_loss": 0.03437449783086777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4374497772660106e-05, "grad_norm": 19.64993667602539, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8723324537277222, "num_tokens": 814146601.0, "step": 21337 }, { "epoch": 2.714412924564305, "ewc_loss": 0.03441713750362396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441713852225803e-05, "grad_norm": 19.567096710205078, "learning_rate": 1e-06, "loss": 0.3692, "mean_token_accuracy": 0.8810385465621948, "num_tokens": 814181299.0, "step": 21338 }, { "epoch": 2.7145401348428955, "ewc_loss": 0.03436344116926193, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4363441955065355e-05, "grad_norm": 19.564119338989258, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.8842899799346924, "num_tokens": 814218178.0, "step": 21339 }, { "epoch": 2.7146673451214856, "ewc_loss": 0.03438762575387955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43876272381749e-05, "grad_norm": 19.603723526000977, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.8782609701156616, "num_tokens": 814254161.0, "step": 21340 }, { "epoch": 2.7147945554000765, "ewc_loss": 0.03445650264620781, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445650145295076e-05, "grad_norm": 19.568490982055664, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8765627145767212, "num_tokens": 814290164.0, "step": 21341 }, { "epoch": 2.7149217656786666, "ewc_loss": 0.03434795141220093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434795144130476e-05, "grad_norm": 19.53350830078125, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8563106060028076, "num_tokens": 814334459.0, "step": 21342 }, { "epoch": 2.7150489759572576, "ewc_loss": 0.03446200117468834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4462002076907083e-05, "grad_norm": 19.615375518798828, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8683844804763794, "num_tokens": 814374291.0, "step": 21343 }, { "epoch": 2.7151761862358477, "ewc_loss": 0.034432195127010345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443219611654058e-05, "grad_norm": 19.59804916381836, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8714067935943604, "num_tokens": 814413196.0, "step": 21344 }, { "epoch": 2.7153033965144386, "ewc_loss": 0.03433770686388016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.433770689298399e-05, "grad_norm": 19.611492156982422, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8737523555755615, "num_tokens": 814455312.0, "step": 21345 }, { "epoch": 2.7154306067930287, "ewc_loss": 0.03441809117794037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4418091672705486e-05, "grad_norm": 19.55013084411621, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8716285228729248, "num_tokens": 814496586.0, "step": 21346 }, { "epoch": 2.7155578170716193, "ewc_loss": 0.034339260309934616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4339260309934616e-05, "grad_norm": 19.592832565307617, "learning_rate": 1e-06, "loss": 0.3628, "mean_token_accuracy": 0.8815165758132935, "num_tokens": 814531566.0, "step": 21347 }, { "epoch": 2.71568502735021, "ewc_loss": 0.034442149102687836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4442149626556784e-05, "grad_norm": 19.660654067993164, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8817422389984131, "num_tokens": 814566168.0, "step": 21348 }, { "epoch": 2.7158122376288003, "ewc_loss": 0.03431431204080582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431431105127558e-05, "grad_norm": 19.625022888183594, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.8760517239570618, "num_tokens": 814604927.0, "step": 21349 }, { "epoch": 2.715939447907391, "ewc_loss": 0.03435921296477318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4359214623691514e-05, "grad_norm": 19.60708236694336, "learning_rate": 1e-06, "loss": 0.3449, "mean_token_accuracy": 0.8893154263496399, "num_tokens": 814641401.0, "step": 21350 }, { "epoch": 2.7160666581859814, "ewc_loss": 0.034308917820453644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.430891956668347e-05, "grad_norm": 19.653854370117188, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8814254999160767, "num_tokens": 814674252.0, "step": 21351 }, { "epoch": 2.716193868464572, "ewc_loss": 0.034353017807006836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435301914578304e-05, "grad_norm": 19.632612228393555, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.875327467918396, "num_tokens": 814704965.0, "step": 21352 }, { "epoch": 2.7163210787431624, "ewc_loss": 0.03431975841522217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43197571055498e-05, "grad_norm": 19.657129287719727, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8476341366767883, "num_tokens": 814743740.0, "step": 21353 }, { "epoch": 2.716448289021753, "ewc_loss": 0.034319739788770676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431973891565576e-05, "grad_norm": 19.521652221679688, "learning_rate": 1e-06, "loss": 0.3741, "mean_token_accuracy": 0.882500946521759, "num_tokens": 814784598.0, "step": 21354 }, { "epoch": 2.7165754993003435, "ewc_loss": 0.03430217504501343, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.430217475397512e-05, "grad_norm": 19.571144104003906, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8697757720947266, "num_tokens": 814821437.0, "step": 21355 }, { "epoch": 2.716702709578934, "ewc_loss": 0.03440326079726219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4403259633108974e-05, "grad_norm": 19.527904510498047, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8767340183258057, "num_tokens": 814861638.0, "step": 21356 }, { "epoch": 2.7168299198575245, "ewc_loss": 0.034340281039476395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434028258197941e-05, "grad_norm": 19.574628829956055, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8770826458930969, "num_tokens": 814893799.0, "step": 21357 }, { "epoch": 2.716957130136115, "ewc_loss": 0.03440230339765549, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440230284468271e-05, "grad_norm": 19.605348587036133, "learning_rate": 1e-06, "loss": 0.3506, "mean_token_accuracy": 0.8829563856124878, "num_tokens": 814930726.0, "step": 21358 }, { "epoch": 2.7170843404147056, "ewc_loss": 0.03433995321393013, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4339951525907964e-05, "grad_norm": 19.505054473876953, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.8751540184020996, "num_tokens": 814970041.0, "step": 21359 }, { "epoch": 2.717211550693296, "ewc_loss": 0.03442498669028282, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4424985642544925e-05, "grad_norm": 19.637847900390625, "learning_rate": 1e-06, "loss": 0.3798, "mean_token_accuracy": 0.8798955678939819, "num_tokens": 815008587.0, "step": 21360 }, { "epoch": 2.7173387609718866, "ewc_loss": 0.03445949777960777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4459499147487804e-05, "grad_norm": 19.50823211669922, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8634757995605469, "num_tokens": 815047659.0, "step": 21361 }, { "epoch": 2.717465971250477, "ewc_loss": 0.03432990238070488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4329903428442776e-05, "grad_norm": 19.5137996673584, "learning_rate": 1e-06, "loss": 0.38, "mean_token_accuracy": 0.876034677028656, "num_tokens": 815082631.0, "step": 21362 }, { "epoch": 2.7175931815290677, "ewc_loss": 0.03440428897738457, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440428918111138e-05, "grad_norm": 19.559497833251953, "learning_rate": 1e-06, "loss": 0.33, "mean_token_accuracy": 0.894345760345459, "num_tokens": 815117568.0, "step": 21363 }, { "epoch": 2.7177203918076582, "ewc_loss": 0.03441173955798149, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4411739761708304e-05, "grad_norm": 19.563920974731445, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8775491714477539, "num_tokens": 815155019.0, "step": 21364 }, { "epoch": 2.7178476020862483, "ewc_loss": 0.03446289151906967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4462889743736014e-05, "grad_norm": 19.60923194885254, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8762786984443665, "num_tokens": 815195386.0, "step": 21365 }, { "epoch": 2.7179748123648393, "ewc_loss": 0.03441129997372627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4411299566272646e-05, "grad_norm": 19.564666748046875, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.8804537057876587, "num_tokens": 815228304.0, "step": 21366 }, { "epoch": 2.7181020226434294, "ewc_loss": 0.03435374051332474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435373946558684e-05, "grad_norm": 19.535083770751953, "learning_rate": 1e-06, "loss": 0.3243, "mean_token_accuracy": 0.8952134251594543, "num_tokens": 815260111.0, "step": 21367 }, { "epoch": 2.7182292329220203, "ewc_loss": 0.034423843026161194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.44238433171995e-05, "grad_norm": 19.595243453979492, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8761688470840454, "num_tokens": 815297272.0, "step": 21368 }, { "epoch": 2.7183564432006104, "ewc_loss": 0.034465376287698746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446537448326126e-05, "grad_norm": 19.58777618408203, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8749662637710571, "num_tokens": 815336016.0, "step": 21369 }, { "epoch": 2.718483653479201, "ewc_loss": 0.03447732701897621, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4477325243642554e-05, "grad_norm": 19.570459365844727, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.850628674030304, "num_tokens": 815373940.0, "step": 21370 }, { "epoch": 2.7186108637577915, "ewc_loss": 0.03444995731115341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.44499567290768e-05, "grad_norm": 19.527446746826172, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8662915825843811, "num_tokens": 815410836.0, "step": 21371 }, { "epoch": 2.718738074036382, "ewc_loss": 0.03440746292471886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4407461498631164e-05, "grad_norm": 19.564863204956055, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8815957307815552, "num_tokens": 815449577.0, "step": 21372 }, { "epoch": 2.7188652843149725, "ewc_loss": 0.034463223069906235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4463224437786266e-05, "grad_norm": 19.602161407470703, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8788913488388062, "num_tokens": 815485869.0, "step": 21373 }, { "epoch": 2.718992494593563, "ewc_loss": 0.03450086712837219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4500866604503244e-05, "grad_norm": 19.597904205322266, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8571916818618774, "num_tokens": 815523614.0, "step": 21374 }, { "epoch": 2.7191197048721536, "ewc_loss": 0.03446934372186661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4469343518139794e-05, "grad_norm": 19.563623428344727, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8561074137687683, "num_tokens": 815561835.0, "step": 21375 }, { "epoch": 2.719246915150744, "ewc_loss": 0.034487348049879074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448734787525609e-05, "grad_norm": 19.605554580688477, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8607115745544434, "num_tokens": 815605596.0, "step": 21376 }, { "epoch": 2.7193741254293347, "ewc_loss": 0.03446532040834427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446531991357915e-05, "grad_norm": 19.58402442932129, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8709502220153809, "num_tokens": 815638720.0, "step": 21377 }, { "epoch": 2.719501335707925, "ewc_loss": 0.03448532894253731, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4485328797018155e-05, "grad_norm": 19.514087677001953, "learning_rate": 1e-06, "loss": 0.3429, "mean_token_accuracy": 0.8910068869590759, "num_tokens": 815673996.0, "step": 21378 }, { "epoch": 2.7196285459865157, "ewc_loss": 0.03447655215859413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.447655399213545e-05, "grad_norm": 19.57367515563965, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8642798066139221, "num_tokens": 815716571.0, "step": 21379 }, { "epoch": 2.7197557562651062, "ewc_loss": 0.03452775627374649, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.452775490586646e-05, "grad_norm": 19.600547790527344, "learning_rate": 1e-06, "loss": 0.3627, "mean_token_accuracy": 0.8825775384902954, "num_tokens": 815748973.0, "step": 21380 }, { "epoch": 2.7198829665436968, "ewc_loss": 0.03453392535448074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4533924917923287e-05, "grad_norm": 19.569881439208984, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8788416981697083, "num_tokens": 815794754.0, "step": 21381 }, { "epoch": 2.7200101768222873, "ewc_loss": 0.034513987600803375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4513988794060424e-05, "grad_norm": 19.631513595581055, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.879264235496521, "num_tokens": 815831051.0, "step": 21382 }, { "epoch": 2.720137387100878, "ewc_loss": 0.03455808013677597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4558081097202376e-05, "grad_norm": 19.646928787231445, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.878738522529602, "num_tokens": 815874291.0, "step": 21383 }, { "epoch": 2.7202645973794684, "ewc_loss": 0.03445243462920189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445243419264443e-05, "grad_norm": 19.63395118713379, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8725938200950623, "num_tokens": 815912484.0, "step": 21384 }, { "epoch": 2.720391807658059, "ewc_loss": 0.034458886831998825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445888796704821e-05, "grad_norm": 19.617971420288086, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8602524995803833, "num_tokens": 815948626.0, "step": 21385 }, { "epoch": 2.7205190179366494, "ewc_loss": 0.034434013068675995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443401146796532e-05, "grad_norm": 19.596824645996094, "learning_rate": 1e-06, "loss": 0.3512, "mean_token_accuracy": 0.8845769166946411, "num_tokens": 815983154.0, "step": 21386 }, { "epoch": 2.72064622821524, "ewc_loss": 0.03450485318899155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.450485382927582e-05, "grad_norm": 19.66986846923828, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8793299198150635, "num_tokens": 816017524.0, "step": 21387 }, { "epoch": 2.7207734384938305, "ewc_loss": 0.03445619344711304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445619222475216e-05, "grad_norm": 19.606178283691406, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8707592487335205, "num_tokens": 816055204.0, "step": 21388 }, { "epoch": 2.720900648772421, "ewc_loss": 0.034442558884620667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.444255708018318e-05, "grad_norm": 19.61182975769043, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.8831314444541931, "num_tokens": 816086671.0, "step": 21389 }, { "epoch": 2.721027859051011, "ewc_loss": 0.03437918797135353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437918712734245e-05, "grad_norm": 19.51545524597168, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8565587997436523, "num_tokens": 816114173.0, "step": 21390 }, { "epoch": 2.721155069329602, "ewc_loss": 0.03446201980113983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446202026680112e-05, "grad_norm": 19.55521583557129, "learning_rate": 1e-06, "loss": 0.3525, "mean_token_accuracy": 0.8879349231719971, "num_tokens": 816156434.0, "step": 21391 }, { "epoch": 2.721282279608192, "ewc_loss": 0.034562159329652786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4562159271445125e-05, "grad_norm": 19.65276527404785, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.871764600276947, "num_tokens": 816191037.0, "step": 21392 }, { "epoch": 2.721409489886783, "ewc_loss": 0.034502286463975906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.450228541623801e-05, "grad_norm": 19.572917938232422, "learning_rate": 1e-06, "loss": 0.3481, "mean_token_accuracy": 0.8894248008728027, "num_tokens": 816227031.0, "step": 21393 }, { "epoch": 2.721536700165373, "ewc_loss": 0.03444959223270416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4449592931196094e-05, "grad_norm": 19.628314971923828, "learning_rate": 1e-06, "loss": 0.3738, "mean_token_accuracy": 0.879214346408844, "num_tokens": 816264308.0, "step": 21394 }, { "epoch": 2.7216639104439637, "ewc_loss": 0.03453649953007698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453650060691871e-05, "grad_norm": 19.694536209106445, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8745906352996826, "num_tokens": 816300955.0, "step": 21395 }, { "epoch": 2.7217911207225542, "ewc_loss": 0.03444097563624382, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.444097455940209e-05, "grad_norm": 19.6090145111084, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8766331672668457, "num_tokens": 816332881.0, "step": 21396 }, { "epoch": 2.7219183310011448, "ewc_loss": 0.034501977264881134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.450197618803941e-05, "grad_norm": 19.661287307739258, "learning_rate": 1e-06, "loss": 0.3326, "mean_token_accuracy": 0.8943053483963013, "num_tokens": 816367446.0, "step": 21397 }, { "epoch": 2.7220455412797353, "ewc_loss": 0.03454160690307617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4541608329163864e-05, "grad_norm": 19.68427276611328, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8596686124801636, "num_tokens": 816407286.0, "step": 21398 }, { "epoch": 2.722172751558326, "ewc_loss": 0.03446220979094505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446220944169909e-05, "grad_norm": 19.557695388793945, "learning_rate": 1e-06, "loss": 0.3518, "mean_token_accuracy": 0.8845603466033936, "num_tokens": 816445277.0, "step": 21399 }, { "epoch": 2.7222999618369164, "ewc_loss": 0.034508123993873596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.450812437222339e-05, "grad_norm": 19.670095443725586, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8811882734298706, "num_tokens": 816481638.0, "step": 21400 }, { "epoch": 2.722427172115507, "ewc_loss": 0.03452800214290619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.452800228842534e-05, "grad_norm": 19.62962532043457, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8771662712097168, "num_tokens": 816521386.0, "step": 21401 }, { "epoch": 2.7225543823940974, "ewc_loss": 0.034458816051483154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445881520747207e-05, "grad_norm": 19.593177795410156, "learning_rate": 1e-06, "loss": 0.3528, "mean_token_accuracy": 0.8879477977752686, "num_tokens": 816554760.0, "step": 21402 }, { "epoch": 2.722681592672688, "ewc_loss": 0.034541018307209015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4541018976597115e-05, "grad_norm": 19.627099990844727, "learning_rate": 1e-06, "loss": 0.3568, "mean_token_accuracy": 0.8851461410522461, "num_tokens": 816595195.0, "step": 21403 }, { "epoch": 2.7228088029512785, "ewc_loss": 0.03451802209019661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.451802331255749e-05, "grad_norm": 19.601795196533203, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.8784283399581909, "num_tokens": 816633046.0, "step": 21404 }, { "epoch": 2.722936013229869, "ewc_loss": 0.03452320024371147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.452320015639998e-05, "grad_norm": 19.66058349609375, "learning_rate": 1e-06, "loss": 0.3494, "mean_token_accuracy": 0.8902338147163391, "num_tokens": 816675283.0, "step": 21405 }, { "epoch": 2.7230632235084595, "ewc_loss": 0.03452211245894432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.452211240073666e-05, "grad_norm": 19.650299072265625, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8674103021621704, "num_tokens": 816716499.0, "step": 21406 }, { "epoch": 2.72319043378705, "ewc_loss": 0.03446070849895477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446070695645176e-05, "grad_norm": 19.652755737304688, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8699448108673096, "num_tokens": 816761825.0, "step": 21407 }, { "epoch": 2.7233176440656406, "ewc_loss": 0.03446410596370697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446410482865758e-05, "grad_norm": 19.706424713134766, "learning_rate": 1e-06, "loss": 0.3635, "mean_token_accuracy": 0.8826546669006348, "num_tokens": 816792080.0, "step": 21408 }, { "epoch": 2.723444854344231, "ewc_loss": 0.03443380072712898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443380046519451e-05, "grad_norm": 19.65419578552246, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.86696857213974, "num_tokens": 816832121.0, "step": 21409 }, { "epoch": 2.7235720646228216, "ewc_loss": 0.03443218767642975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443218884058297e-05, "grad_norm": 19.693588256835938, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8702000379562378, "num_tokens": 816874032.0, "step": 21410 }, { "epoch": 2.723699274901412, "ewc_loss": 0.034341879189014435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434187965467572e-05, "grad_norm": 19.57716178894043, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8579629063606262, "num_tokens": 816920471.0, "step": 21411 }, { "epoch": 2.7238264851800027, "ewc_loss": 0.0344279408454895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4427939681336284e-05, "grad_norm": 19.698610305786133, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.8770761489868164, "num_tokens": 816958042.0, "step": 21412 }, { "epoch": 2.723953695458593, "ewc_loss": 0.03438634052872658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438633939367719e-05, "grad_norm": 19.637269973754883, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8673981428146362, "num_tokens": 816992158.0, "step": 21413 }, { "epoch": 2.7240809057371838, "ewc_loss": 0.03439616039395332, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439616193645634e-05, "grad_norm": 19.559650421142578, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8656501770019531, "num_tokens": 817032552.0, "step": 21414 }, { "epoch": 2.724208116015774, "ewc_loss": 0.0343606136739254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4360615245532244e-05, "grad_norm": 19.623380661010742, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8729639053344727, "num_tokens": 817072240.0, "step": 21415 }, { "epoch": 2.724335326294365, "ewc_loss": 0.034407760947942734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4407759812893346e-05, "grad_norm": 19.59566879272461, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.867162823677063, "num_tokens": 817104741.0, "step": 21416 }, { "epoch": 2.724462536572955, "ewc_loss": 0.03439778462052345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43977844750043e-05, "grad_norm": 19.63438606262207, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8638117909431458, "num_tokens": 817143876.0, "step": 21417 }, { "epoch": 2.724589746851546, "ewc_loss": 0.03445827588438988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445827678660862e-05, "grad_norm": 19.590818405151367, "learning_rate": 1e-06, "loss": 0.3154, "mean_token_accuracy": 0.8981318473815918, "num_tokens": 817183647.0, "step": 21418 }, { "epoch": 2.724716957130136, "ewc_loss": 0.034373000264167786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4372998925391585e-05, "grad_norm": 19.60899543762207, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8715460300445557, "num_tokens": 817214520.0, "step": 21419 }, { "epoch": 2.7248441674087265, "ewc_loss": 0.03444451466202736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4444514312781394e-05, "grad_norm": 19.593406677246094, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8669980764389038, "num_tokens": 817256936.0, "step": 21420 }, { "epoch": 2.724971377687317, "ewc_loss": 0.03434913605451584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.434913742239587e-05, "grad_norm": 19.62211036682129, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.8767167329788208, "num_tokens": 817292840.0, "step": 21421 }, { "epoch": 2.7250985879659075, "ewc_loss": 0.03442494198679924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.442494198679924e-05, "grad_norm": 19.534379959106445, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8717951774597168, "num_tokens": 817332872.0, "step": 21422 }, { "epoch": 2.725225798244498, "ewc_loss": 0.034388769418001175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438876956352033e-05, "grad_norm": 19.63463020324707, "learning_rate": 1e-06, "loss": 0.38, "mean_token_accuracy": 0.8845711946487427, "num_tokens": 817367668.0, "step": 21423 }, { "epoch": 2.7253530085230886, "ewc_loss": 0.03440215438604355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440215368755162e-05, "grad_norm": 19.538162231445312, "learning_rate": 1e-06, "loss": 0.3342, "mean_token_accuracy": 0.8948159217834473, "num_tokens": 817407726.0, "step": 21424 }, { "epoch": 2.725480218801679, "ewc_loss": 0.03441885858774185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441885928623378e-05, "grad_norm": 19.682374954223633, "learning_rate": 1e-06, "loss": 0.3531, "mean_token_accuracy": 0.886273205280304, "num_tokens": 817441003.0, "step": 21425 }, { "epoch": 2.7256074290802697, "ewc_loss": 0.034470587968826294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.447058770689182e-05, "grad_norm": 19.533050537109375, "learning_rate": 1e-06, "loss": 0.3672, "mean_token_accuracy": 0.8812938928604126, "num_tokens": 817482587.0, "step": 21426 }, { "epoch": 2.72573463935886, "ewc_loss": 0.034329600632190704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432960147620179e-05, "grad_norm": 19.692214965820312, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8807849287986755, "num_tokens": 817519727.0, "step": 21427 }, { "epoch": 2.7258618496374507, "ewc_loss": 0.03446412831544876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4464126656530425e-05, "grad_norm": 19.551359176635742, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8730512857437134, "num_tokens": 817553809.0, "step": 21428 }, { "epoch": 2.7259890599160412, "ewc_loss": 0.03432149440050125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.432149605941959e-05, "grad_norm": 19.6123046875, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8603218793869019, "num_tokens": 817588893.0, "step": 21429 }, { "epoch": 2.7261162701946318, "ewc_loss": 0.03450528159737587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4505283110775054e-05, "grad_norm": 19.650497436523438, "learning_rate": 1e-06, "loss": 0.3631, "mean_token_accuracy": 0.8820309042930603, "num_tokens": 817627789.0, "step": 21430 }, { "epoch": 2.7262434804732223, "ewc_loss": 0.034311458468437195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.431145887589082e-05, "grad_norm": 19.63837432861328, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8667261004447937, "num_tokens": 817670048.0, "step": 21431 }, { "epoch": 2.726370690751813, "ewc_loss": 0.034371569752693176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.43715691997204e-05, "grad_norm": 19.573976516723633, "learning_rate": 1e-06, "loss": 0.3741, "mean_token_accuracy": 0.8812820911407471, "num_tokens": 817707108.0, "step": 21432 }, { "epoch": 2.7264979010304033, "ewc_loss": 0.034373939037323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4373937523923814e-05, "grad_norm": 19.593870162963867, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8744715452194214, "num_tokens": 817748126.0, "step": 21433 }, { "epoch": 2.726625111308994, "ewc_loss": 0.03435453400015831, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435453254496679e-05, "grad_norm": 19.637027740478516, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8797755837440491, "num_tokens": 817781578.0, "step": 21434 }, { "epoch": 2.7267523215875844, "ewc_loss": 0.034395284950733185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439528518356383e-05, "grad_norm": 19.515073776245117, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8828065395355225, "num_tokens": 817822143.0, "step": 21435 }, { "epoch": 2.726879531866175, "ewc_loss": 0.03438020497560501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438020576140843e-05, "grad_norm": 19.592910766601562, "learning_rate": 1e-06, "loss": 0.3579, "mean_token_accuracy": 0.886370062828064, "num_tokens": 817858456.0, "step": 21436 }, { "epoch": 2.7270067421447655, "ewc_loss": 0.03448104113340378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4481039620004594e-05, "grad_norm": 19.618526458740234, "learning_rate": 1e-06, "loss": 0.3572, "mean_token_accuracy": 0.8831914067268372, "num_tokens": 817892863.0, "step": 21437 }, { "epoch": 2.7271339524233555, "ewc_loss": 0.034461796283721924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446179471211508e-05, "grad_norm": 19.60895347595215, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.879914402961731, "num_tokens": 817931120.0, "step": 21438 }, { "epoch": 2.7272611627019465, "ewc_loss": 0.03434436395764351, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4344364394200966e-05, "grad_norm": 19.650222778320312, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8800641298294067, "num_tokens": 817963076.0, "step": 21439 }, { "epoch": 2.7273883729805366, "ewc_loss": 0.03449983894824982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4499840694479644e-05, "grad_norm": 19.571571350097656, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8779029250144958, "num_tokens": 818003844.0, "step": 21440 }, { "epoch": 2.7275155832591276, "ewc_loss": 0.03438328579068184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438328712945804e-05, "grad_norm": 19.57929039001465, "learning_rate": 1e-06, "loss": 0.3793, "mean_token_accuracy": 0.8791650533676147, "num_tokens": 818048533.0, "step": 21441 }, { "epoch": 2.7276427935377177, "ewc_loss": 0.034444261342287064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.44442596542649e-05, "grad_norm": 19.6243896484375, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8729394674301147, "num_tokens": 818083290.0, "step": 21442 }, { "epoch": 2.7277700038163086, "ewc_loss": 0.034479156136512756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.447915514698252e-05, "grad_norm": 19.589082717895508, "learning_rate": 1e-06, "loss": 0.3572, "mean_token_accuracy": 0.8837116956710815, "num_tokens": 818120037.0, "step": 21443 }, { "epoch": 2.7278972140948987, "ewc_loss": 0.034373193979263306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437319537624717e-05, "grad_norm": 19.59198760986328, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8771026134490967, "num_tokens": 818164122.0, "step": 21444 }, { "epoch": 2.7280244243734892, "ewc_loss": 0.03447344899177551, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4473447158234194e-05, "grad_norm": 19.579425811767578, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8783443570137024, "num_tokens": 818198065.0, "step": 21445 }, { "epoch": 2.7281516346520798, "ewc_loss": 0.03451038897037506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.451039083302021e-05, "grad_norm": 19.63426399230957, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8792203664779663, "num_tokens": 818236843.0, "step": 21446 }, { "epoch": 2.7282788449306703, "ewc_loss": 0.03443218022584915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443218156462535e-05, "grad_norm": 19.631086349487305, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8781681656837463, "num_tokens": 818271300.0, "step": 21447 }, { "epoch": 2.728406055209261, "ewc_loss": 0.03449717536568642, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4497174056014046e-05, "grad_norm": 19.59189224243164, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8623571991920471, "num_tokens": 818311799.0, "step": 21448 }, { "epoch": 2.7285332654878514, "ewc_loss": 0.034451231360435486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4451230021659285e-05, "grad_norm": 19.686899185180664, "learning_rate": 1e-06, "loss": 0.3663, "mean_token_accuracy": 0.8819122314453125, "num_tokens": 818346662.0, "step": 21449 }, { "epoch": 2.728660475766442, "ewc_loss": 0.03447422385215759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.447422568569891e-05, "grad_norm": 19.561065673828125, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8796675205230713, "num_tokens": 818384465.0, "step": 21450 }, { "epoch": 2.7287876860450324, "ewc_loss": 0.034420114010572433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.442011438892223e-05, "grad_norm": 19.686697006225586, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.8746724128723145, "num_tokens": 818425625.0, "step": 21451 }, { "epoch": 2.728914896323623, "ewc_loss": 0.034533824771642685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453382305451669e-05, "grad_norm": 19.587020874023438, "learning_rate": 1e-06, "loss": 0.3691, "mean_token_accuracy": 0.8813252449035645, "num_tokens": 818458544.0, "step": 21452 }, { "epoch": 2.7290421066022135, "ewc_loss": 0.034431297332048416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443129753577523e-05, "grad_norm": 19.65153694152832, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8726422786712646, "num_tokens": 818503507.0, "step": 21453 }, { "epoch": 2.729169316880804, "ewc_loss": 0.03456977382302284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456977356108837e-05, "grad_norm": 19.644575119018555, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.8824856281280518, "num_tokens": 818545411.0, "step": 21454 }, { "epoch": 2.7292965271593945, "ewc_loss": 0.03444380313158035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.444380490691401e-05, "grad_norm": 19.675870895385742, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.8721359968185425, "num_tokens": 818582100.0, "step": 21455 }, { "epoch": 2.729423737437985, "ewc_loss": 0.03448590263724327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4485903597669676e-05, "grad_norm": 19.689998626708984, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.8734052777290344, "num_tokens": 818615549.0, "step": 21456 }, { "epoch": 2.7295509477165756, "ewc_loss": 0.03440568223595619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4405682526994497e-05, "grad_norm": 19.588497161865234, "learning_rate": 1e-06, "loss": 0.3685, "mean_token_accuracy": 0.8810915946960449, "num_tokens": 818652476.0, "step": 21457 }, { "epoch": 2.729678157995166, "ewc_loss": 0.03441596031188965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441595981712453e-05, "grad_norm": 19.686559677124023, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8794978857040405, "num_tokens": 818692435.0, "step": 21458 }, { "epoch": 2.7298053682737566, "ewc_loss": 0.03443603217601776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4436030546203256e-05, "grad_norm": 19.59749984741211, "learning_rate": 1e-06, "loss": 0.3883, "mean_token_accuracy": 0.8756232261657715, "num_tokens": 818733206.0, "step": 21459 }, { "epoch": 2.729932578552347, "ewc_loss": 0.03433876112103462, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4338761906838045e-05, "grad_norm": 19.557819366455078, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.879753589630127, "num_tokens": 818771717.0, "step": 21460 }, { "epoch": 2.7300597888309377, "ewc_loss": 0.03445127606391907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445127731538378e-05, "grad_norm": 19.628780364990234, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8757199048995972, "num_tokens": 818806999.0, "step": 21461 }, { "epoch": 2.7301869991095282, "ewc_loss": 0.03442332148551941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4423323086230084e-05, "grad_norm": 19.580860137939453, "learning_rate": 1e-06, "loss": 0.3645, "mean_token_accuracy": 0.8820034861564636, "num_tokens": 818846226.0, "step": 21462 }, { "epoch": 2.7303142093881183, "ewc_loss": 0.034464817494153976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446481787250377e-05, "grad_norm": 19.638425827026367, "learning_rate": 1e-06, "loss": 0.3604, "mean_token_accuracy": 0.882629930973053, "num_tokens": 818885667.0, "step": 21463 }, { "epoch": 2.7304414196667093, "ewc_loss": 0.03442833945155144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4428339859005064e-05, "grad_norm": 19.6293888092041, "learning_rate": 1e-06, "loss": 0.3653, "mean_token_accuracy": 0.8822723627090454, "num_tokens": 818924009.0, "step": 21464 }, { "epoch": 2.7305686299452994, "ewc_loss": 0.03445475175976753, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445475158514455e-05, "grad_norm": 19.62972068786621, "learning_rate": 1e-06, "loss": 0.36, "mean_token_accuracy": 0.8852633833885193, "num_tokens": 818962614.0, "step": 21465 }, { "epoch": 2.7306958402238903, "ewc_loss": 0.0343666635453701, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.436666520428844e-05, "grad_norm": 19.578521728515625, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8779737949371338, "num_tokens": 819004486.0, "step": 21466 }, { "epoch": 2.7308230505024804, "ewc_loss": 0.03445008397102356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445008405833505e-05, "grad_norm": 19.60051155090332, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8707810640335083, "num_tokens": 819048364.0, "step": 21467 }, { "epoch": 2.730950260781071, "ewc_loss": 0.03443227335810661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443227251409553e-05, "grad_norm": 19.6943359375, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8737470507621765, "num_tokens": 819086250.0, "step": 21468 }, { "epoch": 2.7310774710596615, "ewc_loss": 0.034449368715286255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4449367376510054e-05, "grad_norm": 19.624361038208008, "learning_rate": 1e-06, "loss": 0.3545, "mean_token_accuracy": 0.8875325918197632, "num_tokens": 819126705.0, "step": 21469 }, { "epoch": 2.731204681338252, "ewc_loss": 0.03437810763716698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437810664763674e-05, "grad_norm": 19.6048641204834, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8778408765792847, "num_tokens": 819165967.0, "step": 21470 }, { "epoch": 2.7313318916168425, "ewc_loss": 0.03446727246046066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446727350819856e-05, "grad_norm": 19.572542190551758, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8734504580497742, "num_tokens": 819208066.0, "step": 21471 }, { "epoch": 2.731459101895433, "ewc_loss": 0.03438480198383331, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438480052864179e-05, "grad_norm": 19.669580459594727, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8739802837371826, "num_tokens": 819246269.0, "step": 21472 }, { "epoch": 2.7315863121740236, "ewc_loss": 0.03447607532143593, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4476073778932914e-05, "grad_norm": 19.565155029296875, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8779018521308899, "num_tokens": 819286027.0, "step": 21473 }, { "epoch": 2.731713522452614, "ewc_loss": 0.03443407267332077, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443407331360504e-05, "grad_norm": 19.686635971069336, "learning_rate": 1e-06, "loss": 0.3638, "mean_token_accuracy": 0.8853590488433838, "num_tokens": 819319994.0, "step": 21474 }, { "epoch": 2.7318407327312046, "ewc_loss": 0.0343971773982048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4397176932543516e-05, "grad_norm": 19.561471939086914, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8730611801147461, "num_tokens": 819360113.0, "step": 21475 }, { "epoch": 2.731967943009795, "ewc_loss": 0.034350186586380005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435018516029231e-05, "grad_norm": 19.58250617980957, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8855849504470825, "num_tokens": 819394575.0, "step": 21476 }, { "epoch": 2.7320951532883857, "ewc_loss": 0.03446677699685097, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4466778743080795e-05, "grad_norm": 19.700788497924805, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8727733492851257, "num_tokens": 819434545.0, "step": 21477 }, { "epoch": 2.7322223635669762, "ewc_loss": 0.034441955387592316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.444195681368001e-05, "grad_norm": 19.540218353271484, "learning_rate": 1e-06, "loss": 0.3564, "mean_token_accuracy": 0.8868709802627563, "num_tokens": 819470699.0, "step": 21478 }, { "epoch": 2.7323495738455668, "ewc_loss": 0.03435393050312996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.435393227846362e-05, "grad_norm": 19.490928649902344, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8715105056762695, "num_tokens": 819514447.0, "step": 21479 }, { "epoch": 2.7324767841241573, "ewc_loss": 0.03452344611287117, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.452344753895886e-05, "grad_norm": 19.67462158203125, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8714487552642822, "num_tokens": 819553434.0, "step": 21480 }, { "epoch": 2.732603994402748, "ewc_loss": 0.03446575254201889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4465752833057195e-05, "grad_norm": 19.643543243408203, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8695064783096313, "num_tokens": 819591594.0, "step": 21481 }, { "epoch": 2.7327312046813383, "ewc_loss": 0.03438527137041092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4385269827907905e-05, "grad_norm": 19.608488082885742, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8735381364822388, "num_tokens": 819634530.0, "step": 21482 }, { "epoch": 2.732858414959929, "ewc_loss": 0.034408267587423325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440826912992634e-05, "grad_norm": 19.62039566040039, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8677709698677063, "num_tokens": 819669358.0, "step": 21483 }, { "epoch": 2.7329856252385194, "ewc_loss": 0.03442249819636345, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4422497265040874e-05, "grad_norm": 19.590940475463867, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8761285543441772, "num_tokens": 819711336.0, "step": 21484 }, { "epoch": 2.73311283551711, "ewc_loss": 0.034443795680999756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.444379399297759e-05, "grad_norm": 19.66946792602539, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8675081729888916, "num_tokens": 819746973.0, "step": 21485 }, { "epoch": 2.7332400457957005, "ewc_loss": 0.03442307934165001, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.442307934165001e-05, "grad_norm": 19.562889099121094, "learning_rate": 1e-06, "loss": 0.356, "mean_token_accuracy": 0.8862067461013794, "num_tokens": 819780305.0, "step": 21486 }, { "epoch": 2.733367256074291, "ewc_loss": 0.034436579793691635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4436579881003127e-05, "grad_norm": 19.666879653930664, "learning_rate": 1e-06, "loss": 0.359, "mean_token_accuracy": 0.8856039643287659, "num_tokens": 819817326.0, "step": 21487 }, { "epoch": 2.733494466352881, "ewc_loss": 0.03451816737651825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.451816883170977e-05, "grad_norm": 19.616729736328125, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8707094192504883, "num_tokens": 819857597.0, "step": 21488 }, { "epoch": 2.733621676631472, "ewc_loss": 0.03449004888534546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4490047255530953e-05, "grad_norm": 19.693214416503906, "learning_rate": 1e-06, "loss": 0.3774, "mean_token_accuracy": 0.879961371421814, "num_tokens": 819893471.0, "step": 21489 }, { "epoch": 2.733748886910062, "ewc_loss": 0.03448969125747681, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448969073360786e-05, "grad_norm": 19.54405403137207, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8598575592041016, "num_tokens": 819934175.0, "step": 21490 }, { "epoch": 2.733876097188653, "ewc_loss": 0.034480348229408264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448034840403125e-05, "grad_norm": 19.645971298217773, "learning_rate": 1e-06, "loss": 0.3548, "mean_token_accuracy": 0.8846662044525146, "num_tokens": 819971451.0, "step": 21491 }, { "epoch": 2.734003307467243, "ewc_loss": 0.034533992409706116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453399403952062e-05, "grad_norm": 19.610483169555664, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8716119527816772, "num_tokens": 820011566.0, "step": 21492 }, { "epoch": 2.7341305177458337, "ewc_loss": 0.03436097875237465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4360979043412954e-05, "grad_norm": 19.55868911743164, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8534739017486572, "num_tokens": 820047944.0, "step": 21493 }, { "epoch": 2.7342577280244242, "ewc_loss": 0.03454684466123581, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.454684338066727e-05, "grad_norm": 19.680099487304688, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8648979663848877, "num_tokens": 820083416.0, "step": 21494 }, { "epoch": 2.7343849383030148, "ewc_loss": 0.03450864553451538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4508644603192806e-05, "grad_norm": 19.60205841064453, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8680440783500671, "num_tokens": 820122476.0, "step": 21495 }, { "epoch": 2.7345121485816053, "ewc_loss": 0.03450073301792145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.450073199928738e-05, "grad_norm": 19.62609100341797, "learning_rate": 1e-06, "loss": 0.3706, "mean_token_accuracy": 0.881949782371521, "num_tokens": 820159866.0, "step": 21496 }, { "epoch": 2.734639358860196, "ewc_loss": 0.03456808999180794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4568089176900685e-05, "grad_norm": 19.684316635131836, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.876661479473114, "num_tokens": 820202710.0, "step": 21497 }, { "epoch": 2.7347665691387864, "ewc_loss": 0.03447785973548889, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4477860026527196e-05, "grad_norm": 19.62685203552246, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8744433522224426, "num_tokens": 820239156.0, "step": 21498 }, { "epoch": 2.734893779417377, "ewc_loss": 0.03448003903031349, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4480039175832644e-05, "grad_norm": 19.679800033569336, "learning_rate": 1e-06, "loss": 0.3244, "mean_token_accuracy": 0.8943251371383667, "num_tokens": 820274010.0, "step": 21499 }, { "epoch": 2.7350209896959674, "ewc_loss": 0.03448870778083801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4488708479329944e-05, "grad_norm": 19.51865005493164, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8743331432342529, "num_tokens": 820319171.0, "step": 21500 }, { "epoch": 2.735148199974558, "ewc_loss": 0.03440958261489868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.44095824402757e-05, "grad_norm": 19.623117446899414, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8625538349151611, "num_tokens": 820355038.0, "step": 21501 }, { "epoch": 2.7352754102531485, "ewc_loss": 0.034569937735795975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456993727013469e-05, "grad_norm": 19.608154296875, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8727536201477051, "num_tokens": 820390973.0, "step": 21502 }, { "epoch": 2.735402620531739, "ewc_loss": 0.03445328399538994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445328547968529e-05, "grad_norm": 19.621448516845703, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8716295957565308, "num_tokens": 820424543.0, "step": 21503 }, { "epoch": 2.7355298308103295, "ewc_loss": 0.034530989825725555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4530989069025964e-05, "grad_norm": 19.551477432250977, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8742172718048096, "num_tokens": 820462702.0, "step": 21504 }, { "epoch": 2.73565704108892, "ewc_loss": 0.03450765833258629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4507658710936084e-05, "grad_norm": 19.64966583251953, "learning_rate": 1e-06, "loss": 0.3613, "mean_token_accuracy": 0.8854321241378784, "num_tokens": 820498840.0, "step": 21505 }, { "epoch": 2.7357842513675106, "ewc_loss": 0.03455245494842529, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.455245678196661e-05, "grad_norm": 19.559268951416016, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.8696134090423584, "num_tokens": 820534083.0, "step": 21506 }, { "epoch": 2.735911461646101, "ewc_loss": 0.03451954573392868, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.451954398769885e-05, "grad_norm": 19.577449798583984, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8795527219772339, "num_tokens": 820572479.0, "step": 21507 }, { "epoch": 2.7360386719246916, "ewc_loss": 0.03456844389438629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456844206084497e-05, "grad_norm": 19.53602409362793, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8648044466972351, "num_tokens": 820610283.0, "step": 21508 }, { "epoch": 2.736165882203282, "ewc_loss": 0.03459279239177704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.459279105300084e-05, "grad_norm": 19.677717208862305, "learning_rate": 1e-06, "loss": 0.3428, "mean_token_accuracy": 0.8917019963264465, "num_tokens": 820647580.0, "step": 21509 }, { "epoch": 2.7362930924818727, "ewc_loss": 0.03459245339035988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.459245272097178e-05, "grad_norm": 19.626365661621094, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8818008899688721, "num_tokens": 820686774.0, "step": 21510 }, { "epoch": 2.7364203027604628, "ewc_loss": 0.03459136560559273, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.459136496530846e-05, "grad_norm": 19.651742935180664, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.8858692646026611, "num_tokens": 820724635.0, "step": 21511 }, { "epoch": 2.7365475130390537, "ewc_loss": 0.03460448980331421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4604490792844445e-05, "grad_norm": 19.666677474975586, "learning_rate": 1e-06, "loss": 0.3694, "mean_token_accuracy": 0.8823490142822266, "num_tokens": 820761467.0, "step": 21512 }, { "epoch": 2.736674723317644, "ewc_loss": 0.03453486040234566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453485987847671e-05, "grad_norm": 19.64068603515625, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8635061383247375, "num_tokens": 820801001.0, "step": 21513 }, { "epoch": 2.736801933596235, "ewc_loss": 0.034547608345746994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.454760735621676e-05, "grad_norm": 19.628198623657227, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8721784353256226, "num_tokens": 820841028.0, "step": 21514 }, { "epoch": 2.736929143874825, "ewc_loss": 0.0345340259373188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4534026781329885e-05, "grad_norm": 19.600059509277344, "learning_rate": 1e-06, "loss": 0.3573, "mean_token_accuracy": 0.8840727806091309, "num_tokens": 820881613.0, "step": 21515 }, { "epoch": 2.737056354153416, "ewc_loss": 0.03451007977128029, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.451008160482161e-05, "grad_norm": 19.568220138549805, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8837307691574097, "num_tokens": 820918205.0, "step": 21516 }, { "epoch": 2.737183564432006, "ewc_loss": 0.034524671733379364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.452467353781685e-05, "grad_norm": 19.574615478515625, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8689006567001343, "num_tokens": 820950751.0, "step": 21517 }, { "epoch": 2.7373107747105965, "ewc_loss": 0.03450975567102432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.450975418672897e-05, "grad_norm": 19.657075881958008, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8787432909011841, "num_tokens": 820987385.0, "step": 21518 }, { "epoch": 2.737437984989187, "ewc_loss": 0.034562159329652786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4562159271445125e-05, "grad_norm": 19.563356399536133, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8689831495285034, "num_tokens": 821026273.0, "step": 21519 }, { "epoch": 2.7375651952677775, "ewc_loss": 0.034453488886356354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445348920649849e-05, "grad_norm": 19.62872314453125, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.868489682674408, "num_tokens": 821067978.0, "step": 21520 }, { "epoch": 2.737692405546368, "ewc_loss": 0.034542832523584366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4542834328021854e-05, "grad_norm": 19.548179626464844, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8825288414955139, "num_tokens": 821111925.0, "step": 21521 }, { "epoch": 2.7378196158249586, "ewc_loss": 0.03445835039019585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445834954618476e-05, "grad_norm": 19.624996185302734, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8801430463790894, "num_tokens": 821151019.0, "step": 21522 }, { "epoch": 2.737946826103549, "ewc_loss": 0.03455042093992233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.455041951383464e-05, "grad_norm": 19.600908279418945, "learning_rate": 1e-06, "loss": 0.3588, "mean_token_accuracy": 0.8834636211395264, "num_tokens": 821185520.0, "step": 21523 }, { "epoch": 2.7380740363821396, "ewc_loss": 0.03449197858572006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.449197902227752e-05, "grad_norm": 19.592390060424805, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8780855536460876, "num_tokens": 821224273.0, "step": 21524 }, { "epoch": 2.73820124666073, "ewc_loss": 0.034603435546159744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.460343577899039e-05, "grad_norm": 19.607912063598633, "learning_rate": 1e-06, "loss": 0.3799, "mean_token_accuracy": 0.8818092346191406, "num_tokens": 821263541.0, "step": 21525 }, { "epoch": 2.7383284569393207, "ewc_loss": 0.034526001662015915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.452600140008144e-05, "grad_norm": 19.686979293823242, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8650939464569092, "num_tokens": 821304053.0, "step": 21526 }, { "epoch": 2.7384556672179112, "ewc_loss": 0.03452415019273758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.452414966886863e-05, "grad_norm": 19.651500701904297, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.876312255859375, "num_tokens": 821339710.0, "step": 21527 }, { "epoch": 2.7385828774965018, "ewc_loss": 0.0344671867787838, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446718619670719e-05, "grad_norm": 19.6721134185791, "learning_rate": 1e-06, "loss": 0.3771, "mean_token_accuracy": 0.8761640191078186, "num_tokens": 821378377.0, "step": 21528 }, { "epoch": 2.7387100877750923, "ewc_loss": 0.03446764126420021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4467640944058076e-05, "grad_norm": 19.582347869873047, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.8854008913040161, "num_tokens": 821417434.0, "step": 21529 }, { "epoch": 2.738837298053683, "ewc_loss": 0.03448132425546646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448132338235155e-05, "grad_norm": 19.645755767822266, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8721039295196533, "num_tokens": 821464323.0, "step": 21530 }, { "epoch": 2.7389645083322733, "ewc_loss": 0.03448916971683502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4489170502638444e-05, "grad_norm": 19.656923294067383, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8735085725784302, "num_tokens": 821509891.0, "step": 21531 }, { "epoch": 2.739091718610864, "ewc_loss": 0.03445231541991234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.44523141393438e-05, "grad_norm": 19.596765518188477, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8655381202697754, "num_tokens": 821557309.0, "step": 21532 }, { "epoch": 2.7392189288894544, "ewc_loss": 0.03439550846815109, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439550710027106e-05, "grad_norm": 19.666841506958008, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8715533018112183, "num_tokens": 821597751.0, "step": 21533 }, { "epoch": 2.739346139168045, "ewc_loss": 0.03437165170907974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437165287323296e-05, "grad_norm": 19.558101654052734, "learning_rate": 1e-06, "loss": 0.3523, "mean_token_accuracy": 0.8870729207992554, "num_tokens": 821637961.0, "step": 21534 }, { "epoch": 2.7394733494466355, "ewc_loss": 0.03443284332752228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443284367676824e-05, "grad_norm": 19.638145446777344, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8593105673789978, "num_tokens": 821684728.0, "step": 21535 }, { "epoch": 2.7396005597252255, "ewc_loss": 0.034381553530693054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438155181356706e-05, "grad_norm": 19.616764068603516, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.8809162378311157, "num_tokens": 821715359.0, "step": 21536 }, { "epoch": 2.7397277700038165, "ewc_loss": 0.034367043524980545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4367043554084376e-05, "grad_norm": 19.579866409301758, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.870468020439148, "num_tokens": 821755540.0, "step": 21537 }, { "epoch": 2.7398549802824066, "ewc_loss": 0.034393634647130966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439363354118541e-05, "grad_norm": 19.690879821777344, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8770302534103394, "num_tokens": 821788506.0, "step": 21538 }, { "epoch": 2.7399821905609976, "ewc_loss": 0.034477245062589645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.44772452081088e-05, "grad_norm": 19.59311866760254, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.875598132610321, "num_tokens": 821826555.0, "step": 21539 }, { "epoch": 2.7401094008395877, "ewc_loss": 0.03431784734129906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4317847166676074e-05, "grad_norm": 19.600238800048828, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8659102916717529, "num_tokens": 821866160.0, "step": 21540 }, { "epoch": 2.7402366111181786, "ewc_loss": 0.034457724541425705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4457723813829944e-05, "grad_norm": 19.634096145629883, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8812733888626099, "num_tokens": 821905595.0, "step": 21541 }, { "epoch": 2.7403638213967687, "ewc_loss": 0.034419722855091095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4419721487211064e-05, "grad_norm": 19.586027145385742, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8663965463638306, "num_tokens": 821944769.0, "step": 21542 }, { "epoch": 2.7404910316753592, "ewc_loss": 0.034375742077827454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.437574196141213e-05, "grad_norm": 19.57950782775879, "learning_rate": 1e-06, "loss": 0.3685, "mean_token_accuracy": 0.8830693960189819, "num_tokens": 821988889.0, "step": 21543 }, { "epoch": 2.7406182419539498, "ewc_loss": 0.034398604184389114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4398603020235896e-05, "grad_norm": 19.617435455322266, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.879317045211792, "num_tokens": 822025343.0, "step": 21544 }, { "epoch": 2.7407454522325403, "ewc_loss": 0.034400325268507004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440032378421165e-05, "grad_norm": 19.651559829711914, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8647313714027405, "num_tokens": 822067651.0, "step": 21545 }, { "epoch": 2.740872662511131, "ewc_loss": 0.03442345932126045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.442345769144595e-05, "grad_norm": 19.6033992767334, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8658232688903809, "num_tokens": 822102885.0, "step": 21546 }, { "epoch": 2.7409998727897213, "ewc_loss": 0.0343964584171772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.439646025071852e-05, "grad_norm": 19.55621910095215, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8685619235038757, "num_tokens": 822142515.0, "step": 21547 }, { "epoch": 2.741127083068312, "ewc_loss": 0.03445296362042427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445296533755027e-05, "grad_norm": 19.604646682739258, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8728766441345215, "num_tokens": 822180745.0, "step": 21548 }, { "epoch": 2.7412542933469024, "ewc_loss": 0.03443900868296623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4439010050846264e-05, "grad_norm": 19.601062774658203, "learning_rate": 1e-06, "loss": 0.3624, "mean_token_accuracy": 0.8828772306442261, "num_tokens": 822215117.0, "step": 21549 }, { "epoch": 2.741381503625493, "ewc_loss": 0.03444225341081619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.444225512794219e-05, "grad_norm": 19.560482025146484, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8660355806350708, "num_tokens": 822258883.0, "step": 21550 }, { "epoch": 2.7415087139040835, "ewc_loss": 0.03449225798249245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4492259146645665e-05, "grad_norm": 19.60403060913086, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.874456524848938, "num_tokens": 822295254.0, "step": 21551 }, { "epoch": 2.741635924182674, "ewc_loss": 0.03450247645378113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.450247459113598e-05, "grad_norm": 19.57162094116211, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.877771258354187, "num_tokens": 822335769.0, "step": 21552 }, { "epoch": 2.7417631344612645, "ewc_loss": 0.034519266337156296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4519267501309514e-05, "grad_norm": 19.61143684387207, "learning_rate": 1e-06, "loss": 0.3658, "mean_token_accuracy": 0.8850197792053223, "num_tokens": 822376873.0, "step": 21553 }, { "epoch": 2.741890344739855, "ewc_loss": 0.03452321141958237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.45232110703364e-05, "grad_norm": 19.591035842895508, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8734085559844971, "num_tokens": 822419868.0, "step": 21554 }, { "epoch": 2.7420175550184456, "ewc_loss": 0.03454490005970001, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.454490069998428e-05, "grad_norm": 19.641090393066406, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8748213648796082, "num_tokens": 822456757.0, "step": 21555 }, { "epoch": 2.742144765297036, "ewc_loss": 0.03453604876995087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453604949754663e-05, "grad_norm": 19.61370277404785, "learning_rate": 1e-06, "loss": 0.3416, "mean_token_accuracy": 0.8883794546127319, "num_tokens": 822492772.0, "step": 21556 }, { "epoch": 2.7422719755756266, "ewc_loss": 0.03448551893234253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4485517971916124e-05, "grad_norm": 19.560348510742188, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8602091670036316, "num_tokens": 822535400.0, "step": 21557 }, { "epoch": 2.742399185854217, "ewc_loss": 0.03448823094367981, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4488231904106215e-05, "grad_norm": 19.620372772216797, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.8807085752487183, "num_tokens": 822568313.0, "step": 21558 }, { "epoch": 2.7425263961328077, "ewc_loss": 0.03454776853322983, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.454776742728427e-05, "grad_norm": 19.630191802978516, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.8826476335525513, "num_tokens": 822608487.0, "step": 21559 }, { "epoch": 2.742653606411398, "ewc_loss": 0.034520991146564484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4520991903264076e-05, "grad_norm": 19.592416763305664, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8762310743331909, "num_tokens": 822647938.0, "step": 21560 }, { "epoch": 2.7427808166899883, "ewc_loss": 0.03447753190994263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.447753260843456e-05, "grad_norm": 19.60822296142578, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.871523380279541, "num_tokens": 822685974.0, "step": 21561 }, { "epoch": 2.7429080269685793, "ewc_loss": 0.03451703116297722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.451703014434315e-05, "grad_norm": 19.670352935791016, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8786924481391907, "num_tokens": 822719166.0, "step": 21562 }, { "epoch": 2.7430352372471694, "ewc_loss": 0.034533947706222534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453394674579613e-05, "grad_norm": 19.614023208618164, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8648998141288757, "num_tokens": 822758364.0, "step": 21563 }, { "epoch": 2.7431624475257603, "ewc_loss": 0.03445166349411011, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445166294113733e-05, "grad_norm": 19.568004608154297, "learning_rate": 1e-06, "loss": 0.365, "mean_token_accuracy": 0.8833432197570801, "num_tokens": 822798002.0, "step": 21564 }, { "epoch": 2.7432896578043504, "ewc_loss": 0.03456752374768257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4567525290185586e-05, "grad_norm": 19.68355369567871, "learning_rate": 1e-06, "loss": 0.3852, "mean_token_accuracy": 0.876885712146759, "num_tokens": 822837958.0, "step": 21565 }, { "epoch": 2.743416868082941, "ewc_loss": 0.03450651839375496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4506520023569465e-05, "grad_norm": 19.716245651245117, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8764467239379883, "num_tokens": 822873076.0, "step": 21566 }, { "epoch": 2.7435440783615315, "ewc_loss": 0.03448045253753662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448045390541665e-05, "grad_norm": 19.618091583251953, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8742550611495972, "num_tokens": 822904693.0, "step": 21567 }, { "epoch": 2.743671288640122, "ewc_loss": 0.03448271378874779, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4482713090255857e-05, "grad_norm": 19.751237869262695, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.870612621307373, "num_tokens": 822941192.0, "step": 21568 }, { "epoch": 2.7437984989187125, "ewc_loss": 0.03452033922076225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4520340705057606e-05, "grad_norm": 19.620969772338867, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8637627363204956, "num_tokens": 822982166.0, "step": 21569 }, { "epoch": 2.743925709197303, "ewc_loss": 0.03440370410680771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.440370346652344e-05, "grad_norm": 19.624652862548828, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8700496554374695, "num_tokens": 823021630.0, "step": 21570 }, { "epoch": 2.7440529194758936, "ewc_loss": 0.03448175638914108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448175630182959e-05, "grad_norm": 19.584552764892578, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8756823539733887, "num_tokens": 823062133.0, "step": 21571 }, { "epoch": 2.744180129754484, "ewc_loss": 0.034460995346307755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446099435677752e-05, "grad_norm": 19.641582489013672, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8720035552978516, "num_tokens": 823105163.0, "step": 21572 }, { "epoch": 2.7443073400330746, "ewc_loss": 0.034573379904031754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4573378798086196e-05, "grad_norm": 19.672107696533203, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8562026619911194, "num_tokens": 823146878.0, "step": 21573 }, { "epoch": 2.744434550311665, "ewc_loss": 0.03445182740688324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445182665018365e-05, "grad_norm": 19.61733055114746, "learning_rate": 1e-06, "loss": 0.3363, "mean_token_accuracy": 0.8921279907226562, "num_tokens": 823182216.0, "step": 21574 }, { "epoch": 2.7445617605902557, "ewc_loss": 0.034482572227716446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448257120908238e-05, "grad_norm": 19.596071243286133, "learning_rate": 1e-06, "loss": 0.3721, "mean_token_accuracy": 0.8830126523971558, "num_tokens": 823227838.0, "step": 21575 }, { "epoch": 2.7446889708688462, "ewc_loss": 0.03446735069155693, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446734990575351e-05, "grad_norm": 19.54690933227539, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.867956280708313, "num_tokens": 823268609.0, "step": 21576 }, { "epoch": 2.7448161811474368, "ewc_loss": 0.03451015427708626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.451015436439775e-05, "grad_norm": 19.67399787902832, "learning_rate": 1e-06, "loss": 0.3648, "mean_token_accuracy": 0.8842748403549194, "num_tokens": 823309426.0, "step": 21577 }, { "epoch": 2.7449433914260273, "ewc_loss": 0.034500326961278915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4500328183639795e-05, "grad_norm": 19.64430809020996, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8708561062812805, "num_tokens": 823346049.0, "step": 21578 }, { "epoch": 2.745070601704618, "ewc_loss": 0.03448161110281944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448161078267731e-05, "grad_norm": 19.693979263305664, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8688356280326843, "num_tokens": 823378915.0, "step": 21579 }, { "epoch": 2.7451978119832083, "ewc_loss": 0.034503087401390076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.450308577157557e-05, "grad_norm": 19.60651397705078, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8649352788925171, "num_tokens": 823422985.0, "step": 21580 }, { "epoch": 2.745325022261799, "ewc_loss": 0.03445141762495041, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4451419196557254e-05, "grad_norm": 19.623594284057617, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8627987504005432, "num_tokens": 823460703.0, "step": 21581 }, { "epoch": 2.7454522325403894, "ewc_loss": 0.034501172602176666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.450117219472304e-05, "grad_norm": 19.68017578125, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8782423138618469, "num_tokens": 823499651.0, "step": 21582 }, { "epoch": 2.74557944281898, "ewc_loss": 0.03445957601070404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445957554504275e-05, "grad_norm": 19.52296257019043, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.874238908290863, "num_tokens": 823537044.0, "step": 21583 }, { "epoch": 2.7457066530975704, "ewc_loss": 0.034453023225069046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.445302354521118e-05, "grad_norm": 19.64348602294922, "learning_rate": 1e-06, "loss": 0.3433, "mean_token_accuracy": 0.8886370658874512, "num_tokens": 823571832.0, "step": 21584 }, { "epoch": 2.745833863376161, "ewc_loss": 0.034563254565000534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456325430306606e-05, "grad_norm": 19.5382080078125, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.8819477558135986, "num_tokens": 823609852.0, "step": 21585 }, { "epoch": 2.745961073654751, "ewc_loss": 0.03442436456680298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.442436354816891e-05, "grad_norm": 19.648221969604492, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8793588280677795, "num_tokens": 823644419.0, "step": 21586 }, { "epoch": 2.746088283933342, "ewc_loss": 0.03460608795285225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.460608786554076e-05, "grad_norm": 19.563655853271484, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8815827965736389, "num_tokens": 823682451.0, "step": 21587 }, { "epoch": 2.746215494211932, "ewc_loss": 0.03444087877869606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.444087997195311e-05, "grad_norm": 19.58552360534668, "learning_rate": 1e-06, "loss": 0.3651, "mean_token_accuracy": 0.8859785795211792, "num_tokens": 823723051.0, "step": 21588 }, { "epoch": 2.746342704490523, "ewc_loss": 0.03450924903154373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4509248507674783e-05, "grad_norm": 19.60953140258789, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8728962540626526, "num_tokens": 823761353.0, "step": 21589 }, { "epoch": 2.746469914769113, "ewc_loss": 0.034493349492549896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.449335054028779e-05, "grad_norm": 19.546010971069336, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8620165586471558, "num_tokens": 823800281.0, "step": 21590 }, { "epoch": 2.7465971250477037, "ewc_loss": 0.034578461199998856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.45784610544797e-05, "grad_norm": 19.663854598999023, "learning_rate": 1e-06, "loss": 0.3467, "mean_token_accuracy": 0.8884236216545105, "num_tokens": 823843255.0, "step": 21591 }, { "epoch": 2.7467243353262942, "ewc_loss": 0.034508682787418365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.450868098298088e-05, "grad_norm": 19.546764373779297, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8726367950439453, "num_tokens": 823880736.0, "step": 21592 }, { "epoch": 2.7468515456048848, "ewc_loss": 0.034474704414606094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.447470589890145e-05, "grad_norm": 19.594507217407227, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8723204135894775, "num_tokens": 823923232.0, "step": 21593 }, { "epoch": 2.7469787558834753, "ewc_loss": 0.03461167961359024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.461167943896726e-05, "grad_norm": 19.653881072998047, "learning_rate": 1e-06, "loss": 0.379, "mean_token_accuracy": 0.878733217716217, "num_tokens": 823957488.0, "step": 21594 }, { "epoch": 2.747105966162066, "ewc_loss": 0.03447382152080536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4473821870051324e-05, "grad_norm": 19.540159225463867, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8709138631820679, "num_tokens": 823999568.0, "step": 21595 }, { "epoch": 2.7472331764406563, "ewc_loss": 0.03450700640678406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4507007512729615e-05, "grad_norm": 19.609699249267578, "learning_rate": 1e-06, "loss": 0.3561, "mean_token_accuracy": 0.8872079849243164, "num_tokens": 824036947.0, "step": 21596 }, { "epoch": 2.747360386719247, "ewc_loss": 0.03458111360669136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.458111314103007e-05, "grad_norm": 19.637537002563477, "learning_rate": 1e-06, "loss": 0.3496, "mean_token_accuracy": 0.8879627585411072, "num_tokens": 824067966.0, "step": 21597 }, { "epoch": 2.7474875969978374, "ewc_loss": 0.0345325842499733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453258250374347e-05, "grad_norm": 19.623271942138672, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8741843700408936, "num_tokens": 824112216.0, "step": 21598 }, { "epoch": 2.747614807276428, "ewc_loss": 0.034530289471149445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4530290577095e-05, "grad_norm": 19.628551483154297, "learning_rate": 1e-06, "loss": 0.3471, "mean_token_accuracy": 0.8859163522720337, "num_tokens": 824148749.0, "step": 21599 }, { "epoch": 2.7477420175550185, "ewc_loss": 0.03453873470425606, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453873432590626e-05, "grad_norm": 19.626667022705078, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8740091323852539, "num_tokens": 824186685.0, "step": 21600 }, { "epoch": 2.747869227833609, "ewc_loss": 0.03456469625234604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456469494267367e-05, "grad_norm": 19.684688568115234, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8706146478652954, "num_tokens": 824223716.0, "step": 21601 }, { "epoch": 2.7479964381121995, "ewc_loss": 0.03451675549149513, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4516753657953814e-05, "grad_norm": 19.61920738220215, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8740745186805725, "num_tokens": 824264627.0, "step": 21602 }, { "epoch": 2.74812364839079, "ewc_loss": 0.0345345139503479, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4534514270490035e-05, "grad_norm": 19.673080444335938, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8752696514129639, "num_tokens": 824301901.0, "step": 21603 }, { "epoch": 2.7482508586693806, "ewc_loss": 0.03449594974517822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4495948057156056e-05, "grad_norm": 19.627912521362305, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.8790303468704224, "num_tokens": 824344621.0, "step": 21604 }, { "epoch": 2.748378068947971, "ewc_loss": 0.03449159115552902, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.449158975854516e-05, "grad_norm": 19.564014434814453, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.8805263042449951, "num_tokens": 824385850.0, "step": 21605 }, { "epoch": 2.7485052792265616, "ewc_loss": 0.034578837454319, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4578835766296834e-05, "grad_norm": 19.69870948791504, "learning_rate": 1e-06, "loss": 0.3054, "mean_token_accuracy": 0.902052640914917, "num_tokens": 824425545.0, "step": 21606 }, { "epoch": 2.748632489505152, "ewc_loss": 0.03451107069849968, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4511071135057136e-05, "grad_norm": 19.585346221923828, "learning_rate": 1e-06, "loss": 0.3389, "mean_token_accuracy": 0.8931545615196228, "num_tokens": 824468611.0, "step": 21607 }, { "epoch": 2.7487596997837427, "ewc_loss": 0.03451169654726982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4511696867411956e-05, "grad_norm": 19.69066619873047, "learning_rate": 1e-06, "loss": 0.3765, "mean_token_accuracy": 0.88075852394104, "num_tokens": 824512161.0, "step": 21608 }, { "epoch": 2.7488869100623328, "ewc_loss": 0.03449491411447525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.449491487117484e-05, "grad_norm": 19.66358184814453, "learning_rate": 1e-06, "loss": 0.3518, "mean_token_accuracy": 0.8879594802856445, "num_tokens": 824555502.0, "step": 21609 }, { "epoch": 2.7490141203409237, "ewc_loss": 0.034432075917720795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.443207606323995e-05, "grad_norm": 19.6142520904541, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8695626258850098, "num_tokens": 824596277.0, "step": 21610 }, { "epoch": 2.749141330619514, "ewc_loss": 0.034519072622060776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.451907105045393e-05, "grad_norm": 19.749183654785156, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8632857799530029, "num_tokens": 824633704.0, "step": 21611 }, { "epoch": 2.749268540898105, "ewc_loss": 0.03448221832513809, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448221832513809e-05, "grad_norm": 19.55663299560547, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.8802838325500488, "num_tokens": 824673584.0, "step": 21612 }, { "epoch": 2.749395751176695, "ewc_loss": 0.03443600609898567, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4436005080351606e-05, "grad_norm": 19.71137237548828, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8700336217880249, "num_tokens": 824712508.0, "step": 21613 }, { "epoch": 2.749522961455286, "ewc_loss": 0.034460052847862244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446005212026648e-05, "grad_norm": 19.622072219848633, "learning_rate": 1e-06, "loss": 0.3545, "mean_token_accuracy": 0.8872804045677185, "num_tokens": 824754353.0, "step": 21614 }, { "epoch": 2.749650171733876, "ewc_loss": 0.03434871882200241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4348719054833055e-05, "grad_norm": 19.55523681640625, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8576270937919617, "num_tokens": 824796333.0, "step": 21615 }, { "epoch": 2.7497773820124665, "ewc_loss": 0.03444388881325722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4443888580426574e-05, "grad_norm": 19.634214401245117, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8698089718818665, "num_tokens": 824832048.0, "step": 21616 }, { "epoch": 2.749904592291057, "ewc_loss": 0.03442021459341049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.442021625232883e-05, "grad_norm": 19.558908462524414, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8702298402786255, "num_tokens": 824871889.0, "step": 21617 }, { "epoch": 2.7500318025696475, "ewc_loss": 0.03446407988667488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446407936280593e-05, "grad_norm": 19.59307289123535, "learning_rate": 1e-06, "loss": 0.362, "mean_token_accuracy": 0.8858013153076172, "num_tokens": 824912547.0, "step": 21618 }, { "epoch": 2.750159012848238, "ewc_loss": 0.03446902334690094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446902337600477e-05, "grad_norm": 19.60835075378418, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8566045761108398, "num_tokens": 824955626.0, "step": 21619 }, { "epoch": 2.7502862231268286, "ewc_loss": 0.034513819962739944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.45138214470353e-05, "grad_norm": 19.66855812072754, "learning_rate": 1e-06, "loss": 0.3596, "mean_token_accuracy": 0.8840650916099548, "num_tokens": 824991813.0, "step": 21620 }, { "epoch": 2.750413433405419, "ewc_loss": 0.03449217230081558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4492171835154295e-05, "grad_norm": 19.6203670501709, "learning_rate": 1e-06, "loss": 0.3556, "mean_token_accuracy": 0.8849723935127258, "num_tokens": 825024356.0, "step": 21621 }, { "epoch": 2.7505406436840096, "ewc_loss": 0.034444715827703476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4444714401615784e-05, "grad_norm": 19.643596649169922, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.8735605478286743, "num_tokens": 825060316.0, "step": 21622 }, { "epoch": 2.7506678539626, "ewc_loss": 0.03450693562626839, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.450693475315347e-05, "grad_norm": 19.62551498413086, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8688167333602905, "num_tokens": 825096044.0, "step": 21623 }, { "epoch": 2.7507950642411907, "ewc_loss": 0.034386150538921356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.438615021877922e-05, "grad_norm": 19.61951446533203, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8736269474029541, "num_tokens": 825123999.0, "step": 21624 }, { "epoch": 2.750922274519781, "ewc_loss": 0.03447037562727928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.447037670412101e-05, "grad_norm": 19.640451431274414, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8657128810882568, "num_tokens": 825170155.0, "step": 21625 }, { "epoch": 2.7510494847983717, "ewc_loss": 0.03448056802153587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448056668275967e-05, "grad_norm": 19.65402603149414, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8751243948936462, "num_tokens": 825213388.0, "step": 21626 }, { "epoch": 2.7511766950769623, "ewc_loss": 0.034485749900341034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448575080255978e-05, "grad_norm": 19.638879776000977, "learning_rate": 1e-06, "loss": 0.3799, "mean_token_accuracy": 0.8823471069335938, "num_tokens": 825247171.0, "step": 21627 }, { "epoch": 2.751303905355553, "ewc_loss": 0.034459736198186874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4459735616110265e-05, "grad_norm": 19.615888595581055, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8768813610076904, "num_tokens": 825283271.0, "step": 21628 }, { "epoch": 2.7514311156341433, "ewc_loss": 0.03454148769378662, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.454148827586323e-05, "grad_norm": 19.661041259765625, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8740881681442261, "num_tokens": 825318976.0, "step": 21629 }, { "epoch": 2.751558325912734, "ewc_loss": 0.03451970964670181, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.451971133472398e-05, "grad_norm": 19.636032104492188, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8751749396324158, "num_tokens": 825358019.0, "step": 21630 }, { "epoch": 2.7516855361913244, "ewc_loss": 0.03454628959298134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.454629040788859e-05, "grad_norm": 19.62281608581543, "learning_rate": 1e-06, "loss": 0.3479, "mean_token_accuracy": 0.8914932608604431, "num_tokens": 825394025.0, "step": 21631 }, { "epoch": 2.751812746469915, "ewc_loss": 0.034596651792526245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.459665094851516e-05, "grad_norm": 19.634645462036133, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8863378763198853, "num_tokens": 825433021.0, "step": 21632 }, { "epoch": 2.7519399567485054, "ewc_loss": 0.03460041806101799, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.460041989455931e-05, "grad_norm": 19.553136825561523, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8773213028907776, "num_tokens": 825477026.0, "step": 21633 }, { "epoch": 2.7520671670270955, "ewc_loss": 0.03454908728599548, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4549088013591245e-05, "grad_norm": 19.686603546142578, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8810734152793884, "num_tokens": 825511054.0, "step": 21634 }, { "epoch": 2.7521943773056865, "ewc_loss": 0.03458366170525551, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4583663364173844e-05, "grad_norm": 19.59868812561035, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8617504239082336, "num_tokens": 825549654.0, "step": 21635 }, { "epoch": 2.7523215875842766, "ewc_loss": 0.03453605994582176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453606041148305e-05, "grad_norm": 19.639657974243164, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.8783969879150391, "num_tokens": 825587269.0, "step": 21636 }, { "epoch": 2.7524487978628676, "ewc_loss": 0.03459993749856949, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4599936043377966e-05, "grad_norm": 19.62112045288086, "learning_rate": 1e-06, "loss": 0.3176, "mean_token_accuracy": 0.8986068367958069, "num_tokens": 825624864.0, "step": 21637 }, { "epoch": 2.7525760081414576, "ewc_loss": 0.03453979641199112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453979661571793e-05, "grad_norm": 19.634902954101562, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8680006265640259, "num_tokens": 825667025.0, "step": 21638 }, { "epoch": 2.7527032184200486, "ewc_loss": 0.03456950560212135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4569504350656644e-05, "grad_norm": 19.641101837158203, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8615105748176575, "num_tokens": 825709505.0, "step": 21639 }, { "epoch": 2.7528304286986387, "ewc_loss": 0.034568559378385544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.45685584761668e-05, "grad_norm": 19.617630004882812, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.87917560338974, "num_tokens": 825747376.0, "step": 21640 }, { "epoch": 2.7529576389772292, "ewc_loss": 0.034567710012197495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456771082710475e-05, "grad_norm": 19.71092414855957, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.882677435874939, "num_tokens": 825784817.0, "step": 21641 }, { "epoch": 2.7530848492558198, "ewc_loss": 0.034517478197813034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4517477615736425e-05, "grad_norm": 19.508106231689453, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.8744221925735474, "num_tokens": 825815631.0, "step": 21642 }, { "epoch": 2.7532120595344103, "ewc_loss": 0.03455767035484314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4557670005597174e-05, "grad_norm": 19.668344497680664, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8758217096328735, "num_tokens": 825854202.0, "step": 21643 }, { "epoch": 2.753339269813001, "ewc_loss": 0.034614626318216324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4614626201801e-05, "grad_norm": 19.54568099975586, "learning_rate": 1e-06, "loss": 0.3665, "mean_token_accuracy": 0.8842454552650452, "num_tokens": 825890636.0, "step": 21644 }, { "epoch": 2.7534664800915913, "ewc_loss": 0.03459273278713226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4592732845339924e-05, "grad_norm": 19.69355583190918, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8535297513008118, "num_tokens": 825931054.0, "step": 21645 }, { "epoch": 2.753593690370182, "ewc_loss": 0.03464748337864876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4647484426386654e-05, "grad_norm": 19.546361923217773, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8721915483474731, "num_tokens": 825972519.0, "step": 21646 }, { "epoch": 2.7537209006487724, "ewc_loss": 0.03458324447274208, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.458324499661103e-05, "grad_norm": 19.600847244262695, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.8780604600906372, "num_tokens": 826010010.0, "step": 21647 }, { "epoch": 2.753848110927363, "ewc_loss": 0.03463854268193245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.463854227447882e-05, "grad_norm": 19.618932723999023, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8748663067817688, "num_tokens": 826048348.0, "step": 21648 }, { "epoch": 2.7539753212059535, "ewc_loss": 0.03453133627772331, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453133467701264e-05, "grad_norm": 19.516054153442383, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8726996183395386, "num_tokens": 826087807.0, "step": 21649 }, { "epoch": 2.754102531484544, "ewc_loss": 0.03452874347567558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.452874443610199e-05, "grad_norm": 19.54668426513672, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8730505108833313, "num_tokens": 826124657.0, "step": 21650 }, { "epoch": 2.7542297417631345, "ewc_loss": 0.034595787525177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.459578874753788e-05, "grad_norm": 19.661972045898438, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8776479959487915, "num_tokens": 826164190.0, "step": 21651 }, { "epoch": 2.754356952041725, "ewc_loss": 0.03459850326180458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.459850267972797e-05, "grad_norm": 19.588871002197266, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8773795962333679, "num_tokens": 826202876.0, "step": 21652 }, { "epoch": 2.7544841623203156, "ewc_loss": 0.03462393581867218, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.462393578956835e-05, "grad_norm": 19.725143432617188, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8752079010009766, "num_tokens": 826242955.0, "step": 21653 }, { "epoch": 2.754611372598906, "ewc_loss": 0.03455081582069397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.455081605352461e-05, "grad_norm": 19.527143478393555, "learning_rate": 1e-06, "loss": 0.3577, "mean_token_accuracy": 0.8849508762359619, "num_tokens": 826279925.0, "step": 21654 }, { "epoch": 2.7547385828774966, "ewc_loss": 0.03451399877667427, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4513999707996845e-05, "grad_norm": 19.718769073486328, "learning_rate": 1e-06, "loss": 0.3569, "mean_token_accuracy": 0.8856915235519409, "num_tokens": 826314076.0, "step": 21655 }, { "epoch": 2.754865793156087, "ewc_loss": 0.03463928401470184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.463928442215547e-05, "grad_norm": 19.600900650024414, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.873092532157898, "num_tokens": 826354138.0, "step": 21656 }, { "epoch": 2.7549930034346777, "ewc_loss": 0.03449555113911629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.449555151746608e-05, "grad_norm": 19.668365478515625, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8715825080871582, "num_tokens": 826395393.0, "step": 21657 }, { "epoch": 2.755120213713268, "ewc_loss": 0.03461726754903793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.461726737441495e-05, "grad_norm": 19.57744598388672, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8666176795959473, "num_tokens": 826437539.0, "step": 21658 }, { "epoch": 2.7552474239918583, "ewc_loss": 0.034468356519937515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446835762588307e-05, "grad_norm": 19.66330909729004, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8701030015945435, "num_tokens": 826475857.0, "step": 21659 }, { "epoch": 2.7553746342704493, "ewc_loss": 0.034564945846796036, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456494596321136e-05, "grad_norm": 19.634193420410156, "learning_rate": 1e-06, "loss": 0.3659, "mean_token_accuracy": 0.8846383690834045, "num_tokens": 826513956.0, "step": 21660 }, { "epoch": 2.7555018445490393, "ewc_loss": 0.034493688493967056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.449368887231685e-05, "grad_norm": 19.58084487915039, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8735896348953247, "num_tokens": 826552747.0, "step": 21661 }, { "epoch": 2.7556290548276303, "ewc_loss": 0.0344729907810688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.447299241088331e-05, "grad_norm": 19.614309310913086, "learning_rate": 1e-06, "loss": 0.3596, "mean_token_accuracy": 0.8829391598701477, "num_tokens": 826590830.0, "step": 21662 }, { "epoch": 2.7557562651062204, "ewc_loss": 0.034499213099479675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4499214962124825e-05, "grad_norm": 19.66338348388672, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8791943788528442, "num_tokens": 826633414.0, "step": 21663 }, { "epoch": 2.755883475384811, "ewc_loss": 0.03451763838529587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.451763768680394e-05, "grad_norm": 19.63860511779785, "learning_rate": 1e-06, "loss": 0.3634, "mean_token_accuracy": 0.885216474533081, "num_tokens": 826668344.0, "step": 21664 }, { "epoch": 2.7560106856634015, "ewc_loss": 0.03452010825276375, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.452010787441395e-05, "grad_norm": 19.596845626831055, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8588751554489136, "num_tokens": 826710587.0, "step": 21665 }, { "epoch": 2.756137895941992, "ewc_loss": 0.03456553444266319, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4565535315778106e-05, "grad_norm": 19.64348030090332, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8661203384399414, "num_tokens": 826748854.0, "step": 21666 }, { "epoch": 2.7562651062205825, "ewc_loss": 0.0345650278031826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4565025998745114e-05, "grad_norm": 19.645376205444336, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.863615095615387, "num_tokens": 826789591.0, "step": 21667 }, { "epoch": 2.756392316499173, "ewc_loss": 0.03445056825876236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4450567909516394e-05, "grad_norm": 19.490697860717773, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8717744946479797, "num_tokens": 826831890.0, "step": 21668 }, { "epoch": 2.7565195267777636, "ewc_loss": 0.03456178307533264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4561784559627995e-05, "grad_norm": 19.666282653808594, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8580321073532104, "num_tokens": 826870265.0, "step": 21669 }, { "epoch": 2.756646737056354, "ewc_loss": 0.034589000046253204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.458900027908385e-05, "grad_norm": 19.489173889160156, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8747736811637878, "num_tokens": 826910993.0, "step": 21670 }, { "epoch": 2.7567739473349446, "ewc_loss": 0.034566376358270645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4566375688882545e-05, "grad_norm": 19.712263107299805, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8736733198165894, "num_tokens": 826947492.0, "step": 21671 }, { "epoch": 2.756901157613535, "ewc_loss": 0.034604769200086594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.460477091721259e-05, "grad_norm": 19.6007022857666, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.8792468309402466, "num_tokens": 826987215.0, "step": 21672 }, { "epoch": 2.7570283678921257, "ewc_loss": 0.03453775867819786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453775934758596e-05, "grad_norm": 19.584060668945312, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8730021119117737, "num_tokens": 827024092.0, "step": 21673 }, { "epoch": 2.757155578170716, "ewc_loss": 0.03462829440832138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4628294088179246e-05, "grad_norm": 19.650341033935547, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8591803312301636, "num_tokens": 827060541.0, "step": 21674 }, { "epoch": 2.7572827884493067, "ewc_loss": 0.03453850373625755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4538505133241415e-05, "grad_norm": 19.5058650970459, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8743311166763306, "num_tokens": 827101205.0, "step": 21675 }, { "epoch": 2.7574099987278973, "ewc_loss": 0.03461787849664688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.461787855485454e-05, "grad_norm": 19.661693572998047, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8691495656967163, "num_tokens": 827132441.0, "step": 21676 }, { "epoch": 2.757537209006488, "ewc_loss": 0.03464788571000099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4647884604055434e-05, "grad_norm": 19.610050201416016, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8755939602851868, "num_tokens": 827169530.0, "step": 21677 }, { "epoch": 2.7576644192850783, "ewc_loss": 0.03455151617527008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4551514545455575e-05, "grad_norm": 19.63460922241211, "learning_rate": 1e-06, "loss": 0.3643, "mean_token_accuracy": 0.8825463652610779, "num_tokens": 827207352.0, "step": 21678 }, { "epoch": 2.757791629563669, "ewc_loss": 0.034660857170820236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4660857636481524e-05, "grad_norm": 19.6382999420166, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8697106838226318, "num_tokens": 827242790.0, "step": 21679 }, { "epoch": 2.7579188398422594, "ewc_loss": 0.03468961641192436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4689615858951584e-05, "grad_norm": 19.72167205810547, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8771862387657166, "num_tokens": 827281676.0, "step": 21680 }, { "epoch": 2.75804605012085, "ewc_loss": 0.034682076424360275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.468207796686329e-05, "grad_norm": 19.59064483642578, "learning_rate": 1e-06, "loss": 0.342, "mean_token_accuracy": 0.88934725522995, "num_tokens": 827313676.0, "step": 21681 }, { "epoch": 2.7581732603994404, "ewc_loss": 0.03461887687444687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4618875361047685e-05, "grad_norm": 19.636449813842773, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8751601576805115, "num_tokens": 827355017.0, "step": 21682 }, { "epoch": 2.758300470678031, "ewc_loss": 0.0346359983086586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4635999327292666e-05, "grad_norm": 19.583641052246094, "learning_rate": 1e-06, "loss": 0.3409, "mean_token_accuracy": 0.8924949765205383, "num_tokens": 827391773.0, "step": 21683 }, { "epoch": 2.758427680956621, "ewc_loss": 0.0346723347902298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.46723354596179e-05, "grad_norm": 19.661258697509766, "learning_rate": 1e-06, "loss": 0.3504, "mean_token_accuracy": 0.8903499841690063, "num_tokens": 827433034.0, "step": 21684 }, { "epoch": 2.758554891235212, "ewc_loss": 0.03464138135313988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4641379897948354e-05, "grad_norm": 19.537147521972656, "learning_rate": 1e-06, "loss": 0.357, "mean_token_accuracy": 0.8858263492584229, "num_tokens": 827475729.0, "step": 21685 }, { "epoch": 2.758682101513802, "ewc_loss": 0.03460688143968582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4606880944920704e-05, "grad_norm": 19.68865203857422, "learning_rate": 1e-06, "loss": 0.3753, "mean_token_accuracy": 0.8812128305435181, "num_tokens": 827514127.0, "step": 21686 }, { "epoch": 2.758809311792393, "ewc_loss": 0.034701868891716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.470186857157387e-05, "grad_norm": 19.65765953063965, "learning_rate": 1e-06, "loss": 0.3596, "mean_token_accuracy": 0.8859876394271851, "num_tokens": 827546709.0, "step": 21687 }, { "epoch": 2.758936522070983, "ewc_loss": 0.03464918211102486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4649183362489566e-05, "grad_norm": 19.66647720336914, "learning_rate": 1e-06, "loss": 0.3258, "mean_token_accuracy": 0.8954776525497437, "num_tokens": 827581450.0, "step": 21688 }, { "epoch": 2.7590637323495737, "ewc_loss": 0.03466879576444626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.46687957062386e-05, "grad_norm": 19.671714782714844, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8713403344154358, "num_tokens": 827623161.0, "step": 21689 }, { "epoch": 2.7591909426281642, "ewc_loss": 0.03465636819601059, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.465636837063357e-05, "grad_norm": 19.54546546936035, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8714577555656433, "num_tokens": 827659460.0, "step": 21690 }, { "epoch": 2.7593181529067548, "ewc_loss": 0.034571122378110886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.45711232512258e-05, "grad_norm": 19.684568405151367, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8703407049179077, "num_tokens": 827701110.0, "step": 21691 }, { "epoch": 2.7594453631853453, "ewc_loss": 0.03470494598150253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.470494630164467e-05, "grad_norm": 19.559844970703125, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8719347715377808, "num_tokens": 827738992.0, "step": 21692 }, { "epoch": 2.759572573463936, "ewc_loss": 0.03457757085561752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4577569749671966e-05, "grad_norm": 19.634536743164062, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8617675304412842, "num_tokens": 827775853.0, "step": 21693 }, { "epoch": 2.7596997837425263, "ewc_loss": 0.034723442047834396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.472344178589992e-05, "grad_norm": 19.68783187866211, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8763219714164734, "num_tokens": 827819438.0, "step": 21694 }, { "epoch": 2.759826994021117, "ewc_loss": 0.03464415669441223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4644155675778165e-05, "grad_norm": 19.655168533325195, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8699408769607544, "num_tokens": 827858833.0, "step": 21695 }, { "epoch": 2.7599542042997074, "ewc_loss": 0.034569285809993744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456928607192822e-05, "grad_norm": 19.57493019104004, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8692764639854431, "num_tokens": 827895069.0, "step": 21696 }, { "epoch": 2.760081414578298, "ewc_loss": 0.034513622522354126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.451362135820091e-05, "grad_norm": 19.618898391723633, "learning_rate": 1e-06, "loss": 0.3495, "mean_token_accuracy": 0.8894498944282532, "num_tokens": 827932970.0, "step": 21697 }, { "epoch": 2.7602086248568884, "ewc_loss": 0.03459962457418442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4599623177200556e-05, "grad_norm": 19.624114990234375, "learning_rate": 1e-06, "loss": 0.3547, "mean_token_accuracy": 0.8866814374923706, "num_tokens": 827981859.0, "step": 21698 }, { "epoch": 2.760335835135479, "ewc_loss": 0.03453238680958748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453238605288789e-05, "grad_norm": 19.641653060913086, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8689863085746765, "num_tokens": 828022018.0, "step": 21699 }, { "epoch": 2.7604630454140695, "ewc_loss": 0.03460179269313812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.460179141256958e-05, "grad_norm": 19.587432861328125, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8708479404449463, "num_tokens": 828061546.0, "step": 21700 }, { "epoch": 2.76059025569266, "ewc_loss": 0.034533917903900146, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453391764196567e-05, "grad_norm": 19.619644165039062, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8622997999191284, "num_tokens": 828104464.0, "step": 21701 }, { "epoch": 2.7607174659712506, "ewc_loss": 0.03457396477460861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.457396451267414e-05, "grad_norm": 19.5677490234375, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8665408492088318, "num_tokens": 828142527.0, "step": 21702 }, { "epoch": 2.760844676249841, "ewc_loss": 0.03458631411194801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4586315450724214e-05, "grad_norm": 19.641286849975586, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8750851154327393, "num_tokens": 828189670.0, "step": 21703 }, { "epoch": 2.7609718865284316, "ewc_loss": 0.034571126103401184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4571126889204606e-05, "grad_norm": 19.560211181640625, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.8813644647598267, "num_tokens": 828231511.0, "step": 21704 }, { "epoch": 2.761099096807022, "ewc_loss": 0.03456608206033707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456608101259917e-05, "grad_norm": 19.703779220581055, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8760703206062317, "num_tokens": 828274387.0, "step": 21705 }, { "epoch": 2.7612263070856127, "ewc_loss": 0.034581199288368225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.458120045252144e-05, "grad_norm": 19.63624382019043, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8686938285827637, "num_tokens": 828312263.0, "step": 21706 }, { "epoch": 2.7613535173642028, "ewc_loss": 0.03448287397623062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.448287316132337e-05, "grad_norm": 19.58968162536621, "learning_rate": 1e-06, "loss": 0.3774, "mean_token_accuracy": 0.8762736320495605, "num_tokens": 828345550.0, "step": 21707 }, { "epoch": 2.7614807276427937, "ewc_loss": 0.03458793833851814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.458793798927218e-05, "grad_norm": 19.743629455566406, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8644282221794128, "num_tokens": 828378633.0, "step": 21708 }, { "epoch": 2.761607937921384, "ewc_loss": 0.03452271968126297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4522719943197444e-05, "grad_norm": 19.52947998046875, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8698817491531372, "num_tokens": 828414292.0, "step": 21709 }, { "epoch": 2.761735148199975, "ewc_loss": 0.03452606126666069, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4526059607742354e-05, "grad_norm": 19.769725799560547, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8748448491096497, "num_tokens": 828452682.0, "step": 21710 }, { "epoch": 2.761862358478565, "ewc_loss": 0.03458813577890396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.458813444012776e-05, "grad_norm": 19.57943344116211, "learning_rate": 1e-06, "loss": 0.3465, "mean_token_accuracy": 0.8911247849464417, "num_tokens": 828494162.0, "step": 21711 }, { "epoch": 2.761989568757156, "ewc_loss": 0.034494128078222275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.449412906775251e-05, "grad_norm": 19.709426879882812, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8804155588150024, "num_tokens": 828525997.0, "step": 21712 }, { "epoch": 2.762116779035746, "ewc_loss": 0.03460510075092316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4605101973284036e-05, "grad_norm": 19.621797561645508, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.874836266040802, "num_tokens": 828562488.0, "step": 21713 }, { "epoch": 2.7622439893143365, "ewc_loss": 0.034516215324401855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4516215237090364e-05, "grad_norm": 19.66775131225586, "learning_rate": 1e-06, "loss": 0.3338, "mean_token_accuracy": 0.8934952020645142, "num_tokens": 828591818.0, "step": 21714 }, { "epoch": 2.762371199592927, "ewc_loss": 0.034583043307065964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.458304490777664e-05, "grad_norm": 19.667938232421875, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.86354660987854, "num_tokens": 828628872.0, "step": 21715 }, { "epoch": 2.7624984098715175, "ewc_loss": 0.034548357129096985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.454835677985102e-05, "grad_norm": 19.65138816833496, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.8839507102966309, "num_tokens": 828670939.0, "step": 21716 }, { "epoch": 2.762625620150108, "ewc_loss": 0.03458596393465996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4585962566779926e-05, "grad_norm": 19.682083129882812, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8585593700408936, "num_tokens": 828708446.0, "step": 21717 }, { "epoch": 2.7627528304286986, "ewc_loss": 0.034587159752845764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.458715946180746e-05, "grad_norm": 19.582979202270508, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8795821070671082, "num_tokens": 828749873.0, "step": 21718 }, { "epoch": 2.762880040707289, "ewc_loss": 0.034590933471918106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4590932045830414e-05, "grad_norm": 19.598981857299805, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8688501715660095, "num_tokens": 828790066.0, "step": 21719 }, { "epoch": 2.7630072509858796, "ewc_loss": 0.034577447921037674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.457744969637133e-05, "grad_norm": 19.619182586669922, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8724771738052368, "num_tokens": 828821911.0, "step": 21720 }, { "epoch": 2.76313446126447, "ewc_loss": 0.03462864086031914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.462863969616592e-05, "grad_norm": 19.58319854736328, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8709944486618042, "num_tokens": 828856572.0, "step": 21721 }, { "epoch": 2.7632616715430607, "ewc_loss": 0.03469032794237137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4690328902797773e-05, "grad_norm": 19.760557174682617, "learning_rate": 1e-06, "loss": 0.3487, "mean_token_accuracy": 0.8869929909706116, "num_tokens": 828898865.0, "step": 21722 }, { "epoch": 2.763388881821651, "ewc_loss": 0.03467166796326637, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.46716697094962e-05, "grad_norm": 19.644725799560547, "learning_rate": 1e-06, "loss": 0.361, "mean_token_accuracy": 0.8830870389938354, "num_tokens": 828933768.0, "step": 21723 }, { "epoch": 2.7635160921002417, "ewc_loss": 0.03454131260514259, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.454131365288049e-05, "grad_norm": 19.556781768798828, "learning_rate": 1e-06, "loss": 0.3451, "mean_token_accuracy": 0.8913794755935669, "num_tokens": 828968869.0, "step": 21724 }, { "epoch": 2.7636433023788323, "ewc_loss": 0.0346708819270134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.467088026809506e-05, "grad_norm": 19.742582321166992, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.859756350517273, "num_tokens": 829005621.0, "step": 21725 }, { "epoch": 2.763770512657423, "ewc_loss": 0.0346505232155323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4650522138690576e-05, "grad_norm": 19.57052230834961, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8652156591415405, "num_tokens": 829045046.0, "step": 21726 }, { "epoch": 2.7638977229360133, "ewc_loss": 0.034554652869701385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4554654121166095e-05, "grad_norm": 19.594999313354492, "learning_rate": 1e-06, "loss": 0.3619, "mean_token_accuracy": 0.8890756964683533, "num_tokens": 829084588.0, "step": 21727 }, { "epoch": 2.764024933214604, "ewc_loss": 0.03473256528377533, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.473256583674811e-05, "grad_norm": 19.687408447265625, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.86080002784729, "num_tokens": 829121010.0, "step": 21728 }, { "epoch": 2.7641521434931944, "ewc_loss": 0.034691378474235535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.469138027867302e-05, "grad_norm": 19.62617301940918, "learning_rate": 1e-06, "loss": 0.3585, "mean_token_accuracy": 0.8834022879600525, "num_tokens": 829159618.0, "step": 21729 }, { "epoch": 2.764279353771785, "ewc_loss": 0.034622225910425186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.462222593952902e-05, "grad_norm": 19.654644012451172, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8627504110336304, "num_tokens": 829204826.0, "step": 21730 }, { "epoch": 2.7644065640503754, "ewc_loss": 0.034704674035310745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4704673453234136e-05, "grad_norm": 19.713232040405273, "learning_rate": 1e-06, "loss": 0.3452, "mean_token_accuracy": 0.8859768509864807, "num_tokens": 829240741.0, "step": 21731 }, { "epoch": 2.7645337743289655, "ewc_loss": 0.03461453318595886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.461453161435202e-05, "grad_norm": 19.611379623413086, "learning_rate": 1e-06, "loss": 0.3382, "mean_token_accuracy": 0.8942364454269409, "num_tokens": 829278065.0, "step": 21732 }, { "epoch": 2.7646609846075565, "ewc_loss": 0.03458563610911369, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.458563514868729e-05, "grad_norm": 19.739871978759766, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8712509870529175, "num_tokens": 829319438.0, "step": 21733 }, { "epoch": 2.7647881948861466, "ewc_loss": 0.03461093828082085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.461093729129061e-05, "grad_norm": 19.613595962524414, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.8853539228439331, "num_tokens": 829357299.0, "step": 21734 }, { "epoch": 2.7649154051647375, "ewc_loss": 0.03446902334690094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.446902337600477e-05, "grad_norm": 19.6051082611084, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8768696784973145, "num_tokens": 829398600.0, "step": 21735 }, { "epoch": 2.7650426154433276, "ewc_loss": 0.03469351679086685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4693515772232786e-05, "grad_norm": 19.75684928894043, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8641414046287537, "num_tokens": 829439214.0, "step": 21736 }, { "epoch": 2.7651698257219186, "ewc_loss": 0.034582458436489105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.45824591931887e-05, "grad_norm": 19.64453887939453, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8650651574134827, "num_tokens": 829477634.0, "step": 21737 }, { "epoch": 2.7652970360005087, "ewc_loss": 0.03459370508790016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.459370418568142e-05, "grad_norm": 19.623538970947266, "learning_rate": 1e-06, "loss": 0.3756, "mean_token_accuracy": 0.8822766542434692, "num_tokens": 829514527.0, "step": 21738 }, { "epoch": 2.765424246279099, "ewc_loss": 0.03454503044486046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.454503166722134e-05, "grad_norm": 19.695310592651367, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8773385286331177, "num_tokens": 829558531.0, "step": 21739 }, { "epoch": 2.7655514565576897, "ewc_loss": 0.03461265191435814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.461265077930875e-05, "grad_norm": 19.725194931030273, "learning_rate": 1e-06, "loss": 0.3795, "mean_token_accuracy": 0.8782789707183838, "num_tokens": 829600769.0, "step": 21740 }, { "epoch": 2.7656786668362803, "ewc_loss": 0.034417763352394104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.441776425461285e-05, "grad_norm": 19.622461318969727, "learning_rate": 1e-06, "loss": 0.3765, "mean_token_accuracy": 0.8802151679992676, "num_tokens": 829635938.0, "step": 21741 }, { "epoch": 2.765805877114871, "ewc_loss": 0.034492384642362595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4492382837925106e-05, "grad_norm": 19.627214431762695, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8756005764007568, "num_tokens": 829672956.0, "step": 21742 }, { "epoch": 2.7659330873934613, "ewc_loss": 0.034548528492450714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.454852776485495e-05, "grad_norm": 19.66883087158203, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8751262426376343, "num_tokens": 829712658.0, "step": 21743 }, { "epoch": 2.766060297672052, "ewc_loss": 0.034500326961278915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4500328183639795e-05, "grad_norm": 19.627683639526367, "learning_rate": 1e-06, "loss": 0.3789, "mean_token_accuracy": 0.8777170181274414, "num_tokens": 829755429.0, "step": 21744 }, { "epoch": 2.7661875079506424, "ewc_loss": 0.03456737846136093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.45673797710333e-05, "grad_norm": 19.68113899230957, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.874346137046814, "num_tokens": 829796373.0, "step": 21745 }, { "epoch": 2.766314718229233, "ewc_loss": 0.03457162156701088, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.457162165432237e-05, "grad_norm": 19.612028121948242, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8713122606277466, "num_tokens": 829834049.0, "step": 21746 }, { "epoch": 2.7664419285078234, "ewc_loss": 0.03454975038766861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4549750125734136e-05, "grad_norm": 19.579694747924805, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8753054738044739, "num_tokens": 829870853.0, "step": 21747 }, { "epoch": 2.766569138786414, "ewc_loss": 0.03456015884876251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4560158383101225e-05, "grad_norm": 19.660463333129883, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8698917627334595, "num_tokens": 829907519.0, "step": 21748 }, { "epoch": 2.7666963490650045, "ewc_loss": 0.03455010801553726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.455010664765723e-05, "grad_norm": 19.532146453857422, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8616478443145752, "num_tokens": 829947162.0, "step": 21749 }, { "epoch": 2.766823559343595, "ewc_loss": 0.034563805907964706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456380727584474e-05, "grad_norm": 19.64455223083496, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8731503486633301, "num_tokens": 829991981.0, "step": 21750 }, { "epoch": 2.7669507696221856, "ewc_loss": 0.0346309132874012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.463091343292035e-05, "grad_norm": 19.678955078125, "learning_rate": 1e-06, "loss": 0.3711, "mean_token_accuracy": 0.8816523551940918, "num_tokens": 830032409.0, "step": 21751 }, { "epoch": 2.767077979900776, "ewc_loss": 0.03453034535050392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453034514677711e-05, "grad_norm": 19.603342056274414, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.879127025604248, "num_tokens": 830072738.0, "step": 21752 }, { "epoch": 2.7672051901793666, "ewc_loss": 0.03455053269863129, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.455053229117766e-05, "grad_norm": 19.617355346679688, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8722918629646301, "num_tokens": 830110880.0, "step": 21753 }, { "epoch": 2.767332400457957, "ewc_loss": 0.034576624631881714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.457662387518212e-05, "grad_norm": 19.619714736938477, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8707930445671082, "num_tokens": 830150148.0, "step": 21754 }, { "epoch": 2.7674596107365477, "ewc_loss": 0.034543875604867935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.454387478996068e-05, "grad_norm": 19.633922576904297, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8635779619216919, "num_tokens": 830190717.0, "step": 21755 }, { "epoch": 2.767586821015138, "ewc_loss": 0.0345325842499733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453258250374347e-05, "grad_norm": 19.650392532348633, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8767576813697815, "num_tokens": 830226972.0, "step": 21756 }, { "epoch": 2.7677140312937283, "ewc_loss": 0.03457844257354736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.457844286458567e-05, "grad_norm": 19.662994384765625, "learning_rate": 1e-06, "loss": 0.3579, "mean_token_accuracy": 0.8837087154388428, "num_tokens": 830262612.0, "step": 21757 }, { "epoch": 2.7678412415723193, "ewc_loss": 0.034594547003507614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.459454819676466e-05, "grad_norm": 19.6906681060791, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.881721556186676, "num_tokens": 830306856.0, "step": 21758 }, { "epoch": 2.7679684518509093, "ewc_loss": 0.034598685801029205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4598684578668326e-05, "grad_norm": 19.629291534423828, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8739688992500305, "num_tokens": 830340726.0, "step": 21759 }, { "epoch": 2.7680956621295003, "ewc_loss": 0.03455453738570213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.455453770584427e-05, "grad_norm": 19.616840362548828, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8752778768539429, "num_tokens": 830380697.0, "step": 21760 }, { "epoch": 2.7682228724080904, "ewc_loss": 0.034564998000860214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456499689491466e-05, "grad_norm": 19.66145896911621, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8535549640655518, "num_tokens": 830414737.0, "step": 21761 }, { "epoch": 2.768350082686681, "ewc_loss": 0.03456754982471466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4567550756037235e-05, "grad_norm": 19.621341705322266, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8756681084632874, "num_tokens": 830456518.0, "step": 21762 }, { "epoch": 2.7684772929652715, "ewc_loss": 0.03454031050205231, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.454030957072973e-05, "grad_norm": 19.65719223022461, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8718698024749756, "num_tokens": 830494192.0, "step": 21763 }, { "epoch": 2.768604503243862, "ewc_loss": 0.034579161554574966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.457916318438947e-05, "grad_norm": 19.708477020263672, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8688124418258667, "num_tokens": 830533659.0, "step": 21764 }, { "epoch": 2.7687317135224525, "ewc_loss": 0.03450282663106918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4502827475080267e-05, "grad_norm": 19.64936637878418, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.869748592376709, "num_tokens": 830569605.0, "step": 21765 }, { "epoch": 2.768858923801043, "ewc_loss": 0.034557465463876724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.455746627878398e-05, "grad_norm": 19.704986572265625, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8538892269134521, "num_tokens": 830610814.0, "step": 21766 }, { "epoch": 2.7689861340796336, "ewc_loss": 0.03461923450231552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.461923552094959e-05, "grad_norm": 19.694515228271484, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.873764157295227, "num_tokens": 830646811.0, "step": 21767 }, { "epoch": 2.769113344358224, "ewc_loss": 0.03455096110701561, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.45509615726769e-05, "grad_norm": 19.755704879760742, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8785680532455444, "num_tokens": 830680984.0, "step": 21768 }, { "epoch": 2.7692405546368146, "ewc_loss": 0.03453529253602028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.453529279795475e-05, "grad_norm": 19.62114715576172, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.8747150897979736, "num_tokens": 830713384.0, "step": 21769 }, { "epoch": 2.769367764915405, "ewc_loss": 0.03456094488501549, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456094418652356e-05, "grad_norm": 19.720624923706055, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8620622158050537, "num_tokens": 830746922.0, "step": 21770 }, { "epoch": 2.7694949751939957, "ewc_loss": 0.034603722393512726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.460372317931615e-05, "grad_norm": 19.619184494018555, "learning_rate": 1e-06, "loss": 0.3671, "mean_token_accuracy": 0.8809284567832947, "num_tokens": 830789459.0, "step": 21771 }, { "epoch": 2.769622185472586, "ewc_loss": 0.03459502011537552, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4595021134009585e-05, "grad_norm": 19.739164352416992, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8726726770401001, "num_tokens": 830826736.0, "step": 21772 }, { "epoch": 2.7697493957511767, "ewc_loss": 0.03464120626449585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4641205274965614e-05, "grad_norm": 19.684154510498047, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8774889707565308, "num_tokens": 830868118.0, "step": 21773 }, { "epoch": 2.7698766060297673, "ewc_loss": 0.034505508840084076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.450550866546109e-05, "grad_norm": 19.613245010375977, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8725872039794922, "num_tokens": 830911658.0, "step": 21774 }, { "epoch": 2.770003816308358, "ewc_loss": 0.034593820571899414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4593820601003245e-05, "grad_norm": 19.690275192260742, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8766063451766968, "num_tokens": 830948456.0, "step": 21775 }, { "epoch": 2.7701310265869483, "ewc_loss": 0.0345669649541378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4566965041449293e-05, "grad_norm": 19.596839904785156, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.8835972547531128, "num_tokens": 830988790.0, "step": 21776 }, { "epoch": 2.770258236865539, "ewc_loss": 0.03456529602408409, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456529520917684e-05, "grad_norm": 19.59359359741211, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8676822185516357, "num_tokens": 831024551.0, "step": 21777 }, { "epoch": 2.7703854471441294, "ewc_loss": 0.03464803099632263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.464803012320772e-05, "grad_norm": 19.66324234008789, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8732246160507202, "num_tokens": 831060082.0, "step": 21778 }, { "epoch": 2.77051265742272, "ewc_loss": 0.03466837853193283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.466837733867578e-05, "grad_norm": 19.65651512145996, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8762990832328796, "num_tokens": 831100067.0, "step": 21779 }, { "epoch": 2.77063986770131, "ewc_loss": 0.034642111510038376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.464211113168858e-05, "grad_norm": 19.645910263061523, "learning_rate": 1e-06, "loss": 0.3552, "mean_token_accuracy": 0.8862578868865967, "num_tokens": 831131923.0, "step": 21780 }, { "epoch": 2.770767077979901, "ewc_loss": 0.03465290367603302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.465290501480922e-05, "grad_norm": 19.600624084472656, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.860895037651062, "num_tokens": 831167985.0, "step": 21781 }, { "epoch": 2.770894288258491, "ewc_loss": 0.03468240052461624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.468240174697712e-05, "grad_norm": 19.701087951660156, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8702021241188049, "num_tokens": 831211737.0, "step": 21782 }, { "epoch": 2.771021498537082, "ewc_loss": 0.034695349633693695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.469534931355156e-05, "grad_norm": 19.641429901123047, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8647357821464539, "num_tokens": 831248764.0, "step": 21783 }, { "epoch": 2.771148708815672, "ewc_loss": 0.03464328125119209, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4643282560864463e-05, "grad_norm": 19.6473388671875, "learning_rate": 1e-06, "loss": 0.3687, "mean_token_accuracy": 0.8807827830314636, "num_tokens": 831284140.0, "step": 21784 }, { "epoch": 2.771275919094263, "ewc_loss": 0.03469260409474373, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4692602639552206e-05, "grad_norm": 19.673147201538086, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8772058486938477, "num_tokens": 831318567.0, "step": 21785 }, { "epoch": 2.771403129372853, "ewc_loss": 0.034705162048339844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4705160942394286e-05, "grad_norm": 19.670024871826172, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8680789470672607, "num_tokens": 831357128.0, "step": 21786 }, { "epoch": 2.7715303396514437, "ewc_loss": 0.034762006253004074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.47620079992339e-05, "grad_norm": 19.725481033325195, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8552305102348328, "num_tokens": 831391551.0, "step": 21787 }, { "epoch": 2.771657549930034, "ewc_loss": 0.034686874598264694, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4686872822931036e-05, "grad_norm": 19.67466163635254, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8707526326179504, "num_tokens": 831430332.0, "step": 21788 }, { "epoch": 2.7717847602086247, "ewc_loss": 0.03467585891485214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.467586066108197e-05, "grad_norm": 19.58858299255371, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8808621168136597, "num_tokens": 831471639.0, "step": 21789 }, { "epoch": 2.7719119704872153, "ewc_loss": 0.0346481055021286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.464810652076267e-05, "grad_norm": 19.67694664001465, "learning_rate": 1e-06, "loss": 0.369, "mean_token_accuracy": 0.8810545206069946, "num_tokens": 831506470.0, "step": 21790 }, { "epoch": 2.772039180765806, "ewc_loss": 0.03468408063054085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.468407885520719e-05, "grad_norm": 19.612335205078125, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8675442934036255, "num_tokens": 831547212.0, "step": 21791 }, { "epoch": 2.7721663910443963, "ewc_loss": 0.03460618481040001, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.460618609096855e-05, "grad_norm": 19.628841400146484, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8601303100585938, "num_tokens": 831593181.0, "step": 21792 }, { "epoch": 2.772293601322987, "ewc_loss": 0.034760065376758575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4760065318550915e-05, "grad_norm": 19.663850784301758, "learning_rate": 1e-06, "loss": 0.3334, "mean_token_accuracy": 0.8911212086677551, "num_tokens": 831624889.0, "step": 21793 }, { "epoch": 2.7724208116015774, "ewc_loss": 0.034628693014383316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4628694265848026e-05, "grad_norm": 19.69998550415039, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.881202220916748, "num_tokens": 831662389.0, "step": 21794 }, { "epoch": 2.772548021880168, "ewc_loss": 0.034683823585510254, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4683824196690693e-05, "grad_norm": 19.604175567626953, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8768572211265564, "num_tokens": 831705282.0, "step": 21795 }, { "epoch": 2.7726752321587584, "ewc_loss": 0.03465621918439865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.465621921350248e-05, "grad_norm": 19.664762496948242, "learning_rate": 1e-06, "loss": 0.3707, "mean_token_accuracy": 0.8812874555587769, "num_tokens": 831741604.0, "step": 21796 }, { "epoch": 2.772802442437349, "ewc_loss": 0.034729693084955215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4729691833490506e-05, "grad_norm": 19.66210174560547, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8663891553878784, "num_tokens": 831784165.0, "step": 21797 }, { "epoch": 2.7729296527159395, "ewc_loss": 0.03462987393140793, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4629872970981523e-05, "grad_norm": 19.608976364135742, "learning_rate": 1e-06, "loss": 0.3765, "mean_token_accuracy": 0.8792285919189453, "num_tokens": 831821101.0, "step": 21798 }, { "epoch": 2.77305686299453, "ewc_loss": 0.03471815958619118, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4718159440672025e-05, "grad_norm": 19.709293365478516, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.877496063709259, "num_tokens": 831855766.0, "step": 21799 }, { "epoch": 2.7731840732731206, "ewc_loss": 0.03470780327916145, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4707802115008235e-05, "grad_norm": 19.768585205078125, "learning_rate": 1e-06, "loss": 0.3731, "mean_token_accuracy": 0.8805646896362305, "num_tokens": 831895692.0, "step": 21800 }, { "epoch": 2.773311283551711, "ewc_loss": 0.03464825451374054, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.464825567789376e-05, "grad_norm": 19.717039108276367, "learning_rate": 1e-06, "loss": 0.3252, "mean_token_accuracy": 0.8936284780502319, "num_tokens": 831931079.0, "step": 21801 }, { "epoch": 2.7734384938303016, "ewc_loss": 0.034571193158626556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4571192372823134e-05, "grad_norm": 19.661548614501953, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8495609760284424, "num_tokens": 831969742.0, "step": 21802 }, { "epoch": 2.773565704108892, "ewc_loss": 0.034552689641714096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4552689612610266e-05, "grad_norm": 19.69436264038086, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8647413849830627, "num_tokens": 832011059.0, "step": 21803 }, { "epoch": 2.7736929143874827, "ewc_loss": 0.03468942269682884, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.468942304607481e-05, "grad_norm": 19.743989944458008, "learning_rate": 1e-06, "loss": 0.3602, "mean_token_accuracy": 0.884687602519989, "num_tokens": 832042768.0, "step": 21804 }, { "epoch": 2.7738201246660728, "ewc_loss": 0.03463241457939148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.463241591816768e-05, "grad_norm": 19.724258422851562, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.8834830522537231, "num_tokens": 832077699.0, "step": 21805 }, { "epoch": 2.7739473349446637, "ewc_loss": 0.03459014371037483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4590142604429275e-05, "grad_norm": 19.64916229248047, "learning_rate": 1e-06, "loss": 0.3632, "mean_token_accuracy": 0.882874608039856, "num_tokens": 832113528.0, "step": 21806 }, { "epoch": 2.774074545223254, "ewc_loss": 0.03456525132060051, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456525155343115e-05, "grad_norm": 19.691375732421875, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8797821402549744, "num_tokens": 832149108.0, "step": 21807 }, { "epoch": 2.774201755501845, "ewc_loss": 0.03468340262770653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.468340219114907e-05, "grad_norm": 19.756399154663086, "learning_rate": 1e-06, "loss": 0.3441, "mean_token_accuracy": 0.8914490938186646, "num_tokens": 832189279.0, "step": 21808 }, { "epoch": 2.774328965780435, "ewc_loss": 0.0346514917910099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.465149347903207e-05, "grad_norm": 19.753475189208984, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8766740560531616, "num_tokens": 832224423.0, "step": 21809 }, { "epoch": 2.774456176059026, "ewc_loss": 0.03459415212273598, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.459415165707469e-05, "grad_norm": 19.710433959960938, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8693070411682129, "num_tokens": 832260389.0, "step": 21810 }, { "epoch": 2.774583386337616, "ewc_loss": 0.03461381793022156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.461381857050583e-05, "grad_norm": 19.729825973510742, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8743423819541931, "num_tokens": 832297591.0, "step": 21811 }, { "epoch": 2.7747105966162064, "ewc_loss": 0.03461134806275368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.461134838289581e-05, "grad_norm": 19.765531539916992, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8804154396057129, "num_tokens": 832330195.0, "step": 21812 }, { "epoch": 2.774837806894797, "ewc_loss": 0.03456955403089523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456955528235994e-05, "grad_norm": 19.67670440673828, "learning_rate": 1e-06, "loss": 0.3621, "mean_token_accuracy": 0.8825490474700928, "num_tokens": 832361613.0, "step": 21813 }, { "epoch": 2.7749650171733875, "ewc_loss": 0.03457256034016609, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.45725602528546e-05, "grad_norm": 19.635221481323242, "learning_rate": 1e-06, "loss": 0.3756, "mean_token_accuracy": 0.8806699514389038, "num_tokens": 832402800.0, "step": 21814 }, { "epoch": 2.775092227451978, "ewc_loss": 0.03460981324315071, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.460981315583922e-05, "grad_norm": 19.69001579284668, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8778855800628662, "num_tokens": 832445161.0, "step": 21815 }, { "epoch": 2.7752194377305686, "ewc_loss": 0.03464631363749504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.464631299721077e-05, "grad_norm": 19.660247802734375, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8608194589614868, "num_tokens": 832487905.0, "step": 21816 }, { "epoch": 2.775346648009159, "ewc_loss": 0.03468988090753555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.46898814314045e-05, "grad_norm": 19.672107696533203, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8766978979110718, "num_tokens": 832521107.0, "step": 21817 }, { "epoch": 2.7754738582877496, "ewc_loss": 0.03464311361312866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.464311521383934e-05, "grad_norm": 19.70632553100586, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8714276552200317, "num_tokens": 832557213.0, "step": 21818 }, { "epoch": 2.77560106856634, "ewc_loss": 0.034652210772037506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4652210160857067e-05, "grad_norm": 19.67909812927246, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8628681898117065, "num_tokens": 832600445.0, "step": 21819 }, { "epoch": 2.7757282788449307, "ewc_loss": 0.03468907251954079, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.468907380010933e-05, "grad_norm": 19.72272491455078, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8741014003753662, "num_tokens": 832635994.0, "step": 21820 }, { "epoch": 2.775855489123521, "ewc_loss": 0.03464582562446594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.464582550805062e-05, "grad_norm": 19.68332862854004, "learning_rate": 1e-06, "loss": 0.356, "mean_token_accuracy": 0.8858193159103394, "num_tokens": 832677151.0, "step": 21821 }, { "epoch": 2.7759826994021117, "ewc_loss": 0.03463887795805931, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4638876968529075e-05, "grad_norm": 19.65442657470703, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8740679025650024, "num_tokens": 832713995.0, "step": 21822 }, { "epoch": 2.7761099096807023, "ewc_loss": 0.03462809696793556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.462809763732366e-05, "grad_norm": 19.702938079833984, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8652685880661011, "num_tokens": 832754201.0, "step": 21823 }, { "epoch": 2.776237119959293, "ewc_loss": 0.034654125571250916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.46541237377096e-05, "grad_norm": 19.72637939453125, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8715811967849731, "num_tokens": 832796920.0, "step": 21824 }, { "epoch": 2.7763643302378833, "ewc_loss": 0.03465171903371811, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.465171903371811e-05, "grad_norm": 19.64426040649414, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8726968169212341, "num_tokens": 832835952.0, "step": 21825 }, { "epoch": 2.776491540516474, "ewc_loss": 0.03461237996816635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4612381568877026e-05, "grad_norm": 19.72092628479004, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8784918189048767, "num_tokens": 832878306.0, "step": 21826 }, { "epoch": 2.7766187507950644, "ewc_loss": 0.03463691100478172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4636912459973246e-05, "grad_norm": 19.682954788208008, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8673248291015625, "num_tokens": 832914315.0, "step": 21827 }, { "epoch": 2.776745961073655, "ewc_loss": 0.03467992693185806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.46799279213883e-05, "grad_norm": 19.72663688659668, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8712289929389954, "num_tokens": 832956661.0, "step": 21828 }, { "epoch": 2.7768731713522454, "ewc_loss": 0.03460683301091194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.460683365119621e-05, "grad_norm": 19.610218048095703, "learning_rate": 1e-06, "loss": 0.3714, "mean_token_accuracy": 0.8813661932945251, "num_tokens": 832999446.0, "step": 21829 }, { "epoch": 2.7770003816308355, "ewc_loss": 0.03463783487677574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.463783650659025e-05, "grad_norm": 19.737552642822266, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8484264612197876, "num_tokens": 833039001.0, "step": 21830 }, { "epoch": 2.7771275919094265, "ewc_loss": 0.034678660333156586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.467866190476343e-05, "grad_norm": 19.693368911743164, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8733457326889038, "num_tokens": 833074334.0, "step": 21831 }, { "epoch": 2.7772548021880166, "ewc_loss": 0.034580763429403305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.458076389506459e-05, "grad_norm": 19.722209930419922, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8638480305671692, "num_tokens": 833113089.0, "step": 21832 }, { "epoch": 2.7773820124666075, "ewc_loss": 0.03464687243103981, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.464687324594706e-05, "grad_norm": 19.703859329223633, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.872151255607605, "num_tokens": 833149755.0, "step": 21833 }, { "epoch": 2.7775092227451976, "ewc_loss": 0.034566786140203476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4566786780487746e-05, "grad_norm": 19.75722312927246, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8661059141159058, "num_tokens": 833187718.0, "step": 21834 }, { "epoch": 2.7776364330237886, "ewc_loss": 0.034606948494911194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.460695006651804e-05, "grad_norm": 19.662500381469727, "learning_rate": 1e-06, "loss": 0.3652, "mean_token_accuracy": 0.882234513759613, "num_tokens": 833221416.0, "step": 21835 }, { "epoch": 2.7777636433023787, "ewc_loss": 0.03456951677799225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4569515264593065e-05, "grad_norm": 19.721006393432617, "learning_rate": 1e-06, "loss": 0.352, "mean_token_accuracy": 0.8890135884284973, "num_tokens": 833261101.0, "step": 21836 }, { "epoch": 2.777890853580969, "ewc_loss": 0.034662846475839615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4662847610889e-05, "grad_norm": 19.687503814697266, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.8836239576339722, "num_tokens": 833296804.0, "step": 21837 }, { "epoch": 2.7780180638595597, "ewc_loss": 0.034566979855298996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.456697959336452e-05, "grad_norm": 19.68313217163086, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8752663135528564, "num_tokens": 833332005.0, "step": 21838 }, { "epoch": 2.7781452741381503, "ewc_loss": 0.03465079516172409, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.465079498710111e-05, "grad_norm": 19.727384567260742, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8717193603515625, "num_tokens": 833371991.0, "step": 21839 }, { "epoch": 2.778272484416741, "ewc_loss": 0.03462166339159012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.462166205281392e-05, "grad_norm": 19.647672653198242, "learning_rate": 1e-06, "loss": 0.357, "mean_token_accuracy": 0.8863853216171265, "num_tokens": 833407514.0, "step": 21840 }, { "epoch": 2.7783996946953313, "ewc_loss": 0.03459172695875168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.459172512521036e-05, "grad_norm": 19.638675689697266, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8731950521469116, "num_tokens": 833444519.0, "step": 21841 }, { "epoch": 2.778526904973922, "ewc_loss": 0.0346272736787796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.462727181613445e-05, "grad_norm": 19.668256759643555, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8703939914703369, "num_tokens": 833482252.0, "step": 21842 }, { "epoch": 2.7786541152525124, "ewc_loss": 0.034604527056217194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4604527172632515e-05, "grad_norm": 19.610721588134766, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8733572363853455, "num_tokens": 833515513.0, "step": 21843 }, { "epoch": 2.778781325531103, "ewc_loss": 0.034735057502985, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4735057852230966e-05, "grad_norm": 19.878145217895508, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8824788331985474, "num_tokens": 833551188.0, "step": 21844 }, { "epoch": 2.7789085358096934, "ewc_loss": 0.03469366580247879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4693664929363877e-05, "grad_norm": 19.61021614074707, "learning_rate": 1e-06, "loss": 0.3576, "mean_token_accuracy": 0.8845944404602051, "num_tokens": 833589159.0, "step": 21845 }, { "epoch": 2.779035746088284, "ewc_loss": 0.034553103148937225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4553104342194274e-05, "grad_norm": 19.711090087890625, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8659782409667969, "num_tokens": 833623581.0, "step": 21846 }, { "epoch": 2.7791629563668745, "ewc_loss": 0.0347413532435894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4741351555567235e-05, "grad_norm": 19.70220947265625, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8637220859527588, "num_tokens": 833660466.0, "step": 21847 }, { "epoch": 2.779290166645465, "ewc_loss": 0.03456820175051689, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4568201954243705e-05, "grad_norm": 19.591533660888672, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8648833632469177, "num_tokens": 833697465.0, "step": 21848 }, { "epoch": 2.7794173769240555, "ewc_loss": 0.03473934158682823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.473933975328691e-05, "grad_norm": 19.6616268157959, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8726906776428223, "num_tokens": 833735178.0, "step": 21849 }, { "epoch": 2.779544587202646, "ewc_loss": 0.034675076603889465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.467507485765964e-05, "grad_norm": 19.69255256652832, "learning_rate": 1e-06, "loss": 0.3391, "mean_token_accuracy": 0.8924590349197388, "num_tokens": 833770193.0, "step": 21850 }, { "epoch": 2.7796717974812366, "ewc_loss": 0.0347115583717823, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.471156014711596e-05, "grad_norm": 19.64867401123047, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8752439022064209, "num_tokens": 833807836.0, "step": 21851 }, { "epoch": 2.779799007759827, "ewc_loss": 0.034662071615457535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.466207272140309e-05, "grad_norm": 19.735971450805664, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8719311356544495, "num_tokens": 833847605.0, "step": 21852 }, { "epoch": 2.7799262180384177, "ewc_loss": 0.034789618104696274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478961662040092e-05, "grad_norm": 19.68447494506836, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8786197900772095, "num_tokens": 833884239.0, "step": 21853 }, { "epoch": 2.780053428317008, "ewc_loss": 0.03462976962327957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4629771107574925e-05, "grad_norm": 19.75973892211914, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8764330148696899, "num_tokens": 833921305.0, "step": 21854 }, { "epoch": 2.7801806385955983, "ewc_loss": 0.03473515808582306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4735159715637565e-05, "grad_norm": 19.646953582763672, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8668339252471924, "num_tokens": 833959297.0, "step": 21855 }, { "epoch": 2.7803078488741892, "ewc_loss": 0.0346280001103878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.462799941189587e-05, "grad_norm": 19.762479782104492, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8534252643585205, "num_tokens": 833992135.0, "step": 21856 }, { "epoch": 2.7804350591527793, "ewc_loss": 0.03469209745526314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.469209696049802e-05, "grad_norm": 19.639049530029297, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.86524498462677, "num_tokens": 834035933.0, "step": 21857 }, { "epoch": 2.7805622694313703, "ewc_loss": 0.03466310352087021, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.46631022694055e-05, "grad_norm": 19.740631103515625, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8744674324989319, "num_tokens": 834070187.0, "step": 21858 }, { "epoch": 2.7806894797099604, "ewc_loss": 0.03465191274881363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4651911846594885e-05, "grad_norm": 19.57733154296875, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8756625652313232, "num_tokens": 834107370.0, "step": 21859 }, { "epoch": 2.780816689988551, "ewc_loss": 0.03471871465444565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.471871605142951e-05, "grad_norm": 19.772188186645508, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.867455244064331, "num_tokens": 834145357.0, "step": 21860 }, { "epoch": 2.7809439002671414, "ewc_loss": 0.03472645953297615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.472646130830981e-05, "grad_norm": 19.620250701904297, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.872378945350647, "num_tokens": 834187250.0, "step": 21861 }, { "epoch": 2.781071110545732, "ewc_loss": 0.03466615453362465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.466615453362465e-05, "grad_norm": 19.77154541015625, "learning_rate": 1e-06, "loss": 0.3575, "mean_token_accuracy": 0.8847078084945679, "num_tokens": 834227598.0, "step": 21862 }, { "epoch": 2.7811983208243225, "ewc_loss": 0.034729499369859695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.472949902061373e-05, "grad_norm": 19.627002716064453, "learning_rate": 1e-06, "loss": 0.3479, "mean_token_accuracy": 0.888656497001648, "num_tokens": 834267032.0, "step": 21863 }, { "epoch": 2.781325531102913, "ewc_loss": 0.034605175256729126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.460517473286018e-05, "grad_norm": 19.719295501708984, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8650671243667603, "num_tokens": 834308479.0, "step": 21864 }, { "epoch": 2.7814527413815036, "ewc_loss": 0.03470971807837486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.470971932983957e-05, "grad_norm": 19.68012046813965, "learning_rate": 1e-06, "loss": 0.3538, "mean_token_accuracy": 0.8832334876060486, "num_tokens": 834346185.0, "step": 21865 }, { "epoch": 2.781579951660094, "ewc_loss": 0.03473120555281639, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4731205232674256e-05, "grad_norm": 19.74530792236328, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.8833642601966858, "num_tokens": 834384404.0, "step": 21866 }, { "epoch": 2.7817071619386846, "ewc_loss": 0.034749895334243774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.474989716778509e-05, "grad_norm": 19.672842025756836, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8722320795059204, "num_tokens": 834424833.0, "step": 21867 }, { "epoch": 2.781834372217275, "ewc_loss": 0.03466074913740158, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.466074849711731e-05, "grad_norm": 19.650205612182617, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8771072626113892, "num_tokens": 834464215.0, "step": 21868 }, { "epoch": 2.7819615824958657, "ewc_loss": 0.034705955535173416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.470595402177423e-05, "grad_norm": 19.766925811767578, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8522325158119202, "num_tokens": 834502452.0, "step": 21869 }, { "epoch": 2.782088792774456, "ewc_loss": 0.0346868634223938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4686861908994615e-05, "grad_norm": 19.673233032226562, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8750536441802979, "num_tokens": 834539690.0, "step": 21870 }, { "epoch": 2.7822160030530467, "ewc_loss": 0.034654486924409866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4654487535590306e-05, "grad_norm": 19.781112670898438, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.864872932434082, "num_tokens": 834583193.0, "step": 21871 }, { "epoch": 2.7823432133316373, "ewc_loss": 0.034639135003089905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.463913526502438e-05, "grad_norm": 19.716997146606445, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8692126870155334, "num_tokens": 834620028.0, "step": 21872 }, { "epoch": 2.782470423610228, "ewc_loss": 0.034639738500118256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4639739169506356e-05, "grad_norm": 19.713354110717773, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8653372526168823, "num_tokens": 834657351.0, "step": 21873 }, { "epoch": 2.7825976338888183, "ewc_loss": 0.034658756107091904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4658754884731025e-05, "grad_norm": 19.686716079711914, "learning_rate": 1e-06, "loss": 0.3313, "mean_token_accuracy": 0.8891155123710632, "num_tokens": 834693687.0, "step": 21874 }, { "epoch": 2.782724844167409, "ewc_loss": 0.034618012607097626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4618013160070404e-05, "grad_norm": 19.720739364624023, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8737891912460327, "num_tokens": 834736171.0, "step": 21875 }, { "epoch": 2.7828520544459994, "ewc_loss": 0.03467322513461113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4673226764425635e-05, "grad_norm": 19.709531784057617, "learning_rate": 1e-06, "loss": 0.3491, "mean_token_accuracy": 0.8889472484588623, "num_tokens": 834775699.0, "step": 21876 }, { "epoch": 2.78297926472459, "ewc_loss": 0.03465549647808075, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.465549525571987e-05, "grad_norm": 19.701200485229492, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8694779872894287, "num_tokens": 834816053.0, "step": 21877 }, { "epoch": 2.78310647500318, "ewc_loss": 0.03456747904419899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4567477996461093e-05, "grad_norm": 19.66761016845703, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8710348606109619, "num_tokens": 834857125.0, "step": 21878 }, { "epoch": 2.783233685281771, "ewc_loss": 0.034579358994960785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4579359635245055e-05, "grad_norm": 19.63565444946289, "learning_rate": 1e-06, "loss": 0.3607, "mean_token_accuracy": 0.8872988820075989, "num_tokens": 834895676.0, "step": 21879 }, { "epoch": 2.783360895560361, "ewc_loss": 0.0346006378531456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4600638173287734e-05, "grad_norm": 19.706396102905273, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8780539035797119, "num_tokens": 834927748.0, "step": 21880 }, { "epoch": 2.783488105838952, "ewc_loss": 0.034609854221343994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.46098531736061e-05, "grad_norm": 19.627531051635742, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8711985349655151, "num_tokens": 834966722.0, "step": 21881 }, { "epoch": 2.783615316117542, "ewc_loss": 0.03459245711565018, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4592456358950585e-05, "grad_norm": 19.652929306030273, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8771907091140747, "num_tokens": 835003770.0, "step": 21882 }, { "epoch": 2.783742526396133, "ewc_loss": 0.034662529826164246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4662531106732786e-05, "grad_norm": 19.666383743286133, "learning_rate": 1e-06, "loss": 0.366, "mean_token_accuracy": 0.883147120475769, "num_tokens": 835041504.0, "step": 21883 }, { "epoch": 2.783869736674723, "ewc_loss": 0.034570347517728806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.457034836173989e-05, "grad_norm": 19.684185028076172, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.877152681350708, "num_tokens": 835075179.0, "step": 21884 }, { "epoch": 2.7839969469533137, "ewc_loss": 0.03471798449754715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4717984817689285e-05, "grad_norm": 19.681621551513672, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8646004796028137, "num_tokens": 835113080.0, "step": 21885 }, { "epoch": 2.784124157231904, "ewc_loss": 0.034635916352272034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.46359156537801e-05, "grad_norm": 19.67460823059082, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8744453191757202, "num_tokens": 835150586.0, "step": 21886 }, { "epoch": 2.7842513675104947, "ewc_loss": 0.03464334085583687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.464334076852538e-05, "grad_norm": 19.62533187866211, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8725032806396484, "num_tokens": 835184049.0, "step": 21887 }, { "epoch": 2.7843785777890853, "ewc_loss": 0.03463898226618767, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.463898246991448e-05, "grad_norm": 19.576793670654297, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8693225383758545, "num_tokens": 835222745.0, "step": 21888 }, { "epoch": 2.784505788067676, "ewc_loss": 0.03470504656434059, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4705048165051267e-05, "grad_norm": 19.690183639526367, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8629441261291504, "num_tokens": 835263017.0, "step": 21889 }, { "epoch": 2.7846329983462663, "ewc_loss": 0.03479577600955963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479577571852133e-05, "grad_norm": 19.71368408203125, "learning_rate": 1e-06, "loss": 0.3485, "mean_token_accuracy": 0.8880501389503479, "num_tokens": 835303916.0, "step": 21890 }, { "epoch": 2.784760208624857, "ewc_loss": 0.034678682684898376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.467868373263627e-05, "grad_norm": 19.603422164916992, "learning_rate": 1e-06, "loss": 0.3524, "mean_token_accuracy": 0.8899385929107666, "num_tokens": 835343148.0, "step": 21891 }, { "epoch": 2.7848874189034474, "ewc_loss": 0.034701984375715256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4701984986895695e-05, "grad_norm": 19.748106002807617, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8722777366638184, "num_tokens": 835382337.0, "step": 21892 }, { "epoch": 2.785014629182038, "ewc_loss": 0.03476811945438385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4768119803629816e-05, "grad_norm": 19.640378952026367, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8682722449302673, "num_tokens": 835423645.0, "step": 21893 }, { "epoch": 2.7851418394606284, "ewc_loss": 0.03470328077673912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.470328010735102e-05, "grad_norm": 19.687902450561523, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.847468376159668, "num_tokens": 835464590.0, "step": 21894 }, { "epoch": 2.785269049739219, "ewc_loss": 0.0347924530506134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479245424387045e-05, "grad_norm": 19.694040298461914, "learning_rate": 1e-06, "loss": 0.3252, "mean_token_accuracy": 0.8978309035301208, "num_tokens": 835496376.0, "step": 21895 }, { "epoch": 2.7853962600178095, "ewc_loss": 0.034718628972768784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.471862873993814e-05, "grad_norm": 19.675600051879883, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.860145092010498, "num_tokens": 835541231.0, "step": 21896 }, { "epoch": 2.7855234702964, "ewc_loss": 0.03474566712975502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4745666198432446e-05, "grad_norm": 19.661651611328125, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.87844318151474, "num_tokens": 835584038.0, "step": 21897 }, { "epoch": 2.7856506805749905, "ewc_loss": 0.03467739745974541, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.467739588813856e-05, "grad_norm": 19.6441707611084, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8687577843666077, "num_tokens": 835627189.0, "step": 21898 }, { "epoch": 2.785777890853581, "ewc_loss": 0.03469490259885788, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.469490184215829e-05, "grad_norm": 19.59906005859375, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.875102162361145, "num_tokens": 835659701.0, "step": 21899 }, { "epoch": 2.7859051011321716, "ewc_loss": 0.03474994748830795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.474994809948839e-05, "grad_norm": 19.728788375854492, "learning_rate": 1e-06, "loss": 0.3588, "mean_token_accuracy": 0.8806236982345581, "num_tokens": 835694045.0, "step": 21900 }, { "epoch": 2.786032311410762, "ewc_loss": 0.0347401462495327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.474014738458209e-05, "grad_norm": 19.659204483032227, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8718801736831665, "num_tokens": 835732118.0, "step": 21901 }, { "epoch": 2.7861595216893527, "ewc_loss": 0.034634631127119064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.46346314472612e-05, "grad_norm": 19.62552833557129, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8836354613304138, "num_tokens": 835772700.0, "step": 21902 }, { "epoch": 2.7862867319679427, "ewc_loss": 0.03474844992160797, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.474844925221987e-05, "grad_norm": 19.68412971496582, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8584001660346985, "num_tokens": 835811206.0, "step": 21903 }, { "epoch": 2.7864139422465337, "ewc_loss": 0.03471200168132782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.471200034255162e-05, "grad_norm": 19.702808380126953, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8801021575927734, "num_tokens": 835844381.0, "step": 21904 }, { "epoch": 2.786541152525124, "ewc_loss": 0.03473258391022682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4732584026642144e-05, "grad_norm": 19.710412979125977, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8670119047164917, "num_tokens": 835879926.0, "step": 21905 }, { "epoch": 2.7866683628037148, "ewc_loss": 0.03464282304048538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.464282417553477e-05, "grad_norm": 19.593021392822266, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8637371063232422, "num_tokens": 835924014.0, "step": 21906 }, { "epoch": 2.786795573082305, "ewc_loss": 0.03466381877660751, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4663818951230496e-05, "grad_norm": 19.77223014831543, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8689035773277283, "num_tokens": 835956054.0, "step": 21907 }, { "epoch": 2.786922783360896, "ewc_loss": 0.034691669046878815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.469166767899878e-05, "grad_norm": 19.629724502563477, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8801093101501465, "num_tokens": 835988885.0, "step": 21908 }, { "epoch": 2.787049993639486, "ewc_loss": 0.03468379005789757, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.468379145488143e-05, "grad_norm": 19.700790405273438, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8731091022491455, "num_tokens": 836028885.0, "step": 21909 }, { "epoch": 2.7871772039180764, "ewc_loss": 0.03476153314113617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.476153142401017e-05, "grad_norm": 19.653963088989258, "learning_rate": 1e-06, "loss": 0.3593, "mean_token_accuracy": 0.888045072555542, "num_tokens": 836067340.0, "step": 21910 }, { "epoch": 2.787304414196667, "ewc_loss": 0.03473310172557831, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.473310061963275e-05, "grad_norm": 19.747764587402344, "learning_rate": 1e-06, "loss": 0.3814, "mean_token_accuracy": 0.8750593662261963, "num_tokens": 836104432.0, "step": 21911 }, { "epoch": 2.7874316244752575, "ewc_loss": 0.03473547101020813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4735472581814975e-05, "grad_norm": 19.643447875976562, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8695489764213562, "num_tokens": 836141197.0, "step": 21912 }, { "epoch": 2.787558834753848, "ewc_loss": 0.03466907516121864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4669075830606744e-05, "grad_norm": 19.717552185058594, "learning_rate": 1e-06, "loss": 0.3609, "mean_token_accuracy": 0.8860946297645569, "num_tokens": 836176949.0, "step": 21913 }, { "epoch": 2.7876860450324386, "ewc_loss": 0.03478730469942093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478730286587961e-05, "grad_norm": 19.734350204467773, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8723580837249756, "num_tokens": 836206484.0, "step": 21914 }, { "epoch": 2.787813255311029, "ewc_loss": 0.03472617268562317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.472617390798405e-05, "grad_norm": 19.67028045654297, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8797104358673096, "num_tokens": 836243056.0, "step": 21915 }, { "epoch": 2.7879404655896196, "ewc_loss": 0.03477351367473602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.477351492620073e-05, "grad_norm": 19.727392196655273, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8699856996536255, "num_tokens": 836281154.0, "step": 21916 }, { "epoch": 2.78806767586821, "ewc_loss": 0.034771110862493515, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4771110222209245e-05, "grad_norm": 19.69940757751465, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8744345307350159, "num_tokens": 836319895.0, "step": 21917 }, { "epoch": 2.7881948861468007, "ewc_loss": 0.03469151630997658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4691514883888885e-05, "grad_norm": 19.668874740600586, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8797104358673096, "num_tokens": 836357227.0, "step": 21918 }, { "epoch": 2.788322096425391, "ewc_loss": 0.03472365811467171, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.472365642664954e-05, "grad_norm": 19.71902084350586, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8676161766052246, "num_tokens": 836398442.0, "step": 21919 }, { "epoch": 2.7884493067039817, "ewc_loss": 0.034707311540842056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.470731098786928e-05, "grad_norm": 19.66254997253418, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8821690082550049, "num_tokens": 836436660.0, "step": 21920 }, { "epoch": 2.7885765169825723, "ewc_loss": 0.034705400466918945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4705401048995554e-05, "grad_norm": 19.683107376098633, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8684307932853699, "num_tokens": 836479180.0, "step": 21921 }, { "epoch": 2.788703727261163, "ewc_loss": 0.03473241627216339, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.473241667961702e-05, "grad_norm": 19.760133743286133, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8807566165924072, "num_tokens": 836518874.0, "step": 21922 }, { "epoch": 2.7888309375397533, "ewc_loss": 0.034696612507104874, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.469661169219762e-05, "grad_norm": 19.700937271118164, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.870798647403717, "num_tokens": 836556910.0, "step": 21923 }, { "epoch": 2.788958147818344, "ewc_loss": 0.03466456010937691, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.466456109890714e-05, "grad_norm": 19.746755599975586, "learning_rate": 1e-06, "loss": 0.3654, "mean_token_accuracy": 0.8865526914596558, "num_tokens": 836598151.0, "step": 21924 }, { "epoch": 2.7890853580969344, "ewc_loss": 0.03471962735056877, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.471962554613128e-05, "grad_norm": 19.76583480834961, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8664072751998901, "num_tokens": 836637952.0, "step": 21925 }, { "epoch": 2.789212568375525, "ewc_loss": 0.03463706746697426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.463706889306195e-05, "grad_norm": 19.628055572509766, "learning_rate": 1e-06, "loss": 0.3493, "mean_token_accuracy": 0.8883454203605652, "num_tokens": 836677138.0, "step": 21926 }, { "epoch": 2.7893397786541154, "ewc_loss": 0.03468678146600723, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.468678187346086e-05, "grad_norm": 19.8164119720459, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.8796868324279785, "num_tokens": 836716880.0, "step": 21927 }, { "epoch": 2.7894669889327055, "ewc_loss": 0.034661151468753815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.46611523127649e-05, "grad_norm": 19.68901252746582, "learning_rate": 1e-06, "loss": 0.3499, "mean_token_accuracy": 0.8870618343353271, "num_tokens": 836752572.0, "step": 21928 }, { "epoch": 2.7895941992112965, "ewc_loss": 0.034637708216905594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4637709177332e-05, "grad_norm": 19.73503875732422, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8713029623031616, "num_tokens": 836794321.0, "step": 21929 }, { "epoch": 2.7897214094898866, "ewc_loss": 0.03462985157966614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.462985114310868e-05, "grad_norm": 19.676027297973633, "learning_rate": 1e-06, "loss": 0.3364, "mean_token_accuracy": 0.889127254486084, "num_tokens": 836829183.0, "step": 21930 }, { "epoch": 2.7898486197684775, "ewc_loss": 0.03459587320685387, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4595872421050444e-05, "grad_norm": 19.699739456176758, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8638597726821899, "num_tokens": 836864644.0, "step": 21931 }, { "epoch": 2.7899758300470676, "ewc_loss": 0.034591980278491974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4591979783726856e-05, "grad_norm": 19.691123962402344, "learning_rate": 1e-06, "loss": 0.3563, "mean_token_accuracy": 0.8865638375282288, "num_tokens": 836895384.0, "step": 21932 }, { "epoch": 2.7901030403256586, "ewc_loss": 0.03465129807591438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4651297028176486e-05, "grad_norm": 19.676956176757812, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8769913911819458, "num_tokens": 836943336.0, "step": 21933 }, { "epoch": 2.7902302506042487, "ewc_loss": 0.034627459943294525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.462746099103242e-05, "grad_norm": 19.620500564575195, "learning_rate": 1e-06, "loss": 0.3454, "mean_token_accuracy": 0.8881440162658691, "num_tokens": 836980708.0, "step": 21934 }, { "epoch": 2.790357460882839, "ewc_loss": 0.0346364825963974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.463648317847401e-05, "grad_norm": 19.720529556274414, "learning_rate": 1e-06, "loss": 0.3836, "mean_token_accuracy": 0.8793648481369019, "num_tokens": 837015738.0, "step": 21935 }, { "epoch": 2.7904846711614297, "ewc_loss": 0.0347038209438324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.470382216619328e-05, "grad_norm": 19.654376983642578, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.881047785282135, "num_tokens": 837053014.0, "step": 21936 }, { "epoch": 2.7906118814400203, "ewc_loss": 0.034619927406311035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4619926736922935e-05, "grad_norm": 19.72057342529297, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8689658641815186, "num_tokens": 837083579.0, "step": 21937 }, { "epoch": 2.790739091718611, "ewc_loss": 0.03472914546728134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.472914613666944e-05, "grad_norm": 19.64360809326172, "learning_rate": 1e-06, "loss": 0.3632, "mean_token_accuracy": 0.884016752243042, "num_tokens": 837126201.0, "step": 21938 }, { "epoch": 2.7908663019972013, "ewc_loss": 0.034671250730752945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4671251341933385e-05, "grad_norm": 19.731342315673828, "learning_rate": 1e-06, "loss": 0.3626, "mean_token_accuracy": 0.8842301368713379, "num_tokens": 837162400.0, "step": 21939 }, { "epoch": 2.790993512275792, "ewc_loss": 0.03472272679209709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4722725104074925e-05, "grad_norm": 19.66921615600586, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8782879710197449, "num_tokens": 837205733.0, "step": 21940 }, { "epoch": 2.7911207225543824, "ewc_loss": 0.03460853919386864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.460853986325674e-05, "grad_norm": 19.7152156829834, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8645544052124023, "num_tokens": 837243838.0, "step": 21941 }, { "epoch": 2.791247932832973, "ewc_loss": 0.03472678363323212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.472678508842364e-05, "grad_norm": 19.75226593017578, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8712437748908997, "num_tokens": 837280936.0, "step": 21942 }, { "epoch": 2.7913751431115634, "ewc_loss": 0.03467481583356857, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4674816561164334e-05, "grad_norm": 19.659591674804688, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8754331469535828, "num_tokens": 837321790.0, "step": 21943 }, { "epoch": 2.791502353390154, "ewc_loss": 0.034642674028873444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.464267501840368e-05, "grad_norm": 19.70046615600586, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.877880871295929, "num_tokens": 837362023.0, "step": 21944 }, { "epoch": 2.7916295636687445, "ewc_loss": 0.03472602367401123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.472602475085296e-05, "grad_norm": 19.76024055480957, "learning_rate": 1e-06, "loss": 0.3646, "mean_token_accuracy": 0.8816589713096619, "num_tokens": 837404876.0, "step": 21945 }, { "epoch": 2.791756773947335, "ewc_loss": 0.03468620032072067, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4686199796851724e-05, "grad_norm": 19.734416961669922, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8680475950241089, "num_tokens": 837442899.0, "step": 21946 }, { "epoch": 2.7918839842259255, "ewc_loss": 0.03465699031949043, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4656990465009585e-05, "grad_norm": 19.674467086791992, "learning_rate": 1e-06, "loss": 0.3656, "mean_token_accuracy": 0.8848270177841187, "num_tokens": 837477611.0, "step": 21947 }, { "epoch": 2.792011194504516, "ewc_loss": 0.034591883420944214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.459188519627787e-05, "grad_norm": 19.703174591064453, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8636729717254639, "num_tokens": 837515928.0, "step": 21948 }, { "epoch": 2.7921384047831066, "ewc_loss": 0.03466083109378815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4660832170629874e-05, "grad_norm": 19.687009811401367, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8769856691360474, "num_tokens": 837553042.0, "step": 21949 }, { "epoch": 2.792265615061697, "ewc_loss": 0.03457748517394066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.45774860761594e-05, "grad_norm": 19.671903610229492, "learning_rate": 1e-06, "loss": 0.3761, "mean_token_accuracy": 0.8811007142066956, "num_tokens": 837589749.0, "step": 21950 }, { "epoch": 2.7923928253402877, "ewc_loss": 0.03469770401716232, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.469770308583975e-05, "grad_norm": 19.742734909057617, "learning_rate": 1e-06, "loss": 0.3416, "mean_token_accuracy": 0.893089771270752, "num_tokens": 837631104.0, "step": 21951 }, { "epoch": 2.792520035618878, "ewc_loss": 0.03467218205332756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4672182664508e-05, "grad_norm": 19.697460174560547, "learning_rate": 1e-06, "loss": 0.3603, "mean_token_accuracy": 0.8837421536445618, "num_tokens": 837668173.0, "step": 21952 }, { "epoch": 2.7926472458974683, "ewc_loss": 0.03463000804185867, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4630007576197386e-05, "grad_norm": 19.721351623535156, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8816400170326233, "num_tokens": 837704282.0, "step": 21953 }, { "epoch": 2.7927744561760592, "ewc_loss": 0.03468187525868416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.468187424005009e-05, "grad_norm": 19.670270919799805, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8661094903945923, "num_tokens": 837750790.0, "step": 21954 }, { "epoch": 2.7929016664546493, "ewc_loss": 0.034691207110881805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.469120565569028e-05, "grad_norm": 19.771747589111328, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8729974031448364, "num_tokens": 837792041.0, "step": 21955 }, { "epoch": 2.7930288767332403, "ewc_loss": 0.03464341536164284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.464341352810152e-05, "grad_norm": 19.697566986083984, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.881110668182373, "num_tokens": 837818645.0, "step": 21956 }, { "epoch": 2.7931560870118304, "ewc_loss": 0.03463279455900192, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.463279426796362e-05, "grad_norm": 19.73225212097168, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8852472305297852, "num_tokens": 837859816.0, "step": 21957 }, { "epoch": 2.793283297290421, "ewc_loss": 0.03461819514632225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.461819505901076e-05, "grad_norm": 19.639827728271484, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8703564405441284, "num_tokens": 837903970.0, "step": 21958 }, { "epoch": 2.7934105075690114, "ewc_loss": 0.03460821509361267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4608216083142906e-05, "grad_norm": 19.721689224243164, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8761158585548401, "num_tokens": 837941158.0, "step": 21959 }, { "epoch": 2.793537717847602, "ewc_loss": 0.03465189039707184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.465189001872204e-05, "grad_norm": 19.60506820678711, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8815785646438599, "num_tokens": 837978157.0, "step": 21960 }, { "epoch": 2.7936649281261925, "ewc_loss": 0.03467031940817833, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.467032001935877e-05, "grad_norm": 19.71588134765625, "learning_rate": 1e-06, "loss": 0.3498, "mean_token_accuracy": 0.8863868117332458, "num_tokens": 838017871.0, "step": 21961 }, { "epoch": 2.793792138404783, "ewc_loss": 0.03472801670432091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4728018363239244e-05, "grad_norm": 19.716880798339844, "learning_rate": 1e-06, "loss": 0.3537, "mean_token_accuracy": 0.8870717287063599, "num_tokens": 838050521.0, "step": 21962 }, { "epoch": 2.7939193486833735, "ewc_loss": 0.034617792814970016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.461779124336317e-05, "grad_norm": 19.669477462768555, "learning_rate": 1e-06, "loss": 0.3609, "mean_token_accuracy": 0.883297324180603, "num_tokens": 838084450.0, "step": 21963 }, { "epoch": 2.794046558961964, "ewc_loss": 0.03472716361284256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.472716343821958e-05, "grad_norm": 19.732879638671875, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8700383901596069, "num_tokens": 838121505.0, "step": 21964 }, { "epoch": 2.7941737692405546, "ewc_loss": 0.03472958505153656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.47295863321051e-05, "grad_norm": 19.659461975097656, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8739027976989746, "num_tokens": 838166148.0, "step": 21965 }, { "epoch": 2.794300979519145, "ewc_loss": 0.03472493961453438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4724940633168444e-05, "grad_norm": 19.7235050201416, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.880569577217102, "num_tokens": 838203578.0, "step": 21966 }, { "epoch": 2.7944281897977357, "ewc_loss": 0.034752070903778076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.475206904113293e-05, "grad_norm": 19.64983367919922, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8810244202613831, "num_tokens": 838240702.0, "step": 21967 }, { "epoch": 2.794555400076326, "ewc_loss": 0.034648481756448746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.46484812325798e-05, "grad_norm": 19.698083877563477, "learning_rate": 1e-06, "loss": 0.3642, "mean_token_accuracy": 0.8825006484985352, "num_tokens": 838275639.0, "step": 21968 }, { "epoch": 2.7946826103549167, "ewc_loss": 0.03474672883749008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4746728488244116e-05, "grad_norm": 19.704553604125977, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8771229386329651, "num_tokens": 838310045.0, "step": 21969 }, { "epoch": 2.7948098206335072, "ewc_loss": 0.03471978381276131, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4719785617198795e-05, "grad_norm": 19.648279190063477, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8765668272972107, "num_tokens": 838351571.0, "step": 21970 }, { "epoch": 2.7949370309120978, "ewc_loss": 0.03476940840482712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4769407648127526e-05, "grad_norm": 19.663511276245117, "learning_rate": 1e-06, "loss": 0.3518, "mean_token_accuracy": 0.888359785079956, "num_tokens": 838393804.0, "step": 21971 }, { "epoch": 2.7950642411906883, "ewc_loss": 0.03471781313419342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.471781383268535e-05, "grad_norm": 19.6314754486084, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.8758130669593811, "num_tokens": 838435516.0, "step": 21972 }, { "epoch": 2.795191451469279, "ewc_loss": 0.03473193570971489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.473193646641448e-05, "grad_norm": 19.701202392578125, "learning_rate": 1e-06, "loss": 0.3609, "mean_token_accuracy": 0.8842427730560303, "num_tokens": 838465425.0, "step": 21973 }, { "epoch": 2.7953186617478694, "ewc_loss": 0.03479732945561409, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4797329135471955e-05, "grad_norm": 19.656721115112305, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8738452792167664, "num_tokens": 838512832.0, "step": 21974 }, { "epoch": 2.79544587202646, "ewc_loss": 0.03472777456045151, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.472777461865917e-05, "grad_norm": 19.702838897705078, "learning_rate": 1e-06, "loss": 0.3721, "mean_token_accuracy": 0.8825642466545105, "num_tokens": 838552806.0, "step": 21975 }, { "epoch": 2.79557308230505, "ewc_loss": 0.03474419564008713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.474419645499438e-05, "grad_norm": 19.620716094970703, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8744564056396484, "num_tokens": 838588592.0, "step": 21976 }, { "epoch": 2.795700292583641, "ewc_loss": 0.03470733016729355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.470732917776331e-05, "grad_norm": 19.726125717163086, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.874101996421814, "num_tokens": 838629402.0, "step": 21977 }, { "epoch": 2.795827502862231, "ewc_loss": 0.034850724041461945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485072375042364e-05, "grad_norm": 19.72278594970703, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8670826554298401, "num_tokens": 838676580.0, "step": 21978 }, { "epoch": 2.795954713140822, "ewc_loss": 0.03468552604317665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.468552677077241e-05, "grad_norm": 19.741487503051758, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8710678219795227, "num_tokens": 838715773.0, "step": 21979 }, { "epoch": 2.796081923419412, "ewc_loss": 0.034746501594781876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.474650293355808e-05, "grad_norm": 19.706790924072266, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8657454252243042, "num_tokens": 838750270.0, "step": 21980 }, { "epoch": 2.796209133698003, "ewc_loss": 0.034712065011262894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4712065826170146e-05, "grad_norm": 19.75129508972168, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.85540372133255, "num_tokens": 838794823.0, "step": 21981 }, { "epoch": 2.796336343976593, "ewc_loss": 0.03473944589495659, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.473944525467232e-05, "grad_norm": 19.722047805786133, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8669779300689697, "num_tokens": 838828901.0, "step": 21982 }, { "epoch": 2.7964635542551837, "ewc_loss": 0.03466157987713814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4661581594264135e-05, "grad_norm": 19.78042221069336, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8670807480812073, "num_tokens": 838872648.0, "step": 21983 }, { "epoch": 2.796590764533774, "ewc_loss": 0.034717582166194916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.47175810020417e-05, "grad_norm": 19.704227447509766, "learning_rate": 1e-06, "loss": 0.3478, "mean_token_accuracy": 0.8895432353019714, "num_tokens": 838905150.0, "step": 21984 }, { "epoch": 2.7967179748123647, "ewc_loss": 0.03467922657728195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.467922579147853e-05, "grad_norm": 19.630578994750977, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8758544921875, "num_tokens": 838949027.0, "step": 21985 }, { "epoch": 2.7968451850909553, "ewc_loss": 0.034755244851112366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.475524499663152e-05, "grad_norm": 19.828609466552734, "learning_rate": 1e-06, "loss": 0.3837, "mean_token_accuracy": 0.877285897731781, "num_tokens": 838987320.0, "step": 21986 }, { "epoch": 2.796972395369546, "ewc_loss": 0.0347556546330452, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.475565608823672e-05, "grad_norm": 19.652088165283203, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8710886836051941, "num_tokens": 839027396.0, "step": 21987 }, { "epoch": 2.7970996056481363, "ewc_loss": 0.03462975099682808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.462974927970208e-05, "grad_norm": 19.723608016967773, "learning_rate": 1e-06, "loss": 0.372, "mean_token_accuracy": 0.8817763328552246, "num_tokens": 839069779.0, "step": 21988 }, { "epoch": 2.797226815926727, "ewc_loss": 0.03470553457736969, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.470553565421142e-05, "grad_norm": 19.73668098449707, "learning_rate": 1e-06, "loss": 0.3636, "mean_token_accuracy": 0.8823041915893555, "num_tokens": 839110919.0, "step": 21989 }, { "epoch": 2.7973540262053174, "ewc_loss": 0.03460757061839104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.460757216089405e-05, "grad_norm": 19.63163948059082, "learning_rate": 1e-06, "loss": 0.3538, "mean_token_accuracy": 0.8884739875793457, "num_tokens": 839143689.0, "step": 21990 }, { "epoch": 2.797481236483908, "ewc_loss": 0.03463801369071007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4638014767551795e-05, "grad_norm": 19.858564376831055, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8580413460731506, "num_tokens": 839181274.0, "step": 21991 }, { "epoch": 2.7976084467624984, "ewc_loss": 0.034717634320259094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4717635571723804e-05, "grad_norm": 19.640567779541016, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8603111505508423, "num_tokens": 839217038.0, "step": 21992 }, { "epoch": 2.797735657041089, "ewc_loss": 0.03463318571448326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4633187169674784e-05, "grad_norm": 19.788047790527344, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8747905492782593, "num_tokens": 839260104.0, "step": 21993 }, { "epoch": 2.7978628673196795, "ewc_loss": 0.034656886011362076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.465688496362418e-05, "grad_norm": 19.538372039794922, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.8787021040916443, "num_tokens": 839294328.0, "step": 21994 }, { "epoch": 2.79799007759827, "ewc_loss": 0.034614041447639465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.461404048721306e-05, "grad_norm": 19.72422218322754, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8744673728942871, "num_tokens": 839326662.0, "step": 21995 }, { "epoch": 2.7981172878768605, "ewc_loss": 0.03473260998725891, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.473260949249379e-05, "grad_norm": 19.675888061523438, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8768746852874756, "num_tokens": 839355997.0, "step": 21996 }, { "epoch": 2.798244498155451, "ewc_loss": 0.03472939506173134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4729393519228324e-05, "grad_norm": 19.716232299804688, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.8810209631919861, "num_tokens": 839396596.0, "step": 21997 }, { "epoch": 2.7983717084340416, "ewc_loss": 0.034795768558979034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479576844256371e-05, "grad_norm": 19.6747989654541, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8767021894454956, "num_tokens": 839432251.0, "step": 21998 }, { "epoch": 2.798498918712632, "ewc_loss": 0.034736551344394684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.473655306152068e-05, "grad_norm": 19.699146270751953, "learning_rate": 1e-06, "loss": 0.3567, "mean_token_accuracy": 0.8840789794921875, "num_tokens": 839465987.0, "step": 21999 }, { "epoch": 2.7986261289912227, "ewc_loss": 0.034776657819747925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.477665813989006e-05, "grad_norm": 19.64120101928711, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8785271644592285, "num_tokens": 839506147.0, "step": 22000 }, { "epoch": 2.7987533392698127, "ewc_loss": 0.03473328799009323, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.473328615655191e-05, "grad_norm": 19.685283660888672, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.867763340473175, "num_tokens": 839544620.0, "step": 22001 }, { "epoch": 2.7988805495484037, "ewc_loss": 0.034759700298309326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4759701520670205e-05, "grad_norm": 19.671899795532227, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8683346509933472, "num_tokens": 839583217.0, "step": 22002 }, { "epoch": 2.799007759826994, "ewc_loss": 0.03478686511516571, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478686630842276e-05, "grad_norm": 19.613134384155273, "learning_rate": 1e-06, "loss": 0.3437, "mean_token_accuracy": 0.8937724232673645, "num_tokens": 839621283.0, "step": 22003 }, { "epoch": 2.7991349701055848, "ewc_loss": 0.03474913537502289, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.474913683021441e-05, "grad_norm": 19.636411666870117, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8706411123275757, "num_tokens": 839658072.0, "step": 22004 }, { "epoch": 2.799262180384175, "ewc_loss": 0.034854453057050705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485445267870091e-05, "grad_norm": 19.6944580078125, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8582133054733276, "num_tokens": 839700926.0, "step": 22005 }, { "epoch": 2.799389390662766, "ewc_loss": 0.03479975089430809, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479975202935748e-05, "grad_norm": 19.67795753479004, "learning_rate": 1e-06, "loss": 0.3528, "mean_token_accuracy": 0.8849098086357117, "num_tokens": 839736160.0, "step": 22006 }, { "epoch": 2.799516600941356, "ewc_loss": 0.03482542559504509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4825425245799124e-05, "grad_norm": 19.624065399169922, "learning_rate": 1e-06, "loss": 0.3417, "mean_token_accuracy": 0.8904550075531006, "num_tokens": 839776834.0, "step": 22007 }, { "epoch": 2.7996438112199464, "ewc_loss": 0.03478815406560898, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478815415292047e-05, "grad_norm": 19.633056640625, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.86888587474823, "num_tokens": 839818156.0, "step": 22008 }, { "epoch": 2.799771021498537, "ewc_loss": 0.03482766076922417, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4827662602765486e-05, "grad_norm": 19.636959075927734, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8694047927856445, "num_tokens": 839857133.0, "step": 22009 }, { "epoch": 2.7998982317771275, "ewc_loss": 0.03485965356230736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485965498839505e-05, "grad_norm": 19.687480926513672, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8591645359992981, "num_tokens": 839893129.0, "step": 22010 }, { "epoch": 2.800025442055718, "ewc_loss": 0.034843772649765015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484377157292329e-05, "grad_norm": 19.693161010742188, "learning_rate": 1e-06, "loss": 0.3757, "mean_token_accuracy": 0.8786402344703674, "num_tokens": 839924622.0, "step": 22011 }, { "epoch": 2.8001526523343085, "ewc_loss": 0.034860122948884964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4860124287661165e-05, "grad_norm": 19.681623458862305, "learning_rate": 1e-06, "loss": 0.334, "mean_token_accuracy": 0.8942663669586182, "num_tokens": 839961276.0, "step": 22012 }, { "epoch": 2.800279862612899, "ewc_loss": 0.03480322286486626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4803222661139444e-05, "grad_norm": 19.664445877075195, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8658466339111328, "num_tokens": 840002554.0, "step": 22013 }, { "epoch": 2.8004070728914896, "ewc_loss": 0.03483908250927925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4839082218240947e-05, "grad_norm": 19.665388107299805, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8733928799629211, "num_tokens": 840041642.0, "step": 22014 }, { "epoch": 2.80053428317008, "ewc_loss": 0.03483274579048157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483274485915899e-05, "grad_norm": 19.65055274963379, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.8838195204734802, "num_tokens": 840083532.0, "step": 22015 }, { "epoch": 2.8006614934486707, "ewc_loss": 0.034818924963474274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.481892417767085e-05, "grad_norm": 19.710859298706055, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8727303743362427, "num_tokens": 840124221.0, "step": 22016 }, { "epoch": 2.800788703727261, "ewc_loss": 0.03476734086871147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.47673412761651e-05, "grad_norm": 19.625425338745117, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.8796178102493286, "num_tokens": 840162309.0, "step": 22017 }, { "epoch": 2.8009159140058517, "ewc_loss": 0.03479962423443794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479962470009923e-05, "grad_norm": 19.76319694519043, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8778529763221741, "num_tokens": 840194778.0, "step": 22018 }, { "epoch": 2.8010431242844422, "ewc_loss": 0.034860819578170776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486081914161332e-05, "grad_norm": 19.672517776489258, "learning_rate": 1e-06, "loss": 0.3346, "mean_token_accuracy": 0.8937048316001892, "num_tokens": 840225081.0, "step": 22019 }, { "epoch": 2.8011703345630328, "ewc_loss": 0.034827765077352524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4827764466172084e-05, "grad_norm": 19.75528907775879, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8745299577713013, "num_tokens": 840267698.0, "step": 22020 }, { "epoch": 2.8012975448416233, "ewc_loss": 0.034867044538259506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4867043723352253e-05, "grad_norm": 19.774314880371094, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8688411712646484, "num_tokens": 840303255.0, "step": 22021 }, { "epoch": 2.801424755120214, "ewc_loss": 0.03475940227508545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4759403206408024e-05, "grad_norm": 19.60365104675293, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8745123744010925, "num_tokens": 840344616.0, "step": 22022 }, { "epoch": 2.8015519653988044, "ewc_loss": 0.0347835011780262, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.47835011780262e-05, "grad_norm": 19.70698356628418, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8765262961387634, "num_tokens": 840383293.0, "step": 22023 }, { "epoch": 2.801679175677395, "ewc_loss": 0.03481018543243408, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.481018575257622e-05, "grad_norm": 19.630874633789062, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8654352426528931, "num_tokens": 840424583.0, "step": 22024 }, { "epoch": 2.8018063859559854, "ewc_loss": 0.034808870404958725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.480886880424805e-05, "grad_norm": 19.713071823120117, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8703216910362244, "num_tokens": 840458752.0, "step": 22025 }, { "epoch": 2.8019335962345755, "ewc_loss": 0.0348573662340641, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485736669972539e-05, "grad_norm": 19.752634048461914, "learning_rate": 1e-06, "loss": 0.3647, "mean_token_accuracy": 0.8819122314453125, "num_tokens": 840498441.0, "step": 22026 }, { "epoch": 2.8020608065131665, "ewc_loss": 0.03472470864653587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.472470780252479e-05, "grad_norm": 19.611635208129883, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8856688737869263, "num_tokens": 840537838.0, "step": 22027 }, { "epoch": 2.8021880167917566, "ewc_loss": 0.03479139134287834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479139195405878e-05, "grad_norm": 19.64733123779297, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8656727075576782, "num_tokens": 840571770.0, "step": 22028 }, { "epoch": 2.8023152270703475, "ewc_loss": 0.03487135097384453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.487135109025985e-05, "grad_norm": 19.730514526367188, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8696324825286865, "num_tokens": 840607559.0, "step": 22029 }, { "epoch": 2.8024424373489376, "ewc_loss": 0.0347592793405056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.475927951512858e-05, "grad_norm": 19.64546775817871, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8610098361968994, "num_tokens": 840650156.0, "step": 22030 }, { "epoch": 2.8025696476275286, "ewc_loss": 0.03480313718318939, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.480313898762688e-05, "grad_norm": 19.652746200561523, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.866666316986084, "num_tokens": 840686590.0, "step": 22031 }, { "epoch": 2.8026968579061187, "ewc_loss": 0.03488180413842201, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4881803003372625e-05, "grad_norm": 19.762678146362305, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8683089017868042, "num_tokens": 840724445.0, "step": 22032 }, { "epoch": 2.802824068184709, "ewc_loss": 0.03482270613312721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482270767563023e-05, "grad_norm": 19.709867477416992, "learning_rate": 1e-06, "loss": 0.3423, "mean_token_accuracy": 0.8898510336875916, "num_tokens": 840765047.0, "step": 22033 }, { "epoch": 2.8029512784632997, "ewc_loss": 0.0347956046462059, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4795604733517393e-05, "grad_norm": 19.788400650024414, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8629345893859863, "num_tokens": 840802375.0, "step": 22034 }, { "epoch": 2.8030784887418903, "ewc_loss": 0.0348394513130188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483944965410046e-05, "grad_norm": 19.70249366760254, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8745005130767822, "num_tokens": 840839994.0, "step": 22035 }, { "epoch": 2.803205699020481, "ewc_loss": 0.034751046448946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4751046769088134e-05, "grad_norm": 19.706497192382812, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8732591271400452, "num_tokens": 840877144.0, "step": 22036 }, { "epoch": 2.8033329092990713, "ewc_loss": 0.03474761173129082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.474761251709424e-05, "grad_norm": 19.75679588317871, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8656076192855835, "num_tokens": 840918478.0, "step": 22037 }, { "epoch": 2.803460119577662, "ewc_loss": 0.0347629077732563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.476290657999925e-05, "grad_norm": 19.567829132080078, "learning_rate": 1e-06, "loss": 0.3584, "mean_token_accuracy": 0.8840676546096802, "num_tokens": 840958484.0, "step": 22038 }, { "epoch": 2.8035873298562524, "ewc_loss": 0.03475883603096008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.475883568171412e-05, "grad_norm": 19.714834213256836, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8779088258743286, "num_tokens": 841000035.0, "step": 22039 }, { "epoch": 2.803714540134843, "ewc_loss": 0.03488790988922119, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.488791116978973e-05, "grad_norm": 19.66652488708496, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8699746131896973, "num_tokens": 841035836.0, "step": 22040 }, { "epoch": 2.8038417504134334, "ewc_loss": 0.03476426377892494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.47642635460943e-05, "grad_norm": 19.650419235229492, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8769197463989258, "num_tokens": 841069295.0, "step": 22041 }, { "epoch": 2.803968960692024, "ewc_loss": 0.03486623615026474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486623609205708e-05, "grad_norm": 19.757123947143555, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8714741468429565, "num_tokens": 841108341.0, "step": 22042 }, { "epoch": 2.8040961709706145, "ewc_loss": 0.034827373921871185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4827375202439725e-05, "grad_norm": 19.692697525024414, "learning_rate": 1e-06, "loss": 0.3584, "mean_token_accuracy": 0.8851827383041382, "num_tokens": 841142945.0, "step": 22043 }, { "epoch": 2.804223381249205, "ewc_loss": 0.034784361720085144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478436337900348e-05, "grad_norm": 19.706697463989258, "learning_rate": 1e-06, "loss": 0.3472, "mean_token_accuracy": 0.8885055184364319, "num_tokens": 841174137.0, "step": 22044 }, { "epoch": 2.8043505915277955, "ewc_loss": 0.03486734256148338, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4867342037614435e-05, "grad_norm": 19.761625289916992, "learning_rate": 1e-06, "loss": 0.3658, "mean_token_accuracy": 0.8847898840904236, "num_tokens": 841215914.0, "step": 22045 }, { "epoch": 2.804477801806386, "ewc_loss": 0.03481943532824516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.481943713268265e-05, "grad_norm": 19.712326049804688, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8769297003746033, "num_tokens": 841256097.0, "step": 22046 }, { "epoch": 2.8046050120849766, "ewc_loss": 0.03481464087963104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4814642276614904e-05, "grad_norm": 19.701528549194336, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8794054985046387, "num_tokens": 841288997.0, "step": 22047 }, { "epoch": 2.804732222363567, "ewc_loss": 0.03481045737862587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.481045860098675e-05, "grad_norm": 19.722732543945312, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8703899383544922, "num_tokens": 841328053.0, "step": 22048 }, { "epoch": 2.8048594326421576, "ewc_loss": 0.03481746092438698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48174617101904e-05, "grad_norm": 19.604047775268555, "learning_rate": 1e-06, "loss": 0.3378, "mean_token_accuracy": 0.8932713270187378, "num_tokens": 841366848.0, "step": 22049 }, { "epoch": 2.804986642920748, "ewc_loss": 0.03475238010287285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.475238190731034e-05, "grad_norm": 19.720382690429688, "learning_rate": 1e-06, "loss": 0.3599, "mean_token_accuracy": 0.8842242956161499, "num_tokens": 841398568.0, "step": 22050 }, { "epoch": 2.8051138531993383, "ewc_loss": 0.03490264713764191, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.490264862193726e-05, "grad_norm": 19.692350387573242, "learning_rate": 1e-06, "loss": 0.3908, "mean_token_accuracy": 0.873521089553833, "num_tokens": 841435722.0, "step": 22051 }, { "epoch": 2.8052410634779292, "ewc_loss": 0.034853145480155945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485314664430916e-05, "grad_norm": 19.66250228881836, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8723140954971313, "num_tokens": 841473635.0, "step": 22052 }, { "epoch": 2.8053682737565193, "ewc_loss": 0.03478576987981796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4785771276801825e-05, "grad_norm": 19.670459747314453, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8794219493865967, "num_tokens": 841512383.0, "step": 22053 }, { "epoch": 2.8054954840351103, "ewc_loss": 0.034851767122745514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4851767850341275e-05, "grad_norm": 19.6826114654541, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8707348704338074, "num_tokens": 841543174.0, "step": 22054 }, { "epoch": 2.8056226943137004, "ewc_loss": 0.03486819565296173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48681969626341e-05, "grad_norm": 19.680099487304688, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8810949325561523, "num_tokens": 841582185.0, "step": 22055 }, { "epoch": 2.805749904592291, "ewc_loss": 0.034817080944776535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4817079722415656e-05, "grad_norm": 19.62468910217285, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8706058263778687, "num_tokens": 841623179.0, "step": 22056 }, { "epoch": 2.8058771148708814, "ewc_loss": 0.034881751984357834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4881752071669325e-05, "grad_norm": 19.668804168701172, "learning_rate": 1e-06, "loss": 0.3623, "mean_token_accuracy": 0.8837258219718933, "num_tokens": 841661062.0, "step": 22057 }, { "epoch": 2.806004325149472, "ewc_loss": 0.03489626199007034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489626033115201e-05, "grad_norm": 19.661062240600586, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8736245036125183, "num_tokens": 841704259.0, "step": 22058 }, { "epoch": 2.8061315354280625, "ewc_loss": 0.0348324216902256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483242107904516e-05, "grad_norm": 19.684585571289062, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8733338713645935, "num_tokens": 841748396.0, "step": 22059 }, { "epoch": 2.806258745706653, "ewc_loss": 0.03485213220119476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4852131648221985e-05, "grad_norm": 19.677581787109375, "learning_rate": 1e-06, "loss": 0.3332, "mean_token_accuracy": 0.8935428261756897, "num_tokens": 841787809.0, "step": 22060 }, { "epoch": 2.8063859559852435, "ewc_loss": 0.03487231954932213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4872318792622536e-05, "grad_norm": 19.679443359375, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8756393194198608, "num_tokens": 841824046.0, "step": 22061 }, { "epoch": 2.806513166263834, "ewc_loss": 0.03483011573553085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4830114600481465e-05, "grad_norm": 19.66373062133789, "learning_rate": 1e-06, "loss": 0.3552, "mean_token_accuracy": 0.8845821619033813, "num_tokens": 841856949.0, "step": 22062 }, { "epoch": 2.8066403765424246, "ewc_loss": 0.03487269952893257, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.487270078039728e-05, "grad_norm": 19.752552032470703, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8627585768699646, "num_tokens": 841898644.0, "step": 22063 }, { "epoch": 2.806767586821015, "ewc_loss": 0.03485814109444618, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48581415892113e-05, "grad_norm": 19.709535598754883, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8784847259521484, "num_tokens": 841934795.0, "step": 22064 }, { "epoch": 2.8068947970996057, "ewc_loss": 0.03481147810816765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4811477235052735e-05, "grad_norm": 19.709774017333984, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8604021072387695, "num_tokens": 841976878.0, "step": 22065 }, { "epoch": 2.807022007378196, "ewc_loss": 0.034836042672395706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483604086795822e-05, "grad_norm": 19.744646072387695, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.8755519390106201, "num_tokens": 842010998.0, "step": 22066 }, { "epoch": 2.8071492176567867, "ewc_loss": 0.03479897975921631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4798980777850375e-05, "grad_norm": 19.75249671936035, "learning_rate": 1e-06, "loss": 0.314, "mean_token_accuracy": 0.9001840353012085, "num_tokens": 842051410.0, "step": 22067 }, { "epoch": 2.8072764279353772, "ewc_loss": 0.03484109044075012, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484109038254246e-05, "grad_norm": 19.72051429748535, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8787325620651245, "num_tokens": 842087954.0, "step": 22068 }, { "epoch": 2.8074036382139678, "ewc_loss": 0.034719325602054596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.47193272318691e-05, "grad_norm": 19.659652709960938, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8732800483703613, "num_tokens": 842128326.0, "step": 22069 }, { "epoch": 2.8075308484925583, "ewc_loss": 0.03477433696389198, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4774337109411135e-05, "grad_norm": 19.80379295349121, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8555158376693726, "num_tokens": 842171143.0, "step": 22070 }, { "epoch": 2.807658058771149, "ewc_loss": 0.03482260927557945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4822609450202435e-05, "grad_norm": 19.663667678833008, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8635240793228149, "num_tokens": 842212167.0, "step": 22071 }, { "epoch": 2.8077852690497394, "ewc_loss": 0.03467857465147972, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.467857459327206e-05, "grad_norm": 19.698209762573242, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8659044504165649, "num_tokens": 842252576.0, "step": 22072 }, { "epoch": 2.80791247932833, "ewc_loss": 0.034767959266901016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4767959732562304e-05, "grad_norm": 19.632234573364258, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8720558881759644, "num_tokens": 842287812.0, "step": 22073 }, { "epoch": 2.80803968960692, "ewc_loss": 0.03479240834712982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479240695014596e-05, "grad_norm": 19.748842239379883, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.871615469455719, "num_tokens": 842328289.0, "step": 22074 }, { "epoch": 2.808166899885511, "ewc_loss": 0.034832630306482315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483263208181597e-05, "grad_norm": 19.781278610229492, "learning_rate": 1e-06, "loss": 0.3792, "mean_token_accuracy": 0.8798151016235352, "num_tokens": 842359182.0, "step": 22075 }, { "epoch": 2.808294110164101, "ewc_loss": 0.03475835919380188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.475835910649039e-05, "grad_norm": 19.701204299926758, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.866104006767273, "num_tokens": 842399820.0, "step": 22076 }, { "epoch": 2.808421320442692, "ewc_loss": 0.03476402163505554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.476401980151422e-05, "grad_norm": 19.726242065429688, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8742501735687256, "num_tokens": 842437852.0, "step": 22077 }, { "epoch": 2.808548530721282, "ewc_loss": 0.03483832627534866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483832551864907e-05, "grad_norm": 19.76402473449707, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8624785542488098, "num_tokens": 842476729.0, "step": 22078 }, { "epoch": 2.808675740999873, "ewc_loss": 0.034795306622982025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479530641925521e-05, "grad_norm": 19.767803192138672, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.863269031047821, "num_tokens": 842514590.0, "step": 22079 }, { "epoch": 2.808802951278463, "ewc_loss": 0.03478430211544037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478430153336376e-05, "grad_norm": 19.743419647216797, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8733308911323547, "num_tokens": 842556502.0, "step": 22080 }, { "epoch": 2.8089301615570537, "ewc_loss": 0.03477027639746666, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.477027712506242e-05, "grad_norm": 19.6538143157959, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8638017177581787, "num_tokens": 842594299.0, "step": 22081 }, { "epoch": 2.809057371835644, "ewc_loss": 0.034779004752635956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4779004636220634e-05, "grad_norm": 19.754199981689453, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8728575706481934, "num_tokens": 842633189.0, "step": 22082 }, { "epoch": 2.8091845821142347, "ewc_loss": 0.034756384789943695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.475638368399814e-05, "grad_norm": 19.721853256225586, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.877050518989563, "num_tokens": 842673626.0, "step": 22083 }, { "epoch": 2.8093117923928252, "ewc_loss": 0.03472607210278511, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.472607204457745e-05, "grad_norm": 19.688804626464844, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8693738579750061, "num_tokens": 842712574.0, "step": 22084 }, { "epoch": 2.8094390026714158, "ewc_loss": 0.034759484231472015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.475948324194178e-05, "grad_norm": 19.750593185424805, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.873268187046051, "num_tokens": 842749534.0, "step": 22085 }, { "epoch": 2.8095662129500063, "ewc_loss": 0.03483638912439346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48363901139237e-05, "grad_norm": 19.71915626525879, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8669580221176147, "num_tokens": 842784989.0, "step": 22086 }, { "epoch": 2.809693423228597, "ewc_loss": 0.034714631736278534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4714630601229146e-05, "grad_norm": 19.63027000427246, "learning_rate": 1e-06, "loss": 0.3844, "mean_token_accuracy": 0.8756662607192993, "num_tokens": 842823858.0, "step": 22087 }, { "epoch": 2.8098206335071874, "ewc_loss": 0.034793656319379807, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479365477687679e-05, "grad_norm": 19.750391006469727, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.875659704208374, "num_tokens": 842858849.0, "step": 22088 }, { "epoch": 2.809947843785778, "ewc_loss": 0.034778665751218796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4778666304191574e-05, "grad_norm": 19.572772979736328, "learning_rate": 1e-06, "loss": 0.3324, "mean_token_accuracy": 0.8949851989746094, "num_tokens": 842895809.0, "step": 22089 }, { "epoch": 2.8100750540643684, "ewc_loss": 0.03470539674162865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.470539741101675e-05, "grad_norm": 19.744428634643555, "learning_rate": 1e-06, "loss": 0.3079, "mean_token_accuracy": 0.9025046825408936, "num_tokens": 842933650.0, "step": 22090 }, { "epoch": 2.810202264342959, "ewc_loss": 0.034864023327827454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486402420094237e-05, "grad_norm": 19.669700622558594, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.8811353445053101, "num_tokens": 842967200.0, "step": 22091 }, { "epoch": 2.8103294746215495, "ewc_loss": 0.03470007702708244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.470007868600078e-05, "grad_norm": 19.664506912231445, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8668557405471802, "num_tokens": 843002498.0, "step": 22092 }, { "epoch": 2.81045668490014, "ewc_loss": 0.034870415925979614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.487041612970643e-05, "grad_norm": 19.725404739379883, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8798182010650635, "num_tokens": 843042627.0, "step": 22093 }, { "epoch": 2.8105838951787305, "ewc_loss": 0.03481735289096832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.481735257082619e-05, "grad_norm": 19.72379493713379, "learning_rate": 1e-06, "loss": 0.3914, "mean_token_accuracy": 0.8719272613525391, "num_tokens": 843076862.0, "step": 22094 }, { "epoch": 2.810711105457321, "ewc_loss": 0.034823473542928696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482347528915852e-05, "grad_norm": 19.734224319458008, "learning_rate": 1e-06, "loss": 0.3916, "mean_token_accuracy": 0.8759642243385315, "num_tokens": 843115000.0, "step": 22095 }, { "epoch": 2.8108383157359116, "ewc_loss": 0.034821875393390656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48218745784834e-05, "grad_norm": 19.6479434967041, "learning_rate": 1e-06, "loss": 0.3638, "mean_token_accuracy": 0.8816137313842773, "num_tokens": 843152408.0, "step": 22096 }, { "epoch": 2.810965526014502, "ewc_loss": 0.03484911099076271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48491121258121e-05, "grad_norm": 19.684066772460938, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8764921426773071, "num_tokens": 843189269.0, "step": 22097 }, { "epoch": 2.8110927362930926, "ewc_loss": 0.03482731804251671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482731699477881e-05, "grad_norm": 19.714258193969727, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8706676363945007, "num_tokens": 843227148.0, "step": 22098 }, { "epoch": 2.8112199465716827, "ewc_loss": 0.034836214035749435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483621549094096e-05, "grad_norm": 19.72795295715332, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8664735555648804, "num_tokens": 843259736.0, "step": 22099 }, { "epoch": 2.8113471568502737, "ewc_loss": 0.03483355790376663, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4833556128432974e-05, "grad_norm": 19.697704315185547, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8715388774871826, "num_tokens": 843302598.0, "step": 22100 }, { "epoch": 2.811474367128864, "ewc_loss": 0.03483591973781586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4835920814657584e-05, "grad_norm": 19.694169998168945, "learning_rate": 1e-06, "loss": 0.3478, "mean_token_accuracy": 0.8906264901161194, "num_tokens": 843337615.0, "step": 22101 }, { "epoch": 2.8116015774074548, "ewc_loss": 0.034851983189582825, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4851982491090894e-05, "grad_norm": 19.695039749145508, "learning_rate": 1e-06, "loss": 0.3489, "mean_token_accuracy": 0.8905526399612427, "num_tokens": 843378845.0, "step": 22102 }, { "epoch": 2.811728787686045, "ewc_loss": 0.034841135144233704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4841134038288146e-05, "grad_norm": 19.65384292602539, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8660382628440857, "num_tokens": 843414128.0, "step": 22103 }, { "epoch": 2.811855997964636, "ewc_loss": 0.03481495752930641, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.481495878077112e-05, "grad_norm": 19.687562942504883, "learning_rate": 1e-06, "loss": 0.3489, "mean_token_accuracy": 0.8900631666183472, "num_tokens": 843454255.0, "step": 22104 }, { "epoch": 2.811983208243226, "ewc_loss": 0.034852586686611176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485258639557287e-05, "grad_norm": 19.66144371032715, "learning_rate": 1e-06, "loss": 0.3893, "mean_token_accuracy": 0.8775892853736877, "num_tokens": 843498332.0, "step": 22105 }, { "epoch": 2.8121104185218164, "ewc_loss": 0.03485018014907837, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485018169158138e-05, "grad_norm": 19.648422241210938, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8661654591560364, "num_tokens": 843539196.0, "step": 22106 }, { "epoch": 2.812237628800407, "ewc_loss": 0.03479175269603729, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479175211396068e-05, "grad_norm": 19.755996704101562, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8794097900390625, "num_tokens": 843573767.0, "step": 22107 }, { "epoch": 2.8123648390789975, "ewc_loss": 0.03484385833144188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484385888441466e-05, "grad_norm": 19.646215438842773, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8634623289108276, "num_tokens": 843611156.0, "step": 22108 }, { "epoch": 2.812492049357588, "ewc_loss": 0.034794121980667114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.47941204381641e-05, "grad_norm": 19.68966293334961, "learning_rate": 1e-06, "loss": 0.3656, "mean_token_accuracy": 0.8822356462478638, "num_tokens": 843644026.0, "step": 22109 }, { "epoch": 2.8126192596361785, "ewc_loss": 0.03486749529838562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486749483272433e-05, "grad_norm": 19.672170639038086, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8791127800941467, "num_tokens": 843683477.0, "step": 22110 }, { "epoch": 2.812746469914769, "ewc_loss": 0.03478087857365608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4780878195306286e-05, "grad_norm": 19.683818817138672, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8628206253051758, "num_tokens": 843722815.0, "step": 22111 }, { "epoch": 2.8128736801933596, "ewc_loss": 0.034863319247961044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486331843305379e-05, "grad_norm": 19.696741104125977, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8767762184143066, "num_tokens": 843763843.0, "step": 22112 }, { "epoch": 2.81300089047195, "ewc_loss": 0.034846387803554535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484638727968559e-05, "grad_norm": 19.75246810913086, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8686613440513611, "num_tokens": 843805327.0, "step": 22113 }, { "epoch": 2.8131281007505406, "ewc_loss": 0.03484557196497917, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48455723724328e-05, "grad_norm": 19.650880813598633, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8764896392822266, "num_tokens": 843850132.0, "step": 22114 }, { "epoch": 2.813255311029131, "ewc_loss": 0.03473450988531113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4734508517431095e-05, "grad_norm": 19.707130432128906, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8746863603591919, "num_tokens": 843887425.0, "step": 22115 }, { "epoch": 2.8133825213077217, "ewc_loss": 0.03488175943493843, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.488175934762694e-05, "grad_norm": 19.672746658325195, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.8839370012283325, "num_tokens": 843920728.0, "step": 22116 }, { "epoch": 2.8135097315863122, "ewc_loss": 0.03476586192846298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.476586061879061e-05, "grad_norm": 19.67430877685547, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8753615617752075, "num_tokens": 843959733.0, "step": 22117 }, { "epoch": 2.8136369418649028, "ewc_loss": 0.0348236970603466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4823697205865756e-05, "grad_norm": 19.61223030090332, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.875952959060669, "num_tokens": 844005616.0, "step": 22118 }, { "epoch": 2.8137641521434933, "ewc_loss": 0.034795597195625305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479559745755978e-05, "grad_norm": 19.72162437438965, "learning_rate": 1e-06, "loss": 0.3331, "mean_token_accuracy": 0.8910540342330933, "num_tokens": 844034216.0, "step": 22119 }, { "epoch": 2.813891362422084, "ewc_loss": 0.034845706075429916, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484570697764866e-05, "grad_norm": 19.620344161987305, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8729067444801331, "num_tokens": 844071173.0, "step": 22120 }, { "epoch": 2.8140185727006743, "ewc_loss": 0.03478090092539787, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478090002317913e-05, "grad_norm": 19.716142654418945, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8732695579528809, "num_tokens": 844113665.0, "step": 22121 }, { "epoch": 2.814145782979265, "ewc_loss": 0.03485151380300522, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485151319182478e-05, "grad_norm": 19.659412384033203, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8760883212089539, "num_tokens": 844154063.0, "step": 22122 }, { "epoch": 2.8142729932578554, "ewc_loss": 0.03478472679853439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478472717688419e-05, "grad_norm": 19.746030807495117, "learning_rate": 1e-06, "loss": 0.336, "mean_token_accuracy": 0.8940670490264893, "num_tokens": 844185634.0, "step": 22123 }, { "epoch": 2.8144002035364455, "ewc_loss": 0.03485924005508423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485924025881104e-05, "grad_norm": 19.677709579467773, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8651937246322632, "num_tokens": 844221755.0, "step": 22124 }, { "epoch": 2.8145274138150365, "ewc_loss": 0.034762606024742126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4762604627758265e-05, "grad_norm": 19.7174129486084, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.8804012537002563, "num_tokens": 844256230.0, "step": 22125 }, { "epoch": 2.8146546240936265, "ewc_loss": 0.034842997789382935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484299668343738e-05, "grad_norm": 19.714481353759766, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8693614602088928, "num_tokens": 844292823.0, "step": 22126 }, { "epoch": 2.8147818343722175, "ewc_loss": 0.03482472896575928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482473039184697e-05, "grad_norm": 19.711612701416016, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8539175987243652, "num_tokens": 844331843.0, "step": 22127 }, { "epoch": 2.8149090446508076, "ewc_loss": 0.034826070070266724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482606916804798e-05, "grad_norm": 19.66956329345703, "learning_rate": 1e-06, "loss": 0.3623, "mean_token_accuracy": 0.8856582045555115, "num_tokens": 844375823.0, "step": 22128 }, { "epoch": 2.815036254929398, "ewc_loss": 0.03486757352948189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486757486825809e-05, "grad_norm": 19.6927547454834, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8748388886451721, "num_tokens": 844415187.0, "step": 22129 }, { "epoch": 2.8151634652079887, "ewc_loss": 0.03486272692680359, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4862725442508236e-05, "grad_norm": 19.739843368530273, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.8781291842460632, "num_tokens": 844454986.0, "step": 22130 }, { "epoch": 2.815290675486579, "ewc_loss": 0.034844763576984406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4844764741137624e-05, "grad_norm": 19.600505828857422, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.8813072443008423, "num_tokens": 844495514.0, "step": 22131 }, { "epoch": 2.8154178857651697, "ewc_loss": 0.03477303311228752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4773034712998196e-05, "grad_norm": 19.749479293823242, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8742377758026123, "num_tokens": 844532406.0, "step": 22132 }, { "epoch": 2.8155450960437602, "ewc_loss": 0.0349060483276844, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4906046494143084e-05, "grad_norm": 19.62981605529785, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8751587867736816, "num_tokens": 844569561.0, "step": 22133 }, { "epoch": 2.8156723063223508, "ewc_loss": 0.034764811396598816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.476481288089417e-05, "grad_norm": 19.697328567504883, "learning_rate": 1e-06, "loss": 0.3798, "mean_token_accuracy": 0.8811962604522705, "num_tokens": 844613504.0, "step": 22134 }, { "epoch": 2.8157995166009413, "ewc_loss": 0.034951332956552505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.495133205433376e-05, "grad_norm": 19.855451583862305, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8740047812461853, "num_tokens": 844649255.0, "step": 22135 }, { "epoch": 2.815926726879532, "ewc_loss": 0.03478128835558891, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478128928691149e-05, "grad_norm": 19.689817428588867, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8778947591781616, "num_tokens": 844687202.0, "step": 22136 }, { "epoch": 2.8160539371581224, "ewc_loss": 0.0348239429295063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482394458842464e-05, "grad_norm": 19.870380401611328, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.8768426179885864, "num_tokens": 844723096.0, "step": 22137 }, { "epoch": 2.816181147436713, "ewc_loss": 0.034818489104509354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4818487620214e-05, "grad_norm": 19.679576873779297, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8768162727355957, "num_tokens": 844759754.0, "step": 22138 }, { "epoch": 2.8163083577153034, "ewc_loss": 0.034648239612579346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.464824112597853e-05, "grad_norm": 19.719091415405273, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8787252902984619, "num_tokens": 844798372.0, "step": 22139 }, { "epoch": 2.816435567993894, "ewc_loss": 0.03483135625720024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4831355151254684e-05, "grad_norm": 19.7108211517334, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8706700801849365, "num_tokens": 844832738.0, "step": 22140 }, { "epoch": 2.8165627782724845, "ewc_loss": 0.03470033034682274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.470032970653847e-05, "grad_norm": 19.689306259155273, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8772980570793152, "num_tokens": 844871684.0, "step": 22141 }, { "epoch": 2.816689988551075, "ewc_loss": 0.03480702266097069, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4807024348992854e-05, "grad_norm": 19.68907928466797, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.869038462638855, "num_tokens": 844912140.0, "step": 22142 }, { "epoch": 2.8168171988296655, "ewc_loss": 0.03481101617217064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4811015211744234e-05, "grad_norm": 19.722990036010742, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8644829392433167, "num_tokens": 844948256.0, "step": 22143 }, { "epoch": 2.816944409108256, "ewc_loss": 0.03477763012051582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.477762948023155e-05, "grad_norm": 19.76210594177246, "learning_rate": 1e-06, "loss": 0.3382, "mean_token_accuracy": 0.890965461730957, "num_tokens": 844983821.0, "step": 22144 }, { "epoch": 2.8170716193868466, "ewc_loss": 0.034832607954740524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483260661596432e-05, "grad_norm": 19.770273208618164, "learning_rate": 1e-06, "loss": 0.3873, "mean_token_accuracy": 0.8798590898513794, "num_tokens": 845020622.0, "step": 22145 }, { "epoch": 2.817198829665437, "ewc_loss": 0.0347457118332386, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.474571349215694e-05, "grad_norm": 19.68861198425293, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8670449256896973, "num_tokens": 845056748.0, "step": 22146 }, { "epoch": 2.8173260399440276, "ewc_loss": 0.03478333353996277, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478333383100107e-05, "grad_norm": 19.644607543945312, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8583996891975403, "num_tokens": 845096294.0, "step": 22147 }, { "epoch": 2.817453250222618, "ewc_loss": 0.034827906638383865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482790634734556e-05, "grad_norm": 19.75684356689453, "learning_rate": 1e-06, "loss": 0.3383, "mean_token_accuracy": 0.8910319209098816, "num_tokens": 845135132.0, "step": 22148 }, { "epoch": 2.8175804605012083, "ewc_loss": 0.03482464328408241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48246430803556e-05, "grad_norm": 19.58032989501953, "learning_rate": 1e-06, "loss": 0.359, "mean_token_accuracy": 0.88338303565979, "num_tokens": 845172164.0, "step": 22149 }, { "epoch": 2.8177076707797992, "ewc_loss": 0.03482383117079735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482383181108162e-05, "grad_norm": 19.745210647583008, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.87680584192276, "num_tokens": 845210270.0, "step": 22150 }, { "epoch": 2.8178348810583893, "ewc_loss": 0.03490844741463661, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4908447560155764e-05, "grad_norm": 19.677265167236328, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8794577121734619, "num_tokens": 845251863.0, "step": 22151 }, { "epoch": 2.8179620913369803, "ewc_loss": 0.034837864339351654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483786349534057e-05, "grad_norm": 19.756704330444336, "learning_rate": 1e-06, "loss": 0.3777, "mean_token_accuracy": 0.8805289268493652, "num_tokens": 845290262.0, "step": 22152 }, { "epoch": 2.8180893016155704, "ewc_loss": 0.03493817150592804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4938169847009704e-05, "grad_norm": 19.717641830444336, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8833260536193848, "num_tokens": 845326894.0, "step": 22153 }, { "epoch": 2.818216511894161, "ewc_loss": 0.03483767434954643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48376743204426e-05, "grad_norm": 19.775951385498047, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.8851007223129272, "num_tokens": 845367545.0, "step": 22154 }, { "epoch": 2.8183437221727514, "ewc_loss": 0.03487041965126991, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4870419767685235e-05, "grad_norm": 19.699106216430664, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8615353107452393, "num_tokens": 845400920.0, "step": 22155 }, { "epoch": 2.818470932451342, "ewc_loss": 0.0348135307431221, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4813529055099934e-05, "grad_norm": 19.687742233276367, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8718023300170898, "num_tokens": 845436279.0, "step": 22156 }, { "epoch": 2.8185981427299325, "ewc_loss": 0.034934721887111664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.493472104310058e-05, "grad_norm": 19.649551391601562, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.8802130818367004, "num_tokens": 845476308.0, "step": 22157 }, { "epoch": 2.818725353008523, "ewc_loss": 0.034818071871995926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.481807289062999e-05, "grad_norm": 19.67869758605957, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8715000152587891, "num_tokens": 845516624.0, "step": 22158 }, { "epoch": 2.8188525632871135, "ewc_loss": 0.03490951657295227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.490951712592505e-05, "grad_norm": 19.69202423095703, "learning_rate": 1e-06, "loss": 0.3313, "mean_token_accuracy": 0.8929323554039001, "num_tokens": 845552385.0, "step": 22159 }, { "epoch": 2.818979773565704, "ewc_loss": 0.03490843251347542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4908433008240536e-05, "grad_norm": 19.735197067260742, "learning_rate": 1e-06, "loss": 0.3573, "mean_token_accuracy": 0.885549783706665, "num_tokens": 845592600.0, "step": 22160 }, { "epoch": 2.8191069838442946, "ewc_loss": 0.03488799184560776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.488799120532349e-05, "grad_norm": 19.679401397705078, "learning_rate": 1e-06, "loss": 0.3675, "mean_token_accuracy": 0.8843625783920288, "num_tokens": 845630028.0, "step": 22161 }, { "epoch": 2.819234194122885, "ewc_loss": 0.03483675792813301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4836757549783215e-05, "grad_norm": 19.71499252319336, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8719069361686707, "num_tokens": 845665874.0, "step": 22162 }, { "epoch": 2.8193614044014756, "ewc_loss": 0.03489027917385101, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489027949399315e-05, "grad_norm": 19.713916778564453, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8641222715377808, "num_tokens": 845702738.0, "step": 22163 }, { "epoch": 2.819488614680066, "ewc_loss": 0.03489580377936363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489580558380112e-05, "grad_norm": 19.70802116394043, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8798593878746033, "num_tokens": 845744785.0, "step": 22164 }, { "epoch": 2.8196158249586567, "ewc_loss": 0.03490554913878441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.490554809104651e-05, "grad_norm": 19.718059539794922, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.87657231092453, "num_tokens": 845779196.0, "step": 22165 }, { "epoch": 2.8197430352372472, "ewc_loss": 0.034871574491262436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.487157300696708e-05, "grad_norm": 19.74521255493164, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8663572669029236, "num_tokens": 845819016.0, "step": 22166 }, { "epoch": 2.8198702455158378, "ewc_loss": 0.034916263073682785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49162619386334e-05, "grad_norm": 19.696687698364258, "learning_rate": 1e-06, "loss": 0.3781, "mean_token_accuracy": 0.8778302073478699, "num_tokens": 845855014.0, "step": 22167 }, { "epoch": 2.8199974557944283, "ewc_loss": 0.034795913845300674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4795913961715996e-05, "grad_norm": 19.712371826171875, "learning_rate": 1e-06, "loss": 0.3618, "mean_token_accuracy": 0.8842588663101196, "num_tokens": 845895491.0, "step": 22168 }, { "epoch": 2.820124666073019, "ewc_loss": 0.03482097014784813, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482096872176044e-05, "grad_norm": 19.689586639404297, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8678175210952759, "num_tokens": 845931560.0, "step": 22169 }, { "epoch": 2.8202518763516093, "ewc_loss": 0.034884750843048096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.488474976620637e-05, "grad_norm": 19.692279815673828, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8759294748306274, "num_tokens": 845966701.0, "step": 22170 }, { "epoch": 2.8203790866302, "ewc_loss": 0.03481893241405487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4818931453628466e-05, "grad_norm": 19.700218200683594, "learning_rate": 1e-06, "loss": 0.343, "mean_token_accuracy": 0.8927855491638184, "num_tokens": 846002327.0, "step": 22171 }, { "epoch": 2.82050629690879, "ewc_loss": 0.03495319187641144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4953191061504185e-05, "grad_norm": 19.7325496673584, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.8764322400093079, "num_tokens": 846042157.0, "step": 22172 }, { "epoch": 2.820633507187381, "ewc_loss": 0.03486737236380577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486737114144489e-05, "grad_norm": 19.67184829711914, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8612693548202515, "num_tokens": 846082851.0, "step": 22173 }, { "epoch": 2.820760717465971, "ewc_loss": 0.034921929240226746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4921929909614846e-05, "grad_norm": 19.7431697845459, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8747859001159668, "num_tokens": 846116846.0, "step": 22174 }, { "epoch": 2.820887927744562, "ewc_loss": 0.03492254391312599, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4922544728033245e-05, "grad_norm": 19.733871459960938, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8783489465713501, "num_tokens": 846154373.0, "step": 22175 }, { "epoch": 2.821015138023152, "ewc_loss": 0.034865789115428925, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486578862066381e-05, "grad_norm": 19.698511123657227, "learning_rate": 1e-06, "loss": 0.3286, "mean_token_accuracy": 0.892779529094696, "num_tokens": 846195329.0, "step": 22176 }, { "epoch": 2.821142348301743, "ewc_loss": 0.034860000014305115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4860000596381724e-05, "grad_norm": 19.718063354492188, "learning_rate": 1e-06, "loss": 0.351, "mean_token_accuracy": 0.8873049020767212, "num_tokens": 846233078.0, "step": 22177 }, { "epoch": 2.821269558580333, "ewc_loss": 0.0348166897892952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48166904586833e-05, "grad_norm": 19.7213134765625, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.87507563829422, "num_tokens": 846269030.0, "step": 22178 }, { "epoch": 2.8213967688589237, "ewc_loss": 0.03484120965003967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4841210435843095e-05, "grad_norm": 19.743080139160156, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8734826445579529, "num_tokens": 846307269.0, "step": 22179 }, { "epoch": 2.821523979137514, "ewc_loss": 0.03482376039028168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4823759051505476e-05, "grad_norm": 19.749996185302734, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8666814565658569, "num_tokens": 846339116.0, "step": 22180 }, { "epoch": 2.8216511894161047, "ewc_loss": 0.03480919823050499, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4809199860319495e-05, "grad_norm": 19.55470848083496, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8676578998565674, "num_tokens": 846384968.0, "step": 22181 }, { "epoch": 2.8217783996946952, "ewc_loss": 0.03483163192868233, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483163163764402e-05, "grad_norm": 19.704946517944336, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8609365224838257, "num_tokens": 846426235.0, "step": 22182 }, { "epoch": 2.8219056099732858, "ewc_loss": 0.034868836402893066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486883724690415e-05, "grad_norm": 19.60856819152832, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8632161617279053, "num_tokens": 846465715.0, "step": 22183 }, { "epoch": 2.8220328202518763, "ewc_loss": 0.0348421186208725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484211993054487e-05, "grad_norm": 19.74412727355957, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8630579113960266, "num_tokens": 846499972.0, "step": 22184 }, { "epoch": 2.822160030530467, "ewc_loss": 0.03498523682355881, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4985238016815856e-05, "grad_norm": 19.675600051879883, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8751584887504578, "num_tokens": 846535906.0, "step": 22185 }, { "epoch": 2.8222872408090574, "ewc_loss": 0.0348554402589798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485543857095763e-05, "grad_norm": 19.67113494873047, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8712807297706604, "num_tokens": 846572723.0, "step": 22186 }, { "epoch": 2.822414451087648, "ewc_loss": 0.03489462658762932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489462687866762e-05, "grad_norm": 19.713808059692383, "learning_rate": 1e-06, "loss": 0.3757, "mean_token_accuracy": 0.8804904222488403, "num_tokens": 846613501.0, "step": 22187 }, { "epoch": 2.8225416613662384, "ewc_loss": 0.03486083447933197, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486083369352855e-05, "grad_norm": 19.64025115966797, "learning_rate": 1e-06, "loss": 0.333, "mean_token_accuracy": 0.8964914083480835, "num_tokens": 846650001.0, "step": 22188 }, { "epoch": 2.822668871644829, "ewc_loss": 0.03492817282676697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4928172681247815e-05, "grad_norm": 19.656797409057617, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8642174005508423, "num_tokens": 846690636.0, "step": 22189 }, { "epoch": 2.8227960819234195, "ewc_loss": 0.03493858873844147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.493858821457252e-05, "grad_norm": 19.712244033813477, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8549829721450806, "num_tokens": 846728513.0, "step": 22190 }, { "epoch": 2.82292329220201, "ewc_loss": 0.034894607961177826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489460868877359e-05, "grad_norm": 19.64677619934082, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8629064559936523, "num_tokens": 846771989.0, "step": 22191 }, { "epoch": 2.8230505024806005, "ewc_loss": 0.034950803965330124, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.495080454740673e-05, "grad_norm": 19.688304901123047, "learning_rate": 1e-06, "loss": 0.3369, "mean_token_accuracy": 0.8928880095481873, "num_tokens": 846808393.0, "step": 22192 }, { "epoch": 2.823177712759191, "ewc_loss": 0.03496027737855911, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49602778442204e-05, "grad_norm": 19.72758674621582, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8656539916992188, "num_tokens": 846846928.0, "step": 22193 }, { "epoch": 2.8233049230377816, "ewc_loss": 0.034938473254442215, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.493847179925069e-05, "grad_norm": 19.688426971435547, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8662173748016357, "num_tokens": 846884637.0, "step": 22194 }, { "epoch": 2.823432133316372, "ewc_loss": 0.03488895669579506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.488895526970737e-05, "grad_norm": 19.66429901123047, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8790819644927979, "num_tokens": 846925749.0, "step": 22195 }, { "epoch": 2.8235593435949626, "ewc_loss": 0.034972213208675385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.497221405268647e-05, "grad_norm": 19.784061431884766, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.861659049987793, "num_tokens": 846967191.0, "step": 22196 }, { "epoch": 2.8236865538735527, "ewc_loss": 0.0348631776869297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4863176551880315e-05, "grad_norm": 19.63011932373047, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.8812083005905151, "num_tokens": 847003280.0, "step": 22197 }, { "epoch": 2.8238137641521437, "ewc_loss": 0.034916240721940994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4916240110760555e-05, "grad_norm": 19.71567726135254, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8635717034339905, "num_tokens": 847042179.0, "step": 22198 }, { "epoch": 2.8239409744307338, "ewc_loss": 0.03494240343570709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.494240445434116e-05, "grad_norm": 19.76755142211914, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8710401058197021, "num_tokens": 847083008.0, "step": 22199 }, { "epoch": 2.8240681847093247, "ewc_loss": 0.03489869832992554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489869777695276e-05, "grad_norm": 19.76052474975586, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8553463220596313, "num_tokens": 847124009.0, "step": 22200 }, { "epoch": 2.824195394987915, "ewc_loss": 0.03487890213727951, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.487890353426337e-05, "grad_norm": 19.765504837036133, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8689578771591187, "num_tokens": 847163526.0, "step": 22201 }, { "epoch": 2.824322605266506, "ewc_loss": 0.03486107662320137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486107743810862e-05, "grad_norm": 19.70541763305664, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8741312623023987, "num_tokens": 847201374.0, "step": 22202 }, { "epoch": 2.824449815545096, "ewc_loss": 0.03488673269748688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4886732464656234e-05, "grad_norm": 19.785444259643555, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8616287708282471, "num_tokens": 847239469.0, "step": 22203 }, { "epoch": 2.8245770258236864, "ewc_loss": 0.034855905920267105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485590423224494e-05, "grad_norm": 19.696943283081055, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8734123706817627, "num_tokens": 847275964.0, "step": 22204 }, { "epoch": 2.824704236102277, "ewc_loss": 0.034840915352106094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484091575955972e-05, "grad_norm": 19.80171775817871, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8767356872558594, "num_tokens": 847316815.0, "step": 22205 }, { "epoch": 2.8248314463808675, "ewc_loss": 0.034892141819000244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489214213914238e-05, "grad_norm": 19.69478416442871, "learning_rate": 1e-06, "loss": 0.3776, "mean_token_accuracy": 0.8801270127296448, "num_tokens": 847350949.0, "step": 22206 }, { "epoch": 2.824958656659458, "ewc_loss": 0.034805264323949814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.480526356725022e-05, "grad_norm": 19.779590606689453, "learning_rate": 1e-06, "loss": 0.3681, "mean_token_accuracy": 0.8798807859420776, "num_tokens": 847388945.0, "step": 22207 }, { "epoch": 2.8250858669380485, "ewc_loss": 0.034902505576610565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4902506740763783e-05, "grad_norm": 19.707056045532227, "learning_rate": 1e-06, "loss": 0.356, "mean_token_accuracy": 0.8866328001022339, "num_tokens": 847435114.0, "step": 22208 }, { "epoch": 2.825213077216639, "ewc_loss": 0.03482452407479286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4824523027054965e-05, "grad_norm": 19.709394454956055, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.867651104927063, "num_tokens": 847470815.0, "step": 22209 }, { "epoch": 2.8253402874952296, "ewc_loss": 0.03482789546251297, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482789543340914e-05, "grad_norm": 19.74148178100586, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8733901977539062, "num_tokens": 847509379.0, "step": 22210 }, { "epoch": 2.82546749777382, "ewc_loss": 0.03478783369064331, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4787834010785446e-05, "grad_norm": 19.692859649658203, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8785147666931152, "num_tokens": 847547427.0, "step": 22211 }, { "epoch": 2.8255947080524106, "ewc_loss": 0.03479442745447159, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4794426028383896e-05, "grad_norm": 19.63059425354004, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8589939475059509, "num_tokens": 847584740.0, "step": 22212 }, { "epoch": 2.825721918331001, "ewc_loss": 0.03481130674481392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48113062500488e-05, "grad_norm": 19.72047996520996, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8640657663345337, "num_tokens": 847626201.0, "step": 22213 }, { "epoch": 2.8258491286095917, "ewc_loss": 0.03490738570690155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4907385270344093e-05, "grad_norm": 19.738601684570312, "learning_rate": 1e-06, "loss": 0.3494, "mean_token_accuracy": 0.8897943496704102, "num_tokens": 847661311.0, "step": 22214 }, { "epoch": 2.8259763388881822, "ewc_loss": 0.034863606095314026, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486360583337955e-05, "grad_norm": 19.782360076904297, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.873161792755127, "num_tokens": 847695234.0, "step": 22215 }, { "epoch": 2.8261035491667728, "ewc_loss": 0.034867823123931885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486782225081697e-05, "grad_norm": 19.78390884399414, "learning_rate": 1e-06, "loss": 0.3349, "mean_token_accuracy": 0.8916715979576111, "num_tokens": 847738848.0, "step": 22216 }, { "epoch": 2.8262307594453633, "ewc_loss": 0.034794073551893234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479407314443961e-05, "grad_norm": 19.72364044189453, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8783705234527588, "num_tokens": 847775952.0, "step": 22217 }, { "epoch": 2.826357969723954, "ewc_loss": 0.0347558818757534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.475588164292276e-05, "grad_norm": 19.731319427490234, "learning_rate": 1e-06, "loss": 0.3444, "mean_token_accuracy": 0.8908401727676392, "num_tokens": 847809381.0, "step": 22218 }, { "epoch": 2.8264851800025443, "ewc_loss": 0.03477184846997261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4771848731907085e-05, "grad_norm": 19.680482864379883, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8711639046669006, "num_tokens": 847849835.0, "step": 22219 }, { "epoch": 2.826612390281135, "ewc_loss": 0.03475812077522278, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.475812263786793e-05, "grad_norm": 19.79563331604004, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8735970854759216, "num_tokens": 847882726.0, "step": 22220 }, { "epoch": 2.8267396005597254, "ewc_loss": 0.03478773310780525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478773214737885e-05, "grad_norm": 19.7268123626709, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8748531937599182, "num_tokens": 847920663.0, "step": 22221 }, { "epoch": 2.8268668108383155, "ewc_loss": 0.03472500294446945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4725002478808165e-05, "grad_norm": 19.73086929321289, "learning_rate": 1e-06, "loss": 0.3663, "mean_token_accuracy": 0.883129894733429, "num_tokens": 847962204.0, "step": 22222 }, { "epoch": 2.8269940211169065, "ewc_loss": 0.03473618999123573, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.473618926363997e-05, "grad_norm": 19.82646369934082, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8688880205154419, "num_tokens": 848003722.0, "step": 22223 }, { "epoch": 2.8271212313954965, "ewc_loss": 0.034742727875709534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4742726711556315e-05, "grad_norm": 19.72119140625, "learning_rate": 1e-06, "loss": 0.3743, "mean_token_accuracy": 0.8839930891990662, "num_tokens": 848045335.0, "step": 22224 }, { "epoch": 2.8272484416740875, "ewc_loss": 0.034720707684755325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.472070602583699e-05, "grad_norm": 19.807722091674805, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8674271702766418, "num_tokens": 848088817.0, "step": 22225 }, { "epoch": 2.8273756519526776, "ewc_loss": 0.03476347029209137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.476347046671435e-05, "grad_norm": 19.681821823120117, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8675670027732849, "num_tokens": 848120993.0, "step": 22226 }, { "epoch": 2.827502862231268, "ewc_loss": 0.03464984893798828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4649849112611264e-05, "grad_norm": 19.698246002197266, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8680058121681213, "num_tokens": 848162542.0, "step": 22227 }, { "epoch": 2.8276300725098586, "ewc_loss": 0.0348050594329834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4805059840437025e-05, "grad_norm": 19.746511459350586, "learning_rate": 1e-06, "loss": 0.3498, "mean_token_accuracy": 0.889543354511261, "num_tokens": 848198194.0, "step": 22228 }, { "epoch": 2.827757282788449, "ewc_loss": 0.03474920243024826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.474920231383294e-05, "grad_norm": 19.722097396850586, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.870182991027832, "num_tokens": 848240575.0, "step": 22229 }, { "epoch": 2.8278844930670397, "ewc_loss": 0.034777019172906876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.477701829979196e-05, "grad_norm": 19.724504470825195, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.8749396800994873, "num_tokens": 848289026.0, "step": 22230 }, { "epoch": 2.8280117033456302, "ewc_loss": 0.03475276380777359, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.475276389508508e-05, "grad_norm": 19.713655471801758, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8772068619728088, "num_tokens": 848326438.0, "step": 22231 }, { "epoch": 2.8281389136242208, "ewc_loss": 0.03483545407652855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4835455153370276e-05, "grad_norm": 19.76359748840332, "learning_rate": 1e-06, "loss": 0.3734, "mean_token_accuracy": 0.8816810846328735, "num_tokens": 848368106.0, "step": 22232 }, { "epoch": 2.8282661239028113, "ewc_loss": 0.034772519022226334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.477251812000759e-05, "grad_norm": 19.719558715820312, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8713936805725098, "num_tokens": 848409775.0, "step": 22233 }, { "epoch": 2.828393334181402, "ewc_loss": 0.034775882959365845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.477588325040415e-05, "grad_norm": 19.818361282348633, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8757306933403015, "num_tokens": 848442149.0, "step": 22234 }, { "epoch": 2.8285205444599923, "ewc_loss": 0.03473954275250435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.473954348010011e-05, "grad_norm": 19.805496215820312, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8696855306625366, "num_tokens": 848475468.0, "step": 22235 }, { "epoch": 2.828647754738583, "ewc_loss": 0.03474382683634758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.474382538115606e-05, "grad_norm": 19.74808120727539, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8762122392654419, "num_tokens": 848509565.0, "step": 22236 }, { "epoch": 2.8287749650171734, "ewc_loss": 0.034777600318193436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4777600376401097e-05, "grad_norm": 19.70868492126465, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8763326406478882, "num_tokens": 848546357.0, "step": 22237 }, { "epoch": 2.828902175295764, "ewc_loss": 0.03478109464049339, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4781092836055905e-05, "grad_norm": 19.780797958374023, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8635316491127014, "num_tokens": 848587661.0, "step": 22238 }, { "epoch": 2.8290293855743545, "ewc_loss": 0.03482547774910927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4825476177502424e-05, "grad_norm": 19.706531524658203, "learning_rate": 1e-06, "loss": 0.3538, "mean_token_accuracy": 0.8793475031852722, "num_tokens": 848628250.0, "step": 22239 }, { "epoch": 2.829156595852945, "ewc_loss": 0.03477789834141731, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.477789869066328e-05, "grad_norm": 19.787080764770508, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8757721185684204, "num_tokens": 848670452.0, "step": 22240 }, { "epoch": 2.8292838061315355, "ewc_loss": 0.0348280593752861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482805914245546e-05, "grad_norm": 19.765718460083008, "learning_rate": 1e-06, "loss": 0.3817, "mean_token_accuracy": 0.8770195841789246, "num_tokens": 848703592.0, "step": 22241 }, { "epoch": 2.829411016410126, "ewc_loss": 0.03478747233748436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4787473850883543e-05, "grad_norm": 19.77016830444336, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8595726490020752, "num_tokens": 848739291.0, "step": 22242 }, { "epoch": 2.8295382266887166, "ewc_loss": 0.034880831837654114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.488083166303113e-05, "grad_norm": 19.6872501373291, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.869308352470398, "num_tokens": 848776039.0, "step": 22243 }, { "epoch": 2.829665436967307, "ewc_loss": 0.03488980606198311, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4889806556748226e-05, "grad_norm": 19.751953125, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8778039216995239, "num_tokens": 848814799.0, "step": 22244 }, { "epoch": 2.8297926472458976, "ewc_loss": 0.03490721806883812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.490721792331897e-05, "grad_norm": 19.67955207824707, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8765395879745483, "num_tokens": 848860123.0, "step": 22245 }, { "epoch": 2.829919857524488, "ewc_loss": 0.03485973924398422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485973866190761e-05, "grad_norm": 19.747528076171875, "learning_rate": 1e-06, "loss": 0.3323, "mean_token_accuracy": 0.8923364281654358, "num_tokens": 848900447.0, "step": 22246 }, { "epoch": 2.8300470678030782, "ewc_loss": 0.03488081693649292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48808171111159e-05, "grad_norm": 19.722251892089844, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8695180416107178, "num_tokens": 848945385.0, "step": 22247 }, { "epoch": 2.830174278081669, "ewc_loss": 0.03480716422200203, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4807162592187524e-05, "grad_norm": 19.80552864074707, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8654926419258118, "num_tokens": 848978783.0, "step": 22248 }, { "epoch": 2.8303014883602593, "ewc_loss": 0.03481603041291237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4816031984519213e-05, "grad_norm": 19.661968231201172, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.86365807056427, "num_tokens": 849017324.0, "step": 22249 }, { "epoch": 2.8304286986388503, "ewc_loss": 0.03481592983007431, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4815930121112615e-05, "grad_norm": 19.788793563842773, "learning_rate": 1e-06, "loss": 0.3349, "mean_token_accuracy": 0.8945719599723816, "num_tokens": 849045928.0, "step": 22250 }, { "epoch": 2.8305559089174404, "ewc_loss": 0.0349246971309185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492469841148704e-05, "grad_norm": 19.787281036376953, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8749312162399292, "num_tokens": 849080855.0, "step": 22251 }, { "epoch": 2.830683119196031, "ewc_loss": 0.03478120639920235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4781205613398924e-05, "grad_norm": 19.751020431518555, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8776381015777588, "num_tokens": 849114835.0, "step": 22252 }, { "epoch": 2.8308103294746214, "ewc_loss": 0.034851010888814926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48510111507494e-05, "grad_norm": 19.778560638427734, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8643054962158203, "num_tokens": 849152632.0, "step": 22253 }, { "epoch": 2.830937539753212, "ewc_loss": 0.03485150262713432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485150227788836e-05, "grad_norm": 19.735857009887695, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8840590119361877, "num_tokens": 849189911.0, "step": 22254 }, { "epoch": 2.8310647500318025, "ewc_loss": 0.03483062982559204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483063119347207e-05, "grad_norm": 19.791748046875, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.874039888381958, "num_tokens": 849227847.0, "step": 22255 }, { "epoch": 2.831191960310393, "ewc_loss": 0.034858811646699905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4858810977311805e-05, "grad_norm": 19.703018188476562, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.8799521923065186, "num_tokens": 849265778.0, "step": 22256 }, { "epoch": 2.8313191705889835, "ewc_loss": 0.034796617925167084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479661972960457e-05, "grad_norm": 19.7836856842041, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.8750640749931335, "num_tokens": 849302488.0, "step": 22257 }, { "epoch": 2.831446380867574, "ewc_loss": 0.03487224504351616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4872246033046395e-05, "grad_norm": 19.70684814453125, "learning_rate": 1e-06, "loss": 0.3403, "mean_token_accuracy": 0.8902891278266907, "num_tokens": 849337483.0, "step": 22258 }, { "epoch": 2.8315735911461646, "ewc_loss": 0.03479794040322304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4797940315911546e-05, "grad_norm": 19.64435577392578, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8809401988983154, "num_tokens": 849378491.0, "step": 22259 }, { "epoch": 2.831700801424755, "ewc_loss": 0.03483998775482178, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483998807496391e-05, "grad_norm": 19.72515869140625, "learning_rate": 1e-06, "loss": 0.3721, "mean_token_accuracy": 0.8823219537734985, "num_tokens": 849418897.0, "step": 22260 }, { "epoch": 2.8318280117033456, "ewc_loss": 0.034930724650621414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.493072290439159e-05, "grad_norm": 19.824581146240234, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8801758885383606, "num_tokens": 849457057.0, "step": 22261 }, { "epoch": 2.831955221981936, "ewc_loss": 0.03478598967194557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478598955553025e-05, "grad_norm": 19.728994369506836, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8792898654937744, "num_tokens": 849488825.0, "step": 22262 }, { "epoch": 2.8320824322605267, "ewc_loss": 0.03479204699397087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479204679024406e-05, "grad_norm": 19.779972076416016, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.8814544081687927, "num_tokens": 849528336.0, "step": 22263 }, { "epoch": 2.8322096425391172, "ewc_loss": 0.034868545830249786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486854620859958e-05, "grad_norm": 19.778667449951172, "learning_rate": 1e-06, "loss": 0.3543, "mean_token_accuracy": 0.8883825540542603, "num_tokens": 849565373.0, "step": 22264 }, { "epoch": 2.8323368528177078, "ewc_loss": 0.034862443804740906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486244531814009e-05, "grad_norm": 19.829145431518555, "learning_rate": 1e-06, "loss": 0.3464, "mean_token_accuracy": 0.887041449546814, "num_tokens": 849600909.0, "step": 22265 }, { "epoch": 2.8324640630962983, "ewc_loss": 0.034794881939888, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479488077573478e-05, "grad_norm": 19.72823715209961, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8663561940193176, "num_tokens": 849639499.0, "step": 22266 }, { "epoch": 2.832591273374889, "ewc_loss": 0.03481318801641464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.481318708509207e-05, "grad_norm": 19.845333099365234, "learning_rate": 1e-06, "loss": 0.3515, "mean_token_accuracy": 0.8865582942962646, "num_tokens": 849677232.0, "step": 22267 }, { "epoch": 2.8327184836534793, "ewc_loss": 0.0348847433924675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4884742490248755e-05, "grad_norm": 19.757915496826172, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.8817734718322754, "num_tokens": 849717187.0, "step": 22268 }, { "epoch": 2.83284569393207, "ewc_loss": 0.03472602739930153, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4726028388831764e-05, "grad_norm": 19.748149871826172, "learning_rate": 1e-06, "loss": 0.3582, "mean_token_accuracy": 0.8862638473510742, "num_tokens": 849756545.0, "step": 22269 }, { "epoch": 2.83297290421066, "ewc_loss": 0.03475738689303398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4757387766148895e-05, "grad_norm": 19.692886352539062, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.8801487684249878, "num_tokens": 849788670.0, "step": 22270 }, { "epoch": 2.833100114489251, "ewc_loss": 0.03476981818675995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.476981873973273e-05, "grad_norm": 19.713054656982422, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8748968839645386, "num_tokens": 849825715.0, "step": 22271 }, { "epoch": 2.833227324767841, "ewc_loss": 0.034858424216508865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485842535155825e-05, "grad_norm": 19.72644805908203, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.8818375468254089, "num_tokens": 849856435.0, "step": 22272 }, { "epoch": 2.833354535046432, "ewc_loss": 0.034827373921871185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4827375202439725e-05, "grad_norm": 19.750741958618164, "learning_rate": 1e-06, "loss": 0.3441, "mean_token_accuracy": 0.8905600309371948, "num_tokens": 849895634.0, "step": 22273 }, { "epoch": 2.833481745325022, "ewc_loss": 0.03479773923754692, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4797740227077156e-05, "grad_norm": 19.691946029663086, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8549323678016663, "num_tokens": 849938646.0, "step": 22274 }, { "epoch": 2.833608955603613, "ewc_loss": 0.034826043993234634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482604370219633e-05, "grad_norm": 19.717395782470703, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8719616532325745, "num_tokens": 849978642.0, "step": 22275 }, { "epoch": 2.833736165882203, "ewc_loss": 0.03485800325870514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485800334601663e-05, "grad_norm": 19.759723663330078, "learning_rate": 1e-06, "loss": 0.3914, "mean_token_accuracy": 0.8748652935028076, "num_tokens": 850017820.0, "step": 22276 }, { "epoch": 2.8338633761607936, "ewc_loss": 0.034836072474718094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483607360976748e-05, "grad_norm": 19.696170806884766, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8567669987678528, "num_tokens": 850056619.0, "step": 22277 }, { "epoch": 2.833990586439384, "ewc_loss": 0.0348498597741127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484986154944636e-05, "grad_norm": 19.743730545043945, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8742088079452515, "num_tokens": 850092456.0, "step": 22278 }, { "epoch": 2.8341177967179747, "ewc_loss": 0.0349002406001091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4900240279966965e-05, "grad_norm": 19.718082427978516, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8726009130477905, "num_tokens": 850131598.0, "step": 22279 }, { "epoch": 2.8342450069965652, "ewc_loss": 0.034837283194065094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483728505671024e-05, "grad_norm": 19.747655868530273, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8656747937202454, "num_tokens": 850170884.0, "step": 22280 }, { "epoch": 2.8343722172751558, "ewc_loss": 0.03489953652024269, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48995381500572e-05, "grad_norm": 19.755908966064453, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8757490515708923, "num_tokens": 850206989.0, "step": 22281 }, { "epoch": 2.8344994275537463, "ewc_loss": 0.03485262766480446, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485262641333975e-05, "grad_norm": 19.737585067749023, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8723499774932861, "num_tokens": 850248602.0, "step": 22282 }, { "epoch": 2.834626637832337, "ewc_loss": 0.0349227599799633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492275936878286e-05, "grad_norm": 19.718278884887695, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8687195181846619, "num_tokens": 850289790.0, "step": 22283 }, { "epoch": 2.8347538481109273, "ewc_loss": 0.034882545471191406, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.488254515104927e-05, "grad_norm": 19.733821868896484, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.8834575414657593, "num_tokens": 850326688.0, "step": 22284 }, { "epoch": 2.834881058389518, "ewc_loss": 0.0349198654294014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4919867175631225e-05, "grad_norm": 19.7694034576416, "learning_rate": 1e-06, "loss": 0.3554, "mean_token_accuracy": 0.8868775963783264, "num_tokens": 850366460.0, "step": 22285 }, { "epoch": 2.8350082686681084, "ewc_loss": 0.034852270036935806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4852269891416654e-05, "grad_norm": 19.692766189575195, "learning_rate": 1e-06, "loss": 0.3674, "mean_token_accuracy": 0.8809236884117126, "num_tokens": 850400046.0, "step": 22286 }, { "epoch": 2.835135478946699, "ewc_loss": 0.03487405553460121, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.487405410851352e-05, "grad_norm": 19.778270721435547, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8673715591430664, "num_tokens": 850434374.0, "step": 22287 }, { "epoch": 2.8352626892252895, "ewc_loss": 0.034919414669275284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4919416066259146e-05, "grad_norm": 19.799753189086914, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8773016929626465, "num_tokens": 850469056.0, "step": 22288 }, { "epoch": 2.83538989950388, "ewc_loss": 0.034821826964616776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482182728475891e-05, "grad_norm": 19.62824821472168, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8734433650970459, "num_tokens": 850505805.0, "step": 22289 }, { "epoch": 2.8355171097824705, "ewc_loss": 0.034880995750427246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.488099537207745e-05, "grad_norm": 19.726089477539062, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8639214038848877, "num_tokens": 850550056.0, "step": 22290 }, { "epoch": 2.835644320061061, "ewc_loss": 0.03497191518545151, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4971915738424286e-05, "grad_norm": 19.768047332763672, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.8823632001876831, "num_tokens": 850585075.0, "step": 22291 }, { "epoch": 2.8357715303396516, "ewc_loss": 0.03486742451786995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4867425711127e-05, "grad_norm": 19.68950653076172, "learning_rate": 1e-06, "loss": 0.3583, "mean_token_accuracy": 0.8850399255752563, "num_tokens": 850625758.0, "step": 22292 }, { "epoch": 2.835898740618242, "ewc_loss": 0.034889932721853256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4889933886006474e-05, "grad_norm": 19.684913635253906, "learning_rate": 1e-06, "loss": 0.3397, "mean_token_accuracy": 0.8881391882896423, "num_tokens": 850664608.0, "step": 22293 }, { "epoch": 2.8360259508968326, "ewc_loss": 0.03486248478293419, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486248533590697e-05, "grad_norm": 19.702646255493164, "learning_rate": 1e-06, "loss": 0.3671, "mean_token_accuracy": 0.8823712468147278, "num_tokens": 850705897.0, "step": 22294 }, { "epoch": 2.8361531611754227, "ewc_loss": 0.03495625779032707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4956257877638564e-05, "grad_norm": 19.803499221801758, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8705447912216187, "num_tokens": 850746195.0, "step": 22295 }, { "epoch": 2.8362803714540137, "ewc_loss": 0.03490005061030388, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4900051105068997e-05, "grad_norm": 19.685230255126953, "learning_rate": 1e-06, "loss": 0.3675, "mean_token_accuracy": 0.8831707835197449, "num_tokens": 850785183.0, "step": 22296 }, { "epoch": 2.8364075817326038, "ewc_loss": 0.034831441938877106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4831442462746054e-05, "grad_norm": 19.68468475341797, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8811619281768799, "num_tokens": 850819558.0, "step": 22297 }, { "epoch": 2.8365347920111947, "ewc_loss": 0.03486941009759903, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486940840957686e-05, "grad_norm": 19.734689712524414, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8663004040718079, "num_tokens": 850857385.0, "step": 22298 }, { "epoch": 2.836662002289785, "ewc_loss": 0.0348924957215786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489249502308667e-05, "grad_norm": 19.73661994934082, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8711755275726318, "num_tokens": 850895297.0, "step": 22299 }, { "epoch": 2.836789212568376, "ewc_loss": 0.03487388417124748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4873883123509586e-05, "grad_norm": 19.74253273010254, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8743889927864075, "num_tokens": 850930982.0, "step": 22300 }, { "epoch": 2.836916422846966, "ewc_loss": 0.03489036113023758, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4890359529526904e-05, "grad_norm": 19.848926544189453, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8720759153366089, "num_tokens": 850969032.0, "step": 22301 }, { "epoch": 2.8370436331255564, "ewc_loss": 0.034880489110946655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4880489693023264e-05, "grad_norm": 19.640132904052734, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8794068098068237, "num_tokens": 851008325.0, "step": 22302 }, { "epoch": 2.837170843404147, "ewc_loss": 0.034783363342285156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478336293483153e-05, "grad_norm": 19.787782669067383, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8776060342788696, "num_tokens": 851044289.0, "step": 22303 }, { "epoch": 2.8372980536827375, "ewc_loss": 0.03491563722491264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.491563620627858e-05, "grad_norm": 19.681800842285156, "learning_rate": 1e-06, "loss": 0.369, "mean_token_accuracy": 0.8821170330047607, "num_tokens": 851079887.0, "step": 22304 }, { "epoch": 2.837425263961328, "ewc_loss": 0.03480890020728111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4808901546057314e-05, "grad_norm": 19.733116149902344, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.8793612122535706, "num_tokens": 851124535.0, "step": 22305 }, { "epoch": 2.8375524742399185, "ewc_loss": 0.034889426082372665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.488942456897348e-05, "grad_norm": 19.69164276123047, "learning_rate": 1e-06, "loss": 0.355, "mean_token_accuracy": 0.883998692035675, "num_tokens": 851162982.0, "step": 22306 }, { "epoch": 2.837679684518509, "ewc_loss": 0.03485359624028206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485359775368124e-05, "grad_norm": 19.75617027282715, "learning_rate": 1e-06, "loss": 0.3571, "mean_token_accuracy": 0.8840665817260742, "num_tokens": 851201864.0, "step": 22307 }, { "epoch": 2.8378068947970996, "ewc_loss": 0.03492826223373413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492826363071799e-05, "grad_norm": 19.681913375854492, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8798062801361084, "num_tokens": 851241659.0, "step": 22308 }, { "epoch": 2.83793410507569, "ewc_loss": 0.03485250100493431, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485250272206031e-05, "grad_norm": 19.773494720458984, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8682214021682739, "num_tokens": 851278861.0, "step": 22309 }, { "epoch": 2.8380613153542806, "ewc_loss": 0.0349092073738575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4909207897726446e-05, "grad_norm": 19.684316635131836, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8780868053436279, "num_tokens": 851314442.0, "step": 22310 }, { "epoch": 2.838188525632871, "ewc_loss": 0.03482981398701668, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482981264824048e-05, "grad_norm": 19.776771545410156, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8716332316398621, "num_tokens": 851356322.0, "step": 22311 }, { "epoch": 2.8383157359114617, "ewc_loss": 0.034880075603723526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4880074963439256e-05, "grad_norm": 19.669940948486328, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8642143607139587, "num_tokens": 851395374.0, "step": 22312 }, { "epoch": 2.838442946190052, "ewc_loss": 0.034836746752262115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4836746635846794e-05, "grad_norm": 19.777475357055664, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8743880987167358, "num_tokens": 851430957.0, "step": 22313 }, { "epoch": 2.8385701564686427, "ewc_loss": 0.03488257899880409, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4882577892858535e-05, "grad_norm": 19.803760528564453, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8705444931983948, "num_tokens": 851474357.0, "step": 22314 }, { "epoch": 2.8386973667472333, "ewc_loss": 0.034842561930418015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4842560125980526e-05, "grad_norm": 19.80628776550293, "learning_rate": 1e-06, "loss": 0.3468, "mean_token_accuracy": 0.8882753849029541, "num_tokens": 851510346.0, "step": 22315 }, { "epoch": 2.838824577025824, "ewc_loss": 0.03480196371674538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.480196392047219e-05, "grad_norm": 19.725980758666992, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8773672580718994, "num_tokens": 851546798.0, "step": 22316 }, { "epoch": 2.8389517873044143, "ewc_loss": 0.03479010611772537, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.479010774753988e-05, "grad_norm": 19.863040924072266, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8809826374053955, "num_tokens": 851587819.0, "step": 22317 }, { "epoch": 2.839078997583005, "ewc_loss": 0.03483596071600914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.483596083242446e-05, "grad_norm": 19.650657653808594, "learning_rate": 1e-06, "loss": 0.3767, "mean_token_accuracy": 0.880764365196228, "num_tokens": 851628764.0, "step": 22318 }, { "epoch": 2.8392062078615954, "ewc_loss": 0.03471797704696655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.471797754173167e-05, "grad_norm": 19.77123260498047, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8631147146224976, "num_tokens": 851672206.0, "step": 22319 }, { "epoch": 2.8393334181401855, "ewc_loss": 0.034867484122514725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486748391878791e-05, "grad_norm": 19.71937370300293, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8703615665435791, "num_tokens": 851709969.0, "step": 22320 }, { "epoch": 2.8394606284187764, "ewc_loss": 0.03475324064493179, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.475324047030881e-05, "grad_norm": 19.774389266967773, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8694727420806885, "num_tokens": 851749015.0, "step": 22321 }, { "epoch": 2.8395878386973665, "ewc_loss": 0.034820567816495895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482056854409166e-05, "grad_norm": 19.753131866455078, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8802818655967712, "num_tokens": 851787210.0, "step": 22322 }, { "epoch": 2.8397150489759575, "ewc_loss": 0.034803569316864014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.480356826912612e-05, "grad_norm": 19.766725540161133, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8706574440002441, "num_tokens": 851824677.0, "step": 22323 }, { "epoch": 2.8398422592545476, "ewc_loss": 0.03478189930319786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478190046735108e-05, "grad_norm": 19.768760681152344, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8809018135070801, "num_tokens": 851863614.0, "step": 22324 }, { "epoch": 2.839969469533138, "ewc_loss": 0.034816887229681015, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.481688690953888e-05, "grad_norm": 19.747724533081055, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8694438934326172, "num_tokens": 851908153.0, "step": 22325 }, { "epoch": 2.8400966798117286, "ewc_loss": 0.034825555980205536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.482555621303618e-05, "grad_norm": 19.85567855834961, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8718208074569702, "num_tokens": 851946473.0, "step": 22326 }, { "epoch": 2.840223890090319, "ewc_loss": 0.03482906147837639, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4829063224606216e-05, "grad_norm": 19.713674545288086, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8653655648231506, "num_tokens": 851991600.0, "step": 22327 }, { "epoch": 2.8403511003689097, "ewc_loss": 0.03470912203192711, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.470912270131521e-05, "grad_norm": 19.79934310913086, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8776044845581055, "num_tokens": 852029468.0, "step": 22328 }, { "epoch": 2.8404783106475002, "ewc_loss": 0.03477156162261963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4771561331581324e-05, "grad_norm": 19.62626075744629, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.8823554515838623, "num_tokens": 852061400.0, "step": 22329 }, { "epoch": 2.8406055209260908, "ewc_loss": 0.034771062433719635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.477106292848475e-05, "grad_norm": 19.862468719482422, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8747130036354065, "num_tokens": 852100596.0, "step": 22330 }, { "epoch": 2.8407327312046813, "ewc_loss": 0.034877896308898926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.487789581413381e-05, "grad_norm": 19.700469970703125, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8781651258468628, "num_tokens": 852140320.0, "step": 22331 }, { "epoch": 2.840859941483272, "ewc_loss": 0.034727346152067184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.472734533715993e-05, "grad_norm": 19.7457275390625, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8742568492889404, "num_tokens": 852175969.0, "step": 22332 }, { "epoch": 2.8409871517618623, "ewc_loss": 0.03486926853656769, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4869270166382194e-05, "grad_norm": 19.792999267578125, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8621581792831421, "num_tokens": 852210816.0, "step": 22333 }, { "epoch": 2.841114362040453, "ewc_loss": 0.034783266484737396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4783268347382545e-05, "grad_norm": 19.78038215637207, "learning_rate": 1e-06, "loss": 0.3914, "mean_token_accuracy": 0.8746485114097595, "num_tokens": 852246684.0, "step": 22334 }, { "epoch": 2.8412415723190434, "ewc_loss": 0.03488389402627945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48838948411867e-05, "grad_norm": 19.764225006103516, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8790425062179565, "num_tokens": 852285283.0, "step": 22335 }, { "epoch": 2.841368782597634, "ewc_loss": 0.03478433936834335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.478433791315183e-05, "grad_norm": 19.743406295776367, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.8843013644218445, "num_tokens": 852323310.0, "step": 22336 }, { "epoch": 2.8414959928762245, "ewc_loss": 0.03485865518450737, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48586545442231e-05, "grad_norm": 19.7867374420166, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8797968626022339, "num_tokens": 852360850.0, "step": 22337 }, { "epoch": 2.841623203154815, "ewc_loss": 0.03483697026968002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4836968552554026e-05, "grad_norm": 19.745803833007812, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8695629835128784, "num_tokens": 852399320.0, "step": 22338 }, { "epoch": 2.8417504134334055, "ewc_loss": 0.034811265766620636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.481126623228192e-05, "grad_norm": 19.76786231994629, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8731855750083923, "num_tokens": 852430980.0, "step": 22339 }, { "epoch": 2.841877623711996, "ewc_loss": 0.03482239693403244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4822398447431624e-05, "grad_norm": 19.730791091918945, "learning_rate": 1e-06, "loss": 0.3607, "mean_token_accuracy": 0.8873960375785828, "num_tokens": 852465688.0, "step": 22340 }, { "epoch": 2.8420048339905866, "ewc_loss": 0.034853480756282806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4853481338359416e-05, "grad_norm": 19.693342208862305, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.875899076461792, "num_tokens": 852504372.0, "step": 22341 }, { "epoch": 2.842132044269177, "ewc_loss": 0.03481947258114815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.481947351247072e-05, "grad_norm": 19.727184295654297, "learning_rate": 1e-06, "loss": 0.3716, "mean_token_accuracy": 0.8797135949134827, "num_tokens": 852542232.0, "step": 22342 }, { "epoch": 2.8422592545477676, "ewc_loss": 0.03487497940659523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.487497815513052e-05, "grad_norm": 19.702558517456055, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8664337396621704, "num_tokens": 852580005.0, "step": 22343 }, { "epoch": 2.842386464826358, "ewc_loss": 0.03485060855746269, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485060733510181e-05, "grad_norm": 19.72888946533203, "learning_rate": 1e-06, "loss": 0.351, "mean_token_accuracy": 0.8865144848823547, "num_tokens": 852619912.0, "step": 22344 }, { "epoch": 2.8425136751049482, "ewc_loss": 0.034872911870479584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.487291178316809e-05, "grad_norm": 19.738468170166016, "learning_rate": 1e-06, "loss": 0.3756, "mean_token_accuracy": 0.8796936273574829, "num_tokens": 852654349.0, "step": 22345 }, { "epoch": 2.842640885383539, "ewc_loss": 0.03492824360728264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492824544082396e-05, "grad_norm": 19.803287506103516, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8680450916290283, "num_tokens": 852691300.0, "step": 22346 }, { "epoch": 2.8427680956621293, "ewc_loss": 0.034886714071035385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48867142747622e-05, "grad_norm": 19.71028709411621, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8738974928855896, "num_tokens": 852731228.0, "step": 22347 }, { "epoch": 2.8428953059407203, "ewc_loss": 0.03485671430826187, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485671550151892e-05, "grad_norm": 19.753108978271484, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.8746842741966248, "num_tokens": 852768333.0, "step": 22348 }, { "epoch": 2.8430225162193103, "ewc_loss": 0.034905314445495605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.490531526040286e-05, "grad_norm": 19.745344161987305, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8448376655578613, "num_tokens": 852810962.0, "step": 22349 }, { "epoch": 2.843149726497901, "ewc_loss": 0.034890107810497284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4890108508989215e-05, "grad_norm": 19.70738410949707, "learning_rate": 1e-06, "loss": 0.346, "mean_token_accuracy": 0.8922491073608398, "num_tokens": 852844326.0, "step": 22350 }, { "epoch": 2.8432769367764914, "ewc_loss": 0.034905001521110535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.490500239422545e-05, "grad_norm": 19.749557495117188, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8707459568977356, "num_tokens": 852877737.0, "step": 22351 }, { "epoch": 2.843404147055082, "ewc_loss": 0.03489280492067337, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489280425128527e-05, "grad_norm": 19.747331619262695, "learning_rate": 1e-06, "loss": 0.361, "mean_token_accuracy": 0.8831445574760437, "num_tokens": 852922085.0, "step": 22352 }, { "epoch": 2.8435313573336725, "ewc_loss": 0.03484461456537247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484461558400653e-05, "grad_norm": 19.676746368408203, "learning_rate": 1e-06, "loss": 0.3337, "mean_token_accuracy": 0.8914345502853394, "num_tokens": 852960742.0, "step": 22353 }, { "epoch": 2.843658567612263, "ewc_loss": 0.034897904843091965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489790469757281e-05, "grad_norm": 19.707508087158203, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.8804311752319336, "num_tokens": 852993502.0, "step": 22354 }, { "epoch": 2.8437857778908535, "ewc_loss": 0.03486737236380577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486737114144489e-05, "grad_norm": 19.67721939086914, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8804628849029541, "num_tokens": 853036844.0, "step": 22355 }, { "epoch": 2.843912988169444, "ewc_loss": 0.03485376387834549, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485376510070637e-05, "grad_norm": 19.70718765258789, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8754367828369141, "num_tokens": 853070851.0, "step": 22356 }, { "epoch": 2.8440401984480346, "ewc_loss": 0.034858252853155136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485825436655432e-05, "grad_norm": 19.688201904296875, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8763259053230286, "num_tokens": 853106116.0, "step": 22357 }, { "epoch": 2.844167408726625, "ewc_loss": 0.034849803894758224, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4849803341785446e-05, "grad_norm": 19.618820190429688, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8738117218017578, "num_tokens": 853152776.0, "step": 22358 }, { "epoch": 2.8442946190052156, "ewc_loss": 0.03488364443182945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.488364382064901e-05, "grad_norm": 19.731943130493164, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8746505975723267, "num_tokens": 853194176.0, "step": 22359 }, { "epoch": 2.844421829283806, "ewc_loss": 0.03493977338075638, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.493977419566363e-05, "grad_norm": 19.70866584777832, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8737117052078247, "num_tokens": 853236834.0, "step": 22360 }, { "epoch": 2.8445490395623967, "ewc_loss": 0.03482125699520111, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48212561220862e-05, "grad_norm": 19.72272491455078, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8672617673873901, "num_tokens": 853274145.0, "step": 22361 }, { "epoch": 2.844676249840987, "ewc_loss": 0.03489334508776665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4893346310127527e-05, "grad_norm": 19.64964485168457, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8786653280258179, "num_tokens": 853312569.0, "step": 22362 }, { "epoch": 2.8448034601195777, "ewc_loss": 0.03494974598288536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.494974589557387e-05, "grad_norm": 19.822635650634766, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.8809892535209656, "num_tokens": 853351742.0, "step": 22363 }, { "epoch": 2.8449306703981683, "ewc_loss": 0.034898996353149414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489899609121494e-05, "grad_norm": 19.64289093017578, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8778302073478699, "num_tokens": 853389607.0, "step": 22364 }, { "epoch": 2.845057880676759, "ewc_loss": 0.034852251410484314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485225170152262e-05, "grad_norm": 19.701519012451172, "learning_rate": 1e-06, "loss": 0.362, "mean_token_accuracy": 0.8835687637329102, "num_tokens": 853427218.0, "step": 22365 }, { "epoch": 2.8451850909553493, "ewc_loss": 0.034918297082185745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.491829556878656e-05, "grad_norm": 19.69563865661621, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8798446655273438, "num_tokens": 853464773.0, "step": 22366 }, { "epoch": 2.84531230123394, "ewc_loss": 0.034840524196624756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4840522857848555e-05, "grad_norm": 19.68752670288086, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.8761433362960815, "num_tokens": 853507250.0, "step": 22367 }, { "epoch": 2.84543951151253, "ewc_loss": 0.034912362694740295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4912362025352195e-05, "grad_norm": 19.661357879638672, "learning_rate": 1e-06, "loss": 0.3799, "mean_token_accuracy": 0.8769358992576599, "num_tokens": 853551947.0, "step": 22368 }, { "epoch": 2.845566721791121, "ewc_loss": 0.03484565019607544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484564876998775e-05, "grad_norm": 19.762794494628906, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8679664731025696, "num_tokens": 853590557.0, "step": 22369 }, { "epoch": 2.845693932069711, "ewc_loss": 0.034951191395521164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4951190173160285e-05, "grad_norm": 19.794214248657227, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8707990646362305, "num_tokens": 853630870.0, "step": 22370 }, { "epoch": 2.845821142348302, "ewc_loss": 0.03477248549461365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4772485378198326e-05, "grad_norm": 19.794649124145508, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8742820024490356, "num_tokens": 853662360.0, "step": 22371 }, { "epoch": 2.845948352626892, "ewc_loss": 0.03488558530807495, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4885586501332e-05, "grad_norm": 19.763639450073242, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8698431849479675, "num_tokens": 853703661.0, "step": 22372 }, { "epoch": 2.846075562905483, "ewc_loss": 0.034876998513936996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4876997233368456e-05, "grad_norm": 19.80575942993164, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8658136129379272, "num_tokens": 853742398.0, "step": 22373 }, { "epoch": 2.846202773184073, "ewc_loss": 0.03491164371371269, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49116453435272e-05, "grad_norm": 19.75379180908203, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8825690150260925, "num_tokens": 853781479.0, "step": 22374 }, { "epoch": 2.8463299834626636, "ewc_loss": 0.034840404987335205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484040644252673e-05, "grad_norm": 19.727195739746094, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8510613441467285, "num_tokens": 853822330.0, "step": 22375 }, { "epoch": 2.846457193741254, "ewc_loss": 0.03495043143630028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49504298355896e-05, "grad_norm": 19.786731719970703, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.863675057888031, "num_tokens": 853862747.0, "step": 22376 }, { "epoch": 2.8465844040198447, "ewc_loss": 0.03484266996383667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484266926534474e-05, "grad_norm": 19.716928482055664, "learning_rate": 1e-06, "loss": 0.3198, "mean_token_accuracy": 0.8981651067733765, "num_tokens": 853897124.0, "step": 22377 }, { "epoch": 2.8467116142984352, "ewc_loss": 0.03484354913234711, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4843549656216055e-05, "grad_norm": 19.71719741821289, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8625174760818481, "num_tokens": 853930516.0, "step": 22378 }, { "epoch": 2.8468388245770258, "ewc_loss": 0.034869808703660965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486980858724564e-05, "grad_norm": 19.73904037475586, "learning_rate": 1e-06, "loss": 0.364, "mean_token_accuracy": 0.8796041011810303, "num_tokens": 853967016.0, "step": 22379 }, { "epoch": 2.8469660348556163, "ewc_loss": 0.034867312759160995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486731293378398e-05, "grad_norm": 19.65349578857422, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8596594929695129, "num_tokens": 854008804.0, "step": 22380 }, { "epoch": 2.847093245134207, "ewc_loss": 0.03493765369057655, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4937653254019096e-05, "grad_norm": 19.76538848876953, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8633537888526917, "num_tokens": 854048393.0, "step": 22381 }, { "epoch": 2.8472204554127973, "ewc_loss": 0.03495776653289795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4957767638843507e-05, "grad_norm": 19.701311111450195, "learning_rate": 1e-06, "loss": 0.3649, "mean_token_accuracy": 0.8800802230834961, "num_tokens": 854089480.0, "step": 22382 }, { "epoch": 2.847347665691388, "ewc_loss": 0.03492116928100586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4921169572044164e-05, "grad_norm": 19.765377044677734, "learning_rate": 1e-06, "loss": 0.3538, "mean_token_accuracy": 0.8873641490936279, "num_tokens": 854125677.0, "step": 22383 }, { "epoch": 2.8474748759699784, "ewc_loss": 0.03494728356599808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.494728298392147e-05, "grad_norm": 19.691333770751953, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8704758882522583, "num_tokens": 854160485.0, "step": 22384 }, { "epoch": 2.847602086248569, "ewc_loss": 0.034921951591968536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492195173748769e-05, "grad_norm": 19.84362030029297, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8674707412719727, "num_tokens": 854200635.0, "step": 22385 }, { "epoch": 2.8477292965271594, "ewc_loss": 0.03494513779878616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4945136576425284e-05, "grad_norm": 19.691104888916016, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.879534125328064, "num_tokens": 854240046.0, "step": 22386 }, { "epoch": 2.84785650680575, "ewc_loss": 0.03484772518277168, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4847726055886596e-05, "grad_norm": 19.74148941040039, "learning_rate": 1e-06, "loss": 0.3542, "mean_token_accuracy": 0.8861525058746338, "num_tokens": 854275880.0, "step": 22387 }, { "epoch": 2.8479837170843405, "ewc_loss": 0.0349220372736454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492203904897906e-05, "grad_norm": 19.743080139160156, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.8765428066253662, "num_tokens": 854311204.0, "step": 22388 }, { "epoch": 2.848110927362931, "ewc_loss": 0.03489162400364876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489162554615177e-05, "grad_norm": 19.72109031677246, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8723461627960205, "num_tokens": 854346076.0, "step": 22389 }, { "epoch": 2.8482381376415216, "ewc_loss": 0.03493976593017578, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.493976691970602e-05, "grad_norm": 19.82986831665039, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.8823333978652954, "num_tokens": 854383075.0, "step": 22390 }, { "epoch": 2.848365347920112, "ewc_loss": 0.03485073521733284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485073466436006e-05, "grad_norm": 19.6851863861084, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8693296909332275, "num_tokens": 854416011.0, "step": 22391 }, { "epoch": 2.8484925581987026, "ewc_loss": 0.0348123200237751, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.481232124613598e-05, "grad_norm": 19.720844268798828, "learning_rate": 1e-06, "loss": 0.3664, "mean_token_accuracy": 0.877555787563324, "num_tokens": 854449520.0, "step": 22392 }, { "epoch": 2.8486197684772927, "ewc_loss": 0.03494561091065407, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4945609513670206e-05, "grad_norm": 19.64305877685547, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8730285167694092, "num_tokens": 854484355.0, "step": 22393 }, { "epoch": 2.8487469787558837, "ewc_loss": 0.034944064915180206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.494406337267719e-05, "grad_norm": 19.78508758544922, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8625592589378357, "num_tokens": 854526238.0, "step": 22394 }, { "epoch": 2.8488741890344738, "ewc_loss": 0.035032518208026886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.503251718939282e-05, "grad_norm": 19.754180908203125, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.8787562251091003, "num_tokens": 854562619.0, "step": 22395 }, { "epoch": 2.8490013993130647, "ewc_loss": 0.03492313250899315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4923134080599993e-05, "grad_norm": 19.7321834564209, "learning_rate": 1e-06, "loss": 0.3664, "mean_token_accuracy": 0.8807042837142944, "num_tokens": 854604974.0, "step": 22396 }, { "epoch": 2.849128609591655, "ewc_loss": 0.0349828265607357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4982826036866754e-05, "grad_norm": 19.773643493652344, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8806425333023071, "num_tokens": 854644066.0, "step": 22397 }, { "epoch": 2.849255819870246, "ewc_loss": 0.034957487136125565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.495748751447536e-05, "grad_norm": 19.77581787109375, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8783521056175232, "num_tokens": 854682723.0, "step": 22398 }, { "epoch": 2.849383030148836, "ewc_loss": 0.03498150780797005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498150908853859e-05, "grad_norm": 19.723913192749023, "learning_rate": 1e-06, "loss": 0.3613, "mean_token_accuracy": 0.8852857947349548, "num_tokens": 854720292.0, "step": 22399 }, { "epoch": 2.8495102404274264, "ewc_loss": 0.03487281873822212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.487281719571911e-05, "grad_norm": 19.73774528503418, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.871843695640564, "num_tokens": 854756917.0, "step": 22400 }, { "epoch": 2.849637450706017, "ewc_loss": 0.03495931997895241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4959321055794135e-05, "grad_norm": 19.768207550048828, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8774669170379639, "num_tokens": 854798771.0, "step": 22401 }, { "epoch": 2.8497646609846075, "ewc_loss": 0.03495025262236595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4950251574628055e-05, "grad_norm": 19.647953033447266, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8726101517677307, "num_tokens": 854838193.0, "step": 22402 }, { "epoch": 2.849891871263198, "ewc_loss": 0.03492061421275139, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492061296128668e-05, "grad_norm": 19.838361740112305, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8691301345825195, "num_tokens": 854871650.0, "step": 22403 }, { "epoch": 2.8500190815417885, "ewc_loss": 0.034984588623046875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4984590456588194e-05, "grad_norm": 19.725173950195312, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8728798031806946, "num_tokens": 854910757.0, "step": 22404 }, { "epoch": 2.850146291820379, "ewc_loss": 0.03486896678805351, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4868968214141205e-05, "grad_norm": 19.760639190673828, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8634359836578369, "num_tokens": 854945018.0, "step": 22405 }, { "epoch": 2.8502735020989696, "ewc_loss": 0.034960392862558365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.496039425954223e-05, "grad_norm": 19.741188049316406, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8674939274787903, "num_tokens": 854988369.0, "step": 22406 }, { "epoch": 2.85040071237756, "ewc_loss": 0.03488761559128761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.488761649350636e-05, "grad_norm": 19.756120681762695, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8780791163444519, "num_tokens": 855023935.0, "step": 22407 }, { "epoch": 2.8505279226561506, "ewc_loss": 0.03491514176130295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.491514144116081e-05, "grad_norm": 19.707054138183594, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8748620748519897, "num_tokens": 855062002.0, "step": 22408 }, { "epoch": 2.850655132934741, "ewc_loss": 0.03485359996557236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.485360139166005e-05, "grad_norm": 19.645862579345703, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8789674043655396, "num_tokens": 855101257.0, "step": 22409 }, { "epoch": 2.8507823432133317, "ewc_loss": 0.034938424825668335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49384245055262e-05, "grad_norm": 19.731779098510742, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8779630064964294, "num_tokens": 855135834.0, "step": 22410 }, { "epoch": 2.850909553491922, "ewc_loss": 0.03492457792162895, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492457835818641e-05, "grad_norm": 19.70355796813965, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8776912093162537, "num_tokens": 855174156.0, "step": 22411 }, { "epoch": 2.8510367637705127, "ewc_loss": 0.03495388478040695, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.495388591545634e-05, "grad_norm": 19.67413330078125, "learning_rate": 1e-06, "loss": 0.3602, "mean_token_accuracy": 0.8853371143341064, "num_tokens": 855207698.0, "step": 22412 }, { "epoch": 2.8511639740491033, "ewc_loss": 0.035004258155822754, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.500425737001933e-05, "grad_norm": 19.747116088867188, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8757361173629761, "num_tokens": 855240636.0, "step": 22413 }, { "epoch": 2.851291184327694, "ewc_loss": 0.03500300645828247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.500300590530969e-05, "grad_norm": 19.6799259185791, "learning_rate": 1e-06, "loss": 0.3743, "mean_token_accuracy": 0.8786302804946899, "num_tokens": 855279923.0, "step": 22414 }, { "epoch": 2.8514183946062843, "ewc_loss": 0.034978292882442474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.497829311527312e-05, "grad_norm": 19.691160202026367, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8637373447418213, "num_tokens": 855315460.0, "step": 22415 }, { "epoch": 2.851545604884875, "ewc_loss": 0.0350591316819191, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.50591326423455e-05, "grad_norm": 19.761865615844727, "learning_rate": 1e-06, "loss": 0.355, "mean_token_accuracy": 0.8832333087921143, "num_tokens": 855351548.0, "step": 22416 }, { "epoch": 2.8516728151634654, "ewc_loss": 0.03501959890127182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.501959872664884e-05, "grad_norm": 19.722190856933594, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8580283522605896, "num_tokens": 855390715.0, "step": 22417 }, { "epoch": 2.8518000254420555, "ewc_loss": 0.034984469413757324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498447040328756e-05, "grad_norm": 19.62619972229004, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8622190952301025, "num_tokens": 855431997.0, "step": 22418 }, { "epoch": 2.8519272357206464, "ewc_loss": 0.03499659523367882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4996595786651596e-05, "grad_norm": 19.73339080810547, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8695535659790039, "num_tokens": 855470242.0, "step": 22419 }, { "epoch": 2.8520544459992365, "ewc_loss": 0.03503596782684326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.503596963128075e-05, "grad_norm": 19.551420211791992, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8628785610198975, "num_tokens": 855508901.0, "step": 22420 }, { "epoch": 2.8521816562778275, "ewc_loss": 0.034960661083459854, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4960659831995144e-05, "grad_norm": 19.703763961791992, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8726643323898315, "num_tokens": 855545290.0, "step": 22421 }, { "epoch": 2.8523088665564176, "ewc_loss": 0.03516169264912605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5161690902896225e-05, "grad_norm": 19.719234466552734, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8735771775245667, "num_tokens": 855579567.0, "step": 22422 }, { "epoch": 2.852436076835008, "ewc_loss": 0.03507310524582863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.507310611894354e-05, "grad_norm": 19.807865142822266, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8682904839515686, "num_tokens": 855621117.0, "step": 22423 }, { "epoch": 2.8525632871135986, "ewc_loss": 0.03505781665444374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5057815694017336e-05, "grad_norm": 19.66933822631836, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8770794868469238, "num_tokens": 855660494.0, "step": 22424 }, { "epoch": 2.852690497392189, "ewc_loss": 0.035014476627111435, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.501447645248845e-05, "grad_norm": 19.80413246154785, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8755269646644592, "num_tokens": 855694043.0, "step": 22425 }, { "epoch": 2.8528177076707797, "ewc_loss": 0.03514743223786354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514743366395123e-05, "grad_norm": 19.79527473449707, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8621416091918945, "num_tokens": 855734615.0, "step": 22426 }, { "epoch": 2.85294491794937, "ewc_loss": 0.0350632518529892, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.506325083435513e-05, "grad_norm": 19.82187271118164, "learning_rate": 1e-06, "loss": 0.3586, "mean_token_accuracy": 0.8861638903617859, "num_tokens": 855771141.0, "step": 22427 }, { "epoch": 2.8530721282279607, "ewc_loss": 0.03502286598086357, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5022865631617606e-05, "grad_norm": 19.742496490478516, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8750853538513184, "num_tokens": 855806877.0, "step": 22428 }, { "epoch": 2.8531993385065513, "ewc_loss": 0.03498680144548416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4986802347702906e-05, "grad_norm": 19.82004165649414, "learning_rate": 1e-06, "loss": 0.3446, "mean_token_accuracy": 0.8891290426254272, "num_tokens": 855840076.0, "step": 22429 }, { "epoch": 2.853326548785142, "ewc_loss": 0.03496922180056572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.496922363410704e-05, "grad_norm": 19.65558624267578, "learning_rate": 1e-06, "loss": 0.3701, "mean_token_accuracy": 0.8845919966697693, "num_tokens": 855877972.0, "step": 22430 }, { "epoch": 2.8534537590637323, "ewc_loss": 0.03508057817816734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.508057852741331e-05, "grad_norm": 19.77342414855957, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8694366216659546, "num_tokens": 855917424.0, "step": 22431 }, { "epoch": 2.853580969342323, "ewc_loss": 0.035057999193668365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.505799759295769e-05, "grad_norm": 19.770540237426758, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8650554418563843, "num_tokens": 855954853.0, "step": 22432 }, { "epoch": 2.8537081796209134, "ewc_loss": 0.03499568998813629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499568992992863e-05, "grad_norm": 19.666885375976562, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8767755627632141, "num_tokens": 855993300.0, "step": 22433 }, { "epoch": 2.853835389899504, "ewc_loss": 0.03504389151930809, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5043893149122596e-05, "grad_norm": 19.795333862304688, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8790425658226013, "num_tokens": 856035307.0, "step": 22434 }, { "epoch": 2.8539626001780944, "ewc_loss": 0.035086262971162796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.508626468828879e-05, "grad_norm": 19.731155395507812, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8731716871261597, "num_tokens": 856071801.0, "step": 22435 }, { "epoch": 2.854089810456685, "ewc_loss": 0.03501860424876213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.50186055584345e-05, "grad_norm": 19.778989791870117, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8627965450286865, "num_tokens": 856112035.0, "step": 22436 }, { "epoch": 2.8542170207352755, "ewc_loss": 0.03508203104138374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.508203008095734e-05, "grad_norm": 19.776914596557617, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8809404969215393, "num_tokens": 856148275.0, "step": 22437 }, { "epoch": 2.854344231013866, "ewc_loss": 0.03505667671561241, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5056677006650716e-05, "grad_norm": 19.809913635253906, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.8776915073394775, "num_tokens": 856180330.0, "step": 22438 }, { "epoch": 2.8544714412924566, "ewc_loss": 0.03500905632972717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5009055864065886e-05, "grad_norm": 19.71533966064453, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8626114726066589, "num_tokens": 856219354.0, "step": 22439 }, { "epoch": 2.854598651571047, "ewc_loss": 0.03495952486991882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.495952478260733e-05, "grad_norm": 19.73118019104004, "learning_rate": 1e-06, "loss": 0.3595, "mean_token_accuracy": 0.8864660859107971, "num_tokens": 856257583.0, "step": 22440 }, { "epoch": 2.8547258618496376, "ewc_loss": 0.034996312111616135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499631202430464e-05, "grad_norm": 19.74463653564453, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.85869300365448, "num_tokens": 856298056.0, "step": 22441 }, { "epoch": 2.854853072128228, "ewc_loss": 0.03497229889035225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.497229772619903e-05, "grad_norm": 19.7353515625, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8734641075134277, "num_tokens": 856334109.0, "step": 22442 }, { "epoch": 2.8549802824068182, "ewc_loss": 0.035005442798137665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5005443351110443e-05, "grad_norm": 19.781734466552734, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8767703771591187, "num_tokens": 856376655.0, "step": 22443 }, { "epoch": 2.855107492685409, "ewc_loss": 0.03499860689043999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499860758893192e-05, "grad_norm": 19.78019905090332, "learning_rate": 1e-06, "loss": 0.3351, "mean_token_accuracy": 0.8913953304290771, "num_tokens": 856409926.0, "step": 22444 }, { "epoch": 2.8552347029639993, "ewc_loss": 0.03491297364234924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4912973205791786e-05, "grad_norm": 19.82107925415039, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8669582605361938, "num_tokens": 856451471.0, "step": 22445 }, { "epoch": 2.8553619132425903, "ewc_loss": 0.034944914281368256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.494491465971805e-05, "grad_norm": 19.765827178955078, "learning_rate": 1e-06, "loss": 0.3632, "mean_token_accuracy": 0.8835154175758362, "num_tokens": 856483237.0, "step": 22446 }, { "epoch": 2.8554891235211803, "ewc_loss": 0.034967921674251556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49679212376941e-05, "grad_norm": 19.821533203125, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8666684627532959, "num_tokens": 856521278.0, "step": 22447 }, { "epoch": 2.855616333799771, "ewc_loss": 0.0349670946598053, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.496709541650489e-05, "grad_norm": 19.8072566986084, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8723506331443787, "num_tokens": 856565048.0, "step": 22448 }, { "epoch": 2.8557435440783614, "ewc_loss": 0.03484281152486801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4842811146518216e-05, "grad_norm": 19.70789337158203, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8778084516525269, "num_tokens": 856603756.0, "step": 22449 }, { "epoch": 2.855870754356952, "ewc_loss": 0.03486483171582222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486483183223754e-05, "grad_norm": 19.781370162963867, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8812982439994812, "num_tokens": 856640677.0, "step": 22450 }, { "epoch": 2.8559979646355425, "ewc_loss": 0.03493650257587433, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4936503652716056e-05, "grad_norm": 19.703718185424805, "learning_rate": 1e-06, "loss": 0.3811, "mean_token_accuracy": 0.8759527802467346, "num_tokens": 856679982.0, "step": 22451 }, { "epoch": 2.856125174914133, "ewc_loss": 0.034919511526823044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.491951065370813e-05, "grad_norm": 19.850553512573242, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8639812469482422, "num_tokens": 856720621.0, "step": 22452 }, { "epoch": 2.8562523851927235, "ewc_loss": 0.034924738109111786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492473842925392e-05, "grad_norm": 19.714597702026367, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.8829957246780396, "num_tokens": 856757172.0, "step": 22453 }, { "epoch": 2.856379595471314, "ewc_loss": 0.034847430884838104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484743137960322e-05, "grad_norm": 19.811269760131836, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8727748394012451, "num_tokens": 856796876.0, "step": 22454 }, { "epoch": 2.8565068057499046, "ewc_loss": 0.0349145382642746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4914537536678836e-05, "grad_norm": 19.73550796508789, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8756253719329834, "num_tokens": 856831148.0, "step": 22455 }, { "epoch": 2.856634016028495, "ewc_loss": 0.034881897270679474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.488189759082161e-05, "grad_norm": 19.7801570892334, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8716825246810913, "num_tokens": 856867091.0, "step": 22456 }, { "epoch": 2.8567612263070856, "ewc_loss": 0.03497140109539032, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4971402783412486e-05, "grad_norm": 19.770545959472656, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8656933307647705, "num_tokens": 856906118.0, "step": 22457 }, { "epoch": 2.856888436585676, "ewc_loss": 0.03489973396062851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489973460091278e-05, "grad_norm": 19.777605056762695, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8503212928771973, "num_tokens": 856942694.0, "step": 22458 }, { "epoch": 2.8570156468642667, "ewc_loss": 0.03498799353837967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4987991966772825e-05, "grad_norm": 19.824569702148438, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8666194677352905, "num_tokens": 856977530.0, "step": 22459 }, { "epoch": 2.857142857142857, "ewc_loss": 0.03499226272106171, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499226295389235e-05, "grad_norm": 19.779415130615234, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.8794159889221191, "num_tokens": 857012417.0, "step": 22460 }, { "epoch": 2.8572700674214477, "ewc_loss": 0.03487393632531166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.487393769319169e-05, "grad_norm": 19.743749618530273, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8690252304077148, "num_tokens": 857053556.0, "step": 22461 }, { "epoch": 2.8573972777000383, "ewc_loss": 0.03498068451881409, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498068326734938e-05, "grad_norm": 19.82888412475586, "learning_rate": 1e-06, "loss": 0.3744, "mean_token_accuracy": 0.8793186545372009, "num_tokens": 857094514.0, "step": 22462 }, { "epoch": 2.857524487978629, "ewc_loss": 0.03493080660700798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.493080657790415e-05, "grad_norm": 19.796302795410156, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8762258291244507, "num_tokens": 857133670.0, "step": 22463 }, { "epoch": 2.8576516982572193, "ewc_loss": 0.03492394834756851, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492394898785278e-05, "grad_norm": 19.77791976928711, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8689144849777222, "num_tokens": 857167934.0, "step": 22464 }, { "epoch": 2.85777890853581, "ewc_loss": 0.03498803451657295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498803562251851e-05, "grad_norm": 19.881855010986328, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8584123253822327, "num_tokens": 857208263.0, "step": 22465 }, { "epoch": 2.8579061188144, "ewc_loss": 0.03495606780052185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4956068702740595e-05, "grad_norm": 19.755380630493164, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8576648235321045, "num_tokens": 857255107.0, "step": 22466 }, { "epoch": 2.858033329092991, "ewc_loss": 0.03486726060509682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486726200208068e-05, "grad_norm": 19.805700302124023, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8721389770507812, "num_tokens": 857298368.0, "step": 22467 }, { "epoch": 2.858160539371581, "ewc_loss": 0.035017382353544235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.501738319755532e-05, "grad_norm": 19.80500602722168, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8750810623168945, "num_tokens": 857336899.0, "step": 22468 }, { "epoch": 2.858287749650172, "ewc_loss": 0.034843381494283676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.484338230919093e-05, "grad_norm": 19.739503860473633, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.8796504139900208, "num_tokens": 857378482.0, "step": 22469 }, { "epoch": 2.858414959928762, "ewc_loss": 0.03494570031762123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.494570046314038e-05, "grad_norm": 19.89504623413086, "learning_rate": 1e-06, "loss": 0.3626, "mean_token_accuracy": 0.8845283389091492, "num_tokens": 857413203.0, "step": 22470 }, { "epoch": 2.858542170207353, "ewc_loss": 0.0349145270884037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4914526622742414e-05, "grad_norm": 19.750070571899414, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8734506368637085, "num_tokens": 857446706.0, "step": 22471 }, { "epoch": 2.858669380485943, "ewc_loss": 0.034877266734838486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.487726644380018e-05, "grad_norm": 19.77120018005371, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8789383172988892, "num_tokens": 857483415.0, "step": 22472 }, { "epoch": 2.8587965907645336, "ewc_loss": 0.034954726696014404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.495472628856078e-05, "grad_norm": 19.826763153076172, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8810523748397827, "num_tokens": 857525323.0, "step": 22473 }, { "epoch": 2.858923801043124, "ewc_loss": 0.03491385653614998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.491385723464191e-05, "grad_norm": 19.834489822387695, "learning_rate": 1e-06, "loss": 0.3477, "mean_token_accuracy": 0.8918913006782532, "num_tokens": 857564400.0, "step": 22474 }, { "epoch": 2.8590510113217147, "ewc_loss": 0.03486728295683861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.486728382995352e-05, "grad_norm": 19.775352478027344, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8744063973426819, "num_tokens": 857602756.0, "step": 22475 }, { "epoch": 2.859178221600305, "ewc_loss": 0.034839533269405365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4839533327613026e-05, "grad_norm": 19.806642532348633, "learning_rate": 1e-06, "loss": 0.3902, "mean_token_accuracy": 0.8760737180709839, "num_tokens": 857636875.0, "step": 22476 }, { "epoch": 2.8593054318788957, "ewc_loss": 0.03496142849326134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.496142744552344e-05, "grad_norm": 19.7663631439209, "learning_rate": 1e-06, "loss": 0.3428, "mean_token_accuracy": 0.8921749591827393, "num_tokens": 857671950.0, "step": 22477 }, { "epoch": 2.8594326421574863, "ewc_loss": 0.03490269556641579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.490269591566175e-05, "grad_norm": 19.751506805419922, "learning_rate": 1e-06, "loss": 0.3537, "mean_token_accuracy": 0.8882097005844116, "num_tokens": 857705274.0, "step": 22478 }, { "epoch": 2.859559852436077, "ewc_loss": 0.03496476635336876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.496476711006835e-05, "grad_norm": 19.736108779907227, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8687168955802917, "num_tokens": 857743730.0, "step": 22479 }, { "epoch": 2.8596870627146673, "ewc_loss": 0.03498665243387222, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4986653190571815e-05, "grad_norm": 19.79401397705078, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8745512366294861, "num_tokens": 857784055.0, "step": 22480 }, { "epoch": 2.859814272993258, "ewc_loss": 0.03495100140571594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4951000998262316e-05, "grad_norm": 19.7179012298584, "learning_rate": 1e-06, "loss": 0.3647, "mean_token_accuracy": 0.8828476667404175, "num_tokens": 857822431.0, "step": 22481 }, { "epoch": 2.8599414832718484, "ewc_loss": 0.03499823063611984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499822923913598e-05, "grad_norm": 19.768306732177734, "learning_rate": 1e-06, "loss": 0.3546, "mean_token_accuracy": 0.8863909244537354, "num_tokens": 857859258.0, "step": 22482 }, { "epoch": 2.860068693550439, "ewc_loss": 0.03493212163448334, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.493211988825351e-05, "grad_norm": 19.69327163696289, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8664954900741577, "num_tokens": 857901209.0, "step": 22483 }, { "epoch": 2.8601959038290294, "ewc_loss": 0.035010695457458496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5010696592507884e-05, "grad_norm": 19.84507179260254, "learning_rate": 1e-06, "loss": 0.349, "mean_token_accuracy": 0.8885778188705444, "num_tokens": 857935563.0, "step": 22484 }, { "epoch": 2.86032311410762, "ewc_loss": 0.0350906178355217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509061934892088e-05, "grad_norm": 19.7471923828125, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8669930696487427, "num_tokens": 857966777.0, "step": 22485 }, { "epoch": 2.8604503243862105, "ewc_loss": 0.03502270579338074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5022705560550094e-05, "grad_norm": 19.845443725585938, "learning_rate": 1e-06, "loss": 0.3768, "mean_token_accuracy": 0.8802916407585144, "num_tokens": 858005078.0, "step": 22486 }, { "epoch": 2.860577534664801, "ewc_loss": 0.035094838589429855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509483940433711e-05, "grad_norm": 19.793569564819336, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8628147840499878, "num_tokens": 858049288.0, "step": 22487 }, { "epoch": 2.8607047449433916, "ewc_loss": 0.0349433608353138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4943361242767423e-05, "grad_norm": 19.748685836791992, "learning_rate": 1e-06, "loss": 0.3328, "mean_token_accuracy": 0.8949885964393616, "num_tokens": 858089834.0, "step": 22488 }, { "epoch": 2.860831955221982, "ewc_loss": 0.035005662590265274, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.500566162983887e-05, "grad_norm": 19.772504806518555, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8799852132797241, "num_tokens": 858124152.0, "step": 22489 }, { "epoch": 2.8609591655005726, "ewc_loss": 0.034979622811079025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4979624615516514e-05, "grad_norm": 19.801679611206055, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8799108266830444, "num_tokens": 858163202.0, "step": 22490 }, { "epoch": 2.8610863757791627, "ewc_loss": 0.03498736396431923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49873625964392e-05, "grad_norm": 19.723918914794922, "learning_rate": 1e-06, "loss": 0.3288, "mean_token_accuracy": 0.894542932510376, "num_tokens": 858199173.0, "step": 22491 }, { "epoch": 2.8612135860577537, "ewc_loss": 0.034943610429763794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4943608625326306e-05, "grad_norm": 19.733448028564453, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8737572431564331, "num_tokens": 858239693.0, "step": 22492 }, { "epoch": 2.8613407963363438, "ewc_loss": 0.03499678894877434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499678859952837e-05, "grad_norm": 19.75445556640625, "learning_rate": 1e-06, "loss": 0.3682, "mean_token_accuracy": 0.883061945438385, "num_tokens": 858280720.0, "step": 22493 }, { "epoch": 2.8614680066149347, "ewc_loss": 0.035007718950510025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.500772072584368e-05, "grad_norm": 19.738115310668945, "learning_rate": 1e-06, "loss": 0.3533, "mean_token_accuracy": 0.8878231048583984, "num_tokens": 858314145.0, "step": 22494 }, { "epoch": 2.861595216893525, "ewc_loss": 0.034934062510728836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49340625689365e-05, "grad_norm": 19.782682418823242, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8779900074005127, "num_tokens": 858355177.0, "step": 22495 }, { "epoch": 2.861722427172116, "ewc_loss": 0.035023078322410583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.502307663438842e-05, "grad_norm": 19.844425201416016, "learning_rate": 1e-06, "loss": 0.3795, "mean_token_accuracy": 0.8796908855438232, "num_tokens": 858393070.0, "step": 22496 }, { "epoch": 2.861849637450706, "ewc_loss": 0.034937530755996704, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4937529562739655e-05, "grad_norm": 19.677331924438477, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8761000633239746, "num_tokens": 858431827.0, "step": 22497 }, { "epoch": 2.8619768477292964, "ewc_loss": 0.03495743125677109, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4957432944793254e-05, "grad_norm": 19.677949905395508, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8729943037033081, "num_tokens": 858474990.0, "step": 22498 }, { "epoch": 2.862104058007887, "ewc_loss": 0.03507748246192932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5077482607448474e-05, "grad_norm": 19.815343856811523, "learning_rate": 1e-06, "loss": 0.3523, "mean_token_accuracy": 0.8890846967697144, "num_tokens": 858514698.0, "step": 22499 }, { "epoch": 2.8622312682864774, "ewc_loss": 0.035042475908994675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.504247433738783e-05, "grad_norm": 19.75611114501953, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8776651620864868, "num_tokens": 858557601.0, "step": 22500 }, { "epoch": 2.862358478565068, "ewc_loss": 0.03496002405881882, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4960023185703903e-05, "grad_norm": 19.729984283447266, "learning_rate": 1e-06, "loss": 0.3607, "mean_token_accuracy": 0.8847246170043945, "num_tokens": 858596830.0, "step": 22501 }, { "epoch": 2.8624856888436585, "ewc_loss": 0.03500281646847725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.500281673041172e-05, "grad_norm": 19.838306427001953, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.866857647895813, "num_tokens": 858637555.0, "step": 22502 }, { "epoch": 2.862612899122249, "ewc_loss": 0.034992773085832596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499277227092534e-05, "grad_norm": 19.76082420349121, "learning_rate": 1e-06, "loss": 0.3273, "mean_token_accuracy": 0.8945682048797607, "num_tokens": 858672813.0, "step": 22503 }, { "epoch": 2.8627401094008396, "ewc_loss": 0.03500779718160629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.500779712339863e-05, "grad_norm": 19.882841110229492, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.881384015083313, "num_tokens": 858710743.0, "step": 22504 }, { "epoch": 2.86286731967943, "ewc_loss": 0.03495779260993004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4957793104695156e-05, "grad_norm": 19.757104873657227, "learning_rate": 1e-06, "loss": 0.344, "mean_token_accuracy": 0.8884669542312622, "num_tokens": 858739486.0, "step": 22505 }, { "epoch": 2.8629945299580206, "ewc_loss": 0.03483803570270538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4838034480344504e-05, "grad_norm": 19.74298667907715, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8555349111557007, "num_tokens": 858780678.0, "step": 22506 }, { "epoch": 2.863121740236611, "ewc_loss": 0.034988660365343094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498866135487333e-05, "grad_norm": 19.767501831054688, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.871864378452301, "num_tokens": 858826112.0, "step": 22507 }, { "epoch": 2.8632489505152017, "ewc_loss": 0.03489774093031883, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489774098852649e-05, "grad_norm": 19.805652618408203, "learning_rate": 1e-06, "loss": 0.3544, "mean_token_accuracy": 0.8869779109954834, "num_tokens": 858862158.0, "step": 22508 }, { "epoch": 2.863376160793792, "ewc_loss": 0.03496520593762398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.496520730550401e-05, "grad_norm": 19.820804595947266, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8802562952041626, "num_tokens": 858905270.0, "step": 22509 }, { "epoch": 2.8635033710723827, "ewc_loss": 0.03495657444000244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.495657438179478e-05, "grad_norm": 19.804927825927734, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8687394857406616, "num_tokens": 858941802.0, "step": 22510 }, { "epoch": 2.8636305813509733, "ewc_loss": 0.034892838448286057, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4892836993094534e-05, "grad_norm": 19.806259155273438, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8797024488449097, "num_tokens": 858974672.0, "step": 22511 }, { "epoch": 2.863757791629564, "ewc_loss": 0.03489252179861069, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489252048893832e-05, "grad_norm": 19.746952056884766, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8743769526481628, "num_tokens": 859015305.0, "step": 22512 }, { "epoch": 2.8638850019081543, "ewc_loss": 0.03488300368189812, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4883003536378965e-05, "grad_norm": 19.824071884155273, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.867342472076416, "num_tokens": 859050646.0, "step": 22513 }, { "epoch": 2.864012212186745, "ewc_loss": 0.03492516651749611, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492516771075316e-05, "grad_norm": 19.78716278076172, "learning_rate": 1e-06, "loss": 0.3634, "mean_token_accuracy": 0.8852808475494385, "num_tokens": 859090139.0, "step": 22514 }, { "epoch": 2.8641394224653354, "ewc_loss": 0.0348627083003521, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.48627072526142e-05, "grad_norm": 19.75465202331543, "learning_rate": 1e-06, "loss": 0.3645, "mean_token_accuracy": 0.8805456161499023, "num_tokens": 859130661.0, "step": 22515 }, { "epoch": 2.8642666327439255, "ewc_loss": 0.03490835428237915, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.490835297270678e-05, "grad_norm": 19.84263801574707, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.875633180141449, "num_tokens": 859166399.0, "step": 22516 }, { "epoch": 2.8643938430225164, "ewc_loss": 0.034979306161403656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.497930447338149e-05, "grad_norm": 19.759746551513672, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.8759793639183044, "num_tokens": 859208349.0, "step": 22517 }, { "epoch": 2.8645210533011065, "ewc_loss": 0.03491156920790672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.491156894597225e-05, "grad_norm": 19.834749221801758, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8785387873649597, "num_tokens": 859243784.0, "step": 22518 }, { "epoch": 2.8646482635796975, "ewc_loss": 0.034987639635801315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498763908282854e-05, "grad_norm": 19.74751091003418, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8623316884040833, "num_tokens": 859278345.0, "step": 22519 }, { "epoch": 2.8647754738582876, "ewc_loss": 0.034992657601833344, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4992655855603516e-05, "grad_norm": 19.837032318115234, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8662198185920715, "num_tokens": 859317454.0, "step": 22520 }, { "epoch": 2.864902684136878, "ewc_loss": 0.03494849056005478, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.494849079288542e-05, "grad_norm": 19.73576545715332, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8711420297622681, "num_tokens": 859352076.0, "step": 22521 }, { "epoch": 2.8650298944154686, "ewc_loss": 0.03495723754167557, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.495723649393767e-05, "grad_norm": 19.805448532104492, "learning_rate": 1e-06, "loss": 0.3639, "mean_token_accuracy": 0.8842001557350159, "num_tokens": 859389957.0, "step": 22522 }, { "epoch": 2.865157104694059, "ewc_loss": 0.034982360899448395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4982360375579447e-05, "grad_norm": 19.796510696411133, "learning_rate": 1e-06, "loss": 0.3811, "mean_token_accuracy": 0.8766175508499146, "num_tokens": 859425167.0, "step": 22523 }, { "epoch": 2.8652843149726497, "ewc_loss": 0.034942466765642166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.494246629998088e-05, "grad_norm": 19.843463897705078, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.873333215713501, "num_tokens": 859470999.0, "step": 22524 }, { "epoch": 2.86541152525124, "ewc_loss": 0.03500039502978325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.50003938365262e-05, "grad_norm": 19.84689712524414, "learning_rate": 1e-06, "loss": 0.3616, "mean_token_accuracy": 0.8823180794715881, "num_tokens": 859503763.0, "step": 22525 }, { "epoch": 2.8655387355298307, "ewc_loss": 0.03491124510765076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.491124516585842e-05, "grad_norm": 19.79647445678711, "learning_rate": 1e-06, "loss": 0.362, "mean_token_accuracy": 0.8851331472396851, "num_tokens": 859537739.0, "step": 22526 }, { "epoch": 2.8656659458084213, "ewc_loss": 0.0349508598446846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.495085911708884e-05, "grad_norm": 19.774816513061523, "learning_rate": 1e-06, "loss": 0.3526, "mean_token_accuracy": 0.8849037885665894, "num_tokens": 859573497.0, "step": 22527 }, { "epoch": 2.865793156087012, "ewc_loss": 0.03495052456855774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.495052442303859e-05, "grad_norm": 19.844921112060547, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8720749616622925, "num_tokens": 859606997.0, "step": 22528 }, { "epoch": 2.8659203663656023, "ewc_loss": 0.034996189177036285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49961883330252e-05, "grad_norm": 19.878664016723633, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8646577596664429, "num_tokens": 859642278.0, "step": 22529 }, { "epoch": 2.866047576644193, "ewc_loss": 0.03490850329399109, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.490850212983787e-05, "grad_norm": 19.811025619506836, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.8874804377555847, "num_tokens": 859684728.0, "step": 22530 }, { "epoch": 2.8661747869227834, "ewc_loss": 0.034927453845739365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492745236144401e-05, "grad_norm": 19.782527923583984, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8696408271789551, "num_tokens": 859720946.0, "step": 22531 }, { "epoch": 2.866301997201374, "ewc_loss": 0.034910865128040314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.491086681606248e-05, "grad_norm": 19.8395938873291, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8743836879730225, "num_tokens": 859752605.0, "step": 22532 }, { "epoch": 2.8664292074799644, "ewc_loss": 0.03495658189058304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4956581657752395e-05, "grad_norm": 19.750032424926758, "learning_rate": 1e-06, "loss": 0.3528, "mean_token_accuracy": 0.888611912727356, "num_tokens": 859784206.0, "step": 22533 }, { "epoch": 2.866556417758555, "ewc_loss": 0.03491309657692909, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.491309689707123e-05, "grad_norm": 19.831077575683594, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.872096061706543, "num_tokens": 859828142.0, "step": 22534 }, { "epoch": 2.8666836280371455, "ewc_loss": 0.035021208226680756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.502120671328157e-05, "grad_norm": 19.774028778076172, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8712337017059326, "num_tokens": 859870032.0, "step": 22535 }, { "epoch": 2.866810838315736, "ewc_loss": 0.03499700874090195, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49970068782568e-05, "grad_norm": 19.8546085357666, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.86648029088974, "num_tokens": 859911417.0, "step": 22536 }, { "epoch": 2.8669380485943265, "ewc_loss": 0.034974705427885056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4974706068169326e-05, "grad_norm": 19.774051666259766, "learning_rate": 1e-06, "loss": 0.3421, "mean_token_accuracy": 0.8865848779678345, "num_tokens": 859953929.0, "step": 22537 }, { "epoch": 2.867065258872917, "ewc_loss": 0.034896090626716614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4896089346148074e-05, "grad_norm": 19.752933502197266, "learning_rate": 1e-06, "loss": 0.3714, "mean_token_accuracy": 0.8802824020385742, "num_tokens": 859985402.0, "step": 22538 }, { "epoch": 2.8671924691515076, "ewc_loss": 0.0349276177585125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492761607049033e-05, "grad_norm": 19.856687545776367, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.874087929725647, "num_tokens": 860017374.0, "step": 22539 }, { "epoch": 2.867319679430098, "ewc_loss": 0.0349988155066967, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499881495372392e-05, "grad_norm": 19.753929138183594, "learning_rate": 1e-06, "loss": 0.3566, "mean_token_accuracy": 0.8852763175964355, "num_tokens": 860058217.0, "step": 22540 }, { "epoch": 2.867446889708688, "ewc_loss": 0.034893494099378586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.489349546725862e-05, "grad_norm": 19.830944061279297, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8596246838569641, "num_tokens": 860092317.0, "step": 22541 }, { "epoch": 2.867574099987279, "ewc_loss": 0.034991513937711716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499151353025809e-05, "grad_norm": 19.811845779418945, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.872734010219574, "num_tokens": 860125067.0, "step": 22542 }, { "epoch": 2.8677013102658693, "ewc_loss": 0.03499989956617355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4999899071408436e-05, "grad_norm": 19.837806701660156, "learning_rate": 1e-06, "loss": 0.3518, "mean_token_accuracy": 0.8879362344741821, "num_tokens": 860163805.0, "step": 22543 }, { "epoch": 2.8678285205444602, "ewc_loss": 0.03496512025594711, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.496511999401264e-05, "grad_norm": 19.797687530517578, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8635520935058594, "num_tokens": 860200178.0, "step": 22544 }, { "epoch": 2.8679557308230503, "ewc_loss": 0.03496456891298294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.496457065921277e-05, "grad_norm": 19.765710830688477, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8757615089416504, "num_tokens": 860235283.0, "step": 22545 }, { "epoch": 2.868082941101641, "ewc_loss": 0.03500444442033768, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.500444290693849e-05, "grad_norm": 19.790481567382812, "learning_rate": 1e-06, "loss": 0.3361, "mean_token_accuracy": 0.8925157785415649, "num_tokens": 860276776.0, "step": 22546 }, { "epoch": 2.8682101513802314, "ewc_loss": 0.03497191518545151, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4971915738424286e-05, "grad_norm": 19.784107208251953, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8803879618644714, "num_tokens": 860318492.0, "step": 22547 }, { "epoch": 2.868337361658822, "ewc_loss": 0.03494085371494293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.494085467536934e-05, "grad_norm": 19.813440322875977, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8621987700462341, "num_tokens": 860358556.0, "step": 22548 }, { "epoch": 2.8684645719374124, "ewc_loss": 0.034985557198524475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498555815895088e-05, "grad_norm": 19.82541275024414, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8665721416473389, "num_tokens": 860391114.0, "step": 22549 }, { "epoch": 2.868591782216003, "ewc_loss": 0.03497686609625816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.497686702758074e-05, "grad_norm": 19.836767196655273, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8719620704650879, "num_tokens": 860430696.0, "step": 22550 }, { "epoch": 2.8687189924945935, "ewc_loss": 0.035024214535951614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.502421532175504e-05, "grad_norm": 19.758249282836914, "learning_rate": 1e-06, "loss": 0.3597, "mean_token_accuracy": 0.8830121755599976, "num_tokens": 860471055.0, "step": 22551 }, { "epoch": 2.868846202773184, "ewc_loss": 0.03496107459068298, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.496107456157915e-05, "grad_norm": 19.853132247924805, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8693984746932983, "num_tokens": 860512472.0, "step": 22552 }, { "epoch": 2.8689734130517746, "ewc_loss": 0.034942626953125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.494262637104839e-05, "grad_norm": 19.768592834472656, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8717483282089233, "num_tokens": 860541014.0, "step": 22553 }, { "epoch": 2.869100623330365, "ewc_loss": 0.03495129942893982, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49512993125245e-05, "grad_norm": 19.782575607299805, "learning_rate": 1e-06, "loss": 0.3451, "mean_token_accuracy": 0.8903988003730774, "num_tokens": 860585025.0, "step": 22554 }, { "epoch": 2.8692278336089556, "ewc_loss": 0.0349961556494236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499615559121594e-05, "grad_norm": 19.90848731994629, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.8847948908805847, "num_tokens": 860616361.0, "step": 22555 }, { "epoch": 2.869355043887546, "ewc_loss": 0.035013921558856964, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.501391984173097e-05, "grad_norm": 19.817155838012695, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8684428930282593, "num_tokens": 860655976.0, "step": 22556 }, { "epoch": 2.8694822541661367, "ewc_loss": 0.034908510744571686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4908509405795485e-05, "grad_norm": 19.835493087768555, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8640214204788208, "num_tokens": 860703949.0, "step": 22557 }, { "epoch": 2.869609464444727, "ewc_loss": 0.034993089735507965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499308877508156e-05, "grad_norm": 19.800111770629883, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.8759036064147949, "num_tokens": 860739935.0, "step": 22558 }, { "epoch": 2.8697366747233177, "ewc_loss": 0.034976065158843994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.497606667224318e-05, "grad_norm": 19.734586715698242, "learning_rate": 1e-06, "loss": 0.3741, "mean_token_accuracy": 0.8790587186813354, "num_tokens": 860782483.0, "step": 22559 }, { "epoch": 2.8698638850019083, "ewc_loss": 0.03501511365175247, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.501511309877969e-05, "grad_norm": 19.862703323364258, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8651923537254333, "num_tokens": 860816821.0, "step": 22560 }, { "epoch": 2.869991095280499, "ewc_loss": 0.035031601786613464, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.503160041873343e-05, "grad_norm": 19.834096908569336, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8642947673797607, "num_tokens": 860851313.0, "step": 22561 }, { "epoch": 2.8701183055590893, "ewc_loss": 0.034988995641469955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498899604892358e-05, "grad_norm": 19.840940475463867, "learning_rate": 1e-06, "loss": 0.3631, "mean_token_accuracy": 0.8847262263298035, "num_tokens": 860890540.0, "step": 22562 }, { "epoch": 2.87024551583768, "ewc_loss": 0.03499631956219673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499631930026226e-05, "grad_norm": 19.882551193237305, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8697757720947266, "num_tokens": 860929686.0, "step": 22563 }, { "epoch": 2.87037272611627, "ewc_loss": 0.034987058490514755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49870570062194e-05, "grad_norm": 19.778047561645508, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8789476156234741, "num_tokens": 860968186.0, "step": 22564 }, { "epoch": 2.870499936394861, "ewc_loss": 0.03494592756032944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.494592601782642e-05, "grad_norm": 19.808429718017578, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8716529607772827, "num_tokens": 861007252.0, "step": 22565 }, { "epoch": 2.870627146673451, "ewc_loss": 0.03498312085866928, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498312071315013e-05, "grad_norm": 19.804929733276367, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8769307732582092, "num_tokens": 861049898.0, "step": 22566 }, { "epoch": 2.870754356952042, "ewc_loss": 0.034962791949510574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49627916875761e-05, "grad_norm": 19.766536712646484, "learning_rate": 1e-06, "loss": 0.3684, "mean_token_accuracy": 0.8839160203933716, "num_tokens": 861087711.0, "step": 22567 }, { "epoch": 2.870881567230632, "ewc_loss": 0.03493727371096611, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.493727490422316e-05, "grad_norm": 19.73530387878418, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8564563989639282, "num_tokens": 861128566.0, "step": 22568 }, { "epoch": 2.871008777509223, "ewc_loss": 0.03505556285381317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5055563785135746e-05, "grad_norm": 19.824934005737305, "learning_rate": 1e-06, "loss": 0.3668, "mean_token_accuracy": 0.8799991011619568, "num_tokens": 861156299.0, "step": 22569 }, { "epoch": 2.871135987787813, "ewc_loss": 0.03499538451433182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4995384339708835e-05, "grad_norm": 19.82098388671875, "learning_rate": 1e-06, "loss": 0.3499, "mean_token_accuracy": 0.8878270983695984, "num_tokens": 861189376.0, "step": 22570 }, { "epoch": 2.8712631980664036, "ewc_loss": 0.034985583275556564, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498558362480253e-05, "grad_norm": 19.75858497619629, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8724883198738098, "num_tokens": 861227156.0, "step": 22571 }, { "epoch": 2.871390408344994, "ewc_loss": 0.03497485816478729, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4974858863279223e-05, "grad_norm": 19.837482452392578, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8750830292701721, "num_tokens": 861268088.0, "step": 22572 }, { "epoch": 2.8715176186235847, "ewc_loss": 0.03504462540149689, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.504462438286282e-05, "grad_norm": 19.772289276123047, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8792558908462524, "num_tokens": 861308892.0, "step": 22573 }, { "epoch": 2.871644828902175, "ewc_loss": 0.03497954085469246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.497954094200395e-05, "grad_norm": 19.778343200683594, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8778444528579712, "num_tokens": 861350108.0, "step": 22574 }, { "epoch": 2.8717720391807657, "ewc_loss": 0.03501402959227562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.501402898109518e-05, "grad_norm": 19.827991485595703, "learning_rate": 1e-06, "loss": 0.3614, "mean_token_accuracy": 0.8852709531784058, "num_tokens": 861390553.0, "step": 22575 }, { "epoch": 2.8718992494593563, "ewc_loss": 0.03498866781592369, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4988668630830944e-05, "grad_norm": 19.718978881835938, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.8783929347991943, "num_tokens": 861425878.0, "step": 22576 }, { "epoch": 2.872026459737947, "ewc_loss": 0.03500326722860336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.50032678397838e-05, "grad_norm": 19.81018829345703, "learning_rate": 1e-06, "loss": 0.343, "mean_token_accuracy": 0.8912662863731384, "num_tokens": 861460935.0, "step": 22577 }, { "epoch": 2.8721536700165373, "ewc_loss": 0.035022955387830734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.502295658108778e-05, "grad_norm": 19.804227828979492, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.860414981842041, "num_tokens": 861500446.0, "step": 22578 }, { "epoch": 2.872280880295128, "ewc_loss": 0.03502264991402626, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.502265099086799e-05, "grad_norm": 19.826061248779297, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8645777702331543, "num_tokens": 861538000.0, "step": 22579 }, { "epoch": 2.8724080905737184, "ewc_loss": 0.03500749543309212, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.500749517115764e-05, "grad_norm": 19.734384536743164, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.8787158727645874, "num_tokens": 861578052.0, "step": 22580 }, { "epoch": 2.872535300852309, "ewc_loss": 0.03497694060206413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.497693978715688e-05, "grad_norm": 19.748779296875, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8713403344154358, "num_tokens": 861619065.0, "step": 22581 }, { "epoch": 2.8726625111308994, "ewc_loss": 0.03498440980911255, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498440855764784e-05, "grad_norm": 19.76007843017578, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8790242671966553, "num_tokens": 861660112.0, "step": 22582 }, { "epoch": 2.87278972140949, "ewc_loss": 0.03501737862825394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.501737955957651e-05, "grad_norm": 19.694868087768555, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8778353929519653, "num_tokens": 861699312.0, "step": 22583 }, { "epoch": 2.8729169316880805, "ewc_loss": 0.035075534135103226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.507553265080787e-05, "grad_norm": 19.898273468017578, "learning_rate": 1e-06, "loss": 0.3341, "mean_token_accuracy": 0.8932304978370667, "num_tokens": 861735167.0, "step": 22584 }, { "epoch": 2.873044141966671, "ewc_loss": 0.03506980091333389, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5069799196207896e-05, "grad_norm": 19.728782653808594, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8702296018600464, "num_tokens": 861778594.0, "step": 22585 }, { "epoch": 2.8731713522452615, "ewc_loss": 0.03497430682182312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4974305890500546e-05, "grad_norm": 19.825389862060547, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.8755388259887695, "num_tokens": 861814754.0, "step": 22586 }, { "epoch": 2.873298562523852, "ewc_loss": 0.03505344316363335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.505344284349121e-05, "grad_norm": 19.81827735900879, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8617250919342041, "num_tokens": 861849019.0, "step": 22587 }, { "epoch": 2.8734257728024426, "ewc_loss": 0.03495882824063301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.495882992865518e-05, "grad_norm": 19.839481353759766, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8728883266448975, "num_tokens": 861889471.0, "step": 22588 }, { "epoch": 2.8735529830810327, "ewc_loss": 0.035010334104299545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5010332794627175e-05, "grad_norm": 19.766338348388672, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8668733835220337, "num_tokens": 861926811.0, "step": 22589 }, { "epoch": 2.8736801933596237, "ewc_loss": 0.03504852578043938, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5048524296144024e-05, "grad_norm": 19.838987350463867, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8737140893936157, "num_tokens": 861963628.0, "step": 22590 }, { "epoch": 2.8738074036382137, "ewc_loss": 0.03496658429503441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.496658246149309e-05, "grad_norm": 19.776260375976562, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8646467328071594, "num_tokens": 862002915.0, "step": 22591 }, { "epoch": 2.8739346139168047, "ewc_loss": 0.035002753138542175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5002754884772e-05, "grad_norm": 19.719158172607422, "learning_rate": 1e-06, "loss": 0.3689, "mean_token_accuracy": 0.8826442956924438, "num_tokens": 862047276.0, "step": 22592 }, { "epoch": 2.874061824195395, "ewc_loss": 0.035085540264844894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.508554073050618e-05, "grad_norm": 19.853511810302734, "learning_rate": 1e-06, "loss": 0.3648, "mean_token_accuracy": 0.8835523128509521, "num_tokens": 862091241.0, "step": 22593 }, { "epoch": 2.8741890344739858, "ewc_loss": 0.03503274917602539, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5032750020036474e-05, "grad_norm": 19.774282455444336, "learning_rate": 1e-06, "loss": 0.3741, "mean_token_accuracy": 0.8794276118278503, "num_tokens": 862126025.0, "step": 22594 }, { "epoch": 2.874316244752576, "ewc_loss": 0.034973498433828354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.497349825920537e-05, "grad_norm": 19.787277221679688, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8674604892730713, "num_tokens": 862160822.0, "step": 22595 }, { "epoch": 2.8744434550311664, "ewc_loss": 0.03501173481345177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5011733416467905e-05, "grad_norm": 19.892545700073242, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8801066875457764, "num_tokens": 862196808.0, "step": 22596 }, { "epoch": 2.874570665309757, "ewc_loss": 0.035088829696178436, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.508882946334779e-05, "grad_norm": 19.88300132751465, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8626223802566528, "num_tokens": 862233722.0, "step": 22597 }, { "epoch": 2.8746978755883474, "ewc_loss": 0.03498954325914383, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4989541745744646e-05, "grad_norm": 19.74932289123535, "learning_rate": 1e-06, "loss": 0.3398, "mean_token_accuracy": 0.8911315202713013, "num_tokens": 862271914.0, "step": 22598 }, { "epoch": 2.874825085866938, "ewc_loss": 0.035013820976018906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5013821616303176e-05, "grad_norm": 19.889751434326172, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8724203705787659, "num_tokens": 862313696.0, "step": 22599 }, { "epoch": 2.8749522961455285, "ewc_loss": 0.03504788875579834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.504788764985278e-05, "grad_norm": 19.869731903076172, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8693389892578125, "num_tokens": 862355366.0, "step": 22600 }, { "epoch": 2.875079506424119, "ewc_loss": 0.03488036245107651, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4880362363765016e-05, "grad_norm": 19.704458236694336, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8678717017173767, "num_tokens": 862395630.0, "step": 22601 }, { "epoch": 2.8752067167027096, "ewc_loss": 0.03501211479306221, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.501211540424265e-05, "grad_norm": 19.817834854125977, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8767412304878235, "num_tokens": 862429650.0, "step": 22602 }, { "epoch": 2.8753339269813, "ewc_loss": 0.03499402850866318, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499402737361379e-05, "grad_norm": 19.79778289794922, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8727138042449951, "num_tokens": 862470921.0, "step": 22603 }, { "epoch": 2.8754611372598906, "ewc_loss": 0.034928154200315475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492815449135378e-05, "grad_norm": 19.719995498657227, "learning_rate": 1e-06, "loss": 0.362, "mean_token_accuracy": 0.8825819492340088, "num_tokens": 862505204.0, "step": 22604 }, { "epoch": 2.875588347538481, "ewc_loss": 0.03507724404335022, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5077242500847206e-05, "grad_norm": 19.81380271911621, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8751434087753296, "num_tokens": 862548686.0, "step": 22605 }, { "epoch": 2.8757155578170717, "ewc_loss": 0.03500649333000183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.500649472698569e-05, "grad_norm": 19.75493621826172, "learning_rate": 1e-06, "loss": 0.3618, "mean_token_accuracy": 0.8876391649246216, "num_tokens": 862585920.0, "step": 22606 }, { "epoch": 2.875842768095662, "ewc_loss": 0.03502558171749115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.50255832017865e-05, "grad_norm": 19.795289993286133, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8701369762420654, "num_tokens": 862619364.0, "step": 22607 }, { "epoch": 2.8759699783742527, "ewc_loss": 0.035048261284828186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.504826236166991e-05, "grad_norm": 19.875524520874023, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8724324107170105, "num_tokens": 862661317.0, "step": 22608 }, { "epoch": 2.8760971886528433, "ewc_loss": 0.034963712096214294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4963712096214294e-05, "grad_norm": 19.745040893554688, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8797465562820435, "num_tokens": 862698236.0, "step": 22609 }, { "epoch": 2.876224398931434, "ewc_loss": 0.03506981581449509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.506981738610193e-05, "grad_norm": 19.88221549987793, "learning_rate": 1e-06, "loss": 0.3523, "mean_token_accuracy": 0.8863147497177124, "num_tokens": 862739224.0, "step": 22610 }, { "epoch": 2.8763516092100243, "ewc_loss": 0.035067081451416016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5067081626039e-05, "grad_norm": 19.805097579956055, "learning_rate": 1e-06, "loss": 0.3537, "mean_token_accuracy": 0.8903417587280273, "num_tokens": 862776534.0, "step": 22611 }, { "epoch": 2.876478819488615, "ewc_loss": 0.03497380018234253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.497380021144636e-05, "grad_norm": 19.906517028808594, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8689097762107849, "num_tokens": 862815690.0, "step": 22612 }, { "epoch": 2.8766060297672054, "ewc_loss": 0.03500153124332428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.500153252389282e-05, "grad_norm": 19.733386993408203, "learning_rate": 1e-06, "loss": 0.3865, "mean_token_accuracy": 0.875092625617981, "num_tokens": 862851021.0, "step": 22613 }, { "epoch": 2.8767332400457954, "ewc_loss": 0.034929171204566956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492916948744096e-05, "grad_norm": 19.800390243530273, "learning_rate": 1e-06, "loss": 0.3641, "mean_token_accuracy": 0.883783757686615, "num_tokens": 862885122.0, "step": 22614 }, { "epoch": 2.8768604503243864, "ewc_loss": 0.0350554995238781, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.505549830151722e-05, "grad_norm": 19.811811447143555, "learning_rate": 1e-06, "loss": 0.3647, "mean_token_accuracy": 0.8814035654067993, "num_tokens": 862922233.0, "step": 22615 }, { "epoch": 2.8769876606029765, "ewc_loss": 0.03493129834532738, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.493129770504311e-05, "grad_norm": 19.70570945739746, "learning_rate": 1e-06, "loss": 0.3579, "mean_token_accuracy": 0.8860505223274231, "num_tokens": 862961355.0, "step": 22616 }, { "epoch": 2.8771148708815675, "ewc_loss": 0.03500771522521973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5007717087864876e-05, "grad_norm": 19.801124572753906, "learning_rate": 1e-06, "loss": 0.3695, "mean_token_accuracy": 0.8836970329284668, "num_tokens": 862998523.0, "step": 22617 }, { "epoch": 2.8772420811601576, "ewc_loss": 0.03501881659030914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.501881656120531e-05, "grad_norm": 19.7808780670166, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8772737979888916, "num_tokens": 863039684.0, "step": 22618 }, { "epoch": 2.877369291438748, "ewc_loss": 0.03498530760407448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498530713841319e-05, "grad_norm": 19.806201934814453, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8803160786628723, "num_tokens": 863077649.0, "step": 22619 }, { "epoch": 2.8774965017173386, "ewc_loss": 0.03502412512898445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.502412437228486e-05, "grad_norm": 19.82634925842285, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8715440034866333, "num_tokens": 863116816.0, "step": 22620 }, { "epoch": 2.877623711995929, "ewc_loss": 0.034955546259880066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4955544833792374e-05, "grad_norm": 19.768476486206055, "learning_rate": 1e-06, "loss": 0.3571, "mean_token_accuracy": 0.8869825601577759, "num_tokens": 863155306.0, "step": 22621 }, { "epoch": 2.8777509222745197, "ewc_loss": 0.03499029949307442, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499029844533652e-05, "grad_norm": 19.80963706970215, "learning_rate": 1e-06, "loss": 0.3656, "mean_token_accuracy": 0.8833762407302856, "num_tokens": 863194381.0, "step": 22622 }, { "epoch": 2.87787813255311, "ewc_loss": 0.03496589511632919, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.496589488349855e-05, "grad_norm": 19.75411033630371, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8754522800445557, "num_tokens": 863231470.0, "step": 22623 }, { "epoch": 2.8780053428317007, "ewc_loss": 0.03493848815560341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.493848998914473e-05, "grad_norm": 19.687862396240234, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.870151162147522, "num_tokens": 863265617.0, "step": 22624 }, { "epoch": 2.8781325531102913, "ewc_loss": 0.035022441297769547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5022439988097176e-05, "grad_norm": 19.905261993408203, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8660622835159302, "num_tokens": 863300111.0, "step": 22625 }, { "epoch": 2.878259763388882, "ewc_loss": 0.03502357006072998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.502357139950618e-05, "grad_norm": 19.740922927856445, "learning_rate": 1e-06, "loss": 0.3587, "mean_token_accuracy": 0.883671224117279, "num_tokens": 863338626.0, "step": 22626 }, { "epoch": 2.8783869736674723, "ewc_loss": 0.03500046953558922, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.500047023408115e-05, "grad_norm": 19.931398391723633, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8689645528793335, "num_tokens": 863374672.0, "step": 22627 }, { "epoch": 2.878514183946063, "ewc_loss": 0.03499715402722359, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499715239740908e-05, "grad_norm": 19.707250595092773, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8721383810043335, "num_tokens": 863412717.0, "step": 22628 }, { "epoch": 2.8786413942246534, "ewc_loss": 0.03498828783631325, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49882866430562e-05, "grad_norm": 19.841903686523438, "learning_rate": 1e-06, "loss": 0.3639, "mean_token_accuracy": 0.8845953941345215, "num_tokens": 863447776.0, "step": 22629 }, { "epoch": 2.878768604503244, "ewc_loss": 0.03510124981403351, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5101249522995204e-05, "grad_norm": 19.77558135986328, "learning_rate": 1e-06, "loss": 0.3401, "mean_token_accuracy": 0.8927261829376221, "num_tokens": 863485697.0, "step": 22630 }, { "epoch": 2.8788958147818344, "ewc_loss": 0.03500014916062355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5000150091946125e-05, "grad_norm": 19.825328826904297, "learning_rate": 1e-06, "loss": 0.3942, "mean_token_accuracy": 0.870652973651886, "num_tokens": 863528706.0, "step": 22631 }, { "epoch": 2.879023025060425, "ewc_loss": 0.03506471589207649, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.506471693981439e-05, "grad_norm": 19.794002532958984, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8578252196311951, "num_tokens": 863569181.0, "step": 22632 }, { "epoch": 2.8791502353390155, "ewc_loss": 0.03496257960796356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.496258068480529e-05, "grad_norm": 19.762752532958984, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.861577033996582, "num_tokens": 863602985.0, "step": 22633 }, { "epoch": 2.879277445617606, "ewc_loss": 0.035016924142837524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5016924812225625e-05, "grad_norm": 19.69961166381836, "learning_rate": 1e-06, "loss": 0.3741, "mean_token_accuracy": 0.8805394172668457, "num_tokens": 863634590.0, "step": 22634 }, { "epoch": 2.8794046558961965, "ewc_loss": 0.035057127475738525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5057128116022795e-05, "grad_norm": 19.80150604248047, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8743704557418823, "num_tokens": 863677671.0, "step": 22635 }, { "epoch": 2.879531866174787, "ewc_loss": 0.03511855751276016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5118559026159346e-05, "grad_norm": 19.743274688720703, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.878021240234375, "num_tokens": 863716102.0, "step": 22636 }, { "epoch": 2.8796590764533776, "ewc_loss": 0.03501617908477783, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.501617902657017e-05, "grad_norm": 19.771831512451172, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.868518590927124, "num_tokens": 863751560.0, "step": 22637 }, { "epoch": 2.879786286731968, "ewc_loss": 0.03512522205710411, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512522380333394e-05, "grad_norm": 19.857275009155273, "learning_rate": 1e-06, "loss": 0.3347, "mean_token_accuracy": 0.8909091353416443, "num_tokens": 863792373.0, "step": 22638 }, { "epoch": 2.879913497010558, "ewc_loss": 0.03507997468113899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.507997462293133e-05, "grad_norm": 19.801549911499023, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8714137673377991, "num_tokens": 863829969.0, "step": 22639 }, { "epoch": 2.880040707289149, "ewc_loss": 0.035099562257528305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509956150082871e-05, "grad_norm": 19.789432525634766, "learning_rate": 1e-06, "loss": 0.3583, "mean_token_accuracy": 0.8814299702644348, "num_tokens": 863867080.0, "step": 22640 }, { "epoch": 2.8801679175677393, "ewc_loss": 0.03508075326681137, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.508075315039605e-05, "grad_norm": 19.728179931640625, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8805125951766968, "num_tokens": 863904230.0, "step": 22641 }, { "epoch": 2.8802951278463302, "ewc_loss": 0.03505830466747284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5058303183177486e-05, "grad_norm": 19.789161682128906, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8812568187713623, "num_tokens": 863947309.0, "step": 22642 }, { "epoch": 2.8804223381249203, "ewc_loss": 0.03512665256857872, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5126653529005125e-05, "grad_norm": 19.796600341796875, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8820686340332031, "num_tokens": 863985557.0, "step": 22643 }, { "epoch": 2.880549548403511, "ewc_loss": 0.035047497600317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5047498386120424e-05, "grad_norm": 19.75535011291504, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8647697567939758, "num_tokens": 864021842.0, "step": 22644 }, { "epoch": 2.8806767586821014, "ewc_loss": 0.035107310861349106, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.510731039568782e-05, "grad_norm": 19.7548770904541, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8680331707000732, "num_tokens": 864060765.0, "step": 22645 }, { "epoch": 2.880803968960692, "ewc_loss": 0.03504978120326996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.504978303681128e-05, "grad_norm": 19.66636085510254, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8789710998535156, "num_tokens": 864100624.0, "step": 22646 }, { "epoch": 2.8809311792392824, "ewc_loss": 0.035152651369571686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.51526505255606e-05, "grad_norm": 19.840805053710938, "learning_rate": 1e-06, "loss": 0.3543, "mean_token_accuracy": 0.8867618441581726, "num_tokens": 864136599.0, "step": 22647 }, { "epoch": 2.881058389517873, "ewc_loss": 0.03512503579258919, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512503462843597e-05, "grad_norm": 19.75710105895996, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8793573379516602, "num_tokens": 864169876.0, "step": 22648 }, { "epoch": 2.8811855997964635, "ewc_loss": 0.034999050199985504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499905142234638e-05, "grad_norm": 19.698612213134766, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8748577833175659, "num_tokens": 864213709.0, "step": 22649 }, { "epoch": 2.881312810075054, "ewc_loss": 0.035089246928691864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.508924783091061e-05, "grad_norm": 19.850900650024414, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8815118670463562, "num_tokens": 864245581.0, "step": 22650 }, { "epoch": 2.8814400203536445, "ewc_loss": 0.03509417548775673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509417729219422e-05, "grad_norm": 19.7120304107666, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8621616363525391, "num_tokens": 864287081.0, "step": 22651 }, { "epoch": 2.881567230632235, "ewc_loss": 0.03501252830028534, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.501252649584785e-05, "grad_norm": 19.749038696289062, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8601304888725281, "num_tokens": 864321480.0, "step": 22652 }, { "epoch": 2.8816944409108256, "ewc_loss": 0.03514951094985008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514951094985008e-05, "grad_norm": 19.77638053894043, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8822188973426819, "num_tokens": 864358580.0, "step": 22653 }, { "epoch": 2.881821651189416, "ewc_loss": 0.0351119190454483, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5111919714836404e-05, "grad_norm": 19.714887619018555, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8681997060775757, "num_tokens": 864396277.0, "step": 22654 }, { "epoch": 2.8819488614680067, "ewc_loss": 0.03511921316385269, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.511921386234462e-05, "grad_norm": 19.74482536315918, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8719726204872131, "num_tokens": 864433282.0, "step": 22655 }, { "epoch": 2.882076071746597, "ewc_loss": 0.03516019508242607, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.516019569360651e-05, "grad_norm": 19.7673397064209, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.868876576423645, "num_tokens": 864469305.0, "step": 22656 }, { "epoch": 2.8822032820251877, "ewc_loss": 0.03511585295200348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.511585236992687e-05, "grad_norm": 19.739755630493164, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.8779234290122986, "num_tokens": 864507246.0, "step": 22657 }, { "epoch": 2.8823304923037782, "ewc_loss": 0.03517266735434532, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.517266668495722e-05, "grad_norm": 19.73581886291504, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8707693219184875, "num_tokens": 864547315.0, "step": 22658 }, { "epoch": 2.8824577025823688, "ewc_loss": 0.03516192361712456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.516192373353988e-05, "grad_norm": 19.77156639099121, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8703811764717102, "num_tokens": 864590813.0, "step": 22659 }, { "epoch": 2.8825849128609593, "ewc_loss": 0.0351838544011116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.518385346978903e-05, "grad_norm": 19.852195739746094, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8676204681396484, "num_tokens": 864625078.0, "step": 22660 }, { "epoch": 2.88271212313955, "ewc_loss": 0.035132553428411484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513255433063023e-05, "grad_norm": 19.719356536865234, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.8834643959999084, "num_tokens": 864663714.0, "step": 22661 }, { "epoch": 2.88283933341814, "ewc_loss": 0.03507503867149353, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.507503788569011e-05, "grad_norm": 19.811660766601562, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8748916387557983, "num_tokens": 864697026.0, "step": 22662 }, { "epoch": 2.882966543696731, "ewc_loss": 0.03519963100552559, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5199631383875385e-05, "grad_norm": 19.83505630493164, "learning_rate": 1e-06, "loss": 0.365, "mean_token_accuracy": 0.8864656090736389, "num_tokens": 864732337.0, "step": 22663 }, { "epoch": 2.883093753975321, "ewc_loss": 0.035120639950037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5120639950037e-05, "grad_norm": 19.783357620239258, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8696891069412231, "num_tokens": 864776598.0, "step": 22664 }, { "epoch": 2.883220964253912, "ewc_loss": 0.03509999066591263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509999078232795e-05, "grad_norm": 19.80396842956543, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8627583384513855, "num_tokens": 864817279.0, "step": 22665 }, { "epoch": 2.883348174532502, "ewc_loss": 0.03505096584558487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.505096537992358e-05, "grad_norm": 19.751476287841797, "learning_rate": 1e-06, "loss": 0.3685, "mean_token_accuracy": 0.8831920623779297, "num_tokens": 864854914.0, "step": 22666 }, { "epoch": 2.883475384811093, "ewc_loss": 0.03506755828857422, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.506755820126273e-05, "grad_norm": 19.908327102661133, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8785824775695801, "num_tokens": 864892914.0, "step": 22667 }, { "epoch": 2.883602595089683, "ewc_loss": 0.03515268489718437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5152683267369866e-05, "grad_norm": 19.772891998291016, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.8735893368721008, "num_tokens": 864934189.0, "step": 22668 }, { "epoch": 2.8837298053682736, "ewc_loss": 0.034974049776792526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.497405123198405e-05, "grad_norm": 19.826412200927734, "learning_rate": 1e-06, "loss": 0.3655, "mean_token_accuracy": 0.8851892948150635, "num_tokens": 864972085.0, "step": 22669 }, { "epoch": 2.883857015646864, "ewc_loss": 0.035076264292001724, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.50762638845481e-05, "grad_norm": 19.68527603149414, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8601187467575073, "num_tokens": 865008674.0, "step": 22670 }, { "epoch": 2.8839842259254547, "ewc_loss": 0.034964073449373245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4964072256116197e-05, "grad_norm": 19.838178634643555, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8669837117195129, "num_tokens": 865046004.0, "step": 22671 }, { "epoch": 2.884111436204045, "ewc_loss": 0.035193223506212234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5193224903196096e-05, "grad_norm": 19.757801055908203, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8807985782623291, "num_tokens": 865082409.0, "step": 22672 }, { "epoch": 2.8842386464826357, "ewc_loss": 0.03498285636305809, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498285514069721e-05, "grad_norm": 19.741579055786133, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8759094476699829, "num_tokens": 865124457.0, "step": 22673 }, { "epoch": 2.8843658567612263, "ewc_loss": 0.035097718238830566, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509771704557352e-05, "grad_norm": 19.75990867614746, "learning_rate": 1e-06, "loss": 0.3584, "mean_token_accuracy": 0.8874150514602661, "num_tokens": 865162290.0, "step": 22674 }, { "epoch": 2.884493067039817, "ewc_loss": 0.035044897347688675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.504489723127335e-05, "grad_norm": 19.897518157958984, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8737086653709412, "num_tokens": 865201686.0, "step": 22675 }, { "epoch": 2.8846202773184073, "ewc_loss": 0.03510279208421707, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.510279202600941e-05, "grad_norm": 19.715181350708008, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8663325309753418, "num_tokens": 865236876.0, "step": 22676 }, { "epoch": 2.884747487596998, "ewc_loss": 0.03503469005227089, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5034689062740654e-05, "grad_norm": 19.86907386779785, "learning_rate": 1e-06, "loss": 0.3762, "mean_token_accuracy": 0.8802660703659058, "num_tokens": 865276994.0, "step": 22677 }, { "epoch": 2.8848746978755884, "ewc_loss": 0.03513098508119583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513098636176437e-05, "grad_norm": 19.75539207458496, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.8752193450927734, "num_tokens": 865311990.0, "step": 22678 }, { "epoch": 2.885001908154179, "ewc_loss": 0.034984979778528214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498497972032055e-05, "grad_norm": 19.882537841796875, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8820663690567017, "num_tokens": 865352324.0, "step": 22679 }, { "epoch": 2.8851291184327694, "ewc_loss": 0.035094331949949265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509433372528292e-05, "grad_norm": 19.79814338684082, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8755947947502136, "num_tokens": 865391428.0, "step": 22680 }, { "epoch": 2.88525632871136, "ewc_loss": 0.03495253250002861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49525325873401e-05, "grad_norm": 19.89665985107422, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8744451999664307, "num_tokens": 865428478.0, "step": 22681 }, { "epoch": 2.8853835389899505, "ewc_loss": 0.03511320799589157, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5113207559334114e-05, "grad_norm": 19.783620834350586, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8758727312088013, "num_tokens": 865466531.0, "step": 22682 }, { "epoch": 2.885510749268541, "ewc_loss": 0.035006389021873474, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.500638922560029e-05, "grad_norm": 19.842792510986328, "learning_rate": 1e-06, "loss": 0.3708, "mean_token_accuracy": 0.8828451037406921, "num_tokens": 865503177.0, "step": 22683 }, { "epoch": 2.8856379595471315, "ewc_loss": 0.035092633217573166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509263478918001e-05, "grad_norm": 19.726579666137695, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8690823316574097, "num_tokens": 865539454.0, "step": 22684 }, { "epoch": 2.885765169825722, "ewc_loss": 0.03504108265042305, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5041080991504714e-05, "grad_norm": 19.813316345214844, "learning_rate": 1e-06, "loss": 0.3358, "mean_token_accuracy": 0.8931688070297241, "num_tokens": 865571937.0, "step": 22685 }, { "epoch": 2.8858923801043126, "ewc_loss": 0.035128477960824966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5128479794366285e-05, "grad_norm": 19.796207427978516, "learning_rate": 1e-06, "loss": 0.3777, "mean_token_accuracy": 0.8787153959274292, "num_tokens": 865609457.0, "step": 22686 }, { "epoch": 2.8860195903829027, "ewc_loss": 0.035051267594099045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.505126733216457e-05, "grad_norm": 19.838987350463867, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8627945780754089, "num_tokens": 865647391.0, "step": 22687 }, { "epoch": 2.8861468006614936, "ewc_loss": 0.03509802371263504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509802263579331e-05, "grad_norm": 19.840469360351562, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8745368719100952, "num_tokens": 865680578.0, "step": 22688 }, { "epoch": 2.8862740109400837, "ewc_loss": 0.03503168374300003, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5031684092245996e-05, "grad_norm": 19.812780380249023, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8723514080047607, "num_tokens": 865714610.0, "step": 22689 }, { "epoch": 2.8864012212186747, "ewc_loss": 0.035080134868621826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5080134693998843e-05, "grad_norm": 19.802824020385742, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8715823888778687, "num_tokens": 865755672.0, "step": 22690 }, { "epoch": 2.886528431497265, "ewc_loss": 0.0350940003991127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509399903123267e-05, "grad_norm": 19.81371307373047, "learning_rate": 1e-06, "loss": 0.3692, "mean_token_accuracy": 0.884002685546875, "num_tokens": 865790111.0, "step": 22691 }, { "epoch": 2.8866556417758558, "ewc_loss": 0.03509331867098808, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5093318729195744e-05, "grad_norm": 19.7965145111084, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8770122528076172, "num_tokens": 865825580.0, "step": 22692 }, { "epoch": 2.886782852054446, "ewc_loss": 0.03510915860533714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.510915848892182e-05, "grad_norm": 19.805570602416992, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8792304992675781, "num_tokens": 865869099.0, "step": 22693 }, { "epoch": 2.8869100623330364, "ewc_loss": 0.03510838374495506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.510838359943591e-05, "grad_norm": 19.782590866088867, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8666079640388489, "num_tokens": 865906957.0, "step": 22694 }, { "epoch": 2.887037272611627, "ewc_loss": 0.03510155901312828, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5101558751193807e-05, "grad_norm": 19.78021240234375, "learning_rate": 1e-06, "loss": 0.3572, "mean_token_accuracy": 0.884432315826416, "num_tokens": 865945712.0, "step": 22695 }, { "epoch": 2.8871644828902174, "ewc_loss": 0.03509318828582764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509318776195869e-05, "grad_norm": 19.75921630859375, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8615505695343018, "num_tokens": 865984469.0, "step": 22696 }, { "epoch": 2.887291693168808, "ewc_loss": 0.03513456508517265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513456613291055e-05, "grad_norm": 19.787273406982422, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.876615047454834, "num_tokens": 866024624.0, "step": 22697 }, { "epoch": 2.8874189034473985, "ewc_loss": 0.035160377621650696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5160377592546865e-05, "grad_norm": 19.821533203125, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8726810216903687, "num_tokens": 866062446.0, "step": 22698 }, { "epoch": 2.887546113725989, "ewc_loss": 0.0351836234331131, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.518362427712418e-05, "grad_norm": 19.82613754272461, "learning_rate": 1e-06, "loss": 0.3572, "mean_token_accuracy": 0.8854109644889832, "num_tokens": 866100896.0, "step": 22699 }, { "epoch": 2.8876733240045795, "ewc_loss": 0.03506100922822952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.506100983940996e-05, "grad_norm": 19.773193359375, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8614131212234497, "num_tokens": 866134668.0, "step": 22700 }, { "epoch": 2.88780053428317, "ewc_loss": 0.03509724512696266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5097244108328596e-05, "grad_norm": 19.7999210357666, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8694040775299072, "num_tokens": 866173834.0, "step": 22701 }, { "epoch": 2.8879277445617606, "ewc_loss": 0.03506607562303543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5066073905909434e-05, "grad_norm": 19.74978256225586, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8600045442581177, "num_tokens": 866211691.0, "step": 22702 }, { "epoch": 2.888054954840351, "ewc_loss": 0.03507884219288826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5078843211522326e-05, "grad_norm": 19.838987350463867, "learning_rate": 1e-06, "loss": 0.3861, "mean_token_accuracy": 0.876319169998169, "num_tokens": 866240878.0, "step": 22703 }, { "epoch": 2.8881821651189417, "ewc_loss": 0.035151246935129166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5151246265741065e-05, "grad_norm": 19.725566864013672, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8630455732345581, "num_tokens": 866289552.0, "step": 22704 }, { "epoch": 2.888309375397532, "ewc_loss": 0.035018425434827805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.501842365949415e-05, "grad_norm": 19.738798141479492, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8664827942848206, "num_tokens": 866330361.0, "step": 22705 }, { "epoch": 2.8884365856761227, "ewc_loss": 0.035144973546266556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514497439027764e-05, "grad_norm": 19.793344497680664, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8786674737930298, "num_tokens": 866371557.0, "step": 22706 }, { "epoch": 2.8885637959547132, "ewc_loss": 0.035149917006492615, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514991840347648e-05, "grad_norm": 19.81007194519043, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8759080767631531, "num_tokens": 866410627.0, "step": 22707 }, { "epoch": 2.8886910062333038, "ewc_loss": 0.03515564277768135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.515564458211884e-05, "grad_norm": 19.757064819335938, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8743959665298462, "num_tokens": 866446036.0, "step": 22708 }, { "epoch": 2.8888182165118943, "ewc_loss": 0.035146281123161316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5146280424669385e-05, "grad_norm": 19.831483840942383, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8656301498413086, "num_tokens": 866489655.0, "step": 22709 }, { "epoch": 2.888945426790485, "ewc_loss": 0.03514599800109863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514599666232243e-05, "grad_norm": 19.7833194732666, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8685222864151001, "num_tokens": 866532750.0, "step": 22710 }, { "epoch": 2.8890726370690754, "ewc_loss": 0.03509765863418579, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5097658837912604e-05, "grad_norm": 19.866647720336914, "learning_rate": 1e-06, "loss": 0.3718, "mean_token_accuracy": 0.8810634016990662, "num_tokens": 866570748.0, "step": 22711 }, { "epoch": 2.8891998473476654, "ewc_loss": 0.035107702016830444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5107703297398984e-05, "grad_norm": 19.728551864624023, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8698787689208984, "num_tokens": 866609422.0, "step": 22712 }, { "epoch": 2.8893270576262564, "ewc_loss": 0.03501281887292862, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.501281753415242e-05, "grad_norm": 19.876440048217773, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8771191835403442, "num_tokens": 866650865.0, "step": 22713 }, { "epoch": 2.8894542679048465, "ewc_loss": 0.03515006974339485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5150071198586375e-05, "grad_norm": 19.788747787475586, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8728954792022705, "num_tokens": 866687605.0, "step": 22714 }, { "epoch": 2.8895814781834375, "ewc_loss": 0.03494606912136078, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49460678989999e-05, "grad_norm": 19.84775161743164, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8670046329498291, "num_tokens": 866722974.0, "step": 22715 }, { "epoch": 2.8897086884620276, "ewc_loss": 0.03519348427653313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.51934831996914e-05, "grad_norm": 19.878507614135742, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8723767995834351, "num_tokens": 866760694.0, "step": 22716 }, { "epoch": 2.889835898740618, "ewc_loss": 0.035044461488723755, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.50444606738165e-05, "grad_norm": 19.78370475769043, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8576478958129883, "num_tokens": 866795102.0, "step": 22717 }, { "epoch": 2.8899631090192086, "ewc_loss": 0.03504161909222603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5041619412368163e-05, "grad_norm": 19.753097534179688, "learning_rate": 1e-06, "loss": 0.3498, "mean_token_accuracy": 0.8876339197158813, "num_tokens": 866831282.0, "step": 22718 }, { "epoch": 2.890090319297799, "ewc_loss": 0.035009000450372696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.500900129438378e-05, "grad_norm": 19.69748878479004, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8677271604537964, "num_tokens": 866872661.0, "step": 22719 }, { "epoch": 2.8902175295763897, "ewc_loss": 0.03508201241493225, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.50820118910633e-05, "grad_norm": 19.78714370727539, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8656190037727356, "num_tokens": 866917576.0, "step": 22720 }, { "epoch": 2.89034473985498, "ewc_loss": 0.0351925864815712, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5192588256904855e-05, "grad_norm": 19.863224029541016, "learning_rate": 1e-06, "loss": 0.3844, "mean_token_accuracy": 0.8784584999084473, "num_tokens": 866957245.0, "step": 22721 }, { "epoch": 2.8904719501335707, "ewc_loss": 0.03512480854988098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512480907374993e-05, "grad_norm": 19.825851440429688, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8471320271492004, "num_tokens": 866998695.0, "step": 22722 }, { "epoch": 2.8905991604121613, "ewc_loss": 0.03507780656218529, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5077806387562305e-05, "grad_norm": 19.75284194946289, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8619875907897949, "num_tokens": 867033502.0, "step": 22723 }, { "epoch": 2.890726370690752, "ewc_loss": 0.03505620360374451, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5056204069405794e-05, "grad_norm": 19.779611587524414, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8580561876296997, "num_tokens": 867076032.0, "step": 22724 }, { "epoch": 2.8908535809693423, "ewc_loss": 0.03514358401298523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514358468237333e-05, "grad_norm": 19.814334869384766, "learning_rate": 1e-06, "loss": 0.3633, "mean_token_accuracy": 0.8747873902320862, "num_tokens": 867107429.0, "step": 22725 }, { "epoch": 2.890980791247933, "ewc_loss": 0.03510335460305214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.510335591272451e-05, "grad_norm": 19.837190628051758, "learning_rate": 1e-06, "loss": 0.3614, "mean_token_accuracy": 0.8774689435958862, "num_tokens": 867139432.0, "step": 22726 }, { "epoch": 2.8911080015265234, "ewc_loss": 0.03512508422136307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512508556013927e-05, "grad_norm": 19.800559997558594, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8715068697929382, "num_tokens": 867180588.0, "step": 22727 }, { "epoch": 2.891235211805114, "ewc_loss": 0.035160329192876816, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.516033029882237e-05, "grad_norm": 19.85966682434082, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8727066516876221, "num_tokens": 867215170.0, "step": 22728 }, { "epoch": 2.8913624220837044, "ewc_loss": 0.035121094435453415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512109469738789e-05, "grad_norm": 19.775848388671875, "learning_rate": 1e-06, "loss": 0.3488, "mean_token_accuracy": 0.8897393941879272, "num_tokens": 867254958.0, "step": 22729 }, { "epoch": 2.891489632362295, "ewc_loss": 0.03509945422410965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509945599944331e-05, "grad_norm": 19.792898178100586, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8735814094543457, "num_tokens": 867291135.0, "step": 22730 }, { "epoch": 2.8916168426408855, "ewc_loss": 0.035132549703121185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513255069265142e-05, "grad_norm": 19.80335807800293, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8828630447387695, "num_tokens": 867326926.0, "step": 22731 }, { "epoch": 2.891744052919476, "ewc_loss": 0.03513622656464577, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5136225051246583e-05, "grad_norm": 19.768024444580078, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8670320510864258, "num_tokens": 867361142.0, "step": 22732 }, { "epoch": 2.8918712631980665, "ewc_loss": 0.03516298159956932, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.516298238537274e-05, "grad_norm": 19.834749221801758, "learning_rate": 1e-06, "loss": 0.3414, "mean_token_accuracy": 0.8923739194869995, "num_tokens": 867398344.0, "step": 22733 }, { "epoch": 2.891998473476657, "ewc_loss": 0.03512140363454819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512140392558649e-05, "grad_norm": 19.758237838745117, "learning_rate": 1e-06, "loss": 0.366, "mean_token_accuracy": 0.8830188512802124, "num_tokens": 867437642.0, "step": 22734 }, { "epoch": 2.8921256837552476, "ewc_loss": 0.03513173758983612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513173942337744e-05, "grad_norm": 19.819971084594727, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8651249408721924, "num_tokens": 867471646.0, "step": 22735 }, { "epoch": 2.892252894033838, "ewc_loss": 0.03513779118657112, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513778938213363e-05, "grad_norm": 19.851327896118164, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8754867315292358, "num_tokens": 867509682.0, "step": 22736 }, { "epoch": 2.892380104312428, "ewc_loss": 0.03509315103292465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509315138217062e-05, "grad_norm": 19.83892822265625, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8715622425079346, "num_tokens": 867544425.0, "step": 22737 }, { "epoch": 2.892507314591019, "ewc_loss": 0.03509591892361641, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5095919884042814e-05, "grad_norm": 19.776025772094727, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.8683522939682007, "num_tokens": 867584978.0, "step": 22738 }, { "epoch": 2.8926345248696093, "ewc_loss": 0.03503016009926796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5030159779125825e-05, "grad_norm": 19.72393035888672, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.8765404224395752, "num_tokens": 867620360.0, "step": 22739 }, { "epoch": 2.8927617351482002, "ewc_loss": 0.03514617681503296, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5146178561262786e-05, "grad_norm": 19.808025360107422, "learning_rate": 1e-06, "loss": 0.3589, "mean_token_accuracy": 0.8833233118057251, "num_tokens": 867655246.0, "step": 22740 }, { "epoch": 2.8928889454267903, "ewc_loss": 0.035114072263240814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.51140733982902e-05, "grad_norm": 19.764253616333008, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8666524887084961, "num_tokens": 867692792.0, "step": 22741 }, { "epoch": 2.893016155705381, "ewc_loss": 0.035101186484098434, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.510118767735548e-05, "grad_norm": 19.817201614379883, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8775321245193481, "num_tokens": 867732365.0, "step": 22742 }, { "epoch": 2.8931433659839714, "ewc_loss": 0.035169169306755066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5169170587323606e-05, "grad_norm": 19.809545516967773, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8626197576522827, "num_tokens": 867764800.0, "step": 22743 }, { "epoch": 2.893270576262562, "ewc_loss": 0.0351560041308403, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.515600474202074e-05, "grad_norm": 19.782390594482422, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8684388399124146, "num_tokens": 867807107.0, "step": 22744 }, { "epoch": 2.8933977865411524, "ewc_loss": 0.03521236032247543, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52123606717214e-05, "grad_norm": 19.9635009765625, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8667718172073364, "num_tokens": 867850225.0, "step": 22745 }, { "epoch": 2.893524996819743, "ewc_loss": 0.03518569469451904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5185694287065417e-05, "grad_norm": 19.740385055541992, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8715373277664185, "num_tokens": 867890457.0, "step": 22746 }, { "epoch": 2.8936522070983335, "ewc_loss": 0.035041842609643936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5041841329075396e-05, "grad_norm": 19.88949966430664, "learning_rate": 1e-06, "loss": 0.3658, "mean_token_accuracy": 0.8823392987251282, "num_tokens": 867925880.0, "step": 22747 }, { "epoch": 2.893779417376924, "ewc_loss": 0.03518027067184448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.518027006066404e-05, "grad_norm": 19.736936569213867, "learning_rate": 1e-06, "loss": 0.347, "mean_token_accuracy": 0.8873088955879211, "num_tokens": 867963923.0, "step": 22748 }, { "epoch": 2.8939066276555145, "ewc_loss": 0.03514029085636139, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514029231155291e-05, "grad_norm": 19.915180206298828, "learning_rate": 1e-06, "loss": 0.3764, "mean_token_accuracy": 0.8792927265167236, "num_tokens": 868000489.0, "step": 22749 }, { "epoch": 2.894033837934105, "ewc_loss": 0.035198017954826355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519801975926384e-05, "grad_norm": 19.830703735351562, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8783940076828003, "num_tokens": 868034001.0, "step": 22750 }, { "epoch": 2.8941610482126956, "ewc_loss": 0.035073522478342056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.507352084852755e-05, "grad_norm": 19.852033615112305, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8800054788589478, "num_tokens": 868074956.0, "step": 22751 }, { "epoch": 2.894288258491286, "ewc_loss": 0.03522888943552971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5228888009442016e-05, "grad_norm": 19.878963470458984, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8829081058502197, "num_tokens": 868110727.0, "step": 22752 }, { "epoch": 2.8944154687698767, "ewc_loss": 0.035079196095466614, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5079196095466614e-05, "grad_norm": 19.847415924072266, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8578534126281738, "num_tokens": 868151480.0, "step": 22753 }, { "epoch": 2.894542679048467, "ewc_loss": 0.03510339558124542, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.510339593049139e-05, "grad_norm": 19.751850128173828, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8657656908035278, "num_tokens": 868190620.0, "step": 22754 }, { "epoch": 2.8946698893270577, "ewc_loss": 0.03509744256734848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5097444197162986e-05, "grad_norm": 19.830245971679688, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8728775978088379, "num_tokens": 868230598.0, "step": 22755 }, { "epoch": 2.8947970996056482, "ewc_loss": 0.035206228494644165, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520622703945264e-05, "grad_norm": 19.822572708129883, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8688799738883972, "num_tokens": 868265933.0, "step": 22756 }, { "epoch": 2.8949243098842388, "ewc_loss": 0.03520771116018295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5207711334805936e-05, "grad_norm": 19.867774963378906, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8781259655952454, "num_tokens": 868300282.0, "step": 22757 }, { "epoch": 2.8950515201628293, "ewc_loss": 0.03513585031032562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513585033942945e-05, "grad_norm": 19.747272491455078, "learning_rate": 1e-06, "loss": 0.3465, "mean_token_accuracy": 0.8902637958526611, "num_tokens": 868331031.0, "step": 22758 }, { "epoch": 2.89517873044142, "ewc_loss": 0.0351322665810585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513226693030447e-05, "grad_norm": 19.819211959838867, "learning_rate": 1e-06, "loss": 0.3728, "mean_token_accuracy": 0.877918004989624, "num_tokens": 868368268.0, "step": 22759 }, { "epoch": 2.89530594072001, "ewc_loss": 0.03516201302409172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.516201468301006e-05, "grad_norm": 19.706506729125977, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8607655167579651, "num_tokens": 868407551.0, "step": 22760 }, { "epoch": 2.895433150998601, "ewc_loss": 0.035119883716106415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.511988325044513e-05, "grad_norm": 19.856653213500977, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8722929954528809, "num_tokens": 868449301.0, "step": 22761 }, { "epoch": 2.895560361277191, "ewc_loss": 0.035202983766794205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5202985600335523e-05, "grad_norm": 19.72393798828125, "learning_rate": 1e-06, "loss": 0.3822, "mean_token_accuracy": 0.8718560934066772, "num_tokens": 868482853.0, "step": 22762 }, { "epoch": 2.895687571555782, "ewc_loss": 0.03509862348437309, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5098622902296484e-05, "grad_norm": 19.821651458740234, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.871063768863678, "num_tokens": 868523960.0, "step": 22763 }, { "epoch": 2.895814781834372, "ewc_loss": 0.03521263226866722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521263352013193e-05, "grad_norm": 19.807180404663086, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8676124811172485, "num_tokens": 868566447.0, "step": 22764 }, { "epoch": 2.895941992112963, "ewc_loss": 0.035106003284454346, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.510600436129607e-05, "grad_norm": 19.7131404876709, "learning_rate": 1e-06, "loss": 0.3612, "mean_token_accuracy": 0.8838775157928467, "num_tokens": 868608342.0, "step": 22765 }, { "epoch": 2.896069202391553, "ewc_loss": 0.035193778574466705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5193777875974774e-05, "grad_norm": 19.79407501220703, "learning_rate": 1e-06, "loss": 0.3632, "mean_token_accuracy": 0.8874843120574951, "num_tokens": 868647354.0, "step": 22766 }, { "epoch": 2.8961964126701436, "ewc_loss": 0.03522736579179764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522736733430065e-05, "grad_norm": 19.854019165039062, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8569812178611755, "num_tokens": 868690716.0, "step": 22767 }, { "epoch": 2.896323622948734, "ewc_loss": 0.03522369638085365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5223696613684297e-05, "grad_norm": 19.784149169921875, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8819699287414551, "num_tokens": 868728318.0, "step": 22768 }, { "epoch": 2.8964508332273247, "ewc_loss": 0.035158585757017136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.515858406899497e-05, "grad_norm": 19.836713790893555, "learning_rate": 1e-06, "loss": 0.3415, "mean_token_accuracy": 0.8906457424163818, "num_tokens": 868764323.0, "step": 22769 }, { "epoch": 2.896578043505915, "ewc_loss": 0.035187140107154846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.518713856465183e-05, "grad_norm": 19.837520599365234, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8587753772735596, "num_tokens": 868801371.0, "step": 22770 }, { "epoch": 2.8967052537845057, "ewc_loss": 0.03510911017656326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.510911119519733e-05, "grad_norm": 19.807838439941406, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8800920248031616, "num_tokens": 868846679.0, "step": 22771 }, { "epoch": 2.8968324640630962, "ewc_loss": 0.035095565021038055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509556336211972e-05, "grad_norm": 19.85788345336914, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8618475198745728, "num_tokens": 868880076.0, "step": 22772 }, { "epoch": 2.8969596743416868, "ewc_loss": 0.035081956535577774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5081957321381196e-05, "grad_norm": 19.73560905456543, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8762326240539551, "num_tokens": 868917086.0, "step": 22773 }, { "epoch": 2.8970868846202773, "ewc_loss": 0.03508732467889786, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.508732334012166e-05, "grad_norm": 19.824811935424805, "learning_rate": 1e-06, "loss": 0.3679, "mean_token_accuracy": 0.8822144865989685, "num_tokens": 868954952.0, "step": 22774 }, { "epoch": 2.897214094898868, "ewc_loss": 0.03514951467514038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514951458782889e-05, "grad_norm": 19.764440536499023, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8715239763259888, "num_tokens": 868990911.0, "step": 22775 }, { "epoch": 2.8973413051774584, "ewc_loss": 0.035067591816186905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.506759094307199e-05, "grad_norm": 19.760501861572266, "learning_rate": 1e-06, "loss": 0.3618, "mean_token_accuracy": 0.8857914805412292, "num_tokens": 869033398.0, "step": 22776 }, { "epoch": 2.897468515456049, "ewc_loss": 0.035066794604063034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.506679422571324e-05, "grad_norm": 19.788015365600586, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8625414371490479, "num_tokens": 869070513.0, "step": 22777 }, { "epoch": 2.8975957257346394, "ewc_loss": 0.03509688004851341, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5096880310447887e-05, "grad_norm": 19.777055740356445, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8822788000106812, "num_tokens": 869113494.0, "step": 22778 }, { "epoch": 2.89772293601323, "ewc_loss": 0.0351400189101696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514001946314238e-05, "grad_norm": 19.827638626098633, "learning_rate": 1e-06, "loss": 0.3654, "mean_token_accuracy": 0.8840084075927734, "num_tokens": 869147433.0, "step": 22779 }, { "epoch": 2.8978501462918205, "ewc_loss": 0.035163071006536484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.516306969686411e-05, "grad_norm": 19.855966567993164, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8704394102096558, "num_tokens": 869186763.0, "step": 22780 }, { "epoch": 2.897977356570411, "ewc_loss": 0.035082459449768066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5082459362456575e-05, "grad_norm": 19.88675880432129, "learning_rate": 1e-06, "loss": 0.3542, "mean_token_accuracy": 0.8875159025192261, "num_tokens": 869222624.0, "step": 22781 }, { "epoch": 2.8981045668490015, "ewc_loss": 0.035134825855493546, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513482442940585e-05, "grad_norm": 19.833730697631836, "learning_rate": 1e-06, "loss": 0.3315, "mean_token_accuracy": 0.8956516981124878, "num_tokens": 869263027.0, "step": 22782 }, { "epoch": 2.898231777127592, "ewc_loss": 0.035071250051259995, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5071250749751925e-05, "grad_norm": 19.828516006469727, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8706993460655212, "num_tokens": 869304232.0, "step": 22783 }, { "epoch": 2.8983589874061826, "ewc_loss": 0.035136736929416656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5136738006258383e-05, "grad_norm": 19.917530059814453, "learning_rate": 1e-06, "loss": 0.3713, "mean_token_accuracy": 0.8808132410049438, "num_tokens": 869340141.0, "step": 22784 }, { "epoch": 2.8984861976847727, "ewc_loss": 0.03501635044813156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5016350011574104e-05, "grad_norm": 19.850265502929688, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8668231964111328, "num_tokens": 869382428.0, "step": 22785 }, { "epoch": 2.8986134079633636, "ewc_loss": 0.03501875698566437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.50187583535444e-05, "grad_norm": 19.8121337890625, "learning_rate": 1e-06, "loss": 0.3661, "mean_token_accuracy": 0.8833039402961731, "num_tokens": 869418890.0, "step": 22786 }, { "epoch": 2.8987406182419537, "ewc_loss": 0.03513678163290024, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513678166200407e-05, "grad_norm": 19.91044807434082, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8762485384941101, "num_tokens": 869453704.0, "step": 22787 }, { "epoch": 2.8988678285205447, "ewc_loss": 0.035058435052633286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.505843415041454e-05, "grad_norm": 19.90877342224121, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8729654550552368, "num_tokens": 869491845.0, "step": 22788 }, { "epoch": 2.898995038799135, "ewc_loss": 0.03505262732505798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.505262793623842e-05, "grad_norm": 19.775558471679688, "learning_rate": 1e-06, "loss": 0.3616, "mean_token_accuracy": 0.8858914375305176, "num_tokens": 869523393.0, "step": 22789 }, { "epoch": 2.8991222490777258, "ewc_loss": 0.034979186952114105, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.497918805805966e-05, "grad_norm": 19.830629348754883, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8737524747848511, "num_tokens": 869561927.0, "step": 22790 }, { "epoch": 2.899249459356316, "ewc_loss": 0.03505261614918709, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5052617022302e-05, "grad_norm": 19.787351608276367, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8745455741882324, "num_tokens": 869599568.0, "step": 22791 }, { "epoch": 2.8993766696349064, "ewc_loss": 0.03497496619820595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4974968002643436e-05, "grad_norm": 19.84840965270996, "learning_rate": 1e-06, "loss": 0.3778, "mean_token_accuracy": 0.8788098692893982, "num_tokens": 869636329.0, "step": 22792 }, { "epoch": 2.899503879913497, "ewc_loss": 0.03499631956219673, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499631930026226e-05, "grad_norm": 19.75823974609375, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8709627389907837, "num_tokens": 869679720.0, "step": 22793 }, { "epoch": 2.8996310901920874, "ewc_loss": 0.035106413066387177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.510641181492247e-05, "grad_norm": 19.956377029418945, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8780399560928345, "num_tokens": 869719527.0, "step": 22794 }, { "epoch": 2.899758300470678, "ewc_loss": 0.0350506529211998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.505065251374617e-05, "grad_norm": 19.766559600830078, "learning_rate": 1e-06, "loss": 0.3749, "mean_token_accuracy": 0.8802639245986938, "num_tokens": 869755139.0, "step": 22795 }, { "epoch": 2.8998855107492685, "ewc_loss": 0.03498886525630951, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4988865081686527e-05, "grad_norm": 19.817825317382812, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8698902130126953, "num_tokens": 869791860.0, "step": 22796 }, { "epoch": 2.900012721027859, "ewc_loss": 0.035139769315719604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513976844260469e-05, "grad_norm": 19.886655807495117, "learning_rate": 1e-06, "loss": 0.3291, "mean_token_accuracy": 0.8970823287963867, "num_tokens": 869827294.0, "step": 22797 }, { "epoch": 2.9001399313064495, "ewc_loss": 0.03506213799118996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.506213761284016e-05, "grad_norm": 19.78685188293457, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8705673813819885, "num_tokens": 869859398.0, "step": 22798 }, { "epoch": 2.90026714158504, "ewc_loss": 0.03508888557553291, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.50888840330299e-05, "grad_norm": 19.819013595581055, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.87464439868927, "num_tokens": 869898215.0, "step": 22799 }, { "epoch": 2.9003943518636306, "ewc_loss": 0.035128504037857056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5128505260217935e-05, "grad_norm": 19.733055114746094, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8654085397720337, "num_tokens": 869939787.0, "step": 22800 }, { "epoch": 2.900521562142221, "ewc_loss": 0.03505963087081909, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5059631045442075e-05, "grad_norm": 19.858858108520508, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8666108250617981, "num_tokens": 869976790.0, "step": 22801 }, { "epoch": 2.9006487724208116, "ewc_loss": 0.03508477658033371, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.508477675495669e-05, "grad_norm": 19.755489349365234, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8801709413528442, "num_tokens": 870013547.0, "step": 22802 }, { "epoch": 2.900775982699402, "ewc_loss": 0.03503494709730148, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.503494735923596e-05, "grad_norm": 19.782974243164062, "learning_rate": 1e-06, "loss": 0.3502, "mean_token_accuracy": 0.8867732286453247, "num_tokens": 870049540.0, "step": 22803 }, { "epoch": 2.9009031929779927, "ewc_loss": 0.03514263406395912, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514263516990468e-05, "grad_norm": 19.824460983276367, "learning_rate": 1e-06, "loss": 0.3707, "mean_token_accuracy": 0.8820521235466003, "num_tokens": 870088247.0, "step": 22804 }, { "epoch": 2.9010304032565832, "ewc_loss": 0.035113923251628876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.511392424115911e-05, "grad_norm": 19.880659103393555, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8737368583679199, "num_tokens": 870129112.0, "step": 22805 }, { "epoch": 2.9011576135351738, "ewc_loss": 0.035099808126688004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5099808883387595e-05, "grad_norm": 19.82126235961914, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8765599131584167, "num_tokens": 870162082.0, "step": 22806 }, { "epoch": 2.9012848238137643, "ewc_loss": 0.0350666418671608, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.506664143060334e-05, "grad_norm": 19.90277099609375, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.8864222764968872, "num_tokens": 870199868.0, "step": 22807 }, { "epoch": 2.901412034092355, "ewc_loss": 0.03511130437254906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5111304896418005e-05, "grad_norm": 19.84556770324707, "learning_rate": 1e-06, "loss": 0.3852, "mean_token_accuracy": 0.8808391094207764, "num_tokens": 870238121.0, "step": 22808 }, { "epoch": 2.9015392443709453, "ewc_loss": 0.035034213215112686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5034212487516925e-05, "grad_norm": 19.858692169189453, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8664841055870056, "num_tokens": 870278008.0, "step": 22809 }, { "epoch": 2.9016664546495354, "ewc_loss": 0.035102248191833496, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5102249967167154e-05, "grad_norm": 19.924978256225586, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8725621700286865, "num_tokens": 870317671.0, "step": 22810 }, { "epoch": 2.9017936649281264, "ewc_loss": 0.034999001771211624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4999000490643084e-05, "grad_norm": 19.795068740844727, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.882719099521637, "num_tokens": 870352893.0, "step": 22811 }, { "epoch": 2.9019208752067165, "ewc_loss": 0.034954071044921875, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.49540714523755e-05, "grad_norm": 19.904531478881836, "learning_rate": 1e-06, "loss": 0.3676, "mean_token_accuracy": 0.8839961290359497, "num_tokens": 870387513.0, "step": 22812 }, { "epoch": 2.9020480854853075, "ewc_loss": 0.035036828368902206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5036828194279224e-05, "grad_norm": 19.829164505004883, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.88222336769104, "num_tokens": 870428649.0, "step": 22813 }, { "epoch": 2.9021752957638975, "ewc_loss": 0.034922029823064804, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.492202813504264e-05, "grad_norm": 19.827199935913086, "learning_rate": 1e-06, "loss": 0.3497, "mean_token_accuracy": 0.8893697261810303, "num_tokens": 870468329.0, "step": 22814 }, { "epoch": 2.902302506042488, "ewc_loss": 0.03507170081138611, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5071701859124005e-05, "grad_norm": 19.841552734375, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8750696182250977, "num_tokens": 870509273.0, "step": 22815 }, { "epoch": 2.9024297163210786, "ewc_loss": 0.0350424200296402, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5042419767705724e-05, "grad_norm": 19.889902114868164, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8703454732894897, "num_tokens": 870544206.0, "step": 22816 }, { "epoch": 2.902556926599669, "ewc_loss": 0.035069599747657776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5069599107373506e-05, "grad_norm": 19.803749084472656, "learning_rate": 1e-06, "loss": 0.3425, "mean_token_accuracy": 0.8926344513893127, "num_tokens": 870578708.0, "step": 22817 }, { "epoch": 2.9026841368782597, "ewc_loss": 0.03493253141641617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.493253097985871e-05, "grad_norm": 19.7971134185791, "learning_rate": 1e-06, "loss": 0.366, "mean_token_accuracy": 0.8820796012878418, "num_tokens": 870620599.0, "step": 22818 }, { "epoch": 2.90281134715685, "ewc_loss": 0.035115133970975876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.511513568810187e-05, "grad_norm": 19.840662002563477, "learning_rate": 1e-06, "loss": 0.3409, "mean_token_accuracy": 0.8899808526039124, "num_tokens": 870657828.0, "step": 22819 }, { "epoch": 2.9029385574354407, "ewc_loss": 0.034984756261110306, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.498475780361332e-05, "grad_norm": 19.77375602722168, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8705326318740845, "num_tokens": 870694786.0, "step": 22820 }, { "epoch": 2.9030657677140312, "ewc_loss": 0.03503658249974251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.503658081172034e-05, "grad_norm": 19.932540893554688, "learning_rate": 1e-06, "loss": 0.3432, "mean_token_accuracy": 0.8929345011711121, "num_tokens": 870733850.0, "step": 22821 }, { "epoch": 2.9031929779926218, "ewc_loss": 0.03506959602236748, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.50695954693947e-05, "grad_norm": 19.81671142578125, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.875593900680542, "num_tokens": 870771556.0, "step": 22822 }, { "epoch": 2.9033201882712123, "ewc_loss": 0.03496340289711952, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.496340286801569e-05, "grad_norm": 19.83168601989746, "learning_rate": 1e-06, "loss": 0.3526, "mean_token_accuracy": 0.8876326084136963, "num_tokens": 870809335.0, "step": 22823 }, { "epoch": 2.903447398549803, "ewc_loss": 0.03507425636053085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.507425572024658e-05, "grad_norm": 19.842981338500977, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8726477026939392, "num_tokens": 870844239.0, "step": 22824 }, { "epoch": 2.9035746088283934, "ewc_loss": 0.03506828472018242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5068285797024146e-05, "grad_norm": 19.861703872680664, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8744508624076843, "num_tokens": 870879540.0, "step": 22825 }, { "epoch": 2.903701819106984, "ewc_loss": 0.034992482513189316, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.4992481232620776e-05, "grad_norm": 19.818525314331055, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8756020069122314, "num_tokens": 870917219.0, "step": 22826 }, { "epoch": 2.9038290293855744, "ewc_loss": 0.034998368471860886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499836748233065e-05, "grad_norm": 19.83480453491211, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8773242235183716, "num_tokens": 870952332.0, "step": 22827 }, { "epoch": 2.903956239664165, "ewc_loss": 0.03505216911435127, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.505216955090873e-05, "grad_norm": 19.79207420349121, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8651432991027832, "num_tokens": 870991609.0, "step": 22828 }, { "epoch": 2.9040834499427555, "ewc_loss": 0.035113900899887085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.511390241328627e-05, "grad_norm": 19.95236587524414, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8795828819274902, "num_tokens": 871027456.0, "step": 22829 }, { "epoch": 2.904210660221346, "ewc_loss": 0.035050053149461746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5050052247243e-05, "grad_norm": 19.79311180114746, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8668150305747986, "num_tokens": 871071833.0, "step": 22830 }, { "epoch": 2.9043378704999365, "ewc_loss": 0.03506730496883392, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.506730354274623e-05, "grad_norm": 19.879207611083984, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8738969564437866, "num_tokens": 871105446.0, "step": 22831 }, { "epoch": 2.904465080778527, "ewc_loss": 0.03509536385536194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509536327328533e-05, "grad_norm": 19.74786949157715, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8739417195320129, "num_tokens": 871141408.0, "step": 22832 }, { "epoch": 2.9045922910571176, "ewc_loss": 0.035096269100904465, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5096269130008295e-05, "grad_norm": 19.865907669067383, "learning_rate": 1e-06, "loss": 0.3786, "mean_token_accuracy": 0.8789578080177307, "num_tokens": 871173296.0, "step": 22833 }, { "epoch": 2.904719501335708, "ewc_loss": 0.035154130309820175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.515413118293509e-05, "grad_norm": 19.845468521118164, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8730806708335876, "num_tokens": 871210711.0, "step": 22834 }, { "epoch": 2.904846711614298, "ewc_loss": 0.03513156250119209, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513156116241589e-05, "grad_norm": 19.781455993652344, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8723154067993164, "num_tokens": 871251179.0, "step": 22835 }, { "epoch": 2.904973921892889, "ewc_loss": 0.035157814621925354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.515781645546667e-05, "grad_norm": 19.771453857421875, "learning_rate": 1e-06, "loss": 0.3302, "mean_token_accuracy": 0.8947814702987671, "num_tokens": 871295758.0, "step": 22836 }, { "epoch": 2.9051011321714793, "ewc_loss": 0.035121310502290726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512130933813751e-05, "grad_norm": 19.777616500854492, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8613182902336121, "num_tokens": 871331145.0, "step": 22837 }, { "epoch": 2.9052283424500702, "ewc_loss": 0.03510908782482147, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5109089367324486e-05, "grad_norm": 19.770591735839844, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8516581654548645, "num_tokens": 871372819.0, "step": 22838 }, { "epoch": 2.9053555527286603, "ewc_loss": 0.03516902029514313, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5169021430192515e-05, "grad_norm": 19.840282440185547, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8714730143547058, "num_tokens": 871414223.0, "step": 22839 }, { "epoch": 2.905482763007251, "ewc_loss": 0.03517173230648041, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.51717317244038e-05, "grad_norm": 19.75547218322754, "learning_rate": 1e-06, "loss": 0.3737, "mean_token_accuracy": 0.8816156387329102, "num_tokens": 871455013.0, "step": 22840 }, { "epoch": 2.9056099732858414, "ewc_loss": 0.035146210342645645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514621130307205e-05, "grad_norm": 19.82845115661621, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.874955952167511, "num_tokens": 871495462.0, "step": 22841 }, { "epoch": 2.905737183564432, "ewc_loss": 0.03520170971751213, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5201708669774234e-05, "grad_norm": 19.829683303833008, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8786948919296265, "num_tokens": 871534999.0, "step": 22842 }, { "epoch": 2.9058643938430224, "ewc_loss": 0.03513506427407265, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513506453600712e-05, "grad_norm": 19.875011444091797, "learning_rate": 1e-06, "loss": 0.3858, "mean_token_accuracy": 0.8764879703521729, "num_tokens": 871569816.0, "step": 22843 }, { "epoch": 2.905991604121613, "ewc_loss": 0.03521565720438957, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5215656680520624e-05, "grad_norm": 19.78443145751953, "learning_rate": 1e-06, "loss": 0.3831, "mean_token_accuracy": 0.8778177499771118, "num_tokens": 871610441.0, "step": 22844 }, { "epoch": 2.9061188144002035, "ewc_loss": 0.035105861723423004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5105862480122596e-05, "grad_norm": 19.83660888671875, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8623398542404175, "num_tokens": 871649005.0, "step": 22845 }, { "epoch": 2.906246024678794, "ewc_loss": 0.03522315248847008, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522315091686323e-05, "grad_norm": 19.823259353637695, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.870998203754425, "num_tokens": 871689263.0, "step": 22846 }, { "epoch": 2.9063732349573845, "ewc_loss": 0.03513200581073761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513200499583036e-05, "grad_norm": 19.761459350585938, "learning_rate": 1e-06, "loss": 0.3615, "mean_token_accuracy": 0.8848230838775635, "num_tokens": 871725440.0, "step": 22847 }, { "epoch": 2.906500445235975, "ewc_loss": 0.03512750193476677, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512750117806718e-05, "grad_norm": 19.809782028198242, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.872688889503479, "num_tokens": 871767839.0, "step": 22848 }, { "epoch": 2.9066276555145656, "ewc_loss": 0.035203464329242706, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520346581353806e-05, "grad_norm": 19.83942413330078, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.880155622959137, "num_tokens": 871801516.0, "step": 22849 }, { "epoch": 2.906754865793156, "ewc_loss": 0.03516697138547897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5166969610145316e-05, "grad_norm": 19.860706329345703, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.871353268623352, "num_tokens": 871840359.0, "step": 22850 }, { "epoch": 2.9068820760717466, "ewc_loss": 0.0351533479988575, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5153349017491564e-05, "grad_norm": 19.832244873046875, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8808284997940063, "num_tokens": 871879396.0, "step": 22851 }, { "epoch": 2.907009286350337, "ewc_loss": 0.03509042039513588, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509041926008649e-05, "grad_norm": 19.823253631591797, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8685218095779419, "num_tokens": 871915043.0, "step": 22852 }, { "epoch": 2.9071364966289277, "ewc_loss": 0.035112980753183365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5112982004648075e-05, "grad_norm": 19.875898361206055, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8711725473403931, "num_tokens": 871953391.0, "step": 22853 }, { "epoch": 2.9072637069075182, "ewc_loss": 0.03514355793595314, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514355921652168e-05, "grad_norm": 19.862497329711914, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.8802165985107422, "num_tokens": 871989879.0, "step": 22854 }, { "epoch": 2.9073909171861088, "ewc_loss": 0.035060394555330276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5060395020991564e-05, "grad_norm": 19.84099578857422, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.8796365261077881, "num_tokens": 872022428.0, "step": 22855 }, { "epoch": 2.9075181274646993, "ewc_loss": 0.035118259489536285, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5118260711897165e-05, "grad_norm": 19.86699104309082, "learning_rate": 1e-06, "loss": 0.3722, "mean_token_accuracy": 0.8807101249694824, "num_tokens": 872060644.0, "step": 22856 }, { "epoch": 2.90764533774329, "ewc_loss": 0.03511032462120056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.51103262801189e-05, "grad_norm": 19.878318786621094, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8728066086769104, "num_tokens": 872098230.0, "step": 22857 }, { "epoch": 2.90777254802188, "ewc_loss": 0.035092029720544815, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509203088469803e-05, "grad_norm": 19.781702041625977, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8651947975158691, "num_tokens": 872131217.0, "step": 22858 }, { "epoch": 2.907899758300471, "ewc_loss": 0.03505284711718559, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.505284621496685e-05, "grad_norm": 19.821680068969727, "learning_rate": 1e-06, "loss": 0.3626, "mean_token_accuracy": 0.8852174282073975, "num_tokens": 872163912.0, "step": 22859 }, { "epoch": 2.908026968579061, "ewc_loss": 0.03511554002761841, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.511553950374946e-05, "grad_norm": 19.84146499633789, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8516356945037842, "num_tokens": 872200694.0, "step": 22860 }, { "epoch": 2.908154178857652, "ewc_loss": 0.03507475182414055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.507475048536435e-05, "grad_norm": 19.756057739257812, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8770049810409546, "num_tokens": 872241156.0, "step": 22861 }, { "epoch": 2.908281389136242, "ewc_loss": 0.03520271182060242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520271275192499e-05, "grad_norm": 19.847137451171875, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8790383338928223, "num_tokens": 872273048.0, "step": 22862 }, { "epoch": 2.908408599414833, "ewc_loss": 0.035113513469696045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.511351314955391e-05, "grad_norm": 19.765960693359375, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.8794888257980347, "num_tokens": 872312165.0, "step": 22863 }, { "epoch": 2.908535809693423, "ewc_loss": 0.035125985741615295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512598414090462e-05, "grad_norm": 19.793466567993164, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8781282305717468, "num_tokens": 872352740.0, "step": 22864 }, { "epoch": 2.9086630199720136, "ewc_loss": 0.03519133850932121, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5191336792195216e-05, "grad_norm": 19.87750816345215, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.861964225769043, "num_tokens": 872397800.0, "step": 22865 }, { "epoch": 2.908790230250604, "ewc_loss": 0.035120271146297455, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5120272514177486e-05, "grad_norm": 19.805240631103516, "learning_rate": 1e-06, "loss": 0.3595, "mean_token_accuracy": 0.8868455290794373, "num_tokens": 872429307.0, "step": 22866 }, { "epoch": 2.9089174405291947, "ewc_loss": 0.03522898256778717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5228982596891e-05, "grad_norm": 19.933446884155273, "learning_rate": 1e-06, "loss": 0.3924, "mean_token_accuracy": 0.8732056617736816, "num_tokens": 872465897.0, "step": 22867 }, { "epoch": 2.909044650807785, "ewc_loss": 0.03514020889997482, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514020863804035e-05, "grad_norm": 19.733795166015625, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8664984703063965, "num_tokens": 872503252.0, "step": 22868 }, { "epoch": 2.9091718610863757, "ewc_loss": 0.03508209064602852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.508209192659706e-05, "grad_norm": 19.804855346679688, "learning_rate": 1e-06, "loss": 0.3775, "mean_token_accuracy": 0.8795698881149292, "num_tokens": 872546276.0, "step": 22869 }, { "epoch": 2.9092990713649662, "ewc_loss": 0.03520099073648453, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520099198794924e-05, "grad_norm": 19.8280029296875, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8740584254264832, "num_tokens": 872585690.0, "step": 22870 }, { "epoch": 2.9094262816435568, "ewc_loss": 0.03513478860259056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513478804961778e-05, "grad_norm": 19.802377700805664, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.865767776966095, "num_tokens": 872621718.0, "step": 22871 }, { "epoch": 2.9095534919221473, "ewc_loss": 0.035189446061849594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.518944504321553e-05, "grad_norm": 19.81412696838379, "learning_rate": 1e-06, "loss": 0.3735, "mean_token_accuracy": 0.880286693572998, "num_tokens": 872659349.0, "step": 22872 }, { "epoch": 2.909680702200738, "ewc_loss": 0.03513156622648239, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.51315648003947e-05, "grad_norm": 19.80780792236328, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8711695671081543, "num_tokens": 872704468.0, "step": 22873 }, { "epoch": 2.9098079124793284, "ewc_loss": 0.03522156551480293, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522156475810334e-05, "grad_norm": 19.817485809326172, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8758318424224854, "num_tokens": 872740003.0, "step": 22874 }, { "epoch": 2.909935122757919, "ewc_loss": 0.035090137273073196, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509013549773954e-05, "grad_norm": 19.771516799926758, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8547524213790894, "num_tokens": 872780362.0, "step": 22875 }, { "epoch": 2.9100623330365094, "ewc_loss": 0.03512497991323471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512498005875386e-05, "grad_norm": 19.8308048248291, "learning_rate": 1e-06, "loss": 0.366, "mean_token_accuracy": 0.8827931880950928, "num_tokens": 872823719.0, "step": 22876 }, { "epoch": 2.9101895433151, "ewc_loss": 0.03520665690302849, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520665632095188e-05, "grad_norm": 19.82787322998047, "learning_rate": 1e-06, "loss": 0.3811, "mean_token_accuracy": 0.8806087970733643, "num_tokens": 872856276.0, "step": 22877 }, { "epoch": 2.9103167535936905, "ewc_loss": 0.035114746540784836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5114746424369514e-05, "grad_norm": 19.878206253051758, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8686330914497375, "num_tokens": 872893688.0, "step": 22878 }, { "epoch": 2.910443963872281, "ewc_loss": 0.03522367402911186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5223674785811454e-05, "grad_norm": 19.861095428466797, "learning_rate": 1e-06, "loss": 0.3332, "mean_token_accuracy": 0.8933034539222717, "num_tokens": 872931169.0, "step": 22879 }, { "epoch": 2.9105711741508715, "ewc_loss": 0.03508859500288963, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.508859663270414e-05, "grad_norm": 19.861114501953125, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8708220720291138, "num_tokens": 872970836.0, "step": 22880 }, { "epoch": 2.910698384429462, "ewc_loss": 0.035179875791072845, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.517987715895288e-05, "grad_norm": 19.832109451293945, "learning_rate": 1e-06, "loss": 0.3479, "mean_token_accuracy": 0.8886874914169312, "num_tokens": 873010879.0, "step": 22881 }, { "epoch": 2.9108255947080526, "ewc_loss": 0.035090990364551544, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5090990422759205e-05, "grad_norm": 19.843563079833984, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8776458501815796, "num_tokens": 873058018.0, "step": 22882 }, { "epoch": 2.9109528049866427, "ewc_loss": 0.035133276134729385, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513327465043403e-05, "grad_norm": 19.856292724609375, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8653488159179688, "num_tokens": 873096259.0, "step": 22883 }, { "epoch": 2.9110800152652336, "ewc_loss": 0.03503643721342087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5036438930546865e-05, "grad_norm": 19.719043731689453, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8655185699462891, "num_tokens": 873131955.0, "step": 22884 }, { "epoch": 2.9112072255438237, "ewc_loss": 0.03512633964419365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5126340662827715e-05, "grad_norm": 19.823253631591797, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8701329827308655, "num_tokens": 873171024.0, "step": 22885 }, { "epoch": 2.9113344358224147, "ewc_loss": 0.03516558185219765, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5165583540219814e-05, "grad_norm": 19.760053634643555, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8792078495025635, "num_tokens": 873210568.0, "step": 22886 }, { "epoch": 2.9114616461010048, "ewc_loss": 0.035111650824546814, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.511165050440468e-05, "grad_norm": 19.871219635009766, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8781149387359619, "num_tokens": 873250025.0, "step": 22887 }, { "epoch": 2.9115888563795957, "ewc_loss": 0.035255830734968185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525583088048734e-05, "grad_norm": 19.84442138671875, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8677798509597778, "num_tokens": 873285284.0, "step": 22888 }, { "epoch": 2.911716066658186, "ewc_loss": 0.035147130489349365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5147131711710244e-05, "grad_norm": 19.842634201049805, "learning_rate": 1e-06, "loss": 0.3819, "mean_token_accuracy": 0.8782105445861816, "num_tokens": 873320102.0, "step": 22889 }, { "epoch": 2.9118432769367764, "ewc_loss": 0.03515738621354103, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5157387173967436e-05, "grad_norm": 19.846660614013672, "learning_rate": 1e-06, "loss": 0.3656, "mean_token_accuracy": 0.8804264068603516, "num_tokens": 873354668.0, "step": 22890 }, { "epoch": 2.911970487215367, "ewc_loss": 0.035173963755369186, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.517396544339135e-05, "grad_norm": 19.766721725463867, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8734106421470642, "num_tokens": 873387415.0, "step": 22891 }, { "epoch": 2.9120976974939574, "ewc_loss": 0.03522234782576561, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5222346923546866e-05, "grad_norm": 19.818714141845703, "learning_rate": 1e-06, "loss": 0.3403, "mean_token_accuracy": 0.8928895592689514, "num_tokens": 873428679.0, "step": 22892 }, { "epoch": 2.912224907772548, "ewc_loss": 0.035260122269392014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5260123695479706e-05, "grad_norm": 19.869054794311523, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8766500949859619, "num_tokens": 873466973.0, "step": 22893 }, { "epoch": 2.9123521180511385, "ewc_loss": 0.03526077792048454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526077853166498e-05, "grad_norm": 19.807594299316406, "learning_rate": 1e-06, "loss": 0.3397, "mean_token_accuracy": 0.8909431099891663, "num_tokens": 873510957.0, "step": 22894 }, { "epoch": 2.912479328329729, "ewc_loss": 0.03514730930328369, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514730997267179e-05, "grad_norm": 19.85248565673828, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8571540713310242, "num_tokens": 873546456.0, "step": 22895 }, { "epoch": 2.9126065386083195, "ewc_loss": 0.03527338057756424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.527338049025275e-05, "grad_norm": 19.891860961914062, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8629940152168274, "num_tokens": 873591972.0, "step": 22896 }, { "epoch": 2.91273374888691, "ewc_loss": 0.0352114699780941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521146936691366e-05, "grad_norm": 19.905418395996094, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8743733763694763, "num_tokens": 873629790.0, "step": 22897 }, { "epoch": 2.9128609591655006, "ewc_loss": 0.03514036536216736, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5140365071129054e-05, "grad_norm": 19.746837615966797, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8760223388671875, "num_tokens": 873667419.0, "step": 22898 }, { "epoch": 2.912988169444091, "ewc_loss": 0.035194698721170425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519469828461297e-05, "grad_norm": 19.95698356628418, "learning_rate": 1e-06, "loss": 0.3625, "mean_token_accuracy": 0.8849364519119263, "num_tokens": 873705834.0, "step": 22899 }, { "epoch": 2.9131153797226816, "ewc_loss": 0.03518985956907272, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5189859772799537e-05, "grad_norm": 19.808879852294922, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8665343523025513, "num_tokens": 873748859.0, "step": 22900 }, { "epoch": 2.913242590001272, "ewc_loss": 0.03509967029094696, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5099670640192926e-05, "grad_norm": 19.83326530456543, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.867241382598877, "num_tokens": 873788068.0, "step": 22901 }, { "epoch": 2.9133698002798627, "ewc_loss": 0.03519768267869949, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5197681427234784e-05, "grad_norm": 19.828704833984375, "learning_rate": 1e-06, "loss": 0.3637, "mean_token_accuracy": 0.8843698501586914, "num_tokens": 873827820.0, "step": 22902 }, { "epoch": 2.9134970105584532, "ewc_loss": 0.035146769136190414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5146767913829535e-05, "grad_norm": 19.807552337646484, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8720571994781494, "num_tokens": 873866271.0, "step": 22903 }, { "epoch": 2.9136242208370438, "ewc_loss": 0.03520464524626732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520464451867156e-05, "grad_norm": 19.88446807861328, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8730037212371826, "num_tokens": 873907409.0, "step": 22904 }, { "epoch": 2.9137514311156343, "ewc_loss": 0.03512091189622879, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5120912798447534e-05, "grad_norm": 19.821937561035156, "learning_rate": 1e-06, "loss": 0.3675, "mean_token_accuracy": 0.881369411945343, "num_tokens": 873947940.0, "step": 22905 }, { "epoch": 2.913878641394225, "ewc_loss": 0.0351056233048439, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.510562237352133e-05, "grad_norm": 19.86630630493164, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8611675500869751, "num_tokens": 873990062.0, "step": 22906 }, { "epoch": 2.9140058516728153, "ewc_loss": 0.03515469655394554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.515469506965019e-05, "grad_norm": 19.79461669921875, "learning_rate": 1e-06, "loss": 0.3511, "mean_token_accuracy": 0.8871700763702393, "num_tokens": 874026660.0, "step": 22907 }, { "epoch": 2.9141330619514054, "ewc_loss": 0.03510456904768944, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.510456735966727e-05, "grad_norm": 19.833690643310547, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8818678855895996, "num_tokens": 874064935.0, "step": 22908 }, { "epoch": 2.9142602722299964, "ewc_loss": 0.03513927012681961, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513927003950812e-05, "grad_norm": 19.930007934570312, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8743599653244019, "num_tokens": 874102807.0, "step": 22909 }, { "epoch": 2.9143874825085865, "ewc_loss": 0.035144489258527756, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5144490539096296e-05, "grad_norm": 19.817096710205078, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8771812915802002, "num_tokens": 874139688.0, "step": 22910 }, { "epoch": 2.9145146927871775, "ewc_loss": 0.035071566700935364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.507156725390814e-05, "grad_norm": 19.84709930419922, "learning_rate": 1e-06, "loss": 0.3756, "mean_token_accuracy": 0.8808724880218506, "num_tokens": 874184505.0, "step": 22911 }, { "epoch": 2.9146419030657675, "ewc_loss": 0.035139527171850204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513952833600342e-05, "grad_norm": 19.930814743041992, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8756488561630249, "num_tokens": 874223210.0, "step": 22912 }, { "epoch": 2.914769113344358, "ewc_loss": 0.034997906535863876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.499790545902215e-05, "grad_norm": 19.811946868896484, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8638362884521484, "num_tokens": 874263939.0, "step": 22913 }, { "epoch": 2.9148963236229486, "ewc_loss": 0.03501565754413605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.501565879560076e-05, "grad_norm": 19.79987907409668, "learning_rate": 1e-06, "loss": 0.3733, "mean_token_accuracy": 0.8828580975532532, "num_tokens": 874303946.0, "step": 22914 }, { "epoch": 2.915023533901539, "ewc_loss": 0.03508005663752556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5080058296443895e-05, "grad_norm": 19.832477569580078, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8781993389129639, "num_tokens": 874343701.0, "step": 22915 }, { "epoch": 2.9151507441801296, "ewc_loss": 0.03506709635257721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.506709617795423e-05, "grad_norm": 19.8695068359375, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8789859414100647, "num_tokens": 874380425.0, "step": 22916 }, { "epoch": 2.91527795445872, "ewc_loss": 0.03510759025812149, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5107590520055965e-05, "grad_norm": 19.82529640197754, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8760462999343872, "num_tokens": 874417643.0, "step": 22917 }, { "epoch": 2.9154051647373107, "ewc_loss": 0.035025984048843384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.502598337945528e-05, "grad_norm": 19.733592987060547, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8693219423294067, "num_tokens": 874463939.0, "step": 22918 }, { "epoch": 2.9155323750159012, "ewc_loss": 0.03508691489696503, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.508691588649526e-05, "grad_norm": 19.783021926879883, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8722869157791138, "num_tokens": 874503156.0, "step": 22919 }, { "epoch": 2.9156595852944918, "ewc_loss": 0.03514882177114487, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514882337185554e-05, "grad_norm": 19.841888427734375, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8666484951972961, "num_tokens": 874546754.0, "step": 22920 }, { "epoch": 2.9157867955730823, "ewc_loss": 0.035125985741615295, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512598414090462e-05, "grad_norm": 19.86769676208496, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.879358172416687, "num_tokens": 874590781.0, "step": 22921 }, { "epoch": 2.915914005851673, "ewc_loss": 0.03510676324367523, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5106764698866755e-05, "grad_norm": 19.891141891479492, "learning_rate": 1e-06, "loss": 0.3572, "mean_token_accuracy": 0.8844101428985596, "num_tokens": 874629592.0, "step": 22922 }, { "epoch": 2.9160412161302633, "ewc_loss": 0.035127315670251846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5127315641148016e-05, "grad_norm": 19.904664993286133, "learning_rate": 1e-06, "loss": 0.3583, "mean_token_accuracy": 0.8828147053718567, "num_tokens": 874667372.0, "step": 22923 }, { "epoch": 2.916168426408854, "ewc_loss": 0.03500884398818016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5008844861295074e-05, "grad_norm": 19.838441848754883, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8726714253425598, "num_tokens": 874704687.0, "step": 22924 }, { "epoch": 2.9162956366874444, "ewc_loss": 0.0350014790892601, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5001477954210714e-05, "grad_norm": 19.799057006835938, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8750207424163818, "num_tokens": 874740471.0, "step": 22925 }, { "epoch": 2.916422846966035, "ewc_loss": 0.03508681058883667, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.508681038510986e-05, "grad_norm": 19.864471435546875, "learning_rate": 1e-06, "loss": 0.3669, "mean_token_accuracy": 0.8811759948730469, "num_tokens": 874780747.0, "step": 22926 }, { "epoch": 2.9165500572446255, "ewc_loss": 0.03510916605591774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5109165764879435e-05, "grad_norm": 19.878616333007812, "learning_rate": 1e-06, "loss": 0.3727, "mean_token_accuracy": 0.8799824118614197, "num_tokens": 874814204.0, "step": 22927 }, { "epoch": 2.916677267523216, "ewc_loss": 0.03504982218146324, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5049823054578155e-05, "grad_norm": 19.849458694458008, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8748770952224731, "num_tokens": 874851872.0, "step": 22928 }, { "epoch": 2.9168044778018065, "ewc_loss": 0.035063669085502625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5063669201917946e-05, "grad_norm": 19.877016067504883, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.86027991771698, "num_tokens": 874884215.0, "step": 22929 }, { "epoch": 2.916931688080397, "ewc_loss": 0.03509977087378502, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5099772503599524e-05, "grad_norm": 19.830656051635742, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8733628988265991, "num_tokens": 874919047.0, "step": 22930 }, { "epoch": 2.917058898358987, "ewc_loss": 0.03504612669348717, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.504612686811015e-05, "grad_norm": 19.884767532348633, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8695141673088074, "num_tokens": 874956469.0, "step": 22931 }, { "epoch": 2.917186108637578, "ewc_loss": 0.0351165235042572, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5116521758027375e-05, "grad_norm": 19.880887985229492, "learning_rate": 1e-06, "loss": 0.3537, "mean_token_accuracy": 0.8871214389801025, "num_tokens": 874994633.0, "step": 22932 }, { "epoch": 2.917313318916168, "ewc_loss": 0.03507864102721214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.507863948470913e-05, "grad_norm": 19.814552307128906, "learning_rate": 1e-06, "loss": 0.3548, "mean_token_accuracy": 0.8885583281517029, "num_tokens": 875028530.0, "step": 22933 }, { "epoch": 2.917440529194759, "ewc_loss": 0.03502590209245682, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.502590334392153e-05, "grad_norm": 19.809782028198242, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8682604432106018, "num_tokens": 875059785.0, "step": 22934 }, { "epoch": 2.9175677394733492, "ewc_loss": 0.035104844719171524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.510484384605661e-05, "grad_norm": 19.832992553710938, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8738427758216858, "num_tokens": 875096057.0, "step": 22935 }, { "epoch": 2.91769494975194, "ewc_loss": 0.0351264625787735, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5126464354107156e-05, "grad_norm": 19.81524658203125, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.88127601146698, "num_tokens": 875141988.0, "step": 22936 }, { "epoch": 2.9178221600305303, "ewc_loss": 0.035185739398002625, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.51857379428111e-05, "grad_norm": 19.855419158935547, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.880746066570282, "num_tokens": 875176973.0, "step": 22937 }, { "epoch": 2.917949370309121, "ewc_loss": 0.03521641716361046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5216417018091306e-05, "grad_norm": 19.851055145263672, "learning_rate": 1e-06, "loss": 0.3663, "mean_token_accuracy": 0.8844550848007202, "num_tokens": 875209271.0, "step": 22938 }, { "epoch": 2.9180765805877114, "ewc_loss": 0.03516484424471855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5164845030521974e-05, "grad_norm": 19.78864288330078, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8805295825004578, "num_tokens": 875247460.0, "step": 22939 }, { "epoch": 2.918203790866302, "ewc_loss": 0.03517938032746315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5179378755856305e-05, "grad_norm": 19.897981643676758, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.873869776725769, "num_tokens": 875286013.0, "step": 22940 }, { "epoch": 2.9183310011448924, "ewc_loss": 0.03528435900807381, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528435991029255e-05, "grad_norm": 19.87390899658203, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8722515106201172, "num_tokens": 875324547.0, "step": 22941 }, { "epoch": 2.918458211423483, "ewc_loss": 0.0351446270942688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5144628782290965e-05, "grad_norm": 19.81183624267578, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.867058515548706, "num_tokens": 875366655.0, "step": 22942 }, { "epoch": 2.9185854217020735, "ewc_loss": 0.03520452231168747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5204520827392116e-05, "grad_norm": 19.85634994506836, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8697100877761841, "num_tokens": 875408571.0, "step": 22943 }, { "epoch": 2.918712631980664, "ewc_loss": 0.035196851938962936, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519685196806677e-05, "grad_norm": 19.88657569885254, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8705754280090332, "num_tokens": 875450730.0, "step": 22944 }, { "epoch": 2.9188398422592545, "ewc_loss": 0.03519821539521217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5198216210119426e-05, "grad_norm": 19.902172088623047, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8799228072166443, "num_tokens": 875486850.0, "step": 22945 }, { "epoch": 2.918967052537845, "ewc_loss": 0.03515154495835304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.515154458000325e-05, "grad_norm": 19.7771053314209, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.869748592376709, "num_tokens": 875528653.0, "step": 22946 }, { "epoch": 2.9190942628164356, "ewc_loss": 0.035098783671855927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5098782973363996e-05, "grad_norm": 19.91752815246582, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.8866415619850159, "num_tokens": 875564155.0, "step": 22947 }, { "epoch": 2.919221473095026, "ewc_loss": 0.03519253060221672, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519253004924394e-05, "grad_norm": 19.82904624938965, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.874075710773468, "num_tokens": 875598548.0, "step": 22948 }, { "epoch": 2.9193486833736166, "ewc_loss": 0.03507298603653908, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.507298606564291e-05, "grad_norm": 19.836240768432617, "learning_rate": 1e-06, "loss": 0.3448, "mean_token_accuracy": 0.8914176225662231, "num_tokens": 875634587.0, "step": 22949 }, { "epoch": 2.919475893652207, "ewc_loss": 0.035158053040504456, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5158052924089134e-05, "grad_norm": 19.8807430267334, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8740824460983276, "num_tokens": 875671956.0, "step": 22950 }, { "epoch": 2.9196031039307977, "ewc_loss": 0.0351397804915905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513977935654111e-05, "grad_norm": 19.843229293823242, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8776019215583801, "num_tokens": 875706327.0, "step": 22951 }, { "epoch": 2.9197303142093882, "ewc_loss": 0.03507877513766289, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.507877408992499e-05, "grad_norm": 19.851560592651367, "learning_rate": 1e-06, "loss": 0.3723, "mean_token_accuracy": 0.8822497129440308, "num_tokens": 875749320.0, "step": 22952 }, { "epoch": 2.9198575244879788, "ewc_loss": 0.03510478883981705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5104789276374504e-05, "grad_norm": 19.81428337097168, "learning_rate": 1e-06, "loss": 0.3673, "mean_token_accuracy": 0.8843739032745361, "num_tokens": 875783871.0, "step": 22953 }, { "epoch": 2.9199847347665693, "ewc_loss": 0.03514942526817322, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514942363835871e-05, "grad_norm": 19.848485946655273, "learning_rate": 1e-06, "loss": 0.3893, "mean_token_accuracy": 0.8753007650375366, "num_tokens": 875821716.0, "step": 22954 }, { "epoch": 2.92011194504516, "ewc_loss": 0.03514247015118599, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514247146085836e-05, "grad_norm": 19.85654067993164, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8795812129974365, "num_tokens": 875854339.0, "step": 22955 }, { "epoch": 2.92023915532375, "ewc_loss": 0.035161349922418594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.516134893288836e-05, "grad_norm": 19.870704650878906, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8777639865875244, "num_tokens": 875889456.0, "step": 22956 }, { "epoch": 2.920366365602341, "ewc_loss": 0.03517047315835953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5170472983736545e-05, "grad_norm": 19.903057098388672, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8677141666412354, "num_tokens": 875926659.0, "step": 22957 }, { "epoch": 2.920493575880931, "ewc_loss": 0.035148054361343384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5148055758327246e-05, "grad_norm": 19.797863006591797, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8833599090576172, "num_tokens": 875972766.0, "step": 22958 }, { "epoch": 2.920620786159522, "ewc_loss": 0.03509549796581268, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509549787850119e-05, "grad_norm": 19.83770751953125, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8722595572471619, "num_tokens": 876011824.0, "step": 22959 }, { "epoch": 2.920747996438112, "ewc_loss": 0.035132452845573425, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513245246722363e-05, "grad_norm": 19.843961715698242, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8588377833366394, "num_tokens": 876050353.0, "step": 22960 }, { "epoch": 2.920875206716703, "ewc_loss": 0.03514833003282547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514832860673778e-05, "grad_norm": 19.912919998168945, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8619931936264038, "num_tokens": 876089358.0, "step": 22961 }, { "epoch": 2.921002416995293, "ewc_loss": 0.035119857639074326, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.511985778459348e-05, "grad_norm": 19.824337005615234, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8632686138153076, "num_tokens": 876126287.0, "step": 22962 }, { "epoch": 2.9211296272738836, "ewc_loss": 0.03508801385760307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5088014556095004e-05, "grad_norm": 19.798095703125, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8845950365066528, "num_tokens": 876165157.0, "step": 22963 }, { "epoch": 2.921256837552474, "ewc_loss": 0.03516582027077675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5165820008842275e-05, "grad_norm": 19.85433006286621, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8765650391578674, "num_tokens": 876201391.0, "step": 22964 }, { "epoch": 2.9213840478310646, "ewc_loss": 0.035174671560525894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.517467121127993e-05, "grad_norm": 19.75421714782715, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8825411200523376, "num_tokens": 876243004.0, "step": 22965 }, { "epoch": 2.921511258109655, "ewc_loss": 0.03516167029738426, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.516166907502338e-05, "grad_norm": 19.82954978942871, "learning_rate": 1e-06, "loss": 0.379, "mean_token_accuracy": 0.8783005475997925, "num_tokens": 876279245.0, "step": 22966 }, { "epoch": 2.9216384683882457, "ewc_loss": 0.03527478501200676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5274784750072286e-05, "grad_norm": 19.842876434326172, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8750340938568115, "num_tokens": 876312145.0, "step": 22967 }, { "epoch": 2.9217656786668362, "ewc_loss": 0.03519228473305702, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519228630466387e-05, "grad_norm": 19.856353759765625, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.8765607476234436, "num_tokens": 876351354.0, "step": 22968 }, { "epoch": 2.9218928889454268, "ewc_loss": 0.03524000197649002, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.524000203469768e-05, "grad_norm": 19.848966598510742, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.864540159702301, "num_tokens": 876388621.0, "step": 22969 }, { "epoch": 2.9220200992240173, "ewc_loss": 0.035212669521570206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521266989992e-05, "grad_norm": 19.877897262573242, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8701603412628174, "num_tokens": 876418377.0, "step": 22970 }, { "epoch": 2.922147309502608, "ewc_loss": 0.03523869439959526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5238696000305936e-05, "grad_norm": 19.797014236450195, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8786399960517883, "num_tokens": 876453351.0, "step": 22971 }, { "epoch": 2.9222745197811983, "ewc_loss": 0.03524535894393921, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.524535713950172e-05, "grad_norm": 19.833637237548828, "learning_rate": 1e-06, "loss": 0.3658, "mean_token_accuracy": 0.8827905654907227, "num_tokens": 876484830.0, "step": 22972 }, { "epoch": 2.922401730059789, "ewc_loss": 0.03525194153189659, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525194188114256e-05, "grad_norm": 19.83538055419922, "learning_rate": 1e-06, "loss": 0.3579, "mean_token_accuracy": 0.8860070705413818, "num_tokens": 876524318.0, "step": 22973 }, { "epoch": 2.9225289403383794, "ewc_loss": 0.03524089232087135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.524089333950542e-05, "grad_norm": 19.80757713317871, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8732900023460388, "num_tokens": 876562421.0, "step": 22974 }, { "epoch": 2.92265615061697, "ewc_loss": 0.03528551384806633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52855131495744e-05, "grad_norm": 19.905418395996094, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8697664141654968, "num_tokens": 876605681.0, "step": 22975 }, { "epoch": 2.9227833608955605, "ewc_loss": 0.03528771176934242, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528771048877388e-05, "grad_norm": 19.83778953552246, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8668031692504883, "num_tokens": 876647371.0, "step": 22976 }, { "epoch": 2.922910571174151, "ewc_loss": 0.035246580839157104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5246579500380903e-05, "grad_norm": 19.84815788269043, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8816279172897339, "num_tokens": 876685563.0, "step": 22977 }, { "epoch": 2.9230377814527415, "ewc_loss": 0.03522126004099846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5221259167883545e-05, "grad_norm": 19.797489166259766, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8717606067657471, "num_tokens": 876718404.0, "step": 22978 }, { "epoch": 2.923164991731332, "ewc_loss": 0.03525308147072792, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5253080568509176e-05, "grad_norm": 19.835065841674805, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8795835971832275, "num_tokens": 876751888.0, "step": 22979 }, { "epoch": 2.9232922020099226, "ewc_loss": 0.03530485928058624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530485992087051e-05, "grad_norm": 19.85358238220215, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8813138604164124, "num_tokens": 876786813.0, "step": 22980 }, { "epoch": 2.9234194122885127, "ewc_loss": 0.03522235527634621, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522235419950448e-05, "grad_norm": 19.858327865600586, "learning_rate": 1e-06, "loss": 0.3567, "mean_token_accuracy": 0.8850029706954956, "num_tokens": 876824511.0, "step": 22981 }, { "epoch": 2.9235466225671036, "ewc_loss": 0.03524840995669365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.524840940372087e-05, "grad_norm": 19.861541748046875, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.8793879151344299, "num_tokens": 876856409.0, "step": 22982 }, { "epoch": 2.9236738328456937, "ewc_loss": 0.03522096201777458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5220960853621364e-05, "grad_norm": 19.87809181213379, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8627451062202454, "num_tokens": 876889204.0, "step": 22983 }, { "epoch": 2.9238010431242847, "ewc_loss": 0.03522913530468941, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52291353920009e-05, "grad_norm": 19.858640670776367, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8758985996246338, "num_tokens": 876927185.0, "step": 22984 }, { "epoch": 2.9239282534028748, "ewc_loss": 0.03523726761341095, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523726627463475e-05, "grad_norm": 19.837631225585938, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8729002475738525, "num_tokens": 876965744.0, "step": 22985 }, { "epoch": 2.9240554636814657, "ewc_loss": 0.035205766558647156, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520576501614414e-05, "grad_norm": 19.908815383911133, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8815155029296875, "num_tokens": 876997107.0, "step": 22986 }, { "epoch": 2.924182673960056, "ewc_loss": 0.03522207960486412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522208135109395e-05, "grad_norm": 19.788972854614258, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8663891553878784, "num_tokens": 877037894.0, "step": 22987 }, { "epoch": 2.9243098842386464, "ewc_loss": 0.03521263226866722, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521263352013193e-05, "grad_norm": 19.844816207885742, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8632301092147827, "num_tokens": 877078724.0, "step": 22988 }, { "epoch": 2.924437094517237, "ewc_loss": 0.03537367656826973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.537367592798546e-05, "grad_norm": 19.863014221191406, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8759043216705322, "num_tokens": 877119685.0, "step": 22989 }, { "epoch": 2.9245643047958274, "ewc_loss": 0.03523937612771988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523937630234286e-05, "grad_norm": 19.895755767822266, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8787361979484558, "num_tokens": 877157055.0, "step": 22990 }, { "epoch": 2.924691515074418, "ewc_loss": 0.035306092351675034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530609319568612e-05, "grad_norm": 19.853466033935547, "learning_rate": 1e-06, "loss": 0.3765, "mean_token_accuracy": 0.8800135850906372, "num_tokens": 877188569.0, "step": 22991 }, { "epoch": 2.9248187253530085, "ewc_loss": 0.035237859934568405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5237859265180305e-05, "grad_norm": 19.871213912963867, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8690457940101624, "num_tokens": 877222745.0, "step": 22992 }, { "epoch": 2.924945935631599, "ewc_loss": 0.03528907150030136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5289071092847735e-05, "grad_norm": 19.89873695373535, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8605907559394836, "num_tokens": 877262510.0, "step": 22993 }, { "epoch": 2.9250731459101895, "ewc_loss": 0.03527034819126129, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.527035005390644e-05, "grad_norm": 19.848175048828125, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8695107102394104, "num_tokens": 877300164.0, "step": 22994 }, { "epoch": 2.92520035618878, "ewc_loss": 0.03521718457341194, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52171846316196e-05, "grad_norm": 19.788793563842773, "learning_rate": 1e-06, "loss": 0.3636, "mean_token_accuracy": 0.8851699829101562, "num_tokens": 877337187.0, "step": 22995 }, { "epoch": 2.9253275664673706, "ewc_loss": 0.035247549414634705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52475508407224e-05, "grad_norm": 19.841106414794922, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8566754460334778, "num_tokens": 877375902.0, "step": 22996 }, { "epoch": 2.925454776745961, "ewc_loss": 0.03528670221567154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528670276864432e-05, "grad_norm": 19.845857620239258, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8744460344314575, "num_tokens": 877414406.0, "step": 22997 }, { "epoch": 2.9255819870245516, "ewc_loss": 0.03526856750249863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5268567444290966e-05, "grad_norm": 19.904510498046875, "learning_rate": 1e-06, "loss": 0.3541, "mean_token_accuracy": 0.8821620345115662, "num_tokens": 877448977.0, "step": 22998 }, { "epoch": 2.925709197303142, "ewc_loss": 0.03532557561993599, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532557457219809e-05, "grad_norm": 19.914514541625977, "learning_rate": 1e-06, "loss": 0.3639, "mean_token_accuracy": 0.8830702900886536, "num_tokens": 877486766.0, "step": 22999 }, { "epoch": 2.9258364075817327, "ewc_loss": 0.03522126376628876, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522126280586235e-05, "grad_norm": 19.91982650756836, "learning_rate": 1e-06, "loss": 0.3514, "mean_token_accuracy": 0.8888890147209167, "num_tokens": 877519320.0, "step": 23000 }, { "epoch": 2.925963617860323, "ewc_loss": 0.03527470678091049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5274708352517337e-05, "grad_norm": 19.909135818481445, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8528293967247009, "num_tokens": 877557913.0, "step": 23001 }, { "epoch": 2.9260908281389137, "ewc_loss": 0.03520069271326065, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5200693673687056e-05, "grad_norm": 19.848522186279297, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8759552240371704, "num_tokens": 877591197.0, "step": 23002 }, { "epoch": 2.9262180384175043, "ewc_loss": 0.03523630276322365, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523630221025087e-05, "grad_norm": 19.868436813354492, "learning_rate": 1e-06, "loss": 0.3387, "mean_token_accuracy": 0.8925276398658752, "num_tokens": 877625018.0, "step": 23003 }, { "epoch": 2.926345248696095, "ewc_loss": 0.035260703414678574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5260702134110034e-05, "grad_norm": 19.85259437561035, "learning_rate": 1e-06, "loss": 0.3826, "mean_token_accuracy": 0.8799567818641663, "num_tokens": 877662433.0, "step": 23004 }, { "epoch": 2.9264724589746853, "ewc_loss": 0.03521816432476044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521816324791871e-05, "grad_norm": 19.885784149169922, "learning_rate": 1e-06, "loss": 0.3647, "mean_token_accuracy": 0.8870871663093567, "num_tokens": 877703576.0, "step": 23005 }, { "epoch": 2.9265996692532754, "ewc_loss": 0.03524778038263321, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5247780033387244e-05, "grad_norm": 19.848979949951172, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8716736435890198, "num_tokens": 877734025.0, "step": 23006 }, { "epoch": 2.9267268795318664, "ewc_loss": 0.03519671782851219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5196717362850904e-05, "grad_norm": 19.821012496948242, "learning_rate": 1e-06, "loss": 0.3521, "mean_token_accuracy": 0.8851839303970337, "num_tokens": 877771942.0, "step": 23007 }, { "epoch": 2.9268540898104565, "ewc_loss": 0.035205453634262085, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520545214996673e-05, "grad_norm": 19.845388412475586, "learning_rate": 1e-06, "loss": 0.3822, "mean_token_accuracy": 0.8798204660415649, "num_tokens": 877815662.0, "step": 23008 }, { "epoch": 2.9269813000890474, "ewc_loss": 0.035226088017225266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5226086765760556e-05, "grad_norm": 19.85735321044922, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.861035943031311, "num_tokens": 877855398.0, "step": 23009 }, { "epoch": 2.9271085103676375, "ewc_loss": 0.03525451198220253, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5254510294180363e-05, "grad_norm": 19.88903045654297, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8702757358551025, "num_tokens": 877898544.0, "step": 23010 }, { "epoch": 2.927235720646228, "ewc_loss": 0.035253264009952545, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525326246744953e-05, "grad_norm": 19.89192771911621, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8750244975090027, "num_tokens": 877937914.0, "step": 23011 }, { "epoch": 2.9273629309248186, "ewc_loss": 0.03513934463262558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513934279908426e-05, "grad_norm": 19.884727478027344, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8601183891296387, "num_tokens": 877979266.0, "step": 23012 }, { "epoch": 2.927490141203409, "ewc_loss": 0.035189189016819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.518919038469903e-05, "grad_norm": 19.853744506835938, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8796752691268921, "num_tokens": 878019762.0, "step": 23013 }, { "epoch": 2.9276173514819996, "ewc_loss": 0.03517089784145355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5170898627256975e-05, "grad_norm": 19.872562408447266, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8708099126815796, "num_tokens": 878062174.0, "step": 23014 }, { "epoch": 2.92774456176059, "ewc_loss": 0.03516930341720581, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.516930519253947e-05, "grad_norm": 19.790081024169922, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8696513175964355, "num_tokens": 878098197.0, "step": 23015 }, { "epoch": 2.9278717720391807, "ewc_loss": 0.03513076528906822, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513076444505714e-05, "grad_norm": 20.003568649291992, "learning_rate": 1e-06, "loss": 0.3723, "mean_token_accuracy": 0.8818932175636292, "num_tokens": 878130478.0, "step": 23016 }, { "epoch": 2.9279989823177712, "ewc_loss": 0.03529811650514603, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5298115108162165e-05, "grad_norm": 19.868654251098633, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.858491063117981, "num_tokens": 878175670.0, "step": 23017 }, { "epoch": 2.9281261925963618, "ewc_loss": 0.03509856015443802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509856105665676e-05, "grad_norm": 19.853193283081055, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8648406267166138, "num_tokens": 878216415.0, "step": 23018 }, { "epoch": 2.9282534028749523, "ewc_loss": 0.03525783121585846, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525783176883124e-05, "grad_norm": 19.927419662475586, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8749401569366455, "num_tokens": 878249582.0, "step": 23019 }, { "epoch": 2.928380613153543, "ewc_loss": 0.03516276925802231, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5162767744623125e-05, "grad_norm": 19.841413497924805, "learning_rate": 1e-06, "loss": 0.3536, "mean_token_accuracy": 0.8864067792892456, "num_tokens": 878290859.0, "step": 23020 }, { "epoch": 2.9285078234321333, "ewc_loss": 0.03516531363129616, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.516531432978809e-05, "grad_norm": 19.81483268737793, "learning_rate": 1e-06, "loss": 0.344, "mean_token_accuracy": 0.8915544748306274, "num_tokens": 878322463.0, "step": 23021 }, { "epoch": 2.928635033710724, "ewc_loss": 0.03513295203447342, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.51329508703202e-05, "grad_norm": 19.828781127929688, "learning_rate": 1e-06, "loss": 0.3562, "mean_token_accuracy": 0.8856533169746399, "num_tokens": 878365754.0, "step": 23022 }, { "epoch": 2.9287622439893144, "ewc_loss": 0.03515728935599327, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5157288948539644e-05, "grad_norm": 19.851274490356445, "learning_rate": 1e-06, "loss": 0.3517, "mean_token_accuracy": 0.8888710737228394, "num_tokens": 878410325.0, "step": 23023 }, { "epoch": 2.928889454267905, "ewc_loss": 0.03509875386953354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509875386953354e-05, "grad_norm": 19.83355712890625, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.880000650882721, "num_tokens": 878451604.0, "step": 23024 }, { "epoch": 2.9290166645464955, "ewc_loss": 0.03512430936098099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512431067065336e-05, "grad_norm": 19.793376922607422, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8789530992507935, "num_tokens": 878485174.0, "step": 23025 }, { "epoch": 2.929143874825086, "ewc_loss": 0.03520932048559189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520931932143867e-05, "grad_norm": 19.942096710205078, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.867782711982727, "num_tokens": 878525867.0, "step": 23026 }, { "epoch": 2.9292710851036765, "ewc_loss": 0.03527085483074188, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5270855732960626e-05, "grad_norm": 19.871021270751953, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8634099960327148, "num_tokens": 878561327.0, "step": 23027 }, { "epoch": 2.929398295382267, "ewc_loss": 0.03513303026556969, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513303090585396e-05, "grad_norm": 19.88278579711914, "learning_rate": 1e-06, "loss": 0.3767, "mean_token_accuracy": 0.8825032711029053, "num_tokens": 878603045.0, "step": 23028 }, { "epoch": 2.929525505660857, "ewc_loss": 0.03514505550265312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5145054425811395e-05, "grad_norm": 19.787654876708984, "learning_rate": 1e-06, "loss": 0.3677, "mean_token_accuracy": 0.8811359405517578, "num_tokens": 878645051.0, "step": 23029 }, { "epoch": 2.929652715939448, "ewc_loss": 0.0351417250931263, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5141725675202906e-05, "grad_norm": 19.98297882080078, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8581221103668213, "num_tokens": 878682003.0, "step": 23030 }, { "epoch": 2.929779926218038, "ewc_loss": 0.03522650897502899, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522650877130218e-05, "grad_norm": 19.85488510131836, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8819136619567871, "num_tokens": 878720509.0, "step": 23031 }, { "epoch": 2.929907136496629, "ewc_loss": 0.03515178710222244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.515178832458332e-05, "grad_norm": 19.953006744384766, "learning_rate": 1e-06, "loss": 0.3777, "mean_token_accuracy": 0.8816293478012085, "num_tokens": 878757950.0, "step": 23032 }, { "epoch": 2.9300343467752192, "ewc_loss": 0.035202205181121826, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5202203434892e-05, "grad_norm": 19.91598129272461, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.869195818901062, "num_tokens": 878801477.0, "step": 23033 }, { "epoch": 2.93016155705381, "ewc_loss": 0.03505728021264076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.505728091113269e-05, "grad_norm": 19.902006149291992, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.868017852306366, "num_tokens": 878838802.0, "step": 23034 }, { "epoch": 2.9302887673324003, "ewc_loss": 0.03517964109778404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5179640690330416e-05, "grad_norm": 19.87226676940918, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8721908330917358, "num_tokens": 878875566.0, "step": 23035 }, { "epoch": 2.930415977610991, "ewc_loss": 0.03515983745455742, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5159839171683416e-05, "grad_norm": 19.871389389038086, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8681364059448242, "num_tokens": 878913463.0, "step": 23036 }, { "epoch": 2.9305431878895813, "ewc_loss": 0.035096246749162674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.509624730213545e-05, "grad_norm": 19.797130584716797, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8649399280548096, "num_tokens": 878950769.0, "step": 23037 }, { "epoch": 2.930670398168172, "ewc_loss": 0.035172343254089355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.517234290484339e-05, "grad_norm": 19.87340545654297, "learning_rate": 1e-06, "loss": 0.3671, "mean_token_accuracy": 0.8856298923492432, "num_tokens": 878989041.0, "step": 23038 }, { "epoch": 2.9307976084467624, "ewc_loss": 0.03524250537157059, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.524250496411696e-05, "grad_norm": 19.876665115356445, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8828648924827576, "num_tokens": 879020195.0, "step": 23039 }, { "epoch": 2.930924818725353, "ewc_loss": 0.03511247783899307, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.511247632559389e-05, "grad_norm": 19.898273468017578, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8737925887107849, "num_tokens": 879057818.0, "step": 23040 }, { "epoch": 2.9310520290039435, "ewc_loss": 0.03520849347114563, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520849350024946e-05, "grad_norm": 19.860809326171875, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8723131418228149, "num_tokens": 879094747.0, "step": 23041 }, { "epoch": 2.931179239282534, "ewc_loss": 0.03518584370613098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.518584344419651e-05, "grad_norm": 19.922712326049805, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8664714097976685, "num_tokens": 879131127.0, "step": 23042 }, { "epoch": 2.9313064495611245, "ewc_loss": 0.03520629554986954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520629616104998e-05, "grad_norm": 19.90288734436035, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8763784766197205, "num_tokens": 879168541.0, "step": 23043 }, { "epoch": 2.931433659839715, "ewc_loss": 0.03513350337743759, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.513350384309888e-05, "grad_norm": 19.905317306518555, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8734298944473267, "num_tokens": 879199409.0, "step": 23044 }, { "epoch": 2.9315608701183056, "ewc_loss": 0.03519351780414581, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5193515941500664e-05, "grad_norm": 19.807424545288086, "learning_rate": 1e-06, "loss": 0.3477, "mean_token_accuracy": 0.8886211514472961, "num_tokens": 879236447.0, "step": 23045 }, { "epoch": 2.931688080396896, "ewc_loss": 0.035129595547914505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512959665386006e-05, "grad_norm": 19.86433219909668, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.876096248626709, "num_tokens": 879275596.0, "step": 23046 }, { "epoch": 2.9318152906754866, "ewc_loss": 0.03529082238674164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5290820960653946e-05, "grad_norm": 19.92401123046875, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8738713264465332, "num_tokens": 879315620.0, "step": 23047 }, { "epoch": 2.931942500954077, "ewc_loss": 0.03519974648952484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519974779919721e-05, "grad_norm": 19.893505096435547, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8777247667312622, "num_tokens": 879356220.0, "step": 23048 }, { "epoch": 2.9320697112326677, "ewc_loss": 0.03519698604941368, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519698657328263e-05, "grad_norm": 19.92570686340332, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8608292937278748, "num_tokens": 879397544.0, "step": 23049 }, { "epoch": 2.932196921511258, "ewc_loss": 0.035215988755226135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521598773659207e-05, "grad_norm": 19.88256072998047, "learning_rate": 1e-06, "loss": 0.3647, "mean_token_accuracy": 0.8857434988021851, "num_tokens": 879438165.0, "step": 23050 }, { "epoch": 2.9323241317898487, "ewc_loss": 0.03512374311685562, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512374314595945e-05, "grad_norm": 19.89959716796875, "learning_rate": 1e-06, "loss": 0.3809, "mean_token_accuracy": 0.8771214485168457, "num_tokens": 879473601.0, "step": 23051 }, { "epoch": 2.9324513420684393, "ewc_loss": 0.03519488498568535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519488382153213e-05, "grad_norm": 19.848474502563477, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8684006929397583, "num_tokens": 879510998.0, "step": 23052 }, { "epoch": 2.93257855234703, "ewc_loss": 0.03516160696744919, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.516160722938366e-05, "grad_norm": 19.883974075317383, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8808955550193787, "num_tokens": 879551469.0, "step": 23053 }, { "epoch": 2.93270576262562, "ewc_loss": 0.03520319238305092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520319296512753e-05, "grad_norm": 19.858219146728516, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8689203858375549, "num_tokens": 879585620.0, "step": 23054 }, { "epoch": 2.932832972904211, "ewc_loss": 0.03514058515429497, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514058334985748e-05, "grad_norm": 19.78907012939453, "learning_rate": 1e-06, "loss": 0.3572, "mean_token_accuracy": 0.8855385780334473, "num_tokens": 879621749.0, "step": 23055 }, { "epoch": 2.932960183182801, "ewc_loss": 0.0351603738963604, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.516037395456806e-05, "grad_norm": 19.860286712646484, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8795299530029297, "num_tokens": 879664529.0, "step": 23056 }, { "epoch": 2.933087393461392, "ewc_loss": 0.035233043134212494, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5233042581239715e-05, "grad_norm": 19.85980796813965, "learning_rate": 1e-06, "loss": 0.3729, "mean_token_accuracy": 0.8800818920135498, "num_tokens": 879696293.0, "step": 23057 }, { "epoch": 2.933214603739982, "ewc_loss": 0.0351899191737175, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.518991798046045e-05, "grad_norm": 19.812471389770508, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.881266176700592, "num_tokens": 879733562.0, "step": 23058 }, { "epoch": 2.933341814018573, "ewc_loss": 0.035223349928855896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522335100569762e-05, "grad_norm": 19.897584915161133, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.873888373374939, "num_tokens": 879771740.0, "step": 23059 }, { "epoch": 2.933469024297163, "ewc_loss": 0.03518488630652428, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.518488665577024e-05, "grad_norm": 19.78183364868164, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8723798990249634, "num_tokens": 879805578.0, "step": 23060 }, { "epoch": 2.9335962345757536, "ewc_loss": 0.035141635686159134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514163472573273e-05, "grad_norm": 19.962297439575195, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.873139500617981, "num_tokens": 879845175.0, "step": 23061 }, { "epoch": 2.933723444854344, "ewc_loss": 0.035262711346149445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526271029841155e-05, "grad_norm": 19.804880142211914, "learning_rate": 1e-06, "loss": 0.3467, "mean_token_accuracy": 0.8878045082092285, "num_tokens": 879878975.0, "step": 23062 }, { "epoch": 2.9338506551329346, "ewc_loss": 0.03509296476840973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5092965845251456e-05, "grad_norm": 19.83125114440918, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8680967092514038, "num_tokens": 879919373.0, "step": 23063 }, { "epoch": 2.933977865411525, "ewc_loss": 0.03522862493991852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5228626074967906e-05, "grad_norm": 19.924610137939453, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8547230362892151, "num_tokens": 879953821.0, "step": 23064 }, { "epoch": 2.9341050756901157, "ewc_loss": 0.03526201471686363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5262015444459394e-05, "grad_norm": 19.882204055786133, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.867000162601471, "num_tokens": 879993505.0, "step": 23065 }, { "epoch": 2.934232285968706, "ewc_loss": 0.0351567305624485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.515673233778216e-05, "grad_norm": 19.903491973876953, "learning_rate": 1e-06, "loss": 0.3696, "mean_token_accuracy": 0.8802969455718994, "num_tokens": 880030124.0, "step": 23066 }, { "epoch": 2.9343594962472968, "ewc_loss": 0.03518277034163475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5182769352104515e-05, "grad_norm": 19.943679809570312, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8540827035903931, "num_tokens": 880059480.0, "step": 23067 }, { "epoch": 2.9344867065258873, "ewc_loss": 0.03523654863238335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523654959280975e-05, "grad_norm": 19.87958526611328, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8771143555641174, "num_tokens": 880093402.0, "step": 23068 }, { "epoch": 2.934613916804478, "ewc_loss": 0.03519430011510849, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5194301744922996e-05, "grad_norm": 19.8751163482666, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.8737713694572449, "num_tokens": 880127874.0, "step": 23069 }, { "epoch": 2.9347411270830683, "ewc_loss": 0.035245075821876526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5245077015133575e-05, "grad_norm": 19.88922882080078, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8845433592796326, "num_tokens": 880165755.0, "step": 23070 }, { "epoch": 2.934868337361659, "ewc_loss": 0.035255830734968185, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525583088048734e-05, "grad_norm": 19.86815071105957, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8706904649734497, "num_tokens": 880203931.0, "step": 23071 }, { "epoch": 2.9349955476402494, "ewc_loss": 0.035237234085798264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5237233532825485e-05, "grad_norm": 19.93494415283203, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8781675696372986, "num_tokens": 880242844.0, "step": 23072 }, { "epoch": 2.93512275791884, "ewc_loss": 0.035237498581409454, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52374991052784e-05, "grad_norm": 19.782487869262695, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8621959686279297, "num_tokens": 880280068.0, "step": 23073 }, { "epoch": 2.9352499681974304, "ewc_loss": 0.03517972677946091, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5179728001821786e-05, "grad_norm": 19.871788024902344, "learning_rate": 1e-06, "loss": 0.3709, "mean_token_accuracy": 0.8816536664962769, "num_tokens": 880314044.0, "step": 23074 }, { "epoch": 2.935377178476021, "ewc_loss": 0.03533302620053291, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.533302515279502e-05, "grad_norm": 19.872703552246094, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8714748024940491, "num_tokens": 880355077.0, "step": 23075 }, { "epoch": 2.9355043887546115, "ewc_loss": 0.035257868468761444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525786814861931e-05, "grad_norm": 19.86979103088379, "learning_rate": 1e-06, "loss": 0.341, "mean_token_accuracy": 0.8902300596237183, "num_tokens": 880388906.0, "step": 23076 }, { "epoch": 2.935631599033202, "ewc_loss": 0.035293351858854294, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529335299390368e-05, "grad_norm": 19.88849449157715, "learning_rate": 1e-06, "loss": 0.3631, "mean_token_accuracy": 0.8862758874893188, "num_tokens": 880426834.0, "step": 23077 }, { "epoch": 2.9357588093117926, "ewc_loss": 0.03533244878053665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5332450352143496e-05, "grad_norm": 19.876657485961914, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.873998761177063, "num_tokens": 880463477.0, "step": 23078 }, { "epoch": 2.9358860195903826, "ewc_loss": 0.03531787544488907, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.531787660904229e-05, "grad_norm": 19.842039108276367, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.859129011631012, "num_tokens": 880500884.0, "step": 23079 }, { "epoch": 2.9360132298689736, "ewc_loss": 0.0353534072637558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535340874805115e-05, "grad_norm": 19.951169967651367, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8693183660507202, "num_tokens": 880539724.0, "step": 23080 }, { "epoch": 2.9361404401475637, "ewc_loss": 0.03528658673167229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528658635332249e-05, "grad_norm": 19.763755798339844, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8686819076538086, "num_tokens": 880577820.0, "step": 23081 }, { "epoch": 2.9362676504261547, "ewc_loss": 0.03526433929800987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5264340112917125e-05, "grad_norm": 19.844078063964844, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.8788893222808838, "num_tokens": 880612055.0, "step": 23082 }, { "epoch": 2.9363948607047448, "ewc_loss": 0.03545382618904114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5453827877063304e-05, "grad_norm": 19.956798553466797, "learning_rate": 1e-06, "loss": 0.3653, "mean_token_accuracy": 0.8828510046005249, "num_tokens": 880654907.0, "step": 23083 }, { "epoch": 2.9365220709833357, "ewc_loss": 0.03529941290616989, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5299413866596296e-05, "grad_norm": 19.837862014770508, "learning_rate": 1e-06, "loss": 0.3349, "mean_token_accuracy": 0.891109824180603, "num_tokens": 880692418.0, "step": 23084 }, { "epoch": 2.936649281261926, "ewc_loss": 0.035309676080942154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.53096766048111e-05, "grad_norm": 19.893024444580078, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8622715473175049, "num_tokens": 880727742.0, "step": 23085 }, { "epoch": 2.9367764915405163, "ewc_loss": 0.03529936820268631, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5299366572871804e-05, "grad_norm": 19.89943504333496, "learning_rate": 1e-06, "loss": 0.3574, "mean_token_accuracy": 0.8866614699363708, "num_tokens": 880766673.0, "step": 23086 }, { "epoch": 2.936903701819107, "ewc_loss": 0.03525852411985397, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5258522984804586e-05, "grad_norm": 19.857084274291992, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.869036853313446, "num_tokens": 880810390.0, "step": 23087 }, { "epoch": 2.9370309120976974, "ewc_loss": 0.03521675243973732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521675171214156e-05, "grad_norm": 19.888277053833008, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8701354265213013, "num_tokens": 880846920.0, "step": 23088 }, { "epoch": 2.937158122376288, "ewc_loss": 0.03525162115693092, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525162173900753e-05, "grad_norm": 19.893253326416016, "learning_rate": 1e-06, "loss": 0.363, "mean_token_accuracy": 0.8823636770248413, "num_tokens": 880885867.0, "step": 23089 }, { "epoch": 2.9372853326548785, "ewc_loss": 0.03520199656486511, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5201996070099995e-05, "grad_norm": 19.82042694091797, "learning_rate": 1e-06, "loss": 0.3651, "mean_token_accuracy": 0.8865764141082764, "num_tokens": 880928408.0, "step": 23090 }, { "epoch": 2.937412542933469, "ewc_loss": 0.03525178134441376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5251781810075045e-05, "grad_norm": 19.888723373413086, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8748816251754761, "num_tokens": 880960681.0, "step": 23091 }, { "epoch": 2.9375397532120595, "ewc_loss": 0.035230208188295364, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523020859574899e-05, "grad_norm": 19.931306838989258, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.864057719707489, "num_tokens": 881000358.0, "step": 23092 }, { "epoch": 2.93766696349065, "ewc_loss": 0.035157281905412674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.515728167258203e-05, "grad_norm": 19.822288513183594, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8777044415473938, "num_tokens": 881037765.0, "step": 23093 }, { "epoch": 2.9377941737692406, "ewc_loss": 0.03517821058630943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.517821096465923e-05, "grad_norm": 19.902721405029297, "learning_rate": 1e-06, "loss": 0.3506, "mean_token_accuracy": 0.8890202641487122, "num_tokens": 881076091.0, "step": 23094 }, { "epoch": 2.937921384047831, "ewc_loss": 0.035213835537433624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521383405313827e-05, "grad_norm": 19.86600112915039, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8746042847633362, "num_tokens": 881112784.0, "step": 23095 }, { "epoch": 2.9380485943264216, "ewc_loss": 0.035143718123435974, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.514371928758919e-05, "grad_norm": 19.819852828979492, "learning_rate": 1e-06, "loss": 0.374, "mean_token_accuracy": 0.8815535306930542, "num_tokens": 881149938.0, "step": 23096 }, { "epoch": 2.938175804605012, "ewc_loss": 0.03522555157542229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522555198287591e-05, "grad_norm": 19.90650749206543, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8661340475082397, "num_tokens": 881187541.0, "step": 23097 }, { "epoch": 2.9383030148836027, "ewc_loss": 0.03520732745528221, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5207325709052384e-05, "grad_norm": 19.872873306274414, "learning_rate": 1e-06, "loss": 0.3815, "mean_token_accuracy": 0.8777135610580444, "num_tokens": 881221920.0, "step": 23098 }, { "epoch": 2.938430225162193, "ewc_loss": 0.035158220678567886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.515822027111426e-05, "grad_norm": 19.85138702392578, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8606903553009033, "num_tokens": 881264804.0, "step": 23099 }, { "epoch": 2.9385574354407837, "ewc_loss": 0.03526672348380089, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526672298903577e-05, "grad_norm": 19.86668586730957, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8671890497207642, "num_tokens": 881304482.0, "step": 23100 }, { "epoch": 2.9386846457193743, "ewc_loss": 0.03519170358777046, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519170422805473e-05, "grad_norm": 19.9091854095459, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8652369976043701, "num_tokens": 881340822.0, "step": 23101 }, { "epoch": 2.938811855997965, "ewc_loss": 0.03521519526839256, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521519465721212e-05, "grad_norm": 19.79641342163086, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8743667602539062, "num_tokens": 881374352.0, "step": 23102 }, { "epoch": 2.9389390662765553, "ewc_loss": 0.035272449254989624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.527244916767813e-05, "grad_norm": 19.898344039916992, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8703823685646057, "num_tokens": 881413176.0, "step": 23103 }, { "epoch": 2.9390662765551454, "ewc_loss": 0.035283640027046204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528363959048875e-05, "grad_norm": 19.784854888916016, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8705998063087463, "num_tokens": 881454309.0, "step": 23104 }, { "epoch": 2.9391934868337364, "ewc_loss": 0.03522922098636627, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522922270349227e-05, "grad_norm": 19.907215118408203, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.876581072807312, "num_tokens": 881490702.0, "step": 23105 }, { "epoch": 2.9393206971123265, "ewc_loss": 0.03533386439085007, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5333865525899455e-05, "grad_norm": 19.849687576293945, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8817489743232727, "num_tokens": 881530394.0, "step": 23106 }, { "epoch": 2.9394479073909174, "ewc_loss": 0.03522555157542229, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522555198287591e-05, "grad_norm": 19.855667114257812, "learning_rate": 1e-06, "loss": 0.346, "mean_token_accuracy": 0.8888121843338013, "num_tokens": 881567286.0, "step": 23107 }, { "epoch": 2.9395751176695075, "ewc_loss": 0.03530821576714516, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530821413733065e-05, "grad_norm": 19.86701774597168, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8704732060432434, "num_tokens": 881607017.0, "step": 23108 }, { "epoch": 2.939702327948098, "ewc_loss": 0.035289596766233444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528959859977476e-05, "grad_norm": 19.910188674926758, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8767638206481934, "num_tokens": 881645793.0, "step": 23109 }, { "epoch": 2.9398295382266886, "ewc_loss": 0.035240575671195984, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52405768353492e-05, "grad_norm": 19.80080795288086, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8780838847160339, "num_tokens": 881683596.0, "step": 23110 }, { "epoch": 2.939956748505279, "ewc_loss": 0.035234689712524414, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523469058563933e-05, "grad_norm": 19.89583396911621, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8605078458786011, "num_tokens": 881717607.0, "step": 23111 }, { "epoch": 2.9400839587838696, "ewc_loss": 0.03530994802713394, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5309949453221634e-05, "grad_norm": 19.8422908782959, "learning_rate": 1e-06, "loss": 0.3491, "mean_token_accuracy": 0.8872352838516235, "num_tokens": 881750884.0, "step": 23112 }, { "epoch": 2.94021116906246, "ewc_loss": 0.035234905779361725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5234905226388946e-05, "grad_norm": 19.86446189880371, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8703078031539917, "num_tokens": 881789345.0, "step": 23113 }, { "epoch": 2.9403383793410507, "ewc_loss": 0.03530587628483772, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530587491695769e-05, "grad_norm": 19.85629653930664, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8718997240066528, "num_tokens": 881828459.0, "step": 23114 }, { "epoch": 2.940465589619641, "ewc_loss": 0.03526867553591728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526867658365518e-05, "grad_norm": 19.886173248291016, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8697909116744995, "num_tokens": 881865294.0, "step": 23115 }, { "epoch": 2.9405927998982317, "ewc_loss": 0.03527422621846199, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5274224501335993e-05, "grad_norm": 19.906042098999023, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8773360848426819, "num_tokens": 881904936.0, "step": 23116 }, { "epoch": 2.9407200101768223, "ewc_loss": 0.03528160974383354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528160959831439e-05, "grad_norm": 19.850006103515625, "learning_rate": 1e-06, "loss": 0.3554, "mean_token_accuracy": 0.886436939239502, "num_tokens": 881936300.0, "step": 23117 }, { "epoch": 2.940847220455413, "ewc_loss": 0.035286761820316315, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528676097630523e-05, "grad_norm": 19.90609359741211, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.886542797088623, "num_tokens": 881974092.0, "step": 23118 }, { "epoch": 2.9409744307340033, "ewc_loss": 0.03530338034033775, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5303379263496026e-05, "grad_norm": 19.904111862182617, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8767189383506775, "num_tokens": 882019339.0, "step": 23119 }, { "epoch": 2.941101641012594, "ewc_loss": 0.035232964903116226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5232966183684766e-05, "grad_norm": 19.887678146362305, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8799009323120117, "num_tokens": 882052837.0, "step": 23120 }, { "epoch": 2.9412288512911844, "ewc_loss": 0.03528266400098801, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5282664612168446e-05, "grad_norm": 19.916105270385742, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8798337578773499, "num_tokens": 882088449.0, "step": 23121 }, { "epoch": 2.941356061569775, "ewc_loss": 0.035272881388664246, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5272882087156177e-05, "grad_norm": 19.91729164123535, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8661565184593201, "num_tokens": 882124844.0, "step": 23122 }, { "epoch": 2.9414832718483654, "ewc_loss": 0.03526533767580986, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526533691911027e-05, "grad_norm": 19.89641571044922, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8777792453765869, "num_tokens": 882162246.0, "step": 23123 }, { "epoch": 2.941610482126956, "ewc_loss": 0.03522440046072006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5224398743594065e-05, "grad_norm": 19.857133865356445, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8632827997207642, "num_tokens": 882202825.0, "step": 23124 }, { "epoch": 2.9417376924055465, "ewc_loss": 0.035268742591142654, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5268742067273706e-05, "grad_norm": 19.91505241394043, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8612416386604309, "num_tokens": 882237951.0, "step": 23125 }, { "epoch": 2.941864902684137, "ewc_loss": 0.0352313369512558, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523133636917919e-05, "grad_norm": 19.812334060668945, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.874416172504425, "num_tokens": 882279147.0, "step": 23126 }, { "epoch": 2.941992112962727, "ewc_loss": 0.035238929092884064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523892883094959e-05, "grad_norm": 19.93938446044922, "learning_rate": 1e-06, "loss": 0.3286, "mean_token_accuracy": 0.8950802087783813, "num_tokens": 882322777.0, "step": 23127 }, { "epoch": 2.942119323241318, "ewc_loss": 0.03530024364590645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530024332576431e-05, "grad_norm": 19.86801528930664, "learning_rate": 1e-06, "loss": 0.3735, "mean_token_accuracy": 0.8800363540649414, "num_tokens": 882360681.0, "step": 23128 }, { "epoch": 2.942246533519908, "ewc_loss": 0.03521121293306351, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5211214708397165e-05, "grad_norm": 19.840421676635742, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8552976846694946, "num_tokens": 882403823.0, "step": 23129 }, { "epoch": 2.942373743798499, "ewc_loss": 0.03530856966972351, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5308570659253746e-05, "grad_norm": 19.83698272705078, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8831003904342651, "num_tokens": 882443723.0, "step": 23130 }, { "epoch": 2.9425009540770892, "ewc_loss": 0.03523122891783714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5231227229814976e-05, "grad_norm": 19.787328720092773, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8612944483757019, "num_tokens": 882480610.0, "step": 23131 }, { "epoch": 2.94262816435568, "ewc_loss": 0.035299692302942276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529969399096444e-05, "grad_norm": 19.907794952392578, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8778437376022339, "num_tokens": 882518789.0, "step": 23132 }, { "epoch": 2.9427553746342703, "ewc_loss": 0.035305626690387726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530562753439881e-05, "grad_norm": 19.90509605407715, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.863028883934021, "num_tokens": 882556598.0, "step": 23133 }, { "epoch": 2.942882584912861, "ewc_loss": 0.035229481756687164, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522948099998757e-05, "grad_norm": 19.86393165588379, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.87080979347229, "num_tokens": 882593788.0, "step": 23134 }, { "epoch": 2.9430097951914513, "ewc_loss": 0.03528660535812378, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5286604543216527e-05, "grad_norm": 19.900362014770508, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8822730183601379, "num_tokens": 882630861.0, "step": 23135 }, { "epoch": 2.943137005470042, "ewc_loss": 0.035300225019454956, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530022513587028e-05, "grad_norm": 19.866931915283203, "learning_rate": 1e-06, "loss": 0.3758, "mean_token_accuracy": 0.883703351020813, "num_tokens": 882670950.0, "step": 23136 }, { "epoch": 2.9432642157486324, "ewc_loss": 0.03521430119872093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521429971442558e-05, "grad_norm": 19.865158081054688, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8662022948265076, "num_tokens": 882705680.0, "step": 23137 }, { "epoch": 2.943391426027223, "ewc_loss": 0.03528803586959839, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5288034268887714e-05, "grad_norm": 19.819852828979492, "learning_rate": 1e-06, "loss": 0.3636, "mean_token_accuracy": 0.8846337795257568, "num_tokens": 882737589.0, "step": 23138 }, { "epoch": 2.9435186363058135, "ewc_loss": 0.03526897355914116, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526897489791736e-05, "grad_norm": 19.984228134155273, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8609862327575684, "num_tokens": 882774502.0, "step": 23139 }, { "epoch": 2.943645846584404, "ewc_loss": 0.03529001399874687, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529001332935877e-05, "grad_norm": 19.802330017089844, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8656757473945618, "num_tokens": 882815184.0, "step": 23140 }, { "epoch": 2.9437730568629945, "ewc_loss": 0.035239286720752716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5239285352872685e-05, "grad_norm": 19.907527923583984, "learning_rate": 1e-06, "loss": 0.3526, "mean_token_accuracy": 0.8882704973220825, "num_tokens": 882856793.0, "step": 23141 }, { "epoch": 2.943900267141585, "ewc_loss": 0.03525787964463234, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525787906255573e-05, "grad_norm": 19.87044906616211, "learning_rate": 1e-06, "loss": 0.36, "mean_token_accuracy": 0.8859758377075195, "num_tokens": 882896128.0, "step": 23142 }, { "epoch": 2.9440274774201756, "ewc_loss": 0.03523377329111099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523377381497994e-05, "grad_norm": 19.897451400756836, "learning_rate": 1e-06, "loss": 0.3756, "mean_token_accuracy": 0.883166491985321, "num_tokens": 882931253.0, "step": 23143 }, { "epoch": 2.944154687698766, "ewc_loss": 0.035262640565633774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5262641176814213e-05, "grad_norm": 19.787586212158203, "learning_rate": 1e-06, "loss": 0.3412, "mean_token_accuracy": 0.8912557363510132, "num_tokens": 882975072.0, "step": 23144 }, { "epoch": 2.9442818979773566, "ewc_loss": 0.035209231078624725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52092320099473e-05, "grad_norm": 19.774799346923828, "learning_rate": 1e-06, "loss": 0.3547, "mean_token_accuracy": 0.8877444863319397, "num_tokens": 883013576.0, "step": 23145 }, { "epoch": 2.944409108255947, "ewc_loss": 0.035270556807518005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5270557418698445e-05, "grad_norm": 19.851852416992188, "learning_rate": 1e-06, "loss": 0.3766, "mean_token_accuracy": 0.8787046670913696, "num_tokens": 883046052.0, "step": 23146 }, { "epoch": 2.9445363185345377, "ewc_loss": 0.035270966589450836, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.527096487232484e-05, "grad_norm": 19.88292694091797, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8566708564758301, "num_tokens": 883080398.0, "step": 23147 }, { "epoch": 2.944663528813128, "ewc_loss": 0.03527676314115524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5276763810543343e-05, "grad_norm": 19.877655029296875, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.866783082485199, "num_tokens": 883120321.0, "step": 23148 }, { "epoch": 2.9447907390917187, "ewc_loss": 0.0352594368159771, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5259436117485166e-05, "grad_norm": 19.809782028198242, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8821035623550415, "num_tokens": 883161285.0, "step": 23149 }, { "epoch": 2.9449179493703093, "ewc_loss": 0.035275209695100784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5275210393592715e-05, "grad_norm": 19.894418716430664, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8739895224571228, "num_tokens": 883199857.0, "step": 23150 }, { "epoch": 2.9450451596489, "ewc_loss": 0.03533529117703438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5335291613591835e-05, "grad_norm": 19.86165428161621, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8821132183074951, "num_tokens": 883238325.0, "step": 23151 }, { "epoch": 2.94517236992749, "ewc_loss": 0.0352218933403492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522189217619598e-05, "grad_norm": 19.869213104248047, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8672839999198914, "num_tokens": 883281814.0, "step": 23152 }, { "epoch": 2.945299580206081, "ewc_loss": 0.035372018814086914, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5372017009649426e-05, "grad_norm": 19.81656837463379, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.879610538482666, "num_tokens": 883317385.0, "step": 23153 }, { "epoch": 2.945426790484671, "ewc_loss": 0.035269249230623245, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526924774632789e-05, "grad_norm": 19.818370819091797, "learning_rate": 1e-06, "loss": 0.3631, "mean_token_accuracy": 0.8829411268234253, "num_tokens": 883353498.0, "step": 23154 }, { "epoch": 2.945554000763262, "ewc_loss": 0.035290222615003586, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529022433212958e-05, "grad_norm": 19.866344451904297, "learning_rate": 1e-06, "loss": 0.3718, "mean_token_accuracy": 0.8839292526245117, "num_tokens": 883391709.0, "step": 23155 }, { "epoch": 2.945681211041852, "ewc_loss": 0.03537377342581749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.537377415341325e-05, "grad_norm": 19.963607788085938, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8599659204483032, "num_tokens": 883426329.0, "step": 23156 }, { "epoch": 2.945808421320443, "ewc_loss": 0.035310499370098114, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5310498788021505e-05, "grad_norm": 19.821083068847656, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8730860948562622, "num_tokens": 883465797.0, "step": 23157 }, { "epoch": 2.945935631599033, "ewc_loss": 0.03526371344923973, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5263714380562305e-05, "grad_norm": 19.85997200012207, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8788936734199524, "num_tokens": 883501632.0, "step": 23158 }, { "epoch": 2.9460628418776236, "ewc_loss": 0.03537343814969063, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5373439459363e-05, "grad_norm": 19.966840744018555, "learning_rate": 1e-06, "loss": 0.3588, "mean_token_accuracy": 0.8859130144119263, "num_tokens": 883537622.0, "step": 23159 }, { "epoch": 2.946190052156214, "ewc_loss": 0.035307660698890686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530766116455197e-05, "grad_norm": 19.847110748291016, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.876896858215332, "num_tokens": 883576458.0, "step": 23160 }, { "epoch": 2.9463172624348046, "ewc_loss": 0.035178180783987045, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.517818186082877e-05, "grad_norm": 19.825572967529297, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8670860528945923, "num_tokens": 883620929.0, "step": 23161 }, { "epoch": 2.946444472713395, "ewc_loss": 0.0352713018655777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52713032043539e-05, "grad_norm": 19.872663497924805, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.852912425994873, "num_tokens": 883652589.0, "step": 23162 }, { "epoch": 2.9465716829919857, "ewc_loss": 0.03528618812561035, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528618981363252e-05, "grad_norm": 19.892578125, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8750690221786499, "num_tokens": 883691844.0, "step": 23163 }, { "epoch": 2.946698893270576, "ewc_loss": 0.03525280952453613, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5252807720098644e-05, "grad_norm": 19.854907989501953, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8560132384300232, "num_tokens": 883732838.0, "step": 23164 }, { "epoch": 2.9468261035491667, "ewc_loss": 0.03533244505524635, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.533244671416469e-05, "grad_norm": 19.896976470947266, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8735994696617126, "num_tokens": 883771415.0, "step": 23165 }, { "epoch": 2.9469533138277573, "ewc_loss": 0.03522660583257675, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522660699672997e-05, "grad_norm": 19.850528717041016, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8605151772499084, "num_tokens": 883808807.0, "step": 23166 }, { "epoch": 2.947080524106348, "ewc_loss": 0.035342805087566376, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534280403982848e-05, "grad_norm": 19.904699325561523, "learning_rate": 1e-06, "loss": 0.342, "mean_token_accuracy": 0.889473021030426, "num_tokens": 883844300.0, "step": 23167 }, { "epoch": 2.9472077343849383, "ewc_loss": 0.035349197685718536, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534919596859254e-05, "grad_norm": 19.86556625366211, "learning_rate": 1e-06, "loss": 0.3477, "mean_token_accuracy": 0.8898952007293701, "num_tokens": 883879337.0, "step": 23168 }, { "epoch": 2.947334944663529, "ewc_loss": 0.03523608669638634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523608756950125e-05, "grad_norm": 19.845441818237305, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8651594519615173, "num_tokens": 883925810.0, "step": 23169 }, { "epoch": 2.9474621549421194, "ewc_loss": 0.03541967645287514, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5419678170001134e-05, "grad_norm": 19.900022506713867, "learning_rate": 1e-06, "loss": 0.371, "mean_token_accuracy": 0.8850521445274353, "num_tokens": 883964753.0, "step": 23170 }, { "epoch": 2.94758936522071, "ewc_loss": 0.03530409559607506, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5304095945321023e-05, "grad_norm": 19.898500442504883, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8687975406646729, "num_tokens": 884005737.0, "step": 23171 }, { "epoch": 2.9477165754993004, "ewc_loss": 0.03537994250655174, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.537994416547008e-05, "grad_norm": 19.895217895507812, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8764645457267761, "num_tokens": 884043414.0, "step": 23172 }, { "epoch": 2.947843785777891, "ewc_loss": 0.03518501669168472, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.51850176230073e-05, "grad_norm": 19.87112808227539, "learning_rate": 1e-06, "loss": 0.3754, "mean_token_accuracy": 0.8799974918365479, "num_tokens": 884077399.0, "step": 23173 }, { "epoch": 2.9479709960564815, "ewc_loss": 0.03541459143161774, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.541459227562882e-05, "grad_norm": 19.94926643371582, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8731777667999268, "num_tokens": 884116539.0, "step": 23174 }, { "epoch": 2.948098206335072, "ewc_loss": 0.03526467829942703, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5264678444946185e-05, "grad_norm": 19.818449020385742, "learning_rate": 1e-06, "loss": 0.3702, "mean_token_accuracy": 0.8805524110794067, "num_tokens": 884149756.0, "step": 23175 }, { "epoch": 2.9482254166136626, "ewc_loss": 0.03522691875696182, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522691986290738e-05, "grad_norm": 19.931550979614258, "learning_rate": 1e-06, "loss": 0.34, "mean_token_accuracy": 0.8954219818115234, "num_tokens": 884181960.0, "step": 23176 }, { "epoch": 2.9483526268922526, "ewc_loss": 0.035331547260284424, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.533154813339934e-05, "grad_norm": 19.800413131713867, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8715906143188477, "num_tokens": 884223562.0, "step": 23177 }, { "epoch": 2.9484798371708436, "ewc_loss": 0.03520972281694412, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520972313708626e-05, "grad_norm": 19.88508415222168, "learning_rate": 1e-06, "loss": 0.3668, "mean_token_accuracy": 0.8819037675857544, "num_tokens": 884264541.0, "step": 23178 }, { "epoch": 2.9486070474494337, "ewc_loss": 0.035374272614717484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.537427255650982e-05, "grad_norm": 19.885364532470703, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8620129823684692, "num_tokens": 884301171.0, "step": 23179 }, { "epoch": 2.9487342577280247, "ewc_loss": 0.035284146666526794, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528414526954293e-05, "grad_norm": 19.885976791381836, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8742957711219788, "num_tokens": 884332481.0, "step": 23180 }, { "epoch": 2.9488614680066147, "ewc_loss": 0.035230670124292374, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523067061905749e-05, "grad_norm": 19.793859481811523, "learning_rate": 1e-06, "loss": 0.3292, "mean_token_accuracy": 0.8958026766777039, "num_tokens": 884371616.0, "step": 23181 }, { "epoch": 2.9489886782852053, "ewc_loss": 0.0353141687810421, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.531416950863786e-05, "grad_norm": 19.897397994995117, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8653057217597961, "num_tokens": 884411245.0, "step": 23182 }, { "epoch": 2.949115888563796, "ewc_loss": 0.03530833497643471, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5308334190631285e-05, "grad_norm": 19.965200424194336, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8736515641212463, "num_tokens": 884445712.0, "step": 23183 }, { "epoch": 2.9492430988423863, "ewc_loss": 0.03528475761413574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5284756449982524e-05, "grad_norm": 19.854389190673828, "learning_rate": 1e-06, "loss": 0.3767, "mean_token_accuracy": 0.879065215587616, "num_tokens": 884484935.0, "step": 23184 }, { "epoch": 2.949370309120977, "ewc_loss": 0.035211436450481415, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52114366251044e-05, "grad_norm": 19.876922607421875, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.875643253326416, "num_tokens": 884518714.0, "step": 23185 }, { "epoch": 2.9494975193995674, "ewc_loss": 0.03534455597400665, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.53445575456135e-05, "grad_norm": 19.8953914642334, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.867690920829773, "num_tokens": 884560430.0, "step": 23186 }, { "epoch": 2.949624729678158, "ewc_loss": 0.03527712821960449, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.527712760842405e-05, "grad_norm": 19.865100860595703, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8677964210510254, "num_tokens": 884595937.0, "step": 23187 }, { "epoch": 2.9497519399567484, "ewc_loss": 0.035288844257593155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5288845538161695e-05, "grad_norm": 19.878196716308594, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8703324198722839, "num_tokens": 884633201.0, "step": 23188 }, { "epoch": 2.949879150235339, "ewc_loss": 0.03529191389679909, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529191235429607e-05, "grad_norm": 19.898042678833008, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8771171569824219, "num_tokens": 884672602.0, "step": 23189 }, { "epoch": 2.9500063605139295, "ewc_loss": 0.035317059606313705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.531705806381069e-05, "grad_norm": 19.918292999267578, "learning_rate": 1e-06, "loss": 0.378, "mean_token_accuracy": 0.8792510032653809, "num_tokens": 884708378.0, "step": 23190 }, { "epoch": 2.95013357079252, "ewc_loss": 0.03534947335720062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534947245498188e-05, "grad_norm": 19.90983772277832, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.874136209487915, "num_tokens": 884747527.0, "step": 23191 }, { "epoch": 2.9502607810711106, "ewc_loss": 0.03529632091522217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529632158461027e-05, "grad_norm": 19.88297462463379, "learning_rate": 1e-06, "loss": 0.3331, "mean_token_accuracy": 0.8925060629844666, "num_tokens": 884781738.0, "step": 23192 }, { "epoch": 2.950387991349701, "ewc_loss": 0.03527817875146866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52781789842993e-05, "grad_norm": 19.886045455932617, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8640693426132202, "num_tokens": 884818884.0, "step": 23193 }, { "epoch": 2.9505152016282916, "ewc_loss": 0.03529788553714752, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529788591549732e-05, "grad_norm": 19.882349014282227, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.8854411840438843, "num_tokens": 884851751.0, "step": 23194 }, { "epoch": 2.950642411906882, "ewc_loss": 0.03532068803906441, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532068876666017e-05, "grad_norm": 19.899269104003906, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.879417896270752, "num_tokens": 884892935.0, "step": 23195 }, { "epoch": 2.9507696221854727, "ewc_loss": 0.03533432260155678, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.533432391122915e-05, "grad_norm": 19.843852996826172, "learning_rate": 1e-06, "loss": 0.3781, "mean_token_accuracy": 0.8796421885490417, "num_tokens": 884935820.0, "step": 23196 }, { "epoch": 2.950896832464063, "ewc_loss": 0.03526873141527176, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5268731153337285e-05, "grad_norm": 19.8769474029541, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8733667731285095, "num_tokens": 884976042.0, "step": 23197 }, { "epoch": 2.9510240427426537, "ewc_loss": 0.03528803586959839, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5288034268887714e-05, "grad_norm": 19.90656852722168, "learning_rate": 1e-06, "loss": 0.3292, "mean_token_accuracy": 0.8947612047195435, "num_tokens": 885013511.0, "step": 23198 }, { "epoch": 2.9511512530212443, "ewc_loss": 0.03526643291115761, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52664319507312e-05, "grad_norm": 19.86744499206543, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8635088801383972, "num_tokens": 885047291.0, "step": 23199 }, { "epoch": 2.951278463299835, "ewc_loss": 0.03531097248196602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.531097172526643e-05, "grad_norm": 19.931562423706055, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8721350431442261, "num_tokens": 885083116.0, "step": 23200 }, { "epoch": 2.9514056735784253, "ewc_loss": 0.035247672349214554, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.524767089402303e-05, "grad_norm": 19.911476135253906, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8717751502990723, "num_tokens": 885123282.0, "step": 23201 }, { "epoch": 2.9515328838570154, "ewc_loss": 0.03536119684576988, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5361197660677135e-05, "grad_norm": 19.881593704223633, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.879920482635498, "num_tokens": 885166277.0, "step": 23202 }, { "epoch": 2.9516600941356064, "ewc_loss": 0.03518662229180336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5186621971661225e-05, "grad_norm": 19.84939193725586, "learning_rate": 1e-06, "loss": 0.3483, "mean_token_accuracy": 0.8865314722061157, "num_tokens": 885206414.0, "step": 23203 }, { "epoch": 2.9517873044141965, "ewc_loss": 0.03530588373541832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5305882192915305e-05, "grad_norm": 19.943889617919922, "learning_rate": 1e-06, "loss": 0.3751, "mean_token_accuracy": 0.8838887810707092, "num_tokens": 885240977.0, "step": 23204 }, { "epoch": 2.9519145146927874, "ewc_loss": 0.03526736795902252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5267366911284626e-05, "grad_norm": 19.922731399536133, "learning_rate": 1e-06, "loss": 0.3408, "mean_token_accuracy": 0.8907127976417542, "num_tokens": 885283190.0, "step": 23205 }, { "epoch": 2.9520417249713775, "ewc_loss": 0.035250768065452576, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5250766813987866e-05, "grad_norm": 19.79637908935547, "learning_rate": 1e-06, "loss": 0.3526, "mean_token_accuracy": 0.8904811143875122, "num_tokens": 885321340.0, "step": 23206 }, { "epoch": 2.952168935249968, "ewc_loss": 0.03523072600364685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52307251887396e-05, "grad_norm": 19.933507919311523, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.8839892148971558, "num_tokens": 885357524.0, "step": 23207 }, { "epoch": 2.9522961455285586, "ewc_loss": 0.035284023731946945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52840252162423e-05, "grad_norm": 19.88359832763672, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8528963327407837, "num_tokens": 885389696.0, "step": 23208 }, { "epoch": 2.952423355807149, "ewc_loss": 0.035230956971645355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523095801938325e-05, "grad_norm": 19.823251724243164, "learning_rate": 1e-06, "loss": 0.3428, "mean_token_accuracy": 0.8907746076583862, "num_tokens": 885429574.0, "step": 23209 }, { "epoch": 2.9525505660857396, "ewc_loss": 0.035314787179231644, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5314787965035066e-05, "grad_norm": 19.935686111450195, "learning_rate": 1e-06, "loss": 0.376, "mean_token_accuracy": 0.8807982206344604, "num_tokens": 885459961.0, "step": 23210 }, { "epoch": 2.95267777636433, "ewc_loss": 0.03529420495033264, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529420428094454e-05, "grad_norm": 19.88294219970703, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8693631887435913, "num_tokens": 885506725.0, "step": 23211 }, { "epoch": 2.9528049866429207, "ewc_loss": 0.03525446355342865, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525446300045587e-05, "grad_norm": 19.891063690185547, "learning_rate": 1e-06, "loss": 0.3802, "mean_token_accuracy": 0.8804178237915039, "num_tokens": 885549409.0, "step": 23212 }, { "epoch": 2.952932196921511, "ewc_loss": 0.03527871146798134, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.527871012920514e-05, "grad_norm": 19.91995620727539, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8807509541511536, "num_tokens": 885589272.0, "step": 23213 }, { "epoch": 2.9530594072001017, "ewc_loss": 0.035316966474056244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5316967114340514e-05, "grad_norm": 19.92661476135254, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8589004874229431, "num_tokens": 885625951.0, "step": 23214 }, { "epoch": 2.9531866174786923, "ewc_loss": 0.03519374877214432, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519374877214432e-05, "grad_norm": 19.86943244934082, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8782883286476135, "num_tokens": 885665844.0, "step": 23215 }, { "epoch": 2.953313827757283, "ewc_loss": 0.03525102138519287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525102147250436e-05, "grad_norm": 19.895030975341797, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8755391240119934, "num_tokens": 885705616.0, "step": 23216 }, { "epoch": 2.9534410380358733, "ewc_loss": 0.03525281324982643, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525281499605626e-05, "grad_norm": 19.91532325744629, "learning_rate": 1e-06, "loss": 0.3646, "mean_token_accuracy": 0.8873523473739624, "num_tokens": 885752362.0, "step": 23217 }, { "epoch": 2.953568248314464, "ewc_loss": 0.03522541746497154, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522541737766005e-05, "grad_norm": 19.87485122680664, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8726871609687805, "num_tokens": 885792485.0, "step": 23218 }, { "epoch": 2.9536954585930544, "ewc_loss": 0.03520626947283745, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520627069519833e-05, "grad_norm": 19.85849952697754, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8774756193161011, "num_tokens": 885834809.0, "step": 23219 }, { "epoch": 2.953822668871645, "ewc_loss": 0.03524360433220863, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52436036337167e-05, "grad_norm": 19.964235305786133, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8810136318206787, "num_tokens": 885870555.0, "step": 23220 }, { "epoch": 2.9539498791502354, "ewc_loss": 0.03527602180838585, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52760216628667e-05, "grad_norm": 19.85030746459961, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8783767223358154, "num_tokens": 885911911.0, "step": 23221 }, { "epoch": 2.954077089428826, "ewc_loss": 0.03521040827035904, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521040707710199e-05, "grad_norm": 19.94121742248535, "learning_rate": 1e-06, "loss": 0.3489, "mean_token_accuracy": 0.8873172402381897, "num_tokens": 885941094.0, "step": 23222 }, { "epoch": 2.9542042997074165, "ewc_loss": 0.035298701375722885, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5298700822750106e-05, "grad_norm": 19.955448150634766, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8748523592948914, "num_tokens": 885978094.0, "step": 23223 }, { "epoch": 2.954331509986007, "ewc_loss": 0.0351935550570488, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519355595926754e-05, "grad_norm": 19.923221588134766, "learning_rate": 1e-06, "loss": 0.3434, "mean_token_accuracy": 0.8925161957740784, "num_tokens": 886014859.0, "step": 23224 }, { "epoch": 2.954458720264597, "ewc_loss": 0.03519091382622719, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519091478665359e-05, "grad_norm": 19.873640060424805, "learning_rate": 1e-06, "loss": 0.3732, "mean_token_accuracy": 0.8815242052078247, "num_tokens": 886048317.0, "step": 23225 }, { "epoch": 2.954585930543188, "ewc_loss": 0.035204630345106125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520462996675633e-05, "grad_norm": 19.979938507080078, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8769840002059937, "num_tokens": 886081797.0, "step": 23226 }, { "epoch": 2.954713140821778, "ewc_loss": 0.035227321088314056, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522732004057616e-05, "grad_norm": 19.83220100402832, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8628008365631104, "num_tokens": 886124111.0, "step": 23227 }, { "epoch": 2.954840351100369, "ewc_loss": 0.03510957956314087, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5109580494463444e-05, "grad_norm": 19.823942184448242, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8864496350288391, "num_tokens": 886162823.0, "step": 23228 }, { "epoch": 2.954967561378959, "ewc_loss": 0.035235341638326645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52353417838458e-05, "grad_norm": 19.852436065673828, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8781635761260986, "num_tokens": 886204706.0, "step": 23229 }, { "epoch": 2.95509477165755, "ewc_loss": 0.035271547734737396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5271546948933974e-05, "grad_norm": 19.898847579956055, "learning_rate": 1e-06, "loss": 0.3668, "mean_token_accuracy": 0.8839383721351624, "num_tokens": 886241461.0, "step": 23230 }, { "epoch": 2.9552219819361403, "ewc_loss": 0.03522355854511261, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522355837048963e-05, "grad_norm": 19.825532913208008, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8724833726882935, "num_tokens": 886279276.0, "step": 23231 }, { "epoch": 2.955349192214731, "ewc_loss": 0.035251274704933167, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525127613102086e-05, "grad_norm": 19.89691162109375, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8780404329299927, "num_tokens": 886317703.0, "step": 23232 }, { "epoch": 2.9554764024933213, "ewc_loss": 0.03520212695002556, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520212703733705e-05, "grad_norm": 19.86093521118164, "learning_rate": 1e-06, "loss": 0.3703, "mean_token_accuracy": 0.8822915554046631, "num_tokens": 886355235.0, "step": 23233 }, { "epoch": 2.955603612771912, "ewc_loss": 0.03520524129271507, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520524114719592e-05, "grad_norm": 19.866722106933594, "learning_rate": 1e-06, "loss": 0.3169, "mean_token_accuracy": 0.8989244699478149, "num_tokens": 886387354.0, "step": 23234 }, { "epoch": 2.9557308230505024, "ewc_loss": 0.03524462506175041, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5244625905761495e-05, "grad_norm": 19.84882354736328, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8733114004135132, "num_tokens": 886419742.0, "step": 23235 }, { "epoch": 2.955858033329093, "ewc_loss": 0.03522372990846634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522372935549356e-05, "grad_norm": 19.933061599731445, "learning_rate": 1e-06, "loss": 0.3825, "mean_token_accuracy": 0.8772633075714111, "num_tokens": 886455218.0, "step": 23236 }, { "epoch": 2.9559852436076834, "ewc_loss": 0.03532571718096733, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532571645337157e-05, "grad_norm": 19.865890502929688, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8784635066986084, "num_tokens": 886494454.0, "step": 23237 }, { "epoch": 2.956112453886274, "ewc_loss": 0.03523913398385048, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523913255776279e-05, "grad_norm": 19.9068546295166, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8549752831459045, "num_tokens": 886532854.0, "step": 23238 }, { "epoch": 2.9562396641648645, "ewc_loss": 0.035360727459192276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.536072836141102e-05, "grad_norm": 19.915008544921875, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8762934803962708, "num_tokens": 886564288.0, "step": 23239 }, { "epoch": 2.956366874443455, "ewc_loss": 0.03520165756344795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5201657738070935e-05, "grad_norm": 19.821949005126953, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8792494535446167, "num_tokens": 886597155.0, "step": 23240 }, { "epoch": 2.9564940847220456, "ewc_loss": 0.03526878356933594, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5268782085040584e-05, "grad_norm": 19.916128158569336, "learning_rate": 1e-06, "loss": 0.3743, "mean_token_accuracy": 0.8793833255767822, "num_tokens": 886632015.0, "step": 23241 }, { "epoch": 2.956621295000636, "ewc_loss": 0.035325873643159866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5325872886460274e-05, "grad_norm": 19.832305908203125, "learning_rate": 1e-06, "loss": 0.3309, "mean_token_accuracy": 0.8931849002838135, "num_tokens": 886665736.0, "step": 23242 }, { "epoch": 2.9567485052792266, "ewc_loss": 0.035268817096948624, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5268818464828655e-05, "grad_norm": 19.857925415039062, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8767273426055908, "num_tokens": 886702652.0, "step": 23243 }, { "epoch": 2.956875715557817, "ewc_loss": 0.035352881997823715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535288124112412e-05, "grad_norm": 19.88649559020996, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8744109869003296, "num_tokens": 886743573.0, "step": 23244 }, { "epoch": 2.9570029258364077, "ewc_loss": 0.035344257950782776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5344259231351316e-05, "grad_norm": 19.791784286499023, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8814888000488281, "num_tokens": 886777932.0, "step": 23245 }, { "epoch": 2.957130136114998, "ewc_loss": 0.03531734272837639, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5317341826157644e-05, "grad_norm": 19.880474090576172, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8648592233657837, "num_tokens": 886813211.0, "step": 23246 }, { "epoch": 2.9572573463935887, "ewc_loss": 0.03544003516435623, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.544003629940562e-05, "grad_norm": 19.875232696533203, "learning_rate": 1e-06, "loss": 0.3692, "mean_token_accuracy": 0.8821653723716736, "num_tokens": 886849416.0, "step": 23247 }, { "epoch": 2.9573845566721793, "ewc_loss": 0.035275254398584366, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52752540493384e-05, "grad_norm": 19.85938262939453, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8623408079147339, "num_tokens": 886887983.0, "step": 23248 }, { "epoch": 2.95751176695077, "ewc_loss": 0.03537583723664284, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.537583688739687e-05, "grad_norm": 19.90612030029297, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8605389595031738, "num_tokens": 886930867.0, "step": 23249 }, { "epoch": 2.95763897722936, "ewc_loss": 0.035344306379556656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534430652507581e-05, "grad_norm": 19.826364517211914, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8743578195571899, "num_tokens": 886971862.0, "step": 23250 }, { "epoch": 2.957766187507951, "ewc_loss": 0.03533228486776352, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.533228664309718e-05, "grad_norm": 19.82631492614746, "learning_rate": 1e-06, "loss": 0.3617, "mean_token_accuracy": 0.8818562030792236, "num_tokens": 887005238.0, "step": 23251 }, { "epoch": 2.957893397786541, "ewc_loss": 0.035376716405153275, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.537671727826819e-05, "grad_norm": 19.999671936035156, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8697242140769958, "num_tokens": 887035588.0, "step": 23252 }, { "epoch": 2.958020608065132, "ewc_loss": 0.035383183509111404, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.538318196660839e-05, "grad_norm": 19.866443634033203, "learning_rate": 1e-06, "loss": 0.3857, "mean_token_accuracy": 0.8778649568557739, "num_tokens": 887079962.0, "step": 23253 }, { "epoch": 2.958147818343722, "ewc_loss": 0.035269834101200104, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5269833460915834e-05, "grad_norm": 19.860427856445312, "learning_rate": 1e-06, "loss": 0.3246, "mean_token_accuracy": 0.8956805467605591, "num_tokens": 887114572.0, "step": 23254 }, { "epoch": 2.958275028622313, "ewc_loss": 0.03542179986834526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.542179911164567e-05, "grad_norm": 19.893085479736328, "learning_rate": 1e-06, "loss": 0.3715, "mean_token_accuracy": 0.8810092210769653, "num_tokens": 887149396.0, "step": 23255 }, { "epoch": 2.958402238900903, "ewc_loss": 0.03530162200331688, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.53016221197322e-05, "grad_norm": 19.87484359741211, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8706262111663818, "num_tokens": 887191916.0, "step": 23256 }, { "epoch": 2.9585294491794936, "ewc_loss": 0.03532401844859123, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532401751726866e-05, "grad_norm": 19.91177749633789, "learning_rate": 1e-06, "loss": 0.3675, "mean_token_accuracy": 0.8859515190124512, "num_tokens": 887228263.0, "step": 23257 }, { "epoch": 2.958656659458084, "ewc_loss": 0.03534752130508423, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534752249834128e-05, "grad_norm": 19.865997314453125, "learning_rate": 1e-06, "loss": 0.3468, "mean_token_accuracy": 0.8918500542640686, "num_tokens": 887258688.0, "step": 23258 }, { "epoch": 2.9587838697366746, "ewc_loss": 0.0352388396859169, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523883788147941e-05, "grad_norm": 19.797260284423828, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8732080459594727, "num_tokens": 887301565.0, "step": 23259 }, { "epoch": 2.958911080015265, "ewc_loss": 0.03538088500499725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5380886401981115e-05, "grad_norm": 19.904911041259766, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8728448748588562, "num_tokens": 887337511.0, "step": 23260 }, { "epoch": 2.9590382902938557, "ewc_loss": 0.035358257591724396, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5358258173801005e-05, "grad_norm": 19.80209732055664, "learning_rate": 1e-06, "loss": 0.3618, "mean_token_accuracy": 0.8801630139350891, "num_tokens": 887370694.0, "step": 23261 }, { "epoch": 2.959165500572446, "ewc_loss": 0.03532684966921806, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5326847864780575e-05, "grad_norm": 19.932260513305664, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8758180141448975, "num_tokens": 887412131.0, "step": 23262 }, { "epoch": 2.9592927108510367, "ewc_loss": 0.035398196429014206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.539819590514526e-05, "grad_norm": 19.932085037231445, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8719097375869751, "num_tokens": 887447179.0, "step": 23263 }, { "epoch": 2.9594199211296273, "ewc_loss": 0.035327497869729996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5327499062987044e-05, "grad_norm": 19.855871200561523, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8722618818283081, "num_tokens": 887486079.0, "step": 23264 }, { "epoch": 2.959547131408218, "ewc_loss": 0.03527660667896271, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.527660737745464e-05, "grad_norm": 19.77496337890625, "learning_rate": 1e-06, "loss": 0.3665, "mean_token_accuracy": 0.885307788848877, "num_tokens": 887519092.0, "step": 23265 }, { "epoch": 2.9596743416868083, "ewc_loss": 0.03535975515842438, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535975702106953e-05, "grad_norm": 19.887203216552734, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8800625205039978, "num_tokens": 887552008.0, "step": 23266 }, { "epoch": 2.959801551965399, "ewc_loss": 0.03536071255803108, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.536071380949579e-05, "grad_norm": 19.865337371826172, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8723781108856201, "num_tokens": 887585827.0, "step": 23267 }, { "epoch": 2.9599287622439894, "ewc_loss": 0.035357728600502014, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535772702889517e-05, "grad_norm": 19.856027603149414, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8748118877410889, "num_tokens": 887628041.0, "step": 23268 }, { "epoch": 2.96005597252258, "ewc_loss": 0.03538748249411583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.538748205755837e-05, "grad_norm": 19.8504581451416, "learning_rate": 1e-06, "loss": 0.3691, "mean_token_accuracy": 0.88041752576828, "num_tokens": 887667210.0, "step": 23269 }, { "epoch": 2.9601831828011704, "ewc_loss": 0.035309161990880966, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.53091636497993e-05, "grad_norm": 19.869190216064453, "learning_rate": 1e-06, "loss": 0.3606, "mean_token_accuracy": 0.8825736045837402, "num_tokens": 887705429.0, "step": 23270 }, { "epoch": 2.960310393079761, "ewc_loss": 0.03535640984773636, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5356410080567e-05, "grad_norm": 19.81568145751953, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8592302799224854, "num_tokens": 887753188.0, "step": 23271 }, { "epoch": 2.9604376033583515, "ewc_loss": 0.03534657508134842, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5346576623851433e-05, "grad_norm": 19.98720359802246, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.861031711101532, "num_tokens": 887795907.0, "step": 23272 }, { "epoch": 2.960564813636942, "ewc_loss": 0.03543292358517647, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5432924050837755e-05, "grad_norm": 19.83036994934082, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8766031265258789, "num_tokens": 887834859.0, "step": 23273 }, { "epoch": 2.9606920239155325, "ewc_loss": 0.035285692662000656, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5285691410535946e-05, "grad_norm": 19.919750213623047, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8722927570343018, "num_tokens": 887879041.0, "step": 23274 }, { "epoch": 2.9608192341941226, "ewc_loss": 0.03536619991064072, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5366199881536886e-05, "grad_norm": 19.85945701599121, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8741598129272461, "num_tokens": 887913752.0, "step": 23275 }, { "epoch": 2.9609464444727136, "ewc_loss": 0.03537890315055847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.537890370353125e-05, "grad_norm": 20.01824951171875, "learning_rate": 1e-06, "loss": 0.3666, "mean_token_accuracy": 0.8824464082717896, "num_tokens": 887950946.0, "step": 23276 }, { "epoch": 2.9610736547513037, "ewc_loss": 0.0353597030043602, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535970245138742e-05, "grad_norm": 19.875762939453125, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8746411204338074, "num_tokens": 887993155.0, "step": 23277 }, { "epoch": 2.9612008650298947, "ewc_loss": 0.03526712581515312, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526712680468336e-05, "grad_norm": 19.933048248291016, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8758090138435364, "num_tokens": 888027831.0, "step": 23278 }, { "epoch": 2.9613280753084847, "ewc_loss": 0.035382241010665894, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5382239730097353e-05, "grad_norm": 19.97797966003418, "learning_rate": 1e-06, "loss": 0.3548, "mean_token_accuracy": 0.8870905637741089, "num_tokens": 888066941.0, "step": 23279 }, { "epoch": 2.9614552855870753, "ewc_loss": 0.03526819124817848, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5268192732473835e-05, "grad_norm": 19.907331466674805, "learning_rate": 1e-06, "loss": 0.3413, "mean_token_accuracy": 0.8907437324523926, "num_tokens": 888099739.0, "step": 23280 }, { "epoch": 2.961582495865666, "ewc_loss": 0.03525335341691971, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525335341691971e-05, "grad_norm": 19.994686126708984, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8778948783874512, "num_tokens": 888139675.0, "step": 23281 }, { "epoch": 2.9617097061442563, "ewc_loss": 0.03527427837252617, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52742790710181e-05, "grad_norm": 19.881431579589844, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8632954955101013, "num_tokens": 888179374.0, "step": 23282 }, { "epoch": 2.961836916422847, "ewc_loss": 0.035210151225328445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5210152418585494e-05, "grad_norm": 19.93673324584961, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8633856773376465, "num_tokens": 888214139.0, "step": 23283 }, { "epoch": 2.9619641267014374, "ewc_loss": 0.03527751564979553, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.527751687215641e-05, "grad_norm": 19.87885856628418, "learning_rate": 1e-06, "loss": 0.3885, "mean_token_accuracy": 0.874748170375824, "num_tokens": 888246365.0, "step": 23284 }, { "epoch": 2.962091336980028, "ewc_loss": 0.035211630165576935, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5211629437981173e-05, "grad_norm": 19.896825790405273, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8712432384490967, "num_tokens": 888290903.0, "step": 23285 }, { "epoch": 2.9622185472586184, "ewc_loss": 0.03533325716853142, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.533325798343867e-05, "grad_norm": 19.89241600036621, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8732331395149231, "num_tokens": 888331329.0, "step": 23286 }, { "epoch": 2.962345757537209, "ewc_loss": 0.03528233990073204, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5282340832054615e-05, "grad_norm": 19.959129333496094, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8699175715446472, "num_tokens": 888372411.0, "step": 23287 }, { "epoch": 2.9624729678157995, "ewc_loss": 0.035309549421072006, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5309549275552854e-05, "grad_norm": 19.88840103149414, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8758503198623657, "num_tokens": 888409778.0, "step": 23288 }, { "epoch": 2.96260017809439, "ewc_loss": 0.03523377329111099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523377381497994e-05, "grad_norm": 19.863698959350586, "learning_rate": 1e-06, "loss": 0.3688, "mean_token_accuracy": 0.8822817206382751, "num_tokens": 888453593.0, "step": 23289 }, { "epoch": 2.9627273883729806, "ewc_loss": 0.03528387472033501, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528387605911121e-05, "grad_norm": 19.889862060546875, "learning_rate": 1e-06, "loss": 0.3465, "mean_token_accuracy": 0.8891811370849609, "num_tokens": 888488245.0, "step": 23290 }, { "epoch": 2.962854598651571, "ewc_loss": 0.035324689000844955, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532469054334797e-05, "grad_norm": 19.932952880859375, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8757328987121582, "num_tokens": 888524306.0, "step": 23291 }, { "epoch": 2.9629818089301616, "ewc_loss": 0.03531263396143913, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.531263428158127e-05, "grad_norm": 19.947071075439453, "learning_rate": 1e-06, "loss": 0.3538, "mean_token_accuracy": 0.8885290622711182, "num_tokens": 888563791.0, "step": 23292 }, { "epoch": 2.963109019208752, "ewc_loss": 0.03524008020758629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.524007843225263e-05, "grad_norm": 19.9069766998291, "learning_rate": 1e-06, "loss": 0.375, "mean_token_accuracy": 0.8799518346786499, "num_tokens": 888605021.0, "step": 23293 }, { "epoch": 2.9632362294873427, "ewc_loss": 0.03522801771759987, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522801853250712e-05, "grad_norm": 19.891590118408203, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8767918944358826, "num_tokens": 888647382.0, "step": 23294 }, { "epoch": 2.963363439765933, "ewc_loss": 0.035240624099969864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5240624129073694e-05, "grad_norm": 19.875520706176758, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8771549463272095, "num_tokens": 888682174.0, "step": 23295 }, { "epoch": 2.9634906500445237, "ewc_loss": 0.0351981557905674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5198154364479706e-05, "grad_norm": 19.891204833984375, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.8786894083023071, "num_tokens": 888724401.0, "step": 23296 }, { "epoch": 2.9636178603231143, "ewc_loss": 0.035278066992759705, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.527806620695628e-05, "grad_norm": 19.934314727783203, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8830431699752808, "num_tokens": 888766230.0, "step": 23297 }, { "epoch": 2.963745070601705, "ewc_loss": 0.03525594621896744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5255947295809165e-05, "grad_norm": 20.003202438354492, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8554611206054688, "num_tokens": 888811865.0, "step": 23298 }, { "epoch": 2.9638722808802953, "ewc_loss": 0.035182081162929535, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5182081774109975e-05, "grad_norm": 19.87959098815918, "learning_rate": 1e-06, "loss": 0.3878, "mean_token_accuracy": 0.8766721487045288, "num_tokens": 888849549.0, "step": 23299 }, { "epoch": 2.9639994911588854, "ewc_loss": 0.035162366926670074, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5162367566954345e-05, "grad_norm": 19.961618423461914, "learning_rate": 1e-06, "loss": 0.3743, "mean_token_accuracy": 0.8781078457832336, "num_tokens": 888881305.0, "step": 23300 }, { "epoch": 2.9641267014374764, "ewc_loss": 0.03519250452518463, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519250458339229e-05, "grad_norm": 19.830228805541992, "learning_rate": 1e-06, "loss": 0.3783, "mean_token_accuracy": 0.8813859820365906, "num_tokens": 888917862.0, "step": 23301 }, { "epoch": 2.9642539117160664, "ewc_loss": 0.035205185413360596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5205186577513814e-05, "grad_norm": 20.060392379760742, "learning_rate": 1e-06, "loss": 0.3855, "mean_token_accuracy": 0.8766186833381653, "num_tokens": 888953883.0, "step": 23302 }, { "epoch": 2.9643811219946574, "ewc_loss": 0.03524470701813698, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.524470594129525e-05, "grad_norm": 19.82639503479004, "learning_rate": 1e-06, "loss": 0.3369, "mean_token_accuracy": 0.8936817646026611, "num_tokens": 888991052.0, "step": 23303 }, { "epoch": 2.9645083322732475, "ewc_loss": 0.03508027270436287, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.508027293719351e-05, "grad_norm": 19.937406539916992, "learning_rate": 1e-06, "loss": 0.355, "mean_token_accuracy": 0.8883488178253174, "num_tokens": 889022523.0, "step": 23304 }, { "epoch": 2.964635542551838, "ewc_loss": 0.03527957201004028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.527957233018242e-05, "grad_norm": 19.824596405029297, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8683712482452393, "num_tokens": 889063470.0, "step": 23305 }, { "epoch": 2.9647627528304286, "ewc_loss": 0.0351257361471653, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.512573675834574e-05, "grad_norm": 19.9047794342041, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.885067880153656, "num_tokens": 889101160.0, "step": 23306 }, { "epoch": 2.964889963109019, "ewc_loss": 0.03527957573533058, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5279575968161225e-05, "grad_norm": 19.937864303588867, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8736387491226196, "num_tokens": 889139128.0, "step": 23307 }, { "epoch": 2.9650171733876096, "ewc_loss": 0.03521351516246796, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521351391100325e-05, "grad_norm": 19.838809967041016, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8545866012573242, "num_tokens": 889175574.0, "step": 23308 }, { "epoch": 2.9651443836662, "ewc_loss": 0.035209350287914276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520934842526913e-05, "grad_norm": 19.956995010375977, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8738033771514893, "num_tokens": 889216520.0, "step": 23309 }, { "epoch": 2.9652715939447907, "ewc_loss": 0.03535808250308037, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5358083550818264e-05, "grad_norm": 19.876506805419922, "learning_rate": 1e-06, "loss": 0.3566, "mean_token_accuracy": 0.8871456980705261, "num_tokens": 889256746.0, "step": 23310 }, { "epoch": 2.965398804223381, "ewc_loss": 0.03517948463559151, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.517948425724171e-05, "grad_norm": 19.866243362426758, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8720089197158813, "num_tokens": 889298868.0, "step": 23311 }, { "epoch": 2.9655260145019717, "ewc_loss": 0.035242270678281784, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.524227213347331e-05, "grad_norm": 19.940753936767578, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8721165060997009, "num_tokens": 889337758.0, "step": 23312 }, { "epoch": 2.9656532247805623, "ewc_loss": 0.0352117083966732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521170947351493e-05, "grad_norm": 19.894227981567383, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8808704018592834, "num_tokens": 889375308.0, "step": 23313 }, { "epoch": 2.965780435059153, "ewc_loss": 0.0352211520075798, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522115366649814e-05, "grad_norm": 19.875656127929688, "learning_rate": 1e-06, "loss": 0.3585, "mean_token_accuracy": 0.8856062889099121, "num_tokens": 889413735.0, "step": 23314 }, { "epoch": 2.9659076453377433, "ewc_loss": 0.03528686240315437, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528686283971183e-05, "grad_norm": 20.03173065185547, "learning_rate": 1e-06, "loss": 0.3813, "mean_token_accuracy": 0.8790940046310425, "num_tokens": 889451585.0, "step": 23315 }, { "epoch": 2.966034855616334, "ewc_loss": 0.03525886312127113, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525886495481245e-05, "grad_norm": 19.881746292114258, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8606209754943848, "num_tokens": 889487405.0, "step": 23316 }, { "epoch": 2.9661620658949244, "ewc_loss": 0.03511851280927658, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5118511732434854e-05, "grad_norm": 19.946287155151367, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8660148978233337, "num_tokens": 889523051.0, "step": 23317 }, { "epoch": 2.966289276173515, "ewc_loss": 0.03529219701886177, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5292196116643026e-05, "grad_norm": 19.96930694580078, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8592036962509155, "num_tokens": 889559822.0, "step": 23318 }, { "epoch": 2.9664164864521054, "ewc_loss": 0.03513700142502785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5136999940732494e-05, "grad_norm": 19.900550842285156, "learning_rate": 1e-06, "loss": 0.384, "mean_token_accuracy": 0.877119243144989, "num_tokens": 889591598.0, "step": 23319 }, { "epoch": 2.966543696730696, "ewc_loss": 0.03519858419895172, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519858364597894e-05, "grad_norm": 19.89003562927246, "learning_rate": 1e-06, "loss": 0.3544, "mean_token_accuracy": 0.8862672448158264, "num_tokens": 889629787.0, "step": 23320 }, { "epoch": 2.9666709070092865, "ewc_loss": 0.0351845882833004, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.518458834150806e-05, "grad_norm": 19.936050415039062, "learning_rate": 1e-06, "loss": 0.3679, "mean_token_accuracy": 0.8823713064193726, "num_tokens": 889668730.0, "step": 23321 }, { "epoch": 2.966798117287877, "ewc_loss": 0.03526601940393448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5266020859126e-05, "grad_norm": 19.925880432128906, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.8748877048492432, "num_tokens": 889705215.0, "step": 23322 }, { "epoch": 2.966925327566467, "ewc_loss": 0.035175424069166183, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5175424272893e-05, "grad_norm": 19.87288475036621, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8776915073394775, "num_tokens": 889738528.0, "step": 23323 }, { "epoch": 2.967052537845058, "ewc_loss": 0.03528652340173721, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528652450768277e-05, "grad_norm": 20.017833709716797, "learning_rate": 1e-06, "loss": 0.3847, "mean_token_accuracy": 0.8801442384719849, "num_tokens": 889771694.0, "step": 23324 }, { "epoch": 2.967179748123648, "ewc_loss": 0.03526404872536659, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526404907461256e-05, "grad_norm": 19.829620361328125, "learning_rate": 1e-06, "loss": 0.3558, "mean_token_accuracy": 0.8899234533309937, "num_tokens": 889806096.0, "step": 23325 }, { "epoch": 2.967306958402239, "ewc_loss": 0.035219866782426834, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521986582200043e-05, "grad_norm": 19.86618423461914, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8756277561187744, "num_tokens": 889848514.0, "step": 23326 }, { "epoch": 2.967434168680829, "ewc_loss": 0.03541230410337448, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.541230398695916e-05, "grad_norm": 19.951433181762695, "learning_rate": 1e-06, "loss": 0.3317, "mean_token_accuracy": 0.896845817565918, "num_tokens": 889889689.0, "step": 23327 }, { "epoch": 2.96756137895942, "ewc_loss": 0.03523502126336098, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523502164171077e-05, "grad_norm": 19.85877799987793, "learning_rate": 1e-06, "loss": 0.3785, "mean_token_accuracy": 0.8794898986816406, "num_tokens": 889925550.0, "step": 23328 }, { "epoch": 2.9676885892380103, "ewc_loss": 0.03526986017823219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526985892676748e-05, "grad_norm": 19.937984466552734, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8642600774765015, "num_tokens": 889961797.0, "step": 23329 }, { "epoch": 2.967815799516601, "ewc_loss": 0.03532034903764725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532035043463111e-05, "grad_norm": 19.87896728515625, "learning_rate": 1e-06, "loss": 0.3727, "mean_token_accuracy": 0.882426381111145, "num_tokens": 889997935.0, "step": 23330 }, { "epoch": 2.9679430097951913, "ewc_loss": 0.035288840532302856, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528884190018289e-05, "grad_norm": 19.903274536132812, "learning_rate": 1e-06, "loss": 0.3493, "mean_token_accuracy": 0.8880162239074707, "num_tokens": 890037900.0, "step": 23331 }, { "epoch": 2.968070220073782, "ewc_loss": 0.03535052388906479, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535052383085713e-05, "grad_norm": 19.904098510742188, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8815902471542358, "num_tokens": 890074814.0, "step": 23332 }, { "epoch": 2.9681974303523724, "ewc_loss": 0.035256050527095795, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525604915921576e-05, "grad_norm": 19.87315559387207, "learning_rate": 1e-06, "loss": 0.3552, "mean_token_accuracy": 0.88890540599823, "num_tokens": 890107091.0, "step": 23333 }, { "epoch": 2.968324640630963, "ewc_loss": 0.035327304154634476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532730261213146e-05, "grad_norm": 19.918458938598633, "learning_rate": 1e-06, "loss": 0.3678, "mean_token_accuracy": 0.8850275874137878, "num_tokens": 890140558.0, "step": 23334 }, { "epoch": 2.9684518509095534, "ewc_loss": 0.0352964848279953, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529648529365659e-05, "grad_norm": 19.832782745361328, "learning_rate": 1e-06, "loss": 0.3828, "mean_token_accuracy": 0.8776930570602417, "num_tokens": 890174563.0, "step": 23335 }, { "epoch": 2.968579061188144, "ewc_loss": 0.035358235239982605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535823634592816e-05, "grad_norm": 19.97431755065918, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8750196695327759, "num_tokens": 890210584.0, "step": 23336 }, { "epoch": 2.9687062714667345, "ewc_loss": 0.03543899208307266, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5438992199487984e-05, "grad_norm": 19.855257034301758, "learning_rate": 1e-06, "loss": 0.351, "mean_token_accuracy": 0.8883824348449707, "num_tokens": 890251955.0, "step": 23337 }, { "epoch": 2.968833481745325, "ewc_loss": 0.035270657390356064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5270655644126236e-05, "grad_norm": 19.877927780151367, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8730725646018982, "num_tokens": 890289095.0, "step": 23338 }, { "epoch": 2.9689606920239155, "ewc_loss": 0.03540446609258652, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5404467780608684e-05, "grad_norm": 19.904571533203125, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8786110877990723, "num_tokens": 890327864.0, "step": 23339 }, { "epoch": 2.969087902302506, "ewc_loss": 0.03530028089880943, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5300279705552384e-05, "grad_norm": 19.971782684326172, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8768208026885986, "num_tokens": 890364244.0, "step": 23340 }, { "epoch": 2.9692151125810966, "ewc_loss": 0.03529836982488632, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529836976667866e-05, "grad_norm": 19.879270553588867, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.864618182182312, "num_tokens": 890397652.0, "step": 23341 }, { "epoch": 2.969342322859687, "ewc_loss": 0.03530523180961609, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5305230994708836e-05, "grad_norm": 19.97629165649414, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8811986446380615, "num_tokens": 890435598.0, "step": 23342 }, { "epoch": 2.9694695331382777, "ewc_loss": 0.03535105660557747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535105497576296e-05, "grad_norm": 19.96621322631836, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8797956109046936, "num_tokens": 890473445.0, "step": 23343 }, { "epoch": 2.969596743416868, "ewc_loss": 0.035297513008117676, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529751120368019e-05, "grad_norm": 20.012353897094727, "learning_rate": 1e-06, "loss": 0.3704, "mean_token_accuracy": 0.8813575506210327, "num_tokens": 890514263.0, "step": 23344 }, { "epoch": 2.9697239536954587, "ewc_loss": 0.03531421348452568, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5314213164383546e-05, "grad_norm": 19.894084930419922, "learning_rate": 1e-06, "loss": 0.3619, "mean_token_accuracy": 0.8824549913406372, "num_tokens": 890554123.0, "step": 23345 }, { "epoch": 2.9698511639740492, "ewc_loss": 0.035215143114328384, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5215143725508824e-05, "grad_norm": 19.94003677368164, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8774355053901672, "num_tokens": 890593938.0, "step": 23346 }, { "epoch": 2.9699783742526398, "ewc_loss": 0.035343412309885025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5343411582289264e-05, "grad_norm": 19.919918060302734, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8670067191123962, "num_tokens": 890632260.0, "step": 23347 }, { "epoch": 2.97010558453123, "ewc_loss": 0.03518737107515335, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5187371395295486e-05, "grad_norm": 19.986473083496094, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8744522929191589, "num_tokens": 890666852.0, "step": 23348 }, { "epoch": 2.970232794809821, "ewc_loss": 0.03530697152018547, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5306969948578626e-05, "grad_norm": 19.872207641601562, "learning_rate": 1e-06, "loss": 0.3947, "mean_token_accuracy": 0.8734844326972961, "num_tokens": 890707959.0, "step": 23349 }, { "epoch": 2.970360005088411, "ewc_loss": 0.03519674763083458, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519674646668136e-05, "grad_norm": 19.965545654296875, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8568443655967712, "num_tokens": 890748729.0, "step": 23350 }, { "epoch": 2.970487215367002, "ewc_loss": 0.035345882177352905, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534588176989928e-05, "grad_norm": 19.93187141418457, "learning_rate": 1e-06, "loss": 0.3472, "mean_token_accuracy": 0.890056312084198, "num_tokens": 890782116.0, "step": 23351 }, { "epoch": 2.970614425645592, "ewc_loss": 0.03521342948079109, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5213430237490684e-05, "grad_norm": 19.97127342224121, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8649817705154419, "num_tokens": 890827309.0, "step": 23352 }, { "epoch": 2.970741635924183, "ewc_loss": 0.03529326245188713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5293262044433504e-05, "grad_norm": 19.920326232910156, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8666831851005554, "num_tokens": 890863879.0, "step": 23353 }, { "epoch": 2.970868846202773, "ewc_loss": 0.035213012248277664, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521301186992787e-05, "grad_norm": 19.919940948486328, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8796536922454834, "num_tokens": 890901184.0, "step": 23354 }, { "epoch": 2.9709960564813636, "ewc_loss": 0.0352979451417923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529794412315823e-05, "grad_norm": 19.936260223388672, "learning_rate": 1e-06, "loss": 0.3541, "mean_token_accuracy": 0.8860516548156738, "num_tokens": 890937779.0, "step": 23355 }, { "epoch": 2.971123266759954, "ewc_loss": 0.035251371562480927, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525137071846984e-05, "grad_norm": 19.87337303161621, "learning_rate": 1e-06, "loss": 0.3431, "mean_token_accuracy": 0.8893007636070251, "num_tokens": 890978055.0, "step": 23356 }, { "epoch": 2.9712504770385446, "ewc_loss": 0.03522281348705292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522281258483417e-05, "grad_norm": 19.905288696289062, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8477131128311157, "num_tokens": 891010985.0, "step": 23357 }, { "epoch": 2.971377687317135, "ewc_loss": 0.03530104458332062, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5301043681101874e-05, "grad_norm": 19.839216232299805, "learning_rate": 1e-06, "loss": 0.3736, "mean_token_accuracy": 0.8812545537948608, "num_tokens": 891045904.0, "step": 23358 }, { "epoch": 2.9715048975957257, "ewc_loss": 0.03527771309018135, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5277713323011994e-05, "grad_norm": 19.89552116394043, "learning_rate": 1e-06, "loss": 0.3607, "mean_token_accuracy": 0.8802900910377502, "num_tokens": 891086240.0, "step": 23359 }, { "epoch": 2.971632107874316, "ewc_loss": 0.03532646596431732, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532646587700583e-05, "grad_norm": 19.86983871459961, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8743329048156738, "num_tokens": 891126482.0, "step": 23360 }, { "epoch": 2.9717593181529067, "ewc_loss": 0.03527326136827469, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5273260436952114e-05, "grad_norm": 19.926374435424805, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8664710521697998, "num_tokens": 891165982.0, "step": 23361 }, { "epoch": 2.9718865284314973, "ewc_loss": 0.035286709666252136, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528671004460193e-05, "grad_norm": 19.837879180908203, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.862619936466217, "num_tokens": 891199101.0, "step": 23362 }, { "epoch": 2.972013738710088, "ewc_loss": 0.035243865102529526, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.524386556819081e-05, "grad_norm": 19.91883659362793, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8761872053146362, "num_tokens": 891231511.0, "step": 23363 }, { "epoch": 2.9721409489886783, "ewc_loss": 0.035344623029232025, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5344623029232025e-05, "grad_norm": 19.8911190032959, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.864829421043396, "num_tokens": 891262369.0, "step": 23364 }, { "epoch": 2.972268159267269, "ewc_loss": 0.03532250225543976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.53225004801061e-05, "grad_norm": 19.85542106628418, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.861287534236908, "num_tokens": 891304206.0, "step": 23365 }, { "epoch": 2.9723953695458594, "ewc_loss": 0.03534796088933945, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534795905579813e-05, "grad_norm": 19.906728744506836, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8649042844772339, "num_tokens": 891348006.0, "step": 23366 }, { "epoch": 2.97252257982445, "ewc_loss": 0.03534310683608055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534310599206947e-05, "grad_norm": 19.8636474609375, "learning_rate": 1e-06, "loss": 0.3604, "mean_token_accuracy": 0.884028434753418, "num_tokens": 891386851.0, "step": 23367 }, { "epoch": 2.9726497901030404, "ewc_loss": 0.035453084856271744, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.545308572938666e-05, "grad_norm": 20.0275821685791, "learning_rate": 1e-06, "loss": 0.3615, "mean_token_accuracy": 0.8867371082305908, "num_tokens": 891423582.0, "step": 23368 }, { "epoch": 2.972777000381631, "ewc_loss": 0.03532828018069267, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532828122843057e-05, "grad_norm": 19.867671966552734, "learning_rate": 1e-06, "loss": 0.3344, "mean_token_accuracy": 0.890358567237854, "num_tokens": 891463773.0, "step": 23369 }, { "epoch": 2.9729042106602215, "ewc_loss": 0.03530247136950493, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5302469768794253e-05, "grad_norm": 19.950544357299805, "learning_rate": 1e-06, "loss": 0.3529, "mean_token_accuracy": 0.8877214789390564, "num_tokens": 891498117.0, "step": 23370 }, { "epoch": 2.973031420938812, "ewc_loss": 0.035409726202487946, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.540972465998493e-05, "grad_norm": 19.93804359436035, "learning_rate": 1e-06, "loss": 0.3608, "mean_token_accuracy": 0.8842916488647461, "num_tokens": 891529055.0, "step": 23371 }, { "epoch": 2.9731586312174025, "ewc_loss": 0.035244956612586975, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.524495696183294e-05, "grad_norm": 19.905176162719727, "learning_rate": 1e-06, "loss": 0.3579, "mean_token_accuracy": 0.8862060308456421, "num_tokens": 891567370.0, "step": 23372 }, { "epoch": 2.9732858414959926, "ewc_loss": 0.03535281866788864, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.53528193954844e-05, "grad_norm": 19.876842498779297, "learning_rate": 1e-06, "loss": 0.3769, "mean_token_accuracy": 0.8795846700668335, "num_tokens": 891602737.0, "step": 23373 }, { "epoch": 2.9734130517745836, "ewc_loss": 0.03532811626791954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532811751938425e-05, "grad_norm": 19.971040725708008, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8691813945770264, "num_tokens": 891639713.0, "step": 23374 }, { "epoch": 2.9735402620531737, "ewc_loss": 0.035414475947618484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.541447586030699e-05, "grad_norm": 19.91162109375, "learning_rate": 1e-06, "loss": 0.3559, "mean_token_accuracy": 0.8887560367584229, "num_tokens": 891673931.0, "step": 23375 }, { "epoch": 2.9736674723317646, "ewc_loss": 0.03526477888226509, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526478030835278e-05, "grad_norm": 19.920711517333984, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8765856027603149, "num_tokens": 891708015.0, "step": 23376 }, { "epoch": 2.9737946826103547, "ewc_loss": 0.03540726751089096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5407269024290144e-05, "grad_norm": 19.95632553100586, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.873725175857544, "num_tokens": 891746023.0, "step": 23377 }, { "epoch": 2.9739218928889453, "ewc_loss": 0.03529564291238785, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529564128257334e-05, "grad_norm": 19.887083053588867, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8746596574783325, "num_tokens": 891777767.0, "step": 23378 }, { "epoch": 2.974049103167536, "ewc_loss": 0.03534508869051933, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534508869051933e-05, "grad_norm": 19.977046966552734, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8788248300552368, "num_tokens": 891810816.0, "step": 23379 }, { "epoch": 2.9741763134461263, "ewc_loss": 0.03539997339248657, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5399974876781926e-05, "grad_norm": 19.871057510375977, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8713723421096802, "num_tokens": 891848810.0, "step": 23380 }, { "epoch": 2.974303523724717, "ewc_loss": 0.0353698693215847, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.536987060215324e-05, "grad_norm": 19.958616256713867, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.880578339099884, "num_tokens": 891892553.0, "step": 23381 }, { "epoch": 2.9744307340033074, "ewc_loss": 0.03538414090871811, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.538414239301346e-05, "grad_norm": 19.87873077392578, "learning_rate": 1e-06, "loss": 0.3502, "mean_token_accuracy": 0.8875068426132202, "num_tokens": 891927539.0, "step": 23382 }, { "epoch": 2.974557944281898, "ewc_loss": 0.035308513790369034, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530851245159283e-05, "grad_norm": 19.915733337402344, "learning_rate": 1e-06, "loss": 0.3638, "mean_token_accuracy": 0.8812133073806763, "num_tokens": 891959577.0, "step": 23383 }, { "epoch": 2.9746851545604884, "ewc_loss": 0.035453613847494125, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5453613236313686e-05, "grad_norm": 19.97127342224121, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8509605526924133, "num_tokens": 892000799.0, "step": 23384 }, { "epoch": 2.974812364839079, "ewc_loss": 0.0353836789727211, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.538368036970496e-05, "grad_norm": 19.900991439819336, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.870537281036377, "num_tokens": 892042200.0, "step": 23385 }, { "epoch": 2.9749395751176695, "ewc_loss": 0.03534227982163429, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534228017088026e-05, "grad_norm": 19.987079620361328, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8683565855026245, "num_tokens": 892084838.0, "step": 23386 }, { "epoch": 2.97506678539626, "ewc_loss": 0.035354964435100555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535496580298059e-05, "grad_norm": 19.87827491760254, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8604587316513062, "num_tokens": 892126442.0, "step": 23387 }, { "epoch": 2.9751939956748505, "ewc_loss": 0.035335756838321686, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.533575727487914e-05, "grad_norm": 19.98768424987793, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8668410778045654, "num_tokens": 892165019.0, "step": 23388 }, { "epoch": 2.975321205953441, "ewc_loss": 0.03536565229296684, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5365650546737015e-05, "grad_norm": 19.82771873474121, "learning_rate": 1e-06, "loss": 0.3844, "mean_token_accuracy": 0.8787084817886353, "num_tokens": 892202291.0, "step": 23389 }, { "epoch": 2.9754484162320316, "ewc_loss": 0.03528891131281853, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528891102178022e-05, "grad_norm": 19.92282485961914, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8674988150596619, "num_tokens": 892235159.0, "step": 23390 }, { "epoch": 2.975575626510622, "ewc_loss": 0.03536723926663399, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5367240343475714e-05, "grad_norm": 19.865989685058594, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.864574134349823, "num_tokens": 892271967.0, "step": 23391 }, { "epoch": 2.9757028367892127, "ewc_loss": 0.03529083728790283, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529083915054798e-05, "grad_norm": 19.88788604736328, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8633133172988892, "num_tokens": 892306230.0, "step": 23392 }, { "epoch": 2.975830047067803, "ewc_loss": 0.0354098305106163, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.540983016137034e-05, "grad_norm": 19.83616828918457, "learning_rate": 1e-06, "loss": 0.3723, "mean_token_accuracy": 0.8783987760543823, "num_tokens": 892342118.0, "step": 23393 }, { "epoch": 2.9759572573463937, "ewc_loss": 0.03532132878899574, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532132905093022e-05, "grad_norm": 19.820646286010742, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8813285231590271, "num_tokens": 892380764.0, "step": 23394 }, { "epoch": 2.9760844676249842, "ewc_loss": 0.03542603552341461, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5426033718977123e-05, "grad_norm": 19.981643676757812, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8741956949234009, "num_tokens": 892421113.0, "step": 23395 }, { "epoch": 2.9762116779035748, "ewc_loss": 0.035418491810560226, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.541849218891002e-05, "grad_norm": 19.959056854248047, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.8824634552001953, "num_tokens": 892457512.0, "step": 23396 }, { "epoch": 2.9763388881821653, "ewc_loss": 0.03541559353470802, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.541559271980077e-05, "grad_norm": 19.971765518188477, "learning_rate": 1e-06, "loss": 0.3651, "mean_token_accuracy": 0.880961000919342, "num_tokens": 892493599.0, "step": 23397 }, { "epoch": 2.9764660984607554, "ewc_loss": 0.035390011966228485, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.53900104528293e-05, "grad_norm": 19.832841873168945, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8675315380096436, "num_tokens": 892528474.0, "step": 23398 }, { "epoch": 2.9765933087393464, "ewc_loss": 0.0353255495429039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532554910634644e-05, "grad_norm": 19.90576171875, "learning_rate": 1e-06, "loss": 0.3638, "mean_token_accuracy": 0.8823850154876709, "num_tokens": 892569591.0, "step": 23399 }, { "epoch": 2.9767205190179364, "ewc_loss": 0.035413555800914764, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.54135554516688e-05, "grad_norm": 19.928207397460938, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.881338357925415, "num_tokens": 892607185.0, "step": 23400 }, { "epoch": 2.9768477292965274, "ewc_loss": 0.03535756841301918, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535756695782766e-05, "grad_norm": 19.91869354248047, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8764660954475403, "num_tokens": 892641453.0, "step": 23401 }, { "epoch": 2.9769749395751175, "ewc_loss": 0.035359665751457214, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535966607159935e-05, "grad_norm": 19.95157814025879, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8705019354820251, "num_tokens": 892676802.0, "step": 23402 }, { "epoch": 2.977102149853708, "ewc_loss": 0.03542845696210861, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5428456612862647e-05, "grad_norm": 19.944177627563477, "learning_rate": 1e-06, "loss": 0.3639, "mean_token_accuracy": 0.8864157199859619, "num_tokens": 892711694.0, "step": 23403 }, { "epoch": 2.9772293601322986, "ewc_loss": 0.03538891300559044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.538891178322956e-05, "grad_norm": 19.969999313354492, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8669460415840149, "num_tokens": 892752662.0, "step": 23404 }, { "epoch": 2.977356570410889, "ewc_loss": 0.035361867398023605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.536186704877764e-05, "grad_norm": 19.939395904541016, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8783791065216064, "num_tokens": 892790170.0, "step": 23405 }, { "epoch": 2.9774837806894796, "ewc_loss": 0.03537048399448395, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.537048542057164e-05, "grad_norm": 19.930875778198242, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8585182428359985, "num_tokens": 892831437.0, "step": 23406 }, { "epoch": 2.97761099096807, "ewc_loss": 0.035304225981235504, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530422691255808e-05, "grad_norm": 19.907955169677734, "learning_rate": 1e-06, "loss": 0.3239, "mean_token_accuracy": 0.8978513479232788, "num_tokens": 892869022.0, "step": 23407 }, { "epoch": 2.9777382012466607, "ewc_loss": 0.03534725680947304, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534725692588836e-05, "grad_norm": 19.91625213623047, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8784630298614502, "num_tokens": 892910738.0, "step": 23408 }, { "epoch": 2.977865411525251, "ewc_loss": 0.035332631319761276, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.533263225108385e-05, "grad_norm": 19.959091186523438, "learning_rate": 1e-06, "loss": 0.3711, "mean_token_accuracy": 0.8854526877403259, "num_tokens": 892952055.0, "step": 23409 }, { "epoch": 2.9779926218038417, "ewc_loss": 0.0353684239089489, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.536842268658802e-05, "grad_norm": 19.93207359313965, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8692753911018372, "num_tokens": 892985843.0, "step": 23410 }, { "epoch": 2.9781198320824323, "ewc_loss": 0.03528211638331413, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5282115277368575e-05, "grad_norm": 19.896968841552734, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8665645718574524, "num_tokens": 893031197.0, "step": 23411 }, { "epoch": 2.978247042361023, "ewc_loss": 0.035281382501125336, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528138404362835e-05, "grad_norm": 19.887548446655273, "learning_rate": 1e-06, "loss": 0.3394, "mean_token_accuracy": 0.8902416229248047, "num_tokens": 893068113.0, "step": 23412 }, { "epoch": 2.9783742526396133, "ewc_loss": 0.03531298786401749, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5312987165525556e-05, "grad_norm": 19.85557746887207, "learning_rate": 1e-06, "loss": 0.357, "mean_token_accuracy": 0.8857121467590332, "num_tokens": 893109148.0, "step": 23413 }, { "epoch": 2.978501462918204, "ewc_loss": 0.03526681289076805, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526681393850595e-05, "grad_norm": 19.919649124145508, "learning_rate": 1e-06, "loss": 0.372, "mean_token_accuracy": 0.8806544542312622, "num_tokens": 893151406.0, "step": 23414 }, { "epoch": 2.9786286731967944, "ewc_loss": 0.03534740209579468, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534740244504064e-05, "grad_norm": 19.919431686401367, "learning_rate": 1e-06, "loss": 0.373, "mean_token_accuracy": 0.8810247182846069, "num_tokens": 893190209.0, "step": 23415 }, { "epoch": 2.978755883475385, "ewc_loss": 0.03532733768224716, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532733899191953e-05, "grad_norm": 19.916339874267578, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8759457468986511, "num_tokens": 893232949.0, "step": 23416 }, { "epoch": 2.9788830937539754, "ewc_loss": 0.035262905061244965, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526290674926713e-05, "grad_norm": 19.92980194091797, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8767778873443604, "num_tokens": 893273931.0, "step": 23417 }, { "epoch": 2.979010304032566, "ewc_loss": 0.03526365011930466, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526364889694378e-05, "grad_norm": 19.944684982299805, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8576787710189819, "num_tokens": 893317465.0, "step": 23418 }, { "epoch": 2.9791375143111565, "ewc_loss": 0.03521981090307236, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.521981125231832e-05, "grad_norm": 19.851945877075195, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8616604804992676, "num_tokens": 893354089.0, "step": 23419 }, { "epoch": 2.979264724589747, "ewc_loss": 0.035205308347940445, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.520530663081445e-05, "grad_norm": 19.94074821472168, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8743860721588135, "num_tokens": 893388051.0, "step": 23420 }, { "epoch": 2.979391934868337, "ewc_loss": 0.035282690078020096, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5282690078020096e-05, "grad_norm": 19.859336853027344, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8714214563369751, "num_tokens": 893426191.0, "step": 23421 }, { "epoch": 2.979519145146928, "ewc_loss": 0.03521416336297989, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5214165109209716e-05, "grad_norm": 19.956573486328125, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8703702688217163, "num_tokens": 893462134.0, "step": 23422 }, { "epoch": 2.979646355425518, "ewc_loss": 0.0353587307035923, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535873111104593e-05, "grad_norm": 19.944652557373047, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8716973662376404, "num_tokens": 893503159.0, "step": 23423 }, { "epoch": 2.979773565704109, "ewc_loss": 0.03522008657455444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.522008773870766e-05, "grad_norm": 19.970094680786133, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8588048219680786, "num_tokens": 893538379.0, "step": 23424 }, { "epoch": 2.979900775982699, "ewc_loss": 0.03529908135533333, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529908281052485e-05, "grad_norm": 19.88182258605957, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8636856079101562, "num_tokens": 893580431.0, "step": 23425 }, { "epoch": 2.98002798626129, "ewc_loss": 0.035262465476989746, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526246655383147e-05, "grad_norm": 19.957876205444336, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.869601845741272, "num_tokens": 893622412.0, "step": 23426 }, { "epoch": 2.9801551965398803, "ewc_loss": 0.03538661450147629, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5386616218602285e-05, "grad_norm": 19.964754104614258, "learning_rate": 1e-06, "loss": 0.3903, "mean_token_accuracy": 0.8790286779403687, "num_tokens": 893658575.0, "step": 23427 }, { "epoch": 2.980282406818471, "ewc_loss": 0.035243961960077286, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52439601556398e-05, "grad_norm": 19.948225021362305, "learning_rate": 1e-06, "loss": 0.3634, "mean_token_accuracy": 0.883050799369812, "num_tokens": 893702812.0, "step": 23428 }, { "epoch": 2.9804096170970613, "ewc_loss": 0.03536556661128998, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.536556687322445e-05, "grad_norm": 19.967206954956055, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8693439960479736, "num_tokens": 893742785.0, "step": 23429 }, { "epoch": 2.980536827375652, "ewc_loss": 0.035282306373119354, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528230809024535e-05, "grad_norm": 20.041339874267578, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8820681571960449, "num_tokens": 893782310.0, "step": 23430 }, { "epoch": 2.9806640376542424, "ewc_loss": 0.03528694063425064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528693923726678e-05, "grad_norm": 19.900760650634766, "learning_rate": 1e-06, "loss": 0.3309, "mean_token_accuracy": 0.8922257423400879, "num_tokens": 893822794.0, "step": 23431 }, { "epoch": 2.980791247932833, "ewc_loss": 0.03523813560605049, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5238135751569644e-05, "grad_norm": 19.973133087158203, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8808602094650269, "num_tokens": 893870462.0, "step": 23432 }, { "epoch": 2.9809184582114234, "ewc_loss": 0.03526934236288071, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5269342333776876e-05, "grad_norm": 19.90834617614746, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8767976760864258, "num_tokens": 893910654.0, "step": 23433 }, { "epoch": 2.981045668490014, "ewc_loss": 0.035244688391685486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5244687751401216e-05, "grad_norm": 20.01618194580078, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.872766375541687, "num_tokens": 893948819.0, "step": 23434 }, { "epoch": 2.9811728787686045, "ewc_loss": 0.03529151529073715, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52915158146061e-05, "grad_norm": 19.9254207611084, "learning_rate": 1e-06, "loss": 0.3651, "mean_token_accuracy": 0.8827653527259827, "num_tokens": 893983325.0, "step": 23435 }, { "epoch": 2.981300089047195, "ewc_loss": 0.03516773506999016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5167733585694805e-05, "grad_norm": 19.915395736694336, "learning_rate": 1e-06, "loss": 0.3869, "mean_token_accuracy": 0.8748341798782349, "num_tokens": 894023712.0, "step": 23436 }, { "epoch": 2.9814272993257855, "ewc_loss": 0.03525368124246597, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5253680835012347e-05, "grad_norm": 19.899534225463867, "learning_rate": 1e-06, "loss": 0.3581, "mean_token_accuracy": 0.8843777179718018, "num_tokens": 894059719.0, "step": 23437 }, { "epoch": 2.981554509604376, "ewc_loss": 0.035253290086984634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525329157127999e-05, "grad_norm": 19.940763473510742, "learning_rate": 1e-06, "loss": 0.3849, "mean_token_accuracy": 0.8770353198051453, "num_tokens": 894100150.0, "step": 23438 }, { "epoch": 2.9816817198829666, "ewc_loss": 0.03527149185538292, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.527149237925187e-05, "grad_norm": 19.888526916503906, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8557841777801514, "num_tokens": 894134336.0, "step": 23439 }, { "epoch": 2.981808930161557, "ewc_loss": 0.035178087651729584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.517808727337979e-05, "grad_norm": 19.939760208129883, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8621786832809448, "num_tokens": 894168972.0, "step": 23440 }, { "epoch": 2.9819361404401477, "ewc_loss": 0.035301823168992996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530182220856659e-05, "grad_norm": 19.810388565063477, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8723413944244385, "num_tokens": 894204641.0, "step": 23441 }, { "epoch": 2.982063350718738, "ewc_loss": 0.03519390523433685, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.519390520523302e-05, "grad_norm": 19.895904541015625, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8793133497238159, "num_tokens": 894248403.0, "step": 23442 }, { "epoch": 2.9821905609973287, "ewc_loss": 0.03531542047858238, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.53154209733475e-05, "grad_norm": 19.872148513793945, "learning_rate": 1e-06, "loss": 0.3297, "mean_token_accuracy": 0.8929636478424072, "num_tokens": 894284035.0, "step": 23443 }, { "epoch": 2.9823177712759192, "ewc_loss": 0.035244591534137726, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.524459316395223e-05, "grad_norm": 19.914531707763672, "learning_rate": 1e-06, "loss": 0.3691, "mean_token_accuracy": 0.8798351883888245, "num_tokens": 894321804.0, "step": 23444 }, { "epoch": 2.9824449815545098, "ewc_loss": 0.035285815596580505, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528581510181539e-05, "grad_norm": 19.85311508178711, "learning_rate": 1e-06, "loss": 0.3601, "mean_token_accuracy": 0.884856104850769, "num_tokens": 894355648.0, "step": 23445 }, { "epoch": 2.9825721918331, "ewc_loss": 0.03536820039153099, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5368200769880787e-05, "grad_norm": 19.923749923706055, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8770185708999634, "num_tokens": 894392947.0, "step": 23446 }, { "epoch": 2.982699402111691, "ewc_loss": 0.03533003851771355, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5330038372194394e-05, "grad_norm": 19.969348907470703, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8677937984466553, "num_tokens": 894433437.0, "step": 23447 }, { "epoch": 2.982826612390281, "ewc_loss": 0.03534629940986633, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5346300137462094e-05, "grad_norm": 19.89182472229004, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.8808366060256958, "num_tokens": 894481428.0, "step": 23448 }, { "epoch": 2.982953822668872, "ewc_loss": 0.035353876650333405, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5353878047317266e-05, "grad_norm": 19.90088653564453, "learning_rate": 1e-06, "loss": 0.3895, "mean_token_accuracy": 0.8759281635284424, "num_tokens": 894518055.0, "step": 23449 }, { "epoch": 2.983081032947462, "ewc_loss": 0.035374775528907776, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.53747745975852e-05, "grad_norm": 19.934818267822266, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8714442253112793, "num_tokens": 894562693.0, "step": 23450 }, { "epoch": 2.983208243226053, "ewc_loss": 0.035370372235774994, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.537037264322862e-05, "grad_norm": 19.900184631347656, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8760172128677368, "num_tokens": 894604973.0, "step": 23451 }, { "epoch": 2.983335453504643, "ewc_loss": 0.03536468744277954, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5364686482353136e-05, "grad_norm": 19.962610244750977, "learning_rate": 1e-06, "loss": 0.3659, "mean_token_accuracy": 0.882343590259552, "num_tokens": 894645672.0, "step": 23452 }, { "epoch": 2.9834626637832335, "ewc_loss": 0.03535604104399681, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5356042644707486e-05, "grad_norm": 19.886030197143555, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8744235038757324, "num_tokens": 894686702.0, "step": 23453 }, { "epoch": 2.983589874061824, "ewc_loss": 0.03533168137073517, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.53316827386152e-05, "grad_norm": 19.987281799316406, "learning_rate": 1e-06, "loss": 0.3644, "mean_token_accuracy": 0.8842164874076843, "num_tokens": 894722768.0, "step": 23454 }, { "epoch": 2.9837170843404146, "ewc_loss": 0.03543876111507416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.543875936884433e-05, "grad_norm": 19.890335083007812, "learning_rate": 1e-06, "loss": 0.3582, "mean_token_accuracy": 0.8878297805786133, "num_tokens": 894759216.0, "step": 23455 }, { "epoch": 2.983844294619005, "ewc_loss": 0.03537623584270477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.537623706506565e-05, "grad_norm": 19.95174217224121, "learning_rate": 1e-06, "loss": 0.3699, "mean_token_accuracy": 0.8812127709388733, "num_tokens": 894799691.0, "step": 23456 }, { "epoch": 2.9839715048975957, "ewc_loss": 0.03537827357649803, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5378274333197623e-05, "grad_norm": 19.884389877319336, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8724676370620728, "num_tokens": 894835869.0, "step": 23457 }, { "epoch": 2.984098715176186, "ewc_loss": 0.03534911945462227, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534911957103759e-05, "grad_norm": 19.99312400817871, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8543479442596436, "num_tokens": 894875706.0, "step": 23458 }, { "epoch": 2.9842259254547767, "ewc_loss": 0.035359714180231094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535971336532384e-05, "grad_norm": 19.894189834594727, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.8812634944915771, "num_tokens": 894920135.0, "step": 23459 }, { "epoch": 2.9843531357333672, "ewc_loss": 0.035346582531929016, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534658389980905e-05, "grad_norm": 20.001916885375977, "learning_rate": 1e-06, "loss": 0.3791, "mean_token_accuracy": 0.8778743743896484, "num_tokens": 894960825.0, "step": 23460 }, { "epoch": 2.9844803460119578, "ewc_loss": 0.03539193049073219, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5391931305639446e-05, "grad_norm": 19.962078094482422, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8781834840774536, "num_tokens": 895001935.0, "step": 23461 }, { "epoch": 2.9846075562905483, "ewc_loss": 0.035285722464323044, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.528572415234521e-05, "grad_norm": 19.95521354675293, "learning_rate": 1e-06, "loss": 0.353, "mean_token_accuracy": 0.8851977586746216, "num_tokens": 895033559.0, "step": 23462 }, { "epoch": 2.984734766569139, "ewc_loss": 0.03522421047091484, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5224209568696097e-05, "grad_norm": 19.93858528137207, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8760550022125244, "num_tokens": 895070958.0, "step": 23463 }, { "epoch": 2.9848619768477294, "ewc_loss": 0.03534029424190521, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5340293834451586e-05, "grad_norm": 19.95527458190918, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8747042417526245, "num_tokens": 895111533.0, "step": 23464 }, { "epoch": 2.98498918712632, "ewc_loss": 0.035271868109703064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5271867091069e-05, "grad_norm": 20.004541397094727, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.872573971748352, "num_tokens": 895149861.0, "step": 23465 }, { "epoch": 2.9851163974049104, "ewc_loss": 0.03532644733786583, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5326447687111795e-05, "grad_norm": 19.916452407836914, "learning_rate": 1e-06, "loss": 0.3679, "mean_token_accuracy": 0.8825828433036804, "num_tokens": 895192958.0, "step": 23466 }, { "epoch": 2.985243607683501, "ewc_loss": 0.035269517451524734, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526951695675962e-05, "grad_norm": 19.947649002075195, "learning_rate": 1e-06, "loss": 0.362, "mean_token_accuracy": 0.885908842086792, "num_tokens": 895226710.0, "step": 23467 }, { "epoch": 2.9853708179620915, "ewc_loss": 0.03525181859731674, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5251818189863116e-05, "grad_norm": 19.918092727661133, "learning_rate": 1e-06, "loss": 0.3765, "mean_token_accuracy": 0.8810604810714722, "num_tokens": 895267695.0, "step": 23468 }, { "epoch": 2.985498028240682, "ewc_loss": 0.035305555909872055, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530555477482267e-05, "grad_norm": 20.063554763793945, "learning_rate": 1e-06, "loss": 0.3458, "mean_token_accuracy": 0.8924227356910706, "num_tokens": 895303101.0, "step": 23469 }, { "epoch": 2.9856252385192725, "ewc_loss": 0.035302430391311646, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5302429751027375e-05, "grad_norm": 19.957719802856445, "learning_rate": 1e-06, "loss": 0.3667, "mean_token_accuracy": 0.8821855783462524, "num_tokens": 895340437.0, "step": 23470 }, { "epoch": 2.9857524487978626, "ewc_loss": 0.035252850502729416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525285137584433e-05, "grad_norm": 20.004840850830078, "learning_rate": 1e-06, "loss": 0.3493, "mean_token_accuracy": 0.8903592228889465, "num_tokens": 895377695.0, "step": 23471 }, { "epoch": 2.9858796590764536, "ewc_loss": 0.035357847809791565, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5357847082195804e-05, "grad_norm": 20.06720733642578, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8669342994689941, "num_tokens": 895418599.0, "step": 23472 }, { "epoch": 2.9860068693550437, "ewc_loss": 0.03526836633682251, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5268367355456576e-05, "grad_norm": 19.9318790435791, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8750784397125244, "num_tokens": 895454832.0, "step": 23473 }, { "epoch": 2.9861340796336346, "ewc_loss": 0.03523861989378929, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523861960275099e-05, "grad_norm": 19.97233772277832, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8605256080627441, "num_tokens": 895490607.0, "step": 23474 }, { "epoch": 2.9862612899122247, "ewc_loss": 0.03532605245709419, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532605114742182e-05, "grad_norm": 20.00603675842285, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8700790405273438, "num_tokens": 895531340.0, "step": 23475 }, { "epoch": 2.9863885001908153, "ewc_loss": 0.03527145832777023, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5271459637442604e-05, "grad_norm": 19.86781883239746, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8765702247619629, "num_tokens": 895568688.0, "step": 23476 }, { "epoch": 2.986515710469406, "ewc_loss": 0.035317517817020416, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5317516449140385e-05, "grad_norm": 19.96292495727539, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.866958498954773, "num_tokens": 895607316.0, "step": 23477 }, { "epoch": 2.9866429207479963, "ewc_loss": 0.03529585525393486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529585592332296e-05, "grad_norm": 19.879446029663086, "learning_rate": 1e-06, "loss": 0.3365, "mean_token_accuracy": 0.8929188251495361, "num_tokens": 895648694.0, "step": 23478 }, { "epoch": 2.986770131026587, "ewc_loss": 0.03524986281991005, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.524986459524371e-05, "grad_norm": 20.019845962524414, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8724607229232788, "num_tokens": 895687996.0, "step": 23479 }, { "epoch": 2.9868973413051774, "ewc_loss": 0.0353236086666584, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5323610063642263e-05, "grad_norm": 19.919099807739258, "learning_rate": 1e-06, "loss": 0.3742, "mean_token_accuracy": 0.8818475604057312, "num_tokens": 895726727.0, "step": 23480 }, { "epoch": 2.987024551583768, "ewc_loss": 0.03526555746793747, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52655588358175e-05, "grad_norm": 20.05190086364746, "learning_rate": 1e-06, "loss": 0.3657, "mean_token_accuracy": 0.8810534477233887, "num_tokens": 895760688.0, "step": 23481 }, { "epoch": 2.9871517618623584, "ewc_loss": 0.03534819930791855, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5348199162399396e-05, "grad_norm": 19.976511001586914, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8559209704399109, "num_tokens": 895797303.0, "step": 23482 }, { "epoch": 2.987278972140949, "ewc_loss": 0.03523187339305878, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.523187479004264e-05, "grad_norm": 19.9433536529541, "learning_rate": 1e-06, "loss": 0.3693, "mean_token_accuracy": 0.8825751543045044, "num_tokens": 895834078.0, "step": 23483 }, { "epoch": 2.9874061824195395, "ewc_loss": 0.03534206748008728, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534206916810945e-05, "grad_norm": 19.978303909301758, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8655327558517456, "num_tokens": 895872640.0, "step": 23484 }, { "epoch": 2.98753339269813, "ewc_loss": 0.03526736795902252, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5267366911284626e-05, "grad_norm": 19.808425903320312, "learning_rate": 1e-06, "loss": 0.3522, "mean_token_accuracy": 0.8902806043624878, "num_tokens": 895907646.0, "step": 23485 }, { "epoch": 2.9876606029767205, "ewc_loss": 0.03530210256576538, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530210233293474e-05, "grad_norm": 20.027023315429688, "learning_rate": 1e-06, "loss": 0.3628, "mean_token_accuracy": 0.8853198289871216, "num_tokens": 895944183.0, "step": 23486 }, { "epoch": 2.987787813255311, "ewc_loss": 0.03538161888718605, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.538161763572134e-05, "grad_norm": 19.938739776611328, "learning_rate": 1e-06, "loss": 0.3737, "mean_token_accuracy": 0.8802927136421204, "num_tokens": 895983987.0, "step": 23487 }, { "epoch": 2.9879150235339016, "ewc_loss": 0.03529901057481766, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.529901005094871e-05, "grad_norm": 19.98535919189453, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8655595779418945, "num_tokens": 896020928.0, "step": 23488 }, { "epoch": 2.988042233812492, "ewc_loss": 0.035328563302755356, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532856499077752e-05, "grad_norm": 19.941904067993164, "learning_rate": 1e-06, "loss": 0.3588, "mean_token_accuracy": 0.8830657005310059, "num_tokens": 896057459.0, "step": 23489 }, { "epoch": 2.9881694440910826, "ewc_loss": 0.03529003635048866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5290035157231614e-05, "grad_norm": 19.91672706604004, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8725704550743103, "num_tokens": 896095049.0, "step": 23490 }, { "epoch": 2.988296654369673, "ewc_loss": 0.03533884510397911, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5338845918886364e-05, "grad_norm": 19.96886444091797, "learning_rate": 1e-06, "loss": 0.3568, "mean_token_accuracy": 0.8885899782180786, "num_tokens": 896127433.0, "step": 23491 }, { "epoch": 2.9884238646482637, "ewc_loss": 0.035344142466783524, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534414281602949e-05, "grad_norm": 19.957204818725586, "learning_rate": 1e-06, "loss": 0.3726, "mean_token_accuracy": 0.8828384280204773, "num_tokens": 896165662.0, "step": 23492 }, { "epoch": 2.9885510749268542, "ewc_loss": 0.03536578267812729, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.536578151397407e-05, "grad_norm": 19.90485954284668, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8761255741119385, "num_tokens": 896207753.0, "step": 23493 }, { "epoch": 2.9886782852054448, "ewc_loss": 0.03530450910329819, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530451067490503e-05, "grad_norm": 20.00655746459961, "learning_rate": 1e-06, "loss": 0.3883, "mean_token_accuracy": 0.8758399486541748, "num_tokens": 896249105.0, "step": 23494 }, { "epoch": 2.9888054954840353, "ewc_loss": 0.03534063324332237, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5340632166480646e-05, "grad_norm": 19.887250900268555, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8776462078094482, "num_tokens": 896291728.0, "step": 23495 }, { "epoch": 2.9889327057626254, "ewc_loss": 0.0352647639811039, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5264765756437555e-05, "grad_norm": 20.031225204467773, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8668174743652344, "num_tokens": 896331729.0, "step": 23496 }, { "epoch": 2.9890599160412163, "ewc_loss": 0.03536280244588852, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.536280200933106e-05, "grad_norm": 19.82767677307129, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8661924600601196, "num_tokens": 896371804.0, "step": 23497 }, { "epoch": 2.9891871263198064, "ewc_loss": 0.035172466188669205, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.517246659612283e-05, "grad_norm": 19.976064682006836, "learning_rate": 1e-06, "loss": 0.386, "mean_token_accuracy": 0.8750717639923096, "num_tokens": 896408398.0, "step": 23498 }, { "epoch": 2.9893143365983974, "ewc_loss": 0.035481054335832596, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.548105451045558e-05, "grad_norm": 20.002439498901367, "learning_rate": 1e-06, "loss": 0.3544, "mean_token_accuracy": 0.8908807039260864, "num_tokens": 896440496.0, "step": 23499 }, { "epoch": 2.9894415468769875, "ewc_loss": 0.035262200981378555, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5262200981378555e-05, "grad_norm": 19.93122673034668, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8776395320892334, "num_tokens": 896475212.0, "step": 23500 }, { "epoch": 2.989568757155578, "ewc_loss": 0.03531178832054138, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5311786632519215e-05, "grad_norm": 19.91766357421875, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8736926913261414, "num_tokens": 896517122.0, "step": 23501 }, { "epoch": 2.9896959674341685, "ewc_loss": 0.035298798233270645, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52987990481779e-05, "grad_norm": 19.88145637512207, "learning_rate": 1e-06, "loss": 0.3653, "mean_token_accuracy": 0.8840060830116272, "num_tokens": 896557259.0, "step": 23502 }, { "epoch": 2.989823177712759, "ewc_loss": 0.03536694869399071, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.536694930517115e-05, "grad_norm": 20.001991271972656, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8678520917892456, "num_tokens": 896600237.0, "step": 23503 }, { "epoch": 2.9899503879913496, "ewc_loss": 0.03530298173427582, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5302982723806053e-05, "grad_norm": 19.845535278320312, "learning_rate": 1e-06, "loss": 0.3533, "mean_token_accuracy": 0.8883833885192871, "num_tokens": 896640564.0, "step": 23504 }, { "epoch": 2.99007759826994, "ewc_loss": 0.03526059165596962, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.526059299474582e-05, "grad_norm": 19.838850021362305, "learning_rate": 1e-06, "loss": 0.3471, "mean_token_accuracy": 0.8863656520843506, "num_tokens": 896674164.0, "step": 23505 }, { "epoch": 2.9902048085485307, "ewc_loss": 0.03538423776626587, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.538423698046245e-05, "grad_norm": 19.925416946411133, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8716350197792053, "num_tokens": 896716613.0, "step": 23506 }, { "epoch": 2.990332018827121, "ewc_loss": 0.03538884222507477, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5388842661632225e-05, "grad_norm": 19.944786071777344, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8660000562667847, "num_tokens": 896755586.0, "step": 23507 }, { "epoch": 2.9904592291057117, "ewc_loss": 0.03535710647702217, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535710493451916e-05, "grad_norm": 19.94068717956543, "learning_rate": 1e-06, "loss": 0.387, "mean_token_accuracy": 0.8760772943496704, "num_tokens": 896797641.0, "step": 23508 }, { "epoch": 2.9905864393843022, "ewc_loss": 0.03532237559556961, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532237678882666e-05, "grad_norm": 19.84050178527832, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.872708797454834, "num_tokens": 896835113.0, "step": 23509 }, { "epoch": 2.9907136496628928, "ewc_loss": 0.03539411723613739, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.539411773090251e-05, "grad_norm": 19.991352081298828, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8765371441841125, "num_tokens": 896866496.0, "step": 23510 }, { "epoch": 2.9908408599414833, "ewc_loss": 0.035394586622714996, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.539458703016862e-05, "grad_norm": 19.902721405029297, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8752230405807495, "num_tokens": 896907535.0, "step": 23511 }, { "epoch": 2.990968070220074, "ewc_loss": 0.03533002361655235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5330023820279166e-05, "grad_norm": 19.946901321411133, "learning_rate": 1e-06, "loss": 0.3328, "mean_token_accuracy": 0.8931035995483398, "num_tokens": 896948574.0, "step": 23512 }, { "epoch": 2.9910952804986644, "ewc_loss": 0.03542064130306244, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5420642234385014e-05, "grad_norm": 19.915058135986328, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8706457018852234, "num_tokens": 896992377.0, "step": 23513 }, { "epoch": 2.991222490777255, "ewc_loss": 0.03537208214402199, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5372082493267953e-05, "grad_norm": 19.907859802246094, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8720603585243225, "num_tokens": 897035411.0, "step": 23514 }, { "epoch": 2.9913497010558454, "ewc_loss": 0.03536377102136612, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.536376971169375e-05, "grad_norm": 19.958810806274414, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8785439729690552, "num_tokens": 897074661.0, "step": 23515 }, { "epoch": 2.991476911334436, "ewc_loss": 0.03532824665307999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.53282448486425e-05, "grad_norm": 19.903186798095703, "learning_rate": 1e-06, "loss": 0.3884, "mean_token_accuracy": 0.8788968324661255, "num_tokens": 897115550.0, "step": 23516 }, { "epoch": 2.9916041216130265, "ewc_loss": 0.035348616540431976, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534861752996221e-05, "grad_norm": 19.85717010498047, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8699783682823181, "num_tokens": 897155091.0, "step": 23517 }, { "epoch": 2.991731331891617, "ewc_loss": 0.0353190116584301, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.53190116584301e-05, "grad_norm": 19.88884735107422, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8713754415512085, "num_tokens": 897196551.0, "step": 23518 }, { "epoch": 2.991858542170207, "ewc_loss": 0.03536472097039223, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.53647192241624e-05, "grad_norm": 19.891828536987305, "learning_rate": 1e-06, "loss": 0.3487, "mean_token_accuracy": 0.8864914774894714, "num_tokens": 897231885.0, "step": 23519 }, { "epoch": 2.991985752448798, "ewc_loss": 0.035366903990507126, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.536690564942546e-05, "grad_norm": 20.00032615661621, "learning_rate": 1e-06, "loss": 0.3906, "mean_token_accuracy": 0.877079963684082, "num_tokens": 897267944.0, "step": 23520 }, { "epoch": 2.992112962727388, "ewc_loss": 0.03544088080525398, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5440880310488865e-05, "grad_norm": 19.9184627532959, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.8788801431655884, "num_tokens": 897307330.0, "step": 23521 }, { "epoch": 2.992240173005979, "ewc_loss": 0.03527458757162094, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.52745882992167e-05, "grad_norm": 19.849340438842773, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.8793331384658813, "num_tokens": 897344028.0, "step": 23522 }, { "epoch": 2.992367383284569, "ewc_loss": 0.03536712005734444, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.536712029017508e-05, "grad_norm": 19.884387969970703, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8686144948005676, "num_tokens": 897391213.0, "step": 23523 }, { "epoch": 2.99249459356316, "ewc_loss": 0.03534004092216492, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534003917593509e-05, "grad_norm": 19.853830337524414, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8703026175498962, "num_tokens": 897428222.0, "step": 23524 }, { "epoch": 2.9926218038417502, "ewc_loss": 0.035411737859249115, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.541173646226525e-05, "grad_norm": 19.90968894958496, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8664867877960205, "num_tokens": 897457737.0, "step": 23525 }, { "epoch": 2.992749014120341, "ewc_loss": 0.03545563668012619, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.545563595253043e-05, "grad_norm": 19.86073875427246, "learning_rate": 1e-06, "loss": 0.3637, "mean_token_accuracy": 0.882839560508728, "num_tokens": 897498925.0, "step": 23526 }, { "epoch": 2.9928762243989313, "ewc_loss": 0.03536771237850189, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5367713280720636e-05, "grad_norm": 19.972379684448242, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8719905614852905, "num_tokens": 897533157.0, "step": 23527 }, { "epoch": 2.993003434677522, "ewc_loss": 0.03548847511410713, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5488475987222046e-05, "grad_norm": 19.916879653930664, "learning_rate": 1e-06, "loss": 0.3805, "mean_token_accuracy": 0.878951907157898, "num_tokens": 897574159.0, "step": 23528 }, { "epoch": 2.9931306449561124, "ewc_loss": 0.03535245358943939, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5352451959624887e-05, "grad_norm": 19.825458526611328, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8684186935424805, "num_tokens": 897606970.0, "step": 23529 }, { "epoch": 2.993257855234703, "ewc_loss": 0.0353880301117897, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5388031392358243e-05, "grad_norm": 19.902790069580078, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8533843159675598, "num_tokens": 897647725.0, "step": 23530 }, { "epoch": 2.9933850655132934, "ewc_loss": 0.035445693880319595, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.544569335645065e-05, "grad_norm": 19.8818359375, "learning_rate": 1e-06, "loss": 0.3763, "mean_token_accuracy": 0.8798902034759521, "num_tokens": 897687109.0, "step": 23531 }, { "epoch": 2.993512275791884, "ewc_loss": 0.03543397784233093, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.543397906469181e-05, "grad_norm": 19.948104858398438, "learning_rate": 1e-06, "loss": 0.3589, "mean_token_accuracy": 0.8865681886672974, "num_tokens": 897720994.0, "step": 23532 }, { "epoch": 2.9936394860704745, "ewc_loss": 0.03546188399195671, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5461882362142205e-05, "grad_norm": 19.91686248779297, "learning_rate": 1e-06, "loss": 0.3377, "mean_token_accuracy": 0.893033504486084, "num_tokens": 897749855.0, "step": 23533 }, { "epoch": 2.993766696349065, "ewc_loss": 0.03542516008019447, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.542516060406342e-05, "grad_norm": 19.96567726135254, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8757641315460205, "num_tokens": 897792846.0, "step": 23534 }, { "epoch": 2.9938939066276555, "ewc_loss": 0.03546050563454628, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5460507206153125e-05, "grad_norm": 19.905635833740234, "learning_rate": 1e-06, "loss": 0.377, "mean_token_accuracy": 0.8796055316925049, "num_tokens": 897836765.0, "step": 23535 }, { "epoch": 2.994021116906246, "ewc_loss": 0.03539681434631348, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.539681347319856e-05, "grad_norm": 19.926284790039062, "learning_rate": 1e-06, "loss": 0.3866, "mean_token_accuracy": 0.8767356872558594, "num_tokens": 897880918.0, "step": 23536 }, { "epoch": 2.9941483271848366, "ewc_loss": 0.03544222563505173, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.544222636264749e-05, "grad_norm": 19.830493927001953, "learning_rate": 1e-06, "loss": 0.3898, "mean_token_accuracy": 0.875176727771759, "num_tokens": 897916676.0, "step": 23537 }, { "epoch": 2.994275537463427, "ewc_loss": 0.035396598279476166, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5396598832448944e-05, "grad_norm": 19.897655487060547, "learning_rate": 1e-06, "loss": 0.369, "mean_token_accuracy": 0.8826155662536621, "num_tokens": 897953968.0, "step": 23538 }, { "epoch": 2.9944027477420176, "ewc_loss": 0.03555203974246979, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.555203875293955e-05, "grad_norm": 19.960493087768555, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8756719827651978, "num_tokens": 897993055.0, "step": 23539 }, { "epoch": 2.994529958020608, "ewc_loss": 0.0354779027402401, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.547790402080864e-05, "grad_norm": 19.92605972290039, "learning_rate": 1e-06, "loss": 0.3595, "mean_token_accuracy": 0.8836628198623657, "num_tokens": 898030202.0, "step": 23540 }, { "epoch": 2.9946571682991987, "ewc_loss": 0.03550399839878082, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5503999242791906e-05, "grad_norm": 19.955581665039062, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8719797730445862, "num_tokens": 898067234.0, "step": 23541 }, { "epoch": 2.9947843785777892, "ewc_loss": 0.0354810394346714, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.548103995854035e-05, "grad_norm": 19.979843139648438, "learning_rate": 1e-06, "loss": 0.3697, "mean_token_accuracy": 0.8812236189842224, "num_tokens": 898105423.0, "step": 23542 }, { "epoch": 2.9949115888563798, "ewc_loss": 0.03547630459070206, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.547630331013352e-05, "grad_norm": 19.92011070251465, "learning_rate": 1e-06, "loss": 0.324, "mean_token_accuracy": 0.8957115411758423, "num_tokens": 898144550.0, "step": 23543 }, { "epoch": 2.99503879913497, "ewc_loss": 0.035386085510253906, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.538608507369645e-05, "grad_norm": 19.94347381591797, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8798670768737793, "num_tokens": 898179277.0, "step": 23544 }, { "epoch": 2.995166009413561, "ewc_loss": 0.035474929958581924, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5474928154144436e-05, "grad_norm": 19.988685607910156, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.8741186261177063, "num_tokens": 898218518.0, "step": 23545 }, { "epoch": 2.995293219692151, "ewc_loss": 0.03540683537721634, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.54068361048121e-05, "grad_norm": 19.974838256835938, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8644348382949829, "num_tokens": 898259266.0, "step": 23546 }, { "epoch": 2.995420429970742, "ewc_loss": 0.03540819138288498, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5408193070907146e-05, "grad_norm": 19.959651947021484, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.876277506351471, "num_tokens": 898292648.0, "step": 23547 }, { "epoch": 2.995547640249332, "ewc_loss": 0.035419706255197525, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.541970727383159e-05, "grad_norm": 19.956693649291992, "learning_rate": 1e-06, "loss": 0.3845, "mean_token_accuracy": 0.8809154033660889, "num_tokens": 898328413.0, "step": 23548 }, { "epoch": 2.995674850527923, "ewc_loss": 0.03537553921341896, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.537553857313469e-05, "grad_norm": 19.91997718811035, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8761371374130249, "num_tokens": 898376963.0, "step": 23549 }, { "epoch": 2.995802060806513, "ewc_loss": 0.03539138287305832, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5391381970839575e-05, "grad_norm": 19.92940330505371, "learning_rate": 1e-06, "loss": 0.3487, "mean_token_accuracy": 0.8883929252624512, "num_tokens": 898407256.0, "step": 23550 }, { "epoch": 2.9959292710851035, "ewc_loss": 0.0353536456823349, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.535364521667361e-05, "grad_norm": 19.931791305541992, "learning_rate": 1e-06, "loss": 0.3877, "mean_token_accuracy": 0.8801501989364624, "num_tokens": 898448800.0, "step": 23551 }, { "epoch": 2.996056481363694, "ewc_loss": 0.03544283285737038, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.544283390510827e-05, "grad_norm": 20.00528335571289, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8739855289459229, "num_tokens": 898488912.0, "step": 23552 }, { "epoch": 2.9961836916422846, "ewc_loss": 0.03540777415037155, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.540777470334433e-05, "grad_norm": 20.009471893310547, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8707654476165771, "num_tokens": 898530434.0, "step": 23553 }, { "epoch": 2.996310901920875, "ewc_loss": 0.03536008298397064, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5360084439162165e-05, "grad_norm": 19.896860122680664, "learning_rate": 1e-06, "loss": 0.3816, "mean_token_accuracy": 0.8780468702316284, "num_tokens": 898565044.0, "step": 23554 }, { "epoch": 2.9964381121994657, "ewc_loss": 0.035330045968294144, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.533004564815201e-05, "grad_norm": 19.94090461730957, "learning_rate": 1e-06, "loss": 0.3591, "mean_token_accuracy": 0.8864399194717407, "num_tokens": 898601798.0, "step": 23555 }, { "epoch": 2.996565322478056, "ewc_loss": 0.03539092466235161, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.539092358550988e-05, "grad_norm": 19.91839599609375, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8671172857284546, "num_tokens": 898644789.0, "step": 23556 }, { "epoch": 2.9966925327566467, "ewc_loss": 0.03533410653471947, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.533410563250072e-05, "grad_norm": 19.9397029876709, "learning_rate": 1e-06, "loss": 0.3879, "mean_token_accuracy": 0.8763620853424072, "num_tokens": 898683928.0, "step": 23557 }, { "epoch": 2.9968197430352372, "ewc_loss": 0.035458311438560486, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.545830986695364e-05, "grad_norm": 20.011890411376953, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8721563816070557, "num_tokens": 898726217.0, "step": 23558 }, { "epoch": 2.9969469533138278, "ewc_loss": 0.0352729856967926, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5272983950562775e-05, "grad_norm": 19.891727447509766, "learning_rate": 1e-06, "loss": 0.3564, "mean_token_accuracy": 0.8865662813186646, "num_tokens": 898760575.0, "step": 23559 }, { "epoch": 2.9970741635924183, "ewc_loss": 0.035349443554878235, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534944335115142e-05, "grad_norm": 19.919994354248047, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8725666403770447, "num_tokens": 898801232.0, "step": 23560 }, { "epoch": 2.997201373871009, "ewc_loss": 0.03544241935014725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5442419175524265e-05, "grad_norm": 20.02534294128418, "learning_rate": 1e-06, "loss": 0.3625, "mean_token_accuracy": 0.8827739357948303, "num_tokens": 898837174.0, "step": 23561 }, { "epoch": 2.9973285841495994, "ewc_loss": 0.03536098822951317, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5360986657906324e-05, "grad_norm": 19.886890411376953, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8686568737030029, "num_tokens": 898875849.0, "step": 23562 }, { "epoch": 2.99745579442819, "ewc_loss": 0.03536902740597725, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5369026591069996e-05, "grad_norm": 20.067668914794922, "learning_rate": 1e-06, "loss": 0.3635, "mean_token_accuracy": 0.8851780295372009, "num_tokens": 898909975.0, "step": 23563 }, { "epoch": 2.9975830047067804, "ewc_loss": 0.03543771058320999, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.543771163094789e-05, "grad_norm": 19.957271575927734, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8709431886672974, "num_tokens": 898948250.0, "step": 23564 }, { "epoch": 2.997710214985371, "ewc_loss": 0.03525266423821449, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.525266583892517e-05, "grad_norm": 19.94892692565918, "learning_rate": 1e-06, "loss": 0.3505, "mean_token_accuracy": 0.8931871056556702, "num_tokens": 898989122.0, "step": 23565 }, { "epoch": 2.9978374252639615, "ewc_loss": 0.035413794219493866, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.541379555827007e-05, "grad_norm": 20.003028869628906, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8722927570343018, "num_tokens": 899028715.0, "step": 23566 }, { "epoch": 2.997964635542552, "ewc_loss": 0.03532588481903076, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5325883800396696e-05, "grad_norm": 20.005470275878906, "learning_rate": 1e-06, "loss": 0.3737, "mean_token_accuracy": 0.8799312710762024, "num_tokens": 899064101.0, "step": 23567 }, { "epoch": 2.9980918458211425, "ewc_loss": 0.0353056900203228, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.530568938003853e-05, "grad_norm": 19.926021575927734, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8803917169570923, "num_tokens": 899101759.0, "step": 23568 }, { "epoch": 2.9982190560997326, "ewc_loss": 0.03534875437617302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534875577315688e-05, "grad_norm": 19.955347061157227, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8701410293579102, "num_tokens": 899142273.0, "step": 23569 }, { "epoch": 2.9983462663783236, "ewc_loss": 0.03532201424241066, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532201299094595e-05, "grad_norm": 20.001876831054688, "learning_rate": 1e-06, "loss": 0.3821, "mean_token_accuracy": 0.878768265247345, "num_tokens": 899175878.0, "step": 23570 }, { "epoch": 2.9984734766569137, "ewc_loss": 0.03531398996710777, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.531399124767631e-05, "grad_norm": 19.905948638916016, "learning_rate": 1e-06, "loss": 0.3469, "mean_token_accuracy": 0.8869873285293579, "num_tokens": 899208937.0, "step": 23571 }, { "epoch": 2.9986006869355046, "ewc_loss": 0.03530692309141159, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5306922654854134e-05, "grad_norm": 20.0041446685791, "learning_rate": 1e-06, "loss": 0.3874, "mean_token_accuracy": 0.8760817646980286, "num_tokens": 899244088.0, "step": 23572 }, { "epoch": 2.9987278972140947, "ewc_loss": 0.03536752611398697, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5367527743801475e-05, "grad_norm": 19.956188201904297, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8718593120574951, "num_tokens": 899280111.0, "step": 23573 }, { "epoch": 2.9988551074926852, "ewc_loss": 0.03528553247451782, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5285531339468434e-05, "grad_norm": 20.016510009765625, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8765894174575806, "num_tokens": 899310882.0, "step": 23574 }, { "epoch": 2.9989823177712758, "ewc_loss": 0.03540854901075363, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.540854959283024e-05, "grad_norm": 19.913301467895508, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8635838031768799, "num_tokens": 899346820.0, "step": 23575 }, { "epoch": 2.9991095280498663, "ewc_loss": 0.03531639277935028, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5316392313688993e-05, "grad_norm": 19.954683303833008, "learning_rate": 1e-06, "loss": 0.3465, "mean_token_accuracy": 0.8903300762176514, "num_tokens": 899382044.0, "step": 23576 }, { "epoch": 2.999236738328457, "ewc_loss": 0.035438477993011475, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5438479244476184e-05, "grad_norm": 20.051734924316406, "learning_rate": 1e-06, "loss": 0.3779, "mean_token_accuracy": 0.8771319389343262, "num_tokens": 899418389.0, "step": 23577 }, { "epoch": 2.9993639486070474, "ewc_loss": 0.035431668162345886, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.543166894814931e-05, "grad_norm": 20.01232147216797, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.875200629234314, "num_tokens": 899458486.0, "step": 23578 }, { "epoch": 2.999491158885638, "ewc_loss": 0.035331644117832184, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.533164272084832e-05, "grad_norm": 19.959548950195312, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.876164436340332, "num_tokens": 899498168.0, "step": 23579 }, { "epoch": 2.9996183691642284, "ewc_loss": 0.03532453998923302, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.532454138621688e-05, "grad_norm": 19.925554275512695, "learning_rate": 1e-06, "loss": 0.3373, "mean_token_accuracy": 0.8932626843452454, "num_tokens": 899538655.0, "step": 23580 }, { "epoch": 2.999745579442819, "ewc_loss": 0.035345934331417084, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.534593270160258e-05, "grad_norm": 19.989835739135742, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8660637140274048, "num_tokens": 899581693.0, "step": 23581 }, { "epoch": 2.9998727897214095, "ewc_loss": 0.035424910485744476, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.542490958352573e-05, "grad_norm": 19.953115463256836, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8738547563552856, "num_tokens": 899623817.0, "step": 23582 }, { "epoch": 3.0, "ewc_loss": 0.03533089905977249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5330900573171675e-05, "grad_norm": 19.889055252075195, "learning_rate": 1e-06, "loss": 0.3761, "mean_token_accuracy": 0.8802847862243652, "num_tokens": 899664226.0, "step": 23583 }, { "epoch": 3.0, "ewc_loss": 0.03533089905977249, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 3.5330900573171675e-05, "step": 23583, "total_flos": 5.62815163329864e+19, "train_loss": 0.42502481960762, "train_runtime": 45533.7805, "train_samples_per_second": 8.286, "train_steps_per_second": 0.518 } ], "logging_steps": 1, "max_steps": 23583, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 11792, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.62815163329864e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }