{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1966, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000508646998982706, "grad_norm": 32.11302947998047, "learning_rate": 0.0, "loss": 1.2961, "mean_token_accuracy": 0.7449109554290771, "num_tokens": 156200.0, "step": 1 }, { "epoch": 0.001017293997965412, "grad_norm": 31.278301239013672, "learning_rate": 5.076142131979696e-08, "loss": 1.2669, "mean_token_accuracy": 0.7504401206970215, "num_tokens": 314183.0, "step": 2 }, { "epoch": 0.001525940996948118, "grad_norm": 30.861223220825195, "learning_rate": 1.0152284263959391e-07, "loss": 1.3067, "mean_token_accuracy": 0.7393251657485962, "num_tokens": 470040.0, "step": 3 }, { "epoch": 0.002034587995930824, "grad_norm": 31.689613342285156, "learning_rate": 1.5228426395939088e-07, "loss": 1.3077, "mean_token_accuracy": 0.7398750185966492, "num_tokens": 627227.0, "step": 4 }, { "epoch": 0.00254323499491353, "grad_norm": 31.292264938354492, "learning_rate": 2.0304568527918783e-07, "loss": 1.3225, "mean_token_accuracy": 0.7381956577301025, "num_tokens": 785176.0, "step": 5 }, { "epoch": 0.003051881993896236, "grad_norm": 32.40222930908203, "learning_rate": 2.538071065989848e-07, "loss": 1.3197, "mean_token_accuracy": 0.7405046224594116, "num_tokens": 926658.0, "step": 6 }, { "epoch": 0.003560528992878942, "grad_norm": 31.02337074279785, "learning_rate": 3.0456852791878176e-07, "loss": 1.3316, "mean_token_accuracy": 0.7354226112365723, "num_tokens": 1089803.0, "step": 7 }, { "epoch": 0.004069175991861648, "grad_norm": 30.440677642822266, "learning_rate": 3.553299492385787e-07, "loss": 1.268, "mean_token_accuracy": 0.7473543882369995, "num_tokens": 1253116.0, "step": 8 }, { "epoch": 0.004577822990844354, "grad_norm": 29.12032699584961, "learning_rate": 4.0609137055837566e-07, "loss": 1.2602, "mean_token_accuracy": 0.7476384043693542, "num_tokens": 1419240.0, "step": 9 }, { "epoch": 0.00508646998982706, "grad_norm": 29.42680549621582, "learning_rate": 4.568527918781726e-07, "loss": 1.2725, "mean_token_accuracy": 0.7454349994659424, "num_tokens": 1582136.0, "step": 10 }, { "epoch": 0.005595116988809766, "grad_norm": 28.261323928833008, "learning_rate": 5.076142131979696e-07, "loss": 1.2612, "mean_token_accuracy": 0.744758129119873, "num_tokens": 1738808.0, "step": 11 }, { "epoch": 0.006103763987792472, "grad_norm": 27.25571632385254, "learning_rate": 5.583756345177665e-07, "loss": 1.2428, "mean_token_accuracy": 0.7447201609611511, "num_tokens": 1899325.0, "step": 12 }, { "epoch": 0.006612410986775178, "grad_norm": 26.683231353759766, "learning_rate": 6.091370558375635e-07, "loss": 1.1995, "mean_token_accuracy": 0.7522713541984558, "num_tokens": 2067149.0, "step": 13 }, { "epoch": 0.007121057985757884, "grad_norm": 19.833831787109375, "learning_rate": 6.598984771573605e-07, "loss": 1.2018, "mean_token_accuracy": 0.7373824715614319, "num_tokens": 2225992.0, "step": 14 }, { "epoch": 0.0076297049847405905, "grad_norm": 20.34471893310547, "learning_rate": 7.106598984771574e-07, "loss": 1.1661, "mean_token_accuracy": 0.7451382875442505, "num_tokens": 2391475.0, "step": 15 }, { "epoch": 0.008138351983723296, "grad_norm": 17.260560989379883, "learning_rate": 7.614213197969544e-07, "loss": 1.133, "mean_token_accuracy": 0.747637152671814, "num_tokens": 2564876.0, "step": 16 }, { "epoch": 0.008646998982706003, "grad_norm": 16.89604377746582, "learning_rate": 8.121827411167513e-07, "loss": 1.167, "mean_token_accuracy": 0.7356365919113159, "num_tokens": 2728912.0, "step": 17 }, { "epoch": 0.009155645981688708, "grad_norm": 17.789722442626953, "learning_rate": 8.629441624365482e-07, "loss": 1.1113, "mean_token_accuracy": 0.752740204334259, "num_tokens": 2885094.0, "step": 18 }, { "epoch": 0.009664292980671414, "grad_norm": 9.220502853393555, "learning_rate": 9.137055837563452e-07, "loss": 1.0571, "mean_token_accuracy": 0.7442377805709839, "num_tokens": 3043505.0, "step": 19 }, { "epoch": 0.01017293997965412, "grad_norm": 6.030149459838867, "learning_rate": 9.644670050761422e-07, "loss": 0.9415, "mean_token_accuracy": 0.7668419480323792, "num_tokens": 3200547.0, "step": 20 }, { "epoch": 0.010681586978636826, "grad_norm": 5.80075216293335, "learning_rate": 1.0152284263959392e-06, "loss": 1.027, "mean_token_accuracy": 0.746906578540802, "num_tokens": 3352567.0, "step": 21 }, { "epoch": 0.011190233977619531, "grad_norm": 5.6724772453308105, "learning_rate": 1.0659898477157362e-06, "loss": 1.0436, "mean_token_accuracy": 0.7409395575523376, "num_tokens": 3522299.0, "step": 22 }, { "epoch": 0.011698880976602238, "grad_norm": 5.231161594390869, "learning_rate": 1.116751269035533e-06, "loss": 1.0285, "mean_token_accuracy": 0.7423572540283203, "num_tokens": 3678029.0, "step": 23 }, { "epoch": 0.012207527975584944, "grad_norm": 4.966853141784668, "learning_rate": 1.16751269035533e-06, "loss": 0.9824, "mean_token_accuracy": 0.7529760599136353, "num_tokens": 3843052.0, "step": 24 }, { "epoch": 0.01271617497456765, "grad_norm": 4.760066509246826, "learning_rate": 1.218274111675127e-06, "loss": 0.979, "mean_token_accuracy": 0.7546529769897461, "num_tokens": 4003590.0, "step": 25 }, { "epoch": 0.013224821973550356, "grad_norm": 5.28239631652832, "learning_rate": 1.2690355329949238e-06, "loss": 0.9364, "mean_token_accuracy": 0.7599929571151733, "num_tokens": 4163923.0, "step": 26 }, { "epoch": 0.013733468972533061, "grad_norm": 6.529881477355957, "learning_rate": 1.319796954314721e-06, "loss": 0.9794, "mean_token_accuracy": 0.7488726377487183, "num_tokens": 4329915.0, "step": 27 }, { "epoch": 0.014242115971515769, "grad_norm": 6.9578962326049805, "learning_rate": 1.3705583756345178e-06, "loss": 0.9386, "mean_token_accuracy": 0.7588946223258972, "num_tokens": 4501785.0, "step": 28 }, { "epoch": 0.014750762970498474, "grad_norm": 6.696822643280029, "learning_rate": 1.4213197969543148e-06, "loss": 0.9373, "mean_token_accuracy": 0.7576085329055786, "num_tokens": 4665203.0, "step": 29 }, { "epoch": 0.015259409969481181, "grad_norm": 6.5830302238464355, "learning_rate": 1.4720812182741118e-06, "loss": 0.9173, "mean_token_accuracy": 0.7623026371002197, "num_tokens": 4826248.0, "step": 30 }, { "epoch": 0.015768056968463885, "grad_norm": 5.765131950378418, "learning_rate": 1.5228426395939088e-06, "loss": 0.9028, "mean_token_accuracy": 0.7635811567306519, "num_tokens": 5002184.0, "step": 31 }, { "epoch": 0.01627670396744659, "grad_norm": 5.106930255889893, "learning_rate": 1.5736040609137056e-06, "loss": 0.919, "mean_token_accuracy": 0.7574912905693054, "num_tokens": 5159429.0, "step": 32 }, { "epoch": 0.0167853509664293, "grad_norm": 4.388010025024414, "learning_rate": 1.6243654822335026e-06, "loss": 0.9034, "mean_token_accuracy": 0.7649917602539062, "num_tokens": 5319981.0, "step": 33 }, { "epoch": 0.017293997965412006, "grad_norm": 3.827232837677002, "learning_rate": 1.6751269035532996e-06, "loss": 0.9191, "mean_token_accuracy": 0.7602884769439697, "num_tokens": 5470920.0, "step": 34 }, { "epoch": 0.01780264496439471, "grad_norm": 3.347505807876587, "learning_rate": 1.7258883248730964e-06, "loss": 0.9235, "mean_token_accuracy": 0.7586832642555237, "num_tokens": 5621279.0, "step": 35 }, { "epoch": 0.018311291963377416, "grad_norm": 2.964426279067993, "learning_rate": 1.7766497461928936e-06, "loss": 0.868, "mean_token_accuracy": 0.7696095705032349, "num_tokens": 5783722.0, "step": 36 }, { "epoch": 0.018819938962360123, "grad_norm": 2.932925224304199, "learning_rate": 1.8274111675126904e-06, "loss": 0.8643, "mean_token_accuracy": 0.7698394060134888, "num_tokens": 5946247.0, "step": 37 }, { "epoch": 0.019328585961342827, "grad_norm": 2.958730459213257, "learning_rate": 1.8781725888324874e-06, "loss": 0.8728, "mean_token_accuracy": 0.7667010426521301, "num_tokens": 6105483.0, "step": 38 }, { "epoch": 0.019837232960325534, "grad_norm": 2.769481658935547, "learning_rate": 1.9289340101522844e-06, "loss": 0.8852, "mean_token_accuracy": 0.7639551758766174, "num_tokens": 6267257.0, "step": 39 }, { "epoch": 0.02034587995930824, "grad_norm": 2.108130693435669, "learning_rate": 1.9796954314720814e-06, "loss": 0.8248, "mean_token_accuracy": 0.7764819860458374, "num_tokens": 6423575.0, "step": 40 }, { "epoch": 0.020854526958290945, "grad_norm": 1.7889447212219238, "learning_rate": 2.0304568527918785e-06, "loss": 0.8387, "mean_token_accuracy": 0.774937093257904, "num_tokens": 6580647.0, "step": 41 }, { "epoch": 0.021363173957273652, "grad_norm": 1.8017579317092896, "learning_rate": 2.0812182741116755e-06, "loss": 0.8172, "mean_token_accuracy": 0.7769420146942139, "num_tokens": 6740933.0, "step": 42 }, { "epoch": 0.02187182095625636, "grad_norm": 2.1292924880981445, "learning_rate": 2.1319796954314725e-06, "loss": 0.8129, "mean_token_accuracy": 0.7777750492095947, "num_tokens": 6888534.0, "step": 43 }, { "epoch": 0.022380467955239063, "grad_norm": 2.17268967628479, "learning_rate": 2.182741116751269e-06, "loss": 0.8078, "mean_token_accuracy": 0.7786726951599121, "num_tokens": 7053795.0, "step": 44 }, { "epoch": 0.02288911495422177, "grad_norm": 1.9331721067428589, "learning_rate": 2.233502538071066e-06, "loss": 0.8014, "mean_token_accuracy": 0.7789409160614014, "num_tokens": 7206610.0, "step": 45 }, { "epoch": 0.023397761953204477, "grad_norm": 1.6656444072723389, "learning_rate": 2.284263959390863e-06, "loss": 0.8457, "mean_token_accuracy": 0.7701550722122192, "num_tokens": 7375236.0, "step": 46 }, { "epoch": 0.023906408952187184, "grad_norm": 1.4546806812286377, "learning_rate": 2.33502538071066e-06, "loss": 0.7827, "mean_token_accuracy": 0.7828081846237183, "num_tokens": 7533110.0, "step": 47 }, { "epoch": 0.024415055951169887, "grad_norm": 1.4266563653945923, "learning_rate": 2.385786802030457e-06, "loss": 0.8063, "mean_token_accuracy": 0.777489185333252, "num_tokens": 7693804.0, "step": 48 }, { "epoch": 0.024923702950152594, "grad_norm": 1.5602530241012573, "learning_rate": 2.436548223350254e-06, "loss": 0.7513, "mean_token_accuracy": 0.793053388595581, "num_tokens": 7873397.0, "step": 49 }, { "epoch": 0.0254323499491353, "grad_norm": 1.5479062795639038, "learning_rate": 2.487309644670051e-06, "loss": 0.788, "mean_token_accuracy": 0.7837173938751221, "num_tokens": 8024175.0, "step": 50 }, { "epoch": 0.025940996948118005, "grad_norm": 1.4507423639297485, "learning_rate": 2.5380710659898476e-06, "loss": 0.7655, "mean_token_accuracy": 0.7892324328422546, "num_tokens": 8177438.0, "step": 51 }, { "epoch": 0.026449643947100712, "grad_norm": 1.288428544998169, "learning_rate": 2.588832487309645e-06, "loss": 0.7669, "mean_token_accuracy": 0.7886277437210083, "num_tokens": 8351946.0, "step": 52 }, { "epoch": 0.02695829094608342, "grad_norm": 1.4515126943588257, "learning_rate": 2.639593908629442e-06, "loss": 0.7712, "mean_token_accuracy": 0.7832987308502197, "num_tokens": 8504280.0, "step": 53 }, { "epoch": 0.027466937945066123, "grad_norm": 1.3418675661087036, "learning_rate": 2.6903553299492387e-06, "loss": 0.7384, "mean_token_accuracy": 0.7952812910079956, "num_tokens": 8661017.0, "step": 54 }, { "epoch": 0.02797558494404883, "grad_norm": 1.3066939115524292, "learning_rate": 2.7411167512690357e-06, "loss": 0.769, "mean_token_accuracy": 0.786482036113739, "num_tokens": 8819796.0, "step": 55 }, { "epoch": 0.028484231943031537, "grad_norm": 1.7970499992370605, "learning_rate": 2.7918781725888327e-06, "loss": 0.7347, "mean_token_accuracy": 0.7946780323982239, "num_tokens": 8975390.0, "step": 56 }, { "epoch": 0.02899287894201424, "grad_norm": 1.2918215990066528, "learning_rate": 2.8426395939086297e-06, "loss": 0.7557, "mean_token_accuracy": 0.7911810278892517, "num_tokens": 9128994.0, "step": 57 }, { "epoch": 0.029501525940996948, "grad_norm": 1.3276824951171875, "learning_rate": 2.8934010152284262e-06, "loss": 0.7342, "mean_token_accuracy": 0.7935407161712646, "num_tokens": 9280736.0, "step": 58 }, { "epoch": 0.030010172939979655, "grad_norm": 1.2609550952911377, "learning_rate": 2.9441624365482237e-06, "loss": 0.7755, "mean_token_accuracy": 0.7831066846847534, "num_tokens": 9431826.0, "step": 59 }, { "epoch": 0.030518819938962362, "grad_norm": 1.2028979063034058, "learning_rate": 2.9949238578680207e-06, "loss": 0.717, "mean_token_accuracy": 0.7988446950912476, "num_tokens": 9587259.0, "step": 60 }, { "epoch": 0.031027466937945065, "grad_norm": 1.1432055234909058, "learning_rate": 3.0456852791878177e-06, "loss": 0.7201, "mean_token_accuracy": 0.7961779236793518, "num_tokens": 9735856.0, "step": 61 }, { "epoch": 0.03153611393692777, "grad_norm": 1.147587537765503, "learning_rate": 3.0964467005076143e-06, "loss": 0.7267, "mean_token_accuracy": 0.7953379154205322, "num_tokens": 9903156.0, "step": 62 }, { "epoch": 0.03204476093591048, "grad_norm": 1.1921346187591553, "learning_rate": 3.1472081218274113e-06, "loss": 0.7245, "mean_token_accuracy": 0.7943313121795654, "num_tokens": 10054899.0, "step": 63 }, { "epoch": 0.03255340793489318, "grad_norm": 1.1654026508331299, "learning_rate": 3.1979695431472087e-06, "loss": 0.6968, "mean_token_accuracy": 0.803705096244812, "num_tokens": 10214147.0, "step": 64 }, { "epoch": 0.03306205493387589, "grad_norm": 1.1071702241897583, "learning_rate": 3.2487309644670053e-06, "loss": 0.6762, "mean_token_accuracy": 0.8057570457458496, "num_tokens": 10376357.0, "step": 65 }, { "epoch": 0.0335707019328586, "grad_norm": 1.199992299079895, "learning_rate": 3.2994923857868023e-06, "loss": 0.7448, "mean_token_accuracy": 0.7892211675643921, "num_tokens": 10528001.0, "step": 66 }, { "epoch": 0.0340793489318413, "grad_norm": 1.173454761505127, "learning_rate": 3.3502538071065993e-06, "loss": 0.742, "mean_token_accuracy": 0.7903908491134644, "num_tokens": 10697458.0, "step": 67 }, { "epoch": 0.03458799593082401, "grad_norm": 1.2794227600097656, "learning_rate": 3.4010152284263963e-06, "loss": 0.7566, "mean_token_accuracy": 0.7864009141921997, "num_tokens": 10852213.0, "step": 68 }, { "epoch": 0.035096642929806715, "grad_norm": 1.1894865036010742, "learning_rate": 3.451776649746193e-06, "loss": 0.7158, "mean_token_accuracy": 0.7968059778213501, "num_tokens": 11018742.0, "step": 69 }, { "epoch": 0.03560528992878942, "grad_norm": 1.1944557428359985, "learning_rate": 3.5025380710659903e-06, "loss": 0.7453, "mean_token_accuracy": 0.7894875407218933, "num_tokens": 11178241.0, "step": 70 }, { "epoch": 0.03611393692777213, "grad_norm": 1.1321941614151, "learning_rate": 3.5532994923857873e-06, "loss": 0.7, "mean_token_accuracy": 0.8021224737167358, "num_tokens": 11339213.0, "step": 71 }, { "epoch": 0.03662258392675483, "grad_norm": 1.185842514038086, "learning_rate": 3.6040609137055843e-06, "loss": 0.6916, "mean_token_accuracy": 0.801837146282196, "num_tokens": 11489548.0, "step": 72 }, { "epoch": 0.037131230925737536, "grad_norm": 1.1836117506027222, "learning_rate": 3.654822335025381e-06, "loss": 0.6982, "mean_token_accuracy": 0.8014888763427734, "num_tokens": 11656772.0, "step": 73 }, { "epoch": 0.03763987792472025, "grad_norm": 1.1263853311538696, "learning_rate": 3.705583756345178e-06, "loss": 0.6811, "mean_token_accuracy": 0.8046553730964661, "num_tokens": 11818390.0, "step": 74 }, { "epoch": 0.03814852492370295, "grad_norm": 1.2295253276824951, "learning_rate": 3.756345177664975e-06, "loss": 0.7333, "mean_token_accuracy": 0.791732907295227, "num_tokens": 11965717.0, "step": 75 }, { "epoch": 0.038657171922685654, "grad_norm": 1.1593804359436035, "learning_rate": 3.8071065989847715e-06, "loss": 0.7127, "mean_token_accuracy": 0.7967889904975891, "num_tokens": 12131732.0, "step": 76 }, { "epoch": 0.039165818921668365, "grad_norm": 1.1421594619750977, "learning_rate": 3.857868020304569e-06, "loss": 0.7309, "mean_token_accuracy": 0.7926912307739258, "num_tokens": 12283927.0, "step": 77 }, { "epoch": 0.03967446592065107, "grad_norm": 1.0763299465179443, "learning_rate": 3.9086294416243655e-06, "loss": 0.6936, "mean_token_accuracy": 0.8024134635925293, "num_tokens": 12436906.0, "step": 78 }, { "epoch": 0.04018311291963377, "grad_norm": 1.1294554471969604, "learning_rate": 3.959390862944163e-06, "loss": 0.7055, "mean_token_accuracy": 0.7977326512336731, "num_tokens": 12600258.0, "step": 79 }, { "epoch": 0.04069175991861648, "grad_norm": 1.1827266216278076, "learning_rate": 4.0101522842639595e-06, "loss": 0.7044, "mean_token_accuracy": 0.7989583015441895, "num_tokens": 12747689.0, "step": 80 }, { "epoch": 0.041200406917599186, "grad_norm": 1.1358517408370972, "learning_rate": 4.060913705583757e-06, "loss": 0.6928, "mean_token_accuracy": 0.802930474281311, "num_tokens": 12911145.0, "step": 81 }, { "epoch": 0.04170905391658189, "grad_norm": 1.1222280263900757, "learning_rate": 4.1116751269035535e-06, "loss": 0.6935, "mean_token_accuracy": 0.8010543584823608, "num_tokens": 13081387.0, "step": 82 }, { "epoch": 0.0422177009155646, "grad_norm": 1.1250072717666626, "learning_rate": 4.162436548223351e-06, "loss": 0.7126, "mean_token_accuracy": 0.7962907552719116, "num_tokens": 13243319.0, "step": 83 }, { "epoch": 0.042726347914547304, "grad_norm": 1.1086766719818115, "learning_rate": 4.2131979695431475e-06, "loss": 0.694, "mean_token_accuracy": 0.8029916882514954, "num_tokens": 13408622.0, "step": 84 }, { "epoch": 0.04323499491353001, "grad_norm": 1.2479902505874634, "learning_rate": 4.263959390862945e-06, "loss": 0.7087, "mean_token_accuracy": 0.7968636155128479, "num_tokens": 13562201.0, "step": 85 }, { "epoch": 0.04374364191251272, "grad_norm": 1.1710768938064575, "learning_rate": 4.3147208121827415e-06, "loss": 0.6947, "mean_token_accuracy": 0.801063597202301, "num_tokens": 13706016.0, "step": 86 }, { "epoch": 0.04425228891149542, "grad_norm": 1.109525203704834, "learning_rate": 4.365482233502538e-06, "loss": 0.6921, "mean_token_accuracy": 0.8021470308303833, "num_tokens": 13872736.0, "step": 87 }, { "epoch": 0.044760935910478125, "grad_norm": 1.1604565382003784, "learning_rate": 4.4162436548223355e-06, "loss": 0.6485, "mean_token_accuracy": 0.8127233386039734, "num_tokens": 14029427.0, "step": 88 }, { "epoch": 0.045269582909460836, "grad_norm": 1.1403427124023438, "learning_rate": 4.467005076142132e-06, "loss": 0.6809, "mean_token_accuracy": 0.802753746509552, "num_tokens": 14192123.0, "step": 89 }, { "epoch": 0.04577822990844354, "grad_norm": 1.080385684967041, "learning_rate": 4.5177664974619295e-06, "loss": 0.6581, "mean_token_accuracy": 0.8092286586761475, "num_tokens": 14356828.0, "step": 90 }, { "epoch": 0.04628687690742624, "grad_norm": 1.1096259355545044, "learning_rate": 4.568527918781726e-06, "loss": 0.6234, "mean_token_accuracy": 0.8174171447753906, "num_tokens": 14522417.0, "step": 91 }, { "epoch": 0.04679552390640895, "grad_norm": 1.1488056182861328, "learning_rate": 4.6192893401015235e-06, "loss": 0.7016, "mean_token_accuracy": 0.7984861135482788, "num_tokens": 14675485.0, "step": 92 }, { "epoch": 0.04730417090539166, "grad_norm": 1.1531343460083008, "learning_rate": 4.67005076142132e-06, "loss": 0.6636, "mean_token_accuracy": 0.8089845180511475, "num_tokens": 14831686.0, "step": 93 }, { "epoch": 0.04781281790437437, "grad_norm": 1.1957042217254639, "learning_rate": 4.7208121827411175e-06, "loss": 0.6937, "mean_token_accuracy": 0.8009766936302185, "num_tokens": 14983974.0, "step": 94 }, { "epoch": 0.04832146490335707, "grad_norm": 1.1750566959381104, "learning_rate": 4.771573604060914e-06, "loss": 0.7169, "mean_token_accuracy": 0.7926790118217468, "num_tokens": 15135323.0, "step": 95 }, { "epoch": 0.048830111902339775, "grad_norm": 1.2351971864700317, "learning_rate": 4.822335025380711e-06, "loss": 0.6883, "mean_token_accuracy": 0.8023048639297485, "num_tokens": 15289033.0, "step": 96 }, { "epoch": 0.049338758901322485, "grad_norm": 1.113542079925537, "learning_rate": 4.873096446700508e-06, "loss": 0.6929, "mean_token_accuracy": 0.7991148829460144, "num_tokens": 15439451.0, "step": 97 }, { "epoch": 0.04984740590030519, "grad_norm": 1.1323645114898682, "learning_rate": 4.923857868020305e-06, "loss": 0.6741, "mean_token_accuracy": 0.8045241832733154, "num_tokens": 15594811.0, "step": 98 }, { "epoch": 0.05035605289928789, "grad_norm": 1.2108473777770996, "learning_rate": 4.974619289340102e-06, "loss": 0.6838, "mean_token_accuracy": 0.8022953271865845, "num_tokens": 15758944.0, "step": 99 }, { "epoch": 0.0508646998982706, "grad_norm": 1.1598291397094727, "learning_rate": 5.025380710659899e-06, "loss": 0.6273, "mean_token_accuracy": 0.8167519569396973, "num_tokens": 15922340.0, "step": 100 }, { "epoch": 0.05137334689725331, "grad_norm": 1.1451945304870605, "learning_rate": 5.076142131979695e-06, "loss": 0.6472, "mean_token_accuracy": 0.8123450875282288, "num_tokens": 16074162.0, "step": 101 }, { "epoch": 0.05188199389623601, "grad_norm": 1.183913230895996, "learning_rate": 5.126903553299493e-06, "loss": 0.6701, "mean_token_accuracy": 0.8053520321846008, "num_tokens": 16238658.0, "step": 102 }, { "epoch": 0.05239064089521872, "grad_norm": 1.09932279586792, "learning_rate": 5.17766497461929e-06, "loss": 0.6449, "mean_token_accuracy": 0.8124725818634033, "num_tokens": 16396393.0, "step": 103 }, { "epoch": 0.052899287894201424, "grad_norm": 1.1981287002563477, "learning_rate": 5.228426395939087e-06, "loss": 0.6698, "mean_token_accuracy": 0.8068869113922119, "num_tokens": 16546360.0, "step": 104 }, { "epoch": 0.05340793489318413, "grad_norm": 1.1747201681137085, "learning_rate": 5.279187817258884e-06, "loss": 0.65, "mean_token_accuracy": 0.8108594417572021, "num_tokens": 16704979.0, "step": 105 }, { "epoch": 0.05391658189216684, "grad_norm": 1.0800034999847412, "learning_rate": 5.329949238578681e-06, "loss": 0.6449, "mean_token_accuracy": 0.8102988004684448, "num_tokens": 16860358.0, "step": 106 }, { "epoch": 0.05442522889114954, "grad_norm": 1.1816664934158325, "learning_rate": 5.380710659898477e-06, "loss": 0.6662, "mean_token_accuracy": 0.8073922991752625, "num_tokens": 17034067.0, "step": 107 }, { "epoch": 0.054933875890132246, "grad_norm": 1.2378530502319336, "learning_rate": 5.431472081218274e-06, "loss": 0.696, "mean_token_accuracy": 0.7979105710983276, "num_tokens": 17181452.0, "step": 108 }, { "epoch": 0.055442522889114956, "grad_norm": 1.102335810661316, "learning_rate": 5.482233502538071e-06, "loss": 0.6308, "mean_token_accuracy": 0.8160994052886963, "num_tokens": 17338963.0, "step": 109 }, { "epoch": 0.05595116988809766, "grad_norm": 1.1627287864685059, "learning_rate": 5.532994923857869e-06, "loss": 0.6639, "mean_token_accuracy": 0.8066244125366211, "num_tokens": 17493379.0, "step": 110 }, { "epoch": 0.056459816887080364, "grad_norm": 1.1666368246078491, "learning_rate": 5.583756345177665e-06, "loss": 0.6299, "mean_token_accuracy": 0.8159629106521606, "num_tokens": 17651741.0, "step": 111 }, { "epoch": 0.056968463886063074, "grad_norm": 1.1785426139831543, "learning_rate": 5.634517766497463e-06, "loss": 0.6499, "mean_token_accuracy": 0.8092576861381531, "num_tokens": 17810924.0, "step": 112 }, { "epoch": 0.05747711088504578, "grad_norm": 1.1432498693466187, "learning_rate": 5.685279187817259e-06, "loss": 0.6509, "mean_token_accuracy": 0.8116598129272461, "num_tokens": 17976758.0, "step": 113 }, { "epoch": 0.05798575788402848, "grad_norm": 1.162224292755127, "learning_rate": 5.736040609137057e-06, "loss": 0.6921, "mean_token_accuracy": 0.8001761436462402, "num_tokens": 18134320.0, "step": 114 }, { "epoch": 0.05849440488301119, "grad_norm": 1.2580808401107788, "learning_rate": 5.7868020304568525e-06, "loss": 0.682, "mean_token_accuracy": 0.8021042346954346, "num_tokens": 18276678.0, "step": 115 }, { "epoch": 0.059003051881993895, "grad_norm": 1.1780056953430176, "learning_rate": 5.83756345177665e-06, "loss": 0.6398, "mean_token_accuracy": 0.8138629198074341, "num_tokens": 18426725.0, "step": 116 }, { "epoch": 0.0595116988809766, "grad_norm": 1.1504524946212769, "learning_rate": 5.888324873096447e-06, "loss": 0.6748, "mean_token_accuracy": 0.8074554204940796, "num_tokens": 18587522.0, "step": 117 }, { "epoch": 0.06002034587995931, "grad_norm": 1.1362825632095337, "learning_rate": 5.939086294416244e-06, "loss": 0.6635, "mean_token_accuracy": 0.8064254522323608, "num_tokens": 18749574.0, "step": 118 }, { "epoch": 0.06052899287894201, "grad_norm": 1.161048412322998, "learning_rate": 5.989847715736041e-06, "loss": 0.6445, "mean_token_accuracy": 0.8129458427429199, "num_tokens": 18906800.0, "step": 119 }, { "epoch": 0.061037639877924724, "grad_norm": 1.2150564193725586, "learning_rate": 6.040609137055839e-06, "loss": 0.6979, "mean_token_accuracy": 0.7975142002105713, "num_tokens": 19068637.0, "step": 120 }, { "epoch": 0.06154628687690743, "grad_norm": 1.2829769849777222, "learning_rate": 6.091370558375635e-06, "loss": 0.657, "mean_token_accuracy": 0.8076584339141846, "num_tokens": 19219209.0, "step": 121 }, { "epoch": 0.06205493387589013, "grad_norm": 1.1912235021591187, "learning_rate": 6.142131979695432e-06, "loss": 0.6365, "mean_token_accuracy": 0.8118331432342529, "num_tokens": 19367232.0, "step": 122 }, { "epoch": 0.06256358087487283, "grad_norm": 1.1799649000167847, "learning_rate": 6.1928934010152285e-06, "loss": 0.6319, "mean_token_accuracy": 0.8160076141357422, "num_tokens": 19525992.0, "step": 123 }, { "epoch": 0.06307222787385554, "grad_norm": 1.133954405784607, "learning_rate": 6.243654822335026e-06, "loss": 0.6529, "mean_token_accuracy": 0.8099106550216675, "num_tokens": 19693698.0, "step": 124 }, { "epoch": 0.06358087487283826, "grad_norm": 1.2654860019683838, "learning_rate": 6.2944162436548225e-06, "loss": 0.6374, "mean_token_accuracy": 0.8136156797409058, "num_tokens": 19841203.0, "step": 125 }, { "epoch": 0.06408952187182096, "grad_norm": 1.2145419120788574, "learning_rate": 6.34517766497462e-06, "loss": 0.6777, "mean_token_accuracy": 0.8022431135177612, "num_tokens": 20001680.0, "step": 126 }, { "epoch": 0.06459816887080366, "grad_norm": 1.3376954793930054, "learning_rate": 6.395939086294417e-06, "loss": 0.6395, "mean_token_accuracy": 0.8126106262207031, "num_tokens": 20150787.0, "step": 127 }, { "epoch": 0.06510681586978637, "grad_norm": 1.1641050577163696, "learning_rate": 6.446700507614214e-06, "loss": 0.6497, "mean_token_accuracy": 0.8102061152458191, "num_tokens": 20326096.0, "step": 128 }, { "epoch": 0.06561546286876907, "grad_norm": 1.2625352144241333, "learning_rate": 6.4974619289340105e-06, "loss": 0.6885, "mean_token_accuracy": 0.8006555438041687, "num_tokens": 20490433.0, "step": 129 }, { "epoch": 0.06612410986775177, "grad_norm": 1.2983770370483398, "learning_rate": 6.548223350253807e-06, "loss": 0.6793, "mean_token_accuracy": 0.8020418882369995, "num_tokens": 20657596.0, "step": 130 }, { "epoch": 0.06663275686673449, "grad_norm": 1.1498662233352661, "learning_rate": 6.5989847715736045e-06, "loss": 0.6307, "mean_token_accuracy": 0.8145186305046082, "num_tokens": 20827354.0, "step": 131 }, { "epoch": 0.0671414038657172, "grad_norm": 1.192496657371521, "learning_rate": 6.649746192893401e-06, "loss": 0.6602, "mean_token_accuracy": 0.8084685802459717, "num_tokens": 20978319.0, "step": 132 }, { "epoch": 0.0676500508646999, "grad_norm": 1.2856016159057617, "learning_rate": 6.7005076142131985e-06, "loss": 0.637, "mean_token_accuracy": 0.814816951751709, "num_tokens": 21124807.0, "step": 133 }, { "epoch": 0.0681586978636826, "grad_norm": 1.1739333868026733, "learning_rate": 6.751269035532996e-06, "loss": 0.6241, "mean_token_accuracy": 0.8179885149002075, "num_tokens": 21283777.0, "step": 134 }, { "epoch": 0.0686673448626653, "grad_norm": 1.2382988929748535, "learning_rate": 6.8020304568527926e-06, "loss": 0.6391, "mean_token_accuracy": 0.8126323223114014, "num_tokens": 21436297.0, "step": 135 }, { "epoch": 0.06917599186164802, "grad_norm": 1.2716588973999023, "learning_rate": 6.852791878172589e-06, "loss": 0.6571, "mean_token_accuracy": 0.807772159576416, "num_tokens": 21590885.0, "step": 136 }, { "epoch": 0.06968463886063073, "grad_norm": 1.269378423690796, "learning_rate": 6.903553299492386e-06, "loss": 0.6855, "mean_token_accuracy": 0.7997962236404419, "num_tokens": 21746247.0, "step": 137 }, { "epoch": 0.07019328585961343, "grad_norm": 1.189234733581543, "learning_rate": 6.954314720812183e-06, "loss": 0.6343, "mean_token_accuracy": 0.814178466796875, "num_tokens": 21898426.0, "step": 138 }, { "epoch": 0.07070193285859613, "grad_norm": 1.3098485469818115, "learning_rate": 7.0050761421319806e-06, "loss": 0.7037, "mean_token_accuracy": 0.7971333265304565, "num_tokens": 22054155.0, "step": 139 }, { "epoch": 0.07121057985757884, "grad_norm": 1.3493456840515137, "learning_rate": 7.055837563451777e-06, "loss": 0.6692, "mean_token_accuracy": 0.8043407201766968, "num_tokens": 22215818.0, "step": 140 }, { "epoch": 0.07171922685656154, "grad_norm": 1.2597171068191528, "learning_rate": 7.106598984771575e-06, "loss": 0.6511, "mean_token_accuracy": 0.8078402280807495, "num_tokens": 22367981.0, "step": 141 }, { "epoch": 0.07222787385554426, "grad_norm": 1.209790587425232, "learning_rate": 7.157360406091371e-06, "loss": 0.6494, "mean_token_accuracy": 0.8108629584312439, "num_tokens": 22537203.0, "step": 142 }, { "epoch": 0.07273652085452696, "grad_norm": 1.2577440738677979, "learning_rate": 7.208121827411169e-06, "loss": 0.6483, "mean_token_accuracy": 0.8100079298019409, "num_tokens": 22696776.0, "step": 143 }, { "epoch": 0.07324516785350967, "grad_norm": 1.2360994815826416, "learning_rate": 7.258883248730964e-06, "loss": 0.6236, "mean_token_accuracy": 0.8167132139205933, "num_tokens": 22850630.0, "step": 144 }, { "epoch": 0.07375381485249237, "grad_norm": 1.2556370496749878, "learning_rate": 7.309644670050762e-06, "loss": 0.6782, "mean_token_accuracy": 0.8049169778823853, "num_tokens": 23002043.0, "step": 145 }, { "epoch": 0.07426246185147507, "grad_norm": 1.280638337135315, "learning_rate": 7.360406091370559e-06, "loss": 0.6085, "mean_token_accuracy": 0.8195856809616089, "num_tokens": 23159234.0, "step": 146 }, { "epoch": 0.07477110885045778, "grad_norm": 1.2262206077575684, "learning_rate": 7.411167512690356e-06, "loss": 0.6348, "mean_token_accuracy": 0.8120899200439453, "num_tokens": 23310355.0, "step": 147 }, { "epoch": 0.0752797558494405, "grad_norm": 1.2795320749282837, "learning_rate": 7.461928934010153e-06, "loss": 0.6253, "mean_token_accuracy": 0.8177874684333801, "num_tokens": 23473092.0, "step": 148 }, { "epoch": 0.0757884028484232, "grad_norm": 1.2047311067581177, "learning_rate": 7.51269035532995e-06, "loss": 0.6419, "mean_token_accuracy": 0.8110628724098206, "num_tokens": 23627766.0, "step": 149 }, { "epoch": 0.0762970498474059, "grad_norm": 1.2633532285690308, "learning_rate": 7.563451776649747e-06, "loss": 0.617, "mean_token_accuracy": 0.8181522488594055, "num_tokens": 23777307.0, "step": 150 }, { "epoch": 0.0768056968463886, "grad_norm": 1.2570198774337769, "learning_rate": 7.614213197969543e-06, "loss": 0.6534, "mean_token_accuracy": 0.8082944750785828, "num_tokens": 23923715.0, "step": 151 }, { "epoch": 0.07731434384537131, "grad_norm": 1.190126895904541, "learning_rate": 7.664974619289341e-06, "loss": 0.6394, "mean_token_accuracy": 0.8122843503952026, "num_tokens": 24087522.0, "step": 152 }, { "epoch": 0.07782299084435401, "grad_norm": 1.2950968742370605, "learning_rate": 7.715736040609138e-06, "loss": 0.6162, "mean_token_accuracy": 0.8173761367797852, "num_tokens": 24231067.0, "step": 153 }, { "epoch": 0.07833163784333673, "grad_norm": 1.2913380861282349, "learning_rate": 7.766497461928934e-06, "loss": 0.6601, "mean_token_accuracy": 0.8078927993774414, "num_tokens": 24394212.0, "step": 154 }, { "epoch": 0.07884028484231943, "grad_norm": 1.3249062299728394, "learning_rate": 7.817258883248731e-06, "loss": 0.628, "mean_token_accuracy": 0.8148497343063354, "num_tokens": 24539665.0, "step": 155 }, { "epoch": 0.07934893184130214, "grad_norm": 1.1700199842453003, "learning_rate": 7.86802030456853e-06, "loss": 0.6624, "mean_token_accuracy": 0.8045659065246582, "num_tokens": 24703099.0, "step": 156 }, { "epoch": 0.07985757884028484, "grad_norm": 1.2378876209259033, "learning_rate": 7.918781725888326e-06, "loss": 0.6401, "mean_token_accuracy": 0.8125580549240112, "num_tokens": 24851707.0, "step": 157 }, { "epoch": 0.08036622583926754, "grad_norm": 1.1588832139968872, "learning_rate": 7.969543147208122e-06, "loss": 0.6225, "mean_token_accuracy": 0.8164503574371338, "num_tokens": 25018302.0, "step": 158 }, { "epoch": 0.08087487283825025, "grad_norm": 1.1309963464736938, "learning_rate": 8.020304568527919e-06, "loss": 0.6277, "mean_token_accuracy": 0.8146978616714478, "num_tokens": 25177030.0, "step": 159 }, { "epoch": 0.08138351983723296, "grad_norm": 1.3874728679656982, "learning_rate": 8.071065989847716e-06, "loss": 0.6509, "mean_token_accuracy": 0.8102186918258667, "num_tokens": 25345695.0, "step": 160 }, { "epoch": 0.08189216683621567, "grad_norm": 1.1526621580123901, "learning_rate": 8.121827411167514e-06, "loss": 0.6026, "mean_token_accuracy": 0.8226553201675415, "num_tokens": 25513624.0, "step": 161 }, { "epoch": 0.08240081383519837, "grad_norm": 1.2950594425201416, "learning_rate": 8.17258883248731e-06, "loss": 0.644, "mean_token_accuracy": 0.8108351230621338, "num_tokens": 25674463.0, "step": 162 }, { "epoch": 0.08290946083418108, "grad_norm": 1.2725694179534912, "learning_rate": 8.223350253807107e-06, "loss": 0.6204, "mean_token_accuracy": 0.8160519003868103, "num_tokens": 25834424.0, "step": 163 }, { "epoch": 0.08341810783316378, "grad_norm": 1.3356080055236816, "learning_rate": 8.274111675126905e-06, "loss": 0.6618, "mean_token_accuracy": 0.8052741289138794, "num_tokens": 25991547.0, "step": 164 }, { "epoch": 0.0839267548321465, "grad_norm": 1.3054618835449219, "learning_rate": 8.324873096446702e-06, "loss": 0.6412, "mean_token_accuracy": 0.8113834857940674, "num_tokens": 26151034.0, "step": 165 }, { "epoch": 0.0844354018311292, "grad_norm": 1.2440849542617798, "learning_rate": 8.375634517766498e-06, "loss": 0.6125, "mean_token_accuracy": 0.8189442753791809, "num_tokens": 26310637.0, "step": 166 }, { "epoch": 0.0849440488301119, "grad_norm": 1.4464467763900757, "learning_rate": 8.426395939086295e-06, "loss": 0.6112, "mean_token_accuracy": 0.8193594813346863, "num_tokens": 26465678.0, "step": 167 }, { "epoch": 0.08545269582909461, "grad_norm": 1.2905434370040894, "learning_rate": 8.477157360406092e-06, "loss": 0.6422, "mean_token_accuracy": 0.8109447360038757, "num_tokens": 26632464.0, "step": 168 }, { "epoch": 0.08596134282807731, "grad_norm": 1.292102575302124, "learning_rate": 8.52791878172589e-06, "loss": 0.5979, "mean_token_accuracy": 0.8221216201782227, "num_tokens": 26795346.0, "step": 169 }, { "epoch": 0.08646998982706001, "grad_norm": 1.452039361000061, "learning_rate": 8.578680203045686e-06, "loss": 0.6576, "mean_token_accuracy": 0.8046762943267822, "num_tokens": 26951965.0, "step": 170 }, { "epoch": 0.08697863682604273, "grad_norm": 1.2484265565872192, "learning_rate": 8.629441624365483e-06, "loss": 0.6032, "mean_token_accuracy": 0.8209149837493896, "num_tokens": 27113922.0, "step": 171 }, { "epoch": 0.08748728382502544, "grad_norm": 1.2269099950790405, "learning_rate": 8.68020304568528e-06, "loss": 0.6135, "mean_token_accuracy": 0.8181469440460205, "num_tokens": 27279172.0, "step": 172 }, { "epoch": 0.08799593082400814, "grad_norm": 1.2918200492858887, "learning_rate": 8.730964467005076e-06, "loss": 0.6379, "mean_token_accuracy": 0.8122804164886475, "num_tokens": 27449975.0, "step": 173 }, { "epoch": 0.08850457782299084, "grad_norm": 1.2979905605316162, "learning_rate": 8.781725888324873e-06, "loss": 0.647, "mean_token_accuracy": 0.8093007802963257, "num_tokens": 27595872.0, "step": 174 }, { "epoch": 0.08901322482197355, "grad_norm": 1.252805471420288, "learning_rate": 8.832487309644671e-06, "loss": 0.6201, "mean_token_accuracy": 0.8170379400253296, "num_tokens": 27760835.0, "step": 175 }, { "epoch": 0.08952187182095625, "grad_norm": 1.3609118461608887, "learning_rate": 8.883248730964468e-06, "loss": 0.6172, "mean_token_accuracy": 0.817752480506897, "num_tokens": 27923943.0, "step": 176 }, { "epoch": 0.09003051881993897, "grad_norm": 1.2665863037109375, "learning_rate": 8.934010152284264e-06, "loss": 0.6405, "mean_token_accuracy": 0.8115326166152954, "num_tokens": 28085891.0, "step": 177 }, { "epoch": 0.09053916581892167, "grad_norm": 1.2021713256835938, "learning_rate": 8.984771573604062e-06, "loss": 0.5881, "mean_token_accuracy": 0.8245677351951599, "num_tokens": 28243518.0, "step": 178 }, { "epoch": 0.09104781281790437, "grad_norm": 1.40549898147583, "learning_rate": 9.035532994923859e-06, "loss": 0.616, "mean_token_accuracy": 0.8165683150291443, "num_tokens": 28405483.0, "step": 179 }, { "epoch": 0.09155645981688708, "grad_norm": 1.2603745460510254, "learning_rate": 9.086294416243656e-06, "loss": 0.6431, "mean_token_accuracy": 0.8083624839782715, "num_tokens": 28558441.0, "step": 180 }, { "epoch": 0.09206510681586978, "grad_norm": 1.2929681539535522, "learning_rate": 9.137055837563452e-06, "loss": 0.5976, "mean_token_accuracy": 0.8222934007644653, "num_tokens": 28707646.0, "step": 181 }, { "epoch": 0.09257375381485249, "grad_norm": 1.1667423248291016, "learning_rate": 9.187817258883249e-06, "loss": 0.6265, "mean_token_accuracy": 0.8158867359161377, "num_tokens": 28877502.0, "step": 182 }, { "epoch": 0.0930824008138352, "grad_norm": 1.3487673997879028, "learning_rate": 9.238578680203047e-06, "loss": 0.6694, "mean_token_accuracy": 0.8048607110977173, "num_tokens": 29040815.0, "step": 183 }, { "epoch": 0.0935910478128179, "grad_norm": 1.283803105354309, "learning_rate": 9.289340101522844e-06, "loss": 0.6047, "mean_token_accuracy": 0.8185036778450012, "num_tokens": 29189342.0, "step": 184 }, { "epoch": 0.09409969481180061, "grad_norm": 1.2010540962219238, "learning_rate": 9.34010152284264e-06, "loss": 0.6276, "mean_token_accuracy": 0.8162474632263184, "num_tokens": 29361787.0, "step": 185 }, { "epoch": 0.09460834181078331, "grad_norm": 1.2891279458999634, "learning_rate": 9.390862944162438e-06, "loss": 0.552, "mean_token_accuracy": 0.834886908531189, "num_tokens": 29523929.0, "step": 186 }, { "epoch": 0.09511698880976602, "grad_norm": 1.2837107181549072, "learning_rate": 9.441624365482235e-06, "loss": 0.6484, "mean_token_accuracy": 0.810014009475708, "num_tokens": 29682593.0, "step": 187 }, { "epoch": 0.09562563580874874, "grad_norm": 1.3777011632919312, "learning_rate": 9.492385786802032e-06, "loss": 0.5963, "mean_token_accuracy": 0.8240934014320374, "num_tokens": 29839653.0, "step": 188 }, { "epoch": 0.09613428280773144, "grad_norm": 1.269422173500061, "learning_rate": 9.543147208121828e-06, "loss": 0.6367, "mean_token_accuracy": 0.8108811974525452, "num_tokens": 30001594.0, "step": 189 }, { "epoch": 0.09664292980671414, "grad_norm": 1.2095832824707031, "learning_rate": 9.593908629441625e-06, "loss": 0.625, "mean_token_accuracy": 0.815091073513031, "num_tokens": 30163501.0, "step": 190 }, { "epoch": 0.09715157680569685, "grad_norm": 1.3392397165298462, "learning_rate": 9.644670050761421e-06, "loss": 0.609, "mean_token_accuracy": 0.8181994557380676, "num_tokens": 30324797.0, "step": 191 }, { "epoch": 0.09766022380467955, "grad_norm": 1.3514515161514282, "learning_rate": 9.69543147208122e-06, "loss": 0.6237, "mean_token_accuracy": 0.8160215020179749, "num_tokens": 30492907.0, "step": 192 }, { "epoch": 0.09816887080366225, "grad_norm": 1.2122633457183838, "learning_rate": 9.746192893401016e-06, "loss": 0.5802, "mean_token_accuracy": 0.8255295753479004, "num_tokens": 30667482.0, "step": 193 }, { "epoch": 0.09867751780264497, "grad_norm": 1.3341679573059082, "learning_rate": 9.796954314720813e-06, "loss": 0.6094, "mean_token_accuracy": 0.8188539743423462, "num_tokens": 30830951.0, "step": 194 }, { "epoch": 0.09918616480162767, "grad_norm": 1.4670215845108032, "learning_rate": 9.84771573604061e-06, "loss": 0.6097, "mean_token_accuracy": 0.8192304968833923, "num_tokens": 30992078.0, "step": 195 }, { "epoch": 0.09969481180061038, "grad_norm": 1.3874598741531372, "learning_rate": 9.898477157360406e-06, "loss": 0.6024, "mean_token_accuracy": 0.8210733532905579, "num_tokens": 31153089.0, "step": 196 }, { "epoch": 0.10020345879959308, "grad_norm": 1.337073564529419, "learning_rate": 9.949238578680204e-06, "loss": 0.606, "mean_token_accuracy": 0.8213313817977905, "num_tokens": 31297334.0, "step": 197 }, { "epoch": 0.10071210579857579, "grad_norm": 1.4408767223358154, "learning_rate": 1e-05, "loss": 0.6499, "mean_token_accuracy": 0.809918224811554, "num_tokens": 31462790.0, "step": 198 }, { "epoch": 0.10122075279755849, "grad_norm": 1.4449899196624756, "learning_rate": 1e-05, "loss": 0.6345, "mean_token_accuracy": 0.8106706142425537, "num_tokens": 31614333.0, "step": 199 }, { "epoch": 0.1017293997965412, "grad_norm": 1.3223861455917358, "learning_rate": 1e-05, "loss": 0.6201, "mean_token_accuracy": 0.817018985748291, "num_tokens": 31757983.0, "step": 200 }, { "epoch": 0.10223804679552391, "grad_norm": 1.4164437055587769, "learning_rate": 1e-05, "loss": 0.6305, "mean_token_accuracy": 0.8113193511962891, "num_tokens": 31902198.0, "step": 201 }, { "epoch": 0.10274669379450661, "grad_norm": 1.3873207569122314, "learning_rate": 1e-05, "loss": 0.6092, "mean_token_accuracy": 0.8206071257591248, "num_tokens": 32072015.0, "step": 202 }, { "epoch": 0.10325534079348932, "grad_norm": 1.3321030139923096, "learning_rate": 1e-05, "loss": 0.6263, "mean_token_accuracy": 0.8133052587509155, "num_tokens": 32231536.0, "step": 203 }, { "epoch": 0.10376398779247202, "grad_norm": 1.3445665836334229, "learning_rate": 1e-05, "loss": 0.5779, "mean_token_accuracy": 0.8258579969406128, "num_tokens": 32401421.0, "step": 204 }, { "epoch": 0.10427263479145472, "grad_norm": 7.624189853668213, "learning_rate": 1e-05, "loss": 0.6111, "mean_token_accuracy": 0.8195592164993286, "num_tokens": 32559615.0, "step": 205 }, { "epoch": 0.10478128179043744, "grad_norm": 1.406477689743042, "learning_rate": 1e-05, "loss": 0.6131, "mean_token_accuracy": 0.8170050382614136, "num_tokens": 32710194.0, "step": 206 }, { "epoch": 0.10528992878942015, "grad_norm": 1.3041821718215942, "learning_rate": 1e-05, "loss": 0.6172, "mean_token_accuracy": 0.8164232969284058, "num_tokens": 32878068.0, "step": 207 }, { "epoch": 0.10579857578840285, "grad_norm": 1.2968215942382812, "learning_rate": 1e-05, "loss": 0.6454, "mean_token_accuracy": 0.8079843521118164, "num_tokens": 33048273.0, "step": 208 }, { "epoch": 0.10630722278738555, "grad_norm": 1.3858803510665894, "learning_rate": 1e-05, "loss": 0.6165, "mean_token_accuracy": 0.8175449967384338, "num_tokens": 33209096.0, "step": 209 }, { "epoch": 0.10681586978636826, "grad_norm": 1.4570480585098267, "learning_rate": 1e-05, "loss": 0.5919, "mean_token_accuracy": 0.8247489929199219, "num_tokens": 33375803.0, "step": 210 }, { "epoch": 0.10732451678535096, "grad_norm": 1.386375904083252, "learning_rate": 1e-05, "loss": 0.6323, "mean_token_accuracy": 0.8123111128807068, "num_tokens": 33519566.0, "step": 211 }, { "epoch": 0.10783316378433368, "grad_norm": 1.2808319330215454, "learning_rate": 1e-05, "loss": 0.6419, "mean_token_accuracy": 0.809684157371521, "num_tokens": 33675958.0, "step": 212 }, { "epoch": 0.10834181078331638, "grad_norm": 1.3665415048599243, "learning_rate": 1e-05, "loss": 0.6165, "mean_token_accuracy": 0.8161715865135193, "num_tokens": 33848664.0, "step": 213 }, { "epoch": 0.10885045778229908, "grad_norm": 1.195825219154358, "learning_rate": 1e-05, "loss": 0.6179, "mean_token_accuracy": 0.8146218061447144, "num_tokens": 34013911.0, "step": 214 }, { "epoch": 0.10935910478128179, "grad_norm": 1.2427469491958618, "learning_rate": 1e-05, "loss": 0.5986, "mean_token_accuracy": 0.8210475444793701, "num_tokens": 34175183.0, "step": 215 }, { "epoch": 0.10986775178026449, "grad_norm": 1.2118704319000244, "learning_rate": 1e-05, "loss": 0.5914, "mean_token_accuracy": 0.8235177397727966, "num_tokens": 34337717.0, "step": 216 }, { "epoch": 0.11037639877924721, "grad_norm": 1.332649827003479, "learning_rate": 1e-05, "loss": 0.6262, "mean_token_accuracy": 0.8143296241760254, "num_tokens": 34491024.0, "step": 217 }, { "epoch": 0.11088504577822991, "grad_norm": 1.221186876296997, "learning_rate": 1e-05, "loss": 0.5921, "mean_token_accuracy": 0.8230754137039185, "num_tokens": 34668424.0, "step": 218 }, { "epoch": 0.11139369277721262, "grad_norm": 1.2910295724868774, "learning_rate": 1e-05, "loss": 0.6295, "mean_token_accuracy": 0.8133854269981384, "num_tokens": 34823916.0, "step": 219 }, { "epoch": 0.11190233977619532, "grad_norm": 1.4481086730957031, "learning_rate": 1e-05, "loss": 0.6034, "mean_token_accuracy": 0.8207244873046875, "num_tokens": 34986133.0, "step": 220 }, { "epoch": 0.11241098677517802, "grad_norm": 1.1247764825820923, "learning_rate": 1e-05, "loss": 0.581, "mean_token_accuracy": 0.8286846876144409, "num_tokens": 35146182.0, "step": 221 }, { "epoch": 0.11291963377416073, "grad_norm": 1.25968337059021, "learning_rate": 1e-05, "loss": 0.6255, "mean_token_accuracy": 0.8136856555938721, "num_tokens": 35301888.0, "step": 222 }, { "epoch": 0.11342828077314344, "grad_norm": 1.1560875177383423, "learning_rate": 1e-05, "loss": 0.5882, "mean_token_accuracy": 0.8247735500335693, "num_tokens": 35461669.0, "step": 223 }, { "epoch": 0.11393692777212615, "grad_norm": 1.282809853553772, "learning_rate": 1e-05, "loss": 0.6039, "mean_token_accuracy": 0.8183197975158691, "num_tokens": 35621625.0, "step": 224 }, { "epoch": 0.11444557477110885, "grad_norm": 1.4031862020492554, "learning_rate": 1e-05, "loss": 0.5908, "mean_token_accuracy": 0.8242624998092651, "num_tokens": 35773479.0, "step": 225 }, { "epoch": 0.11495422177009156, "grad_norm": 1.1410008668899536, "learning_rate": 1e-05, "loss": 0.5865, "mean_token_accuracy": 0.8251166939735413, "num_tokens": 35936577.0, "step": 226 }, { "epoch": 0.11546286876907426, "grad_norm": 1.26992928981781, "learning_rate": 1e-05, "loss": 0.6301, "mean_token_accuracy": 0.8128260374069214, "num_tokens": 36094734.0, "step": 227 }, { "epoch": 0.11597151576805696, "grad_norm": 1.1656041145324707, "learning_rate": 1e-05, "loss": 0.5859, "mean_token_accuracy": 0.8240753412246704, "num_tokens": 36258492.0, "step": 228 }, { "epoch": 0.11648016276703968, "grad_norm": 1.1628189086914062, "learning_rate": 1e-05, "loss": 0.6029, "mean_token_accuracy": 0.8214801549911499, "num_tokens": 36423218.0, "step": 229 }, { "epoch": 0.11698880976602238, "grad_norm": 1.28502357006073, "learning_rate": 1e-05, "loss": 0.5744, "mean_token_accuracy": 0.8258035182952881, "num_tokens": 36569958.0, "step": 230 }, { "epoch": 0.11749745676500509, "grad_norm": 1.224998116493225, "learning_rate": 1e-05, "loss": 0.6284, "mean_token_accuracy": 0.813615620136261, "num_tokens": 36735839.0, "step": 231 }, { "epoch": 0.11800610376398779, "grad_norm": 1.228937029838562, "learning_rate": 1e-05, "loss": 0.6106, "mean_token_accuracy": 0.8186049461364746, "num_tokens": 36886881.0, "step": 232 }, { "epoch": 0.1185147507629705, "grad_norm": 1.2199290990829468, "learning_rate": 1e-05, "loss": 0.606, "mean_token_accuracy": 0.8183395862579346, "num_tokens": 37055602.0, "step": 233 }, { "epoch": 0.1190233977619532, "grad_norm": 1.2404224872589111, "learning_rate": 1e-05, "loss": 0.5845, "mean_token_accuracy": 0.8273381590843201, "num_tokens": 37215889.0, "step": 234 }, { "epoch": 0.11953204476093592, "grad_norm": 1.2935690879821777, "learning_rate": 1e-05, "loss": 0.5879, "mean_token_accuracy": 0.8240891695022583, "num_tokens": 37368358.0, "step": 235 }, { "epoch": 0.12004069175991862, "grad_norm": 1.1917532682418823, "learning_rate": 1e-05, "loss": 0.599, "mean_token_accuracy": 0.8223183751106262, "num_tokens": 37528382.0, "step": 236 }, { "epoch": 0.12054933875890132, "grad_norm": 1.3194689750671387, "learning_rate": 1e-05, "loss": 0.6134, "mean_token_accuracy": 0.8174057006835938, "num_tokens": 37685103.0, "step": 237 }, { "epoch": 0.12105798575788403, "grad_norm": 1.0998708009719849, "learning_rate": 1e-05, "loss": 0.6249, "mean_token_accuracy": 0.8146561980247498, "num_tokens": 37848901.0, "step": 238 }, { "epoch": 0.12156663275686673, "grad_norm": 1.173926591873169, "learning_rate": 1e-05, "loss": 0.62, "mean_token_accuracy": 0.8145238161087036, "num_tokens": 38014704.0, "step": 239 }, { "epoch": 0.12207527975584945, "grad_norm": 1.19385826587677, "learning_rate": 1e-05, "loss": 0.6203, "mean_token_accuracy": 0.8140416145324707, "num_tokens": 38166048.0, "step": 240 }, { "epoch": 0.12258392675483215, "grad_norm": 1.2071212530136108, "learning_rate": 1e-05, "loss": 0.5703, "mean_token_accuracy": 0.829688310623169, "num_tokens": 38326365.0, "step": 241 }, { "epoch": 0.12309257375381485, "grad_norm": 1.1595906019210815, "learning_rate": 1e-05, "loss": 0.5918, "mean_token_accuracy": 0.8218910694122314, "num_tokens": 38487350.0, "step": 242 }, { "epoch": 0.12360122075279756, "grad_norm": 1.1498054265975952, "learning_rate": 1e-05, "loss": 0.6357, "mean_token_accuracy": 0.8128587007522583, "num_tokens": 38653367.0, "step": 243 }, { "epoch": 0.12410986775178026, "grad_norm": 1.1582930088043213, "learning_rate": 1e-05, "loss": 0.6156, "mean_token_accuracy": 0.8154253959655762, "num_tokens": 38803903.0, "step": 244 }, { "epoch": 0.12461851475076297, "grad_norm": 1.2007235288619995, "learning_rate": 1e-05, "loss": 0.5973, "mean_token_accuracy": 0.821514904499054, "num_tokens": 38956439.0, "step": 245 }, { "epoch": 0.12512716174974567, "grad_norm": 1.2579951286315918, "learning_rate": 1e-05, "loss": 0.5807, "mean_token_accuracy": 0.8249865174293518, "num_tokens": 39109262.0, "step": 246 }, { "epoch": 0.12563580874872837, "grad_norm": 1.0463939905166626, "learning_rate": 1e-05, "loss": 0.5836, "mean_token_accuracy": 0.8253095149993896, "num_tokens": 39271741.0, "step": 247 }, { "epoch": 0.12614445574771108, "grad_norm": 1.2439155578613281, "learning_rate": 1e-05, "loss": 0.6119, "mean_token_accuracy": 0.8164927363395691, "num_tokens": 39438988.0, "step": 248 }, { "epoch": 0.1266531027466938, "grad_norm": 1.1544227600097656, "learning_rate": 1e-05, "loss": 0.6216, "mean_token_accuracy": 0.8164765238761902, "num_tokens": 39606007.0, "step": 249 }, { "epoch": 0.1271617497456765, "grad_norm": 1.2033929824829102, "learning_rate": 1e-05, "loss": 0.6558, "mean_token_accuracy": 0.8053734302520752, "num_tokens": 39780179.0, "step": 250 }, { "epoch": 0.12767039674465921, "grad_norm": 1.1998658180236816, "learning_rate": 1e-05, "loss": 0.6138, "mean_token_accuracy": 0.8166667819023132, "num_tokens": 39939761.0, "step": 251 }, { "epoch": 0.12817904374364192, "grad_norm": 1.2700233459472656, "learning_rate": 1e-05, "loss": 0.6356, "mean_token_accuracy": 0.8122010231018066, "num_tokens": 40103824.0, "step": 252 }, { "epoch": 0.12868769074262462, "grad_norm": 1.2417421340942383, "learning_rate": 1e-05, "loss": 0.6141, "mean_token_accuracy": 0.8166791796684265, "num_tokens": 40275322.0, "step": 253 }, { "epoch": 0.12919633774160733, "grad_norm": 1.2107737064361572, "learning_rate": 1e-05, "loss": 0.5892, "mean_token_accuracy": 0.8222872018814087, "num_tokens": 40433373.0, "step": 254 }, { "epoch": 0.12970498474059003, "grad_norm": 1.2591029405593872, "learning_rate": 1e-05, "loss": 0.6189, "mean_token_accuracy": 0.8156533241271973, "num_tokens": 40580246.0, "step": 255 }, { "epoch": 0.13021363173957273, "grad_norm": 1.3103513717651367, "learning_rate": 1e-05, "loss": 0.5903, "mean_token_accuracy": 0.823656439781189, "num_tokens": 40743349.0, "step": 256 }, { "epoch": 0.13072227873855544, "grad_norm": 1.1803346872329712, "learning_rate": 1e-05, "loss": 0.5796, "mean_token_accuracy": 0.8239772319793701, "num_tokens": 40909931.0, "step": 257 }, { "epoch": 0.13123092573753814, "grad_norm": 1.326135516166687, "learning_rate": 1e-05, "loss": 0.5905, "mean_token_accuracy": 0.8228780031204224, "num_tokens": 41064135.0, "step": 258 }, { "epoch": 0.13173957273652084, "grad_norm": 1.2487269639968872, "learning_rate": 1e-05, "loss": 0.5809, "mean_token_accuracy": 0.8256618976593018, "num_tokens": 41215121.0, "step": 259 }, { "epoch": 0.13224821973550355, "grad_norm": 1.339449167251587, "learning_rate": 1e-05, "loss": 0.5944, "mean_token_accuracy": 0.8231635093688965, "num_tokens": 41373300.0, "step": 260 }, { "epoch": 0.13275686673448628, "grad_norm": 1.3216067552566528, "learning_rate": 1e-05, "loss": 0.5995, "mean_token_accuracy": 0.820053219795227, "num_tokens": 41538822.0, "step": 261 }, { "epoch": 0.13326551373346898, "grad_norm": 1.2868990898132324, "learning_rate": 1e-05, "loss": 0.642, "mean_token_accuracy": 0.8083620667457581, "num_tokens": 41699217.0, "step": 262 }, { "epoch": 0.13377416073245169, "grad_norm": 1.265202522277832, "learning_rate": 1e-05, "loss": 0.5991, "mean_token_accuracy": 0.8189837336540222, "num_tokens": 41870291.0, "step": 263 }, { "epoch": 0.1342828077314344, "grad_norm": 1.1498230695724487, "learning_rate": 1e-05, "loss": 0.6326, "mean_token_accuracy": 0.8124827146530151, "num_tokens": 42029303.0, "step": 264 }, { "epoch": 0.1347914547304171, "grad_norm": 1.3529750108718872, "learning_rate": 1e-05, "loss": 0.6208, "mean_token_accuracy": 0.8141924738883972, "num_tokens": 42197688.0, "step": 265 }, { "epoch": 0.1353001017293998, "grad_norm": 1.2339895963668823, "learning_rate": 1e-05, "loss": 0.6022, "mean_token_accuracy": 0.8191970586776733, "num_tokens": 42349174.0, "step": 266 }, { "epoch": 0.1358087487283825, "grad_norm": 1.3051806688308716, "learning_rate": 1e-05, "loss": 0.5799, "mean_token_accuracy": 0.8249984979629517, "num_tokens": 42506878.0, "step": 267 }, { "epoch": 0.1363173957273652, "grad_norm": 1.1635160446166992, "learning_rate": 1e-05, "loss": 0.6192, "mean_token_accuracy": 0.8148203492164612, "num_tokens": 42676569.0, "step": 268 }, { "epoch": 0.1368260427263479, "grad_norm": 1.296911358833313, "learning_rate": 1e-05, "loss": 0.6124, "mean_token_accuracy": 0.8165633678436279, "num_tokens": 42831149.0, "step": 269 }, { "epoch": 0.1373346897253306, "grad_norm": 1.1763876676559448, "learning_rate": 1e-05, "loss": 0.6096, "mean_token_accuracy": 0.8170506358146667, "num_tokens": 42992073.0, "step": 270 }, { "epoch": 0.13784333672431331, "grad_norm": 1.1894811391830444, "learning_rate": 1e-05, "loss": 0.6216, "mean_token_accuracy": 0.8166886568069458, "num_tokens": 43157608.0, "step": 271 }, { "epoch": 0.13835198372329605, "grad_norm": 1.3227925300598145, "learning_rate": 1e-05, "loss": 0.577, "mean_token_accuracy": 0.8258155584335327, "num_tokens": 43319905.0, "step": 272 }, { "epoch": 0.13886063072227875, "grad_norm": 1.2134519815444946, "learning_rate": 1e-05, "loss": 0.6191, "mean_token_accuracy": 0.8159205317497253, "num_tokens": 43483878.0, "step": 273 }, { "epoch": 0.13936927772126145, "grad_norm": 1.390149712562561, "learning_rate": 1e-05, "loss": 0.5757, "mean_token_accuracy": 0.826744556427002, "num_tokens": 43642021.0, "step": 274 }, { "epoch": 0.13987792472024416, "grad_norm": 1.249283790588379, "learning_rate": 1e-05, "loss": 0.5884, "mean_token_accuracy": 0.8216973543167114, "num_tokens": 43788563.0, "step": 275 }, { "epoch": 0.14038657171922686, "grad_norm": 1.194814920425415, "learning_rate": 1e-05, "loss": 0.6187, "mean_token_accuracy": 0.8158164024353027, "num_tokens": 43948725.0, "step": 276 }, { "epoch": 0.14089521871820956, "grad_norm": 1.199250340461731, "learning_rate": 1e-05, "loss": 0.5797, "mean_token_accuracy": 0.8249163031578064, "num_tokens": 44107222.0, "step": 277 }, { "epoch": 0.14140386571719227, "grad_norm": 1.2112387418746948, "learning_rate": 1e-05, "loss": 0.6018, "mean_token_accuracy": 0.8194043636322021, "num_tokens": 44265793.0, "step": 278 }, { "epoch": 0.14191251271617497, "grad_norm": 1.2595670223236084, "learning_rate": 1e-05, "loss": 0.6115, "mean_token_accuracy": 0.8172046542167664, "num_tokens": 44416663.0, "step": 279 }, { "epoch": 0.14242115971515767, "grad_norm": 1.3530234098434448, "learning_rate": 1e-05, "loss": 0.6033, "mean_token_accuracy": 0.8205140829086304, "num_tokens": 44585677.0, "step": 280 }, { "epoch": 0.14292980671414038, "grad_norm": 1.3794883489608765, "learning_rate": 1e-05, "loss": 0.6051, "mean_token_accuracy": 0.8198160529136658, "num_tokens": 44733031.0, "step": 281 }, { "epoch": 0.14343845371312308, "grad_norm": 1.2748477458953857, "learning_rate": 1e-05, "loss": 0.6258, "mean_token_accuracy": 0.8135629892349243, "num_tokens": 44896422.0, "step": 282 }, { "epoch": 0.14394710071210579, "grad_norm": 1.3169347047805786, "learning_rate": 1e-05, "loss": 0.5901, "mean_token_accuracy": 0.8225511312484741, "num_tokens": 45063428.0, "step": 283 }, { "epoch": 0.14445574771108852, "grad_norm": 1.1416714191436768, "learning_rate": 1e-05, "loss": 0.5782, "mean_token_accuracy": 0.8251551985740662, "num_tokens": 45222963.0, "step": 284 }, { "epoch": 0.14496439471007122, "grad_norm": 1.3052245378494263, "learning_rate": 1e-05, "loss": 0.5924, "mean_token_accuracy": 0.822461724281311, "num_tokens": 45369420.0, "step": 285 }, { "epoch": 0.14547304170905392, "grad_norm": 1.2727510929107666, "learning_rate": 1e-05, "loss": 0.5644, "mean_token_accuracy": 0.8323483467102051, "num_tokens": 45525565.0, "step": 286 }, { "epoch": 0.14598168870803663, "grad_norm": 1.238297700881958, "learning_rate": 1e-05, "loss": 0.6061, "mean_token_accuracy": 0.8184632062911987, "num_tokens": 45688704.0, "step": 287 }, { "epoch": 0.14649033570701933, "grad_norm": 1.3253711462020874, "learning_rate": 1e-05, "loss": 0.6018, "mean_token_accuracy": 0.8195759057998657, "num_tokens": 45852191.0, "step": 288 }, { "epoch": 0.14699898270600203, "grad_norm": 1.255407452583313, "learning_rate": 1e-05, "loss": 0.617, "mean_token_accuracy": 0.8195772171020508, "num_tokens": 46009944.0, "step": 289 }, { "epoch": 0.14750762970498474, "grad_norm": 1.3443149328231812, "learning_rate": 1e-05, "loss": 0.6452, "mean_token_accuracy": 0.8103073835372925, "num_tokens": 46163464.0, "step": 290 }, { "epoch": 0.14801627670396744, "grad_norm": 1.3617409467697144, "learning_rate": 1e-05, "loss": 0.6009, "mean_token_accuracy": 0.8228298425674438, "num_tokens": 46315469.0, "step": 291 }, { "epoch": 0.14852492370295015, "grad_norm": 1.320160150527954, "learning_rate": 1e-05, "loss": 0.5897, "mean_token_accuracy": 0.8236374855041504, "num_tokens": 46475098.0, "step": 292 }, { "epoch": 0.14903357070193285, "grad_norm": 1.1730265617370605, "learning_rate": 1e-05, "loss": 0.6014, "mean_token_accuracy": 0.8199717998504639, "num_tokens": 46645510.0, "step": 293 }, { "epoch": 0.14954221770091555, "grad_norm": 1.2772011756896973, "learning_rate": 1e-05, "loss": 0.5856, "mean_token_accuracy": 0.8237594366073608, "num_tokens": 46799760.0, "step": 294 }, { "epoch": 0.15005086469989828, "grad_norm": 1.3035606145858765, "learning_rate": 1e-05, "loss": 0.5901, "mean_token_accuracy": 0.82267165184021, "num_tokens": 46965987.0, "step": 295 }, { "epoch": 0.150559511698881, "grad_norm": 1.2326747179031372, "learning_rate": 1e-05, "loss": 0.5695, "mean_token_accuracy": 0.8280316591262817, "num_tokens": 47128771.0, "step": 296 }, { "epoch": 0.1510681586978637, "grad_norm": 1.2811074256896973, "learning_rate": 1e-05, "loss": 0.6225, "mean_token_accuracy": 0.8160161972045898, "num_tokens": 47288316.0, "step": 297 }, { "epoch": 0.1515768056968464, "grad_norm": 1.2438085079193115, "learning_rate": 1e-05, "loss": 0.5871, "mean_token_accuracy": 0.821655809879303, "num_tokens": 47455167.0, "step": 298 }, { "epoch": 0.1520854526958291, "grad_norm": 1.30879807472229, "learning_rate": 1e-05, "loss": 0.6081, "mean_token_accuracy": 0.8191161155700684, "num_tokens": 47600859.0, "step": 299 }, { "epoch": 0.1525940996948118, "grad_norm": 1.1298332214355469, "learning_rate": 1e-05, "loss": 0.5841, "mean_token_accuracy": 0.8238013386726379, "num_tokens": 47768491.0, "step": 300 }, { "epoch": 0.1531027466937945, "grad_norm": 1.302790880203247, "learning_rate": 1e-05, "loss": 0.5917, "mean_token_accuracy": 0.8203186392784119, "num_tokens": 47918622.0, "step": 301 }, { "epoch": 0.1536113936927772, "grad_norm": 1.094289779663086, "learning_rate": 1e-05, "loss": 0.5417, "mean_token_accuracy": 0.8354231715202332, "num_tokens": 48076479.0, "step": 302 }, { "epoch": 0.1541200406917599, "grad_norm": 1.191530466079712, "learning_rate": 1e-05, "loss": 0.5879, "mean_token_accuracy": 0.8236433267593384, "num_tokens": 48226249.0, "step": 303 }, { "epoch": 0.15462868769074262, "grad_norm": 1.3071236610412598, "learning_rate": 1e-05, "loss": 0.5461, "mean_token_accuracy": 0.8359087109565735, "num_tokens": 48390048.0, "step": 304 }, { "epoch": 0.15513733468972532, "grad_norm": 1.0845615863800049, "learning_rate": 1e-05, "loss": 0.5688, "mean_token_accuracy": 0.8274515867233276, "num_tokens": 48558970.0, "step": 305 }, { "epoch": 0.15564598168870802, "grad_norm": 1.2192639112472534, "learning_rate": 1e-05, "loss": 0.577, "mean_token_accuracy": 0.8271118402481079, "num_tokens": 48712282.0, "step": 306 }, { "epoch": 0.15615462868769076, "grad_norm": 1.2620494365692139, "learning_rate": 1e-05, "loss": 0.5981, "mean_token_accuracy": 0.8221869468688965, "num_tokens": 48874280.0, "step": 307 }, { "epoch": 0.15666327568667346, "grad_norm": 1.1076061725616455, "learning_rate": 1e-05, "loss": 0.6341, "mean_token_accuracy": 0.8119561076164246, "num_tokens": 49032570.0, "step": 308 }, { "epoch": 0.15717192268565616, "grad_norm": 1.1432855129241943, "learning_rate": 1e-05, "loss": 0.577, "mean_token_accuracy": 0.826465904712677, "num_tokens": 49206160.0, "step": 309 }, { "epoch": 0.15768056968463887, "grad_norm": 1.282848596572876, "learning_rate": 1e-05, "loss": 0.6243, "mean_token_accuracy": 0.812804102897644, "num_tokens": 49357658.0, "step": 310 }, { "epoch": 0.15818921668362157, "grad_norm": 1.22379469871521, "learning_rate": 1e-05, "loss": 0.6015, "mean_token_accuracy": 0.819198727607727, "num_tokens": 49513574.0, "step": 311 }, { "epoch": 0.15869786368260427, "grad_norm": 1.2921515703201294, "learning_rate": 1e-05, "loss": 0.5575, "mean_token_accuracy": 0.831973671913147, "num_tokens": 49678022.0, "step": 312 }, { "epoch": 0.15920651068158698, "grad_norm": 1.274613380432129, "learning_rate": 1e-05, "loss": 0.6113, "mean_token_accuracy": 0.8184370994567871, "num_tokens": 49844519.0, "step": 313 }, { "epoch": 0.15971515768056968, "grad_norm": 1.2928526401519775, "learning_rate": 1e-05, "loss": 0.5517, "mean_token_accuracy": 0.8329215049743652, "num_tokens": 50003658.0, "step": 314 }, { "epoch": 0.16022380467955238, "grad_norm": 1.3217474222183228, "learning_rate": 1e-05, "loss": 0.6026, "mean_token_accuracy": 0.8197334408760071, "num_tokens": 50169430.0, "step": 315 }, { "epoch": 0.1607324516785351, "grad_norm": 1.3033621311187744, "learning_rate": 1e-05, "loss": 0.5757, "mean_token_accuracy": 0.82660973072052, "num_tokens": 50337188.0, "step": 316 }, { "epoch": 0.1612410986775178, "grad_norm": 1.292182445526123, "learning_rate": 1e-05, "loss": 0.6152, "mean_token_accuracy": 0.8158968687057495, "num_tokens": 50488304.0, "step": 317 }, { "epoch": 0.1617497456765005, "grad_norm": 1.2796605825424194, "learning_rate": 1e-05, "loss": 0.6127, "mean_token_accuracy": 0.8183920383453369, "num_tokens": 50649690.0, "step": 318 }, { "epoch": 0.16225839267548323, "grad_norm": 1.2305800914764404, "learning_rate": 1e-05, "loss": 0.5674, "mean_token_accuracy": 0.827362060546875, "num_tokens": 50809934.0, "step": 319 }, { "epoch": 0.16276703967446593, "grad_norm": 1.260392665863037, "learning_rate": 1e-05, "loss": 0.6205, "mean_token_accuracy": 0.8147035241127014, "num_tokens": 50973651.0, "step": 320 }, { "epoch": 0.16327568667344863, "grad_norm": 1.2155548334121704, "learning_rate": 1e-05, "loss": 0.6425, "mean_token_accuracy": 0.8077457547187805, "num_tokens": 51152576.0, "step": 321 }, { "epoch": 0.16378433367243134, "grad_norm": 1.1556423902511597, "learning_rate": 1e-05, "loss": 0.5625, "mean_token_accuracy": 0.830562949180603, "num_tokens": 51302446.0, "step": 322 }, { "epoch": 0.16429298067141404, "grad_norm": 1.4814339876174927, "learning_rate": 1e-05, "loss": 0.5983, "mean_token_accuracy": 0.8216321468353271, "num_tokens": 51443754.0, "step": 323 }, { "epoch": 0.16480162767039674, "grad_norm": 1.1830353736877441, "learning_rate": 1e-05, "loss": 0.5914, "mean_token_accuracy": 0.823473334312439, "num_tokens": 51613127.0, "step": 324 }, { "epoch": 0.16531027466937945, "grad_norm": 1.1541862487792969, "learning_rate": 1e-05, "loss": 0.5801, "mean_token_accuracy": 0.8245047926902771, "num_tokens": 51778162.0, "step": 325 }, { "epoch": 0.16581892166836215, "grad_norm": 1.1448123455047607, "learning_rate": 1e-05, "loss": 0.5774, "mean_token_accuracy": 0.8273317813873291, "num_tokens": 51935798.0, "step": 326 }, { "epoch": 0.16632756866734486, "grad_norm": 1.1235575675964355, "learning_rate": 1e-05, "loss": 0.5998, "mean_token_accuracy": 0.8195568323135376, "num_tokens": 52089803.0, "step": 327 }, { "epoch": 0.16683621566632756, "grad_norm": 1.1180049180984497, "learning_rate": 1e-05, "loss": 0.5831, "mean_token_accuracy": 0.8214372396469116, "num_tokens": 52241386.0, "step": 328 }, { "epoch": 0.16734486266531026, "grad_norm": 1.107740044593811, "learning_rate": 1e-05, "loss": 0.6006, "mean_token_accuracy": 0.8191947937011719, "num_tokens": 52397569.0, "step": 329 }, { "epoch": 0.167853509664293, "grad_norm": 1.2240755558013916, "learning_rate": 1e-05, "loss": 0.5945, "mean_token_accuracy": 0.8211661577224731, "num_tokens": 52550468.0, "step": 330 }, { "epoch": 0.1683621566632757, "grad_norm": 1.105646014213562, "learning_rate": 1e-05, "loss": 0.5789, "mean_token_accuracy": 0.8263506293296814, "num_tokens": 52705147.0, "step": 331 }, { "epoch": 0.1688708036622584, "grad_norm": 1.18425714969635, "learning_rate": 1e-05, "loss": 0.6246, "mean_token_accuracy": 0.8139541149139404, "num_tokens": 52870603.0, "step": 332 }, { "epoch": 0.1693794506612411, "grad_norm": 1.1311122179031372, "learning_rate": 1e-05, "loss": 0.6008, "mean_token_accuracy": 0.8203775882720947, "num_tokens": 53027492.0, "step": 333 }, { "epoch": 0.1698880976602238, "grad_norm": 1.1832103729248047, "learning_rate": 1e-05, "loss": 0.6, "mean_token_accuracy": 0.8204811811447144, "num_tokens": 53186596.0, "step": 334 }, { "epoch": 0.1703967446592065, "grad_norm": 1.1455259323120117, "learning_rate": 1e-05, "loss": 0.5843, "mean_token_accuracy": 0.8235903978347778, "num_tokens": 53341286.0, "step": 335 }, { "epoch": 0.17090539165818922, "grad_norm": 1.187774658203125, "learning_rate": 1e-05, "loss": 0.6061, "mean_token_accuracy": 0.8166907429695129, "num_tokens": 53511107.0, "step": 336 }, { "epoch": 0.17141403865717192, "grad_norm": 1.1563060283660889, "learning_rate": 1e-05, "loss": 0.5617, "mean_token_accuracy": 0.8305507302284241, "num_tokens": 53673861.0, "step": 337 }, { "epoch": 0.17192268565615462, "grad_norm": 1.2529067993164062, "learning_rate": 1e-05, "loss": 0.5939, "mean_token_accuracy": 0.8233036994934082, "num_tokens": 53829950.0, "step": 338 }, { "epoch": 0.17243133265513733, "grad_norm": 1.1666814088821411, "learning_rate": 1e-05, "loss": 0.6117, "mean_token_accuracy": 0.818543553352356, "num_tokens": 53992561.0, "step": 339 }, { "epoch": 0.17293997965412003, "grad_norm": 1.267090082168579, "learning_rate": 1e-05, "loss": 0.6046, "mean_token_accuracy": 0.8193832635879517, "num_tokens": 54156414.0, "step": 340 }, { "epoch": 0.17344862665310273, "grad_norm": 1.122776985168457, "learning_rate": 1e-05, "loss": 0.5948, "mean_token_accuracy": 0.8207167983055115, "num_tokens": 54315479.0, "step": 341 }, { "epoch": 0.17395727365208546, "grad_norm": 1.184189796447754, "learning_rate": 1e-05, "loss": 0.589, "mean_token_accuracy": 0.821286678314209, "num_tokens": 54487854.0, "step": 342 }, { "epoch": 0.17446592065106817, "grad_norm": 1.2882232666015625, "learning_rate": 1e-05, "loss": 0.584, "mean_token_accuracy": 0.8235088586807251, "num_tokens": 54637940.0, "step": 343 }, { "epoch": 0.17497456765005087, "grad_norm": 1.1329272985458374, "learning_rate": 1e-05, "loss": 0.5877, "mean_token_accuracy": 0.8235629796981812, "num_tokens": 54799217.0, "step": 344 }, { "epoch": 0.17548321464903358, "grad_norm": 1.1960710287094116, "learning_rate": 1e-05, "loss": 0.5744, "mean_token_accuracy": 0.8274551033973694, "num_tokens": 54952384.0, "step": 345 }, { "epoch": 0.17599186164801628, "grad_norm": 1.1488741636276245, "learning_rate": 1e-05, "loss": 0.5993, "mean_token_accuracy": 0.8195698261260986, "num_tokens": 55126417.0, "step": 346 }, { "epoch": 0.17650050864699898, "grad_norm": 1.2244809865951538, "learning_rate": 1e-05, "loss": 0.6311, "mean_token_accuracy": 0.8094319701194763, "num_tokens": 55296549.0, "step": 347 }, { "epoch": 0.1770091556459817, "grad_norm": 1.1994503736495972, "learning_rate": 1e-05, "loss": 0.5694, "mean_token_accuracy": 0.8292071223258972, "num_tokens": 55457557.0, "step": 348 }, { "epoch": 0.1775178026449644, "grad_norm": 1.4169642925262451, "learning_rate": 1e-05, "loss": 0.5897, "mean_token_accuracy": 0.821076512336731, "num_tokens": 55615322.0, "step": 349 }, { "epoch": 0.1780264496439471, "grad_norm": 1.2098528146743774, "learning_rate": 1e-05, "loss": 0.6046, "mean_token_accuracy": 0.8202278017997742, "num_tokens": 55780975.0, "step": 350 }, { "epoch": 0.1785350966429298, "grad_norm": 1.3667292594909668, "learning_rate": 1e-05, "loss": 0.6017, "mean_token_accuracy": 0.820119321346283, "num_tokens": 55931561.0, "step": 351 }, { "epoch": 0.1790437436419125, "grad_norm": 1.2060900926589966, "learning_rate": 1e-05, "loss": 0.6278, "mean_token_accuracy": 0.8119494915008545, "num_tokens": 56097949.0, "step": 352 }, { "epoch": 0.17955239064089523, "grad_norm": 1.2072443962097168, "learning_rate": 1e-05, "loss": 0.6356, "mean_token_accuracy": 0.809796929359436, "num_tokens": 56256104.0, "step": 353 }, { "epoch": 0.18006103763987794, "grad_norm": 1.2915446758270264, "learning_rate": 1e-05, "loss": 0.617, "mean_token_accuracy": 0.816338062286377, "num_tokens": 56427795.0, "step": 354 }, { "epoch": 0.18056968463886064, "grad_norm": 1.329746961593628, "learning_rate": 1e-05, "loss": 0.5885, "mean_token_accuracy": 0.8227415084838867, "num_tokens": 56580755.0, "step": 355 }, { "epoch": 0.18107833163784334, "grad_norm": 1.2750502824783325, "learning_rate": 1e-05, "loss": 0.5959, "mean_token_accuracy": 0.8213445544242859, "num_tokens": 56725887.0, "step": 356 }, { "epoch": 0.18158697863682605, "grad_norm": 1.1861790418624878, "learning_rate": 1e-05, "loss": 0.5852, "mean_token_accuracy": 0.8219949007034302, "num_tokens": 56893128.0, "step": 357 }, { "epoch": 0.18209562563580875, "grad_norm": 1.1133451461791992, "learning_rate": 1e-05, "loss": 0.5786, "mean_token_accuracy": 0.8256653547286987, "num_tokens": 57066741.0, "step": 358 }, { "epoch": 0.18260427263479145, "grad_norm": 1.2725830078125, "learning_rate": 1e-05, "loss": 0.5759, "mean_token_accuracy": 0.8266841173171997, "num_tokens": 57215386.0, "step": 359 }, { "epoch": 0.18311291963377416, "grad_norm": 1.1565479040145874, "learning_rate": 1e-05, "loss": 0.5981, "mean_token_accuracy": 0.8205220699310303, "num_tokens": 57382943.0, "step": 360 }, { "epoch": 0.18362156663275686, "grad_norm": 1.2219996452331543, "learning_rate": 1e-05, "loss": 0.5773, "mean_token_accuracy": 0.8252044320106506, "num_tokens": 57533799.0, "step": 361 }, { "epoch": 0.18413021363173956, "grad_norm": 1.3008339405059814, "learning_rate": 1e-05, "loss": 0.5909, "mean_token_accuracy": 0.8215640783309937, "num_tokens": 57697387.0, "step": 362 }, { "epoch": 0.18463886063072227, "grad_norm": 1.0892812013626099, "learning_rate": 1e-05, "loss": 0.5967, "mean_token_accuracy": 0.821318507194519, "num_tokens": 57855776.0, "step": 363 }, { "epoch": 0.18514750762970497, "grad_norm": 1.2349947690963745, "learning_rate": 1e-05, "loss": 0.5748, "mean_token_accuracy": 0.8269065618515015, "num_tokens": 58024333.0, "step": 364 }, { "epoch": 0.1856561546286877, "grad_norm": 1.2553539276123047, "learning_rate": 1e-05, "loss": 0.5765, "mean_token_accuracy": 0.8262948393821716, "num_tokens": 58184182.0, "step": 365 }, { "epoch": 0.1861648016276704, "grad_norm": 1.0978028774261475, "learning_rate": 1e-05, "loss": 0.5976, "mean_token_accuracy": 0.8190889358520508, "num_tokens": 58338761.0, "step": 366 }, { "epoch": 0.1866734486266531, "grad_norm": 1.4213918447494507, "learning_rate": 1e-05, "loss": 0.5963, "mean_token_accuracy": 0.8194625973701477, "num_tokens": 58485678.0, "step": 367 }, { "epoch": 0.1871820956256358, "grad_norm": 1.233219861984253, "learning_rate": 1e-05, "loss": 0.6218, "mean_token_accuracy": 0.8158354759216309, "num_tokens": 58643138.0, "step": 368 }, { "epoch": 0.18769074262461852, "grad_norm": 1.2403314113616943, "learning_rate": 1e-05, "loss": 0.5756, "mean_token_accuracy": 0.826107382774353, "num_tokens": 58804789.0, "step": 369 }, { "epoch": 0.18819938962360122, "grad_norm": 1.2945451736450195, "learning_rate": 1e-05, "loss": 0.5785, "mean_token_accuracy": 0.8255239725112915, "num_tokens": 58978624.0, "step": 370 }, { "epoch": 0.18870803662258392, "grad_norm": 1.0491999387741089, "learning_rate": 1e-05, "loss": 0.5689, "mean_token_accuracy": 0.8316059112548828, "num_tokens": 59150156.0, "step": 371 }, { "epoch": 0.18921668362156663, "grad_norm": 1.2332353591918945, "learning_rate": 1e-05, "loss": 0.5409, "mean_token_accuracy": 0.8354936838150024, "num_tokens": 59307831.0, "step": 372 }, { "epoch": 0.18972533062054933, "grad_norm": 1.3186850547790527, "learning_rate": 1e-05, "loss": 0.6007, "mean_token_accuracy": 0.8197685480117798, "num_tokens": 59458464.0, "step": 373 }, { "epoch": 0.19023397761953204, "grad_norm": 1.1561511754989624, "learning_rate": 1e-05, "loss": 0.579, "mean_token_accuracy": 0.8258455395698547, "num_tokens": 59612951.0, "step": 374 }, { "epoch": 0.19074262461851474, "grad_norm": 1.2569273710250854, "learning_rate": 1e-05, "loss": 0.5515, "mean_token_accuracy": 0.8312716484069824, "num_tokens": 59774372.0, "step": 375 }, { "epoch": 0.19125127161749747, "grad_norm": 1.122464656829834, "learning_rate": 1e-05, "loss": 0.5549, "mean_token_accuracy": 0.8312299251556396, "num_tokens": 59940444.0, "step": 376 }, { "epoch": 0.19175991861648017, "grad_norm": 1.2286045551300049, "learning_rate": 1e-05, "loss": 0.5973, "mean_token_accuracy": 0.8190140724182129, "num_tokens": 60097403.0, "step": 377 }, { "epoch": 0.19226856561546288, "grad_norm": 1.1971975564956665, "learning_rate": 1e-05, "loss": 0.5743, "mean_token_accuracy": 0.8260021805763245, "num_tokens": 60270518.0, "step": 378 }, { "epoch": 0.19277721261444558, "grad_norm": 1.228528618812561, "learning_rate": 1e-05, "loss": 0.569, "mean_token_accuracy": 0.8281633257865906, "num_tokens": 60434705.0, "step": 379 }, { "epoch": 0.19328585961342828, "grad_norm": 1.1806086301803589, "learning_rate": 1e-05, "loss": 0.6026, "mean_token_accuracy": 0.818634569644928, "num_tokens": 60587162.0, "step": 380 }, { "epoch": 0.193794506612411, "grad_norm": 1.1534594297409058, "learning_rate": 1e-05, "loss": 0.5755, "mean_token_accuracy": 0.8261923789978027, "num_tokens": 60749142.0, "step": 381 }, { "epoch": 0.1943031536113937, "grad_norm": 1.1578582525253296, "learning_rate": 1e-05, "loss": 0.5654, "mean_token_accuracy": 0.8286859393119812, "num_tokens": 60917294.0, "step": 382 }, { "epoch": 0.1948118006103764, "grad_norm": 1.1654207706451416, "learning_rate": 1e-05, "loss": 0.5679, "mean_token_accuracy": 0.8284820318222046, "num_tokens": 61084543.0, "step": 383 }, { "epoch": 0.1953204476093591, "grad_norm": 1.355996012687683, "learning_rate": 1e-05, "loss": 0.6042, "mean_token_accuracy": 0.8220304846763611, "num_tokens": 61248579.0, "step": 384 }, { "epoch": 0.1958290946083418, "grad_norm": 1.3036282062530518, "learning_rate": 1e-05, "loss": 0.5891, "mean_token_accuracy": 0.8220359086990356, "num_tokens": 61403557.0, "step": 385 }, { "epoch": 0.1963377416073245, "grad_norm": 2.102263927459717, "learning_rate": 1e-05, "loss": 0.5846, "mean_token_accuracy": 0.8245697617530823, "num_tokens": 61568050.0, "step": 386 }, { "epoch": 0.1968463886063072, "grad_norm": 1.3433443307876587, "learning_rate": 1e-05, "loss": 0.6061, "mean_token_accuracy": 0.8184012174606323, "num_tokens": 61735941.0, "step": 387 }, { "epoch": 0.19735503560528994, "grad_norm": 1.1168278455734253, "learning_rate": 1e-05, "loss": 0.5418, "mean_token_accuracy": 0.8362807035446167, "num_tokens": 61895500.0, "step": 388 }, { "epoch": 0.19786368260427264, "grad_norm": 1.1756104230880737, "learning_rate": 1e-05, "loss": 0.5871, "mean_token_accuracy": 0.823564887046814, "num_tokens": 62062619.0, "step": 389 }, { "epoch": 0.19837232960325535, "grad_norm": 1.446220874786377, "learning_rate": 1e-05, "loss": 0.5916, "mean_token_accuracy": 0.8246034979820251, "num_tokens": 62247115.0, "step": 390 }, { "epoch": 0.19888097660223805, "grad_norm": 1.2506608963012695, "learning_rate": 1e-05, "loss": 0.5826, "mean_token_accuracy": 0.8243875503540039, "num_tokens": 62404051.0, "step": 391 }, { "epoch": 0.19938962360122076, "grad_norm": 1.0673178434371948, "learning_rate": 1e-05, "loss": 0.5957, "mean_token_accuracy": 0.8206828832626343, "num_tokens": 62559441.0, "step": 392 }, { "epoch": 0.19989827060020346, "grad_norm": 1.2300033569335938, "learning_rate": 1e-05, "loss": 0.6234, "mean_token_accuracy": 0.8144210577011108, "num_tokens": 62719791.0, "step": 393 }, { "epoch": 0.20040691759918616, "grad_norm": 1.2013486623764038, "learning_rate": 1e-05, "loss": 0.5846, "mean_token_accuracy": 0.8223665952682495, "num_tokens": 62863338.0, "step": 394 }, { "epoch": 0.20091556459816887, "grad_norm": 1.1348063945770264, "learning_rate": 1e-05, "loss": 0.5724, "mean_token_accuracy": 0.8273745775222778, "num_tokens": 63024566.0, "step": 395 }, { "epoch": 0.20142421159715157, "grad_norm": 1.0478370189666748, "learning_rate": 1e-05, "loss": 0.542, "mean_token_accuracy": 0.835296094417572, "num_tokens": 63189778.0, "step": 396 }, { "epoch": 0.20193285859613427, "grad_norm": 1.1494406461715698, "learning_rate": 1e-05, "loss": 0.5979, "mean_token_accuracy": 0.8189268112182617, "num_tokens": 63347541.0, "step": 397 }, { "epoch": 0.20244150559511698, "grad_norm": 1.1450999975204468, "learning_rate": 1e-05, "loss": 0.6077, "mean_token_accuracy": 0.8168392181396484, "num_tokens": 63513614.0, "step": 398 }, { "epoch": 0.2029501525940997, "grad_norm": 1.1753997802734375, "learning_rate": 1e-05, "loss": 0.5813, "mean_token_accuracy": 0.8233904838562012, "num_tokens": 63657699.0, "step": 399 }, { "epoch": 0.2034587995930824, "grad_norm": 1.0630018711090088, "learning_rate": 1e-05, "loss": 0.5784, "mean_token_accuracy": 0.8258477449417114, "num_tokens": 63810862.0, "step": 400 }, { "epoch": 0.20396744659206512, "grad_norm": 1.2419931888580322, "learning_rate": 1e-05, "loss": 0.6028, "mean_token_accuracy": 0.8186101317405701, "num_tokens": 63976337.0, "step": 401 }, { "epoch": 0.20447609359104782, "grad_norm": 1.1836490631103516, "learning_rate": 1e-05, "loss": 0.5862, "mean_token_accuracy": 0.8243967294692993, "num_tokens": 64131809.0, "step": 402 }, { "epoch": 0.20498474059003052, "grad_norm": 1.1719073057174683, "learning_rate": 1e-05, "loss": 0.6026, "mean_token_accuracy": 0.8206160068511963, "num_tokens": 64301531.0, "step": 403 }, { "epoch": 0.20549338758901323, "grad_norm": 1.1444793939590454, "learning_rate": 1e-05, "loss": 0.5803, "mean_token_accuracy": 0.8233711123466492, "num_tokens": 64453734.0, "step": 404 }, { "epoch": 0.20600203458799593, "grad_norm": 1.15047025680542, "learning_rate": 1e-05, "loss": 0.6118, "mean_token_accuracy": 0.8195493817329407, "num_tokens": 64614638.0, "step": 405 }, { "epoch": 0.20651068158697863, "grad_norm": 1.1457444429397583, "learning_rate": 1e-05, "loss": 0.5568, "mean_token_accuracy": 0.8303655385971069, "num_tokens": 64783877.0, "step": 406 }, { "epoch": 0.20701932858596134, "grad_norm": 1.1359026432037354, "learning_rate": 1e-05, "loss": 0.5916, "mean_token_accuracy": 0.8228204250335693, "num_tokens": 64946232.0, "step": 407 }, { "epoch": 0.20752797558494404, "grad_norm": 1.1045160293579102, "learning_rate": 1e-05, "loss": 0.5741, "mean_token_accuracy": 0.8265408277511597, "num_tokens": 65105547.0, "step": 408 }, { "epoch": 0.20803662258392674, "grad_norm": 1.1697115898132324, "learning_rate": 1e-05, "loss": 0.5974, "mean_token_accuracy": 0.8219442963600159, "num_tokens": 65262260.0, "step": 409 }, { "epoch": 0.20854526958290945, "grad_norm": 1.172763466835022, "learning_rate": 1e-05, "loss": 0.5835, "mean_token_accuracy": 0.8254345655441284, "num_tokens": 65425596.0, "step": 410 }, { "epoch": 0.20905391658189218, "grad_norm": 1.169799566268921, "learning_rate": 1e-05, "loss": 0.5881, "mean_token_accuracy": 0.8227865695953369, "num_tokens": 65582379.0, "step": 411 }, { "epoch": 0.20956256358087488, "grad_norm": 1.13645601272583, "learning_rate": 1e-05, "loss": 0.606, "mean_token_accuracy": 0.8167718052864075, "num_tokens": 65753543.0, "step": 412 }, { "epoch": 0.2100712105798576, "grad_norm": 1.1254587173461914, "learning_rate": 1e-05, "loss": 0.567, "mean_token_accuracy": 0.827894926071167, "num_tokens": 65920741.0, "step": 413 }, { "epoch": 0.2105798575788403, "grad_norm": 1.0942331552505493, "learning_rate": 1e-05, "loss": 0.554, "mean_token_accuracy": 0.8323182463645935, "num_tokens": 66079261.0, "step": 414 }, { "epoch": 0.211088504577823, "grad_norm": 1.1396540403366089, "learning_rate": 1e-05, "loss": 0.5848, "mean_token_accuracy": 0.8227716684341431, "num_tokens": 66249455.0, "step": 415 }, { "epoch": 0.2115971515768057, "grad_norm": 1.1977486610412598, "learning_rate": 1e-05, "loss": 0.6108, "mean_token_accuracy": 0.816601037979126, "num_tokens": 66410765.0, "step": 416 }, { "epoch": 0.2121057985757884, "grad_norm": 1.100052833557129, "learning_rate": 1e-05, "loss": 0.5754, "mean_token_accuracy": 0.8266069889068604, "num_tokens": 66564476.0, "step": 417 }, { "epoch": 0.2126144455747711, "grad_norm": 1.5787156820297241, "learning_rate": 1e-05, "loss": 0.6063, "mean_token_accuracy": 0.8198506832122803, "num_tokens": 66737804.0, "step": 418 }, { "epoch": 0.2131230925737538, "grad_norm": 1.1844232082366943, "learning_rate": 1e-05, "loss": 0.6005, "mean_token_accuracy": 0.8195633292198181, "num_tokens": 66895240.0, "step": 419 }, { "epoch": 0.2136317395727365, "grad_norm": 1.130846619606018, "learning_rate": 1e-05, "loss": 0.581, "mean_token_accuracy": 0.8251656293869019, "num_tokens": 67056288.0, "step": 420 }, { "epoch": 0.21414038657171922, "grad_norm": 1.147178053855896, "learning_rate": 1e-05, "loss": 0.5915, "mean_token_accuracy": 0.8218474984169006, "num_tokens": 67206207.0, "step": 421 }, { "epoch": 0.21464903357070192, "grad_norm": 1.1447371244430542, "learning_rate": 1e-05, "loss": 0.551, "mean_token_accuracy": 0.8317303657531738, "num_tokens": 67369756.0, "step": 422 }, { "epoch": 0.21515768056968465, "grad_norm": 1.1209125518798828, "learning_rate": 1e-05, "loss": 0.5624, "mean_token_accuracy": 0.8276084661483765, "num_tokens": 67527516.0, "step": 423 }, { "epoch": 0.21566632756866735, "grad_norm": 1.0791105031967163, "learning_rate": 1e-05, "loss": 0.5711, "mean_token_accuracy": 0.8268887996673584, "num_tokens": 67688042.0, "step": 424 }, { "epoch": 0.21617497456765006, "grad_norm": 1.2122269868850708, "learning_rate": 1e-05, "loss": 0.5764, "mean_token_accuracy": 0.8255937099456787, "num_tokens": 67854191.0, "step": 425 }, { "epoch": 0.21668362156663276, "grad_norm": 1.1728301048278809, "learning_rate": 1e-05, "loss": 0.5644, "mean_token_accuracy": 0.8275389671325684, "num_tokens": 68008533.0, "step": 426 }, { "epoch": 0.21719226856561547, "grad_norm": 1.1808736324310303, "learning_rate": 1e-05, "loss": 0.5863, "mean_token_accuracy": 0.8220161199569702, "num_tokens": 68162894.0, "step": 427 }, { "epoch": 0.21770091556459817, "grad_norm": 1.132716178894043, "learning_rate": 1e-05, "loss": 0.5408, "mean_token_accuracy": 0.8360246419906616, "num_tokens": 68332401.0, "step": 428 }, { "epoch": 0.21820956256358087, "grad_norm": 1.187079906463623, "learning_rate": 1e-05, "loss": 0.5538, "mean_token_accuracy": 0.8317699432373047, "num_tokens": 68489170.0, "step": 429 }, { "epoch": 0.21871820956256358, "grad_norm": 1.159859538078308, "learning_rate": 1e-05, "loss": 0.5951, "mean_token_accuracy": 0.8213475942611694, "num_tokens": 68643444.0, "step": 430 }, { "epoch": 0.21922685656154628, "grad_norm": 1.2300002574920654, "learning_rate": 1e-05, "loss": 0.5801, "mean_token_accuracy": 0.82547527551651, "num_tokens": 68812499.0, "step": 431 }, { "epoch": 0.21973550356052898, "grad_norm": 1.209106206893921, "learning_rate": 1e-05, "loss": 0.5688, "mean_token_accuracy": 0.8265661001205444, "num_tokens": 68976455.0, "step": 432 }, { "epoch": 0.2202441505595117, "grad_norm": 1.1866743564605713, "learning_rate": 1e-05, "loss": 0.6141, "mean_token_accuracy": 0.8169054388999939, "num_tokens": 69142925.0, "step": 433 }, { "epoch": 0.22075279755849442, "grad_norm": 1.195504069328308, "learning_rate": 1e-05, "loss": 0.603, "mean_token_accuracy": 0.8177850246429443, "num_tokens": 69307832.0, "step": 434 }, { "epoch": 0.22126144455747712, "grad_norm": 1.1744379997253418, "learning_rate": 1e-05, "loss": 0.5521, "mean_token_accuracy": 0.8307361602783203, "num_tokens": 69466104.0, "step": 435 }, { "epoch": 0.22177009155645983, "grad_norm": 1.1425153017044067, "learning_rate": 1e-05, "loss": 0.5404, "mean_token_accuracy": 0.8364987373352051, "num_tokens": 69619859.0, "step": 436 }, { "epoch": 0.22227873855544253, "grad_norm": 1.135064721107483, "learning_rate": 1e-05, "loss": 0.5463, "mean_token_accuracy": 0.8337409496307373, "num_tokens": 69774687.0, "step": 437 }, { "epoch": 0.22278738555442523, "grad_norm": 1.2530827522277832, "learning_rate": 1e-05, "loss": 0.6011, "mean_token_accuracy": 0.8196946978569031, "num_tokens": 69924843.0, "step": 438 }, { "epoch": 0.22329603255340794, "grad_norm": 1.1789072751998901, "learning_rate": 1e-05, "loss": 0.585, "mean_token_accuracy": 0.8250002861022949, "num_tokens": 70088441.0, "step": 439 }, { "epoch": 0.22380467955239064, "grad_norm": 1.1316715478897095, "learning_rate": 1e-05, "loss": 0.5731, "mean_token_accuracy": 0.8247166275978088, "num_tokens": 70244433.0, "step": 440 }, { "epoch": 0.22431332655137334, "grad_norm": 1.2392256259918213, "learning_rate": 1e-05, "loss": 0.5904, "mean_token_accuracy": 0.8200168609619141, "num_tokens": 70412934.0, "step": 441 }, { "epoch": 0.22482197355035605, "grad_norm": 1.2182955741882324, "learning_rate": 1e-05, "loss": 0.5784, "mean_token_accuracy": 0.8252276182174683, "num_tokens": 70568120.0, "step": 442 }, { "epoch": 0.22533062054933875, "grad_norm": 1.1843059062957764, "learning_rate": 1e-05, "loss": 0.5939, "mean_token_accuracy": 0.8191959261894226, "num_tokens": 70725251.0, "step": 443 }, { "epoch": 0.22583926754832145, "grad_norm": 1.1350986957550049, "learning_rate": 1e-05, "loss": 0.5896, "mean_token_accuracy": 0.821168839931488, "num_tokens": 70881374.0, "step": 444 }, { "epoch": 0.22634791454730416, "grad_norm": 1.2637605667114258, "learning_rate": 1e-05, "loss": 0.5871, "mean_token_accuracy": 0.8221958875656128, "num_tokens": 71029051.0, "step": 445 }, { "epoch": 0.2268565615462869, "grad_norm": 1.313187837600708, "learning_rate": 1e-05, "loss": 0.5732, "mean_token_accuracy": 0.8275098204612732, "num_tokens": 71192310.0, "step": 446 }, { "epoch": 0.2273652085452696, "grad_norm": 1.166354775428772, "learning_rate": 1e-05, "loss": 0.6006, "mean_token_accuracy": 0.8188575506210327, "num_tokens": 71355710.0, "step": 447 }, { "epoch": 0.2278738555442523, "grad_norm": 1.203367829322815, "learning_rate": 1e-05, "loss": 0.6033, "mean_token_accuracy": 0.819640040397644, "num_tokens": 71509580.0, "step": 448 }, { "epoch": 0.228382502543235, "grad_norm": 1.1382862329483032, "learning_rate": 1e-05, "loss": 0.5532, "mean_token_accuracy": 0.8330165147781372, "num_tokens": 71666995.0, "step": 449 }, { "epoch": 0.2288911495422177, "grad_norm": 1.3082048892974854, "learning_rate": 1e-05, "loss": 0.5732, "mean_token_accuracy": 0.8270379304885864, "num_tokens": 71825201.0, "step": 450 }, { "epoch": 0.2293997965412004, "grad_norm": 1.1791752576828003, "learning_rate": 1e-05, "loss": 0.5586, "mean_token_accuracy": 0.8284664154052734, "num_tokens": 71984961.0, "step": 451 }, { "epoch": 0.2299084435401831, "grad_norm": 1.2788444757461548, "learning_rate": 1e-05, "loss": 0.5792, "mean_token_accuracy": 0.8253182172775269, "num_tokens": 72158603.0, "step": 452 }, { "epoch": 0.23041709053916581, "grad_norm": 1.2335439920425415, "learning_rate": 1e-05, "loss": 0.5577, "mean_token_accuracy": 0.832517147064209, "num_tokens": 72313855.0, "step": 453 }, { "epoch": 0.23092573753814852, "grad_norm": 1.103445053100586, "learning_rate": 1e-05, "loss": 0.5746, "mean_token_accuracy": 0.825374186038971, "num_tokens": 72476307.0, "step": 454 }, { "epoch": 0.23143438453713122, "grad_norm": 1.1981607675552368, "learning_rate": 1e-05, "loss": 0.5696, "mean_token_accuracy": 0.8257123231887817, "num_tokens": 72630768.0, "step": 455 }, { "epoch": 0.23194303153611392, "grad_norm": 1.118881106376648, "learning_rate": 1e-05, "loss": 0.5706, "mean_token_accuracy": 0.8260403275489807, "num_tokens": 72805014.0, "step": 456 }, { "epoch": 0.23245167853509666, "grad_norm": 1.1847957372665405, "learning_rate": 1e-05, "loss": 0.5345, "mean_token_accuracy": 0.838158369064331, "num_tokens": 72965078.0, "step": 457 }, { "epoch": 0.23296032553407936, "grad_norm": 1.1698648929595947, "learning_rate": 1e-05, "loss": 0.5544, "mean_token_accuracy": 0.8327252864837646, "num_tokens": 73123811.0, "step": 458 }, { "epoch": 0.23346897253306206, "grad_norm": 1.185947060585022, "learning_rate": 1e-05, "loss": 0.5565, "mean_token_accuracy": 0.8305568695068359, "num_tokens": 73286856.0, "step": 459 }, { "epoch": 0.23397761953204477, "grad_norm": 1.122018814086914, "learning_rate": 1e-05, "loss": 0.5797, "mean_token_accuracy": 0.8231082558631897, "num_tokens": 73432474.0, "step": 460 }, { "epoch": 0.23448626653102747, "grad_norm": 1.1647952795028687, "learning_rate": 1e-05, "loss": 0.5715, "mean_token_accuracy": 0.8255927562713623, "num_tokens": 73589719.0, "step": 461 }, { "epoch": 0.23499491353001017, "grad_norm": 1.113539695739746, "learning_rate": 1e-05, "loss": 0.5718, "mean_token_accuracy": 0.8253976106643677, "num_tokens": 73759617.0, "step": 462 }, { "epoch": 0.23550356052899288, "grad_norm": 1.204064130783081, "learning_rate": 1e-05, "loss": 0.5947, "mean_token_accuracy": 0.8204372525215149, "num_tokens": 73911743.0, "step": 463 }, { "epoch": 0.23601220752797558, "grad_norm": 1.2407692670822144, "learning_rate": 1e-05, "loss": 0.5819, "mean_token_accuracy": 0.8239152431488037, "num_tokens": 74082923.0, "step": 464 }, { "epoch": 0.23652085452695829, "grad_norm": 1.24295175075531, "learning_rate": 1e-05, "loss": 0.5675, "mean_token_accuracy": 0.8276124000549316, "num_tokens": 74243087.0, "step": 465 }, { "epoch": 0.237029501525941, "grad_norm": 1.0703984498977661, "learning_rate": 1e-05, "loss": 0.556, "mean_token_accuracy": 0.829281747341156, "num_tokens": 74410401.0, "step": 466 }, { "epoch": 0.2375381485249237, "grad_norm": 1.217100739479065, "learning_rate": 1e-05, "loss": 0.571, "mean_token_accuracy": 0.8262215256690979, "num_tokens": 74568603.0, "step": 467 }, { "epoch": 0.2380467955239064, "grad_norm": 1.1968027353286743, "learning_rate": 1e-05, "loss": 0.5797, "mean_token_accuracy": 0.8254478573799133, "num_tokens": 74727709.0, "step": 468 }, { "epoch": 0.23855544252288913, "grad_norm": 1.1246533393859863, "learning_rate": 1e-05, "loss": 0.5604, "mean_token_accuracy": 0.8291188478469849, "num_tokens": 74890980.0, "step": 469 }, { "epoch": 0.23906408952187183, "grad_norm": 1.2297966480255127, "learning_rate": 1e-05, "loss": 0.5292, "mean_token_accuracy": 0.8388729691505432, "num_tokens": 75056158.0, "step": 470 }, { "epoch": 0.23957273652085453, "grad_norm": 1.157701849937439, "learning_rate": 1e-05, "loss": 0.567, "mean_token_accuracy": 0.8297462463378906, "num_tokens": 75217454.0, "step": 471 }, { "epoch": 0.24008138351983724, "grad_norm": 1.1809066534042358, "learning_rate": 1e-05, "loss": 0.5655, "mean_token_accuracy": 0.827551007270813, "num_tokens": 75380129.0, "step": 472 }, { "epoch": 0.24059003051881994, "grad_norm": 1.1710585355758667, "learning_rate": 1e-05, "loss": 0.5434, "mean_token_accuracy": 0.8329430818557739, "num_tokens": 75538207.0, "step": 473 }, { "epoch": 0.24109867751780265, "grad_norm": 1.2090671062469482, "learning_rate": 1e-05, "loss": 0.5412, "mean_token_accuracy": 0.8352799415588379, "num_tokens": 75692158.0, "step": 474 }, { "epoch": 0.24160732451678535, "grad_norm": 1.1606247425079346, "learning_rate": 1e-05, "loss": 0.5525, "mean_token_accuracy": 0.8310859203338623, "num_tokens": 75850325.0, "step": 475 }, { "epoch": 0.24211597151576805, "grad_norm": 1.1594878435134888, "learning_rate": 1e-05, "loss": 0.537, "mean_token_accuracy": 0.8359923958778381, "num_tokens": 76015367.0, "step": 476 }, { "epoch": 0.24262461851475076, "grad_norm": 1.145020842552185, "learning_rate": 1e-05, "loss": 0.5472, "mean_token_accuracy": 0.8321256041526794, "num_tokens": 76165209.0, "step": 477 }, { "epoch": 0.24313326551373346, "grad_norm": 1.1626603603363037, "learning_rate": 1e-05, "loss": 0.5882, "mean_token_accuracy": 0.8215240240097046, "num_tokens": 76309618.0, "step": 478 }, { "epoch": 0.24364191251271616, "grad_norm": 1.2585151195526123, "learning_rate": 1e-05, "loss": 0.5432, "mean_token_accuracy": 0.8327075242996216, "num_tokens": 76468138.0, "step": 479 }, { "epoch": 0.2441505595116989, "grad_norm": 1.1981087923049927, "learning_rate": 1e-05, "loss": 0.5643, "mean_token_accuracy": 0.8272363543510437, "num_tokens": 76638347.0, "step": 480 }, { "epoch": 0.2446592065106816, "grad_norm": 1.1107510328292847, "learning_rate": 1e-05, "loss": 0.5826, "mean_token_accuracy": 0.8227603435516357, "num_tokens": 76807912.0, "step": 481 }, { "epoch": 0.2451678535096643, "grad_norm": 1.204334020614624, "learning_rate": 1e-05, "loss": 0.5646, "mean_token_accuracy": 0.8287724256515503, "num_tokens": 76960377.0, "step": 482 }, { "epoch": 0.245676500508647, "grad_norm": 1.0959969758987427, "learning_rate": 1e-05, "loss": 0.6225, "mean_token_accuracy": 0.8141569495201111, "num_tokens": 77135219.0, "step": 483 }, { "epoch": 0.2461851475076297, "grad_norm": 1.1571955680847168, "learning_rate": 1e-05, "loss": 0.5869, "mean_token_accuracy": 0.823060929775238, "num_tokens": 77284277.0, "step": 484 }, { "epoch": 0.2466937945066124, "grad_norm": 1.116564393043518, "learning_rate": 1e-05, "loss": 0.5758, "mean_token_accuracy": 0.8271507024765015, "num_tokens": 77463576.0, "step": 485 }, { "epoch": 0.24720244150559512, "grad_norm": 1.211808443069458, "learning_rate": 1e-05, "loss": 0.5533, "mean_token_accuracy": 0.8313890695571899, "num_tokens": 77624014.0, "step": 486 }, { "epoch": 0.24771108850457782, "grad_norm": 1.2386025190353394, "learning_rate": 1e-05, "loss": 0.5741, "mean_token_accuracy": 0.8249900937080383, "num_tokens": 77786805.0, "step": 487 }, { "epoch": 0.24821973550356052, "grad_norm": 1.2798279523849487, "learning_rate": 1e-05, "loss": 0.5859, "mean_token_accuracy": 0.8219289779663086, "num_tokens": 77955105.0, "step": 488 }, { "epoch": 0.24872838250254323, "grad_norm": 1.3217813968658447, "learning_rate": 1e-05, "loss": 0.555, "mean_token_accuracy": 0.8318721652030945, "num_tokens": 78103025.0, "step": 489 }, { "epoch": 0.24923702950152593, "grad_norm": 1.253214716911316, "learning_rate": 1e-05, "loss": 0.555, "mean_token_accuracy": 0.830656886100769, "num_tokens": 78259934.0, "step": 490 }, { "epoch": 0.24974567650050863, "grad_norm": 1.133701205253601, "learning_rate": 1e-05, "loss": 0.556, "mean_token_accuracy": 0.8326247930526733, "num_tokens": 78420450.0, "step": 491 }, { "epoch": 0.25025432349949134, "grad_norm": 1.265200138092041, "learning_rate": 1e-05, "loss": 0.5736, "mean_token_accuracy": 0.8258102536201477, "num_tokens": 78575928.0, "step": 492 }, { "epoch": 0.25076297049847407, "grad_norm": 1.128560185432434, "learning_rate": 1e-05, "loss": 0.5646, "mean_token_accuracy": 0.8270517587661743, "num_tokens": 78723238.0, "step": 493 }, { "epoch": 0.25127161749745675, "grad_norm": 1.2510902881622314, "learning_rate": 1e-05, "loss": 0.5781, "mean_token_accuracy": 0.8246864080429077, "num_tokens": 78897539.0, "step": 494 }, { "epoch": 0.2517802644964395, "grad_norm": 1.1227195262908936, "learning_rate": 1e-05, "loss": 0.5312, "mean_token_accuracy": 0.8382051587104797, "num_tokens": 79059834.0, "step": 495 }, { "epoch": 0.25228891149542215, "grad_norm": 1.2865270376205444, "learning_rate": 1e-05, "loss": 0.5752, "mean_token_accuracy": 0.8254743218421936, "num_tokens": 79206412.0, "step": 496 }, { "epoch": 0.2527975584944049, "grad_norm": 1.1764838695526123, "learning_rate": 1e-05, "loss": 0.5741, "mean_token_accuracy": 0.8257397413253784, "num_tokens": 79379375.0, "step": 497 }, { "epoch": 0.2533062054933876, "grad_norm": 1.19319748878479, "learning_rate": 1e-05, "loss": 0.5585, "mean_token_accuracy": 0.8302797675132751, "num_tokens": 79550225.0, "step": 498 }, { "epoch": 0.2538148524923703, "grad_norm": 1.2153621912002563, "learning_rate": 1e-05, "loss": 0.5426, "mean_token_accuracy": 0.8351129293441772, "num_tokens": 79705365.0, "step": 499 }, { "epoch": 0.254323499491353, "grad_norm": 1.177880883216858, "learning_rate": 1e-05, "loss": 0.57, "mean_token_accuracy": 0.8263179063796997, "num_tokens": 79861100.0, "step": 500 }, { "epoch": 0.2548321464903357, "grad_norm": 1.2405526638031006, "learning_rate": 1e-05, "loss": 0.5707, "mean_token_accuracy": 0.8265781402587891, "num_tokens": 80017875.0, "step": 501 }, { "epoch": 0.25534079348931843, "grad_norm": 1.1290812492370605, "learning_rate": 1e-05, "loss": 0.5704, "mean_token_accuracy": 0.8263634443283081, "num_tokens": 80184953.0, "step": 502 }, { "epoch": 0.2558494404883011, "grad_norm": 1.1392712593078613, "learning_rate": 1e-05, "loss": 0.5685, "mean_token_accuracy": 0.828170120716095, "num_tokens": 80342864.0, "step": 503 }, { "epoch": 0.25635808748728384, "grad_norm": 1.1106423139572144, "learning_rate": 1e-05, "loss": 0.5356, "mean_token_accuracy": 0.8346061706542969, "num_tokens": 80497644.0, "step": 504 }, { "epoch": 0.2568667344862665, "grad_norm": 1.235333800315857, "learning_rate": 1e-05, "loss": 0.6029, "mean_token_accuracy": 0.817632794380188, "num_tokens": 80652132.0, "step": 505 }, { "epoch": 0.25737538148524924, "grad_norm": 1.1881519556045532, "learning_rate": 1e-05, "loss": 0.5628, "mean_token_accuracy": 0.8317403793334961, "num_tokens": 80815501.0, "step": 506 }, { "epoch": 0.2578840284842319, "grad_norm": 1.3391587734222412, "learning_rate": 1e-05, "loss": 0.5836, "mean_token_accuracy": 0.8228754997253418, "num_tokens": 80982538.0, "step": 507 }, { "epoch": 0.25839267548321465, "grad_norm": 1.2976542711257935, "learning_rate": 1e-05, "loss": 0.5823, "mean_token_accuracy": 0.8225487470626831, "num_tokens": 81137163.0, "step": 508 }, { "epoch": 0.2589013224821974, "grad_norm": 1.196365475654602, "learning_rate": 1e-05, "loss": 0.5924, "mean_token_accuracy": 0.8200660943984985, "num_tokens": 81304752.0, "step": 509 }, { "epoch": 0.25940996948118006, "grad_norm": 1.3634917736053467, "learning_rate": 1e-05, "loss": 0.5546, "mean_token_accuracy": 0.829670786857605, "num_tokens": 81448192.0, "step": 510 }, { "epoch": 0.2599186164801628, "grad_norm": 1.3710311651229858, "learning_rate": 1e-05, "loss": 0.5916, "mean_token_accuracy": 0.8205992579460144, "num_tokens": 81608372.0, "step": 511 }, { "epoch": 0.26042726347914547, "grad_norm": 1.280065894126892, "learning_rate": 1e-05, "loss": 0.5564, "mean_token_accuracy": 0.8289808034896851, "num_tokens": 81757788.0, "step": 512 }, { "epoch": 0.2609359104781282, "grad_norm": 1.1710976362228394, "learning_rate": 1e-05, "loss": 0.5536, "mean_token_accuracy": 0.8313885927200317, "num_tokens": 81917301.0, "step": 513 }, { "epoch": 0.2614445574771109, "grad_norm": 1.1584504842758179, "learning_rate": 1e-05, "loss": 0.5652, "mean_token_accuracy": 0.8271463513374329, "num_tokens": 82085431.0, "step": 514 }, { "epoch": 0.2619532044760936, "grad_norm": 1.0697828531265259, "learning_rate": 1e-05, "loss": 0.5555, "mean_token_accuracy": 0.8318459391593933, "num_tokens": 82238284.0, "step": 515 }, { "epoch": 0.2624618514750763, "grad_norm": 1.1559085845947266, "learning_rate": 1e-05, "loss": 0.5851, "mean_token_accuracy": 0.8227416276931763, "num_tokens": 82404647.0, "step": 516 }, { "epoch": 0.262970498474059, "grad_norm": 1.1305633783340454, "learning_rate": 1e-05, "loss": 0.5602, "mean_token_accuracy": 0.8290926814079285, "num_tokens": 82572960.0, "step": 517 }, { "epoch": 0.2634791454730417, "grad_norm": 1.0789847373962402, "learning_rate": 1e-05, "loss": 0.5676, "mean_token_accuracy": 0.8283363580703735, "num_tokens": 82738083.0, "step": 518 }, { "epoch": 0.2639877924720244, "grad_norm": 1.2178400754928589, "learning_rate": 1e-05, "loss": 0.5596, "mean_token_accuracy": 0.8298940658569336, "num_tokens": 82887087.0, "step": 519 }, { "epoch": 0.2644964394710071, "grad_norm": 1.152214527130127, "learning_rate": 1e-05, "loss": 0.5696, "mean_token_accuracy": 0.826300323009491, "num_tokens": 83033584.0, "step": 520 }, { "epoch": 0.2650050864699898, "grad_norm": 1.2611678838729858, "learning_rate": 1e-05, "loss": 0.5777, "mean_token_accuracy": 0.8233135342597961, "num_tokens": 83195499.0, "step": 521 }, { "epoch": 0.26551373346897256, "grad_norm": 1.1738433837890625, "learning_rate": 1e-05, "loss": 0.5708, "mean_token_accuracy": 0.8280128240585327, "num_tokens": 83363928.0, "step": 522 }, { "epoch": 0.26602238046795523, "grad_norm": 1.1233623027801514, "learning_rate": 1e-05, "loss": 0.5617, "mean_token_accuracy": 0.828639805316925, "num_tokens": 83520177.0, "step": 523 }, { "epoch": 0.26653102746693796, "grad_norm": 1.09394371509552, "learning_rate": 1e-05, "loss": 0.5443, "mean_token_accuracy": 0.8328458666801453, "num_tokens": 83680255.0, "step": 524 }, { "epoch": 0.26703967446592064, "grad_norm": 1.126006841659546, "learning_rate": 1e-05, "loss": 0.5638, "mean_token_accuracy": 0.8271574974060059, "num_tokens": 83852400.0, "step": 525 }, { "epoch": 0.26754832146490337, "grad_norm": 1.186543583869934, "learning_rate": 1e-05, "loss": 0.5908, "mean_token_accuracy": 0.8196697235107422, "num_tokens": 84014162.0, "step": 526 }, { "epoch": 0.26805696846388605, "grad_norm": 1.0805085897445679, "learning_rate": 1e-05, "loss": 0.5464, "mean_token_accuracy": 0.8328375220298767, "num_tokens": 84170100.0, "step": 527 }, { "epoch": 0.2685656154628688, "grad_norm": 1.1757475137710571, "learning_rate": 1e-05, "loss": 0.5305, "mean_token_accuracy": 0.8382119536399841, "num_tokens": 84329460.0, "step": 528 }, { "epoch": 0.26907426246185145, "grad_norm": 1.1372390985488892, "learning_rate": 1e-05, "loss": 0.5253, "mean_token_accuracy": 0.8392993211746216, "num_tokens": 84486375.0, "step": 529 }, { "epoch": 0.2695829094608342, "grad_norm": 1.1432785987854004, "learning_rate": 1e-05, "loss": 0.5336, "mean_token_accuracy": 0.837626576423645, "num_tokens": 84643661.0, "step": 530 }, { "epoch": 0.27009155645981686, "grad_norm": 1.1244240999221802, "learning_rate": 1e-05, "loss": 0.5488, "mean_token_accuracy": 0.83336341381073, "num_tokens": 84812504.0, "step": 531 }, { "epoch": 0.2706002034587996, "grad_norm": 1.108756184577942, "learning_rate": 1e-05, "loss": 0.5488, "mean_token_accuracy": 0.8323627710342407, "num_tokens": 84968874.0, "step": 532 }, { "epoch": 0.2711088504577823, "grad_norm": 1.1216325759887695, "learning_rate": 1e-05, "loss": 0.5543, "mean_token_accuracy": 0.8296369314193726, "num_tokens": 85129918.0, "step": 533 }, { "epoch": 0.271617497456765, "grad_norm": 1.092794418334961, "learning_rate": 1e-05, "loss": 0.5476, "mean_token_accuracy": 0.8335623741149902, "num_tokens": 85292100.0, "step": 534 }, { "epoch": 0.27212614445574773, "grad_norm": 1.1919782161712646, "learning_rate": 1e-05, "loss": 0.5328, "mean_token_accuracy": 0.839339017868042, "num_tokens": 85445236.0, "step": 535 }, { "epoch": 0.2726347914547304, "grad_norm": 1.1656126976013184, "learning_rate": 1e-05, "loss": 0.5675, "mean_token_accuracy": 0.827284574508667, "num_tokens": 85605945.0, "step": 536 }, { "epoch": 0.27314343845371314, "grad_norm": 1.3499900102615356, "learning_rate": 1e-05, "loss": 0.5479, "mean_token_accuracy": 0.8318436145782471, "num_tokens": 85760177.0, "step": 537 }, { "epoch": 0.2736520854526958, "grad_norm": 1.0947843790054321, "learning_rate": 1e-05, "loss": 0.5664, "mean_token_accuracy": 0.8289875984191895, "num_tokens": 85924331.0, "step": 538 }, { "epoch": 0.27416073245167855, "grad_norm": 1.0087196826934814, "learning_rate": 1e-05, "loss": 0.5853, "mean_token_accuracy": 0.8228893280029297, "num_tokens": 86087686.0, "step": 539 }, { "epoch": 0.2746693794506612, "grad_norm": 1.1464523077011108, "learning_rate": 1e-05, "loss": 0.5818, "mean_token_accuracy": 0.8230845928192139, "num_tokens": 86231923.0, "step": 540 }, { "epoch": 0.27517802644964395, "grad_norm": 1.132155179977417, "learning_rate": 1e-05, "loss": 0.5538, "mean_token_accuracy": 0.8299921751022339, "num_tokens": 86393599.0, "step": 541 }, { "epoch": 0.27568667344862663, "grad_norm": 1.1671158075332642, "learning_rate": 1e-05, "loss": 0.5237, "mean_token_accuracy": 0.8375282287597656, "num_tokens": 86546912.0, "step": 542 }, { "epoch": 0.27619532044760936, "grad_norm": 1.2495208978652954, "learning_rate": 1e-05, "loss": 0.6201, "mean_token_accuracy": 0.814518392086029, "num_tokens": 86710627.0, "step": 543 }, { "epoch": 0.2767039674465921, "grad_norm": 1.2193448543548584, "learning_rate": 1e-05, "loss": 0.5723, "mean_token_accuracy": 0.8245118856430054, "num_tokens": 86872441.0, "step": 544 }, { "epoch": 0.27721261444557477, "grad_norm": 1.4755446910858154, "learning_rate": 1e-05, "loss": 0.58, "mean_token_accuracy": 0.8226820230484009, "num_tokens": 87040426.0, "step": 545 }, { "epoch": 0.2777212614445575, "grad_norm": 1.0825291872024536, "learning_rate": 1e-05, "loss": 0.5649, "mean_token_accuracy": 0.8284255862236023, "num_tokens": 87204457.0, "step": 546 }, { "epoch": 0.2782299084435402, "grad_norm": 1.168476939201355, "learning_rate": 1e-05, "loss": 0.5733, "mean_token_accuracy": 0.8267923593521118, "num_tokens": 87350790.0, "step": 547 }, { "epoch": 0.2787385554425229, "grad_norm": 1.2543644905090332, "learning_rate": 1e-05, "loss": 0.5804, "mean_token_accuracy": 0.8234599232673645, "num_tokens": 87501470.0, "step": 548 }, { "epoch": 0.2792472024415056, "grad_norm": 1.14836847782135, "learning_rate": 1e-05, "loss": 0.5712, "mean_token_accuracy": 0.8264409303665161, "num_tokens": 87654280.0, "step": 549 }, { "epoch": 0.2797558494404883, "grad_norm": 1.1316653490066528, "learning_rate": 1e-05, "loss": 0.5536, "mean_token_accuracy": 0.8308054208755493, "num_tokens": 87816423.0, "step": 550 }, { "epoch": 0.280264496439471, "grad_norm": 1.271012783050537, "learning_rate": 1e-05, "loss": 0.5891, "mean_token_accuracy": 0.8203872442245483, "num_tokens": 87960156.0, "step": 551 }, { "epoch": 0.2807731434384537, "grad_norm": 1.1925300359725952, "learning_rate": 1e-05, "loss": 0.5343, "mean_token_accuracy": 0.8370486497879028, "num_tokens": 88105701.0, "step": 552 }, { "epoch": 0.2812817904374364, "grad_norm": 1.173293113708496, "learning_rate": 1e-05, "loss": 0.5629, "mean_token_accuracy": 0.8282473087310791, "num_tokens": 88261189.0, "step": 553 }, { "epoch": 0.28179043743641913, "grad_norm": 1.1886355876922607, "learning_rate": 1e-05, "loss": 0.5721, "mean_token_accuracy": 0.8262102603912354, "num_tokens": 88429993.0, "step": 554 }, { "epoch": 0.2822990844354018, "grad_norm": 1.221113681793213, "learning_rate": 1e-05, "loss": 0.559, "mean_token_accuracy": 0.8296666145324707, "num_tokens": 88598116.0, "step": 555 }, { "epoch": 0.28280773143438453, "grad_norm": 1.2348664999008179, "learning_rate": 1e-05, "loss": 0.5854, "mean_token_accuracy": 0.8205877542495728, "num_tokens": 88754109.0, "step": 556 }, { "epoch": 0.28331637843336727, "grad_norm": 1.3055741786956787, "learning_rate": 1e-05, "loss": 0.5708, "mean_token_accuracy": 0.8285917043685913, "num_tokens": 88911407.0, "step": 557 }, { "epoch": 0.28382502543234994, "grad_norm": 1.2409954071044922, "learning_rate": 1e-05, "loss": 0.6024, "mean_token_accuracy": 0.8154451847076416, "num_tokens": 89075556.0, "step": 558 }, { "epoch": 0.2843336724313327, "grad_norm": 1.163521647453308, "learning_rate": 1e-05, "loss": 0.5642, "mean_token_accuracy": 0.8276627659797668, "num_tokens": 89234090.0, "step": 559 }, { "epoch": 0.28484231943031535, "grad_norm": 1.3338189125061035, "learning_rate": 1e-05, "loss": 0.5765, "mean_token_accuracy": 0.8253119587898254, "num_tokens": 89394941.0, "step": 560 }, { "epoch": 0.2853509664292981, "grad_norm": 1.2435412406921387, "learning_rate": 1e-05, "loss": 0.5745, "mean_token_accuracy": 0.8244850039482117, "num_tokens": 89560585.0, "step": 561 }, { "epoch": 0.28585961342828076, "grad_norm": 1.0883127450942993, "learning_rate": 1e-05, "loss": 0.5723, "mean_token_accuracy": 0.8268938660621643, "num_tokens": 89726320.0, "step": 562 }, { "epoch": 0.2863682604272635, "grad_norm": 1.1344267129898071, "learning_rate": 1e-05, "loss": 0.5375, "mean_token_accuracy": 0.8374463319778442, "num_tokens": 89880236.0, "step": 563 }, { "epoch": 0.28687690742624616, "grad_norm": 1.095139741897583, "learning_rate": 1e-05, "loss": 0.5982, "mean_token_accuracy": 0.8192014694213867, "num_tokens": 90046963.0, "step": 564 }, { "epoch": 0.2873855544252289, "grad_norm": 1.0343623161315918, "learning_rate": 1e-05, "loss": 0.5436, "mean_token_accuracy": 0.8351205587387085, "num_tokens": 90208654.0, "step": 565 }, { "epoch": 0.28789420142421157, "grad_norm": 1.2066987752914429, "learning_rate": 1e-05, "loss": 0.5724, "mean_token_accuracy": 0.8254907131195068, "num_tokens": 90360145.0, "step": 566 }, { "epoch": 0.2884028484231943, "grad_norm": 1.2492485046386719, "learning_rate": 1e-05, "loss": 0.5638, "mean_token_accuracy": 0.8292480707168579, "num_tokens": 90517341.0, "step": 567 }, { "epoch": 0.28891149542217703, "grad_norm": 1.1790997982025146, "learning_rate": 1e-05, "loss": 0.5624, "mean_token_accuracy": 0.8293505907058716, "num_tokens": 90666248.0, "step": 568 }, { "epoch": 0.2894201424211597, "grad_norm": 1.1388493776321411, "learning_rate": 1e-05, "loss": 0.5468, "mean_token_accuracy": 0.8336046934127808, "num_tokens": 90824390.0, "step": 569 }, { "epoch": 0.28992878942014244, "grad_norm": 1.0945671796798706, "learning_rate": 1e-05, "loss": 0.5772, "mean_token_accuracy": 0.8247801661491394, "num_tokens": 90990168.0, "step": 570 }, { "epoch": 0.2904374364191251, "grad_norm": 1.185009241104126, "learning_rate": 1e-05, "loss": 0.5592, "mean_token_accuracy": 0.8298337459564209, "num_tokens": 91130945.0, "step": 571 }, { "epoch": 0.29094608341810785, "grad_norm": 1.0907163619995117, "learning_rate": 1e-05, "loss": 0.5513, "mean_token_accuracy": 0.8324260711669922, "num_tokens": 91287120.0, "step": 572 }, { "epoch": 0.2914547304170905, "grad_norm": 1.1829215288162231, "learning_rate": 1e-05, "loss": 0.5664, "mean_token_accuracy": 0.8273265957832336, "num_tokens": 91445888.0, "step": 573 }, { "epoch": 0.29196337741607326, "grad_norm": 1.0761456489562988, "learning_rate": 1e-05, "loss": 0.5296, "mean_token_accuracy": 0.837720513343811, "num_tokens": 91605888.0, "step": 574 }, { "epoch": 0.29247202441505593, "grad_norm": 1.1502619981765747, "learning_rate": 1e-05, "loss": 0.6016, "mean_token_accuracy": 0.8188234567642212, "num_tokens": 91766382.0, "step": 575 }, { "epoch": 0.29298067141403866, "grad_norm": 1.134983777999878, "learning_rate": 1e-05, "loss": 0.562, "mean_token_accuracy": 0.8294249773025513, "num_tokens": 91923111.0, "step": 576 }, { "epoch": 0.29348931841302134, "grad_norm": 1.0459659099578857, "learning_rate": 1e-05, "loss": 0.5822, "mean_token_accuracy": 0.8240058422088623, "num_tokens": 92076217.0, "step": 577 }, { "epoch": 0.29399796541200407, "grad_norm": 1.205342411994934, "learning_rate": 1e-05, "loss": 0.5151, "mean_token_accuracy": 0.8412837982177734, "num_tokens": 92233100.0, "step": 578 }, { "epoch": 0.2945066124109868, "grad_norm": 1.2601711750030518, "learning_rate": 1e-05, "loss": 0.6028, "mean_token_accuracy": 0.8175768852233887, "num_tokens": 92384392.0, "step": 579 }, { "epoch": 0.2950152594099695, "grad_norm": 1.1117498874664307, "learning_rate": 1e-05, "loss": 0.5212, "mean_token_accuracy": 0.8412041664123535, "num_tokens": 92538563.0, "step": 580 }, { "epoch": 0.2955239064089522, "grad_norm": 1.2051867246627808, "learning_rate": 1e-05, "loss": 0.5439, "mean_token_accuracy": 0.8337169885635376, "num_tokens": 92702514.0, "step": 581 }, { "epoch": 0.2960325534079349, "grad_norm": 1.1142427921295166, "learning_rate": 1e-05, "loss": 0.509, "mean_token_accuracy": 0.8424208760261536, "num_tokens": 92861897.0, "step": 582 }, { "epoch": 0.2965412004069176, "grad_norm": 1.2217726707458496, "learning_rate": 1e-05, "loss": 0.5242, "mean_token_accuracy": 0.8390108346939087, "num_tokens": 93021485.0, "step": 583 }, { "epoch": 0.2970498474059003, "grad_norm": 1.1346980333328247, "learning_rate": 1e-05, "loss": 0.4927, "mean_token_accuracy": 0.8458535075187683, "num_tokens": 93169971.0, "step": 584 }, { "epoch": 0.297558494404883, "grad_norm": 1.2165158987045288, "learning_rate": 1e-05, "loss": 0.549, "mean_token_accuracy": 0.8321750164031982, "num_tokens": 93321364.0, "step": 585 }, { "epoch": 0.2980671414038657, "grad_norm": 1.2306874990463257, "learning_rate": 1e-05, "loss": 0.5744, "mean_token_accuracy": 0.825247049331665, "num_tokens": 93476129.0, "step": 586 }, { "epoch": 0.29857578840284843, "grad_norm": 1.1397591829299927, "learning_rate": 1e-05, "loss": 0.5624, "mean_token_accuracy": 0.8268832564353943, "num_tokens": 93637944.0, "step": 587 }, { "epoch": 0.2990844354018311, "grad_norm": 1.1084394454956055, "learning_rate": 1e-05, "loss": 0.5473, "mean_token_accuracy": 0.8345076441764832, "num_tokens": 93795891.0, "step": 588 }, { "epoch": 0.29959308240081384, "grad_norm": 1.1393333673477173, "learning_rate": 1e-05, "loss": 0.5966, "mean_token_accuracy": 0.8186751008033752, "num_tokens": 93949753.0, "step": 589 }, { "epoch": 0.30010172939979657, "grad_norm": 1.122271180152893, "learning_rate": 1e-05, "loss": 0.5179, "mean_token_accuracy": 0.8422044515609741, "num_tokens": 94099353.0, "step": 590 }, { "epoch": 0.30061037639877924, "grad_norm": 1.3281346559524536, "learning_rate": 1e-05, "loss": 0.6231, "mean_token_accuracy": 0.8126146793365479, "num_tokens": 94262648.0, "step": 591 }, { "epoch": 0.301119023397762, "grad_norm": 1.1627881526947021, "learning_rate": 1e-05, "loss": 0.5631, "mean_token_accuracy": 0.8286106586456299, "num_tokens": 94420660.0, "step": 592 }, { "epoch": 0.30162767039674465, "grad_norm": 1.3525162935256958, "learning_rate": 1e-05, "loss": 0.5532, "mean_token_accuracy": 0.8333991169929504, "num_tokens": 94580578.0, "step": 593 }, { "epoch": 0.3021363173957274, "grad_norm": 1.2686326503753662, "learning_rate": 1e-05, "loss": 0.5606, "mean_token_accuracy": 0.8290703296661377, "num_tokens": 94745810.0, "step": 594 }, { "epoch": 0.30264496439471006, "grad_norm": 1.2042104005813599, "learning_rate": 1e-05, "loss": 0.5721, "mean_token_accuracy": 0.8256835341453552, "num_tokens": 94904762.0, "step": 595 }, { "epoch": 0.3031536113936928, "grad_norm": 1.2832545042037964, "learning_rate": 1e-05, "loss": 0.5525, "mean_token_accuracy": 0.8298637866973877, "num_tokens": 95053065.0, "step": 596 }, { "epoch": 0.30366225839267547, "grad_norm": 1.1183339357376099, "learning_rate": 1e-05, "loss": 0.5705, "mean_token_accuracy": 0.8255320191383362, "num_tokens": 95207324.0, "step": 597 }, { "epoch": 0.3041709053916582, "grad_norm": 1.2548408508300781, "learning_rate": 1e-05, "loss": 0.5414, "mean_token_accuracy": 0.8333613872528076, "num_tokens": 95365564.0, "step": 598 }, { "epoch": 0.3046795523906409, "grad_norm": 1.0983479022979736, "learning_rate": 1e-05, "loss": 0.5569, "mean_token_accuracy": 0.8273598551750183, "num_tokens": 95522523.0, "step": 599 }, { "epoch": 0.3051881993896236, "grad_norm": 1.1975111961364746, "learning_rate": 1e-05, "loss": 0.5299, "mean_token_accuracy": 0.8379498720169067, "num_tokens": 95677617.0, "step": 600 }, { "epoch": 0.3056968463886063, "grad_norm": 1.0161926746368408, "learning_rate": 1e-05, "loss": 0.5625, "mean_token_accuracy": 0.8283061981201172, "num_tokens": 95855477.0, "step": 601 }, { "epoch": 0.306205493387589, "grad_norm": 1.1558902263641357, "learning_rate": 1e-05, "loss": 0.5963, "mean_token_accuracy": 0.8191391229629517, "num_tokens": 96017692.0, "step": 602 }, { "epoch": 0.30671414038657174, "grad_norm": 1.0920082330703735, "learning_rate": 1e-05, "loss": 0.5534, "mean_token_accuracy": 0.8309683799743652, "num_tokens": 96175479.0, "step": 603 }, { "epoch": 0.3072227873855544, "grad_norm": 1.1383585929870605, "learning_rate": 1e-05, "loss": 0.5682, "mean_token_accuracy": 0.8261626958847046, "num_tokens": 96337729.0, "step": 604 }, { "epoch": 0.30773143438453715, "grad_norm": 1.082250952720642, "learning_rate": 1e-05, "loss": 0.5516, "mean_token_accuracy": 0.8316413760185242, "num_tokens": 96492659.0, "step": 605 }, { "epoch": 0.3082400813835198, "grad_norm": 1.1070678234100342, "learning_rate": 1e-05, "loss": 0.5639, "mean_token_accuracy": 0.8267689943313599, "num_tokens": 96644965.0, "step": 606 }, { "epoch": 0.30874872838250256, "grad_norm": 1.134475588798523, "learning_rate": 1e-05, "loss": 0.5262, "mean_token_accuracy": 0.8369567394256592, "num_tokens": 96801670.0, "step": 607 }, { "epoch": 0.30925737538148523, "grad_norm": 1.5056709051132202, "learning_rate": 1e-05, "loss": 0.5581, "mean_token_accuracy": 0.8286861181259155, "num_tokens": 96966380.0, "step": 608 }, { "epoch": 0.30976602238046796, "grad_norm": 1.1796578168869019, "learning_rate": 1e-05, "loss": 0.559, "mean_token_accuracy": 0.8290995955467224, "num_tokens": 97116653.0, "step": 609 }, { "epoch": 0.31027466937945064, "grad_norm": 1.1612471342086792, "learning_rate": 1e-05, "loss": 0.5519, "mean_token_accuracy": 0.8312628269195557, "num_tokens": 97279451.0, "step": 610 }, { "epoch": 0.31078331637843337, "grad_norm": 1.0776804685592651, "learning_rate": 1e-05, "loss": 0.5899, "mean_token_accuracy": 0.8217321634292603, "num_tokens": 97439569.0, "step": 611 }, { "epoch": 0.31129196337741605, "grad_norm": 1.046061396598816, "learning_rate": 1e-05, "loss": 0.5445, "mean_token_accuracy": 0.8333207368850708, "num_tokens": 97595648.0, "step": 612 }, { "epoch": 0.3118006103763988, "grad_norm": 1.1500645875930786, "learning_rate": 1e-05, "loss": 0.5834, "mean_token_accuracy": 0.8228082656860352, "num_tokens": 97754844.0, "step": 613 }, { "epoch": 0.3123092573753815, "grad_norm": 1.0216745138168335, "learning_rate": 1e-05, "loss": 0.5473, "mean_token_accuracy": 0.8335670232772827, "num_tokens": 97916479.0, "step": 614 }, { "epoch": 0.3128179043743642, "grad_norm": 1.2384233474731445, "learning_rate": 1e-05, "loss": 0.5799, "mean_token_accuracy": 0.8242952823638916, "num_tokens": 98083229.0, "step": 615 }, { "epoch": 0.3133265513733469, "grad_norm": 1.1097160577774048, "learning_rate": 1e-05, "loss": 0.5567, "mean_token_accuracy": 0.8286508321762085, "num_tokens": 98234766.0, "step": 616 }, { "epoch": 0.3138351983723296, "grad_norm": 1.1675152778625488, "learning_rate": 1e-05, "loss": 0.5453, "mean_token_accuracy": 0.8325939774513245, "num_tokens": 98401537.0, "step": 617 }, { "epoch": 0.3143438453713123, "grad_norm": 1.1450773477554321, "learning_rate": 1e-05, "loss": 0.5373, "mean_token_accuracy": 0.835297703742981, "num_tokens": 98567715.0, "step": 618 }, { "epoch": 0.314852492370295, "grad_norm": 1.0319682359695435, "learning_rate": 1e-05, "loss": 0.5566, "mean_token_accuracy": 0.8285013437271118, "num_tokens": 98735978.0, "step": 619 }, { "epoch": 0.31536113936927773, "grad_norm": 1.2614619731903076, "learning_rate": 1e-05, "loss": 0.5846, "mean_token_accuracy": 0.8239743113517761, "num_tokens": 98906165.0, "step": 620 }, { "epoch": 0.3158697863682604, "grad_norm": 1.1205494403839111, "learning_rate": 1e-05, "loss": 0.5651, "mean_token_accuracy": 0.8274800777435303, "num_tokens": 99056653.0, "step": 621 }, { "epoch": 0.31637843336724314, "grad_norm": 1.4943937063217163, "learning_rate": 1e-05, "loss": 0.5201, "mean_token_accuracy": 0.8389872312545776, "num_tokens": 99211579.0, "step": 622 }, { "epoch": 0.3168870803662258, "grad_norm": 1.1277450323104858, "learning_rate": 1e-05, "loss": 0.569, "mean_token_accuracy": 0.8268486857414246, "num_tokens": 99361670.0, "step": 623 }, { "epoch": 0.31739572736520855, "grad_norm": 1.0670257806777954, "learning_rate": 1e-05, "loss": 0.585, "mean_token_accuracy": 0.8215529322624207, "num_tokens": 99530349.0, "step": 624 }, { "epoch": 0.3179043743641913, "grad_norm": 1.1288607120513916, "learning_rate": 1e-05, "loss": 0.5425, "mean_token_accuracy": 0.8349924683570862, "num_tokens": 99693143.0, "step": 625 }, { "epoch": 0.31841302136317395, "grad_norm": 1.061599612236023, "learning_rate": 1e-05, "loss": 0.6094, "mean_token_accuracy": 0.8155608177185059, "num_tokens": 99867681.0, "step": 626 }, { "epoch": 0.3189216683621567, "grad_norm": 1.1407296657562256, "learning_rate": 1e-05, "loss": 0.5559, "mean_token_accuracy": 0.8296022415161133, "num_tokens": 100026848.0, "step": 627 }, { "epoch": 0.31943031536113936, "grad_norm": 1.1488444805145264, "learning_rate": 1e-05, "loss": 0.5456, "mean_token_accuracy": 0.8349856734275818, "num_tokens": 100198041.0, "step": 628 }, { "epoch": 0.3199389623601221, "grad_norm": 1.0721195936203003, "learning_rate": 1e-05, "loss": 0.5241, "mean_token_accuracy": 0.8384602069854736, "num_tokens": 100352919.0, "step": 629 }, { "epoch": 0.32044760935910477, "grad_norm": 1.0653283596038818, "learning_rate": 1e-05, "loss": 0.5277, "mean_token_accuracy": 0.838874101638794, "num_tokens": 100509537.0, "step": 630 }, { "epoch": 0.3209562563580875, "grad_norm": 1.0388782024383545, "learning_rate": 1e-05, "loss": 0.5296, "mean_token_accuracy": 0.8368690013885498, "num_tokens": 100663059.0, "step": 631 }, { "epoch": 0.3214649033570702, "grad_norm": 1.297975778579712, "learning_rate": 1e-05, "loss": 0.5831, "mean_token_accuracy": 0.8221128582954407, "num_tokens": 100831452.0, "step": 632 }, { "epoch": 0.3219735503560529, "grad_norm": 1.128312349319458, "learning_rate": 1e-05, "loss": 0.5526, "mean_token_accuracy": 0.832910418510437, "num_tokens": 100984332.0, "step": 633 }, { "epoch": 0.3224821973550356, "grad_norm": 1.1248503923416138, "learning_rate": 1e-05, "loss": 0.5693, "mean_token_accuracy": 0.825761079788208, "num_tokens": 101131963.0, "step": 634 }, { "epoch": 0.3229908443540183, "grad_norm": 1.0807161331176758, "learning_rate": 1e-05, "loss": 0.5484, "mean_token_accuracy": 0.8321346640586853, "num_tokens": 101289236.0, "step": 635 }, { "epoch": 0.323499491353001, "grad_norm": 1.1452345848083496, "learning_rate": 1e-05, "loss": 0.5503, "mean_token_accuracy": 0.8327028751373291, "num_tokens": 101448301.0, "step": 636 }, { "epoch": 0.3240081383519837, "grad_norm": 1.174797534942627, "learning_rate": 1e-05, "loss": 0.5562, "mean_token_accuracy": 0.8287744522094727, "num_tokens": 101601568.0, "step": 637 }, { "epoch": 0.32451678535096645, "grad_norm": 1.191448450088501, "learning_rate": 1e-05, "loss": 0.5549, "mean_token_accuracy": 0.8289895057678223, "num_tokens": 101750506.0, "step": 638 }, { "epoch": 0.32502543234994913, "grad_norm": 1.159472107887268, "learning_rate": 1e-05, "loss": 0.5711, "mean_token_accuracy": 0.8275212645530701, "num_tokens": 101915028.0, "step": 639 }, { "epoch": 0.32553407934893186, "grad_norm": 1.1111050844192505, "learning_rate": 1e-05, "loss": 0.5638, "mean_token_accuracy": 0.8270004391670227, "num_tokens": 102079659.0, "step": 640 }, { "epoch": 0.32604272634791454, "grad_norm": 1.036197304725647, "learning_rate": 1e-05, "loss": 0.532, "mean_token_accuracy": 0.8362898230552673, "num_tokens": 102247764.0, "step": 641 }, { "epoch": 0.32655137334689727, "grad_norm": 1.1457844972610474, "learning_rate": 1e-05, "loss": 0.5326, "mean_token_accuracy": 0.8351918458938599, "num_tokens": 102405734.0, "step": 642 }, { "epoch": 0.32706002034587994, "grad_norm": 1.09752357006073, "learning_rate": 1e-05, "loss": 0.552, "mean_token_accuracy": 0.8294593095779419, "num_tokens": 102566525.0, "step": 643 }, { "epoch": 0.3275686673448627, "grad_norm": 1.045760989189148, "learning_rate": 1e-05, "loss": 0.5984, "mean_token_accuracy": 0.8189845085144043, "num_tokens": 102739870.0, "step": 644 }, { "epoch": 0.32807731434384535, "grad_norm": 1.205141305923462, "learning_rate": 1e-05, "loss": 0.5725, "mean_token_accuracy": 0.8254177570343018, "num_tokens": 102895661.0, "step": 645 }, { "epoch": 0.3285859613428281, "grad_norm": 1.1817325353622437, "learning_rate": 1e-05, "loss": 0.5442, "mean_token_accuracy": 0.8346107006072998, "num_tokens": 103055954.0, "step": 646 }, { "epoch": 0.32909460834181076, "grad_norm": 1.075392246246338, "learning_rate": 1e-05, "loss": 0.5386, "mean_token_accuracy": 0.8339201211929321, "num_tokens": 103214101.0, "step": 647 }, { "epoch": 0.3296032553407935, "grad_norm": 1.1824406385421753, "learning_rate": 1e-05, "loss": 0.5771, "mean_token_accuracy": 0.8253869414329529, "num_tokens": 103373341.0, "step": 648 }, { "epoch": 0.3301119023397762, "grad_norm": 1.0850316286087036, "learning_rate": 1e-05, "loss": 0.5402, "mean_token_accuracy": 0.833788275718689, "num_tokens": 103546489.0, "step": 649 }, { "epoch": 0.3306205493387589, "grad_norm": 1.1251380443572998, "learning_rate": 1e-05, "loss": 0.5415, "mean_token_accuracy": 0.8325050473213196, "num_tokens": 103709672.0, "step": 650 }, { "epoch": 0.3311291963377416, "grad_norm": 1.1506757736206055, "learning_rate": 1e-05, "loss": 0.5272, "mean_token_accuracy": 0.8368813991546631, "num_tokens": 103875427.0, "step": 651 }, { "epoch": 0.3316378433367243, "grad_norm": 1.1591253280639648, "learning_rate": 1e-05, "loss": 0.5508, "mean_token_accuracy": 0.83094322681427, "num_tokens": 104027661.0, "step": 652 }, { "epoch": 0.33214649033570703, "grad_norm": 1.2312512397766113, "learning_rate": 1e-05, "loss": 0.5871, "mean_token_accuracy": 0.8218927979469299, "num_tokens": 104180735.0, "step": 653 }, { "epoch": 0.3326551373346897, "grad_norm": 1.1636476516723633, "learning_rate": 1e-05, "loss": 0.5472, "mean_token_accuracy": 0.83375084400177, "num_tokens": 104353014.0, "step": 654 }, { "epoch": 0.33316378433367244, "grad_norm": 1.1845154762268066, "learning_rate": 1e-05, "loss": 0.5575, "mean_token_accuracy": 0.8296653032302856, "num_tokens": 104517143.0, "step": 655 }, { "epoch": 0.3336724313326551, "grad_norm": 1.076217532157898, "learning_rate": 1e-05, "loss": 0.5901, "mean_token_accuracy": 0.8188375234603882, "num_tokens": 104669203.0, "step": 656 }, { "epoch": 0.33418107833163785, "grad_norm": 1.3136963844299316, "learning_rate": 1e-05, "loss": 0.5769, "mean_token_accuracy": 0.8237686157226562, "num_tokens": 104810466.0, "step": 657 }, { "epoch": 0.3346897253306205, "grad_norm": 1.0738552808761597, "learning_rate": 1e-05, "loss": 0.5778, "mean_token_accuracy": 0.8232786059379578, "num_tokens": 104980367.0, "step": 658 }, { "epoch": 0.33519837232960326, "grad_norm": 1.3627864122390747, "learning_rate": 1e-05, "loss": 0.5358, "mean_token_accuracy": 0.8357287645339966, "num_tokens": 105131160.0, "step": 659 }, { "epoch": 0.335707019328586, "grad_norm": 1.1652209758758545, "learning_rate": 1e-05, "loss": 0.5907, "mean_token_accuracy": 0.8218494653701782, "num_tokens": 105298008.0, "step": 660 }, { "epoch": 0.33621566632756866, "grad_norm": 1.2509148120880127, "learning_rate": 1e-05, "loss": 0.5953, "mean_token_accuracy": 0.8190317153930664, "num_tokens": 105452271.0, "step": 661 }, { "epoch": 0.3367243133265514, "grad_norm": 1.1237406730651855, "learning_rate": 1e-05, "loss": 0.5307, "mean_token_accuracy": 0.8366305232048035, "num_tokens": 105619978.0, "step": 662 }, { "epoch": 0.33723296032553407, "grad_norm": 1.064732313156128, "learning_rate": 1e-05, "loss": 0.543, "mean_token_accuracy": 0.8332208395004272, "num_tokens": 105786355.0, "step": 663 }, { "epoch": 0.3377416073245168, "grad_norm": 1.2354017496109009, "learning_rate": 1e-05, "loss": 0.5806, "mean_token_accuracy": 0.8233789801597595, "num_tokens": 105942503.0, "step": 664 }, { "epoch": 0.3382502543234995, "grad_norm": 1.099792242050171, "learning_rate": 1e-05, "loss": 0.5569, "mean_token_accuracy": 0.8300788998603821, "num_tokens": 106088220.0, "step": 665 }, { "epoch": 0.3387589013224822, "grad_norm": 1.207027792930603, "learning_rate": 1e-05, "loss": 0.5179, "mean_token_accuracy": 0.8397770524024963, "num_tokens": 106246812.0, "step": 666 }, { "epoch": 0.3392675483214649, "grad_norm": 1.0667011737823486, "learning_rate": 1e-05, "loss": 0.5166, "mean_token_accuracy": 0.8391870260238647, "num_tokens": 106405912.0, "step": 667 }, { "epoch": 0.3397761953204476, "grad_norm": 1.1162816286087036, "learning_rate": 1e-05, "loss": 0.6033, "mean_token_accuracy": 0.816855788230896, "num_tokens": 106556412.0, "step": 668 }, { "epoch": 0.3402848423194303, "grad_norm": 1.1591635942459106, "learning_rate": 1e-05, "loss": 0.5731, "mean_token_accuracy": 0.8236411809921265, "num_tokens": 106712640.0, "step": 669 }, { "epoch": 0.340793489318413, "grad_norm": 1.0984569787979126, "learning_rate": 1e-05, "loss": 0.562, "mean_token_accuracy": 0.8283123970031738, "num_tokens": 106869710.0, "step": 670 }, { "epoch": 0.34130213631739575, "grad_norm": 1.0950126647949219, "learning_rate": 1e-05, "loss": 0.5342, "mean_token_accuracy": 0.8369549512863159, "num_tokens": 107034402.0, "step": 671 }, { "epoch": 0.34181078331637843, "grad_norm": 1.1208062171936035, "learning_rate": 1e-05, "loss": 0.5633, "mean_token_accuracy": 0.8275705575942993, "num_tokens": 107197492.0, "step": 672 }, { "epoch": 0.34231943031536116, "grad_norm": 1.1074978113174438, "learning_rate": 1e-05, "loss": 0.5457, "mean_token_accuracy": 0.8320577144622803, "num_tokens": 107355096.0, "step": 673 }, { "epoch": 0.34282807731434384, "grad_norm": 1.1080939769744873, "learning_rate": 1e-05, "loss": 0.5437, "mean_token_accuracy": 0.8340401649475098, "num_tokens": 107520176.0, "step": 674 }, { "epoch": 0.34333672431332657, "grad_norm": 1.0805479288101196, "learning_rate": 1e-05, "loss": 0.5333, "mean_token_accuracy": 0.8356107473373413, "num_tokens": 107676895.0, "step": 675 }, { "epoch": 0.34384537131230924, "grad_norm": 1.1147992610931396, "learning_rate": 1e-05, "loss": 0.5503, "mean_token_accuracy": 0.831654965877533, "num_tokens": 107829848.0, "step": 676 }, { "epoch": 0.344354018311292, "grad_norm": 1.0968836545944214, "learning_rate": 1e-05, "loss": 0.5843, "mean_token_accuracy": 0.8204114437103271, "num_tokens": 108003143.0, "step": 677 }, { "epoch": 0.34486266531027465, "grad_norm": 1.1353141069412231, "learning_rate": 1e-05, "loss": 0.57, "mean_token_accuracy": 0.8269537687301636, "num_tokens": 108169162.0, "step": 678 }, { "epoch": 0.3453713123092574, "grad_norm": 1.1155294179916382, "learning_rate": 1e-05, "loss": 0.5328, "mean_token_accuracy": 0.8368821144104004, "num_tokens": 108326285.0, "step": 679 }, { "epoch": 0.34587995930824006, "grad_norm": 1.108866810798645, "learning_rate": 1e-05, "loss": 0.5324, "mean_token_accuracy": 0.8361458778381348, "num_tokens": 108481647.0, "step": 680 }, { "epoch": 0.3463886063072228, "grad_norm": 1.1015127897262573, "learning_rate": 1e-05, "loss": 0.5403, "mean_token_accuracy": 0.8353789448738098, "num_tokens": 108632219.0, "step": 681 }, { "epoch": 0.34689725330620547, "grad_norm": 1.1202666759490967, "learning_rate": 1e-05, "loss": 0.555, "mean_token_accuracy": 0.8291305303573608, "num_tokens": 108795594.0, "step": 682 }, { "epoch": 0.3474059003051882, "grad_norm": 1.0737191438674927, "learning_rate": 1e-05, "loss": 0.5419, "mean_token_accuracy": 0.8332039713859558, "num_tokens": 108947589.0, "step": 683 }, { "epoch": 0.34791454730417093, "grad_norm": 1.0999579429626465, "learning_rate": 1e-05, "loss": 0.5561, "mean_token_accuracy": 0.8297255635261536, "num_tokens": 109100848.0, "step": 684 }, { "epoch": 0.3484231943031536, "grad_norm": 1.2430332899093628, "learning_rate": 1e-05, "loss": 0.5895, "mean_token_accuracy": 0.8223748803138733, "num_tokens": 109268616.0, "step": 685 }, { "epoch": 0.34893184130213634, "grad_norm": 0.9805262088775635, "learning_rate": 1e-05, "loss": 0.5276, "mean_token_accuracy": 0.8367460370063782, "num_tokens": 109425497.0, "step": 686 }, { "epoch": 0.349440488301119, "grad_norm": 1.2571215629577637, "learning_rate": 1e-05, "loss": 0.5481, "mean_token_accuracy": 0.8310940265655518, "num_tokens": 109596743.0, "step": 687 }, { "epoch": 0.34994913530010174, "grad_norm": 1.0542333126068115, "learning_rate": 1e-05, "loss": 0.5245, "mean_token_accuracy": 0.838855504989624, "num_tokens": 109752861.0, "step": 688 }, { "epoch": 0.3504577822990844, "grad_norm": 1.0893378257751465, "learning_rate": 1e-05, "loss": 0.5275, "mean_token_accuracy": 0.8377349972724915, "num_tokens": 109912156.0, "step": 689 }, { "epoch": 0.35096642929806715, "grad_norm": 1.232289433479309, "learning_rate": 1e-05, "loss": 0.5344, "mean_token_accuracy": 0.8353825807571411, "num_tokens": 110075837.0, "step": 690 }, { "epoch": 0.3514750762970498, "grad_norm": 1.110674262046814, "learning_rate": 1e-05, "loss": 0.5713, "mean_token_accuracy": 0.8261686563491821, "num_tokens": 110230103.0, "step": 691 }, { "epoch": 0.35198372329603256, "grad_norm": 1.09687340259552, "learning_rate": 1e-05, "loss": 0.5476, "mean_token_accuracy": 0.8328587412834167, "num_tokens": 110388178.0, "step": 692 }, { "epoch": 0.35249237029501523, "grad_norm": 1.2102097272872925, "learning_rate": 1e-05, "loss": 0.5596, "mean_token_accuracy": 0.828652560710907, "num_tokens": 110556832.0, "step": 693 }, { "epoch": 0.35300101729399797, "grad_norm": 1.1342049837112427, "learning_rate": 1e-05, "loss": 0.565, "mean_token_accuracy": 0.8272509574890137, "num_tokens": 110712545.0, "step": 694 }, { "epoch": 0.3535096642929807, "grad_norm": 1.196113109588623, "learning_rate": 1e-05, "loss": 0.5294, "mean_token_accuracy": 0.8361930847167969, "num_tokens": 110855683.0, "step": 695 }, { "epoch": 0.3540183112919634, "grad_norm": 1.3626251220703125, "learning_rate": 1e-05, "loss": 0.5564, "mean_token_accuracy": 0.8290280103683472, "num_tokens": 111014148.0, "step": 696 }, { "epoch": 0.3545269582909461, "grad_norm": 1.1502712965011597, "learning_rate": 1e-05, "loss": 0.5533, "mean_token_accuracy": 0.8318491578102112, "num_tokens": 111168794.0, "step": 697 }, { "epoch": 0.3550356052899288, "grad_norm": 1.1832911968231201, "learning_rate": 1e-05, "loss": 0.5575, "mean_token_accuracy": 0.8285268545150757, "num_tokens": 111318802.0, "step": 698 }, { "epoch": 0.3555442522889115, "grad_norm": 1.1197277307510376, "learning_rate": 1e-05, "loss": 0.5411, "mean_token_accuracy": 0.8349237442016602, "num_tokens": 111482438.0, "step": 699 }, { "epoch": 0.3560528992878942, "grad_norm": 1.0746982097625732, "learning_rate": 1e-05, "loss": 0.5548, "mean_token_accuracy": 0.8285970091819763, "num_tokens": 111631108.0, "step": 700 }, { "epoch": 0.3565615462868769, "grad_norm": 1.0557246208190918, "learning_rate": 1e-05, "loss": 0.5652, "mean_token_accuracy": 0.827860951423645, "num_tokens": 111792659.0, "step": 701 }, { "epoch": 0.3570701932858596, "grad_norm": 1.2453454732894897, "learning_rate": 1e-05, "loss": 0.5597, "mean_token_accuracy": 0.8288803100585938, "num_tokens": 111956492.0, "step": 702 }, { "epoch": 0.3575788402848423, "grad_norm": 1.0866833925247192, "learning_rate": 1e-05, "loss": 0.5545, "mean_token_accuracy": 0.8305703401565552, "num_tokens": 112111925.0, "step": 703 }, { "epoch": 0.358087487283825, "grad_norm": 1.1169756650924683, "learning_rate": 1e-05, "loss": 0.5572, "mean_token_accuracy": 0.8297122120857239, "num_tokens": 112271716.0, "step": 704 }, { "epoch": 0.35859613428280773, "grad_norm": 1.1024497747421265, "learning_rate": 1e-05, "loss": 0.5255, "mean_token_accuracy": 0.8370829820632935, "num_tokens": 112433508.0, "step": 705 }, { "epoch": 0.35910478128179046, "grad_norm": 1.0677741765975952, "learning_rate": 1e-05, "loss": 0.5224, "mean_token_accuracy": 0.8398569226264954, "num_tokens": 112595775.0, "step": 706 }, { "epoch": 0.35961342828077314, "grad_norm": 1.2112849950790405, "learning_rate": 1e-05, "loss": 0.5341, "mean_token_accuracy": 0.8345004916191101, "num_tokens": 112747723.0, "step": 707 }, { "epoch": 0.36012207527975587, "grad_norm": 1.1175358295440674, "learning_rate": 1e-05, "loss": 0.5488, "mean_token_accuracy": 0.8320543766021729, "num_tokens": 112906425.0, "step": 708 }, { "epoch": 0.36063072227873855, "grad_norm": 1.7222286462783813, "learning_rate": 1e-05, "loss": 0.5376, "mean_token_accuracy": 0.8360615372657776, "num_tokens": 113062355.0, "step": 709 }, { "epoch": 0.3611393692777213, "grad_norm": 1.2156181335449219, "learning_rate": 1e-05, "loss": 0.57, "mean_token_accuracy": 0.8246958255767822, "num_tokens": 113237249.0, "step": 710 }, { "epoch": 0.36164801627670395, "grad_norm": 1.1710059642791748, "learning_rate": 1e-05, "loss": 0.5676, "mean_token_accuracy": 0.8252333402633667, "num_tokens": 113399443.0, "step": 711 }, { "epoch": 0.3621566632756867, "grad_norm": 1.1496976613998413, "learning_rate": 1e-05, "loss": 0.5254, "mean_token_accuracy": 0.8372830152511597, "num_tokens": 113555518.0, "step": 712 }, { "epoch": 0.36266531027466936, "grad_norm": 1.176712989807129, "learning_rate": 1e-05, "loss": 0.5565, "mean_token_accuracy": 0.8295558094978333, "num_tokens": 113698492.0, "step": 713 }, { "epoch": 0.3631739572736521, "grad_norm": 1.0876966714859009, "learning_rate": 1e-05, "loss": 0.6004, "mean_token_accuracy": 0.8183210492134094, "num_tokens": 113869795.0, "step": 714 }, { "epoch": 0.36368260427263477, "grad_norm": 1.1487334966659546, "learning_rate": 1e-05, "loss": 0.5297, "mean_token_accuracy": 0.8361709117889404, "num_tokens": 114035400.0, "step": 715 }, { "epoch": 0.3641912512716175, "grad_norm": 1.0755226612091064, "learning_rate": 1e-05, "loss": 0.5701, "mean_token_accuracy": 0.8265610933303833, "num_tokens": 114180922.0, "step": 716 }, { "epoch": 0.3646998982706002, "grad_norm": 1.1360986232757568, "learning_rate": 1e-05, "loss": 0.5439, "mean_token_accuracy": 0.8341394066810608, "num_tokens": 114351814.0, "step": 717 }, { "epoch": 0.3652085452695829, "grad_norm": 1.1471298933029175, "learning_rate": 1e-05, "loss": 0.5271, "mean_token_accuracy": 0.8365101218223572, "num_tokens": 114503125.0, "step": 718 }, { "epoch": 0.36571719226856564, "grad_norm": 1.0503714084625244, "learning_rate": 1e-05, "loss": 0.5671, "mean_token_accuracy": 0.8277009725570679, "num_tokens": 114658004.0, "step": 719 }, { "epoch": 0.3662258392675483, "grad_norm": 1.220484972000122, "learning_rate": 1e-05, "loss": 0.5817, "mean_token_accuracy": 0.8230093717575073, "num_tokens": 114829230.0, "step": 720 }, { "epoch": 0.36673448626653105, "grad_norm": 1.265468716621399, "learning_rate": 1e-05, "loss": 0.5498, "mean_token_accuracy": 0.8305054903030396, "num_tokens": 114987847.0, "step": 721 }, { "epoch": 0.3672431332655137, "grad_norm": 1.0840364694595337, "learning_rate": 1e-05, "loss": 0.538, "mean_token_accuracy": 0.8362969756126404, "num_tokens": 115154558.0, "step": 722 }, { "epoch": 0.36775178026449645, "grad_norm": 1.1251972913742065, "learning_rate": 1e-05, "loss": 0.5431, "mean_token_accuracy": 0.8324218988418579, "num_tokens": 115315702.0, "step": 723 }, { "epoch": 0.36826042726347913, "grad_norm": 1.1791112422943115, "learning_rate": 1e-05, "loss": 0.5428, "mean_token_accuracy": 0.8332595825195312, "num_tokens": 115471150.0, "step": 724 }, { "epoch": 0.36876907426246186, "grad_norm": 1.0629161596298218, "learning_rate": 1e-05, "loss": 0.5626, "mean_token_accuracy": 0.8278679251670837, "num_tokens": 115630359.0, "step": 725 }, { "epoch": 0.36927772126144454, "grad_norm": 1.0933114290237427, "learning_rate": 1e-05, "loss": 0.5617, "mean_token_accuracy": 0.8298508524894714, "num_tokens": 115802068.0, "step": 726 }, { "epoch": 0.36978636826042727, "grad_norm": 1.0984097719192505, "learning_rate": 1e-05, "loss": 0.5362, "mean_token_accuracy": 0.8353390693664551, "num_tokens": 115954597.0, "step": 727 }, { "epoch": 0.37029501525940994, "grad_norm": 1.1613320112228394, "learning_rate": 1e-05, "loss": 0.5752, "mean_token_accuracy": 0.824324369430542, "num_tokens": 116121996.0, "step": 728 }, { "epoch": 0.3708036622583927, "grad_norm": 1.0691723823547363, "learning_rate": 1e-05, "loss": 0.5554, "mean_token_accuracy": 0.8294112086296082, "num_tokens": 116286650.0, "step": 729 }, { "epoch": 0.3713123092573754, "grad_norm": 1.1356724500656128, "learning_rate": 1e-05, "loss": 0.5367, "mean_token_accuracy": 0.8361466526985168, "num_tokens": 116456648.0, "step": 730 }, { "epoch": 0.3718209562563581, "grad_norm": 1.0564593076705933, "learning_rate": 1e-05, "loss": 0.5391, "mean_token_accuracy": 0.8352751135826111, "num_tokens": 116621931.0, "step": 731 }, { "epoch": 0.3723296032553408, "grad_norm": 1.1024948358535767, "learning_rate": 1e-05, "loss": 0.5992, "mean_token_accuracy": 0.8183151483535767, "num_tokens": 116787232.0, "step": 732 }, { "epoch": 0.3728382502543235, "grad_norm": 1.2190114259719849, "learning_rate": 1e-05, "loss": 0.6021, "mean_token_accuracy": 0.8169412016868591, "num_tokens": 116949019.0, "step": 733 }, { "epoch": 0.3733468972533062, "grad_norm": 1.0475753545761108, "learning_rate": 1e-05, "loss": 0.5763, "mean_token_accuracy": 0.8254858255386353, "num_tokens": 117122455.0, "step": 734 }, { "epoch": 0.3738555442522889, "grad_norm": 1.2131965160369873, "learning_rate": 1e-05, "loss": 0.5491, "mean_token_accuracy": 0.8324887752532959, "num_tokens": 117267738.0, "step": 735 }, { "epoch": 0.3743641912512716, "grad_norm": 1.0916552543640137, "learning_rate": 1e-05, "loss": 0.5512, "mean_token_accuracy": 0.8287943601608276, "num_tokens": 117430802.0, "step": 736 }, { "epoch": 0.3748728382502543, "grad_norm": 1.0967727899551392, "learning_rate": 1e-05, "loss": 0.5625, "mean_token_accuracy": 0.826190710067749, "num_tokens": 117587032.0, "step": 737 }, { "epoch": 0.37538148524923703, "grad_norm": 1.0747716426849365, "learning_rate": 1e-05, "loss": 0.573, "mean_token_accuracy": 0.8252145051956177, "num_tokens": 117747701.0, "step": 738 }, { "epoch": 0.3758901322482197, "grad_norm": 1.0945491790771484, "learning_rate": 1e-05, "loss": 0.5474, "mean_token_accuracy": 0.8329147100448608, "num_tokens": 117912178.0, "step": 739 }, { "epoch": 0.37639877924720244, "grad_norm": 1.1760427951812744, "learning_rate": 1e-05, "loss": 0.5195, "mean_token_accuracy": 0.8384370803833008, "num_tokens": 118069631.0, "step": 740 }, { "epoch": 0.3769074262461852, "grad_norm": 1.1599196195602417, "learning_rate": 1e-05, "loss": 0.5332, "mean_token_accuracy": 0.8354936838150024, "num_tokens": 118216192.0, "step": 741 }, { "epoch": 0.37741607324516785, "grad_norm": 1.2120177745819092, "learning_rate": 1e-05, "loss": 0.5334, "mean_token_accuracy": 0.8361068964004517, "num_tokens": 118364107.0, "step": 742 }, { "epoch": 0.3779247202441506, "grad_norm": 1.112884521484375, "learning_rate": 1e-05, "loss": 0.5787, "mean_token_accuracy": 0.8213391900062561, "num_tokens": 118532657.0, "step": 743 }, { "epoch": 0.37843336724313326, "grad_norm": 1.1208746433258057, "learning_rate": 1e-05, "loss": 0.5405, "mean_token_accuracy": 0.8333334922790527, "num_tokens": 118689942.0, "step": 744 }, { "epoch": 0.378942014242116, "grad_norm": 1.233646035194397, "learning_rate": 1e-05, "loss": 0.5404, "mean_token_accuracy": 0.8324503898620605, "num_tokens": 118861717.0, "step": 745 }, { "epoch": 0.37945066124109866, "grad_norm": 1.1507185697555542, "learning_rate": 1e-05, "loss": 0.5545, "mean_token_accuracy": 0.8297289609909058, "num_tokens": 119027905.0, "step": 746 }, { "epoch": 0.3799593082400814, "grad_norm": 1.1578809022903442, "learning_rate": 1e-05, "loss": 0.4967, "mean_token_accuracy": 0.8448212742805481, "num_tokens": 119173593.0, "step": 747 }, { "epoch": 0.38046795523906407, "grad_norm": 1.2863305807113647, "learning_rate": 1e-05, "loss": 0.5918, "mean_token_accuracy": 0.8200984001159668, "num_tokens": 119333549.0, "step": 748 }, { "epoch": 0.3809766022380468, "grad_norm": 1.2527709007263184, "learning_rate": 1e-05, "loss": 0.5377, "mean_token_accuracy": 0.8346661925315857, "num_tokens": 119485196.0, "step": 749 }, { "epoch": 0.3814852492370295, "grad_norm": 1.1452044248580933, "learning_rate": 1e-05, "loss": 0.5355, "mean_token_accuracy": 0.8352641463279724, "num_tokens": 119635566.0, "step": 750 }, { "epoch": 0.3819938962360122, "grad_norm": 1.3274515867233276, "learning_rate": 1e-05, "loss": 0.5437, "mean_token_accuracy": 0.8326550722122192, "num_tokens": 119796534.0, "step": 751 }, { "epoch": 0.38250254323499494, "grad_norm": 1.1830132007598877, "learning_rate": 1e-05, "loss": 0.5864, "mean_token_accuracy": 0.8197687864303589, "num_tokens": 119965071.0, "step": 752 }, { "epoch": 0.3830111902339776, "grad_norm": 1.2463164329528809, "learning_rate": 1e-05, "loss": 0.5297, "mean_token_accuracy": 0.8368417024612427, "num_tokens": 120116111.0, "step": 753 }, { "epoch": 0.38351983723296035, "grad_norm": 1.2696422338485718, "learning_rate": 1e-05, "loss": 0.5701, "mean_token_accuracy": 0.8251786231994629, "num_tokens": 120275408.0, "step": 754 }, { "epoch": 0.384028484231943, "grad_norm": 1.212146520614624, "learning_rate": 1e-05, "loss": 0.5463, "mean_token_accuracy": 0.8328184485435486, "num_tokens": 120432203.0, "step": 755 }, { "epoch": 0.38453713123092575, "grad_norm": 1.281558632850647, "learning_rate": 1e-05, "loss": 0.583, "mean_token_accuracy": 0.8236889839172363, "num_tokens": 120600556.0, "step": 756 }, { "epoch": 0.38504577822990843, "grad_norm": 1.1393814086914062, "learning_rate": 1e-05, "loss": 0.5501, "mean_token_accuracy": 0.8314993977546692, "num_tokens": 120758883.0, "step": 757 }, { "epoch": 0.38555442522889116, "grad_norm": 1.2054526805877686, "learning_rate": 1e-05, "loss": 0.534, "mean_token_accuracy": 0.8376193046569824, "num_tokens": 120922100.0, "step": 758 }, { "epoch": 0.38606307222787384, "grad_norm": 1.3072320222854614, "learning_rate": 1e-05, "loss": 0.5511, "mean_token_accuracy": 0.8303576111793518, "num_tokens": 121075897.0, "step": 759 }, { "epoch": 0.38657171922685657, "grad_norm": 1.3093321323394775, "learning_rate": 1e-05, "loss": 0.5366, "mean_token_accuracy": 0.8357855677604675, "num_tokens": 121235661.0, "step": 760 }, { "epoch": 0.38708036622583925, "grad_norm": 1.2017589807510376, "learning_rate": 1e-05, "loss": 0.5646, "mean_token_accuracy": 0.8272026777267456, "num_tokens": 121393640.0, "step": 761 }, { "epoch": 0.387589013224822, "grad_norm": 1.2778068780899048, "learning_rate": 1e-05, "loss": 0.5603, "mean_token_accuracy": 0.8284906148910522, "num_tokens": 121539889.0, "step": 762 }, { "epoch": 0.38809766022380465, "grad_norm": 1.272511601448059, "learning_rate": 1e-05, "loss": 0.5434, "mean_token_accuracy": 0.832564115524292, "num_tokens": 121699739.0, "step": 763 }, { "epoch": 0.3886063072227874, "grad_norm": 1.189244031906128, "learning_rate": 1e-05, "loss": 0.5028, "mean_token_accuracy": 0.8430757522583008, "num_tokens": 121844594.0, "step": 764 }, { "epoch": 0.3891149542217701, "grad_norm": 1.1704158782958984, "learning_rate": 1e-05, "loss": 0.5594, "mean_token_accuracy": 0.8299344778060913, "num_tokens": 122018838.0, "step": 765 }, { "epoch": 0.3896236012207528, "grad_norm": 1.1334116458892822, "learning_rate": 1e-05, "loss": 0.5787, "mean_token_accuracy": 0.8251534104347229, "num_tokens": 122181626.0, "step": 766 }, { "epoch": 0.3901322482197355, "grad_norm": 1.182862401008606, "learning_rate": 1e-05, "loss": 0.5428, "mean_token_accuracy": 0.8328073024749756, "num_tokens": 122336319.0, "step": 767 }, { "epoch": 0.3906408952187182, "grad_norm": 1.194340705871582, "learning_rate": 1e-05, "loss": 0.5122, "mean_token_accuracy": 0.8423342704772949, "num_tokens": 122486556.0, "step": 768 }, { "epoch": 0.39114954221770093, "grad_norm": 1.1797587871551514, "learning_rate": 1e-05, "loss": 0.556, "mean_token_accuracy": 0.8285393714904785, "num_tokens": 122643871.0, "step": 769 }, { "epoch": 0.3916581892166836, "grad_norm": 1.1087149381637573, "learning_rate": 1e-05, "loss": 0.5503, "mean_token_accuracy": 0.8313184976577759, "num_tokens": 122806517.0, "step": 770 }, { "epoch": 0.39216683621566634, "grad_norm": 1.30690598487854, "learning_rate": 1e-05, "loss": 0.5464, "mean_token_accuracy": 0.8317302465438843, "num_tokens": 122963330.0, "step": 771 }, { "epoch": 0.392675483214649, "grad_norm": 1.1016590595245361, "learning_rate": 1e-05, "loss": 0.5561, "mean_token_accuracy": 0.828952431678772, "num_tokens": 123130884.0, "step": 772 }, { "epoch": 0.39318413021363174, "grad_norm": 1.0075249671936035, "learning_rate": 1e-05, "loss": 0.5394, "mean_token_accuracy": 0.8364933729171753, "num_tokens": 123291806.0, "step": 773 }, { "epoch": 0.3936927772126144, "grad_norm": 1.066081166267395, "learning_rate": 1e-05, "loss": 0.4964, "mean_token_accuracy": 0.8464153409004211, "num_tokens": 123442859.0, "step": 774 }, { "epoch": 0.39420142421159715, "grad_norm": 1.0588123798370361, "learning_rate": 1e-05, "loss": 0.54, "mean_token_accuracy": 0.8324978947639465, "num_tokens": 123592681.0, "step": 775 }, { "epoch": 0.3947100712105799, "grad_norm": 1.2031190395355225, "learning_rate": 1e-05, "loss": 0.5249, "mean_token_accuracy": 0.8368514180183411, "num_tokens": 123764752.0, "step": 776 }, { "epoch": 0.39521871820956256, "grad_norm": 1.0871250629425049, "learning_rate": 1e-05, "loss": 0.5575, "mean_token_accuracy": 0.8293015360832214, "num_tokens": 123922538.0, "step": 777 }, { "epoch": 0.3957273652085453, "grad_norm": 1.3108958005905151, "learning_rate": 1e-05, "loss": 0.5285, "mean_token_accuracy": 0.8365644812583923, "num_tokens": 124083040.0, "step": 778 }, { "epoch": 0.39623601220752797, "grad_norm": 1.2524911165237427, "learning_rate": 1e-05, "loss": 0.5476, "mean_token_accuracy": 0.8321986198425293, "num_tokens": 124239220.0, "step": 779 }, { "epoch": 0.3967446592065107, "grad_norm": 1.0792248249053955, "learning_rate": 1e-05, "loss": 0.545, "mean_token_accuracy": 0.8336870670318604, "num_tokens": 124398859.0, "step": 780 }, { "epoch": 0.3972533062054934, "grad_norm": 1.1411080360412598, "learning_rate": 1e-05, "loss": 0.5672, "mean_token_accuracy": 0.8271961212158203, "num_tokens": 124549761.0, "step": 781 }, { "epoch": 0.3977619532044761, "grad_norm": 1.1141024827957153, "learning_rate": 1e-05, "loss": 0.5719, "mean_token_accuracy": 0.8256720304489136, "num_tokens": 124705300.0, "step": 782 }, { "epoch": 0.3982706002034588, "grad_norm": 1.1549406051635742, "learning_rate": 1e-05, "loss": 0.5497, "mean_token_accuracy": 0.830722451210022, "num_tokens": 124863824.0, "step": 783 }, { "epoch": 0.3987792472024415, "grad_norm": 1.238250494003296, "learning_rate": 1e-05, "loss": 0.5769, "mean_token_accuracy": 0.8239268660545349, "num_tokens": 125017794.0, "step": 784 }, { "epoch": 0.3992878942014242, "grad_norm": 1.1919786930084229, "learning_rate": 1e-05, "loss": 0.5661, "mean_token_accuracy": 0.829487144947052, "num_tokens": 125178609.0, "step": 785 }, { "epoch": 0.3997965412004069, "grad_norm": 1.2083204984664917, "learning_rate": 1e-05, "loss": 0.5622, "mean_token_accuracy": 0.8262380957603455, "num_tokens": 125331610.0, "step": 786 }, { "epoch": 0.40030518819938965, "grad_norm": 1.1720713376998901, "learning_rate": 1e-05, "loss": 0.519, "mean_token_accuracy": 0.8383011817932129, "num_tokens": 125481250.0, "step": 787 }, { "epoch": 0.4008138351983723, "grad_norm": 1.2033262252807617, "learning_rate": 1e-05, "loss": 0.5629, "mean_token_accuracy": 0.8281571865081787, "num_tokens": 125650464.0, "step": 788 }, { "epoch": 0.40132248219735506, "grad_norm": 1.2404285669326782, "learning_rate": 1e-05, "loss": 0.5386, "mean_token_accuracy": 0.8318977355957031, "num_tokens": 125817057.0, "step": 789 }, { "epoch": 0.40183112919633773, "grad_norm": 1.0183967351913452, "learning_rate": 1e-05, "loss": 0.5271, "mean_token_accuracy": 0.8368712663650513, "num_tokens": 125964354.0, "step": 790 }, { "epoch": 0.40233977619532046, "grad_norm": 1.081229567527771, "learning_rate": 1e-05, "loss": 0.5242, "mean_token_accuracy": 0.8378585577011108, "num_tokens": 126119274.0, "step": 791 }, { "epoch": 0.40284842319430314, "grad_norm": 1.2605633735656738, "learning_rate": 1e-05, "loss": 0.5467, "mean_token_accuracy": 0.8325355052947998, "num_tokens": 126266332.0, "step": 792 }, { "epoch": 0.40335707019328587, "grad_norm": 1.0738158226013184, "learning_rate": 1e-05, "loss": 0.5535, "mean_token_accuracy": 0.8316068053245544, "num_tokens": 126416815.0, "step": 793 }, { "epoch": 0.40386571719226855, "grad_norm": 1.109156847000122, "learning_rate": 1e-05, "loss": 0.5332, "mean_token_accuracy": 0.8381474018096924, "num_tokens": 126582617.0, "step": 794 }, { "epoch": 0.4043743641912513, "grad_norm": 1.0866683721542358, "learning_rate": 1e-05, "loss": 0.5448, "mean_token_accuracy": 0.8342831134796143, "num_tokens": 126734456.0, "step": 795 }, { "epoch": 0.40488301119023395, "grad_norm": 1.0128190517425537, "learning_rate": 1e-05, "loss": 0.5367, "mean_token_accuracy": 0.835590124130249, "num_tokens": 126894951.0, "step": 796 }, { "epoch": 0.4053916581892167, "grad_norm": 1.161100149154663, "learning_rate": 1e-05, "loss": 0.5605, "mean_token_accuracy": 0.8282392024993896, "num_tokens": 127040903.0, "step": 797 }, { "epoch": 0.4059003051881994, "grad_norm": 1.1611078977584839, "learning_rate": 1e-05, "loss": 0.5763, "mean_token_accuracy": 0.8230469226837158, "num_tokens": 127193010.0, "step": 798 }, { "epoch": 0.4064089521871821, "grad_norm": 1.0060153007507324, "learning_rate": 1e-05, "loss": 0.5567, "mean_token_accuracy": 0.8288756608963013, "num_tokens": 127356124.0, "step": 799 }, { "epoch": 0.4069175991861648, "grad_norm": 1.014762043952942, "learning_rate": 1e-05, "loss": 0.5359, "mean_token_accuracy": 0.8361398577690125, "num_tokens": 127530354.0, "step": 800 }, { "epoch": 0.4074262461851475, "grad_norm": 1.1564446687698364, "learning_rate": 1e-05, "loss": 0.5436, "mean_token_accuracy": 0.8339039087295532, "num_tokens": 127674617.0, "step": 801 }, { "epoch": 0.40793489318413023, "grad_norm": 1.0868537425994873, "learning_rate": 1e-05, "loss": 0.5377, "mean_token_accuracy": 0.8345374464988708, "num_tokens": 127833308.0, "step": 802 }, { "epoch": 0.4084435401831129, "grad_norm": 1.187218427658081, "learning_rate": 1e-05, "loss": 0.5677, "mean_token_accuracy": 0.826422393321991, "num_tokens": 127994496.0, "step": 803 }, { "epoch": 0.40895218718209564, "grad_norm": 1.1443551778793335, "learning_rate": 1e-05, "loss": 0.5443, "mean_token_accuracy": 0.833172082901001, "num_tokens": 128161302.0, "step": 804 }, { "epoch": 0.4094608341810783, "grad_norm": 1.140722632408142, "learning_rate": 1e-05, "loss": 0.555, "mean_token_accuracy": 0.8290700316429138, "num_tokens": 128318321.0, "step": 805 }, { "epoch": 0.40996948118006105, "grad_norm": 1.1972770690917969, "learning_rate": 1e-05, "loss": 0.5492, "mean_token_accuracy": 0.8306229114532471, "num_tokens": 128485010.0, "step": 806 }, { "epoch": 0.4104781281790437, "grad_norm": 1.1250247955322266, "learning_rate": 1e-05, "loss": 0.5709, "mean_token_accuracy": 0.8245084285736084, "num_tokens": 128651307.0, "step": 807 }, { "epoch": 0.41098677517802645, "grad_norm": 1.1499453783035278, "learning_rate": 1e-05, "loss": 0.5326, "mean_token_accuracy": 0.8350856304168701, "num_tokens": 128794807.0, "step": 808 }, { "epoch": 0.41149542217700913, "grad_norm": 1.1552046537399292, "learning_rate": 1e-05, "loss": 0.5393, "mean_token_accuracy": 0.8341234922409058, "num_tokens": 128954027.0, "step": 809 }, { "epoch": 0.41200406917599186, "grad_norm": 1.1102688312530518, "learning_rate": 1e-05, "loss": 0.5383, "mean_token_accuracy": 0.83314049243927, "num_tokens": 129117445.0, "step": 810 }, { "epoch": 0.4125127161749746, "grad_norm": 1.0696158409118652, "learning_rate": 1e-05, "loss": 0.5389, "mean_token_accuracy": 0.8332682847976685, "num_tokens": 129285951.0, "step": 811 }, { "epoch": 0.41302136317395727, "grad_norm": 1.0423203706741333, "learning_rate": 1e-05, "loss": 0.5748, "mean_token_accuracy": 0.8261610269546509, "num_tokens": 129465561.0, "step": 812 }, { "epoch": 0.41353001017294, "grad_norm": 1.1869776248931885, "learning_rate": 1e-05, "loss": 0.4957, "mean_token_accuracy": 0.8446314334869385, "num_tokens": 129614099.0, "step": 813 }, { "epoch": 0.4140386571719227, "grad_norm": 1.0922024250030518, "learning_rate": 1e-05, "loss": 0.5367, "mean_token_accuracy": 0.8350465297698975, "num_tokens": 129789431.0, "step": 814 }, { "epoch": 0.4145473041709054, "grad_norm": 0.968750536441803, "learning_rate": 1e-05, "loss": 0.4969, "mean_token_accuracy": 0.8457138538360596, "num_tokens": 129947839.0, "step": 815 }, { "epoch": 0.4150559511698881, "grad_norm": 1.3184736967086792, "learning_rate": 1e-05, "loss": 0.5797, "mean_token_accuracy": 0.8236253261566162, "num_tokens": 130099262.0, "step": 816 }, { "epoch": 0.4155645981688708, "grad_norm": 1.151092529296875, "learning_rate": 1e-05, "loss": 0.5413, "mean_token_accuracy": 0.8329277038574219, "num_tokens": 130262007.0, "step": 817 }, { "epoch": 0.4160732451678535, "grad_norm": 1.134224534034729, "learning_rate": 1e-05, "loss": 0.5709, "mean_token_accuracy": 0.82569420337677, "num_tokens": 130412634.0, "step": 818 }, { "epoch": 0.4165818921668362, "grad_norm": 1.132689356803894, "learning_rate": 1e-05, "loss": 0.554, "mean_token_accuracy": 0.8313882350921631, "num_tokens": 130576459.0, "step": 819 }, { "epoch": 0.4170905391658189, "grad_norm": 1.1339707374572754, "learning_rate": 1e-05, "loss": 0.5519, "mean_token_accuracy": 0.8296515345573425, "num_tokens": 130738545.0, "step": 820 }, { "epoch": 0.41759918616480163, "grad_norm": 0.9829337000846863, "learning_rate": 1e-05, "loss": 0.5683, "mean_token_accuracy": 0.8278769254684448, "num_tokens": 130899383.0, "step": 821 }, { "epoch": 0.41810783316378436, "grad_norm": 1.1367756128311157, "learning_rate": 1e-05, "loss": 0.5361, "mean_token_accuracy": 0.8330759406089783, "num_tokens": 131069889.0, "step": 822 }, { "epoch": 0.41861648016276704, "grad_norm": 1.0659213066101074, "learning_rate": 1e-05, "loss": 0.5492, "mean_token_accuracy": 0.8324937224388123, "num_tokens": 131230218.0, "step": 823 }, { "epoch": 0.41912512716174977, "grad_norm": 1.2729610204696655, "learning_rate": 1e-05, "loss": 0.5525, "mean_token_accuracy": 0.8298782706260681, "num_tokens": 131374265.0, "step": 824 }, { "epoch": 0.41963377416073244, "grad_norm": 1.1545099020004272, "learning_rate": 1e-05, "loss": 0.5354, "mean_token_accuracy": 0.8355467319488525, "num_tokens": 131543646.0, "step": 825 }, { "epoch": 0.4201424211597152, "grad_norm": 1.100499153137207, "learning_rate": 1e-05, "loss": 0.5781, "mean_token_accuracy": 0.8228617310523987, "num_tokens": 131702768.0, "step": 826 }, { "epoch": 0.42065106815869785, "grad_norm": 1.2159390449523926, "learning_rate": 1e-05, "loss": 0.5612, "mean_token_accuracy": 0.8297737836837769, "num_tokens": 131860643.0, "step": 827 }, { "epoch": 0.4211597151576806, "grad_norm": 1.0870367288589478, "learning_rate": 1e-05, "loss": 0.5212, "mean_token_accuracy": 0.8382449746131897, "num_tokens": 132005394.0, "step": 828 }, { "epoch": 0.42166836215666326, "grad_norm": 1.1786516904830933, "learning_rate": 1e-05, "loss": 0.5291, "mean_token_accuracy": 0.835383415222168, "num_tokens": 132162321.0, "step": 829 }, { "epoch": 0.422177009155646, "grad_norm": 1.0985463857650757, "learning_rate": 1e-05, "loss": 0.5205, "mean_token_accuracy": 0.8396868109703064, "num_tokens": 132318929.0, "step": 830 }, { "epoch": 0.42268565615462866, "grad_norm": 1.0966908931732178, "learning_rate": 1e-05, "loss": 0.5353, "mean_token_accuracy": 0.836506724357605, "num_tokens": 132479785.0, "step": 831 }, { "epoch": 0.4231943031536114, "grad_norm": 1.1082007884979248, "learning_rate": 1e-05, "loss": 0.5069, "mean_token_accuracy": 0.8426880836486816, "num_tokens": 132639135.0, "step": 832 }, { "epoch": 0.4237029501525941, "grad_norm": 1.3057457208633423, "learning_rate": 1e-05, "loss": 0.5942, "mean_token_accuracy": 0.817604660987854, "num_tokens": 132798835.0, "step": 833 }, { "epoch": 0.4242115971515768, "grad_norm": 1.0456715822219849, "learning_rate": 1e-05, "loss": 0.5276, "mean_token_accuracy": 0.8367403745651245, "num_tokens": 132955884.0, "step": 834 }, { "epoch": 0.42472024415055953, "grad_norm": 1.2031054496765137, "learning_rate": 1e-05, "loss": 0.525, "mean_token_accuracy": 0.837424635887146, "num_tokens": 133118521.0, "step": 835 }, { "epoch": 0.4252288911495422, "grad_norm": 1.1078040599822998, "learning_rate": 1e-05, "loss": 0.5021, "mean_token_accuracy": 0.8442510366439819, "num_tokens": 133281242.0, "step": 836 }, { "epoch": 0.42573753814852494, "grad_norm": 1.1191807985305786, "learning_rate": 1e-05, "loss": 0.5346, "mean_token_accuracy": 0.8353614211082458, "num_tokens": 133450654.0, "step": 837 }, { "epoch": 0.4262461851475076, "grad_norm": 1.1722224950790405, "learning_rate": 1e-05, "loss": 0.5612, "mean_token_accuracy": 0.828338623046875, "num_tokens": 133620766.0, "step": 838 }, { "epoch": 0.42675483214649035, "grad_norm": 1.0733799934387207, "learning_rate": 1e-05, "loss": 0.566, "mean_token_accuracy": 0.8271428346633911, "num_tokens": 133790293.0, "step": 839 }, { "epoch": 0.427263479145473, "grad_norm": 1.1412605047225952, "learning_rate": 1e-05, "loss": 0.5242, "mean_token_accuracy": 0.8385905027389526, "num_tokens": 133962681.0, "step": 840 }, { "epoch": 0.42777212614445576, "grad_norm": 1.1903674602508545, "learning_rate": 1e-05, "loss": 0.5572, "mean_token_accuracy": 0.8283593654632568, "num_tokens": 134128875.0, "step": 841 }, { "epoch": 0.42828077314343843, "grad_norm": 1.1169062852859497, "learning_rate": 1e-05, "loss": 0.5915, "mean_token_accuracy": 0.8202317953109741, "num_tokens": 134299862.0, "step": 842 }, { "epoch": 0.42878942014242116, "grad_norm": 1.2607040405273438, "learning_rate": 1e-05, "loss": 0.5223, "mean_token_accuracy": 0.838632345199585, "num_tokens": 134447887.0, "step": 843 }, { "epoch": 0.42929806714140384, "grad_norm": 1.2096004486083984, "learning_rate": 1e-05, "loss": 0.5424, "mean_token_accuracy": 0.8333189487457275, "num_tokens": 134603411.0, "step": 844 }, { "epoch": 0.42980671414038657, "grad_norm": 1.0911368131637573, "learning_rate": 1e-05, "loss": 0.519, "mean_token_accuracy": 0.8395025730133057, "num_tokens": 134753687.0, "step": 845 }, { "epoch": 0.4303153611393693, "grad_norm": 1.1959718465805054, "learning_rate": 1e-05, "loss": 0.5493, "mean_token_accuracy": 0.8322297930717468, "num_tokens": 134910477.0, "step": 846 }, { "epoch": 0.430824008138352, "grad_norm": 1.0427497625350952, "learning_rate": 1e-05, "loss": 0.5564, "mean_token_accuracy": 0.8288239240646362, "num_tokens": 135065619.0, "step": 847 }, { "epoch": 0.4313326551373347, "grad_norm": 1.1838765144348145, "learning_rate": 1e-05, "loss": 0.5826, "mean_token_accuracy": 0.8237420320510864, "num_tokens": 135215377.0, "step": 848 }, { "epoch": 0.4318413021363174, "grad_norm": 1.099975824356079, "learning_rate": 1e-05, "loss": 0.5574, "mean_token_accuracy": 0.8286048769950867, "num_tokens": 135371149.0, "step": 849 }, { "epoch": 0.4323499491353001, "grad_norm": 0.9886288046836853, "learning_rate": 1e-05, "loss": 0.5407, "mean_token_accuracy": 0.8347184062004089, "num_tokens": 135530087.0, "step": 850 }, { "epoch": 0.4328585961342828, "grad_norm": 1.0195449590682983, "learning_rate": 1e-05, "loss": 0.5356, "mean_token_accuracy": 0.8346019983291626, "num_tokens": 135689766.0, "step": 851 }, { "epoch": 0.4333672431332655, "grad_norm": 1.000313639640808, "learning_rate": 1e-05, "loss": 0.5138, "mean_token_accuracy": 0.8414555191993713, "num_tokens": 135852811.0, "step": 852 }, { "epoch": 0.4338758901322482, "grad_norm": 1.0192416906356812, "learning_rate": 1e-05, "loss": 0.5371, "mean_token_accuracy": 0.8356699347496033, "num_tokens": 136011756.0, "step": 853 }, { "epoch": 0.43438453713123093, "grad_norm": 1.1201744079589844, "learning_rate": 1e-05, "loss": 0.5373, "mean_token_accuracy": 0.8338669538497925, "num_tokens": 136177024.0, "step": 854 }, { "epoch": 0.4348931841302136, "grad_norm": 1.1117981672286987, "learning_rate": 1e-05, "loss": 0.51, "mean_token_accuracy": 0.8413081169128418, "num_tokens": 136325905.0, "step": 855 }, { "epoch": 0.43540183112919634, "grad_norm": 1.0561431646347046, "learning_rate": 1e-05, "loss": 0.564, "mean_token_accuracy": 0.8267576098442078, "num_tokens": 136482739.0, "step": 856 }, { "epoch": 0.43591047812817907, "grad_norm": 1.1021794080734253, "learning_rate": 1e-05, "loss": 0.5187, "mean_token_accuracy": 0.840574324131012, "num_tokens": 136652674.0, "step": 857 }, { "epoch": 0.43641912512716174, "grad_norm": 1.1251834630966187, "learning_rate": 1e-05, "loss": 0.526, "mean_token_accuracy": 0.8371627330780029, "num_tokens": 136822688.0, "step": 858 }, { "epoch": 0.4369277721261445, "grad_norm": 1.0258177518844604, "learning_rate": 1e-05, "loss": 0.5436, "mean_token_accuracy": 0.833103597164154, "num_tokens": 136980565.0, "step": 859 }, { "epoch": 0.43743641912512715, "grad_norm": 1.0182185173034668, "learning_rate": 1e-05, "loss": 0.501, "mean_token_accuracy": 0.8441320657730103, "num_tokens": 137137295.0, "step": 860 }, { "epoch": 0.4379450661241099, "grad_norm": 1.1967811584472656, "learning_rate": 1e-05, "loss": 0.5279, "mean_token_accuracy": 0.8357078433036804, "num_tokens": 137301870.0, "step": 861 }, { "epoch": 0.43845371312309256, "grad_norm": 1.0632413625717163, "learning_rate": 1e-05, "loss": 0.5429, "mean_token_accuracy": 0.8325663805007935, "num_tokens": 137449013.0, "step": 862 }, { "epoch": 0.4389623601220753, "grad_norm": 1.1115227937698364, "learning_rate": 1e-05, "loss": 0.5444, "mean_token_accuracy": 0.8315908908843994, "num_tokens": 137604020.0, "step": 863 }, { "epoch": 0.43947100712105797, "grad_norm": 1.0673210620880127, "learning_rate": 1e-05, "loss": 0.5252, "mean_token_accuracy": 0.8367598652839661, "num_tokens": 137767680.0, "step": 864 }, { "epoch": 0.4399796541200407, "grad_norm": 1.0349266529083252, "learning_rate": 1e-05, "loss": 0.5362, "mean_token_accuracy": 0.8359603881835938, "num_tokens": 137912184.0, "step": 865 }, { "epoch": 0.4404883011190234, "grad_norm": 1.1605753898620605, "learning_rate": 1e-05, "loss": 0.5534, "mean_token_accuracy": 0.8329131007194519, "num_tokens": 138066689.0, "step": 866 }, { "epoch": 0.4409969481180061, "grad_norm": 1.0888206958770752, "learning_rate": 1e-05, "loss": 0.5543, "mean_token_accuracy": 0.8302878141403198, "num_tokens": 138235428.0, "step": 867 }, { "epoch": 0.44150559511698884, "grad_norm": 1.126081943511963, "learning_rate": 1e-05, "loss": 0.6004, "mean_token_accuracy": 0.8197178244590759, "num_tokens": 138390463.0, "step": 868 }, { "epoch": 0.4420142421159715, "grad_norm": 1.0957306623458862, "learning_rate": 1e-05, "loss": 0.5584, "mean_token_accuracy": 0.8287143111228943, "num_tokens": 138552942.0, "step": 869 }, { "epoch": 0.44252288911495424, "grad_norm": 1.1695512533187866, "learning_rate": 1e-05, "loss": 0.5362, "mean_token_accuracy": 0.8339910507202148, "num_tokens": 138712726.0, "step": 870 }, { "epoch": 0.4430315361139369, "grad_norm": 1.050856351852417, "learning_rate": 1e-05, "loss": 0.5312, "mean_token_accuracy": 0.83653324842453, "num_tokens": 138880829.0, "step": 871 }, { "epoch": 0.44354018311291965, "grad_norm": 1.1957542896270752, "learning_rate": 1e-05, "loss": 0.5521, "mean_token_accuracy": 0.8297691345214844, "num_tokens": 139056494.0, "step": 872 }, { "epoch": 0.4440488301119023, "grad_norm": 1.187698245048523, "learning_rate": 1e-05, "loss": 0.6039, "mean_token_accuracy": 0.8158071041107178, "num_tokens": 139216913.0, "step": 873 }, { "epoch": 0.44455747711088506, "grad_norm": 1.103418231010437, "learning_rate": 1e-05, "loss": 0.4986, "mean_token_accuracy": 0.8445761799812317, "num_tokens": 139362971.0, "step": 874 }, { "epoch": 0.44506612410986773, "grad_norm": 1.1682536602020264, "learning_rate": 1e-05, "loss": 0.5678, "mean_token_accuracy": 0.8246084451675415, "num_tokens": 139513998.0, "step": 875 }, { "epoch": 0.44557477110885046, "grad_norm": 1.117625117301941, "learning_rate": 1e-05, "loss": 0.5046, "mean_token_accuracy": 0.8425447940826416, "num_tokens": 139661330.0, "step": 876 }, { "epoch": 0.44608341810783314, "grad_norm": 1.0645250082015991, "learning_rate": 1e-05, "loss": 0.5313, "mean_token_accuracy": 0.8361184597015381, "num_tokens": 139830108.0, "step": 877 }, { "epoch": 0.44659206510681587, "grad_norm": 1.0478426218032837, "learning_rate": 1e-05, "loss": 0.5779, "mean_token_accuracy": 0.8240824937820435, "num_tokens": 139996848.0, "step": 878 }, { "epoch": 0.4471007121057986, "grad_norm": 1.070643663406372, "learning_rate": 1e-05, "loss": 0.5489, "mean_token_accuracy": 0.8307640552520752, "num_tokens": 140162511.0, "step": 879 }, { "epoch": 0.4476093591047813, "grad_norm": 1.0568501949310303, "learning_rate": 1e-05, "loss": 0.5285, "mean_token_accuracy": 0.8367209434509277, "num_tokens": 140315404.0, "step": 880 }, { "epoch": 0.448118006103764, "grad_norm": 1.106792688369751, "learning_rate": 1e-05, "loss": 0.5716, "mean_token_accuracy": 0.8244082927703857, "num_tokens": 140482165.0, "step": 881 }, { "epoch": 0.4486266531027467, "grad_norm": 1.1020784378051758, "learning_rate": 1e-05, "loss": 0.5338, "mean_token_accuracy": 0.8342344164848328, "num_tokens": 140632686.0, "step": 882 }, { "epoch": 0.4491353001017294, "grad_norm": 1.0874664783477783, "learning_rate": 1e-05, "loss": 0.5439, "mean_token_accuracy": 0.8320925235748291, "num_tokens": 140795616.0, "step": 883 }, { "epoch": 0.4496439471007121, "grad_norm": 1.1139014959335327, "learning_rate": 1e-05, "loss": 0.5159, "mean_token_accuracy": 0.8387758135795593, "num_tokens": 140945277.0, "step": 884 }, { "epoch": 0.4501525940996948, "grad_norm": 1.216664433479309, "learning_rate": 1e-05, "loss": 0.5413, "mean_token_accuracy": 0.8327680826187134, "num_tokens": 141105845.0, "step": 885 }, { "epoch": 0.4506612410986775, "grad_norm": 1.028967022895813, "learning_rate": 1e-05, "loss": 0.5432, "mean_token_accuracy": 0.8323748111724854, "num_tokens": 141261262.0, "step": 886 }, { "epoch": 0.45116988809766023, "grad_norm": 1.1917214393615723, "learning_rate": 1e-05, "loss": 0.5199, "mean_token_accuracy": 0.8402753472328186, "num_tokens": 141419743.0, "step": 887 }, { "epoch": 0.4516785350966429, "grad_norm": 1.134547233581543, "learning_rate": 1e-05, "loss": 0.5542, "mean_token_accuracy": 0.8280611038208008, "num_tokens": 141584264.0, "step": 888 }, { "epoch": 0.45218718209562564, "grad_norm": 0.978706419467926, "learning_rate": 1e-05, "loss": 0.5456, "mean_token_accuracy": 0.8324958086013794, "num_tokens": 141744206.0, "step": 889 }, { "epoch": 0.4526958290946083, "grad_norm": 1.1821465492248535, "learning_rate": 1e-05, "loss": 0.492, "mean_token_accuracy": 0.8444518446922302, "num_tokens": 141902690.0, "step": 890 }, { "epoch": 0.45320447609359105, "grad_norm": 1.079921841621399, "learning_rate": 1e-05, "loss": 0.5396, "mean_token_accuracy": 0.8340380191802979, "num_tokens": 142051810.0, "step": 891 }, { "epoch": 0.4537131230925738, "grad_norm": 1.1663280725479126, "learning_rate": 1e-05, "loss": 0.5491, "mean_token_accuracy": 0.8292901515960693, "num_tokens": 142212848.0, "step": 892 }, { "epoch": 0.45422177009155645, "grad_norm": 1.1193724870681763, "learning_rate": 1e-05, "loss": 0.5637, "mean_token_accuracy": 0.8273748755455017, "num_tokens": 142378279.0, "step": 893 }, { "epoch": 0.4547304170905392, "grad_norm": 1.0705968141555786, "learning_rate": 1e-05, "loss": 0.5554, "mean_token_accuracy": 0.8292081952095032, "num_tokens": 142533419.0, "step": 894 }, { "epoch": 0.45523906408952186, "grad_norm": 1.0355265140533447, "learning_rate": 1e-05, "loss": 0.5433, "mean_token_accuracy": 0.8332676887512207, "num_tokens": 142681407.0, "step": 895 }, { "epoch": 0.4557477110885046, "grad_norm": 1.0541002750396729, "learning_rate": 1e-05, "loss": 0.5455, "mean_token_accuracy": 0.8321433067321777, "num_tokens": 142834121.0, "step": 896 }, { "epoch": 0.45625635808748727, "grad_norm": 1.1078165769577026, "learning_rate": 1e-05, "loss": 0.5396, "mean_token_accuracy": 0.8324682712554932, "num_tokens": 142995941.0, "step": 897 }, { "epoch": 0.45676500508647, "grad_norm": 1.0441187620162964, "learning_rate": 1e-05, "loss": 0.5304, "mean_token_accuracy": 0.8361738324165344, "num_tokens": 143165685.0, "step": 898 }, { "epoch": 0.4572736520854527, "grad_norm": 1.1353709697723389, "learning_rate": 1e-05, "loss": 0.5243, "mean_token_accuracy": 0.8376160860061646, "num_tokens": 143324501.0, "step": 899 }, { "epoch": 0.4577822990844354, "grad_norm": 1.0670422315597534, "learning_rate": 1e-05, "loss": 0.5331, "mean_token_accuracy": 0.8358619809150696, "num_tokens": 143485496.0, "step": 900 }, { "epoch": 0.4582909460834181, "grad_norm": 1.0932042598724365, "learning_rate": 1e-05, "loss": 0.5655, "mean_token_accuracy": 0.8243823051452637, "num_tokens": 143640935.0, "step": 901 }, { "epoch": 0.4587995930824008, "grad_norm": 1.2445162534713745, "learning_rate": 1e-05, "loss": 0.5523, "mean_token_accuracy": 0.8297701478004456, "num_tokens": 143805905.0, "step": 902 }, { "epoch": 0.45930824008138355, "grad_norm": 1.1864466667175293, "learning_rate": 1e-05, "loss": 0.5639, "mean_token_accuracy": 0.8264601230621338, "num_tokens": 143960760.0, "step": 903 }, { "epoch": 0.4598168870803662, "grad_norm": 1.0087008476257324, "learning_rate": 1e-05, "loss": 0.4989, "mean_token_accuracy": 0.8451262712478638, "num_tokens": 144107657.0, "step": 904 }, { "epoch": 0.46032553407934895, "grad_norm": 1.4060758352279663, "learning_rate": 1e-05, "loss": 0.5462, "mean_token_accuracy": 0.8316032886505127, "num_tokens": 144261684.0, "step": 905 }, { "epoch": 0.46083418107833163, "grad_norm": 1.3151096105575562, "learning_rate": 1e-05, "loss": 0.5599, "mean_token_accuracy": 0.8292255401611328, "num_tokens": 144426788.0, "step": 906 }, { "epoch": 0.46134282807731436, "grad_norm": 0.9935531616210938, "learning_rate": 1e-05, "loss": 0.5139, "mean_token_accuracy": 0.8413008451461792, "num_tokens": 144588094.0, "step": 907 }, { "epoch": 0.46185147507629704, "grad_norm": 1.2904752492904663, "learning_rate": 1e-05, "loss": 0.5358, "mean_token_accuracy": 0.833750307559967, "num_tokens": 144742302.0, "step": 908 }, { "epoch": 0.46236012207527977, "grad_norm": 1.242739200592041, "learning_rate": 1e-05, "loss": 0.5541, "mean_token_accuracy": 0.8278312683105469, "num_tokens": 144889815.0, "step": 909 }, { "epoch": 0.46286876907426244, "grad_norm": 1.0914876461029053, "learning_rate": 1e-05, "loss": 0.5314, "mean_token_accuracy": 0.835478663444519, "num_tokens": 145041480.0, "step": 910 }, { "epoch": 0.4633774160732452, "grad_norm": 1.1101939678192139, "learning_rate": 1e-05, "loss": 0.5203, "mean_token_accuracy": 0.8407809138298035, "num_tokens": 145207314.0, "step": 911 }, { "epoch": 0.46388606307222785, "grad_norm": 1.0934215784072876, "learning_rate": 1e-05, "loss": 0.5442, "mean_token_accuracy": 0.8313193917274475, "num_tokens": 145365776.0, "step": 912 }, { "epoch": 0.4643947100712106, "grad_norm": 1.050016164779663, "learning_rate": 1e-05, "loss": 0.5408, "mean_token_accuracy": 0.8334711194038391, "num_tokens": 145530569.0, "step": 913 }, { "epoch": 0.4649033570701933, "grad_norm": 1.1077460050582886, "learning_rate": 1e-05, "loss": 0.5129, "mean_token_accuracy": 0.8392268419265747, "num_tokens": 145686829.0, "step": 914 }, { "epoch": 0.465412004069176, "grad_norm": 1.1671563386917114, "learning_rate": 1e-05, "loss": 0.5374, "mean_token_accuracy": 0.8326257467269897, "num_tokens": 145853471.0, "step": 915 }, { "epoch": 0.4659206510681587, "grad_norm": 1.0945162773132324, "learning_rate": 1e-05, "loss": 0.5565, "mean_token_accuracy": 0.8274140357971191, "num_tokens": 146031134.0, "step": 916 }, { "epoch": 0.4664292980671414, "grad_norm": 1.158488392829895, "learning_rate": 1e-05, "loss": 0.5865, "mean_token_accuracy": 0.8225713968276978, "num_tokens": 146186384.0, "step": 917 }, { "epoch": 0.4669379450661241, "grad_norm": 1.0633790493011475, "learning_rate": 1e-05, "loss": 0.5562, "mean_token_accuracy": 0.8297000527381897, "num_tokens": 146342004.0, "step": 918 }, { "epoch": 0.4674465920651068, "grad_norm": 1.045977234840393, "learning_rate": 1e-05, "loss": 0.5329, "mean_token_accuracy": 0.8342148065567017, "num_tokens": 146496365.0, "step": 919 }, { "epoch": 0.46795523906408953, "grad_norm": 1.0840096473693848, "learning_rate": 1e-05, "loss": 0.5541, "mean_token_accuracy": 0.830905556678772, "num_tokens": 146660600.0, "step": 920 }, { "epoch": 0.4684638860630722, "grad_norm": 1.1081016063690186, "learning_rate": 1e-05, "loss": 0.6068, "mean_token_accuracy": 0.8147741556167603, "num_tokens": 146823441.0, "step": 921 }, { "epoch": 0.46897253306205494, "grad_norm": 1.1600236892700195, "learning_rate": 1e-05, "loss": 0.5753, "mean_token_accuracy": 0.8257789611816406, "num_tokens": 146980645.0, "step": 922 }, { "epoch": 0.4694811800610376, "grad_norm": 1.0981653928756714, "learning_rate": 1e-05, "loss": 0.5677, "mean_token_accuracy": 0.825057864189148, "num_tokens": 147140337.0, "step": 923 }, { "epoch": 0.46998982706002035, "grad_norm": 1.1312999725341797, "learning_rate": 1e-05, "loss": 0.5473, "mean_token_accuracy": 0.830898106098175, "num_tokens": 147298752.0, "step": 924 }, { "epoch": 0.470498474059003, "grad_norm": 1.0304560661315918, "learning_rate": 1e-05, "loss": 0.4943, "mean_token_accuracy": 0.845089316368103, "num_tokens": 147452293.0, "step": 925 }, { "epoch": 0.47100712105798576, "grad_norm": 1.380007028579712, "learning_rate": 1e-05, "loss": 0.5369, "mean_token_accuracy": 0.8345775604248047, "num_tokens": 147606035.0, "step": 926 }, { "epoch": 0.4715157680569685, "grad_norm": 1.200478196144104, "learning_rate": 1e-05, "loss": 0.5083, "mean_token_accuracy": 0.8416787385940552, "num_tokens": 147740900.0, "step": 927 }, { "epoch": 0.47202441505595116, "grad_norm": 1.1169615983963013, "learning_rate": 1e-05, "loss": 0.5081, "mean_token_accuracy": 0.8411614298820496, "num_tokens": 147900796.0, "step": 928 }, { "epoch": 0.4725330620549339, "grad_norm": 1.1735718250274658, "learning_rate": 1e-05, "loss": 0.5557, "mean_token_accuracy": 0.8280220031738281, "num_tokens": 148048395.0, "step": 929 }, { "epoch": 0.47304170905391657, "grad_norm": 1.0261050462722778, "learning_rate": 1e-05, "loss": 0.4892, "mean_token_accuracy": 0.8481217622756958, "num_tokens": 148213700.0, "step": 930 }, { "epoch": 0.4735503560528993, "grad_norm": 1.1236721277236938, "learning_rate": 1e-05, "loss": 0.5357, "mean_token_accuracy": 0.8327924013137817, "num_tokens": 148360212.0, "step": 931 }, { "epoch": 0.474059003051882, "grad_norm": 1.1015719175338745, "learning_rate": 1e-05, "loss": 0.5319, "mean_token_accuracy": 0.835745632648468, "num_tokens": 148519502.0, "step": 932 }, { "epoch": 0.4745676500508647, "grad_norm": 1.4522173404693604, "learning_rate": 1e-05, "loss": 0.5592, "mean_token_accuracy": 0.8281233906745911, "num_tokens": 148685441.0, "step": 933 }, { "epoch": 0.4750762970498474, "grad_norm": 1.0404940843582153, "learning_rate": 1e-05, "loss": 0.5566, "mean_token_accuracy": 0.8302342891693115, "num_tokens": 148837050.0, "step": 934 }, { "epoch": 0.4755849440488301, "grad_norm": 1.0455938577651978, "learning_rate": 1e-05, "loss": 0.5201, "mean_token_accuracy": 0.8396633267402649, "num_tokens": 149001259.0, "step": 935 }, { "epoch": 0.4760935910478128, "grad_norm": 1.0345394611358643, "learning_rate": 1e-05, "loss": 0.568, "mean_token_accuracy": 0.8266684412956238, "num_tokens": 149158085.0, "step": 936 }, { "epoch": 0.4766022380467955, "grad_norm": 1.1560389995574951, "learning_rate": 1e-05, "loss": 0.5066, "mean_token_accuracy": 0.8434706330299377, "num_tokens": 149306579.0, "step": 937 }, { "epoch": 0.47711088504577825, "grad_norm": 1.0784151554107666, "learning_rate": 1e-05, "loss": 0.5931, "mean_token_accuracy": 0.8210408687591553, "num_tokens": 149470715.0, "step": 938 }, { "epoch": 0.47761953204476093, "grad_norm": 1.043968677520752, "learning_rate": 1e-05, "loss": 0.5462, "mean_token_accuracy": 0.8326769471168518, "num_tokens": 149626418.0, "step": 939 }, { "epoch": 0.47812817904374366, "grad_norm": 1.0808452367782593, "learning_rate": 1e-05, "loss": 0.5607, "mean_token_accuracy": 0.8269791603088379, "num_tokens": 149779243.0, "step": 940 }, { "epoch": 0.47863682604272634, "grad_norm": 0.9930307269096375, "learning_rate": 1e-05, "loss": 0.5337, "mean_token_accuracy": 0.8348826169967651, "num_tokens": 149928437.0, "step": 941 }, { "epoch": 0.47914547304170907, "grad_norm": 1.5441179275512695, "learning_rate": 1e-05, "loss": 0.5901, "mean_token_accuracy": 0.8194471597671509, "num_tokens": 150086890.0, "step": 942 }, { "epoch": 0.47965412004069174, "grad_norm": 1.1243621110916138, "learning_rate": 1e-05, "loss": 0.5035, "mean_token_accuracy": 0.8437596559524536, "num_tokens": 150246489.0, "step": 943 }, { "epoch": 0.4801627670396745, "grad_norm": 1.0576550960540771, "learning_rate": 1e-05, "loss": 0.5325, "mean_token_accuracy": 0.8357880115509033, "num_tokens": 150414091.0, "step": 944 }, { "epoch": 0.48067141403865715, "grad_norm": 1.0353620052337646, "learning_rate": 1e-05, "loss": 0.5518, "mean_token_accuracy": 0.8307743668556213, "num_tokens": 150581538.0, "step": 945 }, { "epoch": 0.4811800610376399, "grad_norm": 1.1700119972229004, "learning_rate": 1e-05, "loss": 0.5589, "mean_token_accuracy": 0.8275570869445801, "num_tokens": 150749759.0, "step": 946 }, { "epoch": 0.48168870803662256, "grad_norm": 1.160486102104187, "learning_rate": 1e-05, "loss": 0.5355, "mean_token_accuracy": 0.8344378471374512, "num_tokens": 150911352.0, "step": 947 }, { "epoch": 0.4821973550356053, "grad_norm": 1.0820791721343994, "learning_rate": 1e-05, "loss": 0.5445, "mean_token_accuracy": 0.8328191041946411, "num_tokens": 151078517.0, "step": 948 }, { "epoch": 0.482706002034588, "grad_norm": 1.0750031471252441, "learning_rate": 1e-05, "loss": 0.5639, "mean_token_accuracy": 0.8277915716171265, "num_tokens": 151244643.0, "step": 949 }, { "epoch": 0.4832146490335707, "grad_norm": 1.0744576454162598, "learning_rate": 1e-05, "loss": 0.5193, "mean_token_accuracy": 0.8381385207176208, "num_tokens": 151404760.0, "step": 950 }, { "epoch": 0.48372329603255343, "grad_norm": 1.1603211164474487, "learning_rate": 1e-05, "loss": 0.5303, "mean_token_accuracy": 0.8365808725357056, "num_tokens": 151555236.0, "step": 951 }, { "epoch": 0.4842319430315361, "grad_norm": 1.0513938665390015, "learning_rate": 1e-05, "loss": 0.4976, "mean_token_accuracy": 0.84479820728302, "num_tokens": 151714374.0, "step": 952 }, { "epoch": 0.48474059003051884, "grad_norm": 1.2464724779129028, "learning_rate": 1e-05, "loss": 0.5563, "mean_token_accuracy": 0.8285852670669556, "num_tokens": 151873694.0, "step": 953 }, { "epoch": 0.4852492370295015, "grad_norm": 1.6022312641143799, "learning_rate": 1e-05, "loss": 0.5001, "mean_token_accuracy": 0.8462530374526978, "num_tokens": 152034628.0, "step": 954 }, { "epoch": 0.48575788402848424, "grad_norm": 1.1984519958496094, "learning_rate": 1e-05, "loss": 0.5662, "mean_token_accuracy": 0.8279759883880615, "num_tokens": 152182009.0, "step": 955 }, { "epoch": 0.4862665310274669, "grad_norm": 1.0882331132888794, "learning_rate": 1e-05, "loss": 0.5118, "mean_token_accuracy": 0.8404330015182495, "num_tokens": 152336970.0, "step": 956 }, { "epoch": 0.48677517802644965, "grad_norm": 0.9980266094207764, "learning_rate": 1e-05, "loss": 0.5558, "mean_token_accuracy": 0.8287216424942017, "num_tokens": 152512035.0, "step": 957 }, { "epoch": 0.4872838250254323, "grad_norm": 1.5697441101074219, "learning_rate": 1e-05, "loss": 0.5442, "mean_token_accuracy": 0.8325050473213196, "num_tokens": 152684663.0, "step": 958 }, { "epoch": 0.48779247202441506, "grad_norm": 1.0851205587387085, "learning_rate": 1e-05, "loss": 0.5731, "mean_token_accuracy": 0.8240979909896851, "num_tokens": 152850154.0, "step": 959 }, { "epoch": 0.4883011190233978, "grad_norm": 1.0393105745315552, "learning_rate": 1e-05, "loss": 0.5432, "mean_token_accuracy": 0.8336237072944641, "num_tokens": 153010247.0, "step": 960 }, { "epoch": 0.48880976602238047, "grad_norm": 0.9916489720344543, "learning_rate": 1e-05, "loss": 0.5082, "mean_token_accuracy": 0.8411122560501099, "num_tokens": 153174299.0, "step": 961 }, { "epoch": 0.4893184130213632, "grad_norm": 1.0968836545944214, "learning_rate": 1e-05, "loss": 0.5218, "mean_token_accuracy": 0.837975263595581, "num_tokens": 153337306.0, "step": 962 }, { "epoch": 0.4898270600203459, "grad_norm": 1.5415338277816772, "learning_rate": 1e-05, "loss": 0.5503, "mean_token_accuracy": 0.8305727243423462, "num_tokens": 153492940.0, "step": 963 }, { "epoch": 0.4903357070193286, "grad_norm": 1.1375877857208252, "learning_rate": 1e-05, "loss": 0.5239, "mean_token_accuracy": 0.8357195854187012, "num_tokens": 153652759.0, "step": 964 }, { "epoch": 0.4908443540183113, "grad_norm": 1.08938729763031, "learning_rate": 1e-05, "loss": 0.5141, "mean_token_accuracy": 0.8405740857124329, "num_tokens": 153818682.0, "step": 965 }, { "epoch": 0.491353001017294, "grad_norm": 1.066440463066101, "learning_rate": 1e-05, "loss": 0.5272, "mean_token_accuracy": 0.8383237719535828, "num_tokens": 153982904.0, "step": 966 }, { "epoch": 0.4918616480162767, "grad_norm": 1.0795077085494995, "learning_rate": 1e-05, "loss": 0.5357, "mean_token_accuracy": 0.8331218361854553, "num_tokens": 154142701.0, "step": 967 }, { "epoch": 0.4923702950152594, "grad_norm": 1.1853792667388916, "learning_rate": 1e-05, "loss": 0.5836, "mean_token_accuracy": 0.8225834369659424, "num_tokens": 154318454.0, "step": 968 }, { "epoch": 0.4928789420142421, "grad_norm": 1.0849460363388062, "learning_rate": 1e-05, "loss": 0.5899, "mean_token_accuracy": 0.8213503360748291, "num_tokens": 154485998.0, "step": 969 }, { "epoch": 0.4933875890132248, "grad_norm": 1.0151082277297974, "learning_rate": 1e-05, "loss": 0.5389, "mean_token_accuracy": 0.8343636393547058, "num_tokens": 154646655.0, "step": 970 }, { "epoch": 0.4938962360122075, "grad_norm": 1.0000461339950562, "learning_rate": 1e-05, "loss": 0.5329, "mean_token_accuracy": 0.8375341892242432, "num_tokens": 154807349.0, "step": 971 }, { "epoch": 0.49440488301119023, "grad_norm": 1.0775938034057617, "learning_rate": 1e-05, "loss": 0.541, "mean_token_accuracy": 0.8329321146011353, "num_tokens": 154961491.0, "step": 972 }, { "epoch": 0.49491353001017296, "grad_norm": 1.1057847738265991, "learning_rate": 1e-05, "loss": 0.5324, "mean_token_accuracy": 0.8363903164863586, "num_tokens": 155114869.0, "step": 973 }, { "epoch": 0.49542217700915564, "grad_norm": 1.0250566005706787, "learning_rate": 1e-05, "loss": 0.5301, "mean_token_accuracy": 0.8337911367416382, "num_tokens": 155273562.0, "step": 974 }, { "epoch": 0.49593082400813837, "grad_norm": 1.373247504234314, "learning_rate": 1e-05, "loss": 0.4725, "mean_token_accuracy": 0.8508755564689636, "num_tokens": 155431483.0, "step": 975 }, { "epoch": 0.49643947100712105, "grad_norm": 1.1253653764724731, "learning_rate": 1e-05, "loss": 0.5481, "mean_token_accuracy": 0.8335919380187988, "num_tokens": 155577059.0, "step": 976 }, { "epoch": 0.4969481180061038, "grad_norm": 1.032317042350769, "learning_rate": 1e-05, "loss": 0.5654, "mean_token_accuracy": 0.8286297917366028, "num_tokens": 155734946.0, "step": 977 }, { "epoch": 0.49745676500508645, "grad_norm": 1.1771360635757446, "learning_rate": 1e-05, "loss": 0.5462, "mean_token_accuracy": 0.833104133605957, "num_tokens": 155896969.0, "step": 978 }, { "epoch": 0.4979654120040692, "grad_norm": 1.0790753364562988, "learning_rate": 1e-05, "loss": 0.5577, "mean_token_accuracy": 0.8295459747314453, "num_tokens": 156060582.0, "step": 979 }, { "epoch": 0.49847405900305186, "grad_norm": 1.1007190942764282, "learning_rate": 1e-05, "loss": 0.5259, "mean_token_accuracy": 0.83788001537323, "num_tokens": 156221661.0, "step": 980 }, { "epoch": 0.4989827060020346, "grad_norm": 1.074344277381897, "learning_rate": 1e-05, "loss": 0.5278, "mean_token_accuracy": 0.8362329602241516, "num_tokens": 156378238.0, "step": 981 }, { "epoch": 0.49949135300101727, "grad_norm": 1.1053829193115234, "learning_rate": 1e-05, "loss": 0.5745, "mean_token_accuracy": 0.8221993446350098, "num_tokens": 156538559.0, "step": 982 }, { "epoch": 0.5, "grad_norm": 1.070326805114746, "learning_rate": 1e-05, "loss": 0.5492, "mean_token_accuracy": 0.8299859762191772, "num_tokens": 156700036.0, "step": 983 }, { "epoch": 0.5005086469989827, "grad_norm": 0.9463331699371338, "learning_rate": 1e-05, "loss": 0.5565, "mean_token_accuracy": 0.8299938440322876, "num_tokens": 156879118.0, "step": 984 }, { "epoch": 0.5010172939979655, "grad_norm": 1.009292721748352, "learning_rate": 1e-05, "loss": 0.4972, "mean_token_accuracy": 0.8462924957275391, "num_tokens": 157043565.0, "step": 985 }, { "epoch": 0.5015259409969481, "grad_norm": 1.0810712575912476, "learning_rate": 1e-05, "loss": 0.5365, "mean_token_accuracy": 0.833280086517334, "num_tokens": 157198178.0, "step": 986 }, { "epoch": 0.5020345879959308, "grad_norm": 1.0298830270767212, "learning_rate": 1e-05, "loss": 0.5399, "mean_token_accuracy": 0.8341619968414307, "num_tokens": 157349621.0, "step": 987 }, { "epoch": 0.5025432349949135, "grad_norm": 1.095278024673462, "learning_rate": 1e-05, "loss": 0.5261, "mean_token_accuracy": 0.8374025225639343, "num_tokens": 157519093.0, "step": 988 }, { "epoch": 0.5030518819938963, "grad_norm": 1.0355744361877441, "learning_rate": 1e-05, "loss": 0.528, "mean_token_accuracy": 0.8360305428504944, "num_tokens": 157671835.0, "step": 989 }, { "epoch": 0.503560528992879, "grad_norm": 1.0776753425598145, "learning_rate": 1e-05, "loss": 0.526, "mean_token_accuracy": 0.8369237780570984, "num_tokens": 157826508.0, "step": 990 }, { "epoch": 0.5040691759918616, "grad_norm": 1.1481845378875732, "learning_rate": 1e-05, "loss": 0.5528, "mean_token_accuracy": 0.8297188878059387, "num_tokens": 157983854.0, "step": 991 }, { "epoch": 0.5045778229908443, "grad_norm": 1.1202970743179321, "learning_rate": 1e-05, "loss": 0.5434, "mean_token_accuracy": 0.8322055339813232, "num_tokens": 158153856.0, "step": 992 }, { "epoch": 0.5050864699898271, "grad_norm": 0.963240921497345, "learning_rate": 1e-05, "loss": 0.5158, "mean_token_accuracy": 0.8408123254776001, "num_tokens": 158311723.0, "step": 993 }, { "epoch": 0.5055951169888098, "grad_norm": 1.0589247941970825, "learning_rate": 1e-05, "loss": 0.5716, "mean_token_accuracy": 0.8279574513435364, "num_tokens": 158468860.0, "step": 994 }, { "epoch": 0.5061037639877924, "grad_norm": 1.0449001789093018, "learning_rate": 1e-05, "loss": 0.547, "mean_token_accuracy": 0.8317117691040039, "num_tokens": 158623811.0, "step": 995 }, { "epoch": 0.5066124109867752, "grad_norm": 1.028795838356018, "learning_rate": 1e-05, "loss": 0.532, "mean_token_accuracy": 0.8349628448486328, "num_tokens": 158789067.0, "step": 996 }, { "epoch": 0.5071210579857579, "grad_norm": 1.1221290826797485, "learning_rate": 1e-05, "loss": 0.5566, "mean_token_accuracy": 0.8285014629364014, "num_tokens": 158950421.0, "step": 997 }, { "epoch": 0.5076297049847406, "grad_norm": 1.1094428300857544, "learning_rate": 1e-05, "loss": 0.528, "mean_token_accuracy": 0.8371928930282593, "num_tokens": 159115006.0, "step": 998 }, { "epoch": 0.5081383519837233, "grad_norm": 1.1847496032714844, "learning_rate": 1e-05, "loss": 0.5235, "mean_token_accuracy": 0.8361454010009766, "num_tokens": 159267147.0, "step": 999 }, { "epoch": 0.508646998982706, "grad_norm": 1.0200287103652954, "learning_rate": 1e-05, "loss": 0.516, "mean_token_accuracy": 0.8391585946083069, "num_tokens": 159430845.0, "step": 1000 }, { "epoch": 0.5091556459816887, "grad_norm": 1.096622109413147, "learning_rate": 1e-05, "loss": 0.5425, "mean_token_accuracy": 0.8338131904602051, "num_tokens": 159592673.0, "step": 1001 }, { "epoch": 0.5096642929806714, "grad_norm": 1.1021353006362915, "learning_rate": 1e-05, "loss": 0.546, "mean_token_accuracy": 0.8303905725479126, "num_tokens": 159766028.0, "step": 1002 }, { "epoch": 0.5101729399796541, "grad_norm": 1.0417699813842773, "learning_rate": 1e-05, "loss": 0.5339, "mean_token_accuracy": 0.8354570865631104, "num_tokens": 159928990.0, "step": 1003 }, { "epoch": 0.5106815869786369, "grad_norm": 1.0172051191329956, "learning_rate": 1e-05, "loss": 0.4989, "mean_token_accuracy": 0.8456557989120483, "num_tokens": 160093084.0, "step": 1004 }, { "epoch": 0.5111902339776195, "grad_norm": 1.036210536956787, "learning_rate": 1e-05, "loss": 0.5578, "mean_token_accuracy": 0.8279012441635132, "num_tokens": 160253282.0, "step": 1005 }, { "epoch": 0.5116988809766022, "grad_norm": 1.0002751350402832, "learning_rate": 1e-05, "loss": 0.5162, "mean_token_accuracy": 0.8411789536476135, "num_tokens": 160414551.0, "step": 1006 }, { "epoch": 0.512207527975585, "grad_norm": 1.19473397731781, "learning_rate": 1e-05, "loss": 0.5617, "mean_token_accuracy": 0.8280037641525269, "num_tokens": 160566903.0, "step": 1007 }, { "epoch": 0.5127161749745677, "grad_norm": 0.9549198746681213, "learning_rate": 1e-05, "loss": 0.5119, "mean_token_accuracy": 0.842778742313385, "num_tokens": 160732492.0, "step": 1008 }, { "epoch": 0.5132248219735503, "grad_norm": 1.104193091392517, "learning_rate": 1e-05, "loss": 0.5599, "mean_token_accuracy": 0.8274698853492737, "num_tokens": 160901405.0, "step": 1009 }, { "epoch": 0.513733468972533, "grad_norm": 1.0524988174438477, "learning_rate": 1e-05, "loss": 0.5201, "mean_token_accuracy": 0.8397318124771118, "num_tokens": 161063180.0, "step": 1010 }, { "epoch": 0.5142421159715158, "grad_norm": 1.068765640258789, "learning_rate": 1e-05, "loss": 0.5529, "mean_token_accuracy": 0.831360936164856, "num_tokens": 161224667.0, "step": 1011 }, { "epoch": 0.5147507629704985, "grad_norm": 1.076060175895691, "learning_rate": 1e-05, "loss": 0.5444, "mean_token_accuracy": 0.8327022790908813, "num_tokens": 161375840.0, "step": 1012 }, { "epoch": 0.5152594099694812, "grad_norm": 1.0867894887924194, "learning_rate": 1e-05, "loss": 0.551, "mean_token_accuracy": 0.8314119577407837, "num_tokens": 161539381.0, "step": 1013 }, { "epoch": 0.5157680569684638, "grad_norm": 1.0638878345489502, "learning_rate": 1e-05, "loss": 0.5486, "mean_token_accuracy": 0.8310900926589966, "num_tokens": 161712451.0, "step": 1014 }, { "epoch": 0.5162767039674466, "grad_norm": 1.0727468729019165, "learning_rate": 1e-05, "loss": 0.5481, "mean_token_accuracy": 0.8312073945999146, "num_tokens": 161891850.0, "step": 1015 }, { "epoch": 0.5167853509664293, "grad_norm": 1.1078076362609863, "learning_rate": 1e-05, "loss": 0.5165, "mean_token_accuracy": 0.8393900394439697, "num_tokens": 162053742.0, "step": 1016 }, { "epoch": 0.517293997965412, "grad_norm": 1.01996648311615, "learning_rate": 1e-05, "loss": 0.5195, "mean_token_accuracy": 0.8393486738204956, "num_tokens": 162209650.0, "step": 1017 }, { "epoch": 0.5178026449643948, "grad_norm": 1.0445448160171509, "learning_rate": 1e-05, "loss": 0.5701, "mean_token_accuracy": 0.8260886669158936, "num_tokens": 162383502.0, "step": 1018 }, { "epoch": 0.5183112919633774, "grad_norm": 1.0632621049880981, "learning_rate": 1e-05, "loss": 0.5055, "mean_token_accuracy": 0.8429189324378967, "num_tokens": 162544656.0, "step": 1019 }, { "epoch": 0.5188199389623601, "grad_norm": 1.0640442371368408, "learning_rate": 1e-05, "loss": 0.5447, "mean_token_accuracy": 0.8306871056556702, "num_tokens": 162721800.0, "step": 1020 }, { "epoch": 0.5193285859613428, "grad_norm": 1.1594151258468628, "learning_rate": 1e-05, "loss": 0.5355, "mean_token_accuracy": 0.8341934680938721, "num_tokens": 162876310.0, "step": 1021 }, { "epoch": 0.5198372329603256, "grad_norm": 0.9929187297821045, "learning_rate": 1e-05, "loss": 0.5354, "mean_token_accuracy": 0.8347718715667725, "num_tokens": 163040860.0, "step": 1022 }, { "epoch": 0.5203458799593083, "grad_norm": 1.0557368993759155, "learning_rate": 1e-05, "loss": 0.546, "mean_token_accuracy": 0.8325394988059998, "num_tokens": 163207134.0, "step": 1023 }, { "epoch": 0.5208545269582909, "grad_norm": 1.1534385681152344, "learning_rate": 1e-05, "loss": 0.5223, "mean_token_accuracy": 0.8368192911148071, "num_tokens": 163354092.0, "step": 1024 }, { "epoch": 0.5213631739572736, "grad_norm": 1.0177485942840576, "learning_rate": 1e-05, "loss": 0.542, "mean_token_accuracy": 0.8332100510597229, "num_tokens": 163506047.0, "step": 1025 }, { "epoch": 0.5218718209562564, "grad_norm": 1.1139676570892334, "learning_rate": 1e-05, "loss": 0.5349, "mean_token_accuracy": 0.8346984386444092, "num_tokens": 163677302.0, "step": 1026 }, { "epoch": 0.5223804679552391, "grad_norm": 1.1026079654693604, "learning_rate": 1e-05, "loss": 0.5222, "mean_token_accuracy": 0.8363239169120789, "num_tokens": 163838092.0, "step": 1027 }, { "epoch": 0.5228891149542217, "grad_norm": 1.0351805686950684, "learning_rate": 1e-05, "loss": 0.513, "mean_token_accuracy": 0.8408992290496826, "num_tokens": 163993198.0, "step": 1028 }, { "epoch": 0.5233977619532044, "grad_norm": 1.3836876153945923, "learning_rate": 1e-05, "loss": 0.5711, "mean_token_accuracy": 0.8245905637741089, "num_tokens": 164159543.0, "step": 1029 }, { "epoch": 0.5239064089521872, "grad_norm": 1.176855444908142, "learning_rate": 1e-05, "loss": 0.5256, "mean_token_accuracy": 0.8370753526687622, "num_tokens": 164325603.0, "step": 1030 }, { "epoch": 0.5244150559511699, "grad_norm": 1.0500699281692505, "learning_rate": 1e-05, "loss": 0.5213, "mean_token_accuracy": 0.8387798070907593, "num_tokens": 164481267.0, "step": 1031 }, { "epoch": 0.5249237029501526, "grad_norm": 1.14847731590271, "learning_rate": 1e-05, "loss": 0.5077, "mean_token_accuracy": 0.8410720825195312, "num_tokens": 164634955.0, "step": 1032 }, { "epoch": 0.5254323499491353, "grad_norm": 1.0690829753875732, "learning_rate": 1e-05, "loss": 0.5138, "mean_token_accuracy": 0.8392068147659302, "num_tokens": 164788873.0, "step": 1033 }, { "epoch": 0.525940996948118, "grad_norm": 1.161458969116211, "learning_rate": 1e-05, "loss": 0.5155, "mean_token_accuracy": 0.838508129119873, "num_tokens": 164949736.0, "step": 1034 }, { "epoch": 0.5264496439471007, "grad_norm": 1.0988707542419434, "learning_rate": 1e-05, "loss": 0.5545, "mean_token_accuracy": 0.8288130164146423, "num_tokens": 165111991.0, "step": 1035 }, { "epoch": 0.5269582909460834, "grad_norm": 1.038994312286377, "learning_rate": 1e-05, "loss": 0.52, "mean_token_accuracy": 0.8401618003845215, "num_tokens": 165268681.0, "step": 1036 }, { "epoch": 0.5274669379450662, "grad_norm": 0.9615954756736755, "learning_rate": 1e-05, "loss": 0.5043, "mean_token_accuracy": 0.8433043956756592, "num_tokens": 165439160.0, "step": 1037 }, { "epoch": 0.5279755849440488, "grad_norm": 0.9413687586784363, "learning_rate": 1e-05, "loss": 0.5042, "mean_token_accuracy": 0.8420565128326416, "num_tokens": 165611395.0, "step": 1038 }, { "epoch": 0.5284842319430315, "grad_norm": 1.0146604776382446, "learning_rate": 1e-05, "loss": 0.5521, "mean_token_accuracy": 0.8289092183113098, "num_tokens": 165770065.0, "step": 1039 }, { "epoch": 0.5289928789420142, "grad_norm": 1.0559642314910889, "learning_rate": 1e-05, "loss": 0.5718, "mean_token_accuracy": 0.8239895105361938, "num_tokens": 165930541.0, "step": 1040 }, { "epoch": 0.529501525940997, "grad_norm": 1.0382318496704102, "learning_rate": 1e-05, "loss": 0.5298, "mean_token_accuracy": 0.8355600237846375, "num_tokens": 166095798.0, "step": 1041 }, { "epoch": 0.5300101729399797, "grad_norm": 1.1315993070602417, "learning_rate": 1e-05, "loss": 0.5313, "mean_token_accuracy": 0.8369777202606201, "num_tokens": 166253912.0, "step": 1042 }, { "epoch": 0.5305188199389623, "grad_norm": 1.0437849760055542, "learning_rate": 1e-05, "loss": 0.5123, "mean_token_accuracy": 0.8429243564605713, "num_tokens": 166414865.0, "step": 1043 }, { "epoch": 0.5310274669379451, "grad_norm": 1.1972240209579468, "learning_rate": 1e-05, "loss": 0.5492, "mean_token_accuracy": 0.8301947116851807, "num_tokens": 166570026.0, "step": 1044 }, { "epoch": 0.5315361139369278, "grad_norm": 1.0601789951324463, "learning_rate": 1e-05, "loss": 0.5288, "mean_token_accuracy": 0.8359141945838928, "num_tokens": 166723092.0, "step": 1045 }, { "epoch": 0.5320447609359105, "grad_norm": 1.1009857654571533, "learning_rate": 1e-05, "loss": 0.5452, "mean_token_accuracy": 0.8325526714324951, "num_tokens": 166891414.0, "step": 1046 }, { "epoch": 0.5325534079348931, "grad_norm": 1.0576329231262207, "learning_rate": 1e-05, "loss": 0.566, "mean_token_accuracy": 0.8268650770187378, "num_tokens": 167043099.0, "step": 1047 }, { "epoch": 0.5330620549338759, "grad_norm": 1.107357144355774, "learning_rate": 1e-05, "loss": 0.5345, "mean_token_accuracy": 0.8358545303344727, "num_tokens": 167199448.0, "step": 1048 }, { "epoch": 0.5335707019328586, "grad_norm": 1.069495677947998, "learning_rate": 1e-05, "loss": 0.546, "mean_token_accuracy": 0.8324536681175232, "num_tokens": 167356786.0, "step": 1049 }, { "epoch": 0.5340793489318413, "grad_norm": 1.0422338247299194, "learning_rate": 1e-05, "loss": 0.5272, "mean_token_accuracy": 0.8359999060630798, "num_tokens": 167508836.0, "step": 1050 }, { "epoch": 0.534587995930824, "grad_norm": 1.057760238647461, "learning_rate": 1e-05, "loss": 0.5194, "mean_token_accuracy": 0.8387189507484436, "num_tokens": 167670690.0, "step": 1051 }, { "epoch": 0.5350966429298067, "grad_norm": 1.1019717454910278, "learning_rate": 1e-05, "loss": 0.5587, "mean_token_accuracy": 0.8284574151039124, "num_tokens": 167821540.0, "step": 1052 }, { "epoch": 0.5356052899287894, "grad_norm": 1.0659122467041016, "learning_rate": 1e-05, "loss": 0.5128, "mean_token_accuracy": 0.8404115438461304, "num_tokens": 167998236.0, "step": 1053 }, { "epoch": 0.5361139369277721, "grad_norm": 1.0259801149368286, "learning_rate": 1e-05, "loss": 0.5335, "mean_token_accuracy": 0.8355931043624878, "num_tokens": 168163810.0, "step": 1054 }, { "epoch": 0.5366225839267549, "grad_norm": 1.013709306716919, "learning_rate": 1e-05, "loss": 0.5012, "mean_token_accuracy": 0.8424390554428101, "num_tokens": 168312526.0, "step": 1055 }, { "epoch": 0.5371312309257376, "grad_norm": 1.0958232879638672, "learning_rate": 1e-05, "loss": 0.5175, "mean_token_accuracy": 0.8409861326217651, "num_tokens": 168465144.0, "step": 1056 }, { "epoch": 0.5376398779247202, "grad_norm": 1.0535881519317627, "learning_rate": 1e-05, "loss": 0.5212, "mean_token_accuracy": 0.8389724493026733, "num_tokens": 168621602.0, "step": 1057 }, { "epoch": 0.5381485249237029, "grad_norm": 1.065081238746643, "learning_rate": 1e-05, "loss": 0.5579, "mean_token_accuracy": 0.8279095888137817, "num_tokens": 168772296.0, "step": 1058 }, { "epoch": 0.5386571719226857, "grad_norm": 1.122185468673706, "learning_rate": 1e-05, "loss": 0.5207, "mean_token_accuracy": 0.8379378914833069, "num_tokens": 168930909.0, "step": 1059 }, { "epoch": 0.5391658189216684, "grad_norm": 1.0356076955795288, "learning_rate": 1e-05, "loss": 0.5481, "mean_token_accuracy": 0.8306833505630493, "num_tokens": 169084941.0, "step": 1060 }, { "epoch": 0.539674465920651, "grad_norm": 1.203890323638916, "learning_rate": 1e-05, "loss": 0.516, "mean_token_accuracy": 0.8395755290985107, "num_tokens": 169235523.0, "step": 1061 }, { "epoch": 0.5401831129196337, "grad_norm": 1.3647130727767944, "learning_rate": 1e-05, "loss": 0.516, "mean_token_accuracy": 0.8395922183990479, "num_tokens": 169398931.0, "step": 1062 }, { "epoch": 0.5406917599186165, "grad_norm": 1.1263409852981567, "learning_rate": 1e-05, "loss": 0.5113, "mean_token_accuracy": 0.8389729261398315, "num_tokens": 169559733.0, "step": 1063 }, { "epoch": 0.5412004069175992, "grad_norm": 0.9808051586151123, "learning_rate": 1e-05, "loss": 0.5292, "mean_token_accuracy": 0.8377224206924438, "num_tokens": 169713890.0, "step": 1064 }, { "epoch": 0.5417090539165819, "grad_norm": 1.0912060737609863, "learning_rate": 1e-05, "loss": 0.5197, "mean_token_accuracy": 0.8408249020576477, "num_tokens": 169872319.0, "step": 1065 }, { "epoch": 0.5422177009155646, "grad_norm": 1.1292952299118042, "learning_rate": 1e-05, "loss": 0.5675, "mean_token_accuracy": 0.8258577585220337, "num_tokens": 170042021.0, "step": 1066 }, { "epoch": 0.5427263479145473, "grad_norm": 1.1030066013336182, "learning_rate": 1e-05, "loss": 0.5364, "mean_token_accuracy": 0.8327227830886841, "num_tokens": 170190297.0, "step": 1067 }, { "epoch": 0.54323499491353, "grad_norm": 1.0513752698898315, "learning_rate": 1e-05, "loss": 0.5024, "mean_token_accuracy": 0.8434488773345947, "num_tokens": 170344681.0, "step": 1068 }, { "epoch": 0.5437436419125127, "grad_norm": 1.0956968069076538, "learning_rate": 1e-05, "loss": 0.5461, "mean_token_accuracy": 0.8309398889541626, "num_tokens": 170503123.0, "step": 1069 }, { "epoch": 0.5442522889114955, "grad_norm": 1.0473368167877197, "learning_rate": 1e-05, "loss": 0.5228, "mean_token_accuracy": 0.8388941287994385, "num_tokens": 170662002.0, "step": 1070 }, { "epoch": 0.5447609359104781, "grad_norm": 1.2247461080551147, "learning_rate": 1e-05, "loss": 0.5789, "mean_token_accuracy": 0.8226563334465027, "num_tokens": 170819743.0, "step": 1071 }, { "epoch": 0.5452695829094608, "grad_norm": 0.992435872554779, "learning_rate": 1e-05, "loss": 0.5033, "mean_token_accuracy": 0.8444223999977112, "num_tokens": 170965381.0, "step": 1072 }, { "epoch": 0.5457782299084435, "grad_norm": 1.111801266670227, "learning_rate": 1e-05, "loss": 0.5441, "mean_token_accuracy": 0.8337465524673462, "num_tokens": 171118434.0, "step": 1073 }, { "epoch": 0.5462868769074263, "grad_norm": 0.9795582294464111, "learning_rate": 1e-05, "loss": 0.583, "mean_token_accuracy": 0.8244104981422424, "num_tokens": 171297534.0, "step": 1074 }, { "epoch": 0.546795523906409, "grad_norm": 1.0069224834442139, "learning_rate": 1e-05, "loss": 0.5169, "mean_token_accuracy": 0.8392422199249268, "num_tokens": 171454468.0, "step": 1075 }, { "epoch": 0.5473041709053916, "grad_norm": 1.1006546020507812, "learning_rate": 1e-05, "loss": 0.5477, "mean_token_accuracy": 0.830169677734375, "num_tokens": 171615049.0, "step": 1076 }, { "epoch": 0.5478128179043744, "grad_norm": 1.012514591217041, "learning_rate": 1e-05, "loss": 0.5176, "mean_token_accuracy": 0.8388389348983765, "num_tokens": 171777974.0, "step": 1077 }, { "epoch": 0.5483214649033571, "grad_norm": 1.052994728088379, "learning_rate": 1e-05, "loss": 0.5153, "mean_token_accuracy": 0.8392099738121033, "num_tokens": 171931276.0, "step": 1078 }, { "epoch": 0.5488301119023398, "grad_norm": 1.0102014541625977, "learning_rate": 1e-05, "loss": 0.5226, "mean_token_accuracy": 0.838862419128418, "num_tokens": 172092986.0, "step": 1079 }, { "epoch": 0.5493387589013224, "grad_norm": 1.006404161453247, "learning_rate": 1e-05, "loss": 0.5346, "mean_token_accuracy": 0.8348338603973389, "num_tokens": 172259334.0, "step": 1080 }, { "epoch": 0.5498474059003052, "grad_norm": 1.0299087762832642, "learning_rate": 1e-05, "loss": 0.5221, "mean_token_accuracy": 0.840072751045227, "num_tokens": 172432734.0, "step": 1081 }, { "epoch": 0.5503560528992879, "grad_norm": 1.0475842952728271, "learning_rate": 1e-05, "loss": 0.5122, "mean_token_accuracy": 0.840729296207428, "num_tokens": 172580086.0, "step": 1082 }, { "epoch": 0.5508646998982706, "grad_norm": 1.0941567420959473, "learning_rate": 1e-05, "loss": 0.5321, "mean_token_accuracy": 0.8355224132537842, "num_tokens": 172733854.0, "step": 1083 }, { "epoch": 0.5513733468972533, "grad_norm": 1.0811351537704468, "learning_rate": 1e-05, "loss": 0.5478, "mean_token_accuracy": 0.8315902948379517, "num_tokens": 172895819.0, "step": 1084 }, { "epoch": 0.551881993896236, "grad_norm": 1.003430724143982, "learning_rate": 1e-05, "loss": 0.5164, "mean_token_accuracy": 0.8390946388244629, "num_tokens": 173057284.0, "step": 1085 }, { "epoch": 0.5523906408952187, "grad_norm": 1.0279691219329834, "learning_rate": 1e-05, "loss": 0.5143, "mean_token_accuracy": 0.8408864140510559, "num_tokens": 173207899.0, "step": 1086 }, { "epoch": 0.5528992878942014, "grad_norm": 1.1198608875274658, "learning_rate": 1e-05, "loss": 0.5385, "mean_token_accuracy": 0.8336430788040161, "num_tokens": 173378839.0, "step": 1087 }, { "epoch": 0.5534079348931842, "grad_norm": 1.075195074081421, "learning_rate": 1e-05, "loss": 0.5266, "mean_token_accuracy": 0.8351539373397827, "num_tokens": 173547290.0, "step": 1088 }, { "epoch": 0.5539165818921669, "grad_norm": 1.0774073600769043, "learning_rate": 1e-05, "loss": 0.5401, "mean_token_accuracy": 0.8340626955032349, "num_tokens": 173706562.0, "step": 1089 }, { "epoch": 0.5544252288911495, "grad_norm": 1.069839358329773, "learning_rate": 1e-05, "loss": 0.5642, "mean_token_accuracy": 0.8282644748687744, "num_tokens": 173872148.0, "step": 1090 }, { "epoch": 0.5549338758901322, "grad_norm": 1.0688462257385254, "learning_rate": 1e-05, "loss": 0.5197, "mean_token_accuracy": 0.8376371264457703, "num_tokens": 174021510.0, "step": 1091 }, { "epoch": 0.555442522889115, "grad_norm": 1.1761980056762695, "learning_rate": 1e-05, "loss": 0.566, "mean_token_accuracy": 0.8264144659042358, "num_tokens": 174174594.0, "step": 1092 }, { "epoch": 0.5559511698880977, "grad_norm": 1.1928164958953857, "learning_rate": 1e-05, "loss": 0.5519, "mean_token_accuracy": 0.8272026181221008, "num_tokens": 174327672.0, "step": 1093 }, { "epoch": 0.5564598168870803, "grad_norm": 1.1046916246414185, "learning_rate": 1e-05, "loss": 0.5155, "mean_token_accuracy": 0.8390681743621826, "num_tokens": 174483118.0, "step": 1094 }, { "epoch": 0.556968463886063, "grad_norm": 1.0523358583450317, "learning_rate": 1e-05, "loss": 0.5209, "mean_token_accuracy": 0.8369934558868408, "num_tokens": 174644489.0, "step": 1095 }, { "epoch": 0.5574771108850458, "grad_norm": 1.1735951900482178, "learning_rate": 1e-05, "loss": 0.5318, "mean_token_accuracy": 0.8356931209564209, "num_tokens": 174798555.0, "step": 1096 }, { "epoch": 0.5579857578840285, "grad_norm": 1.0315639972686768, "learning_rate": 1e-05, "loss": 0.5258, "mean_token_accuracy": 0.837049126625061, "num_tokens": 174955732.0, "step": 1097 }, { "epoch": 0.5584944048830112, "grad_norm": 0.9019716382026672, "learning_rate": 1e-05, "loss": 0.5236, "mean_token_accuracy": 0.8368342518806458, "num_tokens": 175130207.0, "step": 1098 }, { "epoch": 0.559003051881994, "grad_norm": 1.1175049543380737, "learning_rate": 1e-05, "loss": 0.507, "mean_token_accuracy": 0.8416058421134949, "num_tokens": 175283656.0, "step": 1099 }, { "epoch": 0.5595116988809766, "grad_norm": 1.0007203817367554, "learning_rate": 1e-05, "loss": 0.5157, "mean_token_accuracy": 0.8406989574432373, "num_tokens": 175445505.0, "step": 1100 }, { "epoch": 0.5600203458799593, "grad_norm": 1.105495572090149, "learning_rate": 1e-05, "loss": 0.5244, "mean_token_accuracy": 0.8378565907478333, "num_tokens": 175600838.0, "step": 1101 }, { "epoch": 0.560528992878942, "grad_norm": 1.0666669607162476, "learning_rate": 1e-05, "loss": 0.552, "mean_token_accuracy": 0.8297303915023804, "num_tokens": 175767713.0, "step": 1102 }, { "epoch": 0.5610376398779248, "grad_norm": 1.0583263635635376, "learning_rate": 1e-05, "loss": 0.5514, "mean_token_accuracy": 0.8285160064697266, "num_tokens": 175927627.0, "step": 1103 }, { "epoch": 0.5615462868769074, "grad_norm": 1.0607812404632568, "learning_rate": 1e-05, "loss": 0.5428, "mean_token_accuracy": 0.8328511714935303, "num_tokens": 176094548.0, "step": 1104 }, { "epoch": 0.5620549338758901, "grad_norm": 1.0297958850860596, "learning_rate": 1e-05, "loss": 0.525, "mean_token_accuracy": 0.8363398313522339, "num_tokens": 176249226.0, "step": 1105 }, { "epoch": 0.5625635808748728, "grad_norm": 0.9327942132949829, "learning_rate": 1e-05, "loss": 0.5301, "mean_token_accuracy": 0.8366088271141052, "num_tokens": 176419770.0, "step": 1106 }, { "epoch": 0.5630722278738556, "grad_norm": 1.3391690254211426, "learning_rate": 1e-05, "loss": 0.5127, "mean_token_accuracy": 0.8409700393676758, "num_tokens": 176579372.0, "step": 1107 }, { "epoch": 0.5635808748728383, "grad_norm": 1.161334753036499, "learning_rate": 1e-05, "loss": 0.55, "mean_token_accuracy": 0.830707848072052, "num_tokens": 176731375.0, "step": 1108 }, { "epoch": 0.5640895218718209, "grad_norm": 0.9866883158683777, "learning_rate": 1e-05, "loss": 0.5171, "mean_token_accuracy": 0.8403599262237549, "num_tokens": 176882380.0, "step": 1109 }, { "epoch": 0.5645981688708036, "grad_norm": 1.0278397798538208, "learning_rate": 1e-05, "loss": 0.5141, "mean_token_accuracy": 0.8405447006225586, "num_tokens": 177037040.0, "step": 1110 }, { "epoch": 0.5651068158697864, "grad_norm": 1.0140935182571411, "learning_rate": 1e-05, "loss": 0.5535, "mean_token_accuracy": 0.8283101320266724, "num_tokens": 177193855.0, "step": 1111 }, { "epoch": 0.5656154628687691, "grad_norm": 1.0519375801086426, "learning_rate": 1e-05, "loss": 0.5331, "mean_token_accuracy": 0.8352972269058228, "num_tokens": 177353869.0, "step": 1112 }, { "epoch": 0.5661241098677517, "grad_norm": 1.0424383878707886, "learning_rate": 1e-05, "loss": 0.5224, "mean_token_accuracy": 0.8369056582450867, "num_tokens": 177503862.0, "step": 1113 }, { "epoch": 0.5666327568667345, "grad_norm": 1.0207585096359253, "learning_rate": 1e-05, "loss": 0.5019, "mean_token_accuracy": 0.8427259922027588, "num_tokens": 177663239.0, "step": 1114 }, { "epoch": 0.5671414038657172, "grad_norm": 1.1228545904159546, "learning_rate": 1e-05, "loss": 0.5125, "mean_token_accuracy": 0.8402339220046997, "num_tokens": 177822738.0, "step": 1115 }, { "epoch": 0.5676500508646999, "grad_norm": 1.2176703214645386, "learning_rate": 1e-05, "loss": 0.5286, "mean_token_accuracy": 0.836188793182373, "num_tokens": 177989713.0, "step": 1116 }, { "epoch": 0.5681586978636826, "grad_norm": 1.0226936340332031, "learning_rate": 1e-05, "loss": 0.542, "mean_token_accuracy": 0.8335666656494141, "num_tokens": 178140529.0, "step": 1117 }, { "epoch": 0.5686673448626653, "grad_norm": 0.9989066123962402, "learning_rate": 1e-05, "loss": 0.5325, "mean_token_accuracy": 0.8349342942237854, "num_tokens": 178311252.0, "step": 1118 }, { "epoch": 0.569175991861648, "grad_norm": 1.1514434814453125, "learning_rate": 1e-05, "loss": 0.5541, "mean_token_accuracy": 0.828987181186676, "num_tokens": 178475164.0, "step": 1119 }, { "epoch": 0.5696846388606307, "grad_norm": 1.0415675640106201, "learning_rate": 1e-05, "loss": 0.5576, "mean_token_accuracy": 0.8286171555519104, "num_tokens": 178635634.0, "step": 1120 }, { "epoch": 0.5701932858596134, "grad_norm": 1.0814350843429565, "learning_rate": 1e-05, "loss": 0.557, "mean_token_accuracy": 0.8278229236602783, "num_tokens": 178786348.0, "step": 1121 }, { "epoch": 0.5707019328585962, "grad_norm": 0.9982712268829346, "learning_rate": 1e-05, "loss": 0.5362, "mean_token_accuracy": 0.8349918127059937, "num_tokens": 178948373.0, "step": 1122 }, { "epoch": 0.5712105798575788, "grad_norm": 1.150430679321289, "learning_rate": 1e-05, "loss": 0.4993, "mean_token_accuracy": 0.8427037000656128, "num_tokens": 179104122.0, "step": 1123 }, { "epoch": 0.5717192268565615, "grad_norm": 0.967299222946167, "learning_rate": 1e-05, "loss": 0.5227, "mean_token_accuracy": 0.8367307782173157, "num_tokens": 179265598.0, "step": 1124 }, { "epoch": 0.5722278738555443, "grad_norm": 0.9581523537635803, "learning_rate": 1e-05, "loss": 0.5429, "mean_token_accuracy": 0.8328096866607666, "num_tokens": 179417035.0, "step": 1125 }, { "epoch": 0.572736520854527, "grad_norm": 1.031461477279663, "learning_rate": 1e-05, "loss": 0.5154, "mean_token_accuracy": 0.8411537408828735, "num_tokens": 179577100.0, "step": 1126 }, { "epoch": 0.5732451678535097, "grad_norm": 0.9838495254516602, "learning_rate": 1e-05, "loss": 0.5457, "mean_token_accuracy": 0.8309280872344971, "num_tokens": 179744420.0, "step": 1127 }, { "epoch": 0.5737538148524923, "grad_norm": 1.0091938972473145, "learning_rate": 1e-05, "loss": 0.5306, "mean_token_accuracy": 0.8353382349014282, "num_tokens": 179892505.0, "step": 1128 }, { "epoch": 0.5742624618514751, "grad_norm": 1.0297493934631348, "learning_rate": 1e-05, "loss": 0.5522, "mean_token_accuracy": 0.8297716379165649, "num_tokens": 180051796.0, "step": 1129 }, { "epoch": 0.5747711088504578, "grad_norm": 0.9671316742897034, "learning_rate": 1e-05, "loss": 0.5245, "mean_token_accuracy": 0.8383359313011169, "num_tokens": 180223161.0, "step": 1130 }, { "epoch": 0.5752797558494405, "grad_norm": 0.9914503693580627, "learning_rate": 1e-05, "loss": 0.5439, "mean_token_accuracy": 0.8331002593040466, "num_tokens": 180381038.0, "step": 1131 }, { "epoch": 0.5757884028484231, "grad_norm": 1.0318700075149536, "learning_rate": 1e-05, "loss": 0.5333, "mean_token_accuracy": 0.8343907594680786, "num_tokens": 180529292.0, "step": 1132 }, { "epoch": 0.5762970498474059, "grad_norm": 1.076903223991394, "learning_rate": 1e-05, "loss": 0.5127, "mean_token_accuracy": 0.8403801918029785, "num_tokens": 180673192.0, "step": 1133 }, { "epoch": 0.5768056968463886, "grad_norm": 1.013776183128357, "learning_rate": 1e-05, "loss": 0.5254, "mean_token_accuracy": 0.8366549015045166, "num_tokens": 180831990.0, "step": 1134 }, { "epoch": 0.5773143438453713, "grad_norm": 0.9801108241081238, "learning_rate": 1e-05, "loss": 0.5405, "mean_token_accuracy": 0.8329926133155823, "num_tokens": 180994135.0, "step": 1135 }, { "epoch": 0.5778229908443541, "grad_norm": 1.0288931131362915, "learning_rate": 1e-05, "loss": 0.5425, "mean_token_accuracy": 0.8331727981567383, "num_tokens": 181158179.0, "step": 1136 }, { "epoch": 0.5783316378433367, "grad_norm": 0.9568202495574951, "learning_rate": 1e-05, "loss": 0.5099, "mean_token_accuracy": 0.8413141965866089, "num_tokens": 181316469.0, "step": 1137 }, { "epoch": 0.5788402848423194, "grad_norm": 0.9759686589241028, "learning_rate": 1e-05, "loss": 0.5088, "mean_token_accuracy": 0.8403933048248291, "num_tokens": 181465876.0, "step": 1138 }, { "epoch": 0.5793489318413021, "grad_norm": 0.9994838237762451, "learning_rate": 1e-05, "loss": 0.5034, "mean_token_accuracy": 0.842954695224762, "num_tokens": 181626190.0, "step": 1139 }, { "epoch": 0.5798575788402849, "grad_norm": 1.0594063997268677, "learning_rate": 1e-05, "loss": 0.5766, "mean_token_accuracy": 0.8225234746932983, "num_tokens": 181772486.0, "step": 1140 }, { "epoch": 0.5803662258392676, "grad_norm": 0.9984065294265747, "learning_rate": 1e-05, "loss": 0.537, "mean_token_accuracy": 0.8352517485618591, "num_tokens": 181930828.0, "step": 1141 }, { "epoch": 0.5808748728382502, "grad_norm": 0.9843501448631287, "learning_rate": 1e-05, "loss": 0.5304, "mean_token_accuracy": 0.8349761366844177, "num_tokens": 182092553.0, "step": 1142 }, { "epoch": 0.5813835198372329, "grad_norm": 1.0099085569381714, "learning_rate": 1e-05, "loss": 0.5352, "mean_token_accuracy": 0.834098219871521, "num_tokens": 182242889.0, "step": 1143 }, { "epoch": 0.5818921668362157, "grad_norm": 1.0297266244888306, "learning_rate": 1e-05, "loss": 0.5475, "mean_token_accuracy": 0.8309882879257202, "num_tokens": 182399236.0, "step": 1144 }, { "epoch": 0.5824008138351984, "grad_norm": 0.9727159142494202, "learning_rate": 1e-05, "loss": 0.5545, "mean_token_accuracy": 0.8294179439544678, "num_tokens": 182555200.0, "step": 1145 }, { "epoch": 0.582909460834181, "grad_norm": 1.0565613508224487, "learning_rate": 1e-05, "loss": 0.5345, "mean_token_accuracy": 0.8345987796783447, "num_tokens": 182711095.0, "step": 1146 }, { "epoch": 0.5834181078331638, "grad_norm": 0.9976463317871094, "learning_rate": 1e-05, "loss": 0.5551, "mean_token_accuracy": 0.8283690810203552, "num_tokens": 182884563.0, "step": 1147 }, { "epoch": 0.5839267548321465, "grad_norm": 1.0735167264938354, "learning_rate": 1e-05, "loss": 0.5702, "mean_token_accuracy": 0.8263883590698242, "num_tokens": 183045279.0, "step": 1148 }, { "epoch": 0.5844354018311292, "grad_norm": 1.0010398626327515, "learning_rate": 1e-05, "loss": 0.535, "mean_token_accuracy": 0.8358119130134583, "num_tokens": 183209431.0, "step": 1149 }, { "epoch": 0.5849440488301119, "grad_norm": 1.0518771409988403, "learning_rate": 1e-05, "loss": 0.5117, "mean_token_accuracy": 0.8412382006645203, "num_tokens": 183377751.0, "step": 1150 }, { "epoch": 0.5854526958290946, "grad_norm": 1.0084384679794312, "learning_rate": 1e-05, "loss": 0.514, "mean_token_accuracy": 0.8410466313362122, "num_tokens": 183530201.0, "step": 1151 }, { "epoch": 0.5859613428280773, "grad_norm": 1.0706181526184082, "learning_rate": 1e-05, "loss": 0.5386, "mean_token_accuracy": 0.8333868980407715, "num_tokens": 183676743.0, "step": 1152 }, { "epoch": 0.58646998982706, "grad_norm": 0.9817941188812256, "learning_rate": 1e-05, "loss": 0.5193, "mean_token_accuracy": 0.8401201963424683, "num_tokens": 183838597.0, "step": 1153 }, { "epoch": 0.5869786368260427, "grad_norm": 1.0238702297210693, "learning_rate": 1e-05, "loss": 0.5262, "mean_token_accuracy": 0.8377246260643005, "num_tokens": 184006491.0, "step": 1154 }, { "epoch": 0.5874872838250255, "grad_norm": 0.975884199142456, "learning_rate": 1e-05, "loss": 0.5243, "mean_token_accuracy": 0.8374344706535339, "num_tokens": 184170537.0, "step": 1155 }, { "epoch": 0.5879959308240081, "grad_norm": 1.1449670791625977, "learning_rate": 1e-05, "loss": 0.5142, "mean_token_accuracy": 0.8390841484069824, "num_tokens": 184334536.0, "step": 1156 }, { "epoch": 0.5885045778229908, "grad_norm": 1.1105258464813232, "learning_rate": 1e-05, "loss": 0.5109, "mean_token_accuracy": 0.8375200033187866, "num_tokens": 184489719.0, "step": 1157 }, { "epoch": 0.5890132248219736, "grad_norm": 1.1190193891525269, "learning_rate": 1e-05, "loss": 0.5498, "mean_token_accuracy": 0.8310710191726685, "num_tokens": 184644555.0, "step": 1158 }, { "epoch": 0.5895218718209563, "grad_norm": 1.0898367166519165, "learning_rate": 1e-05, "loss": 0.5203, "mean_token_accuracy": 0.8381561040878296, "num_tokens": 184803771.0, "step": 1159 }, { "epoch": 0.590030518819939, "grad_norm": 1.0364891290664673, "learning_rate": 1e-05, "loss": 0.5385, "mean_token_accuracy": 0.832038164138794, "num_tokens": 184956764.0, "step": 1160 }, { "epoch": 0.5905391658189216, "grad_norm": 1.103318691253662, "learning_rate": 1e-05, "loss": 0.51, "mean_token_accuracy": 0.8410016298294067, "num_tokens": 185122654.0, "step": 1161 }, { "epoch": 0.5910478128179044, "grad_norm": 1.0777701139450073, "learning_rate": 1e-05, "loss": 0.5444, "mean_token_accuracy": 0.8325847387313843, "num_tokens": 185278874.0, "step": 1162 }, { "epoch": 0.5915564598168871, "grad_norm": 1.1580455303192139, "learning_rate": 1e-05, "loss": 0.5218, "mean_token_accuracy": 0.8376891613006592, "num_tokens": 185438919.0, "step": 1163 }, { "epoch": 0.5920651068158698, "grad_norm": 1.1831949949264526, "learning_rate": 1e-05, "loss": 0.5168, "mean_token_accuracy": 0.8394304513931274, "num_tokens": 185603559.0, "step": 1164 }, { "epoch": 0.5925737538148524, "grad_norm": 1.07766592502594, "learning_rate": 1e-05, "loss": 0.5117, "mean_token_accuracy": 0.8413243293762207, "num_tokens": 185745740.0, "step": 1165 }, { "epoch": 0.5930824008138352, "grad_norm": 1.0141584873199463, "learning_rate": 1e-05, "loss": 0.543, "mean_token_accuracy": 0.8322431445121765, "num_tokens": 185904158.0, "step": 1166 }, { "epoch": 0.5935910478128179, "grad_norm": 1.1730612516403198, "learning_rate": 1e-05, "loss": 0.53, "mean_token_accuracy": 0.8371715545654297, "num_tokens": 186062692.0, "step": 1167 }, { "epoch": 0.5940996948118006, "grad_norm": 1.1393283605575562, "learning_rate": 1e-05, "loss": 0.5071, "mean_token_accuracy": 0.8407073020935059, "num_tokens": 186208990.0, "step": 1168 }, { "epoch": 0.5946083418107834, "grad_norm": 1.0351285934448242, "learning_rate": 1e-05, "loss": 0.5224, "mean_token_accuracy": 0.8370941877365112, "num_tokens": 186353260.0, "step": 1169 }, { "epoch": 0.595116988809766, "grad_norm": 1.1622053384780884, "learning_rate": 1e-05, "loss": 0.5739, "mean_token_accuracy": 0.8237625360488892, "num_tokens": 186527391.0, "step": 1170 }, { "epoch": 0.5956256358087487, "grad_norm": 1.1217572689056396, "learning_rate": 1e-05, "loss": 0.544, "mean_token_accuracy": 0.8346309065818787, "num_tokens": 186690975.0, "step": 1171 }, { "epoch": 0.5961342828077314, "grad_norm": 1.0588243007659912, "learning_rate": 1e-05, "loss": 0.4896, "mean_token_accuracy": 0.8487622737884521, "num_tokens": 186851711.0, "step": 1172 }, { "epoch": 0.5966429298067142, "grad_norm": 1.043603539466858, "learning_rate": 1e-05, "loss": 0.5245, "mean_token_accuracy": 0.8333913087844849, "num_tokens": 187016950.0, "step": 1173 }, { "epoch": 0.5971515768056969, "grad_norm": 1.1444929838180542, "learning_rate": 1e-05, "loss": 0.5408, "mean_token_accuracy": 0.8336371183395386, "num_tokens": 187158410.0, "step": 1174 }, { "epoch": 0.5976602238046795, "grad_norm": 1.0118918418884277, "learning_rate": 1e-05, "loss": 0.4959, "mean_token_accuracy": 0.8448071479797363, "num_tokens": 187319518.0, "step": 1175 }, { "epoch": 0.5981688708036622, "grad_norm": 1.0308904647827148, "learning_rate": 1e-05, "loss": 0.5207, "mean_token_accuracy": 0.8391602039337158, "num_tokens": 187492387.0, "step": 1176 }, { "epoch": 0.598677517802645, "grad_norm": 1.0049819946289062, "learning_rate": 1e-05, "loss": 0.5559, "mean_token_accuracy": 0.8291609287261963, "num_tokens": 187660820.0, "step": 1177 }, { "epoch": 0.5991861648016277, "grad_norm": 1.1178535223007202, "learning_rate": 1e-05, "loss": 0.5368, "mean_token_accuracy": 0.8333885073661804, "num_tokens": 187815200.0, "step": 1178 }, { "epoch": 0.5996948118006104, "grad_norm": 1.0717005729675293, "learning_rate": 1e-05, "loss": 0.5124, "mean_token_accuracy": 0.8403778076171875, "num_tokens": 187976720.0, "step": 1179 }, { "epoch": 0.6002034587995931, "grad_norm": 0.9705209136009216, "learning_rate": 1e-05, "loss": 0.5009, "mean_token_accuracy": 0.8434971570968628, "num_tokens": 188141681.0, "step": 1180 }, { "epoch": 0.6007121057985758, "grad_norm": 1.0719575881958008, "learning_rate": 1e-05, "loss": 0.5254, "mean_token_accuracy": 0.8383431434631348, "num_tokens": 188306729.0, "step": 1181 }, { "epoch": 0.6012207527975585, "grad_norm": 0.9925136566162109, "learning_rate": 1e-05, "loss": 0.5582, "mean_token_accuracy": 0.82879239320755, "num_tokens": 188459816.0, "step": 1182 }, { "epoch": 0.6017293997965412, "grad_norm": 1.0106931924819946, "learning_rate": 1e-05, "loss": 0.5462, "mean_token_accuracy": 0.82989501953125, "num_tokens": 188624442.0, "step": 1183 }, { "epoch": 0.602238046795524, "grad_norm": 0.9931594729423523, "learning_rate": 1e-05, "loss": 0.5471, "mean_token_accuracy": 0.8313771486282349, "num_tokens": 188793110.0, "step": 1184 }, { "epoch": 0.6027466937945066, "grad_norm": 0.9614710807800293, "learning_rate": 1e-05, "loss": 0.5012, "mean_token_accuracy": 0.8416813611984253, "num_tokens": 188959411.0, "step": 1185 }, { "epoch": 0.6032553407934893, "grad_norm": 1.049680471420288, "learning_rate": 1e-05, "loss": 0.524, "mean_token_accuracy": 0.8369648456573486, "num_tokens": 189123138.0, "step": 1186 }, { "epoch": 0.603763987792472, "grad_norm": 1.0057904720306396, "learning_rate": 1e-05, "loss": 0.52, "mean_token_accuracy": 0.8387330770492554, "num_tokens": 189288712.0, "step": 1187 }, { "epoch": 0.6042726347914548, "grad_norm": 1.028718113899231, "learning_rate": 1e-05, "loss": 0.5155, "mean_token_accuracy": 0.839519739151001, "num_tokens": 189448908.0, "step": 1188 }, { "epoch": 0.6047812817904374, "grad_norm": 1.1342430114746094, "learning_rate": 1e-05, "loss": 0.5422, "mean_token_accuracy": 0.8313463926315308, "num_tokens": 189601380.0, "step": 1189 }, { "epoch": 0.6052899287894201, "grad_norm": 1.1675688028335571, "learning_rate": 1e-05, "loss": 0.5418, "mean_token_accuracy": 0.8315528631210327, "num_tokens": 189760563.0, "step": 1190 }, { "epoch": 0.6057985757884028, "grad_norm": 1.0995194911956787, "learning_rate": 1e-05, "loss": 0.5329, "mean_token_accuracy": 0.8341425657272339, "num_tokens": 189919046.0, "step": 1191 }, { "epoch": 0.6063072227873856, "grad_norm": 1.0976941585540771, "learning_rate": 1e-05, "loss": 0.5386, "mean_token_accuracy": 0.8345488905906677, "num_tokens": 190067307.0, "step": 1192 }, { "epoch": 0.6068158697863683, "grad_norm": 1.0513907670974731, "learning_rate": 1e-05, "loss": 0.5177, "mean_token_accuracy": 0.8396793603897095, "num_tokens": 190216290.0, "step": 1193 }, { "epoch": 0.6073245167853509, "grad_norm": 1.115285873413086, "learning_rate": 1e-05, "loss": 0.522, "mean_token_accuracy": 0.8372184038162231, "num_tokens": 190372698.0, "step": 1194 }, { "epoch": 0.6078331637843337, "grad_norm": 1.105042815208435, "learning_rate": 1e-05, "loss": 0.5214, "mean_token_accuracy": 0.8376275300979614, "num_tokens": 190528564.0, "step": 1195 }, { "epoch": 0.6083418107833164, "grad_norm": 1.1105037927627563, "learning_rate": 1e-05, "loss": 0.5507, "mean_token_accuracy": 0.8296712040901184, "num_tokens": 190679521.0, "step": 1196 }, { "epoch": 0.6088504577822991, "grad_norm": 1.072302222251892, "learning_rate": 1e-05, "loss": 0.5351, "mean_token_accuracy": 0.8351011276245117, "num_tokens": 190826754.0, "step": 1197 }, { "epoch": 0.6093591047812817, "grad_norm": 1.2463853359222412, "learning_rate": 1e-05, "loss": 0.5272, "mean_token_accuracy": 0.837123692035675, "num_tokens": 190990350.0, "step": 1198 }, { "epoch": 0.6098677517802645, "grad_norm": 1.14852774143219, "learning_rate": 1e-05, "loss": 0.5544, "mean_token_accuracy": 0.8295097351074219, "num_tokens": 191150121.0, "step": 1199 }, { "epoch": 0.6103763987792472, "grad_norm": 1.1386868953704834, "learning_rate": 1e-05, "loss": 0.5304, "mean_token_accuracy": 0.8363741040229797, "num_tokens": 191298356.0, "step": 1200 }, { "epoch": 0.6108850457782299, "grad_norm": 1.0642979145050049, "learning_rate": 1e-05, "loss": 0.5082, "mean_token_accuracy": 0.8421168327331543, "num_tokens": 191454560.0, "step": 1201 }, { "epoch": 0.6113936927772126, "grad_norm": 1.0878254175186157, "learning_rate": 1e-05, "loss": 0.4906, "mean_token_accuracy": 0.8473243713378906, "num_tokens": 191607061.0, "step": 1202 }, { "epoch": 0.6119023397761953, "grad_norm": 1.0132228136062622, "learning_rate": 1e-05, "loss": 0.5272, "mean_token_accuracy": 0.835957407951355, "num_tokens": 191764683.0, "step": 1203 }, { "epoch": 0.612410986775178, "grad_norm": 1.0476408004760742, "learning_rate": 1e-05, "loss": 0.5188, "mean_token_accuracy": 0.8387986421585083, "num_tokens": 191920982.0, "step": 1204 }, { "epoch": 0.6129196337741607, "grad_norm": 1.0326831340789795, "learning_rate": 1e-05, "loss": 0.516, "mean_token_accuracy": 0.8392593860626221, "num_tokens": 192084464.0, "step": 1205 }, { "epoch": 0.6134282807731435, "grad_norm": 1.0202337503433228, "learning_rate": 1e-05, "loss": 0.5359, "mean_token_accuracy": 0.8356767892837524, "num_tokens": 192244544.0, "step": 1206 }, { "epoch": 0.6139369277721262, "grad_norm": 1.1243996620178223, "learning_rate": 1e-05, "loss": 0.5127, "mean_token_accuracy": 0.8402092456817627, "num_tokens": 192398895.0, "step": 1207 }, { "epoch": 0.6144455747711088, "grad_norm": 1.0691757202148438, "learning_rate": 1e-05, "loss": 0.5592, "mean_token_accuracy": 0.8307750225067139, "num_tokens": 192568952.0, "step": 1208 }, { "epoch": 0.6149542217700915, "grad_norm": 1.1174930334091187, "learning_rate": 1e-05, "loss": 0.5477, "mean_token_accuracy": 0.8309003114700317, "num_tokens": 192726852.0, "step": 1209 }, { "epoch": 0.6154628687690743, "grad_norm": 1.0564061403274536, "learning_rate": 1e-05, "loss": 0.4912, "mean_token_accuracy": 0.8462415337562561, "num_tokens": 192871340.0, "step": 1210 }, { "epoch": 0.615971515768057, "grad_norm": 1.022898554801941, "learning_rate": 1e-05, "loss": 0.5056, "mean_token_accuracy": 0.8419616222381592, "num_tokens": 193042111.0, "step": 1211 }, { "epoch": 0.6164801627670397, "grad_norm": 1.1113489866256714, "learning_rate": 1e-05, "loss": 0.5367, "mean_token_accuracy": 0.8345298767089844, "num_tokens": 193194025.0, "step": 1212 }, { "epoch": 0.6169888097660223, "grad_norm": 1.1346272230148315, "learning_rate": 1e-05, "loss": 0.4942, "mean_token_accuracy": 0.8465094566345215, "num_tokens": 193349718.0, "step": 1213 }, { "epoch": 0.6174974567650051, "grad_norm": 1.0465701818466187, "learning_rate": 1e-05, "loss": 0.5306, "mean_token_accuracy": 0.8361408710479736, "num_tokens": 193505599.0, "step": 1214 }, { "epoch": 0.6180061037639878, "grad_norm": 1.014984130859375, "learning_rate": 1e-05, "loss": 0.5698, "mean_token_accuracy": 0.8271299600601196, "num_tokens": 193681994.0, "step": 1215 }, { "epoch": 0.6185147507629705, "grad_norm": 0.9723906517028809, "learning_rate": 1e-05, "loss": 0.548, "mean_token_accuracy": 0.8309305906295776, "num_tokens": 193846107.0, "step": 1216 }, { "epoch": 0.6190233977619533, "grad_norm": 1.0247881412506104, "learning_rate": 1e-05, "loss": 0.5244, "mean_token_accuracy": 0.8384035229682922, "num_tokens": 194001101.0, "step": 1217 }, { "epoch": 0.6195320447609359, "grad_norm": 0.9313552379608154, "learning_rate": 1e-05, "loss": 0.4889, "mean_token_accuracy": 0.846920907497406, "num_tokens": 194162440.0, "step": 1218 }, { "epoch": 0.6200406917599186, "grad_norm": 0.9980710744857788, "learning_rate": 1e-05, "loss": 0.5287, "mean_token_accuracy": 0.8353384137153625, "num_tokens": 194316830.0, "step": 1219 }, { "epoch": 0.6205493387589013, "grad_norm": 1.0040054321289062, "learning_rate": 1e-05, "loss": 0.5096, "mean_token_accuracy": 0.8394368886947632, "num_tokens": 194471827.0, "step": 1220 }, { "epoch": 0.6210579857578841, "grad_norm": 1.1046159267425537, "learning_rate": 1e-05, "loss": 0.5272, "mean_token_accuracy": 0.8353234529495239, "num_tokens": 194631877.0, "step": 1221 }, { "epoch": 0.6215666327568667, "grad_norm": 1.0001643896102905, "learning_rate": 1e-05, "loss": 0.5218, "mean_token_accuracy": 0.8372697234153748, "num_tokens": 194798176.0, "step": 1222 }, { "epoch": 0.6220752797558494, "grad_norm": 1.0163205862045288, "learning_rate": 1e-05, "loss": 0.5319, "mean_token_accuracy": 0.8357371687889099, "num_tokens": 194954556.0, "step": 1223 }, { "epoch": 0.6225839267548321, "grad_norm": 1.1208192110061646, "learning_rate": 1e-05, "loss": 0.5558, "mean_token_accuracy": 0.8277769088745117, "num_tokens": 195115010.0, "step": 1224 }, { "epoch": 0.6230925737538149, "grad_norm": 0.9982933402061462, "learning_rate": 1e-05, "loss": 0.5043, "mean_token_accuracy": 0.844214677810669, "num_tokens": 195277332.0, "step": 1225 }, { "epoch": 0.6236012207527976, "grad_norm": 1.0954415798187256, "learning_rate": 1e-05, "loss": 0.5065, "mean_token_accuracy": 0.8418806195259094, "num_tokens": 195442166.0, "step": 1226 }, { "epoch": 0.6241098677517802, "grad_norm": 0.956048846244812, "learning_rate": 1e-05, "loss": 0.5032, "mean_token_accuracy": 0.8432636857032776, "num_tokens": 195598203.0, "step": 1227 }, { "epoch": 0.624618514750763, "grad_norm": 1.0685564279556274, "learning_rate": 1e-05, "loss": 0.5075, "mean_token_accuracy": 0.842692494392395, "num_tokens": 195755945.0, "step": 1228 }, { "epoch": 0.6251271617497457, "grad_norm": 0.9353527426719666, "learning_rate": 1e-05, "loss": 0.508, "mean_token_accuracy": 0.8419412970542908, "num_tokens": 195923721.0, "step": 1229 }, { "epoch": 0.6256358087487284, "grad_norm": 1.064367651939392, "learning_rate": 1e-05, "loss": 0.4982, "mean_token_accuracy": 0.8454999327659607, "num_tokens": 196076642.0, "step": 1230 }, { "epoch": 0.626144455747711, "grad_norm": 1.070304274559021, "learning_rate": 1e-05, "loss": 0.5258, "mean_token_accuracy": 0.8377371430397034, "num_tokens": 196232622.0, "step": 1231 }, { "epoch": 0.6266531027466938, "grad_norm": 1.0615029335021973, "learning_rate": 1e-05, "loss": 0.5459, "mean_token_accuracy": 0.8300840258598328, "num_tokens": 196384640.0, "step": 1232 }, { "epoch": 0.6271617497456765, "grad_norm": 1.0311565399169922, "learning_rate": 1e-05, "loss": 0.562, "mean_token_accuracy": 0.8277008533477783, "num_tokens": 196546059.0, "step": 1233 }, { "epoch": 0.6276703967446592, "grad_norm": 1.0338419675827026, "learning_rate": 1e-05, "loss": 0.5517, "mean_token_accuracy": 0.831037700176239, "num_tokens": 196709416.0, "step": 1234 }, { "epoch": 0.6281790437436419, "grad_norm": 0.9907321333885193, "learning_rate": 1e-05, "loss": 0.5331, "mean_token_accuracy": 0.8356142640113831, "num_tokens": 196874595.0, "step": 1235 }, { "epoch": 0.6286876907426246, "grad_norm": 1.000959873199463, "learning_rate": 1e-05, "loss": 0.5046, "mean_token_accuracy": 0.8412952423095703, "num_tokens": 197038688.0, "step": 1236 }, { "epoch": 0.6291963377416073, "grad_norm": 1.143560767173767, "learning_rate": 1e-05, "loss": 0.5659, "mean_token_accuracy": 0.8249709606170654, "num_tokens": 197192465.0, "step": 1237 }, { "epoch": 0.62970498474059, "grad_norm": 1.0114355087280273, "learning_rate": 1e-05, "loss": 0.5446, "mean_token_accuracy": 0.8341284990310669, "num_tokens": 197361082.0, "step": 1238 }, { "epoch": 0.6302136317395728, "grad_norm": 1.0758358240127563, "learning_rate": 1e-05, "loss": 0.4813, "mean_token_accuracy": 0.8505460023880005, "num_tokens": 197512368.0, "step": 1239 }, { "epoch": 0.6307222787385555, "grad_norm": 1.033604621887207, "learning_rate": 1e-05, "loss": 0.547, "mean_token_accuracy": 0.8309687972068787, "num_tokens": 197654672.0, "step": 1240 }, { "epoch": 0.6312309257375381, "grad_norm": 1.1330300569534302, "learning_rate": 1e-05, "loss": 0.5497, "mean_token_accuracy": 0.830880880355835, "num_tokens": 197816279.0, "step": 1241 }, { "epoch": 0.6317395727365208, "grad_norm": 1.0106984376907349, "learning_rate": 1e-05, "loss": 0.5049, "mean_token_accuracy": 0.8433741331100464, "num_tokens": 197972971.0, "step": 1242 }, { "epoch": 0.6322482197355036, "grad_norm": 1.062400460243225, "learning_rate": 1e-05, "loss": 0.5176, "mean_token_accuracy": 0.8375513553619385, "num_tokens": 198125745.0, "step": 1243 }, { "epoch": 0.6327568667344863, "grad_norm": 0.9587567448616028, "learning_rate": 1e-05, "loss": 0.48, "mean_token_accuracy": 0.8493208885192871, "num_tokens": 198288270.0, "step": 1244 }, { "epoch": 0.633265513733469, "grad_norm": 1.0087758302688599, "learning_rate": 1e-05, "loss": 0.5336, "mean_token_accuracy": 0.834213137626648, "num_tokens": 198445798.0, "step": 1245 }, { "epoch": 0.6337741607324516, "grad_norm": 1.089056134223938, "learning_rate": 1e-05, "loss": 0.5013, "mean_token_accuracy": 0.8421186208724976, "num_tokens": 198600564.0, "step": 1246 }, { "epoch": 0.6342828077314344, "grad_norm": 0.9666113257408142, "learning_rate": 1e-05, "loss": 0.4992, "mean_token_accuracy": 0.8449921607971191, "num_tokens": 198768145.0, "step": 1247 }, { "epoch": 0.6347914547304171, "grad_norm": 1.0998759269714355, "learning_rate": 1e-05, "loss": 0.5058, "mean_token_accuracy": 0.8415104150772095, "num_tokens": 198919816.0, "step": 1248 }, { "epoch": 0.6353001017293998, "grad_norm": 1.020984411239624, "learning_rate": 1e-05, "loss": 0.5534, "mean_token_accuracy": 0.8289154767990112, "num_tokens": 199084288.0, "step": 1249 }, { "epoch": 0.6358087487283826, "grad_norm": 2.9039652347564697, "learning_rate": 1e-05, "loss": 0.5429, "mean_token_accuracy": 0.8318288922309875, "num_tokens": 199234621.0, "step": 1250 }, { "epoch": 0.6363173957273652, "grad_norm": 1.0918387174606323, "learning_rate": 1e-05, "loss": 0.5136, "mean_token_accuracy": 0.8401831388473511, "num_tokens": 199415992.0, "step": 1251 }, { "epoch": 0.6368260427263479, "grad_norm": 0.9810324311256409, "learning_rate": 1e-05, "loss": 0.5334, "mean_token_accuracy": 0.8348792791366577, "num_tokens": 199581880.0, "step": 1252 }, { "epoch": 0.6373346897253306, "grad_norm": 1.1713511943817139, "learning_rate": 1e-05, "loss": 0.5106, "mean_token_accuracy": 0.8398399949073792, "num_tokens": 199728425.0, "step": 1253 }, { "epoch": 0.6378433367243134, "grad_norm": 1.0506319999694824, "learning_rate": 1e-05, "loss": 0.5682, "mean_token_accuracy": 0.8250914216041565, "num_tokens": 199887035.0, "step": 1254 }, { "epoch": 0.638351983723296, "grad_norm": 1.083480715751648, "learning_rate": 1e-05, "loss": 0.5414, "mean_token_accuracy": 0.8333582878112793, "num_tokens": 200042873.0, "step": 1255 }, { "epoch": 0.6388606307222787, "grad_norm": 1.0766798257827759, "learning_rate": 1e-05, "loss": 0.5109, "mean_token_accuracy": 0.8416237831115723, "num_tokens": 200218153.0, "step": 1256 }, { "epoch": 0.6393692777212614, "grad_norm": 1.0037503242492676, "learning_rate": 1e-05, "loss": 0.5348, "mean_token_accuracy": 0.8360911011695862, "num_tokens": 200375972.0, "step": 1257 }, { "epoch": 0.6398779247202442, "grad_norm": 1.1143869161605835, "learning_rate": 1e-05, "loss": 0.5432, "mean_token_accuracy": 0.8323516845703125, "num_tokens": 200538925.0, "step": 1258 }, { "epoch": 0.6403865717192269, "grad_norm": 1.0148932933807373, "learning_rate": 1e-05, "loss": 0.5143, "mean_token_accuracy": 0.8395555019378662, "num_tokens": 200702936.0, "step": 1259 }, { "epoch": 0.6408952187182095, "grad_norm": 1.3639062643051147, "learning_rate": 1e-05, "loss": 0.5129, "mean_token_accuracy": 0.8397424221038818, "num_tokens": 200852605.0, "step": 1260 }, { "epoch": 0.6414038657171923, "grad_norm": 1.0762628316879272, "learning_rate": 1e-05, "loss": 0.5116, "mean_token_accuracy": 0.8400771617889404, "num_tokens": 201011038.0, "step": 1261 }, { "epoch": 0.641912512716175, "grad_norm": 1.1453301906585693, "learning_rate": 1e-05, "loss": 0.5425, "mean_token_accuracy": 0.8312779068946838, "num_tokens": 201171353.0, "step": 1262 }, { "epoch": 0.6424211597151577, "grad_norm": 0.977043628692627, "learning_rate": 1e-05, "loss": 0.5493, "mean_token_accuracy": 0.8298898935317993, "num_tokens": 201335064.0, "step": 1263 }, { "epoch": 0.6429298067141404, "grad_norm": 1.070381999015808, "learning_rate": 1e-05, "loss": 0.5072, "mean_token_accuracy": 0.8416640758514404, "num_tokens": 201496911.0, "step": 1264 }, { "epoch": 0.6434384537131231, "grad_norm": 1.0922592878341675, "learning_rate": 1e-05, "loss": 0.5333, "mean_token_accuracy": 0.8351348042488098, "num_tokens": 201650881.0, "step": 1265 }, { "epoch": 0.6439471007121058, "grad_norm": 1.032348394393921, "learning_rate": 1e-05, "loss": 0.5552, "mean_token_accuracy": 0.8285496830940247, "num_tokens": 201822744.0, "step": 1266 }, { "epoch": 0.6444557477110885, "grad_norm": 0.9909312129020691, "learning_rate": 1e-05, "loss": 0.5119, "mean_token_accuracy": 0.841721773147583, "num_tokens": 201988138.0, "step": 1267 }, { "epoch": 0.6449643947100712, "grad_norm": 0.9247465133666992, "learning_rate": 1e-05, "loss": 0.5111, "mean_token_accuracy": 0.8412255644798279, "num_tokens": 202163172.0, "step": 1268 }, { "epoch": 0.645473041709054, "grad_norm": 1.0801644325256348, "learning_rate": 1e-05, "loss": 0.5172, "mean_token_accuracy": 0.8394771814346313, "num_tokens": 202330354.0, "step": 1269 }, { "epoch": 0.6459816887080366, "grad_norm": 1.036254644393921, "learning_rate": 1e-05, "loss": 0.5362, "mean_token_accuracy": 0.8337914943695068, "num_tokens": 202477704.0, "step": 1270 }, { "epoch": 0.6464903357070193, "grad_norm": 1.0122811794281006, "learning_rate": 1e-05, "loss": 0.5626, "mean_token_accuracy": 0.8263964653015137, "num_tokens": 202638535.0, "step": 1271 }, { "epoch": 0.646998982706002, "grad_norm": 1.1269254684448242, "learning_rate": 1e-05, "loss": 0.5445, "mean_token_accuracy": 0.8313633799552917, "num_tokens": 202799169.0, "step": 1272 }, { "epoch": 0.6475076297049848, "grad_norm": 1.0821533203125, "learning_rate": 1e-05, "loss": 0.5237, "mean_token_accuracy": 0.8368152379989624, "num_tokens": 202948731.0, "step": 1273 }, { "epoch": 0.6480162767039674, "grad_norm": 1.0910481214523315, "learning_rate": 1e-05, "loss": 0.5229, "mean_token_accuracy": 0.8392539024353027, "num_tokens": 203103019.0, "step": 1274 }, { "epoch": 0.6485249237029501, "grad_norm": 1.1418254375457764, "learning_rate": 1e-05, "loss": 0.5554, "mean_token_accuracy": 0.8282456398010254, "num_tokens": 203258243.0, "step": 1275 }, { "epoch": 0.6490335707019329, "grad_norm": 1.0393235683441162, "learning_rate": 1e-05, "loss": 0.5194, "mean_token_accuracy": 0.8381021022796631, "num_tokens": 203417686.0, "step": 1276 }, { "epoch": 0.6495422177009156, "grad_norm": 1.012725830078125, "learning_rate": 1e-05, "loss": 0.5259, "mean_token_accuracy": 0.8369470238685608, "num_tokens": 203581925.0, "step": 1277 }, { "epoch": 0.6500508646998983, "grad_norm": 1.1534172296524048, "learning_rate": 1e-05, "loss": 0.5528, "mean_token_accuracy": 0.8291853666305542, "num_tokens": 203737547.0, "step": 1278 }, { "epoch": 0.6505595116988809, "grad_norm": 1.0724189281463623, "learning_rate": 1e-05, "loss": 0.5194, "mean_token_accuracy": 0.8382695913314819, "num_tokens": 203886501.0, "step": 1279 }, { "epoch": 0.6510681586978637, "grad_norm": 1.2702937126159668, "learning_rate": 1e-05, "loss": 0.5298, "mean_token_accuracy": 0.8344364166259766, "num_tokens": 204040525.0, "step": 1280 }, { "epoch": 0.6515768056968464, "grad_norm": 1.134999394416809, "learning_rate": 1e-05, "loss": 0.5134, "mean_token_accuracy": 0.839902400970459, "num_tokens": 204205921.0, "step": 1281 }, { "epoch": 0.6520854526958291, "grad_norm": 1.0884934663772583, "learning_rate": 1e-05, "loss": 0.5071, "mean_token_accuracy": 0.841905951499939, "num_tokens": 204371623.0, "step": 1282 }, { "epoch": 0.6525940996948117, "grad_norm": 1.0996270179748535, "learning_rate": 1e-05, "loss": 0.5095, "mean_token_accuracy": 0.8415507078170776, "num_tokens": 204531555.0, "step": 1283 }, { "epoch": 0.6531027466937945, "grad_norm": 1.0175594091415405, "learning_rate": 1e-05, "loss": 0.5171, "mean_token_accuracy": 0.8389270305633545, "num_tokens": 204695860.0, "step": 1284 }, { "epoch": 0.6536113936927772, "grad_norm": 1.1143662929534912, "learning_rate": 1e-05, "loss": 0.5131, "mean_token_accuracy": 0.8399682641029358, "num_tokens": 204849971.0, "step": 1285 }, { "epoch": 0.6541200406917599, "grad_norm": 1.0179085731506348, "learning_rate": 1e-05, "loss": 0.4899, "mean_token_accuracy": 0.8472344279289246, "num_tokens": 205022471.0, "step": 1286 }, { "epoch": 0.6546286876907427, "grad_norm": 1.0679529905319214, "learning_rate": 1e-05, "loss": 0.5025, "mean_token_accuracy": 0.8438515067100525, "num_tokens": 205191250.0, "step": 1287 }, { "epoch": 0.6551373346897253, "grad_norm": 1.1572391986846924, "learning_rate": 1e-05, "loss": 0.5431, "mean_token_accuracy": 0.8309118747711182, "num_tokens": 205338248.0, "step": 1288 }, { "epoch": 0.655645981688708, "grad_norm": 0.9996314644813538, "learning_rate": 1e-05, "loss": 0.5478, "mean_token_accuracy": 0.8296094536781311, "num_tokens": 205510777.0, "step": 1289 }, { "epoch": 0.6561546286876907, "grad_norm": 1.0826754570007324, "learning_rate": 1e-05, "loss": 0.538, "mean_token_accuracy": 0.832534909248352, "num_tokens": 205654689.0, "step": 1290 }, { "epoch": 0.6566632756866735, "grad_norm": 1.0576002597808838, "learning_rate": 1e-05, "loss": 0.5486, "mean_token_accuracy": 0.8308680057525635, "num_tokens": 205818547.0, "step": 1291 }, { "epoch": 0.6571719226856562, "grad_norm": 1.0578711032867432, "learning_rate": 1e-05, "loss": 0.5642, "mean_token_accuracy": 0.8268966674804688, "num_tokens": 205984399.0, "step": 1292 }, { "epoch": 0.6576805696846388, "grad_norm": 1.0611701011657715, "learning_rate": 1e-05, "loss": 0.5702, "mean_token_accuracy": 0.8249958157539368, "num_tokens": 206148673.0, "step": 1293 }, { "epoch": 0.6581892166836215, "grad_norm": 1.1365247964859009, "learning_rate": 1e-05, "loss": 0.5099, "mean_token_accuracy": 0.8411107063293457, "num_tokens": 206308245.0, "step": 1294 }, { "epoch": 0.6586978636826043, "grad_norm": 0.9704191088676453, "learning_rate": 1e-05, "loss": 0.4897, "mean_token_accuracy": 0.8457728624343872, "num_tokens": 206471247.0, "step": 1295 }, { "epoch": 0.659206510681587, "grad_norm": 1.032420039176941, "learning_rate": 1e-05, "loss": 0.5294, "mean_token_accuracy": 0.8350492119789124, "num_tokens": 206613022.0, "step": 1296 }, { "epoch": 0.6597151576805697, "grad_norm": 1.0824891328811646, "learning_rate": 1e-05, "loss": 0.534, "mean_token_accuracy": 0.832705557346344, "num_tokens": 206781121.0, "step": 1297 }, { "epoch": 0.6602238046795524, "grad_norm": 0.9917576909065247, "learning_rate": 1e-05, "loss": 0.5379, "mean_token_accuracy": 0.832955002784729, "num_tokens": 206935916.0, "step": 1298 }, { "epoch": 0.6607324516785351, "grad_norm": 1.120984435081482, "learning_rate": 1e-05, "loss": 0.4862, "mean_token_accuracy": 0.8467674851417542, "num_tokens": 207089114.0, "step": 1299 }, { "epoch": 0.6612410986775178, "grad_norm": 1.0345065593719482, "learning_rate": 1e-05, "loss": 0.5132, "mean_token_accuracy": 0.8396544456481934, "num_tokens": 207249857.0, "step": 1300 }, { "epoch": 0.6617497456765005, "grad_norm": 1.0298868417739868, "learning_rate": 1e-05, "loss": 0.5605, "mean_token_accuracy": 0.8278207778930664, "num_tokens": 207415519.0, "step": 1301 }, { "epoch": 0.6622583926754833, "grad_norm": 1.0737025737762451, "learning_rate": 1e-05, "loss": 0.5251, "mean_token_accuracy": 0.8366029262542725, "num_tokens": 207585946.0, "step": 1302 }, { "epoch": 0.6627670396744659, "grad_norm": 0.944338858127594, "learning_rate": 1e-05, "loss": 0.4942, "mean_token_accuracy": 0.844987154006958, "num_tokens": 207742283.0, "step": 1303 }, { "epoch": 0.6632756866734486, "grad_norm": 0.9760245084762573, "learning_rate": 1e-05, "loss": 0.4936, "mean_token_accuracy": 0.8455411791801453, "num_tokens": 207905907.0, "step": 1304 }, { "epoch": 0.6637843336724313, "grad_norm": 1.0034639835357666, "learning_rate": 1e-05, "loss": 0.4984, "mean_token_accuracy": 0.8437528610229492, "num_tokens": 208066296.0, "step": 1305 }, { "epoch": 0.6642929806714141, "grad_norm": 0.991218626499176, "learning_rate": 1e-05, "loss": 0.5013, "mean_token_accuracy": 0.8421469926834106, "num_tokens": 208221915.0, "step": 1306 }, { "epoch": 0.6648016276703967, "grad_norm": 1.1763343811035156, "learning_rate": 1e-05, "loss": 0.4764, "mean_token_accuracy": 0.8507419228553772, "num_tokens": 208374510.0, "step": 1307 }, { "epoch": 0.6653102746693794, "grad_norm": 1.0376806259155273, "learning_rate": 1e-05, "loss": 0.5323, "mean_token_accuracy": 0.8359139561653137, "num_tokens": 208541953.0, "step": 1308 }, { "epoch": 0.6658189216683622, "grad_norm": 1.2217199802398682, "learning_rate": 1e-05, "loss": 0.5134, "mean_token_accuracy": 0.8390167355537415, "num_tokens": 208698721.0, "step": 1309 }, { "epoch": 0.6663275686673449, "grad_norm": 1.1456576585769653, "learning_rate": 1e-05, "loss": 0.5181, "mean_token_accuracy": 0.8398691415786743, "num_tokens": 208855549.0, "step": 1310 }, { "epoch": 0.6668362156663276, "grad_norm": 1.0265754461288452, "learning_rate": 1e-05, "loss": 0.5423, "mean_token_accuracy": 0.8332537412643433, "num_tokens": 209015229.0, "step": 1311 }, { "epoch": 0.6673448626653102, "grad_norm": 1.1075199842453003, "learning_rate": 1e-05, "loss": 0.5314, "mean_token_accuracy": 0.8359677791595459, "num_tokens": 209167815.0, "step": 1312 }, { "epoch": 0.667853509664293, "grad_norm": 0.9988571405410767, "learning_rate": 1e-05, "loss": 0.5403, "mean_token_accuracy": 0.8331695795059204, "num_tokens": 209339913.0, "step": 1313 }, { "epoch": 0.6683621566632757, "grad_norm": 1.055841326713562, "learning_rate": 1e-05, "loss": 0.5642, "mean_token_accuracy": 0.8272186517715454, "num_tokens": 209506826.0, "step": 1314 }, { "epoch": 0.6688708036622584, "grad_norm": 1.170231580734253, "learning_rate": 1e-05, "loss": 0.5122, "mean_token_accuracy": 0.840907096862793, "num_tokens": 209658534.0, "step": 1315 }, { "epoch": 0.669379450661241, "grad_norm": 1.0777288675308228, "learning_rate": 1e-05, "loss": 0.5046, "mean_token_accuracy": 0.841313362121582, "num_tokens": 209810131.0, "step": 1316 }, { "epoch": 0.6698880976602238, "grad_norm": 1.0405516624450684, "learning_rate": 1e-05, "loss": 0.4887, "mean_token_accuracy": 0.8478529453277588, "num_tokens": 209972848.0, "step": 1317 }, { "epoch": 0.6703967446592065, "grad_norm": 1.056444764137268, "learning_rate": 1e-05, "loss": 0.5596, "mean_token_accuracy": 0.8281592130661011, "num_tokens": 210123131.0, "step": 1318 }, { "epoch": 0.6709053916581892, "grad_norm": 0.9567363858222961, "learning_rate": 1e-05, "loss": 0.5293, "mean_token_accuracy": 0.8372219800949097, "num_tokens": 210281302.0, "step": 1319 }, { "epoch": 0.671414038657172, "grad_norm": 1.0734000205993652, "learning_rate": 1e-05, "loss": 0.508, "mean_token_accuracy": 0.8434922695159912, "num_tokens": 210444300.0, "step": 1320 }, { "epoch": 0.6719226856561547, "grad_norm": 0.9549962282180786, "learning_rate": 1e-05, "loss": 0.515, "mean_token_accuracy": 0.839155912399292, "num_tokens": 210610063.0, "step": 1321 }, { "epoch": 0.6724313326551373, "grad_norm": 1.0906566381454468, "learning_rate": 1e-05, "loss": 0.5276, "mean_token_accuracy": 0.8351768255233765, "num_tokens": 210771386.0, "step": 1322 }, { "epoch": 0.67293997965412, "grad_norm": 1.040381908416748, "learning_rate": 1e-05, "loss": 0.503, "mean_token_accuracy": 0.8433479070663452, "num_tokens": 210919989.0, "step": 1323 }, { "epoch": 0.6734486266531028, "grad_norm": 1.1143410205841064, "learning_rate": 1e-05, "loss": 0.5419, "mean_token_accuracy": 0.8338621854782104, "num_tokens": 211073799.0, "step": 1324 }, { "epoch": 0.6739572736520855, "grad_norm": 0.9642016291618347, "learning_rate": 1e-05, "loss": 0.5196, "mean_token_accuracy": 0.8390560746192932, "num_tokens": 211241360.0, "step": 1325 }, { "epoch": 0.6744659206510681, "grad_norm": 1.1692264080047607, "learning_rate": 1e-05, "loss": 0.5437, "mean_token_accuracy": 0.8322386741638184, "num_tokens": 211407370.0, "step": 1326 }, { "epoch": 0.6749745676500508, "grad_norm": 0.965532660484314, "learning_rate": 1e-05, "loss": 0.527, "mean_token_accuracy": 0.8356199860572815, "num_tokens": 211581491.0, "step": 1327 }, { "epoch": 0.6754832146490336, "grad_norm": 1.0441728830337524, "learning_rate": 1e-05, "loss": 0.5079, "mean_token_accuracy": 0.8422735929489136, "num_tokens": 211739189.0, "step": 1328 }, { "epoch": 0.6759918616480163, "grad_norm": 1.1415163278579712, "learning_rate": 1e-05, "loss": 0.5298, "mean_token_accuracy": 0.8360558748245239, "num_tokens": 211893052.0, "step": 1329 }, { "epoch": 0.676500508646999, "grad_norm": 0.9364932179450989, "learning_rate": 1e-05, "loss": 0.4948, "mean_token_accuracy": 0.8465912342071533, "num_tokens": 212047699.0, "step": 1330 }, { "epoch": 0.6770091556459817, "grad_norm": 1.1136362552642822, "learning_rate": 1e-05, "loss": 0.5218, "mean_token_accuracy": 0.8364666700363159, "num_tokens": 212189491.0, "step": 1331 }, { "epoch": 0.6775178026449644, "grad_norm": 1.1026740074157715, "learning_rate": 1e-05, "loss": 0.5339, "mean_token_accuracy": 0.8352006673812866, "num_tokens": 212350768.0, "step": 1332 }, { "epoch": 0.6780264496439471, "grad_norm": 1.0726979970932007, "learning_rate": 1e-05, "loss": 0.5379, "mean_token_accuracy": 0.8336712121963501, "num_tokens": 212505424.0, "step": 1333 }, { "epoch": 0.6785350966429298, "grad_norm": 1.030218243598938, "learning_rate": 1e-05, "loss": 0.5231, "mean_token_accuracy": 0.8367509245872498, "num_tokens": 212656414.0, "step": 1334 }, { "epoch": 0.6790437436419126, "grad_norm": 1.0471112728118896, "learning_rate": 1e-05, "loss": 0.5232, "mean_token_accuracy": 0.8369899988174438, "num_tokens": 212814380.0, "step": 1335 }, { "epoch": 0.6795523906408952, "grad_norm": 1.234264850616455, "learning_rate": 1e-05, "loss": 0.5171, "mean_token_accuracy": 0.8384055495262146, "num_tokens": 212970630.0, "step": 1336 }, { "epoch": 0.6800610376398779, "grad_norm": 0.9486947655677795, "learning_rate": 1e-05, "loss": 0.5237, "mean_token_accuracy": 0.838019609451294, "num_tokens": 213135584.0, "step": 1337 }, { "epoch": 0.6805696846388606, "grad_norm": 1.022884488105774, "learning_rate": 1e-05, "loss": 0.523, "mean_token_accuracy": 0.836395263671875, "num_tokens": 213296625.0, "step": 1338 }, { "epoch": 0.6810783316378434, "grad_norm": 0.9599829316139221, "learning_rate": 1e-05, "loss": 0.5183, "mean_token_accuracy": 0.8398178815841675, "num_tokens": 213466408.0, "step": 1339 }, { "epoch": 0.681586978636826, "grad_norm": 1.0036944150924683, "learning_rate": 1e-05, "loss": 0.4879, "mean_token_accuracy": 0.8481632471084595, "num_tokens": 213608058.0, "step": 1340 }, { "epoch": 0.6820956256358087, "grad_norm": 0.992483913898468, "learning_rate": 1e-05, "loss": 0.5253, "mean_token_accuracy": 0.837264895439148, "num_tokens": 213757840.0, "step": 1341 }, { "epoch": 0.6826042726347915, "grad_norm": 0.9334516525268555, "learning_rate": 1e-05, "loss": 0.5014, "mean_token_accuracy": 0.8425880670547485, "num_tokens": 213919287.0, "step": 1342 }, { "epoch": 0.6831129196337742, "grad_norm": 0.9441404342651367, "learning_rate": 1e-05, "loss": 0.5197, "mean_token_accuracy": 0.8386458158493042, "num_tokens": 214089237.0, "step": 1343 }, { "epoch": 0.6836215666327569, "grad_norm": 1.0068684816360474, "learning_rate": 1e-05, "loss": 0.512, "mean_token_accuracy": 0.841089129447937, "num_tokens": 214248836.0, "step": 1344 }, { "epoch": 0.6841302136317395, "grad_norm": 0.9266252517700195, "learning_rate": 1e-05, "loss": 0.5125, "mean_token_accuracy": 0.842008113861084, "num_tokens": 214419340.0, "step": 1345 }, { "epoch": 0.6846388606307223, "grad_norm": 0.9865654110908508, "learning_rate": 1e-05, "loss": 0.5216, "mean_token_accuracy": 0.8350708484649658, "num_tokens": 214578559.0, "step": 1346 }, { "epoch": 0.685147507629705, "grad_norm": 1.041604995727539, "learning_rate": 1e-05, "loss": 0.5192, "mean_token_accuracy": 0.8384052515029907, "num_tokens": 214735210.0, "step": 1347 }, { "epoch": 0.6856561546286877, "grad_norm": 0.9903523325920105, "learning_rate": 1e-05, "loss": 0.529, "mean_token_accuracy": 0.8353708386421204, "num_tokens": 214902909.0, "step": 1348 }, { "epoch": 0.6861648016276704, "grad_norm": 0.9816705584526062, "learning_rate": 1e-05, "loss": 0.522, "mean_token_accuracy": 0.8378950953483582, "num_tokens": 215064535.0, "step": 1349 }, { "epoch": 0.6866734486266531, "grad_norm": 0.9787136316299438, "learning_rate": 1e-05, "loss": 0.5212, "mean_token_accuracy": 0.8377700448036194, "num_tokens": 215220787.0, "step": 1350 }, { "epoch": 0.6871820956256358, "grad_norm": 1.077988862991333, "learning_rate": 1e-05, "loss": 0.5432, "mean_token_accuracy": 0.8315883278846741, "num_tokens": 215375205.0, "step": 1351 }, { "epoch": 0.6876907426246185, "grad_norm": 0.9856903553009033, "learning_rate": 1e-05, "loss": 0.5344, "mean_token_accuracy": 0.8377841711044312, "num_tokens": 215532596.0, "step": 1352 }, { "epoch": 0.6881993896236012, "grad_norm": 0.9433656930923462, "learning_rate": 1e-05, "loss": 0.5328, "mean_token_accuracy": 0.836259126663208, "num_tokens": 215686435.0, "step": 1353 }, { "epoch": 0.688708036622584, "grad_norm": 1.0300414562225342, "learning_rate": 1e-05, "loss": 0.5327, "mean_token_accuracy": 0.833829402923584, "num_tokens": 215854411.0, "step": 1354 }, { "epoch": 0.6892166836215666, "grad_norm": 1.0212823152542114, "learning_rate": 1e-05, "loss": 0.498, "mean_token_accuracy": 0.8442879319190979, "num_tokens": 216016683.0, "step": 1355 }, { "epoch": 0.6897253306205493, "grad_norm": 1.0083696842193604, "learning_rate": 1e-05, "loss": 0.5365, "mean_token_accuracy": 0.8317540884017944, "num_tokens": 216176025.0, "step": 1356 }, { "epoch": 0.6902339776195321, "grad_norm": 1.0283464193344116, "learning_rate": 1e-05, "loss": 0.5094, "mean_token_accuracy": 0.8410217761993408, "num_tokens": 216335904.0, "step": 1357 }, { "epoch": 0.6907426246185148, "grad_norm": 1.0335670709609985, "learning_rate": 1e-05, "loss": 0.5145, "mean_token_accuracy": 0.8385999202728271, "num_tokens": 216490560.0, "step": 1358 }, { "epoch": 0.6912512716174974, "grad_norm": 1.0509625673294067, "learning_rate": 1e-05, "loss": 0.5338, "mean_token_accuracy": 0.8343715667724609, "num_tokens": 216636941.0, "step": 1359 }, { "epoch": 0.6917599186164801, "grad_norm": 0.9860029816627502, "learning_rate": 1e-05, "loss": 0.5173, "mean_token_accuracy": 0.8389407992362976, "num_tokens": 216788892.0, "step": 1360 }, { "epoch": 0.6922685656154629, "grad_norm": 1.0808199644088745, "learning_rate": 1e-05, "loss": 0.4999, "mean_token_accuracy": 0.8442080020904541, "num_tokens": 216933108.0, "step": 1361 }, { "epoch": 0.6927772126144456, "grad_norm": 1.1449096202850342, "learning_rate": 1e-05, "loss": 0.517, "mean_token_accuracy": 0.8379891514778137, "num_tokens": 217091789.0, "step": 1362 }, { "epoch": 0.6932858596134283, "grad_norm": 1.0072581768035889, "learning_rate": 1e-05, "loss": 0.5181, "mean_token_accuracy": 0.8373380899429321, "num_tokens": 217237394.0, "step": 1363 }, { "epoch": 0.6937945066124109, "grad_norm": 1.0011417865753174, "learning_rate": 1e-05, "loss": 0.5236, "mean_token_accuracy": 0.836451530456543, "num_tokens": 217391649.0, "step": 1364 }, { "epoch": 0.6943031536113937, "grad_norm": 0.9622162580490112, "learning_rate": 1e-05, "loss": 0.4995, "mean_token_accuracy": 0.8435579538345337, "num_tokens": 217554324.0, "step": 1365 }, { "epoch": 0.6948118006103764, "grad_norm": 1.0910418033599854, "learning_rate": 1e-05, "loss": 0.5147, "mean_token_accuracy": 0.8388841152191162, "num_tokens": 217705676.0, "step": 1366 }, { "epoch": 0.6953204476093591, "grad_norm": 1.0049858093261719, "learning_rate": 1e-05, "loss": 0.5367, "mean_token_accuracy": 0.8346538543701172, "num_tokens": 217870837.0, "step": 1367 }, { "epoch": 0.6958290946083419, "grad_norm": 1.0026202201843262, "learning_rate": 1e-05, "loss": 0.5436, "mean_token_accuracy": 0.8319190144538879, "num_tokens": 218034192.0, "step": 1368 }, { "epoch": 0.6963377416073245, "grad_norm": 1.0037686824798584, "learning_rate": 1e-05, "loss": 0.5156, "mean_token_accuracy": 0.8391927480697632, "num_tokens": 218201741.0, "step": 1369 }, { "epoch": 0.6968463886063072, "grad_norm": 0.9872056841850281, "learning_rate": 1e-05, "loss": 0.5308, "mean_token_accuracy": 0.8365633487701416, "num_tokens": 218361880.0, "step": 1370 }, { "epoch": 0.6973550356052899, "grad_norm": 1.0084397792816162, "learning_rate": 1e-05, "loss": 0.4928, "mean_token_accuracy": 0.8447875380516052, "num_tokens": 218507302.0, "step": 1371 }, { "epoch": 0.6978636826042727, "grad_norm": 0.9623077511787415, "learning_rate": 1e-05, "loss": 0.5196, "mean_token_accuracy": 0.837771475315094, "num_tokens": 218677170.0, "step": 1372 }, { "epoch": 0.6983723296032553, "grad_norm": 1.1019070148468018, "learning_rate": 1e-05, "loss": 0.5203, "mean_token_accuracy": 0.8386264443397522, "num_tokens": 218830583.0, "step": 1373 }, { "epoch": 0.698880976602238, "grad_norm": 1.0796947479248047, "learning_rate": 1e-05, "loss": 0.5176, "mean_token_accuracy": 0.8380075693130493, "num_tokens": 218990542.0, "step": 1374 }, { "epoch": 0.6993896236012207, "grad_norm": 1.0787526369094849, "learning_rate": 1e-05, "loss": 0.5301, "mean_token_accuracy": 0.8361427783966064, "num_tokens": 219150118.0, "step": 1375 }, { "epoch": 0.6998982706002035, "grad_norm": 1.0091830492019653, "learning_rate": 1e-05, "loss": 0.5049, "mean_token_accuracy": 0.8422755002975464, "num_tokens": 219320664.0, "step": 1376 }, { "epoch": 0.7004069175991862, "grad_norm": 0.9992945790290833, "learning_rate": 1e-05, "loss": 0.5343, "mean_token_accuracy": 0.833981990814209, "num_tokens": 219483168.0, "step": 1377 }, { "epoch": 0.7009155645981688, "grad_norm": 0.9681557416915894, "learning_rate": 1e-05, "loss": 0.5219, "mean_token_accuracy": 0.8369916081428528, "num_tokens": 219643460.0, "step": 1378 }, { "epoch": 0.7014242115971516, "grad_norm": 0.9948337078094482, "learning_rate": 1e-05, "loss": 0.5161, "mean_token_accuracy": 0.8396735787391663, "num_tokens": 219801607.0, "step": 1379 }, { "epoch": 0.7019328585961343, "grad_norm": 1.0998892784118652, "learning_rate": 1e-05, "loss": 0.5306, "mean_token_accuracy": 0.8362758159637451, "num_tokens": 219953874.0, "step": 1380 }, { "epoch": 0.702441505595117, "grad_norm": 1.0089136362075806, "learning_rate": 1e-05, "loss": 0.4836, "mean_token_accuracy": 0.8482525944709778, "num_tokens": 220101993.0, "step": 1381 }, { "epoch": 0.7029501525940997, "grad_norm": 0.9455198645591736, "learning_rate": 1e-05, "loss": 0.5219, "mean_token_accuracy": 0.8371497988700867, "num_tokens": 220272230.0, "step": 1382 }, { "epoch": 0.7034587995930824, "grad_norm": 1.017932415008545, "learning_rate": 1e-05, "loss": 0.5176, "mean_token_accuracy": 0.8393813371658325, "num_tokens": 220432579.0, "step": 1383 }, { "epoch": 0.7039674465920651, "grad_norm": 1.018938422203064, "learning_rate": 1e-05, "loss": 0.5252, "mean_token_accuracy": 0.8366577625274658, "num_tokens": 220588939.0, "step": 1384 }, { "epoch": 0.7044760935910478, "grad_norm": 0.9065205454826355, "learning_rate": 1e-05, "loss": 0.497, "mean_token_accuracy": 0.8459725379943848, "num_tokens": 220759155.0, "step": 1385 }, { "epoch": 0.7049847405900305, "grad_norm": 0.926688015460968, "learning_rate": 1e-05, "loss": 0.5175, "mean_token_accuracy": 0.8387424945831299, "num_tokens": 220916843.0, "step": 1386 }, { "epoch": 0.7054933875890133, "grad_norm": 0.9841321706771851, "learning_rate": 1e-05, "loss": 0.5269, "mean_token_accuracy": 0.8358370065689087, "num_tokens": 221083858.0, "step": 1387 }, { "epoch": 0.7060020345879959, "grad_norm": 1.0243306159973145, "learning_rate": 1e-05, "loss": 0.5678, "mean_token_accuracy": 0.82574862241745, "num_tokens": 221258649.0, "step": 1388 }, { "epoch": 0.7065106815869786, "grad_norm": 0.9554473161697388, "learning_rate": 1e-05, "loss": 0.5314, "mean_token_accuracy": 0.8345929980278015, "num_tokens": 221425144.0, "step": 1389 }, { "epoch": 0.7070193285859614, "grad_norm": 0.9833611845970154, "learning_rate": 1e-05, "loss": 0.5295, "mean_token_accuracy": 0.8353415131568909, "num_tokens": 221583215.0, "step": 1390 }, { "epoch": 0.7075279755849441, "grad_norm": 0.9443389177322388, "learning_rate": 1e-05, "loss": 0.535, "mean_token_accuracy": 0.8340977430343628, "num_tokens": 221755255.0, "step": 1391 }, { "epoch": 0.7080366225839267, "grad_norm": 1.032128930091858, "learning_rate": 1e-05, "loss": 0.5473, "mean_token_accuracy": 0.8305888175964355, "num_tokens": 221923372.0, "step": 1392 }, { "epoch": 0.7085452695829094, "grad_norm": 0.9580515027046204, "learning_rate": 1e-05, "loss": 0.5242, "mean_token_accuracy": 0.8368549346923828, "num_tokens": 222080701.0, "step": 1393 }, { "epoch": 0.7090539165818922, "grad_norm": 0.9367018938064575, "learning_rate": 1e-05, "loss": 0.486, "mean_token_accuracy": 0.8447219729423523, "num_tokens": 222233287.0, "step": 1394 }, { "epoch": 0.7095625635808749, "grad_norm": 0.9627552628517151, "learning_rate": 1e-05, "loss": 0.5408, "mean_token_accuracy": 0.8316167593002319, "num_tokens": 222399597.0, "step": 1395 }, { "epoch": 0.7100712105798576, "grad_norm": 1.0496351718902588, "learning_rate": 1e-05, "loss": 0.5269, "mean_token_accuracy": 0.8371133804321289, "num_tokens": 222549595.0, "step": 1396 }, { "epoch": 0.7105798575788402, "grad_norm": 0.9316955208778381, "learning_rate": 1e-05, "loss": 0.5483, "mean_token_accuracy": 0.8318901062011719, "num_tokens": 222711422.0, "step": 1397 }, { "epoch": 0.711088504577823, "grad_norm": 0.9864873290061951, "learning_rate": 1e-05, "loss": 0.5008, "mean_token_accuracy": 0.8416286706924438, "num_tokens": 222873104.0, "step": 1398 }, { "epoch": 0.7115971515768057, "grad_norm": 1.0518662929534912, "learning_rate": 1e-05, "loss": 0.5304, "mean_token_accuracy": 0.8358045816421509, "num_tokens": 223020625.0, "step": 1399 }, { "epoch": 0.7121057985757884, "grad_norm": 1.0203341245651245, "learning_rate": 1e-05, "loss": 0.5068, "mean_token_accuracy": 0.8423791527748108, "num_tokens": 223175189.0, "step": 1400 }, { "epoch": 0.7126144455747712, "grad_norm": 1.0138752460479736, "learning_rate": 1e-05, "loss": 0.511, "mean_token_accuracy": 0.8377603888511658, "num_tokens": 223335073.0, "step": 1401 }, { "epoch": 0.7131230925737538, "grad_norm": 0.9489789009094238, "learning_rate": 1e-05, "loss": 0.5215, "mean_token_accuracy": 0.8382784128189087, "num_tokens": 223499553.0, "step": 1402 }, { "epoch": 0.7136317395727365, "grad_norm": 1.0367980003356934, "learning_rate": 1e-05, "loss": 0.5051, "mean_token_accuracy": 0.8410635590553284, "num_tokens": 223657911.0, "step": 1403 }, { "epoch": 0.7141403865717192, "grad_norm": 1.0104140043258667, "learning_rate": 1e-05, "loss": 0.5258, "mean_token_accuracy": 0.8353630304336548, "num_tokens": 223808765.0, "step": 1404 }, { "epoch": 0.714649033570702, "grad_norm": 0.991611897945404, "learning_rate": 1e-05, "loss": 0.5019, "mean_token_accuracy": 0.8419798612594604, "num_tokens": 223974533.0, "step": 1405 }, { "epoch": 0.7151576805696847, "grad_norm": 1.1228381395339966, "learning_rate": 1e-05, "loss": 0.484, "mean_token_accuracy": 0.847956120967865, "num_tokens": 224133481.0, "step": 1406 }, { "epoch": 0.7156663275686673, "grad_norm": 1.118977427482605, "learning_rate": 1e-05, "loss": 0.5064, "mean_token_accuracy": 0.8419226408004761, "num_tokens": 224282640.0, "step": 1407 }, { "epoch": 0.71617497456765, "grad_norm": 0.9979066252708435, "learning_rate": 1e-05, "loss": 0.5444, "mean_token_accuracy": 0.8306796550750732, "num_tokens": 224447298.0, "step": 1408 }, { "epoch": 0.7166836215666328, "grad_norm": 1.0414625406265259, "learning_rate": 1e-05, "loss": 0.5029, "mean_token_accuracy": 0.8423417210578918, "num_tokens": 224596045.0, "step": 1409 }, { "epoch": 0.7171922685656155, "grad_norm": 0.9783027172088623, "learning_rate": 1e-05, "loss": 0.5285, "mean_token_accuracy": 0.8376615047454834, "num_tokens": 224757199.0, "step": 1410 }, { "epoch": 0.7177009155645981, "grad_norm": 0.9670535922050476, "learning_rate": 1e-05, "loss": 0.5061, "mean_token_accuracy": 0.8409932851791382, "num_tokens": 224913566.0, "step": 1411 }, { "epoch": 0.7182095625635809, "grad_norm": 1.0231362581253052, "learning_rate": 1e-05, "loss": 0.5214, "mean_token_accuracy": 0.8397393226623535, "num_tokens": 225078638.0, "step": 1412 }, { "epoch": 0.7187182095625636, "grad_norm": 0.9188551902770996, "learning_rate": 1e-05, "loss": 0.5248, "mean_token_accuracy": 0.8372691869735718, "num_tokens": 225242585.0, "step": 1413 }, { "epoch": 0.7192268565615463, "grad_norm": 1.0983929634094238, "learning_rate": 1e-05, "loss": 0.4956, "mean_token_accuracy": 0.8437936902046204, "num_tokens": 225402143.0, "step": 1414 }, { "epoch": 0.719735503560529, "grad_norm": 1.0025497674942017, "learning_rate": 1e-05, "loss": 0.5487, "mean_token_accuracy": 0.832549512386322, "num_tokens": 225561481.0, "step": 1415 }, { "epoch": 0.7202441505595117, "grad_norm": 0.9898235201835632, "learning_rate": 1e-05, "loss": 0.5019, "mean_token_accuracy": 0.8437957167625427, "num_tokens": 225720513.0, "step": 1416 }, { "epoch": 0.7207527975584944, "grad_norm": 1.0856152772903442, "learning_rate": 1e-05, "loss": 0.5512, "mean_token_accuracy": 0.8304049968719482, "num_tokens": 225873349.0, "step": 1417 }, { "epoch": 0.7212614445574771, "grad_norm": 0.9928951263427734, "learning_rate": 1e-05, "loss": 0.5003, "mean_token_accuracy": 0.8427510261535645, "num_tokens": 226039357.0, "step": 1418 }, { "epoch": 0.7217700915564598, "grad_norm": 1.0025311708450317, "learning_rate": 1e-05, "loss": 0.4958, "mean_token_accuracy": 0.8438827991485596, "num_tokens": 226189348.0, "step": 1419 }, { "epoch": 0.7222787385554426, "grad_norm": 0.986140787601471, "learning_rate": 1e-05, "loss": 0.5282, "mean_token_accuracy": 0.8362927436828613, "num_tokens": 226335920.0, "step": 1420 }, { "epoch": 0.7227873855544252, "grad_norm": 1.1514464616775513, "learning_rate": 1e-05, "loss": 0.5317, "mean_token_accuracy": 0.8350828289985657, "num_tokens": 226495005.0, "step": 1421 }, { "epoch": 0.7232960325534079, "grad_norm": 1.0557314157485962, "learning_rate": 1e-05, "loss": 0.49, "mean_token_accuracy": 0.8465843200683594, "num_tokens": 226657995.0, "step": 1422 }, { "epoch": 0.7238046795523907, "grad_norm": 1.168913722038269, "learning_rate": 1e-05, "loss": 0.5245, "mean_token_accuracy": 0.837516188621521, "num_tokens": 226828202.0, "step": 1423 }, { "epoch": 0.7243133265513734, "grad_norm": 1.1428192853927612, "learning_rate": 1e-05, "loss": 0.5346, "mean_token_accuracy": 0.8357243537902832, "num_tokens": 226982248.0, "step": 1424 }, { "epoch": 0.724821973550356, "grad_norm": 1.0416052341461182, "learning_rate": 1e-05, "loss": 0.5246, "mean_token_accuracy": 0.8367089629173279, "num_tokens": 227129474.0, "step": 1425 }, { "epoch": 0.7253306205493387, "grad_norm": 1.2324395179748535, "learning_rate": 1e-05, "loss": 0.5015, "mean_token_accuracy": 0.8420953154563904, "num_tokens": 227274613.0, "step": 1426 }, { "epoch": 0.7258392675483215, "grad_norm": 1.0069247484207153, "learning_rate": 1e-05, "loss": 0.539, "mean_token_accuracy": 0.8342633247375488, "num_tokens": 227440413.0, "step": 1427 }, { "epoch": 0.7263479145473042, "grad_norm": 1.1502869129180908, "learning_rate": 1e-05, "loss": 0.5142, "mean_token_accuracy": 0.8386930227279663, "num_tokens": 227605134.0, "step": 1428 }, { "epoch": 0.7268565615462869, "grad_norm": 0.9862993955612183, "learning_rate": 1e-05, "loss": 0.5063, "mean_token_accuracy": 0.8415361046791077, "num_tokens": 227761956.0, "step": 1429 }, { "epoch": 0.7273652085452695, "grad_norm": 1.174629807472229, "learning_rate": 1e-05, "loss": 0.5725, "mean_token_accuracy": 0.8255341649055481, "num_tokens": 227922627.0, "step": 1430 }, { "epoch": 0.7278738555442523, "grad_norm": 1.102269172668457, "learning_rate": 1e-05, "loss": 0.519, "mean_token_accuracy": 0.8369065523147583, "num_tokens": 228080800.0, "step": 1431 }, { "epoch": 0.728382502543235, "grad_norm": 1.0408049821853638, "learning_rate": 1e-05, "loss": 0.5122, "mean_token_accuracy": 0.8411990404129028, "num_tokens": 228242141.0, "step": 1432 }, { "epoch": 0.7288911495422177, "grad_norm": 1.3122097253799438, "learning_rate": 1e-05, "loss": 0.5153, "mean_token_accuracy": 0.8383074402809143, "num_tokens": 228387653.0, "step": 1433 }, { "epoch": 0.7293997965412004, "grad_norm": 0.9923901557922363, "learning_rate": 1e-05, "loss": 0.4949, "mean_token_accuracy": 0.8445407152175903, "num_tokens": 228545473.0, "step": 1434 }, { "epoch": 0.7299084435401831, "grad_norm": 1.1669878959655762, "learning_rate": 1e-05, "loss": 0.5289, "mean_token_accuracy": 0.8352810144424438, "num_tokens": 228713308.0, "step": 1435 }, { "epoch": 0.7304170905391658, "grad_norm": 1.0849850177764893, "learning_rate": 1e-05, "loss": 0.5468, "mean_token_accuracy": 0.8317233920097351, "num_tokens": 228884216.0, "step": 1436 }, { "epoch": 0.7309257375381485, "grad_norm": 1.0434106588363647, "learning_rate": 1e-05, "loss": 0.5594, "mean_token_accuracy": 0.8295158743858337, "num_tokens": 229029827.0, "step": 1437 }, { "epoch": 0.7314343845371313, "grad_norm": 1.1874513626098633, "learning_rate": 1e-05, "loss": 0.537, "mean_token_accuracy": 0.8329129219055176, "num_tokens": 229181920.0, "step": 1438 }, { "epoch": 0.731943031536114, "grad_norm": 0.96856290102005, "learning_rate": 1e-05, "loss": 0.5377, "mean_token_accuracy": 0.8328238725662231, "num_tokens": 229349488.0, "step": 1439 }, { "epoch": 0.7324516785350966, "grad_norm": 1.1322166919708252, "learning_rate": 1e-05, "loss": 0.5119, "mean_token_accuracy": 0.8405275940895081, "num_tokens": 229505884.0, "step": 1440 }, { "epoch": 0.7329603255340793, "grad_norm": 1.402640700340271, "learning_rate": 1e-05, "loss": 0.5212, "mean_token_accuracy": 0.8364936113357544, "num_tokens": 229673440.0, "step": 1441 }, { "epoch": 0.7334689725330621, "grad_norm": 0.9987578988075256, "learning_rate": 1e-05, "loss": 0.5087, "mean_token_accuracy": 0.8413553237915039, "num_tokens": 229844345.0, "step": 1442 }, { "epoch": 0.7339776195320448, "grad_norm": 1.277686357498169, "learning_rate": 1e-05, "loss": 0.5684, "mean_token_accuracy": 0.826140284538269, "num_tokens": 230001344.0, "step": 1443 }, { "epoch": 0.7344862665310274, "grad_norm": 1.149610161781311, "learning_rate": 1e-05, "loss": 0.5012, "mean_token_accuracy": 0.843376874923706, "num_tokens": 230160760.0, "step": 1444 }, { "epoch": 0.7349949135300101, "grad_norm": 1.1200135946273804, "learning_rate": 1e-05, "loss": 0.5128, "mean_token_accuracy": 0.8402576446533203, "num_tokens": 230318252.0, "step": 1445 }, { "epoch": 0.7355035605289929, "grad_norm": 1.131650447845459, "learning_rate": 1e-05, "loss": 0.5272, "mean_token_accuracy": 0.8372815847396851, "num_tokens": 230480542.0, "step": 1446 }, { "epoch": 0.7360122075279756, "grad_norm": 1.0408190488815308, "learning_rate": 1e-05, "loss": 0.5353, "mean_token_accuracy": 0.8357641100883484, "num_tokens": 230631226.0, "step": 1447 }, { "epoch": 0.7365208545269583, "grad_norm": 1.0704621076583862, "learning_rate": 1e-05, "loss": 0.4936, "mean_token_accuracy": 0.8441085815429688, "num_tokens": 230793950.0, "step": 1448 }, { "epoch": 0.737029501525941, "grad_norm": 1.017288088798523, "learning_rate": 1e-05, "loss": 0.5674, "mean_token_accuracy": 0.8265225887298584, "num_tokens": 230949258.0, "step": 1449 }, { "epoch": 0.7375381485249237, "grad_norm": 1.0223156213760376, "learning_rate": 1e-05, "loss": 0.5082, "mean_token_accuracy": 0.8423564434051514, "num_tokens": 231116995.0, "step": 1450 }, { "epoch": 0.7380467955239064, "grad_norm": 0.9167082905769348, "learning_rate": 1e-05, "loss": 0.4922, "mean_token_accuracy": 0.8476444482803345, "num_tokens": 231286840.0, "step": 1451 }, { "epoch": 0.7385554425228891, "grad_norm": 0.993761420249939, "learning_rate": 1e-05, "loss": 0.5135, "mean_token_accuracy": 0.8388065099716187, "num_tokens": 231446504.0, "step": 1452 }, { "epoch": 0.7390640895218719, "grad_norm": 0.9417846202850342, "learning_rate": 1e-05, "loss": 0.5438, "mean_token_accuracy": 0.832386314868927, "num_tokens": 231614914.0, "step": 1453 }, { "epoch": 0.7395727365208545, "grad_norm": 0.9867107272148132, "learning_rate": 1e-05, "loss": 0.5138, "mean_token_accuracy": 0.8408212661743164, "num_tokens": 231772874.0, "step": 1454 }, { "epoch": 0.7400813835198372, "grad_norm": 0.9379030466079712, "learning_rate": 1e-05, "loss": 0.5249, "mean_token_accuracy": 0.8361393213272095, "num_tokens": 231937051.0, "step": 1455 }, { "epoch": 0.7405900305188199, "grad_norm": 0.9655819535255432, "learning_rate": 1e-05, "loss": 0.5098, "mean_token_accuracy": 0.8403584957122803, "num_tokens": 232087611.0, "step": 1456 }, { "epoch": 0.7410986775178027, "grad_norm": 1.0079295635223389, "learning_rate": 1e-05, "loss": 0.5616, "mean_token_accuracy": 0.8275667428970337, "num_tokens": 232246891.0, "step": 1457 }, { "epoch": 0.7416073245167853, "grad_norm": 0.9451082944869995, "learning_rate": 1e-05, "loss": 0.5022, "mean_token_accuracy": 0.8424191474914551, "num_tokens": 232405181.0, "step": 1458 }, { "epoch": 0.742115971515768, "grad_norm": 0.9628739953041077, "learning_rate": 1e-05, "loss": 0.5641, "mean_token_accuracy": 0.8226780891418457, "num_tokens": 232569215.0, "step": 1459 }, { "epoch": 0.7426246185147508, "grad_norm": 0.9538934826850891, "learning_rate": 1e-05, "loss": 0.4942, "mean_token_accuracy": 0.8450824618339539, "num_tokens": 232730814.0, "step": 1460 }, { "epoch": 0.7431332655137335, "grad_norm": 0.9364286065101624, "learning_rate": 1e-05, "loss": 0.5198, "mean_token_accuracy": 0.8396707773208618, "num_tokens": 232892427.0, "step": 1461 }, { "epoch": 0.7436419125127162, "grad_norm": 0.971477210521698, "learning_rate": 1e-05, "loss": 0.5105, "mean_token_accuracy": 0.8389374613761902, "num_tokens": 233062413.0, "step": 1462 }, { "epoch": 0.7441505595116988, "grad_norm": 1.0381510257720947, "learning_rate": 1e-05, "loss": 0.499, "mean_token_accuracy": 0.8426993489265442, "num_tokens": 233223449.0, "step": 1463 }, { "epoch": 0.7446592065106816, "grad_norm": 0.9925874471664429, "learning_rate": 1e-05, "loss": 0.5116, "mean_token_accuracy": 0.8392652273178101, "num_tokens": 233382314.0, "step": 1464 }, { "epoch": 0.7451678535096643, "grad_norm": 0.9298463463783264, "learning_rate": 1e-05, "loss": 0.5043, "mean_token_accuracy": 0.8449363708496094, "num_tokens": 233549471.0, "step": 1465 }, { "epoch": 0.745676500508647, "grad_norm": 0.9784131050109863, "learning_rate": 1e-05, "loss": 0.5269, "mean_token_accuracy": 0.835938572883606, "num_tokens": 233714244.0, "step": 1466 }, { "epoch": 0.7461851475076297, "grad_norm": 1.0752923488616943, "learning_rate": 1e-05, "loss": 0.5157, "mean_token_accuracy": 0.8408986330032349, "num_tokens": 233873031.0, "step": 1467 }, { "epoch": 0.7466937945066124, "grad_norm": 0.9333047270774841, "learning_rate": 1e-05, "loss": 0.4983, "mean_token_accuracy": 0.8432742953300476, "num_tokens": 234028924.0, "step": 1468 }, { "epoch": 0.7472024415055951, "grad_norm": 1.0717540979385376, "learning_rate": 1e-05, "loss": 0.5069, "mean_token_accuracy": 0.8415995240211487, "num_tokens": 234187316.0, "step": 1469 }, { "epoch": 0.7477110885045778, "grad_norm": 1.023341178894043, "learning_rate": 1e-05, "loss": 0.498, "mean_token_accuracy": 0.8454621434211731, "num_tokens": 234352052.0, "step": 1470 }, { "epoch": 0.7482197355035606, "grad_norm": 1.088341236114502, "learning_rate": 1e-05, "loss": 0.499, "mean_token_accuracy": 0.8432536125183105, "num_tokens": 234522725.0, "step": 1471 }, { "epoch": 0.7487283825025433, "grad_norm": 1.048627257347107, "learning_rate": 1e-05, "loss": 0.5375, "mean_token_accuracy": 0.8310221433639526, "num_tokens": 234685164.0, "step": 1472 }, { "epoch": 0.7492370295015259, "grad_norm": 0.977834165096283, "learning_rate": 1e-05, "loss": 0.5033, "mean_token_accuracy": 0.8420848846435547, "num_tokens": 234844848.0, "step": 1473 }, { "epoch": 0.7497456765005086, "grad_norm": 1.0553686618804932, "learning_rate": 1e-05, "loss": 0.4901, "mean_token_accuracy": 0.8447784185409546, "num_tokens": 234999779.0, "step": 1474 }, { "epoch": 0.7502543234994914, "grad_norm": 1.0582385063171387, "learning_rate": 1e-05, "loss": 0.584, "mean_token_accuracy": 0.8204246163368225, "num_tokens": 235162263.0, "step": 1475 }, { "epoch": 0.7507629704984741, "grad_norm": 0.9934267997741699, "learning_rate": 1e-05, "loss": 0.5622, "mean_token_accuracy": 0.8252400755882263, "num_tokens": 235319603.0, "step": 1476 }, { "epoch": 0.7512716174974567, "grad_norm": 1.0062447786331177, "learning_rate": 1e-05, "loss": 0.5026, "mean_token_accuracy": 0.8412712812423706, "num_tokens": 235479108.0, "step": 1477 }, { "epoch": 0.7517802644964394, "grad_norm": 1.0643982887268066, "learning_rate": 1e-05, "loss": 0.521, "mean_token_accuracy": 0.8388631343841553, "num_tokens": 235635662.0, "step": 1478 }, { "epoch": 0.7522889114954222, "grad_norm": 1.0867574214935303, "learning_rate": 1e-05, "loss": 0.5331, "mean_token_accuracy": 0.8342521786689758, "num_tokens": 235778848.0, "step": 1479 }, { "epoch": 0.7527975584944049, "grad_norm": 1.0071399211883545, "learning_rate": 1e-05, "loss": 0.4977, "mean_token_accuracy": 0.8450353145599365, "num_tokens": 235935563.0, "step": 1480 }, { "epoch": 0.7533062054933876, "grad_norm": 0.9993019104003906, "learning_rate": 1e-05, "loss": 0.5225, "mean_token_accuracy": 0.8392289280891418, "num_tokens": 236102712.0, "step": 1481 }, { "epoch": 0.7538148524923703, "grad_norm": 1.0551786422729492, "learning_rate": 1e-05, "loss": 0.5134, "mean_token_accuracy": 0.8395529985427856, "num_tokens": 236247865.0, "step": 1482 }, { "epoch": 0.754323499491353, "grad_norm": 1.036880612373352, "learning_rate": 1e-05, "loss": 0.5198, "mean_token_accuracy": 0.838604211807251, "num_tokens": 236406459.0, "step": 1483 }, { "epoch": 0.7548321464903357, "grad_norm": 0.9874710440635681, "learning_rate": 1e-05, "loss": 0.4894, "mean_token_accuracy": 0.8457585573196411, "num_tokens": 236548474.0, "step": 1484 }, { "epoch": 0.7553407934893184, "grad_norm": 1.0621289014816284, "learning_rate": 1e-05, "loss": 0.5356, "mean_token_accuracy": 0.8316954374313354, "num_tokens": 236699987.0, "step": 1485 }, { "epoch": 0.7558494404883012, "grad_norm": 1.032537579536438, "learning_rate": 1e-05, "loss": 0.4908, "mean_token_accuracy": 0.8453277349472046, "num_tokens": 236859308.0, "step": 1486 }, { "epoch": 0.7563580874872838, "grad_norm": 0.9991673231124878, "learning_rate": 1e-05, "loss": 0.4927, "mean_token_accuracy": 0.8444706201553345, "num_tokens": 237011851.0, "step": 1487 }, { "epoch": 0.7568667344862665, "grad_norm": 1.0048476457595825, "learning_rate": 1e-05, "loss": 0.5205, "mean_token_accuracy": 0.837803840637207, "num_tokens": 237170473.0, "step": 1488 }, { "epoch": 0.7573753814852492, "grad_norm": 1.1486921310424805, "learning_rate": 1e-05, "loss": 0.4974, "mean_token_accuracy": 0.8430988788604736, "num_tokens": 237317583.0, "step": 1489 }, { "epoch": 0.757884028484232, "grad_norm": 1.0181177854537964, "learning_rate": 1e-05, "loss": 0.4949, "mean_token_accuracy": 0.8443738222122192, "num_tokens": 237483382.0, "step": 1490 }, { "epoch": 0.7583926754832147, "grad_norm": 0.9788311123847961, "learning_rate": 1e-05, "loss": 0.508, "mean_token_accuracy": 0.8405285477638245, "num_tokens": 237642857.0, "step": 1491 }, { "epoch": 0.7589013224821973, "grad_norm": 0.9558936953544617, "learning_rate": 1e-05, "loss": 0.5103, "mean_token_accuracy": 0.8410489559173584, "num_tokens": 237809071.0, "step": 1492 }, { "epoch": 0.7594099694811801, "grad_norm": 0.996228039264679, "learning_rate": 1e-05, "loss": 0.533, "mean_token_accuracy": 0.835081934928894, "num_tokens": 237977918.0, "step": 1493 }, { "epoch": 0.7599186164801628, "grad_norm": 0.986886203289032, "learning_rate": 1e-05, "loss": 0.5303, "mean_token_accuracy": 0.8364228010177612, "num_tokens": 238146507.0, "step": 1494 }, { "epoch": 0.7604272634791455, "grad_norm": 0.9645000100135803, "learning_rate": 1e-05, "loss": 0.5002, "mean_token_accuracy": 0.8437860012054443, "num_tokens": 238317537.0, "step": 1495 }, { "epoch": 0.7609359104781281, "grad_norm": 1.049106478691101, "learning_rate": 1e-05, "loss": 0.501, "mean_token_accuracy": 0.8432005643844604, "num_tokens": 238483591.0, "step": 1496 }, { "epoch": 0.7614445574771109, "grad_norm": 0.9359789490699768, "learning_rate": 1e-05, "loss": 0.4906, "mean_token_accuracy": 0.8473359942436218, "num_tokens": 238641826.0, "step": 1497 }, { "epoch": 0.7619532044760936, "grad_norm": 1.026975393295288, "learning_rate": 1e-05, "loss": 0.5106, "mean_token_accuracy": 0.8406385183334351, "num_tokens": 238793523.0, "step": 1498 }, { "epoch": 0.7624618514750763, "grad_norm": 1.0067918300628662, "learning_rate": 1e-05, "loss": 0.5171, "mean_token_accuracy": 0.8395569920539856, "num_tokens": 238956488.0, "step": 1499 }, { "epoch": 0.762970498474059, "grad_norm": 0.9238914847373962, "learning_rate": 1e-05, "loss": 0.4893, "mean_token_accuracy": 0.8466312885284424, "num_tokens": 239112968.0, "step": 1500 }, { "epoch": 0.7634791454730417, "grad_norm": 1.0350664854049683, "learning_rate": 1e-05, "loss": 0.545, "mean_token_accuracy": 0.8327237367630005, "num_tokens": 239278918.0, "step": 1501 }, { "epoch": 0.7639877924720244, "grad_norm": 1.1178715229034424, "learning_rate": 1e-05, "loss": 0.4998, "mean_token_accuracy": 0.8421357274055481, "num_tokens": 239429305.0, "step": 1502 }, { "epoch": 0.7644964394710071, "grad_norm": 0.9649775624275208, "learning_rate": 1e-05, "loss": 0.4849, "mean_token_accuracy": 0.8484745025634766, "num_tokens": 239592603.0, "step": 1503 }, { "epoch": 0.7650050864699899, "grad_norm": 1.130552887916565, "learning_rate": 1e-05, "loss": 0.5433, "mean_token_accuracy": 0.8310620784759521, "num_tokens": 239745054.0, "step": 1504 }, { "epoch": 0.7655137334689726, "grad_norm": 0.9407525658607483, "learning_rate": 1e-05, "loss": 0.5056, "mean_token_accuracy": 0.8418401479721069, "num_tokens": 239909253.0, "step": 1505 }, { "epoch": 0.7660223804679552, "grad_norm": 1.05576753616333, "learning_rate": 1e-05, "loss": 0.5058, "mean_token_accuracy": 0.8412678241729736, "num_tokens": 240051112.0, "step": 1506 }, { "epoch": 0.7665310274669379, "grad_norm": 1.0370968580245972, "learning_rate": 1e-05, "loss": 0.4773, "mean_token_accuracy": 0.8489423990249634, "num_tokens": 240211700.0, "step": 1507 }, { "epoch": 0.7670396744659207, "grad_norm": 0.9873724579811096, "learning_rate": 1e-05, "loss": 0.515, "mean_token_accuracy": 0.8384915590286255, "num_tokens": 240373170.0, "step": 1508 }, { "epoch": 0.7675483214649034, "grad_norm": 1.1496977806091309, "learning_rate": 1e-05, "loss": 0.5341, "mean_token_accuracy": 0.8343428373336792, "num_tokens": 240518936.0, "step": 1509 }, { "epoch": 0.768056968463886, "grad_norm": 0.9918933510780334, "learning_rate": 1e-05, "loss": 0.5145, "mean_token_accuracy": 0.8385894894599915, "num_tokens": 240668912.0, "step": 1510 }, { "epoch": 0.7685656154628687, "grad_norm": 1.0380821228027344, "learning_rate": 1e-05, "loss": 0.5004, "mean_token_accuracy": 0.8435571193695068, "num_tokens": 240828027.0, "step": 1511 }, { "epoch": 0.7690742624618515, "grad_norm": 0.9556916952133179, "learning_rate": 1e-05, "loss": 0.5036, "mean_token_accuracy": 0.8424919843673706, "num_tokens": 240996115.0, "step": 1512 }, { "epoch": 0.7695829094608342, "grad_norm": 1.1142911911010742, "learning_rate": 1e-05, "loss": 0.5031, "mean_token_accuracy": 0.8429108262062073, "num_tokens": 241154020.0, "step": 1513 }, { "epoch": 0.7700915564598169, "grad_norm": 0.992397665977478, "learning_rate": 1e-05, "loss": 0.4952, "mean_token_accuracy": 0.8445471525192261, "num_tokens": 241313175.0, "step": 1514 }, { "epoch": 0.7706002034587996, "grad_norm": 1.2248951196670532, "learning_rate": 1e-05, "loss": 0.5139, "mean_token_accuracy": 0.838437557220459, "num_tokens": 241461371.0, "step": 1515 }, { "epoch": 0.7711088504577823, "grad_norm": 1.07581627368927, "learning_rate": 1e-05, "loss": 0.5355, "mean_token_accuracy": 0.8337858319282532, "num_tokens": 241631176.0, "step": 1516 }, { "epoch": 0.771617497456765, "grad_norm": 0.9590333104133606, "learning_rate": 1e-05, "loss": 0.5436, "mean_token_accuracy": 0.831385612487793, "num_tokens": 241791904.0, "step": 1517 }, { "epoch": 0.7721261444557477, "grad_norm": 1.0338011980056763, "learning_rate": 1e-05, "loss": 0.5032, "mean_token_accuracy": 0.8443799018859863, "num_tokens": 241954029.0, "step": 1518 }, { "epoch": 0.7726347914547305, "grad_norm": 0.9844240546226501, "learning_rate": 1e-05, "loss": 0.5657, "mean_token_accuracy": 0.8268498778343201, "num_tokens": 242106558.0, "step": 1519 }, { "epoch": 0.7731434384537131, "grad_norm": 1.0088313817977905, "learning_rate": 1e-05, "loss": 0.5292, "mean_token_accuracy": 0.8359307050704956, "num_tokens": 242276624.0, "step": 1520 }, { "epoch": 0.7736520854526958, "grad_norm": 0.989596426486969, "learning_rate": 1e-05, "loss": 0.5221, "mean_token_accuracy": 0.8372203707695007, "num_tokens": 242434754.0, "step": 1521 }, { "epoch": 0.7741607324516785, "grad_norm": 1.020996332168579, "learning_rate": 1e-05, "loss": 0.4928, "mean_token_accuracy": 0.8438454866409302, "num_tokens": 242576931.0, "step": 1522 }, { "epoch": 0.7746693794506613, "grad_norm": 1.0195339918136597, "learning_rate": 1e-05, "loss": 0.5139, "mean_token_accuracy": 0.8392676711082458, "num_tokens": 242733793.0, "step": 1523 }, { "epoch": 0.775178026449644, "grad_norm": 1.117997169494629, "learning_rate": 1e-05, "loss": 0.5112, "mean_token_accuracy": 0.8406413197517395, "num_tokens": 242882978.0, "step": 1524 }, { "epoch": 0.7756866734486266, "grad_norm": 0.981311559677124, "learning_rate": 1e-05, "loss": 0.5115, "mean_token_accuracy": 0.8393944501876831, "num_tokens": 243038462.0, "step": 1525 }, { "epoch": 0.7761953204476093, "grad_norm": 1.094862937927246, "learning_rate": 1e-05, "loss": 0.5242, "mean_token_accuracy": 0.8363096714019775, "num_tokens": 243206785.0, "step": 1526 }, { "epoch": 0.7767039674465921, "grad_norm": 1.0014318227767944, "learning_rate": 1e-05, "loss": 0.5096, "mean_token_accuracy": 0.8408504128456116, "num_tokens": 243357645.0, "step": 1527 }, { "epoch": 0.7772126144455748, "grad_norm": 1.0128889083862305, "learning_rate": 1e-05, "loss": 0.5019, "mean_token_accuracy": 0.8421143293380737, "num_tokens": 243512602.0, "step": 1528 }, { "epoch": 0.7777212614445574, "grad_norm": 1.0889755487442017, "learning_rate": 1e-05, "loss": 0.505, "mean_token_accuracy": 0.8413415551185608, "num_tokens": 243687260.0, "step": 1529 }, { "epoch": 0.7782299084435402, "grad_norm": 1.0065422058105469, "learning_rate": 1e-05, "loss": 0.5121, "mean_token_accuracy": 0.8403002619743347, "num_tokens": 243845125.0, "step": 1530 }, { "epoch": 0.7787385554425229, "grad_norm": 1.048030138015747, "learning_rate": 1e-05, "loss": 0.4737, "mean_token_accuracy": 0.8500345945358276, "num_tokens": 243985343.0, "step": 1531 }, { "epoch": 0.7792472024415056, "grad_norm": 1.0371286869049072, "learning_rate": 1e-05, "loss": 0.5294, "mean_token_accuracy": 0.8364617228507996, "num_tokens": 244139431.0, "step": 1532 }, { "epoch": 0.7797558494404883, "grad_norm": 1.0632935762405396, "learning_rate": 1e-05, "loss": 0.5255, "mean_token_accuracy": 0.8366844058036804, "num_tokens": 244296684.0, "step": 1533 }, { "epoch": 0.780264496439471, "grad_norm": 1.0461639165878296, "learning_rate": 1e-05, "loss": 0.5206, "mean_token_accuracy": 0.8383218050003052, "num_tokens": 244456341.0, "step": 1534 }, { "epoch": 0.7807731434384537, "grad_norm": 1.0430577993392944, "learning_rate": 1e-05, "loss": 0.4919, "mean_token_accuracy": 0.8446320295333862, "num_tokens": 244612234.0, "step": 1535 }, { "epoch": 0.7812817904374364, "grad_norm": 1.0549424886703491, "learning_rate": 1e-05, "loss": 0.5374, "mean_token_accuracy": 0.8326089382171631, "num_tokens": 244763392.0, "step": 1536 }, { "epoch": 0.7817904374364191, "grad_norm": 1.0101888179779053, "learning_rate": 1e-05, "loss": 0.5329, "mean_token_accuracy": 0.8328830003738403, "num_tokens": 244921544.0, "step": 1537 }, { "epoch": 0.7822990844354019, "grad_norm": 1.0776944160461426, "learning_rate": 1e-05, "loss": 0.5038, "mean_token_accuracy": 0.8413270711898804, "num_tokens": 245082597.0, "step": 1538 }, { "epoch": 0.7828077314343845, "grad_norm": 1.0002422332763672, "learning_rate": 1e-05, "loss": 0.5249, "mean_token_accuracy": 0.8351992964744568, "num_tokens": 245229522.0, "step": 1539 }, { "epoch": 0.7833163784333672, "grad_norm": 1.0443092584609985, "learning_rate": 1e-05, "loss": 0.5214, "mean_token_accuracy": 0.8371322154998779, "num_tokens": 245379953.0, "step": 1540 }, { "epoch": 0.78382502543235, "grad_norm": 0.9857534170150757, "learning_rate": 1e-05, "loss": 0.506, "mean_token_accuracy": 0.8414639830589294, "num_tokens": 245531845.0, "step": 1541 }, { "epoch": 0.7843336724313327, "grad_norm": 0.9642319083213806, "learning_rate": 1e-05, "loss": 0.4966, "mean_token_accuracy": 0.8451875448226929, "num_tokens": 245696677.0, "step": 1542 }, { "epoch": 0.7848423194303153, "grad_norm": 0.9377528429031372, "learning_rate": 1e-05, "loss": 0.5102, "mean_token_accuracy": 0.8393126726150513, "num_tokens": 245866942.0, "step": 1543 }, { "epoch": 0.785350966429298, "grad_norm": 0.992759644985199, "learning_rate": 1e-05, "loss": 0.5352, "mean_token_accuracy": 0.834294319152832, "num_tokens": 246030786.0, "step": 1544 }, { "epoch": 0.7858596134282808, "grad_norm": 1.0652512311935425, "learning_rate": 1e-05, "loss": 0.5154, "mean_token_accuracy": 0.8386496305465698, "num_tokens": 246181272.0, "step": 1545 }, { "epoch": 0.7863682604272635, "grad_norm": 1.1718790531158447, "learning_rate": 1e-05, "loss": 0.4754, "mean_token_accuracy": 0.848141074180603, "num_tokens": 246332311.0, "step": 1546 }, { "epoch": 0.7868769074262462, "grad_norm": 1.164916753768921, "learning_rate": 1e-05, "loss": 0.5441, "mean_token_accuracy": 0.8322082757949829, "num_tokens": 246491365.0, "step": 1547 }, { "epoch": 0.7873855544252288, "grad_norm": 1.1251907348632812, "learning_rate": 1e-05, "loss": 0.4793, "mean_token_accuracy": 0.8482698202133179, "num_tokens": 246646398.0, "step": 1548 }, { "epoch": 0.7878942014242116, "grad_norm": 1.0759825706481934, "learning_rate": 1e-05, "loss": 0.5078, "mean_token_accuracy": 0.8396289348602295, "num_tokens": 246806249.0, "step": 1549 }, { "epoch": 0.7884028484231943, "grad_norm": 1.0879809856414795, "learning_rate": 1e-05, "loss": 0.4741, "mean_token_accuracy": 0.848809003829956, "num_tokens": 246957754.0, "step": 1550 }, { "epoch": 0.788911495422177, "grad_norm": 0.964314341545105, "learning_rate": 1e-05, "loss": 0.4633, "mean_token_accuracy": 0.8527604341506958, "num_tokens": 247122388.0, "step": 1551 }, { "epoch": 0.7894201424211598, "grad_norm": 1.0330865383148193, "learning_rate": 1e-05, "loss": 0.4731, "mean_token_accuracy": 0.8512973785400391, "num_tokens": 247284678.0, "step": 1552 }, { "epoch": 0.7899287894201424, "grad_norm": 1.049044132232666, "learning_rate": 1e-05, "loss": 0.5357, "mean_token_accuracy": 0.8345746397972107, "num_tokens": 247442396.0, "step": 1553 }, { "epoch": 0.7904374364191251, "grad_norm": 1.0321537256240845, "learning_rate": 1e-05, "loss": 0.5033, "mean_token_accuracy": 0.8421740531921387, "num_tokens": 247601438.0, "step": 1554 }, { "epoch": 0.7909460834181078, "grad_norm": 1.083871841430664, "learning_rate": 1e-05, "loss": 0.4968, "mean_token_accuracy": 0.8436111211776733, "num_tokens": 247766929.0, "step": 1555 }, { "epoch": 0.7914547304170906, "grad_norm": 1.0228075981140137, "learning_rate": 1e-05, "loss": 0.5253, "mean_token_accuracy": 0.8366875648498535, "num_tokens": 247935788.0, "step": 1556 }, { "epoch": 0.7919633774160733, "grad_norm": 1.103209137916565, "learning_rate": 1e-05, "loss": 0.4918, "mean_token_accuracy": 0.8470031023025513, "num_tokens": 248102197.0, "step": 1557 }, { "epoch": 0.7924720244150559, "grad_norm": 1.0742714405059814, "learning_rate": 1e-05, "loss": 0.4889, "mean_token_accuracy": 0.8469794988632202, "num_tokens": 248252748.0, "step": 1558 }, { "epoch": 0.7929806714140386, "grad_norm": 1.0506287813186646, "learning_rate": 1e-05, "loss": 0.5021, "mean_token_accuracy": 0.8429771661758423, "num_tokens": 248410690.0, "step": 1559 }, { "epoch": 0.7934893184130214, "grad_norm": 1.2658612728118896, "learning_rate": 1e-05, "loss": 0.4979, "mean_token_accuracy": 0.8446970582008362, "num_tokens": 248562629.0, "step": 1560 }, { "epoch": 0.7939979654120041, "grad_norm": 1.0374503135681152, "learning_rate": 1e-05, "loss": 0.529, "mean_token_accuracy": 0.8339532613754272, "num_tokens": 248724426.0, "step": 1561 }, { "epoch": 0.7945066124109867, "grad_norm": 1.109431266784668, "learning_rate": 1e-05, "loss": 0.5269, "mean_token_accuracy": 0.8367703557014465, "num_tokens": 248884711.0, "step": 1562 }, { "epoch": 0.7950152594099695, "grad_norm": 1.0636353492736816, "learning_rate": 1e-05, "loss": 0.5234, "mean_token_accuracy": 0.8372197151184082, "num_tokens": 249039319.0, "step": 1563 }, { "epoch": 0.7955239064089522, "grad_norm": 1.0044738054275513, "learning_rate": 1e-05, "loss": 0.5103, "mean_token_accuracy": 0.8407605290412903, "num_tokens": 249198668.0, "step": 1564 }, { "epoch": 0.7960325534079349, "grad_norm": 1.0944424867630005, "learning_rate": 1e-05, "loss": 0.5106, "mean_token_accuracy": 0.8392775058746338, "num_tokens": 249355386.0, "step": 1565 }, { "epoch": 0.7965412004069176, "grad_norm": 1.0104095935821533, "learning_rate": 1e-05, "loss": 0.5258, "mean_token_accuracy": 0.836336612701416, "num_tokens": 249506726.0, "step": 1566 }, { "epoch": 0.7970498474059003, "grad_norm": 1.019428014755249, "learning_rate": 1e-05, "loss": 0.5437, "mean_token_accuracy": 0.8311398029327393, "num_tokens": 249668433.0, "step": 1567 }, { "epoch": 0.797558494404883, "grad_norm": 1.0521516799926758, "learning_rate": 1e-05, "loss": 0.4922, "mean_token_accuracy": 0.8451628684997559, "num_tokens": 249815595.0, "step": 1568 }, { "epoch": 0.7980671414038657, "grad_norm": 1.0037367343902588, "learning_rate": 1e-05, "loss": 0.5098, "mean_token_accuracy": 0.839537501335144, "num_tokens": 249977103.0, "step": 1569 }, { "epoch": 0.7985757884028484, "grad_norm": 0.9603949189186096, "learning_rate": 1e-05, "loss": 0.5177, "mean_token_accuracy": 0.8383146524429321, "num_tokens": 250145992.0, "step": 1570 }, { "epoch": 0.7990844354018312, "grad_norm": 1.1059879064559937, "learning_rate": 1e-05, "loss": 0.5197, "mean_token_accuracy": 0.837566614151001, "num_tokens": 250280246.0, "step": 1571 }, { "epoch": 0.7995930824008138, "grad_norm": 0.9839898943901062, "learning_rate": 1e-05, "loss": 0.5232, "mean_token_accuracy": 0.8364046812057495, "num_tokens": 250443792.0, "step": 1572 }, { "epoch": 0.8001017293997965, "grad_norm": 0.919985294342041, "learning_rate": 1e-05, "loss": 0.5083, "mean_token_accuracy": 0.8388224840164185, "num_tokens": 250599631.0, "step": 1573 }, { "epoch": 0.8006103763987793, "grad_norm": 1.0868338346481323, "learning_rate": 1e-05, "loss": 0.5352, "mean_token_accuracy": 0.8346595168113708, "num_tokens": 250747877.0, "step": 1574 }, { "epoch": 0.801119023397762, "grad_norm": 1.0660415887832642, "learning_rate": 1e-05, "loss": 0.528, "mean_token_accuracy": 0.8343693614006042, "num_tokens": 250900217.0, "step": 1575 }, { "epoch": 0.8016276703967447, "grad_norm": 1.2554595470428467, "learning_rate": 1e-05, "loss": 0.4897, "mean_token_accuracy": 0.8446955680847168, "num_tokens": 251052502.0, "step": 1576 }, { "epoch": 0.8021363173957273, "grad_norm": 0.9247676134109497, "learning_rate": 1e-05, "loss": 0.4928, "mean_token_accuracy": 0.8457845449447632, "num_tokens": 251208564.0, "step": 1577 }, { "epoch": 0.8026449643947101, "grad_norm": 1.0211766958236694, "learning_rate": 1e-05, "loss": 0.5086, "mean_token_accuracy": 0.8419086933135986, "num_tokens": 251352775.0, "step": 1578 }, { "epoch": 0.8031536113936928, "grad_norm": 0.9540887475013733, "learning_rate": 1e-05, "loss": 0.5056, "mean_token_accuracy": 0.8426491022109985, "num_tokens": 251518171.0, "step": 1579 }, { "epoch": 0.8036622583926755, "grad_norm": 0.9458035826683044, "learning_rate": 1e-05, "loss": 0.5137, "mean_token_accuracy": 0.8412249088287354, "num_tokens": 251677652.0, "step": 1580 }, { "epoch": 0.8041709053916581, "grad_norm": 1.045344352722168, "learning_rate": 1e-05, "loss": 0.5235, "mean_token_accuracy": 0.8355517387390137, "num_tokens": 251821333.0, "step": 1581 }, { "epoch": 0.8046795523906409, "grad_norm": 1.0049554109573364, "learning_rate": 1e-05, "loss": 0.5681, "mean_token_accuracy": 0.8258787989616394, "num_tokens": 251995232.0, "step": 1582 }, { "epoch": 0.8051881993896236, "grad_norm": 0.9822063446044922, "learning_rate": 1e-05, "loss": 0.5013, "mean_token_accuracy": 0.8422402739524841, "num_tokens": 252151382.0, "step": 1583 }, { "epoch": 0.8056968463886063, "grad_norm": 0.9994479417800903, "learning_rate": 1e-05, "loss": 0.5343, "mean_token_accuracy": 0.8352287411689758, "num_tokens": 252318139.0, "step": 1584 }, { "epoch": 0.8062054933875891, "grad_norm": 1.0065675973892212, "learning_rate": 1e-05, "loss": 0.4998, "mean_token_accuracy": 0.8429774641990662, "num_tokens": 252479916.0, "step": 1585 }, { "epoch": 0.8067141403865717, "grad_norm": 0.9369513392448425, "learning_rate": 1e-05, "loss": 0.513, "mean_token_accuracy": 0.8410748839378357, "num_tokens": 252655015.0, "step": 1586 }, { "epoch": 0.8072227873855544, "grad_norm": 0.9758577942848206, "learning_rate": 1e-05, "loss": 0.5066, "mean_token_accuracy": 0.8424795866012573, "num_tokens": 252810669.0, "step": 1587 }, { "epoch": 0.8077314343845371, "grad_norm": 0.9955373406410217, "learning_rate": 1e-05, "loss": 0.5034, "mean_token_accuracy": 0.8429820537567139, "num_tokens": 252975872.0, "step": 1588 }, { "epoch": 0.8082400813835199, "grad_norm": 1.0039629936218262, "learning_rate": 1e-05, "loss": 0.5167, "mean_token_accuracy": 0.838952898979187, "num_tokens": 253140927.0, "step": 1589 }, { "epoch": 0.8087487283825026, "grad_norm": 1.0134005546569824, "learning_rate": 1e-05, "loss": 0.5233, "mean_token_accuracy": 0.8362022638320923, "num_tokens": 253298562.0, "step": 1590 }, { "epoch": 0.8092573753814852, "grad_norm": 0.9585344195365906, "learning_rate": 1e-05, "loss": 0.5182, "mean_token_accuracy": 0.8382187485694885, "num_tokens": 253469245.0, "step": 1591 }, { "epoch": 0.8097660223804679, "grad_norm": 1.0151357650756836, "learning_rate": 1e-05, "loss": 0.4573, "mean_token_accuracy": 0.8557564616203308, "num_tokens": 253623711.0, "step": 1592 }, { "epoch": 0.8102746693794507, "grad_norm": 1.0037392377853394, "learning_rate": 1e-05, "loss": 0.4939, "mean_token_accuracy": 0.8455232381820679, "num_tokens": 253793013.0, "step": 1593 }, { "epoch": 0.8107833163784334, "grad_norm": 1.0920860767364502, "learning_rate": 1e-05, "loss": 0.5494, "mean_token_accuracy": 0.8293622732162476, "num_tokens": 253951728.0, "step": 1594 }, { "epoch": 0.811291963377416, "grad_norm": 1.0275788307189941, "learning_rate": 1e-05, "loss": 0.5097, "mean_token_accuracy": 0.8406323194503784, "num_tokens": 254107635.0, "step": 1595 }, { "epoch": 0.8118006103763988, "grad_norm": 0.9545956254005432, "learning_rate": 1e-05, "loss": 0.5224, "mean_token_accuracy": 0.8365148901939392, "num_tokens": 254256769.0, "step": 1596 }, { "epoch": 0.8123092573753815, "grad_norm": 1.0170985460281372, "learning_rate": 1e-05, "loss": 0.5504, "mean_token_accuracy": 0.8302421569824219, "num_tokens": 254426121.0, "step": 1597 }, { "epoch": 0.8128179043743642, "grad_norm": 0.9216252565383911, "learning_rate": 1e-05, "loss": 0.5147, "mean_token_accuracy": 0.8396792411804199, "num_tokens": 254592698.0, "step": 1598 }, { "epoch": 0.8133265513733469, "grad_norm": 0.9474987983703613, "learning_rate": 1e-05, "loss": 0.519, "mean_token_accuracy": 0.8377450108528137, "num_tokens": 254757687.0, "step": 1599 }, { "epoch": 0.8138351983723296, "grad_norm": 1.0128676891326904, "learning_rate": 1e-05, "loss": 0.5186, "mean_token_accuracy": 0.8377546072006226, "num_tokens": 254907582.0, "step": 1600 }, { "epoch": 0.8143438453713123, "grad_norm": 1.0330684185028076, "learning_rate": 1e-05, "loss": 0.4952, "mean_token_accuracy": 0.8445876836776733, "num_tokens": 255060864.0, "step": 1601 }, { "epoch": 0.814852492370295, "grad_norm": 1.0349136590957642, "learning_rate": 1e-05, "loss": 0.4896, "mean_token_accuracy": 0.8454582691192627, "num_tokens": 255220612.0, "step": 1602 }, { "epoch": 0.8153611393692777, "grad_norm": 1.0863885879516602, "learning_rate": 1e-05, "loss": 0.5662, "mean_token_accuracy": 0.8244870901107788, "num_tokens": 255381676.0, "step": 1603 }, { "epoch": 0.8158697863682605, "grad_norm": 1.0500446557998657, "learning_rate": 1e-05, "loss": 0.4875, "mean_token_accuracy": 0.8489367961883545, "num_tokens": 255530491.0, "step": 1604 }, { "epoch": 0.8163784333672431, "grad_norm": 0.9453365206718445, "learning_rate": 1e-05, "loss": 0.5082, "mean_token_accuracy": 0.8424679040908813, "num_tokens": 255693669.0, "step": 1605 }, { "epoch": 0.8168870803662258, "grad_norm": 1.0558098554611206, "learning_rate": 1e-05, "loss": 0.5168, "mean_token_accuracy": 0.8376903533935547, "num_tokens": 255861693.0, "step": 1606 }, { "epoch": 0.8173957273652085, "grad_norm": 1.0058090686798096, "learning_rate": 1e-05, "loss": 0.5179, "mean_token_accuracy": 0.8368159532546997, "num_tokens": 256028864.0, "step": 1607 }, { "epoch": 0.8179043743641913, "grad_norm": 1.1240429878234863, "learning_rate": 1e-05, "loss": 0.4665, "mean_token_accuracy": 0.8514443635940552, "num_tokens": 256184434.0, "step": 1608 }, { "epoch": 0.818413021363174, "grad_norm": 1.0584617853164673, "learning_rate": 1e-05, "loss": 0.4774, "mean_token_accuracy": 0.8490929007530212, "num_tokens": 256342000.0, "step": 1609 }, { "epoch": 0.8189216683621566, "grad_norm": 1.1094365119934082, "learning_rate": 1e-05, "loss": 0.5219, "mean_token_accuracy": 0.8365220427513123, "num_tokens": 256517073.0, "step": 1610 }, { "epoch": 0.8194303153611394, "grad_norm": 1.046618938446045, "learning_rate": 1e-05, "loss": 0.4842, "mean_token_accuracy": 0.8463455438613892, "num_tokens": 256674055.0, "step": 1611 }, { "epoch": 0.8199389623601221, "grad_norm": 1.098393201828003, "learning_rate": 1e-05, "loss": 0.5381, "mean_token_accuracy": 0.8320057392120361, "num_tokens": 256848111.0, "step": 1612 }, { "epoch": 0.8204476093591048, "grad_norm": 1.0480852127075195, "learning_rate": 1e-05, "loss": 0.4944, "mean_token_accuracy": 0.8455042243003845, "num_tokens": 257016873.0, "step": 1613 }, { "epoch": 0.8209562563580874, "grad_norm": 0.9647024869918823, "learning_rate": 1e-05, "loss": 0.5353, "mean_token_accuracy": 0.8356543779373169, "num_tokens": 257182035.0, "step": 1614 }, { "epoch": 0.8214649033570702, "grad_norm": 1.0326327085494995, "learning_rate": 1e-05, "loss": 0.5481, "mean_token_accuracy": 0.8293918371200562, "num_tokens": 257351888.0, "step": 1615 }, { "epoch": 0.8219735503560529, "grad_norm": 0.9589417576789856, "learning_rate": 1e-05, "loss": 0.5145, "mean_token_accuracy": 0.8386868238449097, "num_tokens": 257508833.0, "step": 1616 }, { "epoch": 0.8224821973550356, "grad_norm": 1.074066400527954, "learning_rate": 1e-05, "loss": 0.4979, "mean_token_accuracy": 0.8440150618553162, "num_tokens": 257666306.0, "step": 1617 }, { "epoch": 0.8229908443540183, "grad_norm": 0.9626802206039429, "learning_rate": 1e-05, "loss": 0.5153, "mean_token_accuracy": 0.8389250636100769, "num_tokens": 257832236.0, "step": 1618 }, { "epoch": 0.823499491353001, "grad_norm": 1.091043472290039, "learning_rate": 1e-05, "loss": 0.488, "mean_token_accuracy": 0.8457168340682983, "num_tokens": 257996091.0, "step": 1619 }, { "epoch": 0.8240081383519837, "grad_norm": 0.9529052972793579, "learning_rate": 1e-05, "loss": 0.5091, "mean_token_accuracy": 0.8395473957061768, "num_tokens": 258156268.0, "step": 1620 }, { "epoch": 0.8245167853509664, "grad_norm": 1.021721601486206, "learning_rate": 1e-05, "loss": 0.5263, "mean_token_accuracy": 0.8362045884132385, "num_tokens": 258323374.0, "step": 1621 }, { "epoch": 0.8250254323499492, "grad_norm": 0.971449077129364, "learning_rate": 1e-05, "loss": 0.5149, "mean_token_accuracy": 0.839565634727478, "num_tokens": 258495850.0, "step": 1622 }, { "epoch": 0.8255340793489319, "grad_norm": 0.9889821410179138, "learning_rate": 1e-05, "loss": 0.5132, "mean_token_accuracy": 0.8389104604721069, "num_tokens": 258653458.0, "step": 1623 }, { "epoch": 0.8260427263479145, "grad_norm": 1.0107471942901611, "learning_rate": 1e-05, "loss": 0.5253, "mean_token_accuracy": 0.8384637832641602, "num_tokens": 258823978.0, "step": 1624 }, { "epoch": 0.8265513733468972, "grad_norm": 1.0081323385238647, "learning_rate": 1e-05, "loss": 0.5071, "mean_token_accuracy": 0.8410070538520813, "num_tokens": 258981414.0, "step": 1625 }, { "epoch": 0.82706002034588, "grad_norm": 0.9704625606536865, "learning_rate": 1e-05, "loss": 0.5111, "mean_token_accuracy": 0.8383285403251648, "num_tokens": 259143758.0, "step": 1626 }, { "epoch": 0.8275686673448627, "grad_norm": 1.0984337329864502, "learning_rate": 1e-05, "loss": 0.5298, "mean_token_accuracy": 0.8355196118354797, "num_tokens": 259297261.0, "step": 1627 }, { "epoch": 0.8280773143438453, "grad_norm": 1.045357584953308, "learning_rate": 1e-05, "loss": 0.5176, "mean_token_accuracy": 0.8388932943344116, "num_tokens": 259459034.0, "step": 1628 }, { "epoch": 0.828585961342828, "grad_norm": 0.9529820084571838, "learning_rate": 1e-05, "loss": 0.4971, "mean_token_accuracy": 0.8412728309631348, "num_tokens": 259614641.0, "step": 1629 }, { "epoch": 0.8290946083418108, "grad_norm": 1.033054232597351, "learning_rate": 1e-05, "loss": 0.4947, "mean_token_accuracy": 0.8445708751678467, "num_tokens": 259769628.0, "step": 1630 }, { "epoch": 0.8296032553407935, "grad_norm": 1.1199381351470947, "learning_rate": 1e-05, "loss": 0.5027, "mean_token_accuracy": 0.8430094718933105, "num_tokens": 259925515.0, "step": 1631 }, { "epoch": 0.8301119023397762, "grad_norm": 1.015918493270874, "learning_rate": 1e-05, "loss": 0.5572, "mean_token_accuracy": 0.8273895978927612, "num_tokens": 260091045.0, "step": 1632 }, { "epoch": 0.830620549338759, "grad_norm": 1.0634037256240845, "learning_rate": 1e-05, "loss": 0.4982, "mean_token_accuracy": 0.8440449237823486, "num_tokens": 260243318.0, "step": 1633 }, { "epoch": 0.8311291963377416, "grad_norm": 1.0427650213241577, "learning_rate": 1e-05, "loss": 0.521, "mean_token_accuracy": 0.838854193687439, "num_tokens": 260414955.0, "step": 1634 }, { "epoch": 0.8316378433367243, "grad_norm": 1.0316675901412964, "learning_rate": 1e-05, "loss": 0.5087, "mean_token_accuracy": 0.8421332836151123, "num_tokens": 260573372.0, "step": 1635 }, { "epoch": 0.832146490335707, "grad_norm": 0.9899135828018188, "learning_rate": 1e-05, "loss": 0.4595, "mean_token_accuracy": 0.853246808052063, "num_tokens": 260738547.0, "step": 1636 }, { "epoch": 0.8326551373346898, "grad_norm": 0.9942594170570374, "learning_rate": 1e-05, "loss": 0.4991, "mean_token_accuracy": 0.8443979620933533, "num_tokens": 260892133.0, "step": 1637 }, { "epoch": 0.8331637843336724, "grad_norm": 1.1206623315811157, "learning_rate": 1e-05, "loss": 0.5298, "mean_token_accuracy": 0.8334402441978455, "num_tokens": 261051308.0, "step": 1638 }, { "epoch": 0.8336724313326551, "grad_norm": 1.0370997190475464, "learning_rate": 1e-05, "loss": 0.5182, "mean_token_accuracy": 0.8363610506057739, "num_tokens": 261205776.0, "step": 1639 }, { "epoch": 0.8341810783316378, "grad_norm": 1.0111100673675537, "learning_rate": 1e-05, "loss": 0.4809, "mean_token_accuracy": 0.8484663963317871, "num_tokens": 261358402.0, "step": 1640 }, { "epoch": 0.8346897253306206, "grad_norm": 1.0508127212524414, "learning_rate": 1e-05, "loss": 0.5326, "mean_token_accuracy": 0.8332227468490601, "num_tokens": 261524731.0, "step": 1641 }, { "epoch": 0.8351983723296033, "grad_norm": 0.9737936854362488, "learning_rate": 1e-05, "loss": 0.5032, "mean_token_accuracy": 0.8433814644813538, "num_tokens": 261687233.0, "step": 1642 }, { "epoch": 0.8357070193285859, "grad_norm": 1.0474128723144531, "learning_rate": 1e-05, "loss": 0.5228, "mean_token_accuracy": 0.8364410400390625, "num_tokens": 261854243.0, "step": 1643 }, { "epoch": 0.8362156663275687, "grad_norm": 0.9719751477241516, "learning_rate": 1e-05, "loss": 0.528, "mean_token_accuracy": 0.8366215229034424, "num_tokens": 262026963.0, "step": 1644 }, { "epoch": 0.8367243133265514, "grad_norm": 0.9536095857620239, "learning_rate": 1e-05, "loss": 0.4901, "mean_token_accuracy": 0.8455750942230225, "num_tokens": 262196156.0, "step": 1645 }, { "epoch": 0.8372329603255341, "grad_norm": 0.9847731590270996, "learning_rate": 1e-05, "loss": 0.5259, "mean_token_accuracy": 0.8377428650856018, "num_tokens": 262365347.0, "step": 1646 }, { "epoch": 0.8377416073245167, "grad_norm": 1.059686541557312, "learning_rate": 1e-05, "loss": 0.5206, "mean_token_accuracy": 0.8395569324493408, "num_tokens": 262519307.0, "step": 1647 }, { "epoch": 0.8382502543234995, "grad_norm": 0.9664161801338196, "learning_rate": 1e-05, "loss": 0.4942, "mean_token_accuracy": 0.843921422958374, "num_tokens": 262673650.0, "step": 1648 }, { "epoch": 0.8387589013224822, "grad_norm": 1.0741627216339111, "learning_rate": 1e-05, "loss": 0.5116, "mean_token_accuracy": 0.8394699096679688, "num_tokens": 262812546.0, "step": 1649 }, { "epoch": 0.8392675483214649, "grad_norm": 1.0161736011505127, "learning_rate": 1e-05, "loss": 0.5391, "mean_token_accuracy": 0.832436203956604, "num_tokens": 262970304.0, "step": 1650 }, { "epoch": 0.8397761953204476, "grad_norm": 1.0764623880386353, "learning_rate": 1e-05, "loss": 0.5357, "mean_token_accuracy": 0.834117591381073, "num_tokens": 263132275.0, "step": 1651 }, { "epoch": 0.8402848423194303, "grad_norm": 1.0434576272964478, "learning_rate": 1e-05, "loss": 0.5113, "mean_token_accuracy": 0.8394972085952759, "num_tokens": 263293537.0, "step": 1652 }, { "epoch": 0.840793489318413, "grad_norm": 0.9257302284240723, "learning_rate": 1e-05, "loss": 0.5123, "mean_token_accuracy": 0.840105414390564, "num_tokens": 263455476.0, "step": 1653 }, { "epoch": 0.8413021363173957, "grad_norm": 0.974478006362915, "learning_rate": 1e-05, "loss": 0.4996, "mean_token_accuracy": 0.8424463868141174, "num_tokens": 263615516.0, "step": 1654 }, { "epoch": 0.8418107833163785, "grad_norm": 1.0634433031082153, "learning_rate": 1e-05, "loss": 0.4987, "mean_token_accuracy": 0.8448903560638428, "num_tokens": 263759112.0, "step": 1655 }, { "epoch": 0.8423194303153612, "grad_norm": 0.922835648059845, "learning_rate": 1e-05, "loss": 0.4973, "mean_token_accuracy": 0.8442589640617371, "num_tokens": 263924035.0, "step": 1656 }, { "epoch": 0.8428280773143438, "grad_norm": 0.9309987425804138, "learning_rate": 1e-05, "loss": 0.5619, "mean_token_accuracy": 0.8253204226493835, "num_tokens": 264096716.0, "step": 1657 }, { "epoch": 0.8433367243133265, "grad_norm": 0.9536299705505371, "learning_rate": 1e-05, "loss": 0.5241, "mean_token_accuracy": 0.8355963826179504, "num_tokens": 264253124.0, "step": 1658 }, { "epoch": 0.8438453713123093, "grad_norm": 0.9557009339332581, "learning_rate": 1e-05, "loss": 0.484, "mean_token_accuracy": 0.8470132946968079, "num_tokens": 264412558.0, "step": 1659 }, { "epoch": 0.844354018311292, "grad_norm": 0.8998320698738098, "learning_rate": 1e-05, "loss": 0.5218, "mean_token_accuracy": 0.837689220905304, "num_tokens": 264578027.0, "step": 1660 }, { "epoch": 0.8448626653102747, "grad_norm": 0.9375623464584351, "learning_rate": 1e-05, "loss": 0.5143, "mean_token_accuracy": 0.8389825224876404, "num_tokens": 264737779.0, "step": 1661 }, { "epoch": 0.8453713123092573, "grad_norm": 1.0092815160751343, "learning_rate": 1e-05, "loss": 0.5244, "mean_token_accuracy": 0.8360074758529663, "num_tokens": 264888054.0, "step": 1662 }, { "epoch": 0.8458799593082401, "grad_norm": 0.9785453677177429, "learning_rate": 1e-05, "loss": 0.5214, "mean_token_accuracy": 0.837114155292511, "num_tokens": 265048672.0, "step": 1663 }, { "epoch": 0.8463886063072228, "grad_norm": 3.0526108741760254, "learning_rate": 1e-05, "loss": 0.5065, "mean_token_accuracy": 0.8422542810440063, "num_tokens": 265191225.0, "step": 1664 }, { "epoch": 0.8468972533062055, "grad_norm": 1.1187739372253418, "learning_rate": 1e-05, "loss": 0.5326, "mean_token_accuracy": 0.8355621695518494, "num_tokens": 265357998.0, "step": 1665 }, { "epoch": 0.8474059003051883, "grad_norm": 0.957465648651123, "learning_rate": 1e-05, "loss": 0.5054, "mean_token_accuracy": 0.8424095511436462, "num_tokens": 265529823.0, "step": 1666 }, { "epoch": 0.8479145473041709, "grad_norm": 1.0484718084335327, "learning_rate": 1e-05, "loss": 0.5208, "mean_token_accuracy": 0.8372833728790283, "num_tokens": 265683573.0, "step": 1667 }, { "epoch": 0.8484231943031536, "grad_norm": 0.9759782552719116, "learning_rate": 1e-05, "loss": 0.4765, "mean_token_accuracy": 0.8503193259239197, "num_tokens": 265845965.0, "step": 1668 }, { "epoch": 0.8489318413021363, "grad_norm": 1.09355628490448, "learning_rate": 1e-05, "loss": 0.5445, "mean_token_accuracy": 0.8312594890594482, "num_tokens": 265991385.0, "step": 1669 }, { "epoch": 0.8494404883011191, "grad_norm": 1.0344514846801758, "learning_rate": 1e-05, "loss": 0.5187, "mean_token_accuracy": 0.8376867175102234, "num_tokens": 266161615.0, "step": 1670 }, { "epoch": 0.8499491353001017, "grad_norm": 1.143629789352417, "learning_rate": 1e-05, "loss": 0.5329, "mean_token_accuracy": 0.8344286680221558, "num_tokens": 266310118.0, "step": 1671 }, { "epoch": 0.8504577822990844, "grad_norm": 1.0117353200912476, "learning_rate": 1e-05, "loss": 0.4985, "mean_token_accuracy": 0.8438706398010254, "num_tokens": 266462436.0, "step": 1672 }, { "epoch": 0.8509664292980671, "grad_norm": 1.0194602012634277, "learning_rate": 1e-05, "loss": 0.5075, "mean_token_accuracy": 0.8406510353088379, "num_tokens": 266613253.0, "step": 1673 }, { "epoch": 0.8514750762970499, "grad_norm": 1.0524338483810425, "learning_rate": 1e-05, "loss": 0.5141, "mean_token_accuracy": 0.839398205280304, "num_tokens": 266773293.0, "step": 1674 }, { "epoch": 0.8519837232960326, "grad_norm": 1.0298408269882202, "learning_rate": 1e-05, "loss": 0.5216, "mean_token_accuracy": 0.8359988927841187, "num_tokens": 266935261.0, "step": 1675 }, { "epoch": 0.8524923702950152, "grad_norm": 1.1193722486495972, "learning_rate": 1e-05, "loss": 0.5277, "mean_token_accuracy": 0.8361387252807617, "num_tokens": 267100828.0, "step": 1676 }, { "epoch": 0.853001017293998, "grad_norm": 1.1007781028747559, "learning_rate": 1e-05, "loss": 0.5152, "mean_token_accuracy": 0.8386905789375305, "num_tokens": 267269826.0, "step": 1677 }, { "epoch": 0.8535096642929807, "grad_norm": 0.9150673747062683, "learning_rate": 1e-05, "loss": 0.4983, "mean_token_accuracy": 0.8425648212432861, "num_tokens": 267429972.0, "step": 1678 }, { "epoch": 0.8540183112919634, "grad_norm": 1.072229027748108, "learning_rate": 1e-05, "loss": 0.5125, "mean_token_accuracy": 0.8400737643241882, "num_tokens": 267590240.0, "step": 1679 }, { "epoch": 0.854526958290946, "grad_norm": 0.9825935363769531, "learning_rate": 1e-05, "loss": 0.4731, "mean_token_accuracy": 0.8503822088241577, "num_tokens": 267748631.0, "step": 1680 }, { "epoch": 0.8550356052899288, "grad_norm": 0.9460796117782593, "learning_rate": 1e-05, "loss": 0.5178, "mean_token_accuracy": 0.8373454213142395, "num_tokens": 267906365.0, "step": 1681 }, { "epoch": 0.8555442522889115, "grad_norm": 0.9841094613075256, "learning_rate": 1e-05, "loss": 0.4899, "mean_token_accuracy": 0.8446120023727417, "num_tokens": 268058483.0, "step": 1682 }, { "epoch": 0.8560528992878942, "grad_norm": 1.0456801652908325, "learning_rate": 1e-05, "loss": 0.4919, "mean_token_accuracy": 0.8460915684700012, "num_tokens": 268217173.0, "step": 1683 }, { "epoch": 0.8565615462868769, "grad_norm": 0.9893629550933838, "learning_rate": 1e-05, "loss": 0.5249, "mean_token_accuracy": 0.8375509977340698, "num_tokens": 268382188.0, "step": 1684 }, { "epoch": 0.8570701932858596, "grad_norm": 1.030446171760559, "learning_rate": 1e-05, "loss": 0.5088, "mean_token_accuracy": 0.8399752378463745, "num_tokens": 268543730.0, "step": 1685 }, { "epoch": 0.8575788402848423, "grad_norm": 1.0455074310302734, "learning_rate": 1e-05, "loss": 0.4968, "mean_token_accuracy": 0.8435068130493164, "num_tokens": 268701052.0, "step": 1686 }, { "epoch": 0.858087487283825, "grad_norm": 1.0250892639160156, "learning_rate": 1e-05, "loss": 0.4961, "mean_token_accuracy": 0.843188464641571, "num_tokens": 268852461.0, "step": 1687 }, { "epoch": 0.8585961342828077, "grad_norm": 1.1781569719314575, "learning_rate": 1e-05, "loss": 0.4759, "mean_token_accuracy": 0.8500210642814636, "num_tokens": 268996708.0, "step": 1688 }, { "epoch": 0.8591047812817905, "grad_norm": 1.1067460775375366, "learning_rate": 1e-05, "loss": 0.5362, "mean_token_accuracy": 0.8340690732002258, "num_tokens": 269146990.0, "step": 1689 }, { "epoch": 0.8596134282807731, "grad_norm": 1.0462528467178345, "learning_rate": 1e-05, "loss": 0.473, "mean_token_accuracy": 0.8502258062362671, "num_tokens": 269311951.0, "step": 1690 }, { "epoch": 0.8601220752797558, "grad_norm": 1.0539947748184204, "learning_rate": 1e-05, "loss": 0.5156, "mean_token_accuracy": 0.838455080986023, "num_tokens": 269450985.0, "step": 1691 }, { "epoch": 0.8606307222787386, "grad_norm": 1.1058789491653442, "learning_rate": 1e-05, "loss": 0.4725, "mean_token_accuracy": 0.8504195213317871, "num_tokens": 269598930.0, "step": 1692 }, { "epoch": 0.8611393692777213, "grad_norm": 1.2138804197311401, "learning_rate": 1e-05, "loss": 0.5342, "mean_token_accuracy": 0.8357694745063782, "num_tokens": 269760034.0, "step": 1693 }, { "epoch": 0.861648016276704, "grad_norm": 1.0645147562026978, "learning_rate": 1e-05, "loss": 0.5232, "mean_token_accuracy": 0.8370406627655029, "num_tokens": 269921692.0, "step": 1694 }, { "epoch": 0.8621566632756866, "grad_norm": 1.1063612699508667, "learning_rate": 1e-05, "loss": 0.4897, "mean_token_accuracy": 0.8471656441688538, "num_tokens": 270073024.0, "step": 1695 }, { "epoch": 0.8626653102746694, "grad_norm": 1.1592669486999512, "learning_rate": 1e-05, "loss": 0.5333, "mean_token_accuracy": 0.8345301151275635, "num_tokens": 270232620.0, "step": 1696 }, { "epoch": 0.8631739572736521, "grad_norm": 1.076188325881958, "learning_rate": 1e-05, "loss": 0.5072, "mean_token_accuracy": 0.8404670357704163, "num_tokens": 270385773.0, "step": 1697 }, { "epoch": 0.8636826042726348, "grad_norm": 0.9822161793708801, "learning_rate": 1e-05, "loss": 0.5015, "mean_token_accuracy": 0.8447793126106262, "num_tokens": 270560619.0, "step": 1698 }, { "epoch": 0.8641912512716174, "grad_norm": 1.886228084564209, "learning_rate": 1e-05, "loss": 0.5253, "mean_token_accuracy": 0.8372279405593872, "num_tokens": 270719944.0, "step": 1699 }, { "epoch": 0.8646998982706002, "grad_norm": 1.1022945642471313, "learning_rate": 1e-05, "loss": 0.4961, "mean_token_accuracy": 0.8436857461929321, "num_tokens": 270874728.0, "step": 1700 }, { "epoch": 0.8652085452695829, "grad_norm": 1.010673999786377, "learning_rate": 1e-05, "loss": 0.5085, "mean_token_accuracy": 0.8426359295845032, "num_tokens": 271036953.0, "step": 1701 }, { "epoch": 0.8657171922685656, "grad_norm": 0.9527826905250549, "learning_rate": 1e-05, "loss": 0.5034, "mean_token_accuracy": 0.8433707356452942, "num_tokens": 271202782.0, "step": 1702 }, { "epoch": 0.8662258392675484, "grad_norm": 1.014635682106018, "learning_rate": 1e-05, "loss": 0.5183, "mean_token_accuracy": 0.8393032550811768, "num_tokens": 271367311.0, "step": 1703 }, { "epoch": 0.866734486266531, "grad_norm": 1.0066399574279785, "learning_rate": 1e-05, "loss": 0.5139, "mean_token_accuracy": 0.839614748954773, "num_tokens": 271529123.0, "step": 1704 }, { "epoch": 0.8672431332655137, "grad_norm": 1.093875765800476, "learning_rate": 1e-05, "loss": 0.4982, "mean_token_accuracy": 0.844380259513855, "num_tokens": 271673733.0, "step": 1705 }, { "epoch": 0.8677517802644964, "grad_norm": 1.0652796030044556, "learning_rate": 1e-05, "loss": 0.4892, "mean_token_accuracy": 0.8460615873336792, "num_tokens": 271828281.0, "step": 1706 }, { "epoch": 0.8682604272634792, "grad_norm": 1.0834511518478394, "learning_rate": 1e-05, "loss": 0.5299, "mean_token_accuracy": 0.8352327346801758, "num_tokens": 271997170.0, "step": 1707 }, { "epoch": 0.8687690742624619, "grad_norm": 1.0391765832901, "learning_rate": 1e-05, "loss": 0.4862, "mean_token_accuracy": 0.8471356630325317, "num_tokens": 272152291.0, "step": 1708 }, { "epoch": 0.8692777212614445, "grad_norm": 1.064730167388916, "learning_rate": 1e-05, "loss": 0.5366, "mean_token_accuracy": 0.8317385315895081, "num_tokens": 272307347.0, "step": 1709 }, { "epoch": 0.8697863682604272, "grad_norm": 1.0805742740631104, "learning_rate": 1e-05, "loss": 0.5291, "mean_token_accuracy": 0.8354371786117554, "num_tokens": 272469049.0, "step": 1710 }, { "epoch": 0.87029501525941, "grad_norm": 1.0154848098754883, "learning_rate": 1e-05, "loss": 0.527, "mean_token_accuracy": 0.8360839486122131, "num_tokens": 272612517.0, "step": 1711 }, { "epoch": 0.8708036622583927, "grad_norm": 1.0521576404571533, "learning_rate": 1e-05, "loss": 0.543, "mean_token_accuracy": 0.831218957901001, "num_tokens": 272768408.0, "step": 1712 }, { "epoch": 0.8713123092573754, "grad_norm": 0.9749472141265869, "learning_rate": 1e-05, "loss": 0.5158, "mean_token_accuracy": 0.8387061953544617, "num_tokens": 272919347.0, "step": 1713 }, { "epoch": 0.8718209562563581, "grad_norm": 1.0825953483581543, "learning_rate": 1e-05, "loss": 0.5007, "mean_token_accuracy": 0.8432174921035767, "num_tokens": 273089732.0, "step": 1714 }, { "epoch": 0.8723296032553408, "grad_norm": 1.0068557262420654, "learning_rate": 1e-05, "loss": 0.4934, "mean_token_accuracy": 0.8477575778961182, "num_tokens": 273242780.0, "step": 1715 }, { "epoch": 0.8728382502543235, "grad_norm": 0.9401190280914307, "learning_rate": 1e-05, "loss": 0.4991, "mean_token_accuracy": 0.843682050704956, "num_tokens": 273407433.0, "step": 1716 }, { "epoch": 0.8733468972533062, "grad_norm": 0.9649401903152466, "learning_rate": 1e-05, "loss": 0.5146, "mean_token_accuracy": 0.8399534225463867, "num_tokens": 273570166.0, "step": 1717 }, { "epoch": 0.873855544252289, "grad_norm": 0.9992680549621582, "learning_rate": 1e-05, "loss": 0.5289, "mean_token_accuracy": 0.8345081210136414, "num_tokens": 273726324.0, "step": 1718 }, { "epoch": 0.8743641912512716, "grad_norm": 1.0180540084838867, "learning_rate": 1e-05, "loss": 0.5199, "mean_token_accuracy": 0.8366790413856506, "num_tokens": 273890563.0, "step": 1719 }, { "epoch": 0.8748728382502543, "grad_norm": 0.9962469339370728, "learning_rate": 1e-05, "loss": 0.4778, "mean_token_accuracy": 0.8489860892295837, "num_tokens": 274048187.0, "step": 1720 }, { "epoch": 0.875381485249237, "grad_norm": 0.9790189266204834, "learning_rate": 1e-05, "loss": 0.4766, "mean_token_accuracy": 0.8492850065231323, "num_tokens": 274221855.0, "step": 1721 }, { "epoch": 0.8758901322482198, "grad_norm": 0.9516292810440063, "learning_rate": 1e-05, "loss": 0.5212, "mean_token_accuracy": 0.8369489908218384, "num_tokens": 274377833.0, "step": 1722 }, { "epoch": 0.8763987792472024, "grad_norm": 0.9753563404083252, "learning_rate": 1e-05, "loss": 0.5356, "mean_token_accuracy": 0.8327175378799438, "num_tokens": 274541936.0, "step": 1723 }, { "epoch": 0.8769074262461851, "grad_norm": 0.9900280237197876, "learning_rate": 1e-05, "loss": 0.5311, "mean_token_accuracy": 0.8332983255386353, "num_tokens": 274705006.0, "step": 1724 }, { "epoch": 0.8774160732451679, "grad_norm": 1.0057647228240967, "learning_rate": 1e-05, "loss": 0.5025, "mean_token_accuracy": 0.8407544493675232, "num_tokens": 274854136.0, "step": 1725 }, { "epoch": 0.8779247202441506, "grad_norm": 0.9810066223144531, "learning_rate": 1e-05, "loss": 0.491, "mean_token_accuracy": 0.8476090431213379, "num_tokens": 275014059.0, "step": 1726 }, { "epoch": 0.8784333672431333, "grad_norm": 1.0328130722045898, "learning_rate": 1e-05, "loss": 0.5533, "mean_token_accuracy": 0.8297367095947266, "num_tokens": 275187361.0, "step": 1727 }, { "epoch": 0.8789420142421159, "grad_norm": 0.9669415354728699, "learning_rate": 1e-05, "loss": 0.4991, "mean_token_accuracy": 0.8440873622894287, "num_tokens": 275338869.0, "step": 1728 }, { "epoch": 0.8794506612410987, "grad_norm": 0.9246980547904968, "learning_rate": 1e-05, "loss": 0.5048, "mean_token_accuracy": 0.8422975540161133, "num_tokens": 275496774.0, "step": 1729 }, { "epoch": 0.8799593082400814, "grad_norm": 1.1389602422714233, "learning_rate": 1e-05, "loss": 0.5346, "mean_token_accuracy": 0.8315758109092712, "num_tokens": 275653401.0, "step": 1730 }, { "epoch": 0.8804679552390641, "grad_norm": 1.0125043392181396, "learning_rate": 1e-05, "loss": 0.4926, "mean_token_accuracy": 0.8451186418533325, "num_tokens": 275806070.0, "step": 1731 }, { "epoch": 0.8809766022380467, "grad_norm": 0.9579908847808838, "learning_rate": 1e-05, "loss": 0.5061, "mean_token_accuracy": 0.8411065340042114, "num_tokens": 275962373.0, "step": 1732 }, { "epoch": 0.8814852492370295, "grad_norm": 1.0477601289749146, "learning_rate": 1e-05, "loss": 0.5127, "mean_token_accuracy": 0.8413093686103821, "num_tokens": 276129118.0, "step": 1733 }, { "epoch": 0.8819938962360122, "grad_norm": 0.9794126749038696, "learning_rate": 1e-05, "loss": 0.5198, "mean_token_accuracy": 0.8378043174743652, "num_tokens": 276287064.0, "step": 1734 }, { "epoch": 0.8825025432349949, "grad_norm": 1.0100884437561035, "learning_rate": 1e-05, "loss": 0.4977, "mean_token_accuracy": 0.844001054763794, "num_tokens": 276457678.0, "step": 1735 }, { "epoch": 0.8830111902339777, "grad_norm": 1.052374005317688, "learning_rate": 1e-05, "loss": 0.5299, "mean_token_accuracy": 0.8356032371520996, "num_tokens": 276610916.0, "step": 1736 }, { "epoch": 0.8835198372329603, "grad_norm": 0.9559528827667236, "learning_rate": 1e-05, "loss": 0.4894, "mean_token_accuracy": 0.8479528427124023, "num_tokens": 276764925.0, "step": 1737 }, { "epoch": 0.884028484231943, "grad_norm": 1.052488088607788, "learning_rate": 1e-05, "loss": 0.5685, "mean_token_accuracy": 0.8241016864776611, "num_tokens": 276928368.0, "step": 1738 }, { "epoch": 0.8845371312309257, "grad_norm": 0.9992378950119019, "learning_rate": 1e-05, "loss": 0.4906, "mean_token_accuracy": 0.8465350270271301, "num_tokens": 277094718.0, "step": 1739 }, { "epoch": 0.8850457782299085, "grad_norm": 0.9928638935089111, "learning_rate": 1e-05, "loss": 0.4951, "mean_token_accuracy": 0.8433694839477539, "num_tokens": 277246859.0, "step": 1740 }, { "epoch": 0.8855544252288912, "grad_norm": 0.923547625541687, "learning_rate": 1e-05, "loss": 0.5224, "mean_token_accuracy": 0.8368440866470337, "num_tokens": 277414068.0, "step": 1741 }, { "epoch": 0.8860630722278738, "grad_norm": 0.986189603805542, "learning_rate": 1e-05, "loss": 0.5046, "mean_token_accuracy": 0.8419417142868042, "num_tokens": 277573295.0, "step": 1742 }, { "epoch": 0.8865717192268565, "grad_norm": 1.0184423923492432, "learning_rate": 1e-05, "loss": 0.5186, "mean_token_accuracy": 0.8393261432647705, "num_tokens": 277726913.0, "step": 1743 }, { "epoch": 0.8870803662258393, "grad_norm": 0.9636666774749756, "learning_rate": 1e-05, "loss": 0.4865, "mean_token_accuracy": 0.8466484546661377, "num_tokens": 277877252.0, "step": 1744 }, { "epoch": 0.887589013224822, "grad_norm": 1.0365499258041382, "learning_rate": 1e-05, "loss": 0.5315, "mean_token_accuracy": 0.8329277634620667, "num_tokens": 278034976.0, "step": 1745 }, { "epoch": 0.8880976602238047, "grad_norm": 0.9787980914115906, "learning_rate": 1e-05, "loss": 0.472, "mean_token_accuracy": 0.8506600260734558, "num_tokens": 278194339.0, "step": 1746 }, { "epoch": 0.8886063072227874, "grad_norm": 0.9673038721084595, "learning_rate": 1e-05, "loss": 0.5062, "mean_token_accuracy": 0.8403258323669434, "num_tokens": 278352150.0, "step": 1747 }, { "epoch": 0.8891149542217701, "grad_norm": 1.0336358547210693, "learning_rate": 1e-05, "loss": 0.5287, "mean_token_accuracy": 0.8341023921966553, "num_tokens": 278516575.0, "step": 1748 }, { "epoch": 0.8896236012207528, "grad_norm": 0.9779040813446045, "learning_rate": 1e-05, "loss": 0.5282, "mean_token_accuracy": 0.8354268670082092, "num_tokens": 278687157.0, "step": 1749 }, { "epoch": 0.8901322482197355, "grad_norm": 1.0056418180465698, "learning_rate": 1e-05, "loss": 0.5272, "mean_token_accuracy": 0.8347800374031067, "num_tokens": 278844290.0, "step": 1750 }, { "epoch": 0.8906408952187183, "grad_norm": 1.0253515243530273, "learning_rate": 1e-05, "loss": 0.5115, "mean_token_accuracy": 0.8392736315727234, "num_tokens": 279002276.0, "step": 1751 }, { "epoch": 0.8911495422177009, "grad_norm": 0.9850518703460693, "learning_rate": 1e-05, "loss": 0.5256, "mean_token_accuracy": 0.8362544775009155, "num_tokens": 279161918.0, "step": 1752 }, { "epoch": 0.8916581892166836, "grad_norm": 1.0186904668807983, "learning_rate": 1e-05, "loss": 0.524, "mean_token_accuracy": 0.8351168036460876, "num_tokens": 279321930.0, "step": 1753 }, { "epoch": 0.8921668362156663, "grad_norm": 1.041852355003357, "learning_rate": 1e-05, "loss": 0.5296, "mean_token_accuracy": 0.8372722864151001, "num_tokens": 279481295.0, "step": 1754 }, { "epoch": 0.8926754832146491, "grad_norm": 0.9848358631134033, "learning_rate": 1e-05, "loss": 0.5179, "mean_token_accuracy": 0.8382662534713745, "num_tokens": 279633071.0, "step": 1755 }, { "epoch": 0.8931841302136317, "grad_norm": 1.0203869342803955, "learning_rate": 1e-05, "loss": 0.4843, "mean_token_accuracy": 0.8476390838623047, "num_tokens": 279800329.0, "step": 1756 }, { "epoch": 0.8936927772126144, "grad_norm": 0.9973755478858948, "learning_rate": 1e-05, "loss": 0.4773, "mean_token_accuracy": 0.8494220972061157, "num_tokens": 279944707.0, "step": 1757 }, { "epoch": 0.8942014242115972, "grad_norm": 0.9391854405403137, "learning_rate": 1e-05, "loss": 0.5, "mean_token_accuracy": 0.8429989814758301, "num_tokens": 280111037.0, "step": 1758 }, { "epoch": 0.8947100712105799, "grad_norm": 1.0117913484573364, "learning_rate": 1e-05, "loss": 0.5004, "mean_token_accuracy": 0.8444038033485413, "num_tokens": 280258749.0, "step": 1759 }, { "epoch": 0.8952187182095626, "grad_norm": 0.9810928702354431, "learning_rate": 1e-05, "loss": 0.5124, "mean_token_accuracy": 0.8384989500045776, "num_tokens": 280405852.0, "step": 1760 }, { "epoch": 0.8957273652085452, "grad_norm": 0.9981587529182434, "learning_rate": 1e-05, "loss": 0.5108, "mean_token_accuracy": 0.8404662609100342, "num_tokens": 280580217.0, "step": 1761 }, { "epoch": 0.896236012207528, "grad_norm": 1.026110053062439, "learning_rate": 1e-05, "loss": 0.5284, "mean_token_accuracy": 0.8371784687042236, "num_tokens": 280727104.0, "step": 1762 }, { "epoch": 0.8967446592065107, "grad_norm": 1.0082656145095825, "learning_rate": 1e-05, "loss": 0.485, "mean_token_accuracy": 0.8476098775863647, "num_tokens": 280889043.0, "step": 1763 }, { "epoch": 0.8972533062054934, "grad_norm": 1.0548858642578125, "learning_rate": 1e-05, "loss": 0.5011, "mean_token_accuracy": 0.8431612849235535, "num_tokens": 281043942.0, "step": 1764 }, { "epoch": 0.897761953204476, "grad_norm": 1.0609831809997559, "learning_rate": 1e-05, "loss": 0.5155, "mean_token_accuracy": 0.8397126197814941, "num_tokens": 281216118.0, "step": 1765 }, { "epoch": 0.8982706002034588, "grad_norm": 1.008461594581604, "learning_rate": 1e-05, "loss": 0.5126, "mean_token_accuracy": 0.8402866125106812, "num_tokens": 281374849.0, "step": 1766 }, { "epoch": 0.8987792472024415, "grad_norm": 1.030743956565857, "learning_rate": 1e-05, "loss": 0.5023, "mean_token_accuracy": 0.8423265218734741, "num_tokens": 281528478.0, "step": 1767 }, { "epoch": 0.8992878942014242, "grad_norm": 1.0897209644317627, "learning_rate": 1e-05, "loss": 0.505, "mean_token_accuracy": 0.8422761559486389, "num_tokens": 281681982.0, "step": 1768 }, { "epoch": 0.8997965412004069, "grad_norm": 0.992885172367096, "learning_rate": 1e-05, "loss": 0.492, "mean_token_accuracy": 0.8450870513916016, "num_tokens": 281846049.0, "step": 1769 }, { "epoch": 0.9003051881993896, "grad_norm": 1.098056674003601, "learning_rate": 1e-05, "loss": 0.5516, "mean_token_accuracy": 0.8283272981643677, "num_tokens": 282019096.0, "step": 1770 }, { "epoch": 0.9008138351983723, "grad_norm": 1.0340752601623535, "learning_rate": 1e-05, "loss": 0.4885, "mean_token_accuracy": 0.846498966217041, "num_tokens": 282196142.0, "step": 1771 }, { "epoch": 0.901322482197355, "grad_norm": 0.9638667702674866, "learning_rate": 1e-05, "loss": 0.5213, "mean_token_accuracy": 0.8388038277626038, "num_tokens": 282360553.0, "step": 1772 }, { "epoch": 0.9018311291963378, "grad_norm": 1.0786890983581543, "learning_rate": 1e-05, "loss": 0.4915, "mean_token_accuracy": 0.8466812372207642, "num_tokens": 282527151.0, "step": 1773 }, { "epoch": 0.9023397761953205, "grad_norm": 1.0417206287384033, "learning_rate": 1e-05, "loss": 0.5355, "mean_token_accuracy": 0.8339139223098755, "num_tokens": 282674995.0, "step": 1774 }, { "epoch": 0.9028484231943031, "grad_norm": 1.0728365182876587, "learning_rate": 1e-05, "loss": 0.4639, "mean_token_accuracy": 0.852975070476532, "num_tokens": 282832990.0, "step": 1775 }, { "epoch": 0.9033570701932858, "grad_norm": 1.0209739208221436, "learning_rate": 1e-05, "loss": 0.5316, "mean_token_accuracy": 0.8355213403701782, "num_tokens": 282997592.0, "step": 1776 }, { "epoch": 0.9038657171922686, "grad_norm": 1.114395022392273, "learning_rate": 1e-05, "loss": 0.5097, "mean_token_accuracy": 0.8406573534011841, "num_tokens": 283155838.0, "step": 1777 }, { "epoch": 0.9043743641912513, "grad_norm": 0.9711954593658447, "learning_rate": 1e-05, "loss": 0.4673, "mean_token_accuracy": 0.8512140512466431, "num_tokens": 283307527.0, "step": 1778 }, { "epoch": 0.904883011190234, "grad_norm": 0.9695219397544861, "learning_rate": 1e-05, "loss": 0.5267, "mean_token_accuracy": 0.8353557586669922, "num_tokens": 283465966.0, "step": 1779 }, { "epoch": 0.9053916581892166, "grad_norm": 1.0465660095214844, "learning_rate": 1e-05, "loss": 0.5272, "mean_token_accuracy": 0.8351324796676636, "num_tokens": 283620377.0, "step": 1780 }, { "epoch": 0.9059003051881994, "grad_norm": 0.9748997688293457, "learning_rate": 1e-05, "loss": 0.4877, "mean_token_accuracy": 0.8453119993209839, "num_tokens": 283777695.0, "step": 1781 }, { "epoch": 0.9064089521871821, "grad_norm": 1.0441025495529175, "learning_rate": 1e-05, "loss": 0.5047, "mean_token_accuracy": 0.8432708978652954, "num_tokens": 283931044.0, "step": 1782 }, { "epoch": 0.9069175991861648, "grad_norm": 1.090386152267456, "learning_rate": 1e-05, "loss": 0.4958, "mean_token_accuracy": 0.8454595804214478, "num_tokens": 284089124.0, "step": 1783 }, { "epoch": 0.9074262461851476, "grad_norm": 0.9559053182601929, "learning_rate": 1e-05, "loss": 0.528, "mean_token_accuracy": 0.8353322744369507, "num_tokens": 284248921.0, "step": 1784 }, { "epoch": 0.9079348931841302, "grad_norm": 1.0538196563720703, "learning_rate": 1e-05, "loss": 0.4949, "mean_token_accuracy": 0.8436644673347473, "num_tokens": 284406295.0, "step": 1785 }, { "epoch": 0.9084435401831129, "grad_norm": 1.0404249429702759, "learning_rate": 1e-05, "loss": 0.4647, "mean_token_accuracy": 0.8512811660766602, "num_tokens": 284570062.0, "step": 1786 }, { "epoch": 0.9089521871820956, "grad_norm": 1.0365242958068848, "learning_rate": 1e-05, "loss": 0.5121, "mean_token_accuracy": 0.8398833274841309, "num_tokens": 284731919.0, "step": 1787 }, { "epoch": 0.9094608341810784, "grad_norm": 0.9548749923706055, "learning_rate": 1e-05, "loss": 0.5237, "mean_token_accuracy": 0.8365575075149536, "num_tokens": 284892788.0, "step": 1788 }, { "epoch": 0.909969481180061, "grad_norm": 1.0997098684310913, "learning_rate": 1e-05, "loss": 0.5303, "mean_token_accuracy": 0.8356540203094482, "num_tokens": 285053351.0, "step": 1789 }, { "epoch": 0.9104781281790437, "grad_norm": 0.9898800849914551, "learning_rate": 1e-05, "loss": 0.5156, "mean_token_accuracy": 0.8391681909561157, "num_tokens": 285203700.0, "step": 1790 }, { "epoch": 0.9109867751780264, "grad_norm": 0.9842552542686462, "learning_rate": 1e-05, "loss": 0.5134, "mean_token_accuracy": 0.8393641710281372, "num_tokens": 285375982.0, "step": 1791 }, { "epoch": 0.9114954221770092, "grad_norm": 0.9427465796470642, "learning_rate": 1e-05, "loss": 0.5069, "mean_token_accuracy": 0.8418722748756409, "num_tokens": 285550546.0, "step": 1792 }, { "epoch": 0.9120040691759919, "grad_norm": 1.0155222415924072, "learning_rate": 1e-05, "loss": 0.517, "mean_token_accuracy": 0.8391909003257751, "num_tokens": 285699042.0, "step": 1793 }, { "epoch": 0.9125127161749745, "grad_norm": 1.0051095485687256, "learning_rate": 1e-05, "loss": 0.5109, "mean_token_accuracy": 0.8380466103553772, "num_tokens": 285845604.0, "step": 1794 }, { "epoch": 0.9130213631739573, "grad_norm": 1.6794397830963135, "learning_rate": 1e-05, "loss": 0.5147, "mean_token_accuracy": 0.8399105668067932, "num_tokens": 285989582.0, "step": 1795 }, { "epoch": 0.91353001017294, "grad_norm": 1.0809688568115234, "learning_rate": 1e-05, "loss": 0.5238, "mean_token_accuracy": 0.8375011682510376, "num_tokens": 286144853.0, "step": 1796 }, { "epoch": 0.9140386571719227, "grad_norm": 1.0487751960754395, "learning_rate": 1e-05, "loss": 0.5093, "mean_token_accuracy": 0.8402568697929382, "num_tokens": 286306136.0, "step": 1797 }, { "epoch": 0.9145473041709054, "grad_norm": 0.9961705803871155, "learning_rate": 1e-05, "loss": 0.5113, "mean_token_accuracy": 0.8373109698295593, "num_tokens": 286458777.0, "step": 1798 }, { "epoch": 0.9150559511698881, "grad_norm": 0.9926751255989075, "learning_rate": 1e-05, "loss": 0.5173, "mean_token_accuracy": 0.8379104137420654, "num_tokens": 286618373.0, "step": 1799 }, { "epoch": 0.9155645981688708, "grad_norm": 1.060032606124878, "learning_rate": 1e-05, "loss": 0.5038, "mean_token_accuracy": 0.8424394130706787, "num_tokens": 286774374.0, "step": 1800 }, { "epoch": 0.9160732451678535, "grad_norm": 0.9973428249359131, "learning_rate": 1e-05, "loss": 0.4875, "mean_token_accuracy": 0.8459199666976929, "num_tokens": 286926506.0, "step": 1801 }, { "epoch": 0.9165818921668362, "grad_norm": 1.0062426328659058, "learning_rate": 1e-05, "loss": 0.5365, "mean_token_accuracy": 0.832974374294281, "num_tokens": 287085779.0, "step": 1802 }, { "epoch": 0.917090539165819, "grad_norm": 1.040196418762207, "learning_rate": 1e-05, "loss": 0.5065, "mean_token_accuracy": 0.8405765295028687, "num_tokens": 287253165.0, "step": 1803 }, { "epoch": 0.9175991861648016, "grad_norm": 0.997842013835907, "learning_rate": 1e-05, "loss": 0.5324, "mean_token_accuracy": 0.8338080048561096, "num_tokens": 287405700.0, "step": 1804 }, { "epoch": 0.9181078331637843, "grad_norm": 0.9390808343887329, "learning_rate": 1e-05, "loss": 0.4715, "mean_token_accuracy": 0.8508753180503845, "num_tokens": 287551897.0, "step": 1805 }, { "epoch": 0.9186164801627671, "grad_norm": 1.057931900024414, "learning_rate": 1e-05, "loss": 0.5091, "mean_token_accuracy": 0.839033842086792, "num_tokens": 287716716.0, "step": 1806 }, { "epoch": 0.9191251271617498, "grad_norm": 1.0124062299728394, "learning_rate": 1e-05, "loss": 0.5435, "mean_token_accuracy": 0.8313062787055969, "num_tokens": 287885440.0, "step": 1807 }, { "epoch": 0.9196337741607324, "grad_norm": 0.9651282429695129, "learning_rate": 1e-05, "loss": 0.4837, "mean_token_accuracy": 0.846926212310791, "num_tokens": 288040043.0, "step": 1808 }, { "epoch": 0.9201424211597151, "grad_norm": 1.0314844846725464, "learning_rate": 1e-05, "loss": 0.4862, "mean_token_accuracy": 0.847362220287323, "num_tokens": 288190884.0, "step": 1809 }, { "epoch": 0.9206510681586979, "grad_norm": 1.0677003860473633, "learning_rate": 1e-05, "loss": 0.5356, "mean_token_accuracy": 0.832841157913208, "num_tokens": 288352383.0, "step": 1810 }, { "epoch": 0.9211597151576806, "grad_norm": 0.9072279930114746, "learning_rate": 1e-05, "loss": 0.4695, "mean_token_accuracy": 0.8505159616470337, "num_tokens": 288513226.0, "step": 1811 }, { "epoch": 0.9216683621566633, "grad_norm": 1.0185366868972778, "learning_rate": 1e-05, "loss": 0.5218, "mean_token_accuracy": 0.836707353591919, "num_tokens": 288681122.0, "step": 1812 }, { "epoch": 0.9221770091556459, "grad_norm": 0.9919898509979248, "learning_rate": 1e-05, "loss": 0.5333, "mean_token_accuracy": 0.8353002071380615, "num_tokens": 288840955.0, "step": 1813 }, { "epoch": 0.9226856561546287, "grad_norm": 0.9551877975463867, "learning_rate": 1e-05, "loss": 0.5247, "mean_token_accuracy": 0.8349590301513672, "num_tokens": 289007588.0, "step": 1814 }, { "epoch": 0.9231943031536114, "grad_norm": 1.023133635520935, "learning_rate": 1e-05, "loss": 0.506, "mean_token_accuracy": 0.8420091867446899, "num_tokens": 289171963.0, "step": 1815 }, { "epoch": 0.9237029501525941, "grad_norm": 0.9281512498855591, "learning_rate": 1e-05, "loss": 0.4984, "mean_token_accuracy": 0.8434433937072754, "num_tokens": 289338174.0, "step": 1816 }, { "epoch": 0.9242115971515769, "grad_norm": 0.9637218117713928, "learning_rate": 1e-05, "loss": 0.4985, "mean_token_accuracy": 0.8432784080505371, "num_tokens": 289498369.0, "step": 1817 }, { "epoch": 0.9247202441505595, "grad_norm": 1.0020900964736938, "learning_rate": 1e-05, "loss": 0.4979, "mean_token_accuracy": 0.8426657915115356, "num_tokens": 289657315.0, "step": 1818 }, { "epoch": 0.9252288911495422, "grad_norm": 0.9804989695549011, "learning_rate": 1e-05, "loss": 0.494, "mean_token_accuracy": 0.8450539708137512, "num_tokens": 289824552.0, "step": 1819 }, { "epoch": 0.9257375381485249, "grad_norm": 0.9363784790039062, "learning_rate": 1e-05, "loss": 0.511, "mean_token_accuracy": 0.8384186029434204, "num_tokens": 289982984.0, "step": 1820 }, { "epoch": 0.9262461851475077, "grad_norm": 0.9209647178649902, "learning_rate": 1e-05, "loss": 0.4967, "mean_token_accuracy": 0.8442416787147522, "num_tokens": 290127147.0, "step": 1821 }, { "epoch": 0.9267548321464903, "grad_norm": 0.9212963581085205, "learning_rate": 1e-05, "loss": 0.5008, "mean_token_accuracy": 0.8438435792922974, "num_tokens": 290298649.0, "step": 1822 }, { "epoch": 0.927263479145473, "grad_norm": 0.9627429842948914, "learning_rate": 1e-05, "loss": 0.5277, "mean_token_accuracy": 0.8364180326461792, "num_tokens": 290460355.0, "step": 1823 }, { "epoch": 0.9277721261444557, "grad_norm": 0.890675961971283, "learning_rate": 1e-05, "loss": 0.4794, "mean_token_accuracy": 0.8487001657485962, "num_tokens": 290617255.0, "step": 1824 }, { "epoch": 0.9282807731434385, "grad_norm": 1.059567928314209, "learning_rate": 1e-05, "loss": 0.5145, "mean_token_accuracy": 0.8380982875823975, "num_tokens": 290773839.0, "step": 1825 }, { "epoch": 0.9287894201424212, "grad_norm": 0.9610816836357117, "learning_rate": 1e-05, "loss": 0.4843, "mean_token_accuracy": 0.8474520444869995, "num_tokens": 290937706.0, "step": 1826 }, { "epoch": 0.9292980671414038, "grad_norm": 0.9560855627059937, "learning_rate": 1e-05, "loss": 0.5022, "mean_token_accuracy": 0.8428947329521179, "num_tokens": 291095257.0, "step": 1827 }, { "epoch": 0.9298067141403866, "grad_norm": 1.0478545427322388, "learning_rate": 1e-05, "loss": 0.5242, "mean_token_accuracy": 0.8372861742973328, "num_tokens": 291248241.0, "step": 1828 }, { "epoch": 0.9303153611393693, "grad_norm": 0.9691060185432434, "learning_rate": 1e-05, "loss": 0.539, "mean_token_accuracy": 0.8321580290794373, "num_tokens": 291408732.0, "step": 1829 }, { "epoch": 0.930824008138352, "grad_norm": 0.9107898473739624, "learning_rate": 1e-05, "loss": 0.4966, "mean_token_accuracy": 0.8426162600517273, "num_tokens": 291586275.0, "step": 1830 }, { "epoch": 0.9313326551373347, "grad_norm": 0.9577358365058899, "learning_rate": 1e-05, "loss": 0.4676, "mean_token_accuracy": 0.8520687818527222, "num_tokens": 291748574.0, "step": 1831 }, { "epoch": 0.9318413021363174, "grad_norm": 1.018463134765625, "learning_rate": 1e-05, "loss": 0.5156, "mean_token_accuracy": 0.8389250040054321, "num_tokens": 291904561.0, "step": 1832 }, { "epoch": 0.9323499491353001, "grad_norm": 1.0342974662780762, "learning_rate": 1e-05, "loss": 0.5027, "mean_token_accuracy": 0.8422161340713501, "num_tokens": 292063897.0, "step": 1833 }, { "epoch": 0.9328585961342828, "grad_norm": 0.9696241021156311, "learning_rate": 1e-05, "loss": 0.5125, "mean_token_accuracy": 0.839712381362915, "num_tokens": 292214056.0, "step": 1834 }, { "epoch": 0.9333672431332655, "grad_norm": 1.0362638235092163, "learning_rate": 1e-05, "loss": 0.5214, "mean_token_accuracy": 0.8358685970306396, "num_tokens": 292349124.0, "step": 1835 }, { "epoch": 0.9338758901322483, "grad_norm": 0.9768627882003784, "learning_rate": 1e-05, "loss": 0.4951, "mean_token_accuracy": 0.8442312479019165, "num_tokens": 292505385.0, "step": 1836 }, { "epoch": 0.9343845371312309, "grad_norm": 0.9081904292106628, "learning_rate": 1e-05, "loss": 0.5137, "mean_token_accuracy": 0.8416123390197754, "num_tokens": 292665526.0, "step": 1837 }, { "epoch": 0.9348931841302136, "grad_norm": 0.9557290077209473, "learning_rate": 1e-05, "loss": 0.5307, "mean_token_accuracy": 0.8357499241828918, "num_tokens": 292829795.0, "step": 1838 }, { "epoch": 0.9354018311291964, "grad_norm": 1.004783034324646, "learning_rate": 1e-05, "loss": 0.4956, "mean_token_accuracy": 0.8466652035713196, "num_tokens": 292987496.0, "step": 1839 }, { "epoch": 0.9359104781281791, "grad_norm": 0.9559733271598816, "learning_rate": 1e-05, "loss": 0.4946, "mean_token_accuracy": 0.8437668681144714, "num_tokens": 293144553.0, "step": 1840 }, { "epoch": 0.9364191251271617, "grad_norm": 0.9844328165054321, "learning_rate": 1e-05, "loss": 0.5388, "mean_token_accuracy": 0.8340772390365601, "num_tokens": 293302965.0, "step": 1841 }, { "epoch": 0.9369277721261444, "grad_norm": 0.9541322588920593, "learning_rate": 1e-05, "loss": 0.4913, "mean_token_accuracy": 0.8453675508499146, "num_tokens": 293463953.0, "step": 1842 }, { "epoch": 0.9374364191251272, "grad_norm": 0.9606726169586182, "learning_rate": 1e-05, "loss": 0.4968, "mean_token_accuracy": 0.8437399864196777, "num_tokens": 293629039.0, "step": 1843 }, { "epoch": 0.9379450661241099, "grad_norm": 0.9770108461380005, "learning_rate": 1e-05, "loss": 0.4542, "mean_token_accuracy": 0.8554556369781494, "num_tokens": 293773864.0, "step": 1844 }, { "epoch": 0.9384537131230926, "grad_norm": 0.980654776096344, "learning_rate": 1e-05, "loss": 0.5014, "mean_token_accuracy": 0.8446040153503418, "num_tokens": 293937000.0, "step": 1845 }, { "epoch": 0.9389623601220752, "grad_norm": 0.9239630103111267, "learning_rate": 1e-05, "loss": 0.478, "mean_token_accuracy": 0.8489571213722229, "num_tokens": 294103688.0, "step": 1846 }, { "epoch": 0.939471007121058, "grad_norm": 0.8894424438476562, "learning_rate": 1e-05, "loss": 0.4892, "mean_token_accuracy": 0.8482279181480408, "num_tokens": 294266886.0, "step": 1847 }, { "epoch": 0.9399796541200407, "grad_norm": 0.985583484172821, "learning_rate": 1e-05, "loss": 0.4722, "mean_token_accuracy": 0.8499586582183838, "num_tokens": 294425536.0, "step": 1848 }, { "epoch": 0.9404883011190234, "grad_norm": 0.9692992568016052, "learning_rate": 1e-05, "loss": 0.5176, "mean_token_accuracy": 0.8396903276443481, "num_tokens": 294590856.0, "step": 1849 }, { "epoch": 0.940996948118006, "grad_norm": 0.994042158126831, "learning_rate": 1e-05, "loss": 0.5046, "mean_token_accuracy": 0.8418453931808472, "num_tokens": 294743889.0, "step": 1850 }, { "epoch": 0.9415055951169888, "grad_norm": 0.9837060570716858, "learning_rate": 1e-05, "loss": 0.4981, "mean_token_accuracy": 0.8423642516136169, "num_tokens": 294899666.0, "step": 1851 }, { "epoch": 0.9420142421159715, "grad_norm": 0.9281747341156006, "learning_rate": 1e-05, "loss": 0.5526, "mean_token_accuracy": 0.8312216997146606, "num_tokens": 295078964.0, "step": 1852 }, { "epoch": 0.9425228891149542, "grad_norm": 0.9872815608978271, "learning_rate": 1e-05, "loss": 0.4768, "mean_token_accuracy": 0.8486718535423279, "num_tokens": 295221135.0, "step": 1853 }, { "epoch": 0.943031536113937, "grad_norm": 0.9974026679992676, "learning_rate": 1e-05, "loss": 0.4907, "mean_token_accuracy": 0.8463758230209351, "num_tokens": 295382594.0, "step": 1854 }, { "epoch": 0.9435401831129197, "grad_norm": 0.9670667052268982, "learning_rate": 1e-05, "loss": 0.5271, "mean_token_accuracy": 0.8365948796272278, "num_tokens": 295541772.0, "step": 1855 }, { "epoch": 0.9440488301119023, "grad_norm": 0.97458815574646, "learning_rate": 1e-05, "loss": 0.4863, "mean_token_accuracy": 0.8459551334381104, "num_tokens": 295694723.0, "step": 1856 }, { "epoch": 0.944557477110885, "grad_norm": 0.9388545155525208, "learning_rate": 1e-05, "loss": 0.4818, "mean_token_accuracy": 0.8488112688064575, "num_tokens": 295848132.0, "step": 1857 }, { "epoch": 0.9450661241098678, "grad_norm": 1.0486011505126953, "learning_rate": 1e-05, "loss": 0.4996, "mean_token_accuracy": 0.8439739942550659, "num_tokens": 295998034.0, "step": 1858 }, { "epoch": 0.9455747711088505, "grad_norm": 0.9702494740486145, "learning_rate": 1e-05, "loss": 0.5149, "mean_token_accuracy": 0.8386168479919434, "num_tokens": 296142665.0, "step": 1859 }, { "epoch": 0.9460834181078331, "grad_norm": 0.9410427212715149, "learning_rate": 1e-05, "loss": 0.4939, "mean_token_accuracy": 0.8444942235946655, "num_tokens": 296290081.0, "step": 1860 }, { "epoch": 0.9465920651068158, "grad_norm": 0.9935798645019531, "learning_rate": 1e-05, "loss": 0.5125, "mean_token_accuracy": 0.8418990969657898, "num_tokens": 296439741.0, "step": 1861 }, { "epoch": 0.9471007121057986, "grad_norm": 0.9800233244895935, "learning_rate": 1e-05, "loss": 0.4851, "mean_token_accuracy": 0.8472626209259033, "num_tokens": 296598043.0, "step": 1862 }, { "epoch": 0.9476093591047813, "grad_norm": 0.9883062839508057, "learning_rate": 1e-05, "loss": 0.4907, "mean_token_accuracy": 0.8471593260765076, "num_tokens": 296748422.0, "step": 1863 }, { "epoch": 0.948118006103764, "grad_norm": 0.9847372174263, "learning_rate": 1e-05, "loss": 0.5052, "mean_token_accuracy": 0.8409173488616943, "num_tokens": 296907598.0, "step": 1864 }, { "epoch": 0.9486266531027467, "grad_norm": 1.044905662536621, "learning_rate": 1e-05, "loss": 0.5184, "mean_token_accuracy": 0.8379148244857788, "num_tokens": 297068522.0, "step": 1865 }, { "epoch": 0.9491353001017294, "grad_norm": 0.9221831560134888, "learning_rate": 1e-05, "loss": 0.4885, "mean_token_accuracy": 0.8469454646110535, "num_tokens": 297229538.0, "step": 1866 }, { "epoch": 0.9496439471007121, "grad_norm": 1.0027371644973755, "learning_rate": 1e-05, "loss": 0.5063, "mean_token_accuracy": 0.8409550189971924, "num_tokens": 297390837.0, "step": 1867 }, { "epoch": 0.9501525940996948, "grad_norm": 1.0401782989501953, "learning_rate": 1e-05, "loss": 0.5495, "mean_token_accuracy": 0.8319090604782104, "num_tokens": 297545849.0, "step": 1868 }, { "epoch": 0.9506612410986776, "grad_norm": 1.0064892768859863, "learning_rate": 1e-05, "loss": 0.494, "mean_token_accuracy": 0.8429477214813232, "num_tokens": 297702474.0, "step": 1869 }, { "epoch": 0.9511698880976602, "grad_norm": 1.0638560056686401, "learning_rate": 1e-05, "loss": 0.5411, "mean_token_accuracy": 0.8319836854934692, "num_tokens": 297857385.0, "step": 1870 }, { "epoch": 0.9516785350966429, "grad_norm": 1.0173994302749634, "learning_rate": 1e-05, "loss": 0.4934, "mean_token_accuracy": 0.8425914645195007, "num_tokens": 298007755.0, "step": 1871 }, { "epoch": 0.9521871820956256, "grad_norm": 1.0122215747833252, "learning_rate": 1e-05, "loss": 0.5091, "mean_token_accuracy": 0.8413197994232178, "num_tokens": 298178437.0, "step": 1872 }, { "epoch": 0.9526958290946084, "grad_norm": 1.0996077060699463, "learning_rate": 1e-05, "loss": 0.5105, "mean_token_accuracy": 0.8416764736175537, "num_tokens": 298335117.0, "step": 1873 }, { "epoch": 0.953204476093591, "grad_norm": 1.0579203367233276, "learning_rate": 1e-05, "loss": 0.4829, "mean_token_accuracy": 0.8460375666618347, "num_tokens": 298502544.0, "step": 1874 }, { "epoch": 0.9537131230925737, "grad_norm": 1.020734429359436, "learning_rate": 1e-05, "loss": 0.5059, "mean_token_accuracy": 0.8390011787414551, "num_tokens": 298660517.0, "step": 1875 }, { "epoch": 0.9542217700915565, "grad_norm": 0.9793227314949036, "learning_rate": 1e-05, "loss": 0.52, "mean_token_accuracy": 0.8381893038749695, "num_tokens": 298813928.0, "step": 1876 }, { "epoch": 0.9547304170905392, "grad_norm": 0.9196975827217102, "learning_rate": 1e-05, "loss": 0.4868, "mean_token_accuracy": 0.845372200012207, "num_tokens": 298982892.0, "step": 1877 }, { "epoch": 0.9552390640895219, "grad_norm": 0.9534643888473511, "learning_rate": 1e-05, "loss": 0.4918, "mean_token_accuracy": 0.8440676927566528, "num_tokens": 299145054.0, "step": 1878 }, { "epoch": 0.9557477110885045, "grad_norm": 0.924911379814148, "learning_rate": 1e-05, "loss": 0.4938, "mean_token_accuracy": 0.8451396226882935, "num_tokens": 299310640.0, "step": 1879 }, { "epoch": 0.9562563580874873, "grad_norm": 0.9954731464385986, "learning_rate": 1e-05, "loss": 0.5094, "mean_token_accuracy": 0.8396790027618408, "num_tokens": 299484449.0, "step": 1880 }, { "epoch": 0.95676500508647, "grad_norm": 1.0057673454284668, "learning_rate": 1e-05, "loss": 0.5218, "mean_token_accuracy": 0.8367470502853394, "num_tokens": 299657556.0, "step": 1881 }, { "epoch": 0.9572736520854527, "grad_norm": 0.9887730479240417, "learning_rate": 1e-05, "loss": 0.5136, "mean_token_accuracy": 0.8388510942459106, "num_tokens": 299821866.0, "step": 1882 }, { "epoch": 0.9577822990844354, "grad_norm": 1.045063853263855, "learning_rate": 1e-05, "loss": 0.5199, "mean_token_accuracy": 0.8389185070991516, "num_tokens": 299975575.0, "step": 1883 }, { "epoch": 0.9582909460834181, "grad_norm": 1.1116621494293213, "learning_rate": 1e-05, "loss": 0.5085, "mean_token_accuracy": 0.8415688276290894, "num_tokens": 300122149.0, "step": 1884 }, { "epoch": 0.9587995930824008, "grad_norm": 0.9680479168891907, "learning_rate": 1e-05, "loss": 0.5193, "mean_token_accuracy": 0.8371805548667908, "num_tokens": 300275629.0, "step": 1885 }, { "epoch": 0.9593082400813835, "grad_norm": 0.9897911548614502, "learning_rate": 1e-05, "loss": 0.5094, "mean_token_accuracy": 0.8400824069976807, "num_tokens": 300434761.0, "step": 1886 }, { "epoch": 0.9598168870803663, "grad_norm": 0.9866281747817993, "learning_rate": 1e-05, "loss": 0.5078, "mean_token_accuracy": 0.8412807583808899, "num_tokens": 300595753.0, "step": 1887 }, { "epoch": 0.960325534079349, "grad_norm": 0.9881137013435364, "learning_rate": 1e-05, "loss": 0.5187, "mean_token_accuracy": 0.8375228047370911, "num_tokens": 300761476.0, "step": 1888 }, { "epoch": 0.9608341810783316, "grad_norm": 1.011414885520935, "learning_rate": 1e-05, "loss": 0.474, "mean_token_accuracy": 0.8508404493331909, "num_tokens": 300907255.0, "step": 1889 }, { "epoch": 0.9613428280773143, "grad_norm": 0.9575902819633484, "learning_rate": 1e-05, "loss": 0.5026, "mean_token_accuracy": 0.8420031666755676, "num_tokens": 301078714.0, "step": 1890 }, { "epoch": 0.9618514750762971, "grad_norm": 0.9903496503829956, "learning_rate": 1e-05, "loss": 0.5071, "mean_token_accuracy": 0.8426606059074402, "num_tokens": 301235054.0, "step": 1891 }, { "epoch": 0.9623601220752798, "grad_norm": 0.9883329272270203, "learning_rate": 1e-05, "loss": 0.5086, "mean_token_accuracy": 0.8412028551101685, "num_tokens": 301390887.0, "step": 1892 }, { "epoch": 0.9628687690742624, "grad_norm": 0.9743595123291016, "learning_rate": 1e-05, "loss": 0.5201, "mean_token_accuracy": 0.8380074501037598, "num_tokens": 301553051.0, "step": 1893 }, { "epoch": 0.9633774160732451, "grad_norm": 0.9916619658470154, "learning_rate": 1e-05, "loss": 0.4949, "mean_token_accuracy": 0.8453688621520996, "num_tokens": 301709240.0, "step": 1894 }, { "epoch": 0.9638860630722279, "grad_norm": 0.9561258554458618, "learning_rate": 1e-05, "loss": 0.5097, "mean_token_accuracy": 0.8401749134063721, "num_tokens": 301873555.0, "step": 1895 }, { "epoch": 0.9643947100712106, "grad_norm": 0.9598016738891602, "learning_rate": 1e-05, "loss": 0.5135, "mean_token_accuracy": 0.8393949270248413, "num_tokens": 302020968.0, "step": 1896 }, { "epoch": 0.9649033570701933, "grad_norm": 0.9727898240089417, "learning_rate": 1e-05, "loss": 0.5428, "mean_token_accuracy": 0.8316559791564941, "num_tokens": 302178645.0, "step": 1897 }, { "epoch": 0.965412004069176, "grad_norm": 0.9923699498176575, "learning_rate": 1e-05, "loss": 0.5475, "mean_token_accuracy": 0.8295910358428955, "num_tokens": 302349196.0, "step": 1898 }, { "epoch": 0.9659206510681587, "grad_norm": 0.9970629811286926, "learning_rate": 1e-05, "loss": 0.5224, "mean_token_accuracy": 0.8365216851234436, "num_tokens": 302506292.0, "step": 1899 }, { "epoch": 0.9664292980671414, "grad_norm": 1.000089406967163, "learning_rate": 1e-05, "loss": 0.5264, "mean_token_accuracy": 0.835106372833252, "num_tokens": 302661848.0, "step": 1900 }, { "epoch": 0.9669379450661241, "grad_norm": 0.961031973361969, "learning_rate": 1e-05, "loss": 0.5419, "mean_token_accuracy": 0.8303220272064209, "num_tokens": 302822413.0, "step": 1901 }, { "epoch": 0.9674465920651069, "grad_norm": 0.9586144089698792, "learning_rate": 1e-05, "loss": 0.493, "mean_token_accuracy": 0.8451566696166992, "num_tokens": 302985335.0, "step": 1902 }, { "epoch": 0.9679552390640895, "grad_norm": 1.0209102630615234, "learning_rate": 1e-05, "loss": 0.4917, "mean_token_accuracy": 0.8447892069816589, "num_tokens": 303149395.0, "step": 1903 }, { "epoch": 0.9684638860630722, "grad_norm": 0.9929437041282654, "learning_rate": 1e-05, "loss": 0.4818, "mean_token_accuracy": 0.8488729596138, "num_tokens": 303312885.0, "step": 1904 }, { "epoch": 0.9689725330620549, "grad_norm": 0.9224243760108948, "learning_rate": 1e-05, "loss": 0.5192, "mean_token_accuracy": 0.8377442955970764, "num_tokens": 303478385.0, "step": 1905 }, { "epoch": 0.9694811800610377, "grad_norm": 1.0138916969299316, "learning_rate": 1e-05, "loss": 0.5091, "mean_token_accuracy": 0.8416192531585693, "num_tokens": 303632397.0, "step": 1906 }, { "epoch": 0.9699898270600203, "grad_norm": 0.973189651966095, "learning_rate": 1e-05, "loss": 0.5104, "mean_token_accuracy": 0.8388887643814087, "num_tokens": 303793145.0, "step": 1907 }, { "epoch": 0.970498474059003, "grad_norm": 0.9834722280502319, "learning_rate": 1e-05, "loss": 0.5302, "mean_token_accuracy": 0.8342853784561157, "num_tokens": 303954402.0, "step": 1908 }, { "epoch": 0.9710071210579858, "grad_norm": 0.998568058013916, "learning_rate": 1e-05, "loss": 0.5134, "mean_token_accuracy": 0.8387386202812195, "num_tokens": 304115485.0, "step": 1909 }, { "epoch": 0.9715157680569685, "grad_norm": 0.9369135499000549, "learning_rate": 1e-05, "loss": 0.5122, "mean_token_accuracy": 0.8402111530303955, "num_tokens": 304285608.0, "step": 1910 }, { "epoch": 0.9720244150559512, "grad_norm": 0.9555466771125793, "learning_rate": 1e-05, "loss": 0.4608, "mean_token_accuracy": 0.8527745008468628, "num_tokens": 304444429.0, "step": 1911 }, { "epoch": 0.9725330620549338, "grad_norm": 1.0225001573562622, "learning_rate": 1e-05, "loss": 0.4925, "mean_token_accuracy": 0.8453694581985474, "num_tokens": 304604968.0, "step": 1912 }, { "epoch": 0.9730417090539166, "grad_norm": 1.0820183753967285, "learning_rate": 1e-05, "loss": 0.4846, "mean_token_accuracy": 0.8482344746589661, "num_tokens": 304762895.0, "step": 1913 }, { "epoch": 0.9735503560528993, "grad_norm": 0.9791913628578186, "learning_rate": 1e-05, "loss": 0.5053, "mean_token_accuracy": 0.8402152061462402, "num_tokens": 304919739.0, "step": 1914 }, { "epoch": 0.974059003051882, "grad_norm": 0.9272238612174988, "learning_rate": 1e-05, "loss": 0.518, "mean_token_accuracy": 0.8391333818435669, "num_tokens": 305084650.0, "step": 1915 }, { "epoch": 0.9745676500508647, "grad_norm": 0.9650485515594482, "learning_rate": 1e-05, "loss": 0.4985, "mean_token_accuracy": 0.8421435952186584, "num_tokens": 305245025.0, "step": 1916 }, { "epoch": 0.9750762970498474, "grad_norm": 1.0105266571044922, "learning_rate": 1e-05, "loss": 0.4951, "mean_token_accuracy": 0.8445926308631897, "num_tokens": 305403475.0, "step": 1917 }, { "epoch": 0.9755849440488301, "grad_norm": 0.9218766093254089, "learning_rate": 1e-05, "loss": 0.51, "mean_token_accuracy": 0.8400081396102905, "num_tokens": 305554045.0, "step": 1918 }, { "epoch": 0.9760935910478128, "grad_norm": 0.9602965116500854, "learning_rate": 1e-05, "loss": 0.4853, "mean_token_accuracy": 0.8474862575531006, "num_tokens": 305712579.0, "step": 1919 }, { "epoch": 0.9766022380467956, "grad_norm": 0.9874665141105652, "learning_rate": 1e-05, "loss": 0.4895, "mean_token_accuracy": 0.8474003076553345, "num_tokens": 305880937.0, "step": 1920 }, { "epoch": 0.9771108850457783, "grad_norm": 0.9894477725028992, "learning_rate": 1e-05, "loss": 0.4939, "mean_token_accuracy": 0.8452675342559814, "num_tokens": 306037487.0, "step": 1921 }, { "epoch": 0.9776195320447609, "grad_norm": 0.9343340396881104, "learning_rate": 1e-05, "loss": 0.5048, "mean_token_accuracy": 0.842698335647583, "num_tokens": 306205585.0, "step": 1922 }, { "epoch": 0.9781281790437436, "grad_norm": 1.034570336341858, "learning_rate": 1e-05, "loss": 0.5167, "mean_token_accuracy": 0.8392354249954224, "num_tokens": 306363764.0, "step": 1923 }, { "epoch": 0.9786368260427264, "grad_norm": 0.9801474213600159, "learning_rate": 1e-05, "loss": 0.496, "mean_token_accuracy": 0.8447216749191284, "num_tokens": 306521564.0, "step": 1924 }, { "epoch": 0.9791454730417091, "grad_norm": 1.0061413049697876, "learning_rate": 1e-05, "loss": 0.4963, "mean_token_accuracy": 0.8442873954772949, "num_tokens": 306674429.0, "step": 1925 }, { "epoch": 0.9796541200406917, "grad_norm": 0.9756056070327759, "learning_rate": 1e-05, "loss": 0.5349, "mean_token_accuracy": 0.8323445320129395, "num_tokens": 306832682.0, "step": 1926 }, { "epoch": 0.9801627670396744, "grad_norm": 1.1082823276519775, "learning_rate": 1e-05, "loss": 0.4761, "mean_token_accuracy": 0.8504911065101624, "num_tokens": 306990015.0, "step": 1927 }, { "epoch": 0.9806714140386572, "grad_norm": 0.9246140122413635, "learning_rate": 1e-05, "loss": 0.4983, "mean_token_accuracy": 0.8428771495819092, "num_tokens": 307155638.0, "step": 1928 }, { "epoch": 0.9811800610376399, "grad_norm": 1.0045851469039917, "learning_rate": 1e-05, "loss": 0.4776, "mean_token_accuracy": 0.8490939140319824, "num_tokens": 307317052.0, "step": 1929 }, { "epoch": 0.9816887080366226, "grad_norm": 0.9832070469856262, "learning_rate": 1e-05, "loss": 0.5067, "mean_token_accuracy": 0.8416174054145813, "num_tokens": 307461975.0, "step": 1930 }, { "epoch": 0.9821973550356052, "grad_norm": 1.0693434476852417, "learning_rate": 1e-05, "loss": 0.5424, "mean_token_accuracy": 0.8304628729820251, "num_tokens": 307618995.0, "step": 1931 }, { "epoch": 0.982706002034588, "grad_norm": 0.981465220451355, "learning_rate": 1e-05, "loss": 0.4994, "mean_token_accuracy": 0.8437796235084534, "num_tokens": 307787106.0, "step": 1932 }, { "epoch": 0.9832146490335707, "grad_norm": 0.91338050365448, "learning_rate": 1e-05, "loss": 0.5084, "mean_token_accuracy": 0.841780960559845, "num_tokens": 307957991.0, "step": 1933 }, { "epoch": 0.9837232960325534, "grad_norm": 1.0736961364746094, "learning_rate": 1e-05, "loss": 0.5252, "mean_token_accuracy": 0.8364447355270386, "num_tokens": 308114670.0, "step": 1934 }, { "epoch": 0.9842319430315362, "grad_norm": 0.9931672811508179, "learning_rate": 1e-05, "loss": 0.5294, "mean_token_accuracy": 0.834403395652771, "num_tokens": 308273482.0, "step": 1935 }, { "epoch": 0.9847405900305188, "grad_norm": 0.8900210857391357, "learning_rate": 1e-05, "loss": 0.5053, "mean_token_accuracy": 0.8416975140571594, "num_tokens": 308438717.0, "step": 1936 }, { "epoch": 0.9852492370295015, "grad_norm": 0.9809801578521729, "learning_rate": 1e-05, "loss": 0.4656, "mean_token_accuracy": 0.8523440957069397, "num_tokens": 308606239.0, "step": 1937 }, { "epoch": 0.9857578840284842, "grad_norm": 0.9752235412597656, "learning_rate": 1e-05, "loss": 0.508, "mean_token_accuracy": 0.8410700559616089, "num_tokens": 308770389.0, "step": 1938 }, { "epoch": 0.986266531027467, "grad_norm": 1.01345956325531, "learning_rate": 1e-05, "loss": 0.5072, "mean_token_accuracy": 0.841428279876709, "num_tokens": 308938986.0, "step": 1939 }, { "epoch": 0.9867751780264497, "grad_norm": 0.8987012505531311, "learning_rate": 1e-05, "loss": 0.4844, "mean_token_accuracy": 0.8478468656539917, "num_tokens": 309098467.0, "step": 1940 }, { "epoch": 0.9872838250254323, "grad_norm": 1.0079503059387207, "learning_rate": 1e-05, "loss": 0.5107, "mean_token_accuracy": 0.8383113145828247, "num_tokens": 309271767.0, "step": 1941 }, { "epoch": 0.987792472024415, "grad_norm": 0.9667823314666748, "learning_rate": 1e-05, "loss": 0.5268, "mean_token_accuracy": 0.8350872993469238, "num_tokens": 309430767.0, "step": 1942 }, { "epoch": 0.9883011190233978, "grad_norm": 0.9856606125831604, "learning_rate": 1e-05, "loss": 0.5076, "mean_token_accuracy": 0.8395977020263672, "num_tokens": 309588008.0, "step": 1943 }, { "epoch": 0.9888097660223805, "grad_norm": 0.9772642254829407, "learning_rate": 1e-05, "loss": 0.5115, "mean_token_accuracy": 0.8399066925048828, "num_tokens": 309743337.0, "step": 1944 }, { "epoch": 0.9893184130213631, "grad_norm": 1.0005584955215454, "learning_rate": 1e-05, "loss": 0.5132, "mean_token_accuracy": 0.8383961319923401, "num_tokens": 309895383.0, "step": 1945 }, { "epoch": 0.9898270600203459, "grad_norm": 1.0755404233932495, "learning_rate": 1e-05, "loss": 0.5298, "mean_token_accuracy": 0.8351097106933594, "num_tokens": 310051593.0, "step": 1946 }, { "epoch": 0.9903357070193286, "grad_norm": 1.0297179222106934, "learning_rate": 1e-05, "loss": 0.5187, "mean_token_accuracy": 0.8374543786048889, "num_tokens": 310222144.0, "step": 1947 }, { "epoch": 0.9908443540183113, "grad_norm": 1.331903100013733, "learning_rate": 1e-05, "loss": 0.481, "mean_token_accuracy": 0.8481841087341309, "num_tokens": 310376369.0, "step": 1948 }, { "epoch": 0.991353001017294, "grad_norm": 0.9783390164375305, "learning_rate": 1e-05, "loss": 0.5276, "mean_token_accuracy": 0.8363245725631714, "num_tokens": 310540773.0, "step": 1949 }, { "epoch": 0.9918616480162767, "grad_norm": 1.0117331743240356, "learning_rate": 1e-05, "loss": 0.5133, "mean_token_accuracy": 0.8386774063110352, "num_tokens": 310705795.0, "step": 1950 }, { "epoch": 0.9923702950152594, "grad_norm": 0.8965991735458374, "learning_rate": 1e-05, "loss": 0.4924, "mean_token_accuracy": 0.8455886244773865, "num_tokens": 310870435.0, "step": 1951 }, { "epoch": 0.9928789420142421, "grad_norm": 0.979218065738678, "learning_rate": 1e-05, "loss": 0.5277, "mean_token_accuracy": 0.8348729610443115, "num_tokens": 311023220.0, "step": 1952 }, { "epoch": 0.9933875890132248, "grad_norm": 0.9134143590927124, "learning_rate": 1e-05, "loss": 0.5089, "mean_token_accuracy": 0.8415424823760986, "num_tokens": 311192559.0, "step": 1953 }, { "epoch": 0.9938962360122076, "grad_norm": 1.0180108547210693, "learning_rate": 1e-05, "loss": 0.5309, "mean_token_accuracy": 0.8341297507286072, "num_tokens": 311361294.0, "step": 1954 }, { "epoch": 0.9944048830111902, "grad_norm": 0.993411660194397, "learning_rate": 1e-05, "loss": 0.5372, "mean_token_accuracy": 0.8315683603286743, "num_tokens": 311513624.0, "step": 1955 }, { "epoch": 0.9949135300101729, "grad_norm": 1.0283094644546509, "learning_rate": 1e-05, "loss": 0.548, "mean_token_accuracy": 0.8294710516929626, "num_tokens": 311682933.0, "step": 1956 }, { "epoch": 0.9954221770091557, "grad_norm": 0.9628696441650391, "learning_rate": 1e-05, "loss": 0.5232, "mean_token_accuracy": 0.8364568948745728, "num_tokens": 311836701.0, "step": 1957 }, { "epoch": 0.9959308240081384, "grad_norm": 0.9204580783843994, "learning_rate": 1e-05, "loss": 0.4931, "mean_token_accuracy": 0.846779465675354, "num_tokens": 311997757.0, "step": 1958 }, { "epoch": 0.996439471007121, "grad_norm": 0.9796516299247742, "learning_rate": 1e-05, "loss": 0.5053, "mean_token_accuracy": 0.8413169384002686, "num_tokens": 312141716.0, "step": 1959 }, { "epoch": 0.9969481180061037, "grad_norm": 0.9200966954231262, "learning_rate": 1e-05, "loss": 0.5064, "mean_token_accuracy": 0.8408148288726807, "num_tokens": 312300232.0, "step": 1960 }, { "epoch": 0.9974567650050865, "grad_norm": 0.9429312348365784, "learning_rate": 1e-05, "loss": 0.5184, "mean_token_accuracy": 0.8386069536209106, "num_tokens": 312460325.0, "step": 1961 }, { "epoch": 0.9979654120040692, "grad_norm": 0.9559454917907715, "learning_rate": 1e-05, "loss": 0.4942, "mean_token_accuracy": 0.8460768461227417, "num_tokens": 312617272.0, "step": 1962 }, { "epoch": 0.9984740590030519, "grad_norm": 0.8990178108215332, "learning_rate": 1e-05, "loss": 0.4999, "mean_token_accuracy": 0.8424972891807556, "num_tokens": 312782970.0, "step": 1963 }, { "epoch": 0.9989827060020345, "grad_norm": 0.9896379709243774, "learning_rate": 1e-05, "loss": 0.5175, "mean_token_accuracy": 0.839756429195404, "num_tokens": 312940535.0, "step": 1964 }, { "epoch": 0.9994913530010173, "grad_norm": 0.9886254072189331, "learning_rate": 1e-05, "loss": 0.4982, "mean_token_accuracy": 0.8438684940338135, "num_tokens": 313094817.0, "step": 1965 }, { "epoch": 1.0, "grad_norm": 0.9970649480819702, "learning_rate": 1e-05, "loss": 0.5205, "mean_token_accuracy": 0.836739718914032, "num_tokens": 313255150.0, "step": 1966 }, { "epoch": 1.0, "step": 1966, "total_flos": 2.0963795699960381e+18, "train_loss": 0.560804831153985, "train_runtime": 2871.9626, "train_samples_per_second": 43.792, "train_steps_per_second": 0.685 } ], "logging_steps": 1, "max_steps": 1966, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 983, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0963795699960381e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }