{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 630, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004761904761904762, "grad_norm": 8.288029670715332, "learning_rate": 0.0, "loss": 1.7656, "mean_token_accuracy": 0.5768666863441467, "num_tokens": 582781.0, "step": 1 }, { "epoch": 0.009523809523809525, "grad_norm": 8.260492324829102, "learning_rate": 1.5873015873015874e-07, "loss": 1.7728, "mean_token_accuracy": 0.5752322673797607, "num_tokens": 1163696.0, "step": 2 }, { "epoch": 0.014285714285714285, "grad_norm": 8.188252449035645, "learning_rate": 3.174603174603175e-07, "loss": 1.776, "mean_token_accuracy": 0.5746057033538818, "num_tokens": 1762000.0, "step": 3 }, { "epoch": 0.01904761904761905, "grad_norm": 8.122298240661621, "learning_rate": 4.7619047619047623e-07, "loss": 1.7765, "mean_token_accuracy": 0.5741599798202515, "num_tokens": 2363228.0, "step": 4 }, { "epoch": 0.023809523809523808, "grad_norm": 7.91809606552124, "learning_rate": 6.34920634920635e-07, "loss": 1.7924, "mean_token_accuracy": 0.5723700523376465, "num_tokens": 2968748.0, "step": 5 }, { "epoch": 0.02857142857142857, "grad_norm": 7.924537181854248, "learning_rate": 7.936507936507937e-07, "loss": 1.7649, "mean_token_accuracy": 0.5754636526107788, "num_tokens": 3564062.0, "step": 6 }, { "epoch": 0.03333333333333333, "grad_norm": 7.629780292510986, "learning_rate": 9.523809523809525e-07, "loss": 1.7769, "mean_token_accuracy": 0.5719509124755859, "num_tokens": 4140352.0, "step": 7 }, { "epoch": 0.0380952380952381, "grad_norm": 7.133674621582031, "learning_rate": 1.111111111111111e-06, "loss": 1.7748, "mean_token_accuracy": 0.5719484090805054, "num_tokens": 4748067.0, "step": 8 }, { "epoch": 0.04285714285714286, "grad_norm": 6.150221347808838, "learning_rate": 1.26984126984127e-06, "loss": 1.7288, "mean_token_accuracy": 0.5776432156562805, "num_tokens": 5333791.0, "step": 9 }, { "epoch": 0.047619047619047616, "grad_norm": 6.026834964752197, "learning_rate": 1.4285714285714286e-06, "loss": 1.7401, "mean_token_accuracy": 0.5752577781677246, "num_tokens": 5923564.0, "step": 10 }, { "epoch": 0.05238095238095238, "grad_norm": 5.608363151550293, "learning_rate": 1.5873015873015873e-06, "loss": 1.7097, "mean_token_accuracy": 0.5797863602638245, "num_tokens": 6528559.0, "step": 11 }, { "epoch": 0.05714285714285714, "grad_norm": 4.234569072723389, "learning_rate": 1.746031746031746e-06, "loss": 1.6598, "mean_token_accuracy": 0.5850973725318909, "num_tokens": 7118765.0, "step": 12 }, { "epoch": 0.06190476190476191, "grad_norm": 4.145053386688232, "learning_rate": 1.904761904761905e-06, "loss": 1.6597, "mean_token_accuracy": 0.5842898488044739, "num_tokens": 7709226.0, "step": 13 }, { "epoch": 0.06666666666666667, "grad_norm": 3.9073646068573, "learning_rate": 2.0634920634920634e-06, "loss": 1.6303, "mean_token_accuracy": 0.5906457901000977, "num_tokens": 8298984.0, "step": 14 }, { "epoch": 0.07142857142857142, "grad_norm": 3.8127150535583496, "learning_rate": 2.222222222222222e-06, "loss": 1.6281, "mean_token_accuracy": 0.5896565914154053, "num_tokens": 8875624.0, "step": 15 }, { "epoch": 0.0761904761904762, "grad_norm": 3.0899341106414795, "learning_rate": 2.380952380952381e-06, "loss": 1.5687, "mean_token_accuracy": 0.5990549325942993, "num_tokens": 9448671.0, "step": 16 }, { "epoch": 0.08095238095238096, "grad_norm": 2.755232334136963, "learning_rate": 2.53968253968254e-06, "loss": 1.5548, "mean_token_accuracy": 0.6021129488945007, "num_tokens": 10049546.0, "step": 17 }, { "epoch": 0.08571428571428572, "grad_norm": 2.589613914489746, "learning_rate": 2.6984126984126986e-06, "loss": 1.5609, "mean_token_accuracy": 0.5993459820747375, "num_tokens": 10644905.0, "step": 18 }, { "epoch": 0.09047619047619047, "grad_norm": 2.2161478996276855, "learning_rate": 2.8571428571428573e-06, "loss": 1.5541, "mean_token_accuracy": 0.6018418073654175, "num_tokens": 11239583.0, "step": 19 }, { "epoch": 0.09523809523809523, "grad_norm": 1.9722470045089722, "learning_rate": 3.015873015873016e-06, "loss": 1.5295, "mean_token_accuracy": 0.6070071458816528, "num_tokens": 11827320.0, "step": 20 }, { "epoch": 0.1, "grad_norm": 1.8827704191207886, "learning_rate": 3.1746031746031746e-06, "loss": 1.4814, "mean_token_accuracy": 0.6151003837585449, "num_tokens": 12425511.0, "step": 21 }, { "epoch": 0.10476190476190476, "grad_norm": 2.351033926010132, "learning_rate": 3.3333333333333333e-06, "loss": 1.4865, "mean_token_accuracy": 0.6138286590576172, "num_tokens": 13015708.0, "step": 22 }, { "epoch": 0.10952380952380952, "grad_norm": 2.134150981903076, "learning_rate": 3.492063492063492e-06, "loss": 1.469, "mean_token_accuracy": 0.6165286302566528, "num_tokens": 13608875.0, "step": 23 }, { "epoch": 0.11428571428571428, "grad_norm": 1.9380258321762085, "learning_rate": 3.6507936507936507e-06, "loss": 1.476, "mean_token_accuracy": 0.6141604781150818, "num_tokens": 14204297.0, "step": 24 }, { "epoch": 0.11904761904761904, "grad_norm": 1.656062364578247, "learning_rate": 3.80952380952381e-06, "loss": 1.461, "mean_token_accuracy": 0.6166412830352783, "num_tokens": 14782206.0, "step": 25 }, { "epoch": 0.12380952380952381, "grad_norm": 1.3905470371246338, "learning_rate": 3.968253968253968e-06, "loss": 1.4382, "mean_token_accuracy": 0.6210923194885254, "num_tokens": 15377829.0, "step": 26 }, { "epoch": 0.12857142857142856, "grad_norm": 1.1439160108566284, "learning_rate": 4.126984126984127e-06, "loss": 1.4318, "mean_token_accuracy": 0.6224101781845093, "num_tokens": 15975819.0, "step": 27 }, { "epoch": 0.13333333333333333, "grad_norm": 1.0443707704544067, "learning_rate": 4.2857142857142855e-06, "loss": 1.4182, "mean_token_accuracy": 0.6251046061515808, "num_tokens": 16577839.0, "step": 28 }, { "epoch": 0.1380952380952381, "grad_norm": 1.0729820728302002, "learning_rate": 4.444444444444444e-06, "loss": 1.4116, "mean_token_accuracy": 0.6257858276367188, "num_tokens": 17164831.0, "step": 29 }, { "epoch": 0.14285714285714285, "grad_norm": 1.1262085437774658, "learning_rate": 4.603174603174604e-06, "loss": 1.3974, "mean_token_accuracy": 0.6290417909622192, "num_tokens": 17770476.0, "step": 30 }, { "epoch": 0.14761904761904762, "grad_norm": 1.1004436016082764, "learning_rate": 4.761904761904762e-06, "loss": 1.383, "mean_token_accuracy": 0.6305603981018066, "num_tokens": 18360862.0, "step": 31 }, { "epoch": 0.1523809523809524, "grad_norm": 0.9822593927383423, "learning_rate": 4.920634920634921e-06, "loss": 1.3981, "mean_token_accuracy": 0.6271172761917114, "num_tokens": 18944338.0, "step": 32 }, { "epoch": 0.15714285714285714, "grad_norm": 0.8572197556495667, "learning_rate": 5.07936507936508e-06, "loss": 1.3721, "mean_token_accuracy": 0.6327400207519531, "num_tokens": 19540189.0, "step": 33 }, { "epoch": 0.1619047619047619, "grad_norm": 0.9113824963569641, "learning_rate": 5.2380952380952384e-06, "loss": 1.3689, "mean_token_accuracy": 0.6341559290885925, "num_tokens": 20138131.0, "step": 34 }, { "epoch": 0.16666666666666666, "grad_norm": 0.8736249208450317, "learning_rate": 5.396825396825397e-06, "loss": 1.3855, "mean_token_accuracy": 0.6294394731521606, "num_tokens": 20735187.0, "step": 35 }, { "epoch": 0.17142857142857143, "grad_norm": 0.8438997268676758, "learning_rate": 5.555555555555557e-06, "loss": 1.3614, "mean_token_accuracy": 0.6335337162017822, "num_tokens": 21316383.0, "step": 36 }, { "epoch": 0.1761904761904762, "grad_norm": 0.7541394233703613, "learning_rate": 5.7142857142857145e-06, "loss": 1.3378, "mean_token_accuracy": 0.6401833295822144, "num_tokens": 21910626.0, "step": 37 }, { "epoch": 0.18095238095238095, "grad_norm": 0.697533130645752, "learning_rate": 5.873015873015874e-06, "loss": 1.3591, "mean_token_accuracy": 0.6341187357902527, "num_tokens": 22503955.0, "step": 38 }, { "epoch": 0.18571428571428572, "grad_norm": 0.677990734577179, "learning_rate": 6.031746031746032e-06, "loss": 1.3543, "mean_token_accuracy": 0.6353764533996582, "num_tokens": 23093310.0, "step": 39 }, { "epoch": 0.19047619047619047, "grad_norm": 0.677953839302063, "learning_rate": 6.1904761904761914e-06, "loss": 1.3249, "mean_token_accuracy": 0.641827404499054, "num_tokens": 23681028.0, "step": 40 }, { "epoch": 0.19523809523809524, "grad_norm": 0.6177698969841003, "learning_rate": 6.349206349206349e-06, "loss": 1.3271, "mean_token_accuracy": 0.6412782669067383, "num_tokens": 24275532.0, "step": 41 }, { "epoch": 0.2, "grad_norm": 0.6382781267166138, "learning_rate": 6.507936507936509e-06, "loss": 1.3309, "mean_token_accuracy": 0.6407559514045715, "num_tokens": 24868054.0, "step": 42 }, { "epoch": 0.20476190476190476, "grad_norm": 0.5981337428092957, "learning_rate": 6.666666666666667e-06, "loss": 1.3323, "mean_token_accuracy": 0.6397281885147095, "num_tokens": 25459842.0, "step": 43 }, { "epoch": 0.20952380952380953, "grad_norm": 0.5885143876075745, "learning_rate": 6.825396825396826e-06, "loss": 1.339, "mean_token_accuracy": 0.6373006105422974, "num_tokens": 26051340.0, "step": 44 }, { "epoch": 0.21428571428571427, "grad_norm": 0.5942175984382629, "learning_rate": 6.984126984126984e-06, "loss": 1.3188, "mean_token_accuracy": 0.6426886320114136, "num_tokens": 26635240.0, "step": 45 }, { "epoch": 0.21904761904761905, "grad_norm": 0.6174569129943848, "learning_rate": 7.1428571428571436e-06, "loss": 1.3198, "mean_token_accuracy": 0.6419786214828491, "num_tokens": 27228570.0, "step": 46 }, { "epoch": 0.22380952380952382, "grad_norm": 0.6012991070747375, "learning_rate": 7.301587301587301e-06, "loss": 1.3139, "mean_token_accuracy": 0.6440544128417969, "num_tokens": 27825958.0, "step": 47 }, { "epoch": 0.22857142857142856, "grad_norm": 0.6103922128677368, "learning_rate": 7.460317460317461e-06, "loss": 1.3076, "mean_token_accuracy": 0.6433683037757874, "num_tokens": 28418470.0, "step": 48 }, { "epoch": 0.23333333333333334, "grad_norm": 0.6127147674560547, "learning_rate": 7.61904761904762e-06, "loss": 1.3044, "mean_token_accuracy": 0.6449373364448547, "num_tokens": 29013060.0, "step": 49 }, { "epoch": 0.23809523809523808, "grad_norm": 0.5933082103729248, "learning_rate": 7.77777777777778e-06, "loss": 1.3131, "mean_token_accuracy": 0.6417987942695618, "num_tokens": 29624709.0, "step": 50 }, { "epoch": 0.24285714285714285, "grad_norm": 0.6003814339637756, "learning_rate": 7.936507936507936e-06, "loss": 1.3056, "mean_token_accuracy": 0.6438874006271362, "num_tokens": 30227928.0, "step": 51 }, { "epoch": 0.24761904761904763, "grad_norm": 0.5546218156814575, "learning_rate": 8.095238095238097e-06, "loss": 1.3073, "mean_token_accuracy": 0.6426275968551636, "num_tokens": 30823383.0, "step": 52 }, { "epoch": 0.2523809523809524, "grad_norm": 0.5813356637954712, "learning_rate": 8.253968253968254e-06, "loss": 1.2887, "mean_token_accuracy": 0.6480042338371277, "num_tokens": 31418593.0, "step": 53 }, { "epoch": 0.2571428571428571, "grad_norm": 0.6125403046607971, "learning_rate": 8.412698412698414e-06, "loss": 1.2812, "mean_token_accuracy": 0.6492801904678345, "num_tokens": 32008377.0, "step": 54 }, { "epoch": 0.2619047619047619, "grad_norm": 0.6021028757095337, "learning_rate": 8.571428571428571e-06, "loss": 1.2881, "mean_token_accuracy": 0.6466339230537415, "num_tokens": 32600302.0, "step": 55 }, { "epoch": 0.26666666666666666, "grad_norm": 0.5916977524757385, "learning_rate": 8.730158730158731e-06, "loss": 1.2896, "mean_token_accuracy": 0.6466712951660156, "num_tokens": 33201147.0, "step": 56 }, { "epoch": 0.2714285714285714, "grad_norm": 0.5573871731758118, "learning_rate": 8.888888888888888e-06, "loss": 1.269, "mean_token_accuracy": 0.6514161229133606, "num_tokens": 33790565.0, "step": 57 }, { "epoch": 0.2761904761904762, "grad_norm": 0.6427719593048096, "learning_rate": 9.047619047619049e-06, "loss": 1.2747, "mean_token_accuracy": 0.6507048606872559, "num_tokens": 34387187.0, "step": 58 }, { "epoch": 0.28095238095238095, "grad_norm": 0.5992103219032288, "learning_rate": 9.206349206349207e-06, "loss": 1.2832, "mean_token_accuracy": 0.6487317085266113, "num_tokens": 35000480.0, "step": 59 }, { "epoch": 0.2857142857142857, "grad_norm": 0.6176905632019043, "learning_rate": 9.365079365079366e-06, "loss": 1.266, "mean_token_accuracy": 0.6526767611503601, "num_tokens": 35588577.0, "step": 60 }, { "epoch": 0.2904761904761905, "grad_norm": 0.6162196397781372, "learning_rate": 9.523809523809525e-06, "loss": 1.2696, "mean_token_accuracy": 0.6507794857025146, "num_tokens": 36179186.0, "step": 61 }, { "epoch": 0.29523809523809524, "grad_norm": 0.5662937760353088, "learning_rate": 9.682539682539683e-06, "loss": 1.2769, "mean_token_accuracy": 0.6498540639877319, "num_tokens": 36787338.0, "step": 62 }, { "epoch": 0.3, "grad_norm": 0.6263328790664673, "learning_rate": 9.841269841269842e-06, "loss": 1.2659, "mean_token_accuracy": 0.6512309908866882, "num_tokens": 37376232.0, "step": 63 }, { "epoch": 0.3047619047619048, "grad_norm": 0.5712647438049316, "learning_rate": 1e-05, "loss": 1.2575, "mean_token_accuracy": 0.65373295545578, "num_tokens": 37965066.0, "step": 64 }, { "epoch": 0.30952380952380953, "grad_norm": 0.6364603042602539, "learning_rate": 1e-05, "loss": 1.2707, "mean_token_accuracy": 0.6504393219947815, "num_tokens": 38556474.0, "step": 65 }, { "epoch": 0.3142857142857143, "grad_norm": 0.5501719117164612, "learning_rate": 1e-05, "loss": 1.2817, "mean_token_accuracy": 0.6485756039619446, "num_tokens": 39153957.0, "step": 66 }, { "epoch": 0.319047619047619, "grad_norm": 0.6252837777137756, "learning_rate": 1e-05, "loss": 1.269, "mean_token_accuracy": 0.6509230136871338, "num_tokens": 39743079.0, "step": 67 }, { "epoch": 0.3238095238095238, "grad_norm": 0.635744035243988, "learning_rate": 1e-05, "loss": 1.2538, "mean_token_accuracy": 0.6549092531204224, "num_tokens": 40341422.0, "step": 68 }, { "epoch": 0.32857142857142857, "grad_norm": 0.602989137172699, "learning_rate": 1e-05, "loss": 1.2522, "mean_token_accuracy": 0.6547552347183228, "num_tokens": 40930579.0, "step": 69 }, { "epoch": 0.3333333333333333, "grad_norm": 0.6224581003189087, "learning_rate": 1e-05, "loss": 1.2475, "mean_token_accuracy": 0.6561790704727173, "num_tokens": 41521392.0, "step": 70 }, { "epoch": 0.3380952380952381, "grad_norm": 0.6388071179389954, "learning_rate": 1e-05, "loss": 1.2652, "mean_token_accuracy": 0.6521209478378296, "num_tokens": 42126117.0, "step": 71 }, { "epoch": 0.34285714285714286, "grad_norm": 0.6036304235458374, "learning_rate": 1e-05, "loss": 1.2435, "mean_token_accuracy": 0.6566687822341919, "num_tokens": 42717085.0, "step": 72 }, { "epoch": 0.3476190476190476, "grad_norm": 0.6735650300979614, "learning_rate": 1e-05, "loss": 1.2474, "mean_token_accuracy": 0.6550711989402771, "num_tokens": 43300932.0, "step": 73 }, { "epoch": 0.3523809523809524, "grad_norm": 0.6821399927139282, "learning_rate": 1e-05, "loss": 1.2612, "mean_token_accuracy": 0.6513347625732422, "num_tokens": 43885512.0, "step": 74 }, { "epoch": 0.35714285714285715, "grad_norm": 0.5906922221183777, "learning_rate": 1e-05, "loss": 1.2462, "mean_token_accuracy": 0.6552602052688599, "num_tokens": 44482626.0, "step": 75 }, { "epoch": 0.3619047619047619, "grad_norm": 0.6703640222549438, "learning_rate": 1e-05, "loss": 1.2555, "mean_token_accuracy": 0.6526749134063721, "num_tokens": 45073331.0, "step": 76 }, { "epoch": 0.36666666666666664, "grad_norm": 0.6432617902755737, "learning_rate": 1e-05, "loss": 1.2536, "mean_token_accuracy": 0.654289186000824, "num_tokens": 45683001.0, "step": 77 }, { "epoch": 0.37142857142857144, "grad_norm": 0.5765655040740967, "learning_rate": 1e-05, "loss": 1.2571, "mean_token_accuracy": 0.6539218425750732, "num_tokens": 46280871.0, "step": 78 }, { "epoch": 0.3761904761904762, "grad_norm": 0.6340111494064331, "learning_rate": 1e-05, "loss": 1.2372, "mean_token_accuracy": 0.6561391353607178, "num_tokens": 46860927.0, "step": 79 }, { "epoch": 0.38095238095238093, "grad_norm": 0.6405033469200134, "learning_rate": 1e-05, "loss": 1.2526, "mean_token_accuracy": 0.6536115407943726, "num_tokens": 47450747.0, "step": 80 }, { "epoch": 0.38571428571428573, "grad_norm": 0.5792959332466125, "learning_rate": 1e-05, "loss": 1.25, "mean_token_accuracy": 0.6553176641464233, "num_tokens": 48053355.0, "step": 81 }, { "epoch": 0.3904761904761905, "grad_norm": 0.686775267124176, "learning_rate": 1e-05, "loss": 1.2208, "mean_token_accuracy": 0.659858226776123, "num_tokens": 48654406.0, "step": 82 }, { "epoch": 0.3952380952380952, "grad_norm": 0.6492419838905334, "learning_rate": 1e-05, "loss": 1.2283, "mean_token_accuracy": 0.6583410501480103, "num_tokens": 49253902.0, "step": 83 }, { "epoch": 0.4, "grad_norm": 0.5871007442474365, "learning_rate": 1e-05, "loss": 1.2452, "mean_token_accuracy": 0.6552358269691467, "num_tokens": 49851728.0, "step": 84 }, { "epoch": 0.40476190476190477, "grad_norm": 0.5860946774482727, "learning_rate": 1e-05, "loss": 1.2512, "mean_token_accuracy": 0.6536369919776917, "num_tokens": 50456288.0, "step": 85 }, { "epoch": 0.4095238095238095, "grad_norm": 0.6220575571060181, "learning_rate": 1e-05, "loss": 1.2576, "mean_token_accuracy": 0.6526967883110046, "num_tokens": 51058176.0, "step": 86 }, { "epoch": 0.4142857142857143, "grad_norm": 0.6111760139465332, "learning_rate": 1e-05, "loss": 1.2426, "mean_token_accuracy": 0.6556516885757446, "num_tokens": 51665178.0, "step": 87 }, { "epoch": 0.41904761904761906, "grad_norm": 0.7028889060020447, "learning_rate": 1e-05, "loss": 1.2275, "mean_token_accuracy": 0.658629298210144, "num_tokens": 52237427.0, "step": 88 }, { "epoch": 0.4238095238095238, "grad_norm": 0.6114148497581482, "learning_rate": 1e-05, "loss": 1.2493, "mean_token_accuracy": 0.6530453562736511, "num_tokens": 52850605.0, "step": 89 }, { "epoch": 0.42857142857142855, "grad_norm": 0.6214424967765808, "learning_rate": 1e-05, "loss": 1.2107, "mean_token_accuracy": 0.6619127988815308, "num_tokens": 53435907.0, "step": 90 }, { "epoch": 0.43333333333333335, "grad_norm": 0.6224313378334045, "learning_rate": 1e-05, "loss": 1.2479, "mean_token_accuracy": 0.6531662344932556, "num_tokens": 54032690.0, "step": 91 }, { "epoch": 0.4380952380952381, "grad_norm": 0.5745725035667419, "learning_rate": 1e-05, "loss": 1.2339, "mean_token_accuracy": 0.6577485799789429, "num_tokens": 54631908.0, "step": 92 }, { "epoch": 0.44285714285714284, "grad_norm": 0.6754887104034424, "learning_rate": 1e-05, "loss": 1.2274, "mean_token_accuracy": 0.6584199666976929, "num_tokens": 55218598.0, "step": 93 }, { "epoch": 0.44761904761904764, "grad_norm": 0.6922246813774109, "learning_rate": 1e-05, "loss": 1.2513, "mean_token_accuracy": 0.6527312397956848, "num_tokens": 55814642.0, "step": 94 }, { "epoch": 0.4523809523809524, "grad_norm": 0.5802931189537048, "learning_rate": 1e-05, "loss": 1.2231, "mean_token_accuracy": 0.6605392694473267, "num_tokens": 56410000.0, "step": 95 }, { "epoch": 0.45714285714285713, "grad_norm": 0.7186371088027954, "learning_rate": 1e-05, "loss": 1.2325, "mean_token_accuracy": 0.6574358940124512, "num_tokens": 57001902.0, "step": 96 }, { "epoch": 0.46190476190476193, "grad_norm": 0.5912067294120789, "learning_rate": 1e-05, "loss": 1.2413, "mean_token_accuracy": 0.6551775336265564, "num_tokens": 57612227.0, "step": 97 }, { "epoch": 0.4666666666666667, "grad_norm": 0.7110946774482727, "learning_rate": 1e-05, "loss": 1.2272, "mean_token_accuracy": 0.6573148965835571, "num_tokens": 58198983.0, "step": 98 }, { "epoch": 0.4714285714285714, "grad_norm": 0.703130841255188, "learning_rate": 1e-05, "loss": 1.2488, "mean_token_accuracy": 0.6536985039710999, "num_tokens": 58805739.0, "step": 99 }, { "epoch": 0.47619047619047616, "grad_norm": 0.6474947333335876, "learning_rate": 1e-05, "loss": 1.216, "mean_token_accuracy": 0.6609683036804199, "num_tokens": 59386596.0, "step": 100 }, { "epoch": 0.48095238095238096, "grad_norm": 0.7493091225624084, "learning_rate": 1e-05, "loss": 1.2239, "mean_token_accuracy": 0.6587037444114685, "num_tokens": 59976677.0, "step": 101 }, { "epoch": 0.4857142857142857, "grad_norm": 0.6101422905921936, "learning_rate": 1e-05, "loss": 1.2366, "mean_token_accuracy": 0.6560448408126831, "num_tokens": 60581023.0, "step": 102 }, { "epoch": 0.49047619047619045, "grad_norm": 0.7304781079292297, "learning_rate": 1e-05, "loss": 1.2269, "mean_token_accuracy": 0.6589258909225464, "num_tokens": 61177587.0, "step": 103 }, { "epoch": 0.49523809523809526, "grad_norm": 0.618215024471283, "learning_rate": 1e-05, "loss": 1.2207, "mean_token_accuracy": 0.6586862802505493, "num_tokens": 61759739.0, "step": 104 }, { "epoch": 0.5, "grad_norm": 0.6789980530738831, "learning_rate": 1e-05, "loss": 1.2283, "mean_token_accuracy": 0.6580797433853149, "num_tokens": 62343623.0, "step": 105 }, { "epoch": 0.5047619047619047, "grad_norm": 0.6834375858306885, "learning_rate": 1e-05, "loss": 1.2226, "mean_token_accuracy": 0.6588083505630493, "num_tokens": 62936609.0, "step": 106 }, { "epoch": 0.5095238095238095, "grad_norm": 0.6128349304199219, "learning_rate": 1e-05, "loss": 1.219, "mean_token_accuracy": 0.6602170467376709, "num_tokens": 63540035.0, "step": 107 }, { "epoch": 0.5142857142857142, "grad_norm": 0.6424954533576965, "learning_rate": 1e-05, "loss": 1.2252, "mean_token_accuracy": 0.6583743691444397, "num_tokens": 64137406.0, "step": 108 }, { "epoch": 0.5190476190476191, "grad_norm": 0.566566526889801, "learning_rate": 1e-05, "loss": 1.2104, "mean_token_accuracy": 0.6621809005737305, "num_tokens": 64747343.0, "step": 109 }, { "epoch": 0.5238095238095238, "grad_norm": 0.5913292169570923, "learning_rate": 1e-05, "loss": 1.21, "mean_token_accuracy": 0.6611165404319763, "num_tokens": 65340433.0, "step": 110 }, { "epoch": 0.5285714285714286, "grad_norm": 0.5560601353645325, "learning_rate": 1e-05, "loss": 1.2029, "mean_token_accuracy": 0.6629985570907593, "num_tokens": 65928375.0, "step": 111 }, { "epoch": 0.5333333333333333, "grad_norm": 0.5711589455604553, "learning_rate": 1e-05, "loss": 1.2285, "mean_token_accuracy": 0.6574028134346008, "num_tokens": 66527455.0, "step": 112 }, { "epoch": 0.5380952380952381, "grad_norm": 0.5675383806228638, "learning_rate": 1e-05, "loss": 1.2001, "mean_token_accuracy": 0.6645528674125671, "num_tokens": 67120147.0, "step": 113 }, { "epoch": 0.5428571428571428, "grad_norm": 0.5860258340835571, "learning_rate": 1e-05, "loss": 1.2182, "mean_token_accuracy": 0.6599565744400024, "num_tokens": 67726850.0, "step": 114 }, { "epoch": 0.5476190476190477, "grad_norm": 0.5209094285964966, "learning_rate": 1e-05, "loss": 1.2126, "mean_token_accuracy": 0.6609143018722534, "num_tokens": 68316713.0, "step": 115 }, { "epoch": 0.5523809523809524, "grad_norm": 0.6333171725273132, "learning_rate": 1e-05, "loss": 1.2156, "mean_token_accuracy": 0.6600525379180908, "num_tokens": 68892365.0, "step": 116 }, { "epoch": 0.5571428571428572, "grad_norm": 0.5704973340034485, "learning_rate": 1e-05, "loss": 1.2211, "mean_token_accuracy": 0.6591875553131104, "num_tokens": 69505524.0, "step": 117 }, { "epoch": 0.5619047619047619, "grad_norm": 0.7181419134140015, "learning_rate": 1e-05, "loss": 1.2036, "mean_token_accuracy": 0.6623135805130005, "num_tokens": 70095302.0, "step": 118 }, { "epoch": 0.5666666666666667, "grad_norm": 0.5681948661804199, "learning_rate": 1e-05, "loss": 1.216, "mean_token_accuracy": 0.6598063707351685, "num_tokens": 70694971.0, "step": 119 }, { "epoch": 0.5714285714285714, "grad_norm": 0.7001712918281555, "learning_rate": 1e-05, "loss": 1.2146, "mean_token_accuracy": 0.6608985662460327, "num_tokens": 71279415.0, "step": 120 }, { "epoch": 0.5761904761904761, "grad_norm": 0.6377084255218506, "learning_rate": 1e-05, "loss": 1.209, "mean_token_accuracy": 0.6621115207672119, "num_tokens": 71869014.0, "step": 121 }, { "epoch": 0.580952380952381, "grad_norm": 0.6364737153053284, "learning_rate": 1e-05, "loss": 1.2171, "mean_token_accuracy": 0.6591671705245972, "num_tokens": 72472715.0, "step": 122 }, { "epoch": 0.5857142857142857, "grad_norm": 0.6466585397720337, "learning_rate": 1e-05, "loss": 1.2089, "mean_token_accuracy": 0.661442756652832, "num_tokens": 73055740.0, "step": 123 }, { "epoch": 0.5904761904761905, "grad_norm": 0.5920109152793884, "learning_rate": 1e-05, "loss": 1.1924, "mean_token_accuracy": 0.6659133434295654, "num_tokens": 73639151.0, "step": 124 }, { "epoch": 0.5952380952380952, "grad_norm": 0.6872738599777222, "learning_rate": 1e-05, "loss": 1.2113, "mean_token_accuracy": 0.6628360152244568, "num_tokens": 74216756.0, "step": 125 }, { "epoch": 0.6, "grad_norm": 0.5881339907646179, "learning_rate": 1e-05, "loss": 1.2062, "mean_token_accuracy": 0.662140965461731, "num_tokens": 74813953.0, "step": 126 }, { "epoch": 0.6047619047619047, "grad_norm": 0.6483287215232849, "learning_rate": 1e-05, "loss": 1.2065, "mean_token_accuracy": 0.6624675989151001, "num_tokens": 75410691.0, "step": 127 }, { "epoch": 0.6095238095238096, "grad_norm": 0.5890834331512451, "learning_rate": 1e-05, "loss": 1.2235, "mean_token_accuracy": 0.6575560569763184, "num_tokens": 75996496.0, "step": 128 }, { "epoch": 0.6142857142857143, "grad_norm": 0.6782101988792419, "learning_rate": 1e-05, "loss": 1.199, "mean_token_accuracy": 0.6648662090301514, "num_tokens": 76585198.0, "step": 129 }, { "epoch": 0.6190476190476191, "grad_norm": 0.6252265572547913, "learning_rate": 1e-05, "loss": 1.1872, "mean_token_accuracy": 0.6665824055671692, "num_tokens": 77191596.0, "step": 130 }, { "epoch": 0.6238095238095238, "grad_norm": 0.6833210587501526, "learning_rate": 1e-05, "loss": 1.2048, "mean_token_accuracy": 0.6622829437255859, "num_tokens": 77796998.0, "step": 131 }, { "epoch": 0.6285714285714286, "grad_norm": 0.6870852708816528, "learning_rate": 1e-05, "loss": 1.2171, "mean_token_accuracy": 0.6590390801429749, "num_tokens": 78395104.0, "step": 132 }, { "epoch": 0.6333333333333333, "grad_norm": 0.7417638897895813, "learning_rate": 1e-05, "loss": 1.2036, "mean_token_accuracy": 0.66297847032547, "num_tokens": 78988563.0, "step": 133 }, { "epoch": 0.638095238095238, "grad_norm": 0.569595456123352, "learning_rate": 1e-05, "loss": 1.2234, "mean_token_accuracy": 0.6573336124420166, "num_tokens": 79599633.0, "step": 134 }, { "epoch": 0.6428571428571429, "grad_norm": 0.8054560422897339, "learning_rate": 1e-05, "loss": 1.2149, "mean_token_accuracy": 0.6601018905639648, "num_tokens": 80196954.0, "step": 135 }, { "epoch": 0.6476190476190476, "grad_norm": 0.6360299587249756, "learning_rate": 1e-05, "loss": 1.2141, "mean_token_accuracy": 0.6599046587944031, "num_tokens": 80790959.0, "step": 136 }, { "epoch": 0.6523809523809524, "grad_norm": 0.7952516078948975, "learning_rate": 1e-05, "loss": 1.2004, "mean_token_accuracy": 0.6641189455986023, "num_tokens": 81363350.0, "step": 137 }, { "epoch": 0.6571428571428571, "grad_norm": 0.7050403356552124, "learning_rate": 1e-05, "loss": 1.2017, "mean_token_accuracy": 0.6631975173950195, "num_tokens": 81960356.0, "step": 138 }, { "epoch": 0.6619047619047619, "grad_norm": 0.809806227684021, "learning_rate": 1e-05, "loss": 1.2119, "mean_token_accuracy": 0.6605896353721619, "num_tokens": 82573967.0, "step": 139 }, { "epoch": 0.6666666666666666, "grad_norm": 0.7040579915046692, "learning_rate": 1e-05, "loss": 1.1997, "mean_token_accuracy": 0.6634917259216309, "num_tokens": 83170751.0, "step": 140 }, { "epoch": 0.6714285714285714, "grad_norm": 0.7381901144981384, "learning_rate": 1e-05, "loss": 1.1815, "mean_token_accuracy": 0.6675858497619629, "num_tokens": 83744022.0, "step": 141 }, { "epoch": 0.6761904761904762, "grad_norm": 0.6610327959060669, "learning_rate": 1e-05, "loss": 1.2172, "mean_token_accuracy": 0.659174382686615, "num_tokens": 84336667.0, "step": 142 }, { "epoch": 0.680952380952381, "grad_norm": 0.8185865879058838, "learning_rate": 1e-05, "loss": 1.199, "mean_token_accuracy": 0.6634737253189087, "num_tokens": 84927599.0, "step": 143 }, { "epoch": 0.6857142857142857, "grad_norm": 0.6603442430496216, "learning_rate": 1e-05, "loss": 1.1976, "mean_token_accuracy": 0.6643534898757935, "num_tokens": 85516121.0, "step": 144 }, { "epoch": 0.6904761904761905, "grad_norm": 0.7519460320472717, "learning_rate": 1e-05, "loss": 1.1926, "mean_token_accuracy": 0.664984941482544, "num_tokens": 86106161.0, "step": 145 }, { "epoch": 0.6952380952380952, "grad_norm": 0.7080089449882507, "learning_rate": 1e-05, "loss": 1.2086, "mean_token_accuracy": 0.6621935963630676, "num_tokens": 86723880.0, "step": 146 }, { "epoch": 0.7, "grad_norm": 0.7303557395935059, "learning_rate": 1e-05, "loss": 1.2033, "mean_token_accuracy": 0.6634014248847961, "num_tokens": 87327542.0, "step": 147 }, { "epoch": 0.7047619047619048, "grad_norm": 0.6376964449882507, "learning_rate": 1e-05, "loss": 1.1977, "mean_token_accuracy": 0.6633247137069702, "num_tokens": 87912557.0, "step": 148 }, { "epoch": 0.7095238095238096, "grad_norm": 0.6810888051986694, "learning_rate": 1e-05, "loss": 1.2087, "mean_token_accuracy": 0.6617689728736877, "num_tokens": 88514499.0, "step": 149 }, { "epoch": 0.7142857142857143, "grad_norm": 0.6272366046905518, "learning_rate": 1e-05, "loss": 1.1879, "mean_token_accuracy": 0.6662660837173462, "num_tokens": 89090412.0, "step": 150 }, { "epoch": 0.719047619047619, "grad_norm": 0.6499550938606262, "learning_rate": 1e-05, "loss": 1.1978, "mean_token_accuracy": 0.6638685464859009, "num_tokens": 89689944.0, "step": 151 }, { "epoch": 0.7238095238095238, "grad_norm": 0.6450507640838623, "learning_rate": 1e-05, "loss": 1.2088, "mean_token_accuracy": 0.6614329218864441, "num_tokens": 90281605.0, "step": 152 }, { "epoch": 0.7285714285714285, "grad_norm": 0.6113287806510925, "learning_rate": 1e-05, "loss": 1.2095, "mean_token_accuracy": 0.6616454124450684, "num_tokens": 90877169.0, "step": 153 }, { "epoch": 0.7333333333333333, "grad_norm": 0.6421619653701782, "learning_rate": 1e-05, "loss": 1.2141, "mean_token_accuracy": 0.6598343253135681, "num_tokens": 91473587.0, "step": 154 }, { "epoch": 0.7380952380952381, "grad_norm": 0.5994828939437866, "learning_rate": 1e-05, "loss": 1.2069, "mean_token_accuracy": 0.6615887880325317, "num_tokens": 92066142.0, "step": 155 }, { "epoch": 0.7428571428571429, "grad_norm": 0.5635871887207031, "learning_rate": 1e-05, "loss": 1.1885, "mean_token_accuracy": 0.6657248735427856, "num_tokens": 92671294.0, "step": 156 }, { "epoch": 0.7476190476190476, "grad_norm": 0.5961142778396606, "learning_rate": 1e-05, "loss": 1.1915, "mean_token_accuracy": 0.6649054884910583, "num_tokens": 93267004.0, "step": 157 }, { "epoch": 0.7523809523809524, "grad_norm": 0.5518187284469604, "learning_rate": 1e-05, "loss": 1.2093, "mean_token_accuracy": 0.6612235307693481, "num_tokens": 93865099.0, "step": 158 }, { "epoch": 0.7571428571428571, "grad_norm": 0.6183374524116516, "learning_rate": 1e-05, "loss": 1.1825, "mean_token_accuracy": 0.6676396131515503, "num_tokens": 94449283.0, "step": 159 }, { "epoch": 0.7619047619047619, "grad_norm": 0.5925056338310242, "learning_rate": 1e-05, "loss": 1.1927, "mean_token_accuracy": 0.6643291711807251, "num_tokens": 95037160.0, "step": 160 }, { "epoch": 0.7666666666666667, "grad_norm": 0.6148018836975098, "learning_rate": 1e-05, "loss": 1.1761, "mean_token_accuracy": 0.6689929962158203, "num_tokens": 95620329.0, "step": 161 }, { "epoch": 0.7714285714285715, "grad_norm": 0.6416387557983398, "learning_rate": 1e-05, "loss": 1.1978, "mean_token_accuracy": 0.6625751256942749, "num_tokens": 96202979.0, "step": 162 }, { "epoch": 0.7761904761904762, "grad_norm": 0.5393695831298828, "learning_rate": 1e-05, "loss": 1.1918, "mean_token_accuracy": 0.665260910987854, "num_tokens": 96794135.0, "step": 163 }, { "epoch": 0.780952380952381, "grad_norm": 0.6334103941917419, "learning_rate": 1e-05, "loss": 1.1821, "mean_token_accuracy": 0.6664952635765076, "num_tokens": 97380180.0, "step": 164 }, { "epoch": 0.7857142857142857, "grad_norm": 0.6443802118301392, "learning_rate": 1e-05, "loss": 1.2005, "mean_token_accuracy": 0.663545548915863, "num_tokens": 97979583.0, "step": 165 }, { "epoch": 0.7904761904761904, "grad_norm": 0.6070786714553833, "learning_rate": 1e-05, "loss": 1.1818, "mean_token_accuracy": 0.6681106686592102, "num_tokens": 98573453.0, "step": 166 }, { "epoch": 0.7952380952380952, "grad_norm": 0.5983892679214478, "learning_rate": 1e-05, "loss": 1.189, "mean_token_accuracy": 0.6651272177696228, "num_tokens": 99162518.0, "step": 167 }, { "epoch": 0.8, "grad_norm": 0.5511825084686279, "learning_rate": 1e-05, "loss": 1.1859, "mean_token_accuracy": 0.6656243801116943, "num_tokens": 99755688.0, "step": 168 }, { "epoch": 0.8047619047619048, "grad_norm": 0.5612326264381409, "learning_rate": 1e-05, "loss": 1.1923, "mean_token_accuracy": 0.6645892858505249, "num_tokens": 100367122.0, "step": 169 }, { "epoch": 0.8095238095238095, "grad_norm": 0.6149346232414246, "learning_rate": 1e-05, "loss": 1.1866, "mean_token_accuracy": 0.665177583694458, "num_tokens": 100966663.0, "step": 170 }, { "epoch": 0.8142857142857143, "grad_norm": 0.5557584166526794, "learning_rate": 1e-05, "loss": 1.1993, "mean_token_accuracy": 0.6638921499252319, "num_tokens": 101561561.0, "step": 171 }, { "epoch": 0.819047619047619, "grad_norm": 0.6174666285514832, "learning_rate": 1e-05, "loss": 1.2058, "mean_token_accuracy": 0.6619209051132202, "num_tokens": 102150367.0, "step": 172 }, { "epoch": 0.8238095238095238, "grad_norm": 0.6149846911430359, "learning_rate": 1e-05, "loss": 1.1956, "mean_token_accuracy": 0.6646385788917542, "num_tokens": 102744438.0, "step": 173 }, { "epoch": 0.8285714285714286, "grad_norm": 0.6205980777740479, "learning_rate": 1e-05, "loss": 1.1944, "mean_token_accuracy": 0.6641254425048828, "num_tokens": 103336159.0, "step": 174 }, { "epoch": 0.8333333333333334, "grad_norm": 0.6782044172286987, "learning_rate": 1e-05, "loss": 1.1993, "mean_token_accuracy": 0.6630405187606812, "num_tokens": 103933457.0, "step": 175 }, { "epoch": 0.8380952380952381, "grad_norm": 0.6339226961135864, "learning_rate": 1e-05, "loss": 1.1854, "mean_token_accuracy": 0.6652607917785645, "num_tokens": 104528020.0, "step": 176 }, { "epoch": 0.8428571428571429, "grad_norm": 0.604350209236145, "learning_rate": 1e-05, "loss": 1.2142, "mean_token_accuracy": 0.6597182750701904, "num_tokens": 105126562.0, "step": 177 }, { "epoch": 0.8476190476190476, "grad_norm": 0.5730092525482178, "learning_rate": 1e-05, "loss": 1.1796, "mean_token_accuracy": 0.6674203872680664, "num_tokens": 105730229.0, "step": 178 }, { "epoch": 0.8523809523809524, "grad_norm": 0.6724650263786316, "learning_rate": 1e-05, "loss": 1.201, "mean_token_accuracy": 0.6622498035430908, "num_tokens": 106338239.0, "step": 179 }, { "epoch": 0.8571428571428571, "grad_norm": 0.5882953405380249, "learning_rate": 1e-05, "loss": 1.1982, "mean_token_accuracy": 0.6630674600601196, "num_tokens": 106929782.0, "step": 180 }, { "epoch": 0.861904761904762, "grad_norm": 0.6305244565010071, "learning_rate": 1e-05, "loss": 1.1932, "mean_token_accuracy": 0.6646133661270142, "num_tokens": 107516950.0, "step": 181 }, { "epoch": 0.8666666666666667, "grad_norm": 0.6297836899757385, "learning_rate": 1e-05, "loss": 1.1825, "mean_token_accuracy": 0.6660134792327881, "num_tokens": 108104046.0, "step": 182 }, { "epoch": 0.8714285714285714, "grad_norm": 0.5446469783782959, "learning_rate": 1e-05, "loss": 1.1992, "mean_token_accuracy": 0.6630533933639526, "num_tokens": 108711068.0, "step": 183 }, { "epoch": 0.8761904761904762, "grad_norm": 0.5844411253929138, "learning_rate": 1e-05, "loss": 1.1687, "mean_token_accuracy": 0.669592022895813, "num_tokens": 109294847.0, "step": 184 }, { "epoch": 0.8809523809523809, "grad_norm": 0.6065420508384705, "learning_rate": 1e-05, "loss": 1.1886, "mean_token_accuracy": 0.664987325668335, "num_tokens": 109903424.0, "step": 185 }, { "epoch": 0.8857142857142857, "grad_norm": 0.6002596616744995, "learning_rate": 1e-05, "loss": 1.1894, "mean_token_accuracy": 0.66484135389328, "num_tokens": 110515082.0, "step": 186 }, { "epoch": 0.8904761904761904, "grad_norm": 0.5755858421325684, "learning_rate": 1e-05, "loss": 1.1887, "mean_token_accuracy": 0.6651521325111389, "num_tokens": 111105456.0, "step": 187 }, { "epoch": 0.8952380952380953, "grad_norm": 0.6171888709068298, "learning_rate": 1e-05, "loss": 1.1893, "mean_token_accuracy": 0.6657494306564331, "num_tokens": 111699029.0, "step": 188 }, { "epoch": 0.9, "grad_norm": 0.579205334186554, "learning_rate": 1e-05, "loss": 1.1659, "mean_token_accuracy": 0.6696426272392273, "num_tokens": 112280321.0, "step": 189 }, { "epoch": 0.9047619047619048, "grad_norm": 0.6712483167648315, "learning_rate": 1e-05, "loss": 1.1677, "mean_token_accuracy": 0.6694087982177734, "num_tokens": 112860009.0, "step": 190 }, { "epoch": 0.9095238095238095, "grad_norm": 0.6215792894363403, "learning_rate": 1e-05, "loss": 1.1872, "mean_token_accuracy": 0.6649343967437744, "num_tokens": 113457303.0, "step": 191 }, { "epoch": 0.9142857142857143, "grad_norm": 0.5627334117889404, "learning_rate": 1e-05, "loss": 1.181, "mean_token_accuracy": 0.6672377586364746, "num_tokens": 114054977.0, "step": 192 }, { "epoch": 0.919047619047619, "grad_norm": 0.5678215622901917, "learning_rate": 1e-05, "loss": 1.1778, "mean_token_accuracy": 0.6673398613929749, "num_tokens": 114641555.0, "step": 193 }, { "epoch": 0.9238095238095239, "grad_norm": 0.5933332443237305, "learning_rate": 1e-05, "loss": 1.1939, "mean_token_accuracy": 0.6647536754608154, "num_tokens": 115241437.0, "step": 194 }, { "epoch": 0.9285714285714286, "grad_norm": 0.5732199549674988, "learning_rate": 1e-05, "loss": 1.1714, "mean_token_accuracy": 0.6686159372329712, "num_tokens": 115845775.0, "step": 195 }, { "epoch": 0.9333333333333333, "grad_norm": 0.6514256596565247, "learning_rate": 1e-05, "loss": 1.1782, "mean_token_accuracy": 0.6679466962814331, "num_tokens": 116452623.0, "step": 196 }, { "epoch": 0.9380952380952381, "grad_norm": 0.5765755772590637, "learning_rate": 1e-05, "loss": 1.1861, "mean_token_accuracy": 0.6654437780380249, "num_tokens": 117045570.0, "step": 197 }, { "epoch": 0.9428571428571428, "grad_norm": 0.7004836797714233, "learning_rate": 1e-05, "loss": 1.1638, "mean_token_accuracy": 0.6707776784896851, "num_tokens": 117654535.0, "step": 198 }, { "epoch": 0.9476190476190476, "grad_norm": 0.5966997146606445, "learning_rate": 1e-05, "loss": 1.1772, "mean_token_accuracy": 0.6684892177581787, "num_tokens": 118247244.0, "step": 199 }, { "epoch": 0.9523809523809523, "grad_norm": 0.6460300087928772, "learning_rate": 1e-05, "loss": 1.1713, "mean_token_accuracy": 0.6694802045822144, "num_tokens": 118843074.0, "step": 200 }, { "epoch": 0.9571428571428572, "grad_norm": 0.599161684513092, "learning_rate": 1e-05, "loss": 1.1712, "mean_token_accuracy": 0.6690815687179565, "num_tokens": 119445023.0, "step": 201 }, { "epoch": 0.9619047619047619, "grad_norm": 0.6229502558708191, "learning_rate": 1e-05, "loss": 1.1864, "mean_token_accuracy": 0.6660387516021729, "num_tokens": 120045748.0, "step": 202 }, { "epoch": 0.9666666666666667, "grad_norm": 0.6429843306541443, "learning_rate": 1e-05, "loss": 1.1785, "mean_token_accuracy": 0.6669691205024719, "num_tokens": 120635079.0, "step": 203 }, { "epoch": 0.9714285714285714, "grad_norm": 0.6153910756111145, "learning_rate": 1e-05, "loss": 1.1791, "mean_token_accuracy": 0.66630619764328, "num_tokens": 121220486.0, "step": 204 }, { "epoch": 0.9761904761904762, "grad_norm": 0.6496953368186951, "learning_rate": 1e-05, "loss": 1.1804, "mean_token_accuracy": 0.6666555404663086, "num_tokens": 121800676.0, "step": 205 }, { "epoch": 0.9809523809523809, "grad_norm": 0.6011868119239807, "learning_rate": 1e-05, "loss": 1.1842, "mean_token_accuracy": 0.6658217906951904, "num_tokens": 122409399.0, "step": 206 }, { "epoch": 0.9857142857142858, "grad_norm": 0.857315182685852, "learning_rate": 1e-05, "loss": 1.1652, "mean_token_accuracy": 0.6701173186302185, "num_tokens": 123003502.0, "step": 207 }, { "epoch": 0.9904761904761905, "grad_norm": 0.6711968183517456, "learning_rate": 1e-05, "loss": 1.1821, "mean_token_accuracy": 0.6669960021972656, "num_tokens": 123595838.0, "step": 208 }, { "epoch": 0.9952380952380953, "grad_norm": 0.8044399619102478, "learning_rate": 1e-05, "loss": 1.1797, "mean_token_accuracy": 0.6671728491783142, "num_tokens": 124166476.0, "step": 209 }, { "epoch": 1.0, "grad_norm": 0.724872887134552, "learning_rate": 1e-05, "loss": 1.1689, "mean_token_accuracy": 0.66896653175354, "num_tokens": 124761423.0, "step": 210 }, { "epoch": 1.0047619047619047, "grad_norm": 0.7732614278793335, "learning_rate": 1e-05, "loss": 1.176, "mean_token_accuracy": 0.6668572425842285, "num_tokens": 125364371.0, "step": 211 }, { "epoch": 1.0095238095238095, "grad_norm": 0.6983124017715454, "learning_rate": 1e-05, "loss": 1.1342, "mean_token_accuracy": 0.6760746240615845, "num_tokens": 125954118.0, "step": 212 }, { "epoch": 1.0142857142857142, "grad_norm": 0.6097580790519714, "learning_rate": 1e-05, "loss": 1.1398, "mean_token_accuracy": 0.6745401620864868, "num_tokens": 126544991.0, "step": 213 }, { "epoch": 1.019047619047619, "grad_norm": 0.6844852566719055, "learning_rate": 1e-05, "loss": 1.1425, "mean_token_accuracy": 0.6751389503479004, "num_tokens": 127151772.0, "step": 214 }, { "epoch": 1.0238095238095237, "grad_norm": 0.7108845114707947, "learning_rate": 1e-05, "loss": 1.1472, "mean_token_accuracy": 0.6734536290168762, "num_tokens": 127762517.0, "step": 215 }, { "epoch": 1.0285714285714285, "grad_norm": 0.7051171660423279, "learning_rate": 1e-05, "loss": 1.1516, "mean_token_accuracy": 0.672438383102417, "num_tokens": 128358892.0, "step": 216 }, { "epoch": 1.0333333333333334, "grad_norm": 0.742440938949585, "learning_rate": 1e-05, "loss": 1.1486, "mean_token_accuracy": 0.6727321743965149, "num_tokens": 128930309.0, "step": 217 }, { "epoch": 1.0380952380952382, "grad_norm": 0.6921288371086121, "learning_rate": 1e-05, "loss": 1.1336, "mean_token_accuracy": 0.6769453883171082, "num_tokens": 129537678.0, "step": 218 }, { "epoch": 1.042857142857143, "grad_norm": 0.6531715989112854, "learning_rate": 1e-05, "loss": 1.1486, "mean_token_accuracy": 0.6732891201972961, "num_tokens": 130113717.0, "step": 219 }, { "epoch": 1.0476190476190477, "grad_norm": 0.8497748970985413, "learning_rate": 1e-05, "loss": 1.1554, "mean_token_accuracy": 0.6714987754821777, "num_tokens": 130724521.0, "step": 220 }, { "epoch": 1.0523809523809524, "grad_norm": 0.6819850206375122, "learning_rate": 1e-05, "loss": 1.1407, "mean_token_accuracy": 0.6752928495407104, "num_tokens": 131298037.0, "step": 221 }, { "epoch": 1.0571428571428572, "grad_norm": 0.785930335521698, "learning_rate": 1e-05, "loss": 1.1486, "mean_token_accuracy": 0.6729685068130493, "num_tokens": 131909779.0, "step": 222 }, { "epoch": 1.061904761904762, "grad_norm": 0.6023511290550232, "learning_rate": 1e-05, "loss": 1.1458, "mean_token_accuracy": 0.6734186410903931, "num_tokens": 132506621.0, "step": 223 }, { "epoch": 1.0666666666666667, "grad_norm": 0.8720818758010864, "learning_rate": 1e-05, "loss": 1.1498, "mean_token_accuracy": 0.6726520657539368, "num_tokens": 133124443.0, "step": 224 }, { "epoch": 1.0714285714285714, "grad_norm": 0.6429004073143005, "learning_rate": 1e-05, "loss": 1.1608, "mean_token_accuracy": 0.6698133945465088, "num_tokens": 133719672.0, "step": 225 }, { "epoch": 1.0761904761904761, "grad_norm": 0.7744424343109131, "learning_rate": 1e-05, "loss": 1.1357, "mean_token_accuracy": 0.6747680306434631, "num_tokens": 134309852.0, "step": 226 }, { "epoch": 1.0809523809523809, "grad_norm": 0.7106124758720398, "learning_rate": 1e-05, "loss": 1.1472, "mean_token_accuracy": 0.6723679900169373, "num_tokens": 134890952.0, "step": 227 }, { "epoch": 1.0857142857142856, "grad_norm": 0.8420917987823486, "learning_rate": 1e-05, "loss": 1.1144, "mean_token_accuracy": 0.6813350915908813, "num_tokens": 135479588.0, "step": 228 }, { "epoch": 1.0904761904761904, "grad_norm": 0.7307847738265991, "learning_rate": 1e-05, "loss": 1.14, "mean_token_accuracy": 0.6748834848403931, "num_tokens": 136065836.0, "step": 229 }, { "epoch": 1.0952380952380953, "grad_norm": 0.6740959882736206, "learning_rate": 1e-05, "loss": 1.1377, "mean_token_accuracy": 0.6761696934700012, "num_tokens": 136668062.0, "step": 230 }, { "epoch": 1.1, "grad_norm": 0.6920994520187378, "learning_rate": 1e-05, "loss": 1.1398, "mean_token_accuracy": 0.6743276715278625, "num_tokens": 137256533.0, "step": 231 }, { "epoch": 1.1047619047619048, "grad_norm": 0.6870349645614624, "learning_rate": 1e-05, "loss": 1.1459, "mean_token_accuracy": 0.6732701063156128, "num_tokens": 137848246.0, "step": 232 }, { "epoch": 1.1095238095238096, "grad_norm": 0.6535449028015137, "learning_rate": 1e-05, "loss": 1.1494, "mean_token_accuracy": 0.6729423999786377, "num_tokens": 138450870.0, "step": 233 }, { "epoch": 1.1142857142857143, "grad_norm": 0.6108024716377258, "learning_rate": 1e-05, "loss": 1.1498, "mean_token_accuracy": 0.671773374080658, "num_tokens": 139048178.0, "step": 234 }, { "epoch": 1.119047619047619, "grad_norm": 0.618743360042572, "learning_rate": 1e-05, "loss": 1.1394, "mean_token_accuracy": 0.6749163866043091, "num_tokens": 139647536.0, "step": 235 }, { "epoch": 1.1238095238095238, "grad_norm": 0.5873496532440186, "learning_rate": 1e-05, "loss": 1.1428, "mean_token_accuracy": 0.6742033958435059, "num_tokens": 140237569.0, "step": 236 }, { "epoch": 1.1285714285714286, "grad_norm": 0.6749809980392456, "learning_rate": 1e-05, "loss": 1.1462, "mean_token_accuracy": 0.67291659116745, "num_tokens": 140808948.0, "step": 237 }, { "epoch": 1.1333333333333333, "grad_norm": 0.5988799333572388, "learning_rate": 1e-05, "loss": 1.1417, "mean_token_accuracy": 0.6734879016876221, "num_tokens": 141387906.0, "step": 238 }, { "epoch": 1.138095238095238, "grad_norm": 0.7041788697242737, "learning_rate": 1e-05, "loss": 1.1421, "mean_token_accuracy": 0.6749635934829712, "num_tokens": 141991024.0, "step": 239 }, { "epoch": 1.1428571428571428, "grad_norm": 0.677106499671936, "learning_rate": 1e-05, "loss": 1.122, "mean_token_accuracy": 0.6789741516113281, "num_tokens": 142585170.0, "step": 240 }, { "epoch": 1.1476190476190475, "grad_norm": 0.6422439217567444, "learning_rate": 1e-05, "loss": 1.1509, "mean_token_accuracy": 0.6719658374786377, "num_tokens": 143178473.0, "step": 241 }, { "epoch": 1.1523809523809523, "grad_norm": 0.6920860409736633, "learning_rate": 1e-05, "loss": 1.1511, "mean_token_accuracy": 0.6708908677101135, "num_tokens": 143782184.0, "step": 242 }, { "epoch": 1.157142857142857, "grad_norm": 0.5582302212715149, "learning_rate": 1e-05, "loss": 1.1331, "mean_token_accuracy": 0.6759682297706604, "num_tokens": 144383051.0, "step": 243 }, { "epoch": 1.161904761904762, "grad_norm": 0.6627556085586548, "learning_rate": 1e-05, "loss": 1.1432, "mean_token_accuracy": 0.6744831204414368, "num_tokens": 144977872.0, "step": 244 }, { "epoch": 1.1666666666666667, "grad_norm": 0.5956741571426392, "learning_rate": 1e-05, "loss": 1.1456, "mean_token_accuracy": 0.6733117699623108, "num_tokens": 145573077.0, "step": 245 }, { "epoch": 1.1714285714285715, "grad_norm": 0.7862910628318787, "learning_rate": 1e-05, "loss": 1.1464, "mean_token_accuracy": 0.6739993691444397, "num_tokens": 146165107.0, "step": 246 }, { "epoch": 1.1761904761904762, "grad_norm": 0.6099702715873718, "learning_rate": 1e-05, "loss": 1.1393, "mean_token_accuracy": 0.6740779876708984, "num_tokens": 146763356.0, "step": 247 }, { "epoch": 1.180952380952381, "grad_norm": 0.7584065198898315, "learning_rate": 1e-05, "loss": 1.136, "mean_token_accuracy": 0.6759775280952454, "num_tokens": 147358035.0, "step": 248 }, { "epoch": 1.1857142857142857, "grad_norm": 0.6754823327064514, "learning_rate": 1e-05, "loss": 1.1523, "mean_token_accuracy": 0.6710008978843689, "num_tokens": 147955530.0, "step": 249 }, { "epoch": 1.1904761904761905, "grad_norm": 0.6045711636543274, "learning_rate": 1e-05, "loss": 1.1468, "mean_token_accuracy": 0.6728271245956421, "num_tokens": 148547950.0, "step": 250 }, { "epoch": 1.1952380952380952, "grad_norm": 0.6770275235176086, "learning_rate": 1e-05, "loss": 1.1309, "mean_token_accuracy": 0.6762286424636841, "num_tokens": 149127280.0, "step": 251 }, { "epoch": 1.2, "grad_norm": 0.5667791366577148, "learning_rate": 1e-05, "loss": 1.1389, "mean_token_accuracy": 0.6750789284706116, "num_tokens": 149735096.0, "step": 252 }, { "epoch": 1.2047619047619047, "grad_norm": 0.6122450232505798, "learning_rate": 1e-05, "loss": 1.1423, "mean_token_accuracy": 0.6746940612792969, "num_tokens": 150338864.0, "step": 253 }, { "epoch": 1.2095238095238094, "grad_norm": 0.6596109867095947, "learning_rate": 1e-05, "loss": 1.1234, "mean_token_accuracy": 0.6786649227142334, "num_tokens": 150940365.0, "step": 254 }, { "epoch": 1.2142857142857142, "grad_norm": 0.6414262652397156, "learning_rate": 1e-05, "loss": 1.1454, "mean_token_accuracy": 0.6734991073608398, "num_tokens": 151517902.0, "step": 255 }, { "epoch": 1.2190476190476192, "grad_norm": 0.7465854287147522, "learning_rate": 1e-05, "loss": 1.1225, "mean_token_accuracy": 0.6790366172790527, "num_tokens": 152093932.0, "step": 256 }, { "epoch": 1.223809523809524, "grad_norm": 0.6045883297920227, "learning_rate": 1e-05, "loss": 1.1281, "mean_token_accuracy": 0.6779497861862183, "num_tokens": 152690003.0, "step": 257 }, { "epoch": 1.2285714285714286, "grad_norm": 0.7717053890228271, "learning_rate": 1e-05, "loss": 1.1305, "mean_token_accuracy": 0.6769629716873169, "num_tokens": 153278014.0, "step": 258 }, { "epoch": 1.2333333333333334, "grad_norm": 0.6217109560966492, "learning_rate": 1e-05, "loss": 1.1377, "mean_token_accuracy": 0.6756360530853271, "num_tokens": 153871781.0, "step": 259 }, { "epoch": 1.2380952380952381, "grad_norm": 0.7101379632949829, "learning_rate": 1e-05, "loss": 1.1396, "mean_token_accuracy": 0.6745343208312988, "num_tokens": 154466124.0, "step": 260 }, { "epoch": 1.2428571428571429, "grad_norm": 0.6611591577529907, "learning_rate": 1e-05, "loss": 1.1342, "mean_token_accuracy": 0.675082802772522, "num_tokens": 155073053.0, "step": 261 }, { "epoch": 1.2476190476190476, "grad_norm": 0.7041805386543274, "learning_rate": 1e-05, "loss": 1.1612, "mean_token_accuracy": 0.6703898906707764, "num_tokens": 155680694.0, "step": 262 }, { "epoch": 1.2523809523809524, "grad_norm": 0.6518973708152771, "learning_rate": 1e-05, "loss": 1.1492, "mean_token_accuracy": 0.6719495058059692, "num_tokens": 156279612.0, "step": 263 }, { "epoch": 1.2571428571428571, "grad_norm": 0.6293846368789673, "learning_rate": 1e-05, "loss": 1.1381, "mean_token_accuracy": 0.6761749982833862, "num_tokens": 156898086.0, "step": 264 }, { "epoch": 1.2619047619047619, "grad_norm": 0.5713494420051575, "learning_rate": 1e-05, "loss": 1.1527, "mean_token_accuracy": 0.6716663837432861, "num_tokens": 157502996.0, "step": 265 }, { "epoch": 1.2666666666666666, "grad_norm": 0.6561734676361084, "learning_rate": 1e-05, "loss": 1.1544, "mean_token_accuracy": 0.6708611845970154, "num_tokens": 158107778.0, "step": 266 }, { "epoch": 1.2714285714285714, "grad_norm": 0.5799586772918701, "learning_rate": 1e-05, "loss": 1.1177, "mean_token_accuracy": 0.6797953844070435, "num_tokens": 158713147.0, "step": 267 }, { "epoch": 1.276190476190476, "grad_norm": 0.5941030979156494, "learning_rate": 1e-05, "loss": 1.1255, "mean_token_accuracy": 0.6776763200759888, "num_tokens": 159292006.0, "step": 268 }, { "epoch": 1.2809523809523808, "grad_norm": 0.6683588624000549, "learning_rate": 1e-05, "loss": 1.1234, "mean_token_accuracy": 0.6778484582901001, "num_tokens": 159889197.0, "step": 269 }, { "epoch": 1.2857142857142856, "grad_norm": 0.6561569571495056, "learning_rate": 1e-05, "loss": 1.1378, "mean_token_accuracy": 0.6750425696372986, "num_tokens": 160485304.0, "step": 270 }, { "epoch": 1.2904761904761906, "grad_norm": 0.5719537138938904, "learning_rate": 1e-05, "loss": 1.1404, "mean_token_accuracy": 0.6747204065322876, "num_tokens": 161092433.0, "step": 271 }, { "epoch": 1.2952380952380953, "grad_norm": 0.6006868481636047, "learning_rate": 1e-05, "loss": 1.1396, "mean_token_accuracy": 0.6749382019042969, "num_tokens": 161683555.0, "step": 272 }, { "epoch": 1.3, "grad_norm": 0.6102608442306519, "learning_rate": 1e-05, "loss": 1.1293, "mean_token_accuracy": 0.6775893568992615, "num_tokens": 162278973.0, "step": 273 }, { "epoch": 1.3047619047619048, "grad_norm": 0.6217197179794312, "learning_rate": 1e-05, "loss": 1.1366, "mean_token_accuracy": 0.6764044165611267, "num_tokens": 162885270.0, "step": 274 }, { "epoch": 1.3095238095238095, "grad_norm": 0.6187546253204346, "learning_rate": 1e-05, "loss": 1.1315, "mean_token_accuracy": 0.6765252351760864, "num_tokens": 163476311.0, "step": 275 }, { "epoch": 1.3142857142857143, "grad_norm": 0.5942601561546326, "learning_rate": 1e-05, "loss": 1.1455, "mean_token_accuracy": 0.6730477213859558, "num_tokens": 164071314.0, "step": 276 }, { "epoch": 1.319047619047619, "grad_norm": 0.5942831635475159, "learning_rate": 1e-05, "loss": 1.1321, "mean_token_accuracy": 0.6764451861381531, "num_tokens": 164663415.0, "step": 277 }, { "epoch": 1.3238095238095238, "grad_norm": 0.6232311129570007, "learning_rate": 1e-05, "loss": 1.1269, "mean_token_accuracy": 0.6775949597358704, "num_tokens": 165256997.0, "step": 278 }, { "epoch": 1.3285714285714285, "grad_norm": 0.6126914024353027, "learning_rate": 1e-05, "loss": 1.1317, "mean_token_accuracy": 0.676669716835022, "num_tokens": 165847922.0, "step": 279 }, { "epoch": 1.3333333333333333, "grad_norm": 0.6624312400817871, "learning_rate": 1e-05, "loss": 1.1337, "mean_token_accuracy": 0.6758729815483093, "num_tokens": 166444541.0, "step": 280 }, { "epoch": 1.3380952380952382, "grad_norm": 0.6634590029716492, "learning_rate": 1e-05, "loss": 1.1246, "mean_token_accuracy": 0.6781991124153137, "num_tokens": 167028591.0, "step": 281 }, { "epoch": 1.342857142857143, "grad_norm": 0.7142046093940735, "learning_rate": 1e-05, "loss": 1.1473, "mean_token_accuracy": 0.6724534034729004, "num_tokens": 167627132.0, "step": 282 }, { "epoch": 1.3476190476190477, "grad_norm": 0.5835825800895691, "learning_rate": 1e-05, "loss": 1.119, "mean_token_accuracy": 0.6801720857620239, "num_tokens": 168226854.0, "step": 283 }, { "epoch": 1.3523809523809525, "grad_norm": 0.7441895008087158, "learning_rate": 1e-05, "loss": 1.1508, "mean_token_accuracy": 0.6721788048744202, "num_tokens": 168833732.0, "step": 284 }, { "epoch": 1.3571428571428572, "grad_norm": 0.613866925239563, "learning_rate": 1e-05, "loss": 1.1263, "mean_token_accuracy": 0.6785060167312622, "num_tokens": 169426970.0, "step": 285 }, { "epoch": 1.361904761904762, "grad_norm": 0.7395045161247253, "learning_rate": 1e-05, "loss": 1.1504, "mean_token_accuracy": 0.6733224391937256, "num_tokens": 170025787.0, "step": 286 }, { "epoch": 1.3666666666666667, "grad_norm": 0.7011858224868774, "learning_rate": 1e-05, "loss": 1.1457, "mean_token_accuracy": 0.6723621487617493, "num_tokens": 170621857.0, "step": 287 }, { "epoch": 1.3714285714285714, "grad_norm": 0.6301146149635315, "learning_rate": 1e-05, "loss": 1.1428, "mean_token_accuracy": 0.6736270189285278, "num_tokens": 171220708.0, "step": 288 }, { "epoch": 1.3761904761904762, "grad_norm": 0.6546505093574524, "learning_rate": 1e-05, "loss": 1.1527, "mean_token_accuracy": 0.6708388924598694, "num_tokens": 171812508.0, "step": 289 }, { "epoch": 1.380952380952381, "grad_norm": 0.665846049785614, "learning_rate": 1e-05, "loss": 1.1239, "mean_token_accuracy": 0.6771635413169861, "num_tokens": 172401090.0, "step": 290 }, { "epoch": 1.3857142857142857, "grad_norm": 0.6951489448547363, "learning_rate": 1e-05, "loss": 1.1303, "mean_token_accuracy": 0.6767557263374329, "num_tokens": 172989201.0, "step": 291 }, { "epoch": 1.3904761904761904, "grad_norm": 0.6228903532028198, "learning_rate": 1e-05, "loss": 1.1316, "mean_token_accuracy": 0.6754661798477173, "num_tokens": 173563807.0, "step": 292 }, { "epoch": 1.3952380952380952, "grad_norm": 0.7011890411376953, "learning_rate": 1e-05, "loss": 1.1303, "mean_token_accuracy": 0.6768910884857178, "num_tokens": 174159574.0, "step": 293 }, { "epoch": 1.4, "grad_norm": 0.6298404932022095, "learning_rate": 1e-05, "loss": 1.1487, "mean_token_accuracy": 0.672224223613739, "num_tokens": 174744244.0, "step": 294 }, { "epoch": 1.4047619047619047, "grad_norm": 0.6158511638641357, "learning_rate": 1e-05, "loss": 1.1315, "mean_token_accuracy": 0.6756511926651001, "num_tokens": 175341946.0, "step": 295 }, { "epoch": 1.4095238095238094, "grad_norm": 0.6887179613113403, "learning_rate": 1e-05, "loss": 1.1019, "mean_token_accuracy": 0.6828951239585876, "num_tokens": 175904117.0, "step": 296 }, { "epoch": 1.4142857142857144, "grad_norm": 0.64696204662323, "learning_rate": 1e-05, "loss": 1.1307, "mean_token_accuracy": 0.6764581799507141, "num_tokens": 176493621.0, "step": 297 }, { "epoch": 1.4190476190476191, "grad_norm": 0.5804628133773804, "learning_rate": 1e-05, "loss": 1.1316, "mean_token_accuracy": 0.6758260726928711, "num_tokens": 177082157.0, "step": 298 }, { "epoch": 1.4238095238095239, "grad_norm": 0.6294459104537964, "learning_rate": 1e-05, "loss": 1.1325, "mean_token_accuracy": 0.6751164197921753, "num_tokens": 177668681.0, "step": 299 }, { "epoch": 1.4285714285714286, "grad_norm": 0.617782711982727, "learning_rate": 1e-05, "loss": 1.1352, "mean_token_accuracy": 0.6748452186584473, "num_tokens": 178256283.0, "step": 300 }, { "epoch": 1.4333333333333333, "grad_norm": 0.6512781977653503, "learning_rate": 1e-05, "loss": 1.1468, "mean_token_accuracy": 0.6721617579460144, "num_tokens": 178850673.0, "step": 301 }, { "epoch": 1.438095238095238, "grad_norm": 0.5774661898612976, "learning_rate": 1e-05, "loss": 1.1246, "mean_token_accuracy": 0.6787533760070801, "num_tokens": 179457871.0, "step": 302 }, { "epoch": 1.4428571428571428, "grad_norm": 0.5992771983146667, "learning_rate": 1e-05, "loss": 1.1548, "mean_token_accuracy": 0.6706414818763733, "num_tokens": 180064071.0, "step": 303 }, { "epoch": 1.4476190476190476, "grad_norm": 0.5943005681037903, "learning_rate": 1e-05, "loss": 1.1106, "mean_token_accuracy": 0.6806790828704834, "num_tokens": 180650796.0, "step": 304 }, { "epoch": 1.4523809523809523, "grad_norm": 0.6455477476119995, "learning_rate": 1e-05, "loss": 1.1409, "mean_token_accuracy": 0.6753484606742859, "num_tokens": 181246825.0, "step": 305 }, { "epoch": 1.457142857142857, "grad_norm": 0.5515779852867126, "learning_rate": 1e-05, "loss": 1.1429, "mean_token_accuracy": 0.6737354397773743, "num_tokens": 181855567.0, "step": 306 }, { "epoch": 1.461904761904762, "grad_norm": 0.6088519096374512, "learning_rate": 1e-05, "loss": 1.1095, "mean_token_accuracy": 0.680343508720398, "num_tokens": 182433911.0, "step": 307 }, { "epoch": 1.4666666666666668, "grad_norm": 0.6310312747955322, "learning_rate": 1e-05, "loss": 1.1307, "mean_token_accuracy": 0.676478385925293, "num_tokens": 183023144.0, "step": 308 }, { "epoch": 1.4714285714285715, "grad_norm": 0.6333861947059631, "learning_rate": 1e-05, "loss": 1.1225, "mean_token_accuracy": 0.6778949499130249, "num_tokens": 183626514.0, "step": 309 }, { "epoch": 1.4761904761904763, "grad_norm": 0.6410499811172485, "learning_rate": 1e-05, "loss": 1.1284, "mean_token_accuracy": 0.6767443418502808, "num_tokens": 184221439.0, "step": 310 }, { "epoch": 1.480952380952381, "grad_norm": 0.6700615882873535, "learning_rate": 1e-05, "loss": 1.134, "mean_token_accuracy": 0.6758592128753662, "num_tokens": 184819506.0, "step": 311 }, { "epoch": 1.4857142857142858, "grad_norm": 0.5785894989967346, "learning_rate": 1e-05, "loss": 1.1338, "mean_token_accuracy": 0.6757279634475708, "num_tokens": 185419019.0, "step": 312 }, { "epoch": 1.4904761904761905, "grad_norm": 0.6253511309623718, "learning_rate": 1e-05, "loss": 1.1212, "mean_token_accuracy": 0.6801990270614624, "num_tokens": 186010772.0, "step": 313 }, { "epoch": 1.4952380952380953, "grad_norm": 0.6034374237060547, "learning_rate": 1e-05, "loss": 1.1178, "mean_token_accuracy": 0.6792829036712646, "num_tokens": 186589243.0, "step": 314 }, { "epoch": 1.5, "grad_norm": 0.6875804662704468, "learning_rate": 1e-05, "loss": 1.1165, "mean_token_accuracy": 0.6799081563949585, "num_tokens": 187182368.0, "step": 315 }, { "epoch": 1.5047619047619047, "grad_norm": 0.5927019119262695, "learning_rate": 1e-05, "loss": 1.1179, "mean_token_accuracy": 0.6792271733283997, "num_tokens": 187763428.0, "step": 316 }, { "epoch": 1.5095238095238095, "grad_norm": 0.5725839734077454, "learning_rate": 1e-05, "loss": 1.1129, "mean_token_accuracy": 0.6808658838272095, "num_tokens": 188359395.0, "step": 317 }, { "epoch": 1.5142857142857142, "grad_norm": 0.6134579181671143, "learning_rate": 1e-05, "loss": 1.1329, "mean_token_accuracy": 0.6752611398696899, "num_tokens": 188952450.0, "step": 318 }, { "epoch": 1.519047619047619, "grad_norm": 0.5980193018913269, "learning_rate": 1e-05, "loss": 1.1282, "mean_token_accuracy": 0.6765316128730774, "num_tokens": 189535853.0, "step": 319 }, { "epoch": 1.5238095238095237, "grad_norm": 0.6418870091438293, "learning_rate": 1e-05, "loss": 1.1113, "mean_token_accuracy": 0.6808905601501465, "num_tokens": 190127386.0, "step": 320 }, { "epoch": 1.5285714285714285, "grad_norm": 0.5932308435440063, "learning_rate": 1e-05, "loss": 1.1282, "mean_token_accuracy": 0.6762252449989319, "num_tokens": 190718877.0, "step": 321 }, { "epoch": 1.5333333333333332, "grad_norm": 0.6508740782737732, "learning_rate": 1e-05, "loss": 1.1504, "mean_token_accuracy": 0.6717185974121094, "num_tokens": 191320553.0, "step": 322 }, { "epoch": 1.538095238095238, "grad_norm": 0.6029355525970459, "learning_rate": 1e-05, "loss": 1.1219, "mean_token_accuracy": 0.6790941953659058, "num_tokens": 191911786.0, "step": 323 }, { "epoch": 1.5428571428571427, "grad_norm": 0.5820804834365845, "learning_rate": 1e-05, "loss": 1.1483, "mean_token_accuracy": 0.6729787588119507, "num_tokens": 192517254.0, "step": 324 }, { "epoch": 1.5476190476190477, "grad_norm": 0.6086446642875671, "learning_rate": 1e-05, "loss": 1.1438, "mean_token_accuracy": 0.6730492115020752, "num_tokens": 193113713.0, "step": 325 }, { "epoch": 1.5523809523809524, "grad_norm": 0.6287596821784973, "learning_rate": 1e-05, "loss": 1.1255, "mean_token_accuracy": 0.6779239177703857, "num_tokens": 193718335.0, "step": 326 }, { "epoch": 1.5571428571428572, "grad_norm": 0.6495358347892761, "learning_rate": 1e-05, "loss": 1.1267, "mean_token_accuracy": 0.6764586567878723, "num_tokens": 194303328.0, "step": 327 }, { "epoch": 1.561904761904762, "grad_norm": 0.6034678816795349, "learning_rate": 1e-05, "loss": 1.1204, "mean_token_accuracy": 0.6789346933364868, "num_tokens": 194886509.0, "step": 328 }, { "epoch": 1.5666666666666667, "grad_norm": 0.6537843346595764, "learning_rate": 1e-05, "loss": 1.1269, "mean_token_accuracy": 0.678215742111206, "num_tokens": 195456896.0, "step": 329 }, { "epoch": 1.5714285714285714, "grad_norm": 0.5981965661048889, "learning_rate": 1e-05, "loss": 1.1237, "mean_token_accuracy": 0.6771047115325928, "num_tokens": 196053871.0, "step": 330 }, { "epoch": 1.5761904761904761, "grad_norm": 0.7181389331817627, "learning_rate": 1e-05, "loss": 1.1236, "mean_token_accuracy": 0.6774399280548096, "num_tokens": 196654732.0, "step": 331 }, { "epoch": 1.580952380952381, "grad_norm": 0.6066569089889526, "learning_rate": 1e-05, "loss": 1.1124, "mean_token_accuracy": 0.6811067461967468, "num_tokens": 197242864.0, "step": 332 }, { "epoch": 1.5857142857142859, "grad_norm": 0.7779151797294617, "learning_rate": 1e-05, "loss": 1.1153, "mean_token_accuracy": 0.6798511743545532, "num_tokens": 197840214.0, "step": 333 }, { "epoch": 1.5904761904761906, "grad_norm": 0.5971040725708008, "learning_rate": 1e-05, "loss": 1.1177, "mean_token_accuracy": 0.6795299649238586, "num_tokens": 198440572.0, "step": 334 }, { "epoch": 1.5952380952380953, "grad_norm": 0.6526306867599487, "learning_rate": 1e-05, "loss": 1.1134, "mean_token_accuracy": 0.6805366277694702, "num_tokens": 199039184.0, "step": 335 }, { "epoch": 1.6, "grad_norm": 0.622909426689148, "learning_rate": 1e-05, "loss": 1.1139, "mean_token_accuracy": 0.6792494058609009, "num_tokens": 199626548.0, "step": 336 }, { "epoch": 1.6047619047619048, "grad_norm": 0.6684408187866211, "learning_rate": 1e-05, "loss": 1.128, "mean_token_accuracy": 0.6774076819419861, "num_tokens": 200222258.0, "step": 337 }, { "epoch": 1.6095238095238096, "grad_norm": 0.5934977531433105, "learning_rate": 1e-05, "loss": 1.1203, "mean_token_accuracy": 0.6792654991149902, "num_tokens": 200819172.0, "step": 338 }, { "epoch": 1.6142857142857143, "grad_norm": 0.6164219975471497, "learning_rate": 1e-05, "loss": 1.1314, "mean_token_accuracy": 0.6759560704231262, "num_tokens": 201413549.0, "step": 339 }, { "epoch": 1.619047619047619, "grad_norm": 0.6061872839927673, "learning_rate": 1e-05, "loss": 1.1162, "mean_token_accuracy": 0.6795899868011475, "num_tokens": 202014069.0, "step": 340 }, { "epoch": 1.6238095238095238, "grad_norm": 0.6192796230316162, "learning_rate": 1e-05, "loss": 1.1476, "mean_token_accuracy": 0.6721718311309814, "num_tokens": 202600379.0, "step": 341 }, { "epoch": 1.6285714285714286, "grad_norm": 0.6233608722686768, "learning_rate": 1e-05, "loss": 1.1226, "mean_token_accuracy": 0.6779032945632935, "num_tokens": 203203233.0, "step": 342 }, { "epoch": 1.6333333333333333, "grad_norm": 0.5831724405288696, "learning_rate": 1e-05, "loss": 1.1159, "mean_token_accuracy": 0.6793074607849121, "num_tokens": 203802554.0, "step": 343 }, { "epoch": 1.638095238095238, "grad_norm": 0.6623408794403076, "learning_rate": 1e-05, "loss": 1.1296, "mean_token_accuracy": 0.677479088306427, "num_tokens": 204395560.0, "step": 344 }, { "epoch": 1.6428571428571428, "grad_norm": 0.5827105045318604, "learning_rate": 1e-05, "loss": 1.113, "mean_token_accuracy": 0.6808111071586609, "num_tokens": 205001404.0, "step": 345 }, { "epoch": 1.6476190476190475, "grad_norm": 0.5602775812149048, "learning_rate": 1e-05, "loss": 1.1066, "mean_token_accuracy": 0.6823267936706543, "num_tokens": 205599855.0, "step": 346 }, { "epoch": 1.6523809523809523, "grad_norm": 0.6435489654541016, "learning_rate": 1e-05, "loss": 1.1124, "mean_token_accuracy": 0.6803141832351685, "num_tokens": 206163338.0, "step": 347 }, { "epoch": 1.657142857142857, "grad_norm": 0.5933458209037781, "learning_rate": 1e-05, "loss": 1.137, "mean_token_accuracy": 0.6748683452606201, "num_tokens": 206741397.0, "step": 348 }, { "epoch": 1.6619047619047618, "grad_norm": 0.5775367021560669, "learning_rate": 1e-05, "loss": 1.1298, "mean_token_accuracy": 0.6758445501327515, "num_tokens": 207323297.0, "step": 349 }, { "epoch": 1.6666666666666665, "grad_norm": 0.5773342251777649, "learning_rate": 1e-05, "loss": 1.1294, "mean_token_accuracy": 0.6764418482780457, "num_tokens": 207908589.0, "step": 350 }, { "epoch": 1.6714285714285713, "grad_norm": 0.6353156566619873, "learning_rate": 1e-05, "loss": 1.1403, "mean_token_accuracy": 0.6732203960418701, "num_tokens": 208500281.0, "step": 351 }, { "epoch": 1.6761904761904762, "grad_norm": 0.5841516852378845, "learning_rate": 1e-05, "loss": 1.1201, "mean_token_accuracy": 0.6789692640304565, "num_tokens": 209097427.0, "step": 352 }, { "epoch": 1.680952380952381, "grad_norm": 0.5935720205307007, "learning_rate": 1e-05, "loss": 1.1217, "mean_token_accuracy": 0.6778074502944946, "num_tokens": 209704225.0, "step": 353 }, { "epoch": 1.6857142857142857, "grad_norm": 0.6088152527809143, "learning_rate": 1e-05, "loss": 1.1177, "mean_token_accuracy": 0.6796123385429382, "num_tokens": 210313267.0, "step": 354 }, { "epoch": 1.6904761904761905, "grad_norm": 0.5818439722061157, "learning_rate": 1e-05, "loss": 1.1273, "mean_token_accuracy": 0.6770058870315552, "num_tokens": 210918588.0, "step": 355 }, { "epoch": 1.6952380952380952, "grad_norm": 0.6217803955078125, "learning_rate": 1e-05, "loss": 1.1268, "mean_token_accuracy": 0.6772897243499756, "num_tokens": 211508221.0, "step": 356 }, { "epoch": 1.7, "grad_norm": 0.5793229937553406, "learning_rate": 1e-05, "loss": 1.1359, "mean_token_accuracy": 0.6748672127723694, "num_tokens": 212108728.0, "step": 357 }, { "epoch": 1.704761904761905, "grad_norm": 0.5839233994483948, "learning_rate": 1e-05, "loss": 1.1226, "mean_token_accuracy": 0.6776269674301147, "num_tokens": 212705437.0, "step": 358 }, { "epoch": 1.7095238095238097, "grad_norm": 0.6158073544502258, "learning_rate": 1e-05, "loss": 1.1324, "mean_token_accuracy": 0.6745504140853882, "num_tokens": 213300176.0, "step": 359 }, { "epoch": 1.7142857142857144, "grad_norm": 0.6093515753746033, "learning_rate": 1e-05, "loss": 1.132, "mean_token_accuracy": 0.6751940250396729, "num_tokens": 213890731.0, "step": 360 }, { "epoch": 1.7190476190476192, "grad_norm": 0.629436194896698, "learning_rate": 1e-05, "loss": 1.1147, "mean_token_accuracy": 0.6785677075386047, "num_tokens": 214471137.0, "step": 361 }, { "epoch": 1.723809523809524, "grad_norm": 0.6373199820518494, "learning_rate": 1e-05, "loss": 1.1169, "mean_token_accuracy": 0.6792606115341187, "num_tokens": 215062165.0, "step": 362 }, { "epoch": 1.7285714285714286, "grad_norm": 0.5850217938423157, "learning_rate": 1e-05, "loss": 1.1264, "mean_token_accuracy": 0.6766684055328369, "num_tokens": 215662977.0, "step": 363 }, { "epoch": 1.7333333333333334, "grad_norm": 0.676506757736206, "learning_rate": 1e-05, "loss": 1.1328, "mean_token_accuracy": 0.6750730276107788, "num_tokens": 216253942.0, "step": 364 }, { "epoch": 1.7380952380952381, "grad_norm": 0.5996358394622803, "learning_rate": 1e-05, "loss": 1.1234, "mean_token_accuracy": 0.6771166920661926, "num_tokens": 216847247.0, "step": 365 }, { "epoch": 1.7428571428571429, "grad_norm": 0.604375422000885, "learning_rate": 1e-05, "loss": 1.1111, "mean_token_accuracy": 0.6807925701141357, "num_tokens": 217427979.0, "step": 366 }, { "epoch": 1.7476190476190476, "grad_norm": 0.6484256386756897, "learning_rate": 1e-05, "loss": 1.1149, "mean_token_accuracy": 0.67896568775177, "num_tokens": 218020622.0, "step": 367 }, { "epoch": 1.7523809523809524, "grad_norm": 0.5445154905319214, "learning_rate": 1e-05, "loss": 1.1238, "mean_token_accuracy": 0.677640438079834, "num_tokens": 218613768.0, "step": 368 }, { "epoch": 1.7571428571428571, "grad_norm": 0.5835940837860107, "learning_rate": 1e-05, "loss": 1.1352, "mean_token_accuracy": 0.6746830940246582, "num_tokens": 219217863.0, "step": 369 }, { "epoch": 1.7619047619047619, "grad_norm": 0.6108807325363159, "learning_rate": 1e-05, "loss": 1.1245, "mean_token_accuracy": 0.6771240234375, "num_tokens": 219826128.0, "step": 370 }, { "epoch": 1.7666666666666666, "grad_norm": 0.5301618576049805, "learning_rate": 1e-05, "loss": 1.1193, "mean_token_accuracy": 0.6791725158691406, "num_tokens": 220424737.0, "step": 371 }, { "epoch": 1.7714285714285714, "grad_norm": 0.567722737789154, "learning_rate": 1e-05, "loss": 1.1241, "mean_token_accuracy": 0.6771541833877563, "num_tokens": 221010073.0, "step": 372 }, { "epoch": 1.776190476190476, "grad_norm": 0.6946297883987427, "learning_rate": 1e-05, "loss": 1.1205, "mean_token_accuracy": 0.678805410861969, "num_tokens": 221614799.0, "step": 373 }, { "epoch": 1.7809523809523808, "grad_norm": 0.5566631555557251, "learning_rate": 1e-05, "loss": 1.1185, "mean_token_accuracy": 0.6782611012458801, "num_tokens": 222215943.0, "step": 374 }, { "epoch": 1.7857142857142856, "grad_norm": 0.5999249219894409, "learning_rate": 1e-05, "loss": 1.114, "mean_token_accuracy": 0.6802798509597778, "num_tokens": 222803822.0, "step": 375 }, { "epoch": 1.7904761904761903, "grad_norm": 0.5825783014297485, "learning_rate": 1e-05, "loss": 1.1314, "mean_token_accuracy": 0.6754652261734009, "num_tokens": 223409541.0, "step": 376 }, { "epoch": 1.795238095238095, "grad_norm": 0.5893160700798035, "learning_rate": 1e-05, "loss": 1.1183, "mean_token_accuracy": 0.6782077550888062, "num_tokens": 223996446.0, "step": 377 }, { "epoch": 1.8, "grad_norm": 0.5960800051689148, "learning_rate": 1e-05, "loss": 1.1203, "mean_token_accuracy": 0.678328275680542, "num_tokens": 224599074.0, "step": 378 }, { "epoch": 1.8047619047619048, "grad_norm": 0.5972325205802917, "learning_rate": 1e-05, "loss": 1.1122, "mean_token_accuracy": 0.6802579760551453, "num_tokens": 225184557.0, "step": 379 }, { "epoch": 1.8095238095238095, "grad_norm": 0.597683310508728, "learning_rate": 1e-05, "loss": 1.1185, "mean_token_accuracy": 0.6798061728477478, "num_tokens": 225774197.0, "step": 380 }, { "epoch": 1.8142857142857143, "grad_norm": 0.575453519821167, "learning_rate": 1e-05, "loss": 1.1072, "mean_token_accuracy": 0.6810543537139893, "num_tokens": 226359063.0, "step": 381 }, { "epoch": 1.819047619047619, "grad_norm": 0.5560538172721863, "learning_rate": 1e-05, "loss": 1.1237, "mean_token_accuracy": 0.6774187088012695, "num_tokens": 226962202.0, "step": 382 }, { "epoch": 1.8238095238095238, "grad_norm": 0.6427722573280334, "learning_rate": 1e-05, "loss": 1.1218, "mean_token_accuracy": 0.677949070930481, "num_tokens": 227541002.0, "step": 383 }, { "epoch": 1.8285714285714287, "grad_norm": 0.6143935322761536, "learning_rate": 1e-05, "loss": 1.1221, "mean_token_accuracy": 0.6778963804244995, "num_tokens": 228134124.0, "step": 384 }, { "epoch": 1.8333333333333335, "grad_norm": 0.6365751624107361, "learning_rate": 1e-05, "loss": 1.112, "mean_token_accuracy": 0.6797761917114258, "num_tokens": 228729717.0, "step": 385 }, { "epoch": 1.8380952380952382, "grad_norm": 0.719041109085083, "learning_rate": 1e-05, "loss": 1.1178, "mean_token_accuracy": 0.6780564785003662, "num_tokens": 229318931.0, "step": 386 }, { "epoch": 1.842857142857143, "grad_norm": 0.6031278967857361, "learning_rate": 1e-05, "loss": 1.1246, "mean_token_accuracy": 0.6776800155639648, "num_tokens": 229923675.0, "step": 387 }, { "epoch": 1.8476190476190477, "grad_norm": 0.6627750396728516, "learning_rate": 1e-05, "loss": 1.1149, "mean_token_accuracy": 0.6797564029693604, "num_tokens": 230514254.0, "step": 388 }, { "epoch": 1.8523809523809525, "grad_norm": 0.576654314994812, "learning_rate": 1e-05, "loss": 1.1228, "mean_token_accuracy": 0.6780418157577515, "num_tokens": 231113801.0, "step": 389 }, { "epoch": 1.8571428571428572, "grad_norm": 0.6316273212432861, "learning_rate": 1e-05, "loss": 1.1119, "mean_token_accuracy": 0.6792047023773193, "num_tokens": 231709098.0, "step": 390 }, { "epoch": 1.861904761904762, "grad_norm": 0.5546997785568237, "learning_rate": 1e-05, "loss": 1.1247, "mean_token_accuracy": 0.6769775748252869, "num_tokens": 232311276.0, "step": 391 }, { "epoch": 1.8666666666666667, "grad_norm": 0.617088794708252, "learning_rate": 1e-05, "loss": 1.113, "mean_token_accuracy": 0.6795423030853271, "num_tokens": 232904607.0, "step": 392 }, { "epoch": 1.8714285714285714, "grad_norm": 0.611702561378479, "learning_rate": 1e-05, "loss": 1.1057, "mean_token_accuracy": 0.6821488738059998, "num_tokens": 233493254.0, "step": 393 }, { "epoch": 1.8761904761904762, "grad_norm": 0.6276193261146545, "learning_rate": 1e-05, "loss": 1.1154, "mean_token_accuracy": 0.6800172328948975, "num_tokens": 234085431.0, "step": 394 }, { "epoch": 1.880952380952381, "grad_norm": 0.6570289731025696, "learning_rate": 1e-05, "loss": 1.1245, "mean_token_accuracy": 0.676892876625061, "num_tokens": 234685396.0, "step": 395 }, { "epoch": 1.8857142857142857, "grad_norm": 0.6350821256637573, "learning_rate": 1e-05, "loss": 1.1253, "mean_token_accuracy": 0.6770513653755188, "num_tokens": 235280049.0, "step": 396 }, { "epoch": 1.8904761904761904, "grad_norm": 0.6419028639793396, "learning_rate": 1e-05, "loss": 1.1258, "mean_token_accuracy": 0.6772979497909546, "num_tokens": 235867276.0, "step": 397 }, { "epoch": 1.8952380952380952, "grad_norm": 0.6098426580429077, "learning_rate": 1e-05, "loss": 1.1126, "mean_token_accuracy": 0.6804271936416626, "num_tokens": 236448647.0, "step": 398 }, { "epoch": 1.9, "grad_norm": 0.5854616165161133, "learning_rate": 1e-05, "loss": 1.1256, "mean_token_accuracy": 0.6755622625350952, "num_tokens": 237054180.0, "step": 399 }, { "epoch": 1.9047619047619047, "grad_norm": 0.6416271328926086, "learning_rate": 1e-05, "loss": 1.1394, "mean_token_accuracy": 0.6737023591995239, "num_tokens": 237658218.0, "step": 400 }, { "epoch": 1.9095238095238094, "grad_norm": 0.5833379626274109, "learning_rate": 1e-05, "loss": 1.1223, "mean_token_accuracy": 0.6782907247543335, "num_tokens": 238248393.0, "step": 401 }, { "epoch": 1.9142857142857141, "grad_norm": 0.6798136830329895, "learning_rate": 1e-05, "loss": 1.1096, "mean_token_accuracy": 0.6816190481185913, "num_tokens": 238838548.0, "step": 402 }, { "epoch": 1.919047619047619, "grad_norm": 0.5994821786880493, "learning_rate": 1e-05, "loss": 1.1154, "mean_token_accuracy": 0.6799057722091675, "num_tokens": 239442502.0, "step": 403 }, { "epoch": 1.9238095238095239, "grad_norm": 0.6224843263626099, "learning_rate": 1e-05, "loss": 1.1273, "mean_token_accuracy": 0.6760965585708618, "num_tokens": 240029019.0, "step": 404 }, { "epoch": 1.9285714285714286, "grad_norm": 0.6100861430168152, "learning_rate": 1e-05, "loss": 1.1134, "mean_token_accuracy": 0.6803538799285889, "num_tokens": 240623504.0, "step": 405 }, { "epoch": 1.9333333333333333, "grad_norm": 0.6026962399482727, "learning_rate": 1e-05, "loss": 1.1022, "mean_token_accuracy": 0.6817559599876404, "num_tokens": 241217102.0, "step": 406 }, { "epoch": 1.938095238095238, "grad_norm": 0.6529442667961121, "learning_rate": 1e-05, "loss": 1.1141, "mean_token_accuracy": 0.6804797649383545, "num_tokens": 241812222.0, "step": 407 }, { "epoch": 1.9428571428571428, "grad_norm": 0.6519430875778198, "learning_rate": 1e-05, "loss": 1.1085, "mean_token_accuracy": 0.680460512638092, "num_tokens": 242388669.0, "step": 408 }, { "epoch": 1.9476190476190476, "grad_norm": 0.7020300626754761, "learning_rate": 1e-05, "loss": 1.1111, "mean_token_accuracy": 0.6802721619606018, "num_tokens": 242965686.0, "step": 409 }, { "epoch": 1.9523809523809523, "grad_norm": 0.6024628281593323, "learning_rate": 1e-05, "loss": 1.1183, "mean_token_accuracy": 0.6792590022087097, "num_tokens": 243553853.0, "step": 410 }, { "epoch": 1.9571428571428573, "grad_norm": 0.7494162321090698, "learning_rate": 1e-05, "loss": 1.1135, "mean_token_accuracy": 0.6788877844810486, "num_tokens": 244139754.0, "step": 411 }, { "epoch": 1.961904761904762, "grad_norm": 0.6602755188941956, "learning_rate": 1e-05, "loss": 1.1176, "mean_token_accuracy": 0.6777335405349731, "num_tokens": 244736423.0, "step": 412 }, { "epoch": 1.9666666666666668, "grad_norm": 0.7016980051994324, "learning_rate": 1e-05, "loss": 1.1186, "mean_token_accuracy": 0.6798810362815857, "num_tokens": 245337682.0, "step": 413 }, { "epoch": 1.9714285714285715, "grad_norm": 0.6483145356178284, "learning_rate": 1e-05, "loss": 1.1172, "mean_token_accuracy": 0.67914879322052, "num_tokens": 245952105.0, "step": 414 }, { "epoch": 1.9761904761904763, "grad_norm": 0.678092896938324, "learning_rate": 1e-05, "loss": 1.1039, "mean_token_accuracy": 0.6819472312927246, "num_tokens": 246540999.0, "step": 415 }, { "epoch": 1.980952380952381, "grad_norm": 0.7507527470588684, "learning_rate": 1e-05, "loss": 1.1103, "mean_token_accuracy": 0.6814316511154175, "num_tokens": 247142303.0, "step": 416 }, { "epoch": 1.9857142857142858, "grad_norm": 0.625765323638916, "learning_rate": 1e-05, "loss": 1.108, "mean_token_accuracy": 0.6812607049942017, "num_tokens": 247732988.0, "step": 417 }, { "epoch": 1.9904761904761905, "grad_norm": 0.6421918869018555, "learning_rate": 1e-05, "loss": 1.1022, "mean_token_accuracy": 0.681933581829071, "num_tokens": 248334744.0, "step": 418 }, { "epoch": 1.9952380952380953, "grad_norm": 0.6160528659820557, "learning_rate": 1e-05, "loss": 1.1133, "mean_token_accuracy": 0.6797480583190918, "num_tokens": 248930347.0, "step": 419 }, { "epoch": 2.0, "grad_norm": 0.703513503074646, "learning_rate": 1e-05, "loss": 1.129, "mean_token_accuracy": 0.676598846912384, "num_tokens": 249522093.0, "step": 420 }, { "epoch": 2.0047619047619047, "grad_norm": 0.7784668207168579, "learning_rate": 1e-05, "loss": 1.0772, "mean_token_accuracy": 0.687883734703064, "num_tokens": 250112163.0, "step": 421 }, { "epoch": 2.0095238095238095, "grad_norm": 0.7685954570770264, "learning_rate": 1e-05, "loss": 1.065, "mean_token_accuracy": 0.6909651160240173, "num_tokens": 250690466.0, "step": 422 }, { "epoch": 2.0142857142857142, "grad_norm": 0.5822970867156982, "learning_rate": 1e-05, "loss": 1.0678, "mean_token_accuracy": 0.68952476978302, "num_tokens": 251279220.0, "step": 423 }, { "epoch": 2.019047619047619, "grad_norm": 0.8003807663917542, "learning_rate": 1e-05, "loss": 1.071, "mean_token_accuracy": 0.6891335248947144, "num_tokens": 251871717.0, "step": 424 }, { "epoch": 2.0238095238095237, "grad_norm": 0.6656951904296875, "learning_rate": 1e-05, "loss": 1.0907, "mean_token_accuracy": 0.6843781471252441, "num_tokens": 252474129.0, "step": 425 }, { "epoch": 2.0285714285714285, "grad_norm": 0.662339448928833, "learning_rate": 1e-05, "loss": 1.0518, "mean_token_accuracy": 0.693079948425293, "num_tokens": 253069065.0, "step": 426 }, { "epoch": 2.033333333333333, "grad_norm": 0.6397184729576111, "learning_rate": 1e-05, "loss": 1.079, "mean_token_accuracy": 0.6863641738891602, "num_tokens": 253654392.0, "step": 427 }, { "epoch": 2.038095238095238, "grad_norm": 0.6415942907333374, "learning_rate": 1e-05, "loss": 1.0688, "mean_token_accuracy": 0.6888935565948486, "num_tokens": 254245168.0, "step": 428 }, { "epoch": 2.0428571428571427, "grad_norm": 0.6560488939285278, "learning_rate": 1e-05, "loss": 1.0824, "mean_token_accuracy": 0.685175895690918, "num_tokens": 254841132.0, "step": 429 }, { "epoch": 2.0476190476190474, "grad_norm": 0.5839130878448486, "learning_rate": 1e-05, "loss": 1.0676, "mean_token_accuracy": 0.6896021366119385, "num_tokens": 255433793.0, "step": 430 }, { "epoch": 2.052380952380952, "grad_norm": 0.7360151410102844, "learning_rate": 1e-05, "loss": 1.0658, "mean_token_accuracy": 0.6903011798858643, "num_tokens": 256015225.0, "step": 431 }, { "epoch": 2.057142857142857, "grad_norm": 0.633699893951416, "learning_rate": 1e-05, "loss": 1.0638, "mean_token_accuracy": 0.6905348300933838, "num_tokens": 256617333.0, "step": 432 }, { "epoch": 2.0619047619047617, "grad_norm": 0.6784190535545349, "learning_rate": 1e-05, "loss": 1.0913, "mean_token_accuracy": 0.6846885681152344, "num_tokens": 257214120.0, "step": 433 }, { "epoch": 2.066666666666667, "grad_norm": 0.6749794483184814, "learning_rate": 1e-05, "loss": 1.0707, "mean_token_accuracy": 0.6884802579879761, "num_tokens": 257807506.0, "step": 434 }, { "epoch": 2.0714285714285716, "grad_norm": 0.6474000811576843, "learning_rate": 1e-05, "loss": 1.0873, "mean_token_accuracy": 0.6844640970230103, "num_tokens": 258414225.0, "step": 435 }, { "epoch": 2.0761904761904764, "grad_norm": 0.6300811171531677, "learning_rate": 1e-05, "loss": 1.0684, "mean_token_accuracy": 0.688113808631897, "num_tokens": 259009305.0, "step": 436 }, { "epoch": 2.080952380952381, "grad_norm": 0.6160655617713928, "learning_rate": 1e-05, "loss": 1.0626, "mean_token_accuracy": 0.690530002117157, "num_tokens": 259600283.0, "step": 437 }, { "epoch": 2.085714285714286, "grad_norm": 0.5936851501464844, "learning_rate": 1e-05, "loss": 1.0876, "mean_token_accuracy": 0.6846874952316284, "num_tokens": 260203907.0, "step": 438 }, { "epoch": 2.0904761904761906, "grad_norm": 0.6563723683357239, "learning_rate": 1e-05, "loss": 1.0823, "mean_token_accuracy": 0.6850008964538574, "num_tokens": 260795356.0, "step": 439 }, { "epoch": 2.0952380952380953, "grad_norm": 0.6244327425956726, "learning_rate": 1e-05, "loss": 1.0595, "mean_token_accuracy": 0.6918923854827881, "num_tokens": 261376414.0, "step": 440 }, { "epoch": 2.1, "grad_norm": 0.6768208146095276, "learning_rate": 1e-05, "loss": 1.0854, "mean_token_accuracy": 0.6857748031616211, "num_tokens": 261965206.0, "step": 441 }, { "epoch": 2.104761904761905, "grad_norm": 0.6261032819747925, "learning_rate": 1e-05, "loss": 1.0792, "mean_token_accuracy": 0.6868791580200195, "num_tokens": 262561371.0, "step": 442 }, { "epoch": 2.1095238095238096, "grad_norm": 0.6388991475105286, "learning_rate": 1e-05, "loss": 1.068, "mean_token_accuracy": 0.689771294593811, "num_tokens": 263159935.0, "step": 443 }, { "epoch": 2.1142857142857143, "grad_norm": 0.6453383564949036, "learning_rate": 1e-05, "loss": 1.0803, "mean_token_accuracy": 0.6851140260696411, "num_tokens": 263754299.0, "step": 444 }, { "epoch": 2.119047619047619, "grad_norm": 0.6248214244842529, "learning_rate": 1e-05, "loss": 1.0792, "mean_token_accuracy": 0.686503529548645, "num_tokens": 264354630.0, "step": 445 }, { "epoch": 2.123809523809524, "grad_norm": 0.6909031271934509, "learning_rate": 1e-05, "loss": 1.0995, "mean_token_accuracy": 0.681476891040802, "num_tokens": 264964353.0, "step": 446 }, { "epoch": 2.1285714285714286, "grad_norm": 0.6381927132606506, "learning_rate": 1e-05, "loss": 1.0816, "mean_token_accuracy": 0.6861193180084229, "num_tokens": 265561411.0, "step": 447 }, { "epoch": 2.1333333333333333, "grad_norm": 0.669456958770752, "learning_rate": 1e-05, "loss": 1.073, "mean_token_accuracy": 0.6876237392425537, "num_tokens": 266155790.0, "step": 448 }, { "epoch": 2.138095238095238, "grad_norm": 0.6266065239906311, "learning_rate": 1e-05, "loss": 1.0788, "mean_token_accuracy": 0.6870714426040649, "num_tokens": 266757269.0, "step": 449 }, { "epoch": 2.142857142857143, "grad_norm": 0.6428273916244507, "learning_rate": 1e-05, "loss": 1.0942, "mean_token_accuracy": 0.6831685304641724, "num_tokens": 267369143.0, "step": 450 }, { "epoch": 2.1476190476190475, "grad_norm": 0.6169470548629761, "learning_rate": 1e-05, "loss": 1.0619, "mean_token_accuracy": 0.6913155317306519, "num_tokens": 267965437.0, "step": 451 }, { "epoch": 2.1523809523809523, "grad_norm": 0.6351789832115173, "learning_rate": 1e-05, "loss": 1.0713, "mean_token_accuracy": 0.6888561248779297, "num_tokens": 268571463.0, "step": 452 }, { "epoch": 2.157142857142857, "grad_norm": 0.6532635688781738, "learning_rate": 1e-05, "loss": 1.0698, "mean_token_accuracy": 0.6889727115631104, "num_tokens": 269157041.0, "step": 453 }, { "epoch": 2.1619047619047618, "grad_norm": 0.5989878177642822, "learning_rate": 1e-05, "loss": 1.0682, "mean_token_accuracy": 0.6890783309936523, "num_tokens": 269758195.0, "step": 454 }, { "epoch": 2.1666666666666665, "grad_norm": 0.6337672472000122, "learning_rate": 1e-05, "loss": 1.0969, "mean_token_accuracy": 0.6822439432144165, "num_tokens": 270349613.0, "step": 455 }, { "epoch": 2.1714285714285713, "grad_norm": 0.5972429513931274, "learning_rate": 1e-05, "loss": 1.0831, "mean_token_accuracy": 0.6840848326683044, "num_tokens": 270947174.0, "step": 456 }, { "epoch": 2.176190476190476, "grad_norm": 0.6298529505729675, "learning_rate": 1e-05, "loss": 1.0921, "mean_token_accuracy": 0.6836713552474976, "num_tokens": 271549889.0, "step": 457 }, { "epoch": 2.1809523809523808, "grad_norm": 0.574796199798584, "learning_rate": 1e-05, "loss": 1.064, "mean_token_accuracy": 0.6906387805938721, "num_tokens": 272139782.0, "step": 458 }, { "epoch": 2.185714285714286, "grad_norm": 0.6812316179275513, "learning_rate": 1e-05, "loss": 1.0762, "mean_token_accuracy": 0.687111496925354, "num_tokens": 272746279.0, "step": 459 }, { "epoch": 2.1904761904761907, "grad_norm": 0.5981315970420837, "learning_rate": 1e-05, "loss": 1.0626, "mean_token_accuracy": 0.6907453536987305, "num_tokens": 273348449.0, "step": 460 }, { "epoch": 2.1952380952380954, "grad_norm": 0.6438897252082825, "learning_rate": 1e-05, "loss": 1.0853, "mean_token_accuracy": 0.6858918070793152, "num_tokens": 273949928.0, "step": 461 }, { "epoch": 2.2, "grad_norm": 0.6236709952354431, "learning_rate": 1e-05, "loss": 1.0733, "mean_token_accuracy": 0.6877059936523438, "num_tokens": 274548070.0, "step": 462 }, { "epoch": 2.204761904761905, "grad_norm": 0.6749060153961182, "learning_rate": 1e-05, "loss": 1.0758, "mean_token_accuracy": 0.6867290735244751, "num_tokens": 275135656.0, "step": 463 }, { "epoch": 2.2095238095238097, "grad_norm": 0.6628844738006592, "learning_rate": 1e-05, "loss": 1.0765, "mean_token_accuracy": 0.6874538660049438, "num_tokens": 275740663.0, "step": 464 }, { "epoch": 2.2142857142857144, "grad_norm": 0.5728548169136047, "learning_rate": 1e-05, "loss": 1.0754, "mean_token_accuracy": 0.6882718205451965, "num_tokens": 276346207.0, "step": 465 }, { "epoch": 2.219047619047619, "grad_norm": 0.6232889294624329, "learning_rate": 1e-05, "loss": 1.0752, "mean_token_accuracy": 0.6872685551643372, "num_tokens": 276940208.0, "step": 466 }, { "epoch": 2.223809523809524, "grad_norm": 0.6447910070419312, "learning_rate": 1e-05, "loss": 1.091, "mean_token_accuracy": 0.6836293339729309, "num_tokens": 277539762.0, "step": 467 }, { "epoch": 2.2285714285714286, "grad_norm": 0.6113771796226501, "learning_rate": 1e-05, "loss": 1.0865, "mean_token_accuracy": 0.684094250202179, "num_tokens": 278136526.0, "step": 468 }, { "epoch": 2.2333333333333334, "grad_norm": 0.6344524025917053, "learning_rate": 1e-05, "loss": 1.0772, "mean_token_accuracy": 0.6870338320732117, "num_tokens": 278723575.0, "step": 469 }, { "epoch": 2.238095238095238, "grad_norm": 0.6180852055549622, "learning_rate": 1e-05, "loss": 1.0544, "mean_token_accuracy": 0.6927859783172607, "num_tokens": 279313692.0, "step": 470 }, { "epoch": 2.242857142857143, "grad_norm": 0.6375457644462585, "learning_rate": 1e-05, "loss": 1.0869, "mean_token_accuracy": 0.6847492456436157, "num_tokens": 279911596.0, "step": 471 }, { "epoch": 2.2476190476190476, "grad_norm": 0.6032583117485046, "learning_rate": 1e-05, "loss": 1.0701, "mean_token_accuracy": 0.6893506050109863, "num_tokens": 280516626.0, "step": 472 }, { "epoch": 2.2523809523809524, "grad_norm": 0.6571868062019348, "learning_rate": 1e-05, "loss": 1.0723, "mean_token_accuracy": 0.6889193654060364, "num_tokens": 281109826.0, "step": 473 }, { "epoch": 2.257142857142857, "grad_norm": 0.5816087126731873, "learning_rate": 1e-05, "loss": 1.0783, "mean_token_accuracy": 0.6873452663421631, "num_tokens": 281705908.0, "step": 474 }, { "epoch": 2.261904761904762, "grad_norm": 0.6110855340957642, "learning_rate": 1e-05, "loss": 1.0733, "mean_token_accuracy": 0.6875293850898743, "num_tokens": 282295646.0, "step": 475 }, { "epoch": 2.2666666666666666, "grad_norm": 0.5722987055778503, "learning_rate": 1e-05, "loss": 1.064, "mean_token_accuracy": 0.6898794174194336, "num_tokens": 282882984.0, "step": 476 }, { "epoch": 2.2714285714285714, "grad_norm": 0.5756980776786804, "learning_rate": 1e-05, "loss": 1.0705, "mean_token_accuracy": 0.6888871192932129, "num_tokens": 283470314.0, "step": 477 }, { "epoch": 2.276190476190476, "grad_norm": 0.6090242862701416, "learning_rate": 1e-05, "loss": 1.0729, "mean_token_accuracy": 0.6876958012580872, "num_tokens": 284064822.0, "step": 478 }, { "epoch": 2.280952380952381, "grad_norm": 0.551956295967102, "learning_rate": 1e-05, "loss": 1.0666, "mean_token_accuracy": 0.6899924278259277, "num_tokens": 284659143.0, "step": 479 }, { "epoch": 2.2857142857142856, "grad_norm": 0.617386519908905, "learning_rate": 1e-05, "loss": 1.0789, "mean_token_accuracy": 0.6873286366462708, "num_tokens": 285260603.0, "step": 480 }, { "epoch": 2.2904761904761903, "grad_norm": 0.5895305871963501, "learning_rate": 1e-05, "loss": 1.0668, "mean_token_accuracy": 0.6887931823730469, "num_tokens": 285858617.0, "step": 481 }, { "epoch": 2.295238095238095, "grad_norm": 0.575018584728241, "learning_rate": 1e-05, "loss": 1.0733, "mean_token_accuracy": 0.6886229515075684, "num_tokens": 286462909.0, "step": 482 }, { "epoch": 2.3, "grad_norm": 0.680483341217041, "learning_rate": 1e-05, "loss": 1.0686, "mean_token_accuracy": 0.6894232034683228, "num_tokens": 287057508.0, "step": 483 }, { "epoch": 2.3047619047619046, "grad_norm": 0.6086472868919373, "learning_rate": 1e-05, "loss": 1.0784, "mean_token_accuracy": 0.6863738298416138, "num_tokens": 287647864.0, "step": 484 }, { "epoch": 2.3095238095238093, "grad_norm": 0.6269891858100891, "learning_rate": 1e-05, "loss": 1.0803, "mean_token_accuracy": 0.6864203810691833, "num_tokens": 288244654.0, "step": 485 }, { "epoch": 2.314285714285714, "grad_norm": 0.6842952370643616, "learning_rate": 1e-05, "loss": 1.0897, "mean_token_accuracy": 0.684012770652771, "num_tokens": 288833805.0, "step": 486 }, { "epoch": 2.319047619047619, "grad_norm": 0.5772620439529419, "learning_rate": 1e-05, "loss": 1.0728, "mean_token_accuracy": 0.6879225969314575, "num_tokens": 289430249.0, "step": 487 }, { "epoch": 2.323809523809524, "grad_norm": 0.6799498796463013, "learning_rate": 1e-05, "loss": 1.0737, "mean_token_accuracy": 0.6892322897911072, "num_tokens": 290017640.0, "step": 488 }, { "epoch": 2.3285714285714287, "grad_norm": 0.63170325756073, "learning_rate": 1e-05, "loss": 1.0694, "mean_token_accuracy": 0.6884621381759644, "num_tokens": 290598414.0, "step": 489 }, { "epoch": 2.3333333333333335, "grad_norm": 0.6786331534385681, "learning_rate": 1e-05, "loss": 1.061, "mean_token_accuracy": 0.6906882524490356, "num_tokens": 291181812.0, "step": 490 }, { "epoch": 2.3380952380952382, "grad_norm": 0.6489508748054504, "learning_rate": 1e-05, "loss": 1.0747, "mean_token_accuracy": 0.6877006888389587, "num_tokens": 291764317.0, "step": 491 }, { "epoch": 2.342857142857143, "grad_norm": 0.6271830797195435, "learning_rate": 1e-05, "loss": 1.0809, "mean_token_accuracy": 0.6854414343833923, "num_tokens": 292350504.0, "step": 492 }, { "epoch": 2.3476190476190477, "grad_norm": 0.6458184123039246, "learning_rate": 1e-05, "loss": 1.0777, "mean_token_accuracy": 0.6864031553268433, "num_tokens": 292951776.0, "step": 493 }, { "epoch": 2.3523809523809525, "grad_norm": 0.6648980379104614, "learning_rate": 1e-05, "loss": 1.0695, "mean_token_accuracy": 0.6887059211730957, "num_tokens": 293532488.0, "step": 494 }, { "epoch": 2.357142857142857, "grad_norm": 0.6425085067749023, "learning_rate": 1e-05, "loss": 1.0575, "mean_token_accuracy": 0.6918837428092957, "num_tokens": 294121146.0, "step": 495 }, { "epoch": 2.361904761904762, "grad_norm": 0.6645520329475403, "learning_rate": 1e-05, "loss": 1.0768, "mean_token_accuracy": 0.6873211860656738, "num_tokens": 294726732.0, "step": 496 }, { "epoch": 2.3666666666666667, "grad_norm": 0.6538220047950745, "learning_rate": 1e-05, "loss": 1.0682, "mean_token_accuracy": 0.6892035007476807, "num_tokens": 295306697.0, "step": 497 }, { "epoch": 2.3714285714285714, "grad_norm": 0.7154629230499268, "learning_rate": 1e-05, "loss": 1.0893, "mean_token_accuracy": 0.6833094358444214, "num_tokens": 295894972.0, "step": 498 }, { "epoch": 2.376190476190476, "grad_norm": 0.6492322087287903, "learning_rate": 1e-05, "loss": 1.0831, "mean_token_accuracy": 0.6853781938552856, "num_tokens": 296505345.0, "step": 499 }, { "epoch": 2.380952380952381, "grad_norm": 0.7426714301109314, "learning_rate": 1e-05, "loss": 1.0664, "mean_token_accuracy": 0.6893447637557983, "num_tokens": 297100909.0, "step": 500 }, { "epoch": 2.3857142857142857, "grad_norm": 0.6399804353713989, "learning_rate": 1e-05, "loss": 1.0743, "mean_token_accuracy": 0.688417375087738, "num_tokens": 297690267.0, "step": 501 }, { "epoch": 2.3904761904761904, "grad_norm": 0.599839985370636, "learning_rate": 1e-05, "loss": 1.0646, "mean_token_accuracy": 0.6897764801979065, "num_tokens": 298283484.0, "step": 502 }, { "epoch": 2.395238095238095, "grad_norm": 0.6296051740646362, "learning_rate": 1e-05, "loss": 1.0814, "mean_token_accuracy": 0.685540497303009, "num_tokens": 298880193.0, "step": 503 }, { "epoch": 2.4, "grad_norm": 0.5922709107398987, "learning_rate": 1e-05, "loss": 1.058, "mean_token_accuracy": 0.6912336349487305, "num_tokens": 299479995.0, "step": 504 }, { "epoch": 2.4047619047619047, "grad_norm": 0.608103334903717, "learning_rate": 1e-05, "loss": 1.0731, "mean_token_accuracy": 0.6877481937408447, "num_tokens": 300068384.0, "step": 505 }, { "epoch": 2.4095238095238094, "grad_norm": 0.6003749966621399, "learning_rate": 1e-05, "loss": 1.083, "mean_token_accuracy": 0.6847676038742065, "num_tokens": 300687274.0, "step": 506 }, { "epoch": 2.414285714285714, "grad_norm": 0.5747948884963989, "learning_rate": 1e-05, "loss": 1.075, "mean_token_accuracy": 0.6867921352386475, "num_tokens": 301288728.0, "step": 507 }, { "epoch": 2.419047619047619, "grad_norm": 0.6287463307380676, "learning_rate": 1e-05, "loss": 1.0698, "mean_token_accuracy": 0.6888238787651062, "num_tokens": 301868926.0, "step": 508 }, { "epoch": 2.4238095238095236, "grad_norm": 0.5455256104469299, "learning_rate": 1e-05, "loss": 1.0644, "mean_token_accuracy": 0.690401017665863, "num_tokens": 302467630.0, "step": 509 }, { "epoch": 2.4285714285714284, "grad_norm": 0.6476891040802002, "learning_rate": 1e-05, "loss": 1.09, "mean_token_accuracy": 0.6842606663703918, "num_tokens": 303058965.0, "step": 510 }, { "epoch": 2.4333333333333336, "grad_norm": 0.6696739792823792, "learning_rate": 1e-05, "loss": 1.091, "mean_token_accuracy": 0.6829922795295715, "num_tokens": 303639694.0, "step": 511 }, { "epoch": 2.4380952380952383, "grad_norm": 0.5850697159767151, "learning_rate": 1e-05, "loss": 1.0651, "mean_token_accuracy": 0.6903361082077026, "num_tokens": 304234504.0, "step": 512 }, { "epoch": 2.442857142857143, "grad_norm": 0.6123826503753662, "learning_rate": 1e-05, "loss": 1.0848, "mean_token_accuracy": 0.6855412125587463, "num_tokens": 304822484.0, "step": 513 }, { "epoch": 2.447619047619048, "grad_norm": 0.6242313981056213, "learning_rate": 1e-05, "loss": 1.069, "mean_token_accuracy": 0.6895902156829834, "num_tokens": 305405226.0, "step": 514 }, { "epoch": 2.4523809523809526, "grad_norm": 0.6153740286827087, "learning_rate": 1e-05, "loss": 1.0701, "mean_token_accuracy": 0.6889458298683167, "num_tokens": 306007153.0, "step": 515 }, { "epoch": 2.4571428571428573, "grad_norm": 0.6674852967262268, "learning_rate": 1e-05, "loss": 1.0701, "mean_token_accuracy": 0.6897221803665161, "num_tokens": 306588836.0, "step": 516 }, { "epoch": 2.461904761904762, "grad_norm": 0.6560084819793701, "learning_rate": 1e-05, "loss": 1.0804, "mean_token_accuracy": 0.6861131191253662, "num_tokens": 307200955.0, "step": 517 }, { "epoch": 2.466666666666667, "grad_norm": 0.5911952257156372, "learning_rate": 1e-05, "loss": 1.0669, "mean_token_accuracy": 0.6889474987983704, "num_tokens": 307799028.0, "step": 518 }, { "epoch": 2.4714285714285715, "grad_norm": 0.6963088512420654, "learning_rate": 1e-05, "loss": 1.0965, "mean_token_accuracy": 0.6822454929351807, "num_tokens": 308389748.0, "step": 519 }, { "epoch": 2.4761904761904763, "grad_norm": 0.7166724801063538, "learning_rate": 1e-05, "loss": 1.0773, "mean_token_accuracy": 0.6871429681777954, "num_tokens": 308978715.0, "step": 520 }, { "epoch": 2.480952380952381, "grad_norm": 0.598521888256073, "learning_rate": 1e-05, "loss": 1.0756, "mean_token_accuracy": 0.6871167421340942, "num_tokens": 309587298.0, "step": 521 }, { "epoch": 2.4857142857142858, "grad_norm": 0.6383949518203735, "learning_rate": 1e-05, "loss": 1.0643, "mean_token_accuracy": 0.6895929574966431, "num_tokens": 310173585.0, "step": 522 }, { "epoch": 2.4904761904761905, "grad_norm": 0.6667410731315613, "learning_rate": 1e-05, "loss": 1.0736, "mean_token_accuracy": 0.6880219578742981, "num_tokens": 310760313.0, "step": 523 }, { "epoch": 2.4952380952380953, "grad_norm": 0.6218487620353699, "learning_rate": 1e-05, "loss": 1.0764, "mean_token_accuracy": 0.6872262358665466, "num_tokens": 311374002.0, "step": 524 }, { "epoch": 2.5, "grad_norm": 0.6058824062347412, "learning_rate": 1e-05, "loss": 1.0701, "mean_token_accuracy": 0.6883900165557861, "num_tokens": 311952533.0, "step": 525 }, { "epoch": 2.5047619047619047, "grad_norm": 0.6459484100341797, "learning_rate": 1e-05, "loss": 1.065, "mean_token_accuracy": 0.6896857023239136, "num_tokens": 312542383.0, "step": 526 }, { "epoch": 2.5095238095238095, "grad_norm": 0.6192833781242371, "learning_rate": 1e-05, "loss": 1.0745, "mean_token_accuracy": 0.686732828617096, "num_tokens": 313136427.0, "step": 527 }, { "epoch": 2.5142857142857142, "grad_norm": 0.602884829044342, "learning_rate": 1e-05, "loss": 1.0564, "mean_token_accuracy": 0.6925665140151978, "num_tokens": 313731115.0, "step": 528 }, { "epoch": 2.519047619047619, "grad_norm": 0.5805109143257141, "learning_rate": 1e-05, "loss": 1.0644, "mean_token_accuracy": 0.6895827651023865, "num_tokens": 314316253.0, "step": 529 }, { "epoch": 2.5238095238095237, "grad_norm": 0.6484024524688721, "learning_rate": 1e-05, "loss": 1.0634, "mean_token_accuracy": 0.6902580857276917, "num_tokens": 314906539.0, "step": 530 }, { "epoch": 2.5285714285714285, "grad_norm": 0.6236498355865479, "learning_rate": 1e-05, "loss": 1.0611, "mean_token_accuracy": 0.6907384991645813, "num_tokens": 315491005.0, "step": 531 }, { "epoch": 2.533333333333333, "grad_norm": 0.68634432554245, "learning_rate": 1e-05, "loss": 1.0759, "mean_token_accuracy": 0.6861008405685425, "num_tokens": 316086156.0, "step": 532 }, { "epoch": 2.538095238095238, "grad_norm": 0.6483022570610046, "learning_rate": 1e-05, "loss": 1.0809, "mean_token_accuracy": 0.6863186359405518, "num_tokens": 316687284.0, "step": 533 }, { "epoch": 2.5428571428571427, "grad_norm": 0.6313026547431946, "learning_rate": 1e-05, "loss": 1.065, "mean_token_accuracy": 0.6903449296951294, "num_tokens": 317280976.0, "step": 534 }, { "epoch": 2.5476190476190474, "grad_norm": 0.7180777788162231, "learning_rate": 1e-05, "loss": 1.072, "mean_token_accuracy": 0.6879873275756836, "num_tokens": 317869704.0, "step": 535 }, { "epoch": 2.552380952380952, "grad_norm": 0.6203593611717224, "learning_rate": 1e-05, "loss": 1.0754, "mean_token_accuracy": 0.6873841285705566, "num_tokens": 318453830.0, "step": 536 }, { "epoch": 2.557142857142857, "grad_norm": 0.7294032573699951, "learning_rate": 1e-05, "loss": 1.0816, "mean_token_accuracy": 0.6853822469711304, "num_tokens": 319036628.0, "step": 537 }, { "epoch": 2.5619047619047617, "grad_norm": 0.6315251588821411, "learning_rate": 1e-05, "loss": 1.0671, "mean_token_accuracy": 0.6895589828491211, "num_tokens": 319641680.0, "step": 538 }, { "epoch": 2.5666666666666664, "grad_norm": 0.6481133699417114, "learning_rate": 1e-05, "loss": 1.0733, "mean_token_accuracy": 0.6874011754989624, "num_tokens": 320235018.0, "step": 539 }, { "epoch": 2.571428571428571, "grad_norm": 0.6537102460861206, "learning_rate": 1e-05, "loss": 1.0782, "mean_token_accuracy": 0.6865108609199524, "num_tokens": 320839523.0, "step": 540 }, { "epoch": 2.576190476190476, "grad_norm": 0.5990563631057739, "learning_rate": 1e-05, "loss": 1.0692, "mean_token_accuracy": 0.6882610321044922, "num_tokens": 321445469.0, "step": 541 }, { "epoch": 2.580952380952381, "grad_norm": 0.7251924276351929, "learning_rate": 1e-05, "loss": 1.0769, "mean_token_accuracy": 0.685580849647522, "num_tokens": 322040382.0, "step": 542 }, { "epoch": 2.585714285714286, "grad_norm": 0.5734168291091919, "learning_rate": 1e-05, "loss": 1.0731, "mean_token_accuracy": 0.6879425048828125, "num_tokens": 322631980.0, "step": 543 }, { "epoch": 2.5904761904761906, "grad_norm": 0.6524589657783508, "learning_rate": 1e-05, "loss": 1.0715, "mean_token_accuracy": 0.6874139308929443, "num_tokens": 323217003.0, "step": 544 }, { "epoch": 2.5952380952380953, "grad_norm": 0.6292608976364136, "learning_rate": 1e-05, "loss": 1.0751, "mean_token_accuracy": 0.6870990991592407, "num_tokens": 323797882.0, "step": 545 }, { "epoch": 2.6, "grad_norm": 0.631439208984375, "learning_rate": 1e-05, "loss": 1.0783, "mean_token_accuracy": 0.685767412185669, "num_tokens": 324381508.0, "step": 546 }, { "epoch": 2.604761904761905, "grad_norm": 0.621782124042511, "learning_rate": 1e-05, "loss": 1.0954, "mean_token_accuracy": 0.6821581125259399, "num_tokens": 324979976.0, "step": 547 }, { "epoch": 2.6095238095238096, "grad_norm": 0.6306419372558594, "learning_rate": 1e-05, "loss": 1.0677, "mean_token_accuracy": 0.6885519623756409, "num_tokens": 325579147.0, "step": 548 }, { "epoch": 2.6142857142857143, "grad_norm": 0.5700802206993103, "learning_rate": 1e-05, "loss": 1.0701, "mean_token_accuracy": 0.6891588568687439, "num_tokens": 326189724.0, "step": 549 }, { "epoch": 2.619047619047619, "grad_norm": 0.5674880146980286, "learning_rate": 1e-05, "loss": 1.0723, "mean_token_accuracy": 0.6874587535858154, "num_tokens": 326781040.0, "step": 550 }, { "epoch": 2.623809523809524, "grad_norm": 0.6210941076278687, "learning_rate": 1e-05, "loss": 1.066, "mean_token_accuracy": 0.6903613805770874, "num_tokens": 327384993.0, "step": 551 }, { "epoch": 2.6285714285714286, "grad_norm": 0.5762701630592346, "learning_rate": 1e-05, "loss": 1.0541, "mean_token_accuracy": 0.6926007866859436, "num_tokens": 327967527.0, "step": 552 }, { "epoch": 2.6333333333333333, "grad_norm": 0.5869442224502563, "learning_rate": 1e-05, "loss": 1.0602, "mean_token_accuracy": 0.6907045841217041, "num_tokens": 328556111.0, "step": 553 }, { "epoch": 2.638095238095238, "grad_norm": 0.6561670303344727, "learning_rate": 1e-05, "loss": 1.067, "mean_token_accuracy": 0.6888686418533325, "num_tokens": 329156419.0, "step": 554 }, { "epoch": 2.642857142857143, "grad_norm": 0.5729210376739502, "learning_rate": 1e-05, "loss": 1.0908, "mean_token_accuracy": 0.6830568313598633, "num_tokens": 329765795.0, "step": 555 }, { "epoch": 2.6476190476190475, "grad_norm": 0.5583658218383789, "learning_rate": 1e-05, "loss": 1.0715, "mean_token_accuracy": 0.6889873743057251, "num_tokens": 330366805.0, "step": 556 }, { "epoch": 2.6523809523809523, "grad_norm": 0.6156875491142273, "learning_rate": 1e-05, "loss": 1.0597, "mean_token_accuracy": 0.6899721622467041, "num_tokens": 330960683.0, "step": 557 }, { "epoch": 2.657142857142857, "grad_norm": 0.5830056667327881, "learning_rate": 1e-05, "loss": 1.0766, "mean_token_accuracy": 0.6871880292892456, "num_tokens": 331566617.0, "step": 558 }, { "epoch": 2.6619047619047618, "grad_norm": 0.6878387928009033, "learning_rate": 1e-05, "loss": 1.0606, "mean_token_accuracy": 0.6908230781555176, "num_tokens": 332146780.0, "step": 559 }, { "epoch": 2.6666666666666665, "grad_norm": 0.6010000705718994, "learning_rate": 1e-05, "loss": 1.073, "mean_token_accuracy": 0.6884846091270447, "num_tokens": 332744565.0, "step": 560 }, { "epoch": 2.6714285714285713, "grad_norm": 0.6257455348968506, "learning_rate": 1e-05, "loss": 1.0718, "mean_token_accuracy": 0.6879858374595642, "num_tokens": 333327246.0, "step": 561 }, { "epoch": 2.6761904761904765, "grad_norm": 0.6111727356910706, "learning_rate": 1e-05, "loss": 1.0604, "mean_token_accuracy": 0.6906289458274841, "num_tokens": 333924279.0, "step": 562 }, { "epoch": 2.680952380952381, "grad_norm": 0.6363468170166016, "learning_rate": 1e-05, "loss": 1.0628, "mean_token_accuracy": 0.6897515058517456, "num_tokens": 334526451.0, "step": 563 }, { "epoch": 2.685714285714286, "grad_norm": 0.6247795820236206, "learning_rate": 1e-05, "loss": 1.0726, "mean_token_accuracy": 0.6881762742996216, "num_tokens": 335121296.0, "step": 564 }, { "epoch": 2.6904761904761907, "grad_norm": 0.7256935238838196, "learning_rate": 1e-05, "loss": 1.069, "mean_token_accuracy": 0.6878950595855713, "num_tokens": 335705229.0, "step": 565 }, { "epoch": 2.6952380952380954, "grad_norm": 0.6218934655189514, "learning_rate": 1e-05, "loss": 1.0745, "mean_token_accuracy": 0.687312126159668, "num_tokens": 336296515.0, "step": 566 }, { "epoch": 2.7, "grad_norm": 0.64492267370224, "learning_rate": 1e-05, "loss": 1.0772, "mean_token_accuracy": 0.6871779561042786, "num_tokens": 336898581.0, "step": 567 }, { "epoch": 2.704761904761905, "grad_norm": 0.6439410448074341, "learning_rate": 1e-05, "loss": 1.0849, "mean_token_accuracy": 0.685498833656311, "num_tokens": 337492720.0, "step": 568 }, { "epoch": 2.7095238095238097, "grad_norm": 0.5982577204704285, "learning_rate": 1e-05, "loss": 1.0685, "mean_token_accuracy": 0.6890565752983093, "num_tokens": 338088080.0, "step": 569 }, { "epoch": 2.7142857142857144, "grad_norm": 0.6382868885993958, "learning_rate": 1e-05, "loss": 1.0678, "mean_token_accuracy": 0.6893150806427002, "num_tokens": 338682863.0, "step": 570 }, { "epoch": 2.719047619047619, "grad_norm": 0.5995696187019348, "learning_rate": 1e-05, "loss": 1.0737, "mean_token_accuracy": 0.6885708570480347, "num_tokens": 339274595.0, "step": 571 }, { "epoch": 2.723809523809524, "grad_norm": 0.6478890180587769, "learning_rate": 1e-05, "loss": 1.0736, "mean_token_accuracy": 0.687543511390686, "num_tokens": 339857633.0, "step": 572 }, { "epoch": 2.7285714285714286, "grad_norm": 0.6489014625549316, "learning_rate": 1e-05, "loss": 1.0564, "mean_token_accuracy": 0.6918776035308838, "num_tokens": 340451043.0, "step": 573 }, { "epoch": 2.7333333333333334, "grad_norm": 0.6406450271606445, "learning_rate": 1e-05, "loss": 1.0801, "mean_token_accuracy": 0.6862790584564209, "num_tokens": 341042238.0, "step": 574 }, { "epoch": 2.738095238095238, "grad_norm": 0.6261545419692993, "learning_rate": 1e-05, "loss": 1.0766, "mean_token_accuracy": 0.6869131922721863, "num_tokens": 341644006.0, "step": 575 }, { "epoch": 2.742857142857143, "grad_norm": 0.5907791256904602, "learning_rate": 1e-05, "loss": 1.0666, "mean_token_accuracy": 0.6884465217590332, "num_tokens": 342238235.0, "step": 576 }, { "epoch": 2.7476190476190476, "grad_norm": 0.638664186000824, "learning_rate": 1e-05, "loss": 1.0609, "mean_token_accuracy": 0.6908861994743347, "num_tokens": 342831260.0, "step": 577 }, { "epoch": 2.7523809523809524, "grad_norm": 0.6344829797744751, "learning_rate": 1e-05, "loss": 1.0762, "mean_token_accuracy": 0.687269389629364, "num_tokens": 343427629.0, "step": 578 }, { "epoch": 2.757142857142857, "grad_norm": 0.6150461435317993, "learning_rate": 1e-05, "loss": 1.0761, "mean_token_accuracy": 0.6873693466186523, "num_tokens": 344021401.0, "step": 579 }, { "epoch": 2.761904761904762, "grad_norm": 0.6308332681655884, "learning_rate": 1e-05, "loss": 1.0618, "mean_token_accuracy": 0.6911748647689819, "num_tokens": 344610108.0, "step": 580 }, { "epoch": 2.7666666666666666, "grad_norm": 0.55866539478302, "learning_rate": 1e-05, "loss": 1.0725, "mean_token_accuracy": 0.6869944334030151, "num_tokens": 345217241.0, "step": 581 }, { "epoch": 2.7714285714285714, "grad_norm": 0.638909637928009, "learning_rate": 1e-05, "loss": 1.0658, "mean_token_accuracy": 0.6890157461166382, "num_tokens": 345804258.0, "step": 582 }, { "epoch": 2.776190476190476, "grad_norm": 0.5688804984092712, "learning_rate": 1e-05, "loss": 1.0689, "mean_token_accuracy": 0.6887319087982178, "num_tokens": 346399481.0, "step": 583 }, { "epoch": 2.780952380952381, "grad_norm": 0.6002762317657471, "learning_rate": 1e-05, "loss": 1.0563, "mean_token_accuracy": 0.6915134191513062, "num_tokens": 346997922.0, "step": 584 }, { "epoch": 2.7857142857142856, "grad_norm": 0.6163663864135742, "learning_rate": 1e-05, "loss": 1.07, "mean_token_accuracy": 0.6884576082229614, "num_tokens": 347597863.0, "step": 585 }, { "epoch": 2.7904761904761903, "grad_norm": 0.580531656742096, "learning_rate": 1e-05, "loss": 1.0638, "mean_token_accuracy": 0.6888343095779419, "num_tokens": 348201046.0, "step": 586 }, { "epoch": 2.795238095238095, "grad_norm": 0.5918668508529663, "learning_rate": 1e-05, "loss": 1.0584, "mean_token_accuracy": 0.6905962228775024, "num_tokens": 348787326.0, "step": 587 }, { "epoch": 2.8, "grad_norm": 0.6383691430091858, "learning_rate": 1e-05, "loss": 1.0689, "mean_token_accuracy": 0.6883484125137329, "num_tokens": 349380645.0, "step": 588 }, { "epoch": 2.8047619047619046, "grad_norm": 0.6115639805793762, "learning_rate": 1e-05, "loss": 1.0654, "mean_token_accuracy": 0.6897737979888916, "num_tokens": 349983092.0, "step": 589 }, { "epoch": 2.8095238095238093, "grad_norm": 0.6397126317024231, "learning_rate": 1e-05, "loss": 1.0617, "mean_token_accuracy": 0.6903370022773743, "num_tokens": 350576836.0, "step": 590 }, { "epoch": 2.814285714285714, "grad_norm": 0.6862447261810303, "learning_rate": 1e-05, "loss": 1.0624, "mean_token_accuracy": 0.691243588924408, "num_tokens": 351172725.0, "step": 591 }, { "epoch": 2.819047619047619, "grad_norm": 0.6518527269363403, "learning_rate": 1e-05, "loss": 1.0568, "mean_token_accuracy": 0.6918639540672302, "num_tokens": 351777142.0, "step": 592 }, { "epoch": 2.8238095238095235, "grad_norm": 0.7507683634757996, "learning_rate": 1e-05, "loss": 1.0576, "mean_token_accuracy": 0.6908581852912903, "num_tokens": 352361834.0, "step": 593 }, { "epoch": 2.8285714285714287, "grad_norm": 0.6769391298294067, "learning_rate": 1e-05, "loss": 1.0658, "mean_token_accuracy": 0.6891970038414001, "num_tokens": 352964892.0, "step": 594 }, { "epoch": 2.8333333333333335, "grad_norm": 0.7207344770431519, "learning_rate": 1e-05, "loss": 1.0655, "mean_token_accuracy": 0.688566267490387, "num_tokens": 353563696.0, "step": 595 }, { "epoch": 2.8380952380952382, "grad_norm": 0.6687008142471313, "learning_rate": 1e-05, "loss": 1.0663, "mean_token_accuracy": 0.6889446377754211, "num_tokens": 354162323.0, "step": 596 }, { "epoch": 2.842857142857143, "grad_norm": 0.6510334610939026, "learning_rate": 1e-05, "loss": 1.0973, "mean_token_accuracy": 0.6817850470542908, "num_tokens": 354763224.0, "step": 597 }, { "epoch": 2.8476190476190477, "grad_norm": 0.6164536476135254, "learning_rate": 1e-05, "loss": 1.0599, "mean_token_accuracy": 0.6904336214065552, "num_tokens": 355360011.0, "step": 598 }, { "epoch": 2.8523809523809525, "grad_norm": 0.6652323603630066, "learning_rate": 1e-05, "loss": 1.0664, "mean_token_accuracy": 0.6892472505569458, "num_tokens": 355948770.0, "step": 599 }, { "epoch": 2.857142857142857, "grad_norm": 0.6170997619628906, "learning_rate": 1e-05, "loss": 1.0749, "mean_token_accuracy": 0.686385989189148, "num_tokens": 356555915.0, "step": 600 }, { "epoch": 2.861904761904762, "grad_norm": 0.5823125839233398, "learning_rate": 1e-05, "loss": 1.0762, "mean_token_accuracy": 0.6866952180862427, "num_tokens": 357168089.0, "step": 601 }, { "epoch": 2.8666666666666667, "grad_norm": 0.6084815859794617, "learning_rate": 1e-05, "loss": 1.0589, "mean_token_accuracy": 0.6905912160873413, "num_tokens": 357762209.0, "step": 602 }, { "epoch": 2.8714285714285714, "grad_norm": 0.5347459316253662, "learning_rate": 1e-05, "loss": 1.065, "mean_token_accuracy": 0.6898081302642822, "num_tokens": 358365632.0, "step": 603 }, { "epoch": 2.876190476190476, "grad_norm": 0.6211216449737549, "learning_rate": 1e-05, "loss": 1.0795, "mean_token_accuracy": 0.6860474944114685, "num_tokens": 358969038.0, "step": 604 }, { "epoch": 2.880952380952381, "grad_norm": 0.6298102736473083, "learning_rate": 1e-05, "loss": 1.0616, "mean_token_accuracy": 0.691013514995575, "num_tokens": 359560638.0, "step": 605 }, { "epoch": 2.8857142857142857, "grad_norm": 0.6150857210159302, "learning_rate": 1e-05, "loss": 1.0726, "mean_token_accuracy": 0.6874991655349731, "num_tokens": 360159047.0, "step": 606 }, { "epoch": 2.8904761904761904, "grad_norm": 0.6256808638572693, "learning_rate": 1e-05, "loss": 1.0696, "mean_token_accuracy": 0.6895867586135864, "num_tokens": 360752550.0, "step": 607 }, { "epoch": 2.895238095238095, "grad_norm": 0.6338992714881897, "learning_rate": 1e-05, "loss": 1.0706, "mean_token_accuracy": 0.6878842115402222, "num_tokens": 361348966.0, "step": 608 }, { "epoch": 2.9, "grad_norm": 0.6074673533439636, "learning_rate": 1e-05, "loss": 1.0625, "mean_token_accuracy": 0.690711498260498, "num_tokens": 361933541.0, "step": 609 }, { "epoch": 2.9047619047619047, "grad_norm": 0.6169112324714661, "learning_rate": 1e-05, "loss": 1.0679, "mean_token_accuracy": 0.6893640756607056, "num_tokens": 362522689.0, "step": 610 }, { "epoch": 2.9095238095238094, "grad_norm": 0.6712765097618103, "learning_rate": 1e-05, "loss": 1.0481, "mean_token_accuracy": 0.6935627460479736, "num_tokens": 363107779.0, "step": 611 }, { "epoch": 2.914285714285714, "grad_norm": 0.6030009388923645, "learning_rate": 1e-05, "loss": 1.0624, "mean_token_accuracy": 0.6899582147598267, "num_tokens": 363690809.0, "step": 612 }, { "epoch": 2.919047619047619, "grad_norm": 0.6335533261299133, "learning_rate": 1e-05, "loss": 1.0632, "mean_token_accuracy": 0.6892010569572449, "num_tokens": 364279923.0, "step": 613 }, { "epoch": 2.923809523809524, "grad_norm": 0.6299601793289185, "learning_rate": 1e-05, "loss": 1.0536, "mean_token_accuracy": 0.6920279264450073, "num_tokens": 364846929.0, "step": 614 }, { "epoch": 2.928571428571429, "grad_norm": 0.6494601964950562, "learning_rate": 1e-05, "loss": 1.0797, "mean_token_accuracy": 0.6871404051780701, "num_tokens": 365427755.0, "step": 615 }, { "epoch": 2.9333333333333336, "grad_norm": 0.6412233710289001, "learning_rate": 1e-05, "loss": 1.0618, "mean_token_accuracy": 0.6902071833610535, "num_tokens": 366022879.0, "step": 616 }, { "epoch": 2.9380952380952383, "grad_norm": 0.5901429653167725, "learning_rate": 1e-05, "loss": 1.0639, "mean_token_accuracy": 0.6891224384307861, "num_tokens": 366608198.0, "step": 617 }, { "epoch": 2.942857142857143, "grad_norm": 0.6606128811836243, "learning_rate": 1e-05, "loss": 1.0687, "mean_token_accuracy": 0.6881773471832275, "num_tokens": 367187170.0, "step": 618 }, { "epoch": 2.947619047619048, "grad_norm": 0.6021740436553955, "learning_rate": 1e-05, "loss": 1.062, "mean_token_accuracy": 0.6895371675491333, "num_tokens": 367778542.0, "step": 619 }, { "epoch": 2.9523809523809526, "grad_norm": 0.6304929852485657, "learning_rate": 1e-05, "loss": 1.0685, "mean_token_accuracy": 0.6876203417778015, "num_tokens": 368374361.0, "step": 620 }, { "epoch": 2.9571428571428573, "grad_norm": 0.6775472164154053, "learning_rate": 1e-05, "loss": 1.0693, "mean_token_accuracy": 0.688637375831604, "num_tokens": 368975961.0, "step": 621 }, { "epoch": 2.961904761904762, "grad_norm": 0.6188324689865112, "learning_rate": 1e-05, "loss": 1.0446, "mean_token_accuracy": 0.69502854347229, "num_tokens": 369565801.0, "step": 622 }, { "epoch": 2.966666666666667, "grad_norm": 0.7237592339515686, "learning_rate": 1e-05, "loss": 1.069, "mean_token_accuracy": 0.6882259845733643, "num_tokens": 370147963.0, "step": 623 }, { "epoch": 2.9714285714285715, "grad_norm": 0.5706875920295715, "learning_rate": 1e-05, "loss": 1.0661, "mean_token_accuracy": 0.688866376876831, "num_tokens": 370730337.0, "step": 624 }, { "epoch": 2.9761904761904763, "grad_norm": 0.6157565712928772, "learning_rate": 1e-05, "loss": 1.0595, "mean_token_accuracy": 0.6903921365737915, "num_tokens": 371313464.0, "step": 625 }, { "epoch": 2.980952380952381, "grad_norm": 0.5899333953857422, "learning_rate": 1e-05, "loss": 1.0634, "mean_token_accuracy": 0.690090537071228, "num_tokens": 371903211.0, "step": 626 }, { "epoch": 2.9857142857142858, "grad_norm": 0.6269708275794983, "learning_rate": 1e-05, "loss": 1.0536, "mean_token_accuracy": 0.6934218406677246, "num_tokens": 372496314.0, "step": 627 }, { "epoch": 2.9904761904761905, "grad_norm": 0.6969268321990967, "learning_rate": 1e-05, "loss": 1.0676, "mean_token_accuracy": 0.688661515712738, "num_tokens": 373096610.0, "step": 628 }, { "epoch": 2.9952380952380953, "grad_norm": 0.5695185661315918, "learning_rate": 1e-05, "loss": 1.0693, "mean_token_accuracy": 0.6875466108322144, "num_tokens": 373694165.0, "step": 629 }, { "epoch": 3.0, "grad_norm": 0.6636136174201965, "learning_rate": 1e-05, "loss": 1.0583, "mean_token_accuracy": 0.6922066807746887, "num_tokens": 374283247.0, "step": 630 }, { "epoch": 3.0, "step": 630, "total_flos": 2.1853937174671524e+18, "train_loss": 1.1615400253780304, "train_runtime": 1840.1375, "train_samples_per_second": 175.286, "train_steps_per_second": 0.342 } ], "logging_steps": 1, "max_steps": 630, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 315, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1853937174671524e+18, "train_batch_size": 128, "trial_name": null, "trial_params": null }