Files
Llama-3.2-1B-Instruct_SFT_s…/trainer_state.json
ModelHub XC 6f263bcc32 初始化项目,由ModelHub XC社区提供模型
Model: Neelectric/Llama-3.2-1B-Instruct_SFT_sciencev00.04
Source: Original Platform
2026-06-01 02:21:15 +08:00

5714 lines
148 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 630,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004761904761904762,
"grad_norm": 8.288029670715332,
"learning_rate": 0.0,
"loss": 1.7656,
"mean_token_accuracy": 0.5768666863441467,
"num_tokens": 582781.0,
"step": 1
},
{
"epoch": 0.009523809523809525,
"grad_norm": 8.260492324829102,
"learning_rate": 1.5873015873015874e-07,
"loss": 1.7728,
"mean_token_accuracy": 0.5752322673797607,
"num_tokens": 1163696.0,
"step": 2
},
{
"epoch": 0.014285714285714285,
"grad_norm": 8.188252449035645,
"learning_rate": 3.174603174603175e-07,
"loss": 1.776,
"mean_token_accuracy": 0.5746057033538818,
"num_tokens": 1762000.0,
"step": 3
},
{
"epoch": 0.01904761904761905,
"grad_norm": 8.122298240661621,
"learning_rate": 4.7619047619047623e-07,
"loss": 1.7765,
"mean_token_accuracy": 0.5741599798202515,
"num_tokens": 2363228.0,
"step": 4
},
{
"epoch": 0.023809523809523808,
"grad_norm": 7.91809606552124,
"learning_rate": 6.34920634920635e-07,
"loss": 1.7924,
"mean_token_accuracy": 0.5723700523376465,
"num_tokens": 2968748.0,
"step": 5
},
{
"epoch": 0.02857142857142857,
"grad_norm": 7.924537181854248,
"learning_rate": 7.936507936507937e-07,
"loss": 1.7649,
"mean_token_accuracy": 0.5754636526107788,
"num_tokens": 3564062.0,
"step": 6
},
{
"epoch": 0.03333333333333333,
"grad_norm": 7.629780292510986,
"learning_rate": 9.523809523809525e-07,
"loss": 1.7769,
"mean_token_accuracy": 0.5719509124755859,
"num_tokens": 4140352.0,
"step": 7
},
{
"epoch": 0.0380952380952381,
"grad_norm": 7.133674621582031,
"learning_rate": 1.111111111111111e-06,
"loss": 1.7748,
"mean_token_accuracy": 0.5719484090805054,
"num_tokens": 4748067.0,
"step": 8
},
{
"epoch": 0.04285714285714286,
"grad_norm": 6.150221347808838,
"learning_rate": 1.26984126984127e-06,
"loss": 1.7288,
"mean_token_accuracy": 0.5776432156562805,
"num_tokens": 5333791.0,
"step": 9
},
{
"epoch": 0.047619047619047616,
"grad_norm": 6.026834964752197,
"learning_rate": 1.4285714285714286e-06,
"loss": 1.7401,
"mean_token_accuracy": 0.5752577781677246,
"num_tokens": 5923564.0,
"step": 10
},
{
"epoch": 0.05238095238095238,
"grad_norm": 5.608363151550293,
"learning_rate": 1.5873015873015873e-06,
"loss": 1.7097,
"mean_token_accuracy": 0.5797863602638245,
"num_tokens": 6528559.0,
"step": 11
},
{
"epoch": 0.05714285714285714,
"grad_norm": 4.234569072723389,
"learning_rate": 1.746031746031746e-06,
"loss": 1.6598,
"mean_token_accuracy": 0.5850973725318909,
"num_tokens": 7118765.0,
"step": 12
},
{
"epoch": 0.06190476190476191,
"grad_norm": 4.145053386688232,
"learning_rate": 1.904761904761905e-06,
"loss": 1.6597,
"mean_token_accuracy": 0.5842898488044739,
"num_tokens": 7709226.0,
"step": 13
},
{
"epoch": 0.06666666666666667,
"grad_norm": 3.9073646068573,
"learning_rate": 2.0634920634920634e-06,
"loss": 1.6303,
"mean_token_accuracy": 0.5906457901000977,
"num_tokens": 8298984.0,
"step": 14
},
{
"epoch": 0.07142857142857142,
"grad_norm": 3.8127150535583496,
"learning_rate": 2.222222222222222e-06,
"loss": 1.6281,
"mean_token_accuracy": 0.5896565914154053,
"num_tokens": 8875624.0,
"step": 15
},
{
"epoch": 0.0761904761904762,
"grad_norm": 3.0899341106414795,
"learning_rate": 2.380952380952381e-06,
"loss": 1.5687,
"mean_token_accuracy": 0.5990549325942993,
"num_tokens": 9448671.0,
"step": 16
},
{
"epoch": 0.08095238095238096,
"grad_norm": 2.755232334136963,
"learning_rate": 2.53968253968254e-06,
"loss": 1.5548,
"mean_token_accuracy": 0.6021129488945007,
"num_tokens": 10049546.0,
"step": 17
},
{
"epoch": 0.08571428571428572,
"grad_norm": 2.589613914489746,
"learning_rate": 2.6984126984126986e-06,
"loss": 1.5609,
"mean_token_accuracy": 0.5993459820747375,
"num_tokens": 10644905.0,
"step": 18
},
{
"epoch": 0.09047619047619047,
"grad_norm": 2.2161478996276855,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.5541,
"mean_token_accuracy": 0.6018418073654175,
"num_tokens": 11239583.0,
"step": 19
},
{
"epoch": 0.09523809523809523,
"grad_norm": 1.9722470045089722,
"learning_rate": 3.015873015873016e-06,
"loss": 1.5295,
"mean_token_accuracy": 0.6070071458816528,
"num_tokens": 11827320.0,
"step": 20
},
{
"epoch": 0.1,
"grad_norm": 1.8827704191207886,
"learning_rate": 3.1746031746031746e-06,
"loss": 1.4814,
"mean_token_accuracy": 0.6151003837585449,
"num_tokens": 12425511.0,
"step": 21
},
{
"epoch": 0.10476190476190476,
"grad_norm": 2.351033926010132,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.4865,
"mean_token_accuracy": 0.6138286590576172,
"num_tokens": 13015708.0,
"step": 22
},
{
"epoch": 0.10952380952380952,
"grad_norm": 2.134150981903076,
"learning_rate": 3.492063492063492e-06,
"loss": 1.469,
"mean_token_accuracy": 0.6165286302566528,
"num_tokens": 13608875.0,
"step": 23
},
{
"epoch": 0.11428571428571428,
"grad_norm": 1.9380258321762085,
"learning_rate": 3.6507936507936507e-06,
"loss": 1.476,
"mean_token_accuracy": 0.6141604781150818,
"num_tokens": 14204297.0,
"step": 24
},
{
"epoch": 0.11904761904761904,
"grad_norm": 1.656062364578247,
"learning_rate": 3.80952380952381e-06,
"loss": 1.461,
"mean_token_accuracy": 0.6166412830352783,
"num_tokens": 14782206.0,
"step": 25
},
{
"epoch": 0.12380952380952381,
"grad_norm": 1.3905470371246338,
"learning_rate": 3.968253968253968e-06,
"loss": 1.4382,
"mean_token_accuracy": 0.6210923194885254,
"num_tokens": 15377829.0,
"step": 26
},
{
"epoch": 0.12857142857142856,
"grad_norm": 1.1439160108566284,
"learning_rate": 4.126984126984127e-06,
"loss": 1.4318,
"mean_token_accuracy": 0.6224101781845093,
"num_tokens": 15975819.0,
"step": 27
},
{
"epoch": 0.13333333333333333,
"grad_norm": 1.0443707704544067,
"learning_rate": 4.2857142857142855e-06,
"loss": 1.4182,
"mean_token_accuracy": 0.6251046061515808,
"num_tokens": 16577839.0,
"step": 28
},
{
"epoch": 0.1380952380952381,
"grad_norm": 1.0729820728302002,
"learning_rate": 4.444444444444444e-06,
"loss": 1.4116,
"mean_token_accuracy": 0.6257858276367188,
"num_tokens": 17164831.0,
"step": 29
},
{
"epoch": 0.14285714285714285,
"grad_norm": 1.1262085437774658,
"learning_rate": 4.603174603174604e-06,
"loss": 1.3974,
"mean_token_accuracy": 0.6290417909622192,
"num_tokens": 17770476.0,
"step": 30
},
{
"epoch": 0.14761904761904762,
"grad_norm": 1.1004436016082764,
"learning_rate": 4.761904761904762e-06,
"loss": 1.383,
"mean_token_accuracy": 0.6305603981018066,
"num_tokens": 18360862.0,
"step": 31
},
{
"epoch": 0.1523809523809524,
"grad_norm": 0.9822593927383423,
"learning_rate": 4.920634920634921e-06,
"loss": 1.3981,
"mean_token_accuracy": 0.6271172761917114,
"num_tokens": 18944338.0,
"step": 32
},
{
"epoch": 0.15714285714285714,
"grad_norm": 0.8572197556495667,
"learning_rate": 5.07936507936508e-06,
"loss": 1.3721,
"mean_token_accuracy": 0.6327400207519531,
"num_tokens": 19540189.0,
"step": 33
},
{
"epoch": 0.1619047619047619,
"grad_norm": 0.9113824963569641,
"learning_rate": 5.2380952380952384e-06,
"loss": 1.3689,
"mean_token_accuracy": 0.6341559290885925,
"num_tokens": 20138131.0,
"step": 34
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.8736249208450317,
"learning_rate": 5.396825396825397e-06,
"loss": 1.3855,
"mean_token_accuracy": 0.6294394731521606,
"num_tokens": 20735187.0,
"step": 35
},
{
"epoch": 0.17142857142857143,
"grad_norm": 0.8438997268676758,
"learning_rate": 5.555555555555557e-06,
"loss": 1.3614,
"mean_token_accuracy": 0.6335337162017822,
"num_tokens": 21316383.0,
"step": 36
},
{
"epoch": 0.1761904761904762,
"grad_norm": 0.7541394233703613,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.3378,
"mean_token_accuracy": 0.6401833295822144,
"num_tokens": 21910626.0,
"step": 37
},
{
"epoch": 0.18095238095238095,
"grad_norm": 0.697533130645752,
"learning_rate": 5.873015873015874e-06,
"loss": 1.3591,
"mean_token_accuracy": 0.6341187357902527,
"num_tokens": 22503955.0,
"step": 38
},
{
"epoch": 0.18571428571428572,
"grad_norm": 0.677990734577179,
"learning_rate": 6.031746031746032e-06,
"loss": 1.3543,
"mean_token_accuracy": 0.6353764533996582,
"num_tokens": 23093310.0,
"step": 39
},
{
"epoch": 0.19047619047619047,
"grad_norm": 0.677953839302063,
"learning_rate": 6.1904761904761914e-06,
"loss": 1.3249,
"mean_token_accuracy": 0.641827404499054,
"num_tokens": 23681028.0,
"step": 40
},
{
"epoch": 0.19523809523809524,
"grad_norm": 0.6177698969841003,
"learning_rate": 6.349206349206349e-06,
"loss": 1.3271,
"mean_token_accuracy": 0.6412782669067383,
"num_tokens": 24275532.0,
"step": 41
},
{
"epoch": 0.2,
"grad_norm": 0.6382781267166138,
"learning_rate": 6.507936507936509e-06,
"loss": 1.3309,
"mean_token_accuracy": 0.6407559514045715,
"num_tokens": 24868054.0,
"step": 42
},
{
"epoch": 0.20476190476190476,
"grad_norm": 0.5981337428092957,
"learning_rate": 6.666666666666667e-06,
"loss": 1.3323,
"mean_token_accuracy": 0.6397281885147095,
"num_tokens": 25459842.0,
"step": 43
},
{
"epoch": 0.20952380952380953,
"grad_norm": 0.5885143876075745,
"learning_rate": 6.825396825396826e-06,
"loss": 1.339,
"mean_token_accuracy": 0.6373006105422974,
"num_tokens": 26051340.0,
"step": 44
},
{
"epoch": 0.21428571428571427,
"grad_norm": 0.5942175984382629,
"learning_rate": 6.984126984126984e-06,
"loss": 1.3188,
"mean_token_accuracy": 0.6426886320114136,
"num_tokens": 26635240.0,
"step": 45
},
{
"epoch": 0.21904761904761905,
"grad_norm": 0.6174569129943848,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.3198,
"mean_token_accuracy": 0.6419786214828491,
"num_tokens": 27228570.0,
"step": 46
},
{
"epoch": 0.22380952380952382,
"grad_norm": 0.6012991070747375,
"learning_rate": 7.301587301587301e-06,
"loss": 1.3139,
"mean_token_accuracy": 0.6440544128417969,
"num_tokens": 27825958.0,
"step": 47
},
{
"epoch": 0.22857142857142856,
"grad_norm": 0.6103922128677368,
"learning_rate": 7.460317460317461e-06,
"loss": 1.3076,
"mean_token_accuracy": 0.6433683037757874,
"num_tokens": 28418470.0,
"step": 48
},
{
"epoch": 0.23333333333333334,
"grad_norm": 0.6127147674560547,
"learning_rate": 7.61904761904762e-06,
"loss": 1.3044,
"mean_token_accuracy": 0.6449373364448547,
"num_tokens": 29013060.0,
"step": 49
},
{
"epoch": 0.23809523809523808,
"grad_norm": 0.5933082103729248,
"learning_rate": 7.77777777777778e-06,
"loss": 1.3131,
"mean_token_accuracy": 0.6417987942695618,
"num_tokens": 29624709.0,
"step": 50
},
{
"epoch": 0.24285714285714285,
"grad_norm": 0.6003814339637756,
"learning_rate": 7.936507936507936e-06,
"loss": 1.3056,
"mean_token_accuracy": 0.6438874006271362,
"num_tokens": 30227928.0,
"step": 51
},
{
"epoch": 0.24761904761904763,
"grad_norm": 0.5546218156814575,
"learning_rate": 8.095238095238097e-06,
"loss": 1.3073,
"mean_token_accuracy": 0.6426275968551636,
"num_tokens": 30823383.0,
"step": 52
},
{
"epoch": 0.2523809523809524,
"grad_norm": 0.5813356637954712,
"learning_rate": 8.253968253968254e-06,
"loss": 1.2887,
"mean_token_accuracy": 0.6480042338371277,
"num_tokens": 31418593.0,
"step": 53
},
{
"epoch": 0.2571428571428571,
"grad_norm": 0.6125403046607971,
"learning_rate": 8.412698412698414e-06,
"loss": 1.2812,
"mean_token_accuracy": 0.6492801904678345,
"num_tokens": 32008377.0,
"step": 54
},
{
"epoch": 0.2619047619047619,
"grad_norm": 0.6021028757095337,
"learning_rate": 8.571428571428571e-06,
"loss": 1.2881,
"mean_token_accuracy": 0.6466339230537415,
"num_tokens": 32600302.0,
"step": 55
},
{
"epoch": 0.26666666666666666,
"grad_norm": 0.5916977524757385,
"learning_rate": 8.730158730158731e-06,
"loss": 1.2896,
"mean_token_accuracy": 0.6466712951660156,
"num_tokens": 33201147.0,
"step": 56
},
{
"epoch": 0.2714285714285714,
"grad_norm": 0.5573871731758118,
"learning_rate": 8.888888888888888e-06,
"loss": 1.269,
"mean_token_accuracy": 0.6514161229133606,
"num_tokens": 33790565.0,
"step": 57
},
{
"epoch": 0.2761904761904762,
"grad_norm": 0.6427719593048096,
"learning_rate": 9.047619047619049e-06,
"loss": 1.2747,
"mean_token_accuracy": 0.6507048606872559,
"num_tokens": 34387187.0,
"step": 58
},
{
"epoch": 0.28095238095238095,
"grad_norm": 0.5992103219032288,
"learning_rate": 9.206349206349207e-06,
"loss": 1.2832,
"mean_token_accuracy": 0.6487317085266113,
"num_tokens": 35000480.0,
"step": 59
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.6176905632019043,
"learning_rate": 9.365079365079366e-06,
"loss": 1.266,
"mean_token_accuracy": 0.6526767611503601,
"num_tokens": 35588577.0,
"step": 60
},
{
"epoch": 0.2904761904761905,
"grad_norm": 0.6162196397781372,
"learning_rate": 9.523809523809525e-06,
"loss": 1.2696,
"mean_token_accuracy": 0.6507794857025146,
"num_tokens": 36179186.0,
"step": 61
},
{
"epoch": 0.29523809523809524,
"grad_norm": 0.5662937760353088,
"learning_rate": 9.682539682539683e-06,
"loss": 1.2769,
"mean_token_accuracy": 0.6498540639877319,
"num_tokens": 36787338.0,
"step": 62
},
{
"epoch": 0.3,
"grad_norm": 0.6263328790664673,
"learning_rate": 9.841269841269842e-06,
"loss": 1.2659,
"mean_token_accuracy": 0.6512309908866882,
"num_tokens": 37376232.0,
"step": 63
},
{
"epoch": 0.3047619047619048,
"grad_norm": 0.5712647438049316,
"learning_rate": 1e-05,
"loss": 1.2575,
"mean_token_accuracy": 0.65373295545578,
"num_tokens": 37965066.0,
"step": 64
},
{
"epoch": 0.30952380952380953,
"grad_norm": 0.6364603042602539,
"learning_rate": 1e-05,
"loss": 1.2707,
"mean_token_accuracy": 0.6504393219947815,
"num_tokens": 38556474.0,
"step": 65
},
{
"epoch": 0.3142857142857143,
"grad_norm": 0.5501719117164612,
"learning_rate": 1e-05,
"loss": 1.2817,
"mean_token_accuracy": 0.6485756039619446,
"num_tokens": 39153957.0,
"step": 66
},
{
"epoch": 0.319047619047619,
"grad_norm": 0.6252837777137756,
"learning_rate": 1e-05,
"loss": 1.269,
"mean_token_accuracy": 0.6509230136871338,
"num_tokens": 39743079.0,
"step": 67
},
{
"epoch": 0.3238095238095238,
"grad_norm": 0.635744035243988,
"learning_rate": 1e-05,
"loss": 1.2538,
"mean_token_accuracy": 0.6549092531204224,
"num_tokens": 40341422.0,
"step": 68
},
{
"epoch": 0.32857142857142857,
"grad_norm": 0.602989137172699,
"learning_rate": 1e-05,
"loss": 1.2522,
"mean_token_accuracy": 0.6547552347183228,
"num_tokens": 40930579.0,
"step": 69
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.6224581003189087,
"learning_rate": 1e-05,
"loss": 1.2475,
"mean_token_accuracy": 0.6561790704727173,
"num_tokens": 41521392.0,
"step": 70
},
{
"epoch": 0.3380952380952381,
"grad_norm": 0.6388071179389954,
"learning_rate": 1e-05,
"loss": 1.2652,
"mean_token_accuracy": 0.6521209478378296,
"num_tokens": 42126117.0,
"step": 71
},
{
"epoch": 0.34285714285714286,
"grad_norm": 0.6036304235458374,
"learning_rate": 1e-05,
"loss": 1.2435,
"mean_token_accuracy": 0.6566687822341919,
"num_tokens": 42717085.0,
"step": 72
},
{
"epoch": 0.3476190476190476,
"grad_norm": 0.6735650300979614,
"learning_rate": 1e-05,
"loss": 1.2474,
"mean_token_accuracy": 0.6550711989402771,
"num_tokens": 43300932.0,
"step": 73
},
{
"epoch": 0.3523809523809524,
"grad_norm": 0.6821399927139282,
"learning_rate": 1e-05,
"loss": 1.2612,
"mean_token_accuracy": 0.6513347625732422,
"num_tokens": 43885512.0,
"step": 74
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.5906922221183777,
"learning_rate": 1e-05,
"loss": 1.2462,
"mean_token_accuracy": 0.6552602052688599,
"num_tokens": 44482626.0,
"step": 75
},
{
"epoch": 0.3619047619047619,
"grad_norm": 0.6703640222549438,
"learning_rate": 1e-05,
"loss": 1.2555,
"mean_token_accuracy": 0.6526749134063721,
"num_tokens": 45073331.0,
"step": 76
},
{
"epoch": 0.36666666666666664,
"grad_norm": 0.6432617902755737,
"learning_rate": 1e-05,
"loss": 1.2536,
"mean_token_accuracy": 0.654289186000824,
"num_tokens": 45683001.0,
"step": 77
},
{
"epoch": 0.37142857142857144,
"grad_norm": 0.5765655040740967,
"learning_rate": 1e-05,
"loss": 1.2571,
"mean_token_accuracy": 0.6539218425750732,
"num_tokens": 46280871.0,
"step": 78
},
{
"epoch": 0.3761904761904762,
"grad_norm": 0.6340111494064331,
"learning_rate": 1e-05,
"loss": 1.2372,
"mean_token_accuracy": 0.6561391353607178,
"num_tokens": 46860927.0,
"step": 79
},
{
"epoch": 0.38095238095238093,
"grad_norm": 0.6405033469200134,
"learning_rate": 1e-05,
"loss": 1.2526,
"mean_token_accuracy": 0.6536115407943726,
"num_tokens": 47450747.0,
"step": 80
},
{
"epoch": 0.38571428571428573,
"grad_norm": 0.5792959332466125,
"learning_rate": 1e-05,
"loss": 1.25,
"mean_token_accuracy": 0.6553176641464233,
"num_tokens": 48053355.0,
"step": 81
},
{
"epoch": 0.3904761904761905,
"grad_norm": 0.686775267124176,
"learning_rate": 1e-05,
"loss": 1.2208,
"mean_token_accuracy": 0.659858226776123,
"num_tokens": 48654406.0,
"step": 82
},
{
"epoch": 0.3952380952380952,
"grad_norm": 0.6492419838905334,
"learning_rate": 1e-05,
"loss": 1.2283,
"mean_token_accuracy": 0.6583410501480103,
"num_tokens": 49253902.0,
"step": 83
},
{
"epoch": 0.4,
"grad_norm": 0.5871007442474365,
"learning_rate": 1e-05,
"loss": 1.2452,
"mean_token_accuracy": 0.6552358269691467,
"num_tokens": 49851728.0,
"step": 84
},
{
"epoch": 0.40476190476190477,
"grad_norm": 0.5860946774482727,
"learning_rate": 1e-05,
"loss": 1.2512,
"mean_token_accuracy": 0.6536369919776917,
"num_tokens": 50456288.0,
"step": 85
},
{
"epoch": 0.4095238095238095,
"grad_norm": 0.6220575571060181,
"learning_rate": 1e-05,
"loss": 1.2576,
"mean_token_accuracy": 0.6526967883110046,
"num_tokens": 51058176.0,
"step": 86
},
{
"epoch": 0.4142857142857143,
"grad_norm": 0.6111760139465332,
"learning_rate": 1e-05,
"loss": 1.2426,
"mean_token_accuracy": 0.6556516885757446,
"num_tokens": 51665178.0,
"step": 87
},
{
"epoch": 0.41904761904761906,
"grad_norm": 0.7028889060020447,
"learning_rate": 1e-05,
"loss": 1.2275,
"mean_token_accuracy": 0.658629298210144,
"num_tokens": 52237427.0,
"step": 88
},
{
"epoch": 0.4238095238095238,
"grad_norm": 0.6114148497581482,
"learning_rate": 1e-05,
"loss": 1.2493,
"mean_token_accuracy": 0.6530453562736511,
"num_tokens": 52850605.0,
"step": 89
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.6214424967765808,
"learning_rate": 1e-05,
"loss": 1.2107,
"mean_token_accuracy": 0.6619127988815308,
"num_tokens": 53435907.0,
"step": 90
},
{
"epoch": 0.43333333333333335,
"grad_norm": 0.6224313378334045,
"learning_rate": 1e-05,
"loss": 1.2479,
"mean_token_accuracy": 0.6531662344932556,
"num_tokens": 54032690.0,
"step": 91
},
{
"epoch": 0.4380952380952381,
"grad_norm": 0.5745725035667419,
"learning_rate": 1e-05,
"loss": 1.2339,
"mean_token_accuracy": 0.6577485799789429,
"num_tokens": 54631908.0,
"step": 92
},
{
"epoch": 0.44285714285714284,
"grad_norm": 0.6754887104034424,
"learning_rate": 1e-05,
"loss": 1.2274,
"mean_token_accuracy": 0.6584199666976929,
"num_tokens": 55218598.0,
"step": 93
},
{
"epoch": 0.44761904761904764,
"grad_norm": 0.6922246813774109,
"learning_rate": 1e-05,
"loss": 1.2513,
"mean_token_accuracy": 0.6527312397956848,
"num_tokens": 55814642.0,
"step": 94
},
{
"epoch": 0.4523809523809524,
"grad_norm": 0.5802931189537048,
"learning_rate": 1e-05,
"loss": 1.2231,
"mean_token_accuracy": 0.6605392694473267,
"num_tokens": 56410000.0,
"step": 95
},
{
"epoch": 0.45714285714285713,
"grad_norm": 0.7186371088027954,
"learning_rate": 1e-05,
"loss": 1.2325,
"mean_token_accuracy": 0.6574358940124512,
"num_tokens": 57001902.0,
"step": 96
},
{
"epoch": 0.46190476190476193,
"grad_norm": 0.5912067294120789,
"learning_rate": 1e-05,
"loss": 1.2413,
"mean_token_accuracy": 0.6551775336265564,
"num_tokens": 57612227.0,
"step": 97
},
{
"epoch": 0.4666666666666667,
"grad_norm": 0.7110946774482727,
"learning_rate": 1e-05,
"loss": 1.2272,
"mean_token_accuracy": 0.6573148965835571,
"num_tokens": 58198983.0,
"step": 98
},
{
"epoch": 0.4714285714285714,
"grad_norm": 0.703130841255188,
"learning_rate": 1e-05,
"loss": 1.2488,
"mean_token_accuracy": 0.6536985039710999,
"num_tokens": 58805739.0,
"step": 99
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.6474947333335876,
"learning_rate": 1e-05,
"loss": 1.216,
"mean_token_accuracy": 0.6609683036804199,
"num_tokens": 59386596.0,
"step": 100
},
{
"epoch": 0.48095238095238096,
"grad_norm": 0.7493091225624084,
"learning_rate": 1e-05,
"loss": 1.2239,
"mean_token_accuracy": 0.6587037444114685,
"num_tokens": 59976677.0,
"step": 101
},
{
"epoch": 0.4857142857142857,
"grad_norm": 0.6101422905921936,
"learning_rate": 1e-05,
"loss": 1.2366,
"mean_token_accuracy": 0.6560448408126831,
"num_tokens": 60581023.0,
"step": 102
},
{
"epoch": 0.49047619047619045,
"grad_norm": 0.7304781079292297,
"learning_rate": 1e-05,
"loss": 1.2269,
"mean_token_accuracy": 0.6589258909225464,
"num_tokens": 61177587.0,
"step": 103
},
{
"epoch": 0.49523809523809526,
"grad_norm": 0.618215024471283,
"learning_rate": 1e-05,
"loss": 1.2207,
"mean_token_accuracy": 0.6586862802505493,
"num_tokens": 61759739.0,
"step": 104
},
{
"epoch": 0.5,
"grad_norm": 0.6789980530738831,
"learning_rate": 1e-05,
"loss": 1.2283,
"mean_token_accuracy": 0.6580797433853149,
"num_tokens": 62343623.0,
"step": 105
},
{
"epoch": 0.5047619047619047,
"grad_norm": 0.6834375858306885,
"learning_rate": 1e-05,
"loss": 1.2226,
"mean_token_accuracy": 0.6588083505630493,
"num_tokens": 62936609.0,
"step": 106
},
{
"epoch": 0.5095238095238095,
"grad_norm": 0.6128349304199219,
"learning_rate": 1e-05,
"loss": 1.219,
"mean_token_accuracy": 0.6602170467376709,
"num_tokens": 63540035.0,
"step": 107
},
{
"epoch": 0.5142857142857142,
"grad_norm": 0.6424954533576965,
"learning_rate": 1e-05,
"loss": 1.2252,
"mean_token_accuracy": 0.6583743691444397,
"num_tokens": 64137406.0,
"step": 108
},
{
"epoch": 0.5190476190476191,
"grad_norm": 0.566566526889801,
"learning_rate": 1e-05,
"loss": 1.2104,
"mean_token_accuracy": 0.6621809005737305,
"num_tokens": 64747343.0,
"step": 109
},
{
"epoch": 0.5238095238095238,
"grad_norm": 0.5913292169570923,
"learning_rate": 1e-05,
"loss": 1.21,
"mean_token_accuracy": 0.6611165404319763,
"num_tokens": 65340433.0,
"step": 110
},
{
"epoch": 0.5285714285714286,
"grad_norm": 0.5560601353645325,
"learning_rate": 1e-05,
"loss": 1.2029,
"mean_token_accuracy": 0.6629985570907593,
"num_tokens": 65928375.0,
"step": 111
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.5711589455604553,
"learning_rate": 1e-05,
"loss": 1.2285,
"mean_token_accuracy": 0.6574028134346008,
"num_tokens": 66527455.0,
"step": 112
},
{
"epoch": 0.5380952380952381,
"grad_norm": 0.5675383806228638,
"learning_rate": 1e-05,
"loss": 1.2001,
"mean_token_accuracy": 0.6645528674125671,
"num_tokens": 67120147.0,
"step": 113
},
{
"epoch": 0.5428571428571428,
"grad_norm": 0.5860258340835571,
"learning_rate": 1e-05,
"loss": 1.2182,
"mean_token_accuracy": 0.6599565744400024,
"num_tokens": 67726850.0,
"step": 114
},
{
"epoch": 0.5476190476190477,
"grad_norm": 0.5209094285964966,
"learning_rate": 1e-05,
"loss": 1.2126,
"mean_token_accuracy": 0.6609143018722534,
"num_tokens": 68316713.0,
"step": 115
},
{
"epoch": 0.5523809523809524,
"grad_norm": 0.6333171725273132,
"learning_rate": 1e-05,
"loss": 1.2156,
"mean_token_accuracy": 0.6600525379180908,
"num_tokens": 68892365.0,
"step": 116
},
{
"epoch": 0.5571428571428572,
"grad_norm": 0.5704973340034485,
"learning_rate": 1e-05,
"loss": 1.2211,
"mean_token_accuracy": 0.6591875553131104,
"num_tokens": 69505524.0,
"step": 117
},
{
"epoch": 0.5619047619047619,
"grad_norm": 0.7181419134140015,
"learning_rate": 1e-05,
"loss": 1.2036,
"mean_token_accuracy": 0.6623135805130005,
"num_tokens": 70095302.0,
"step": 118
},
{
"epoch": 0.5666666666666667,
"grad_norm": 0.5681948661804199,
"learning_rate": 1e-05,
"loss": 1.216,
"mean_token_accuracy": 0.6598063707351685,
"num_tokens": 70694971.0,
"step": 119
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.7001712918281555,
"learning_rate": 1e-05,
"loss": 1.2146,
"mean_token_accuracy": 0.6608985662460327,
"num_tokens": 71279415.0,
"step": 120
},
{
"epoch": 0.5761904761904761,
"grad_norm": 0.6377084255218506,
"learning_rate": 1e-05,
"loss": 1.209,
"mean_token_accuracy": 0.6621115207672119,
"num_tokens": 71869014.0,
"step": 121
},
{
"epoch": 0.580952380952381,
"grad_norm": 0.6364737153053284,
"learning_rate": 1e-05,
"loss": 1.2171,
"mean_token_accuracy": 0.6591671705245972,
"num_tokens": 72472715.0,
"step": 122
},
{
"epoch": 0.5857142857142857,
"grad_norm": 0.6466585397720337,
"learning_rate": 1e-05,
"loss": 1.2089,
"mean_token_accuracy": 0.661442756652832,
"num_tokens": 73055740.0,
"step": 123
},
{
"epoch": 0.5904761904761905,
"grad_norm": 0.5920109152793884,
"learning_rate": 1e-05,
"loss": 1.1924,
"mean_token_accuracy": 0.6659133434295654,
"num_tokens": 73639151.0,
"step": 124
},
{
"epoch": 0.5952380952380952,
"grad_norm": 0.6872738599777222,
"learning_rate": 1e-05,
"loss": 1.2113,
"mean_token_accuracy": 0.6628360152244568,
"num_tokens": 74216756.0,
"step": 125
},
{
"epoch": 0.6,
"grad_norm": 0.5881339907646179,
"learning_rate": 1e-05,
"loss": 1.2062,
"mean_token_accuracy": 0.662140965461731,
"num_tokens": 74813953.0,
"step": 126
},
{
"epoch": 0.6047619047619047,
"grad_norm": 0.6483287215232849,
"learning_rate": 1e-05,
"loss": 1.2065,
"mean_token_accuracy": 0.6624675989151001,
"num_tokens": 75410691.0,
"step": 127
},
{
"epoch": 0.6095238095238096,
"grad_norm": 0.5890834331512451,
"learning_rate": 1e-05,
"loss": 1.2235,
"mean_token_accuracy": 0.6575560569763184,
"num_tokens": 75996496.0,
"step": 128
},
{
"epoch": 0.6142857142857143,
"grad_norm": 0.6782101988792419,
"learning_rate": 1e-05,
"loss": 1.199,
"mean_token_accuracy": 0.6648662090301514,
"num_tokens": 76585198.0,
"step": 129
},
{
"epoch": 0.6190476190476191,
"grad_norm": 0.6252265572547913,
"learning_rate": 1e-05,
"loss": 1.1872,
"mean_token_accuracy": 0.6665824055671692,
"num_tokens": 77191596.0,
"step": 130
},
{
"epoch": 0.6238095238095238,
"grad_norm": 0.6833210587501526,
"learning_rate": 1e-05,
"loss": 1.2048,
"mean_token_accuracy": 0.6622829437255859,
"num_tokens": 77796998.0,
"step": 131
},
{
"epoch": 0.6285714285714286,
"grad_norm": 0.6870852708816528,
"learning_rate": 1e-05,
"loss": 1.2171,
"mean_token_accuracy": 0.6590390801429749,
"num_tokens": 78395104.0,
"step": 132
},
{
"epoch": 0.6333333333333333,
"grad_norm": 0.7417638897895813,
"learning_rate": 1e-05,
"loss": 1.2036,
"mean_token_accuracy": 0.66297847032547,
"num_tokens": 78988563.0,
"step": 133
},
{
"epoch": 0.638095238095238,
"grad_norm": 0.569595456123352,
"learning_rate": 1e-05,
"loss": 1.2234,
"mean_token_accuracy": 0.6573336124420166,
"num_tokens": 79599633.0,
"step": 134
},
{
"epoch": 0.6428571428571429,
"grad_norm": 0.8054560422897339,
"learning_rate": 1e-05,
"loss": 1.2149,
"mean_token_accuracy": 0.6601018905639648,
"num_tokens": 80196954.0,
"step": 135
},
{
"epoch": 0.6476190476190476,
"grad_norm": 0.6360299587249756,
"learning_rate": 1e-05,
"loss": 1.2141,
"mean_token_accuracy": 0.6599046587944031,
"num_tokens": 80790959.0,
"step": 136
},
{
"epoch": 0.6523809523809524,
"grad_norm": 0.7952516078948975,
"learning_rate": 1e-05,
"loss": 1.2004,
"mean_token_accuracy": 0.6641189455986023,
"num_tokens": 81363350.0,
"step": 137
},
{
"epoch": 0.6571428571428571,
"grad_norm": 0.7050403356552124,
"learning_rate": 1e-05,
"loss": 1.2017,
"mean_token_accuracy": 0.6631975173950195,
"num_tokens": 81960356.0,
"step": 138
},
{
"epoch": 0.6619047619047619,
"grad_norm": 0.809806227684021,
"learning_rate": 1e-05,
"loss": 1.2119,
"mean_token_accuracy": 0.6605896353721619,
"num_tokens": 82573967.0,
"step": 139
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.7040579915046692,
"learning_rate": 1e-05,
"loss": 1.1997,
"mean_token_accuracy": 0.6634917259216309,
"num_tokens": 83170751.0,
"step": 140
},
{
"epoch": 0.6714285714285714,
"grad_norm": 0.7381901144981384,
"learning_rate": 1e-05,
"loss": 1.1815,
"mean_token_accuracy": 0.6675858497619629,
"num_tokens": 83744022.0,
"step": 141
},
{
"epoch": 0.6761904761904762,
"grad_norm": 0.6610327959060669,
"learning_rate": 1e-05,
"loss": 1.2172,
"mean_token_accuracy": 0.659174382686615,
"num_tokens": 84336667.0,
"step": 142
},
{
"epoch": 0.680952380952381,
"grad_norm": 0.8185865879058838,
"learning_rate": 1e-05,
"loss": 1.199,
"mean_token_accuracy": 0.6634737253189087,
"num_tokens": 84927599.0,
"step": 143
},
{
"epoch": 0.6857142857142857,
"grad_norm": 0.6603442430496216,
"learning_rate": 1e-05,
"loss": 1.1976,
"mean_token_accuracy": 0.6643534898757935,
"num_tokens": 85516121.0,
"step": 144
},
{
"epoch": 0.6904761904761905,
"grad_norm": 0.7519460320472717,
"learning_rate": 1e-05,
"loss": 1.1926,
"mean_token_accuracy": 0.664984941482544,
"num_tokens": 86106161.0,
"step": 145
},
{
"epoch": 0.6952380952380952,
"grad_norm": 0.7080089449882507,
"learning_rate": 1e-05,
"loss": 1.2086,
"mean_token_accuracy": 0.6621935963630676,
"num_tokens": 86723880.0,
"step": 146
},
{
"epoch": 0.7,
"grad_norm": 0.7303557395935059,
"learning_rate": 1e-05,
"loss": 1.2033,
"mean_token_accuracy": 0.6634014248847961,
"num_tokens": 87327542.0,
"step": 147
},
{
"epoch": 0.7047619047619048,
"grad_norm": 0.6376964449882507,
"learning_rate": 1e-05,
"loss": 1.1977,
"mean_token_accuracy": 0.6633247137069702,
"num_tokens": 87912557.0,
"step": 148
},
{
"epoch": 0.7095238095238096,
"grad_norm": 0.6810888051986694,
"learning_rate": 1e-05,
"loss": 1.2087,
"mean_token_accuracy": 0.6617689728736877,
"num_tokens": 88514499.0,
"step": 149
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.6272366046905518,
"learning_rate": 1e-05,
"loss": 1.1879,
"mean_token_accuracy": 0.6662660837173462,
"num_tokens": 89090412.0,
"step": 150
},
{
"epoch": 0.719047619047619,
"grad_norm": 0.6499550938606262,
"learning_rate": 1e-05,
"loss": 1.1978,
"mean_token_accuracy": 0.6638685464859009,
"num_tokens": 89689944.0,
"step": 151
},
{
"epoch": 0.7238095238095238,
"grad_norm": 0.6450507640838623,
"learning_rate": 1e-05,
"loss": 1.2088,
"mean_token_accuracy": 0.6614329218864441,
"num_tokens": 90281605.0,
"step": 152
},
{
"epoch": 0.7285714285714285,
"grad_norm": 0.6113287806510925,
"learning_rate": 1e-05,
"loss": 1.2095,
"mean_token_accuracy": 0.6616454124450684,
"num_tokens": 90877169.0,
"step": 153
},
{
"epoch": 0.7333333333333333,
"grad_norm": 0.6421619653701782,
"learning_rate": 1e-05,
"loss": 1.2141,
"mean_token_accuracy": 0.6598343253135681,
"num_tokens": 91473587.0,
"step": 154
},
{
"epoch": 0.7380952380952381,
"grad_norm": 0.5994828939437866,
"learning_rate": 1e-05,
"loss": 1.2069,
"mean_token_accuracy": 0.6615887880325317,
"num_tokens": 92066142.0,
"step": 155
},
{
"epoch": 0.7428571428571429,
"grad_norm": 0.5635871887207031,
"learning_rate": 1e-05,
"loss": 1.1885,
"mean_token_accuracy": 0.6657248735427856,
"num_tokens": 92671294.0,
"step": 156
},
{
"epoch": 0.7476190476190476,
"grad_norm": 0.5961142778396606,
"learning_rate": 1e-05,
"loss": 1.1915,
"mean_token_accuracy": 0.6649054884910583,
"num_tokens": 93267004.0,
"step": 157
},
{
"epoch": 0.7523809523809524,
"grad_norm": 0.5518187284469604,
"learning_rate": 1e-05,
"loss": 1.2093,
"mean_token_accuracy": 0.6612235307693481,
"num_tokens": 93865099.0,
"step": 158
},
{
"epoch": 0.7571428571428571,
"grad_norm": 0.6183374524116516,
"learning_rate": 1e-05,
"loss": 1.1825,
"mean_token_accuracy": 0.6676396131515503,
"num_tokens": 94449283.0,
"step": 159
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.5925056338310242,
"learning_rate": 1e-05,
"loss": 1.1927,
"mean_token_accuracy": 0.6643291711807251,
"num_tokens": 95037160.0,
"step": 160
},
{
"epoch": 0.7666666666666667,
"grad_norm": 0.6148018836975098,
"learning_rate": 1e-05,
"loss": 1.1761,
"mean_token_accuracy": 0.6689929962158203,
"num_tokens": 95620329.0,
"step": 161
},
{
"epoch": 0.7714285714285715,
"grad_norm": 0.6416387557983398,
"learning_rate": 1e-05,
"loss": 1.1978,
"mean_token_accuracy": 0.6625751256942749,
"num_tokens": 96202979.0,
"step": 162
},
{
"epoch": 0.7761904761904762,
"grad_norm": 0.5393695831298828,
"learning_rate": 1e-05,
"loss": 1.1918,
"mean_token_accuracy": 0.665260910987854,
"num_tokens": 96794135.0,
"step": 163
},
{
"epoch": 0.780952380952381,
"grad_norm": 0.6334103941917419,
"learning_rate": 1e-05,
"loss": 1.1821,
"mean_token_accuracy": 0.6664952635765076,
"num_tokens": 97380180.0,
"step": 164
},
{
"epoch": 0.7857142857142857,
"grad_norm": 0.6443802118301392,
"learning_rate": 1e-05,
"loss": 1.2005,
"mean_token_accuracy": 0.663545548915863,
"num_tokens": 97979583.0,
"step": 165
},
{
"epoch": 0.7904761904761904,
"grad_norm": 0.6070786714553833,
"learning_rate": 1e-05,
"loss": 1.1818,
"mean_token_accuracy": 0.6681106686592102,
"num_tokens": 98573453.0,
"step": 166
},
{
"epoch": 0.7952380952380952,
"grad_norm": 0.5983892679214478,
"learning_rate": 1e-05,
"loss": 1.189,
"mean_token_accuracy": 0.6651272177696228,
"num_tokens": 99162518.0,
"step": 167
},
{
"epoch": 0.8,
"grad_norm": 0.5511825084686279,
"learning_rate": 1e-05,
"loss": 1.1859,
"mean_token_accuracy": 0.6656243801116943,
"num_tokens": 99755688.0,
"step": 168
},
{
"epoch": 0.8047619047619048,
"grad_norm": 0.5612326264381409,
"learning_rate": 1e-05,
"loss": 1.1923,
"mean_token_accuracy": 0.6645892858505249,
"num_tokens": 100367122.0,
"step": 169
},
{
"epoch": 0.8095238095238095,
"grad_norm": 0.6149346232414246,
"learning_rate": 1e-05,
"loss": 1.1866,
"mean_token_accuracy": 0.665177583694458,
"num_tokens": 100966663.0,
"step": 170
},
{
"epoch": 0.8142857142857143,
"grad_norm": 0.5557584166526794,
"learning_rate": 1e-05,
"loss": 1.1993,
"mean_token_accuracy": 0.6638921499252319,
"num_tokens": 101561561.0,
"step": 171
},
{
"epoch": 0.819047619047619,
"grad_norm": 0.6174666285514832,
"learning_rate": 1e-05,
"loss": 1.2058,
"mean_token_accuracy": 0.6619209051132202,
"num_tokens": 102150367.0,
"step": 172
},
{
"epoch": 0.8238095238095238,
"grad_norm": 0.6149846911430359,
"learning_rate": 1e-05,
"loss": 1.1956,
"mean_token_accuracy": 0.6646385788917542,
"num_tokens": 102744438.0,
"step": 173
},
{
"epoch": 0.8285714285714286,
"grad_norm": 0.6205980777740479,
"learning_rate": 1e-05,
"loss": 1.1944,
"mean_token_accuracy": 0.6641254425048828,
"num_tokens": 103336159.0,
"step": 174
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.6782044172286987,
"learning_rate": 1e-05,
"loss": 1.1993,
"mean_token_accuracy": 0.6630405187606812,
"num_tokens": 103933457.0,
"step": 175
},
{
"epoch": 0.8380952380952381,
"grad_norm": 0.6339226961135864,
"learning_rate": 1e-05,
"loss": 1.1854,
"mean_token_accuracy": 0.6652607917785645,
"num_tokens": 104528020.0,
"step": 176
},
{
"epoch": 0.8428571428571429,
"grad_norm": 0.604350209236145,
"learning_rate": 1e-05,
"loss": 1.2142,
"mean_token_accuracy": 0.6597182750701904,
"num_tokens": 105126562.0,
"step": 177
},
{
"epoch": 0.8476190476190476,
"grad_norm": 0.5730092525482178,
"learning_rate": 1e-05,
"loss": 1.1796,
"mean_token_accuracy": 0.6674203872680664,
"num_tokens": 105730229.0,
"step": 178
},
{
"epoch": 0.8523809523809524,
"grad_norm": 0.6724650263786316,
"learning_rate": 1e-05,
"loss": 1.201,
"mean_token_accuracy": 0.6622498035430908,
"num_tokens": 106338239.0,
"step": 179
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.5882953405380249,
"learning_rate": 1e-05,
"loss": 1.1982,
"mean_token_accuracy": 0.6630674600601196,
"num_tokens": 106929782.0,
"step": 180
},
{
"epoch": 0.861904761904762,
"grad_norm": 0.6305244565010071,
"learning_rate": 1e-05,
"loss": 1.1932,
"mean_token_accuracy": 0.6646133661270142,
"num_tokens": 107516950.0,
"step": 181
},
{
"epoch": 0.8666666666666667,
"grad_norm": 0.6297836899757385,
"learning_rate": 1e-05,
"loss": 1.1825,
"mean_token_accuracy": 0.6660134792327881,
"num_tokens": 108104046.0,
"step": 182
},
{
"epoch": 0.8714285714285714,
"grad_norm": 0.5446469783782959,
"learning_rate": 1e-05,
"loss": 1.1992,
"mean_token_accuracy": 0.6630533933639526,
"num_tokens": 108711068.0,
"step": 183
},
{
"epoch": 0.8761904761904762,
"grad_norm": 0.5844411253929138,
"learning_rate": 1e-05,
"loss": 1.1687,
"mean_token_accuracy": 0.669592022895813,
"num_tokens": 109294847.0,
"step": 184
},
{
"epoch": 0.8809523809523809,
"grad_norm": 0.6065420508384705,
"learning_rate": 1e-05,
"loss": 1.1886,
"mean_token_accuracy": 0.664987325668335,
"num_tokens": 109903424.0,
"step": 185
},
{
"epoch": 0.8857142857142857,
"grad_norm": 0.6002596616744995,
"learning_rate": 1e-05,
"loss": 1.1894,
"mean_token_accuracy": 0.66484135389328,
"num_tokens": 110515082.0,
"step": 186
},
{
"epoch": 0.8904761904761904,
"grad_norm": 0.5755858421325684,
"learning_rate": 1e-05,
"loss": 1.1887,
"mean_token_accuracy": 0.6651521325111389,
"num_tokens": 111105456.0,
"step": 187
},
{
"epoch": 0.8952380952380953,
"grad_norm": 0.6171888709068298,
"learning_rate": 1e-05,
"loss": 1.1893,
"mean_token_accuracy": 0.6657494306564331,
"num_tokens": 111699029.0,
"step": 188
},
{
"epoch": 0.9,
"grad_norm": 0.579205334186554,
"learning_rate": 1e-05,
"loss": 1.1659,
"mean_token_accuracy": 0.6696426272392273,
"num_tokens": 112280321.0,
"step": 189
},
{
"epoch": 0.9047619047619048,
"grad_norm": 0.6712483167648315,
"learning_rate": 1e-05,
"loss": 1.1677,
"mean_token_accuracy": 0.6694087982177734,
"num_tokens": 112860009.0,
"step": 190
},
{
"epoch": 0.9095238095238095,
"grad_norm": 0.6215792894363403,
"learning_rate": 1e-05,
"loss": 1.1872,
"mean_token_accuracy": 0.6649343967437744,
"num_tokens": 113457303.0,
"step": 191
},
{
"epoch": 0.9142857142857143,
"grad_norm": 0.5627334117889404,
"learning_rate": 1e-05,
"loss": 1.181,
"mean_token_accuracy": 0.6672377586364746,
"num_tokens": 114054977.0,
"step": 192
},
{
"epoch": 0.919047619047619,
"grad_norm": 0.5678215622901917,
"learning_rate": 1e-05,
"loss": 1.1778,
"mean_token_accuracy": 0.6673398613929749,
"num_tokens": 114641555.0,
"step": 193
},
{
"epoch": 0.9238095238095239,
"grad_norm": 0.5933332443237305,
"learning_rate": 1e-05,
"loss": 1.1939,
"mean_token_accuracy": 0.6647536754608154,
"num_tokens": 115241437.0,
"step": 194
},
{
"epoch": 0.9285714285714286,
"grad_norm": 0.5732199549674988,
"learning_rate": 1e-05,
"loss": 1.1714,
"mean_token_accuracy": 0.6686159372329712,
"num_tokens": 115845775.0,
"step": 195
},
{
"epoch": 0.9333333333333333,
"grad_norm": 0.6514256596565247,
"learning_rate": 1e-05,
"loss": 1.1782,
"mean_token_accuracy": 0.6679466962814331,
"num_tokens": 116452623.0,
"step": 196
},
{
"epoch": 0.9380952380952381,
"grad_norm": 0.5765755772590637,
"learning_rate": 1e-05,
"loss": 1.1861,
"mean_token_accuracy": 0.6654437780380249,
"num_tokens": 117045570.0,
"step": 197
},
{
"epoch": 0.9428571428571428,
"grad_norm": 0.7004836797714233,
"learning_rate": 1e-05,
"loss": 1.1638,
"mean_token_accuracy": 0.6707776784896851,
"num_tokens": 117654535.0,
"step": 198
},
{
"epoch": 0.9476190476190476,
"grad_norm": 0.5966997146606445,
"learning_rate": 1e-05,
"loss": 1.1772,
"mean_token_accuracy": 0.6684892177581787,
"num_tokens": 118247244.0,
"step": 199
},
{
"epoch": 0.9523809523809523,
"grad_norm": 0.6460300087928772,
"learning_rate": 1e-05,
"loss": 1.1713,
"mean_token_accuracy": 0.6694802045822144,
"num_tokens": 118843074.0,
"step": 200
},
{
"epoch": 0.9571428571428572,
"grad_norm": 0.599161684513092,
"learning_rate": 1e-05,
"loss": 1.1712,
"mean_token_accuracy": 0.6690815687179565,
"num_tokens": 119445023.0,
"step": 201
},
{
"epoch": 0.9619047619047619,
"grad_norm": 0.6229502558708191,
"learning_rate": 1e-05,
"loss": 1.1864,
"mean_token_accuracy": 0.6660387516021729,
"num_tokens": 120045748.0,
"step": 202
},
{
"epoch": 0.9666666666666667,
"grad_norm": 0.6429843306541443,
"learning_rate": 1e-05,
"loss": 1.1785,
"mean_token_accuracy": 0.6669691205024719,
"num_tokens": 120635079.0,
"step": 203
},
{
"epoch": 0.9714285714285714,
"grad_norm": 0.6153910756111145,
"learning_rate": 1e-05,
"loss": 1.1791,
"mean_token_accuracy": 0.66630619764328,
"num_tokens": 121220486.0,
"step": 204
},
{
"epoch": 0.9761904761904762,
"grad_norm": 0.6496953368186951,
"learning_rate": 1e-05,
"loss": 1.1804,
"mean_token_accuracy": 0.6666555404663086,
"num_tokens": 121800676.0,
"step": 205
},
{
"epoch": 0.9809523809523809,
"grad_norm": 0.6011868119239807,
"learning_rate": 1e-05,
"loss": 1.1842,
"mean_token_accuracy": 0.6658217906951904,
"num_tokens": 122409399.0,
"step": 206
},
{
"epoch": 0.9857142857142858,
"grad_norm": 0.857315182685852,
"learning_rate": 1e-05,
"loss": 1.1652,
"mean_token_accuracy": 0.6701173186302185,
"num_tokens": 123003502.0,
"step": 207
},
{
"epoch": 0.9904761904761905,
"grad_norm": 0.6711968183517456,
"learning_rate": 1e-05,
"loss": 1.1821,
"mean_token_accuracy": 0.6669960021972656,
"num_tokens": 123595838.0,
"step": 208
},
{
"epoch": 0.9952380952380953,
"grad_norm": 0.8044399619102478,
"learning_rate": 1e-05,
"loss": 1.1797,
"mean_token_accuracy": 0.6671728491783142,
"num_tokens": 124166476.0,
"step": 209
},
{
"epoch": 1.0,
"grad_norm": 0.724872887134552,
"learning_rate": 1e-05,
"loss": 1.1689,
"mean_token_accuracy": 0.66896653175354,
"num_tokens": 124761423.0,
"step": 210
},
{
"epoch": 1.0047619047619047,
"grad_norm": 0.7732614278793335,
"learning_rate": 1e-05,
"loss": 1.176,
"mean_token_accuracy": 0.6668572425842285,
"num_tokens": 125364371.0,
"step": 211
},
{
"epoch": 1.0095238095238095,
"grad_norm": 0.6983124017715454,
"learning_rate": 1e-05,
"loss": 1.1342,
"mean_token_accuracy": 0.6760746240615845,
"num_tokens": 125954118.0,
"step": 212
},
{
"epoch": 1.0142857142857142,
"grad_norm": 0.6097580790519714,
"learning_rate": 1e-05,
"loss": 1.1398,
"mean_token_accuracy": 0.6745401620864868,
"num_tokens": 126544991.0,
"step": 213
},
{
"epoch": 1.019047619047619,
"grad_norm": 0.6844852566719055,
"learning_rate": 1e-05,
"loss": 1.1425,
"mean_token_accuracy": 0.6751389503479004,
"num_tokens": 127151772.0,
"step": 214
},
{
"epoch": 1.0238095238095237,
"grad_norm": 0.7108845114707947,
"learning_rate": 1e-05,
"loss": 1.1472,
"mean_token_accuracy": 0.6734536290168762,
"num_tokens": 127762517.0,
"step": 215
},
{
"epoch": 1.0285714285714285,
"grad_norm": 0.7051171660423279,
"learning_rate": 1e-05,
"loss": 1.1516,
"mean_token_accuracy": 0.672438383102417,
"num_tokens": 128358892.0,
"step": 216
},
{
"epoch": 1.0333333333333334,
"grad_norm": 0.742440938949585,
"learning_rate": 1e-05,
"loss": 1.1486,
"mean_token_accuracy": 0.6727321743965149,
"num_tokens": 128930309.0,
"step": 217
},
{
"epoch": 1.0380952380952382,
"grad_norm": 0.6921288371086121,
"learning_rate": 1e-05,
"loss": 1.1336,
"mean_token_accuracy": 0.6769453883171082,
"num_tokens": 129537678.0,
"step": 218
},
{
"epoch": 1.042857142857143,
"grad_norm": 0.6531715989112854,
"learning_rate": 1e-05,
"loss": 1.1486,
"mean_token_accuracy": 0.6732891201972961,
"num_tokens": 130113717.0,
"step": 219
},
{
"epoch": 1.0476190476190477,
"grad_norm": 0.8497748970985413,
"learning_rate": 1e-05,
"loss": 1.1554,
"mean_token_accuracy": 0.6714987754821777,
"num_tokens": 130724521.0,
"step": 220
},
{
"epoch": 1.0523809523809524,
"grad_norm": 0.6819850206375122,
"learning_rate": 1e-05,
"loss": 1.1407,
"mean_token_accuracy": 0.6752928495407104,
"num_tokens": 131298037.0,
"step": 221
},
{
"epoch": 1.0571428571428572,
"grad_norm": 0.785930335521698,
"learning_rate": 1e-05,
"loss": 1.1486,
"mean_token_accuracy": 0.6729685068130493,
"num_tokens": 131909779.0,
"step": 222
},
{
"epoch": 1.061904761904762,
"grad_norm": 0.6023511290550232,
"learning_rate": 1e-05,
"loss": 1.1458,
"mean_token_accuracy": 0.6734186410903931,
"num_tokens": 132506621.0,
"step": 223
},
{
"epoch": 1.0666666666666667,
"grad_norm": 0.8720818758010864,
"learning_rate": 1e-05,
"loss": 1.1498,
"mean_token_accuracy": 0.6726520657539368,
"num_tokens": 133124443.0,
"step": 224
},
{
"epoch": 1.0714285714285714,
"grad_norm": 0.6429004073143005,
"learning_rate": 1e-05,
"loss": 1.1608,
"mean_token_accuracy": 0.6698133945465088,
"num_tokens": 133719672.0,
"step": 225
},
{
"epoch": 1.0761904761904761,
"grad_norm": 0.7744424343109131,
"learning_rate": 1e-05,
"loss": 1.1357,
"mean_token_accuracy": 0.6747680306434631,
"num_tokens": 134309852.0,
"step": 226
},
{
"epoch": 1.0809523809523809,
"grad_norm": 0.7106124758720398,
"learning_rate": 1e-05,
"loss": 1.1472,
"mean_token_accuracy": 0.6723679900169373,
"num_tokens": 134890952.0,
"step": 227
},
{
"epoch": 1.0857142857142856,
"grad_norm": 0.8420917987823486,
"learning_rate": 1e-05,
"loss": 1.1144,
"mean_token_accuracy": 0.6813350915908813,
"num_tokens": 135479588.0,
"step": 228
},
{
"epoch": 1.0904761904761904,
"grad_norm": 0.7307847738265991,
"learning_rate": 1e-05,
"loss": 1.14,
"mean_token_accuracy": 0.6748834848403931,
"num_tokens": 136065836.0,
"step": 229
},
{
"epoch": 1.0952380952380953,
"grad_norm": 0.6740959882736206,
"learning_rate": 1e-05,
"loss": 1.1377,
"mean_token_accuracy": 0.6761696934700012,
"num_tokens": 136668062.0,
"step": 230
},
{
"epoch": 1.1,
"grad_norm": 0.6920994520187378,
"learning_rate": 1e-05,
"loss": 1.1398,
"mean_token_accuracy": 0.6743276715278625,
"num_tokens": 137256533.0,
"step": 231
},
{
"epoch": 1.1047619047619048,
"grad_norm": 0.6870349645614624,
"learning_rate": 1e-05,
"loss": 1.1459,
"mean_token_accuracy": 0.6732701063156128,
"num_tokens": 137848246.0,
"step": 232
},
{
"epoch": 1.1095238095238096,
"grad_norm": 0.6535449028015137,
"learning_rate": 1e-05,
"loss": 1.1494,
"mean_token_accuracy": 0.6729423999786377,
"num_tokens": 138450870.0,
"step": 233
},
{
"epoch": 1.1142857142857143,
"grad_norm": 0.6108024716377258,
"learning_rate": 1e-05,
"loss": 1.1498,
"mean_token_accuracy": 0.671773374080658,
"num_tokens": 139048178.0,
"step": 234
},
{
"epoch": 1.119047619047619,
"grad_norm": 0.618743360042572,
"learning_rate": 1e-05,
"loss": 1.1394,
"mean_token_accuracy": 0.6749163866043091,
"num_tokens": 139647536.0,
"step": 235
},
{
"epoch": 1.1238095238095238,
"grad_norm": 0.5873496532440186,
"learning_rate": 1e-05,
"loss": 1.1428,
"mean_token_accuracy": 0.6742033958435059,
"num_tokens": 140237569.0,
"step": 236
},
{
"epoch": 1.1285714285714286,
"grad_norm": 0.6749809980392456,
"learning_rate": 1e-05,
"loss": 1.1462,
"mean_token_accuracy": 0.67291659116745,
"num_tokens": 140808948.0,
"step": 237
},
{
"epoch": 1.1333333333333333,
"grad_norm": 0.5988799333572388,
"learning_rate": 1e-05,
"loss": 1.1417,
"mean_token_accuracy": 0.6734879016876221,
"num_tokens": 141387906.0,
"step": 238
},
{
"epoch": 1.138095238095238,
"grad_norm": 0.7041788697242737,
"learning_rate": 1e-05,
"loss": 1.1421,
"mean_token_accuracy": 0.6749635934829712,
"num_tokens": 141991024.0,
"step": 239
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.677106499671936,
"learning_rate": 1e-05,
"loss": 1.122,
"mean_token_accuracy": 0.6789741516113281,
"num_tokens": 142585170.0,
"step": 240
},
{
"epoch": 1.1476190476190475,
"grad_norm": 0.6422439217567444,
"learning_rate": 1e-05,
"loss": 1.1509,
"mean_token_accuracy": 0.6719658374786377,
"num_tokens": 143178473.0,
"step": 241
},
{
"epoch": 1.1523809523809523,
"grad_norm": 0.6920860409736633,
"learning_rate": 1e-05,
"loss": 1.1511,
"mean_token_accuracy": 0.6708908677101135,
"num_tokens": 143782184.0,
"step": 242
},
{
"epoch": 1.157142857142857,
"grad_norm": 0.5582302212715149,
"learning_rate": 1e-05,
"loss": 1.1331,
"mean_token_accuracy": 0.6759682297706604,
"num_tokens": 144383051.0,
"step": 243
},
{
"epoch": 1.161904761904762,
"grad_norm": 0.6627556085586548,
"learning_rate": 1e-05,
"loss": 1.1432,
"mean_token_accuracy": 0.6744831204414368,
"num_tokens": 144977872.0,
"step": 244
},
{
"epoch": 1.1666666666666667,
"grad_norm": 0.5956741571426392,
"learning_rate": 1e-05,
"loss": 1.1456,
"mean_token_accuracy": 0.6733117699623108,
"num_tokens": 145573077.0,
"step": 245
},
{
"epoch": 1.1714285714285715,
"grad_norm": 0.7862910628318787,
"learning_rate": 1e-05,
"loss": 1.1464,
"mean_token_accuracy": 0.6739993691444397,
"num_tokens": 146165107.0,
"step": 246
},
{
"epoch": 1.1761904761904762,
"grad_norm": 0.6099702715873718,
"learning_rate": 1e-05,
"loss": 1.1393,
"mean_token_accuracy": 0.6740779876708984,
"num_tokens": 146763356.0,
"step": 247
},
{
"epoch": 1.180952380952381,
"grad_norm": 0.7584065198898315,
"learning_rate": 1e-05,
"loss": 1.136,
"mean_token_accuracy": 0.6759775280952454,
"num_tokens": 147358035.0,
"step": 248
},
{
"epoch": 1.1857142857142857,
"grad_norm": 0.6754823327064514,
"learning_rate": 1e-05,
"loss": 1.1523,
"mean_token_accuracy": 0.6710008978843689,
"num_tokens": 147955530.0,
"step": 249
},
{
"epoch": 1.1904761904761905,
"grad_norm": 0.6045711636543274,
"learning_rate": 1e-05,
"loss": 1.1468,
"mean_token_accuracy": 0.6728271245956421,
"num_tokens": 148547950.0,
"step": 250
},
{
"epoch": 1.1952380952380952,
"grad_norm": 0.6770275235176086,
"learning_rate": 1e-05,
"loss": 1.1309,
"mean_token_accuracy": 0.6762286424636841,
"num_tokens": 149127280.0,
"step": 251
},
{
"epoch": 1.2,
"grad_norm": 0.5667791366577148,
"learning_rate": 1e-05,
"loss": 1.1389,
"mean_token_accuracy": 0.6750789284706116,
"num_tokens": 149735096.0,
"step": 252
},
{
"epoch": 1.2047619047619047,
"grad_norm": 0.6122450232505798,
"learning_rate": 1e-05,
"loss": 1.1423,
"mean_token_accuracy": 0.6746940612792969,
"num_tokens": 150338864.0,
"step": 253
},
{
"epoch": 1.2095238095238094,
"grad_norm": 0.6596109867095947,
"learning_rate": 1e-05,
"loss": 1.1234,
"mean_token_accuracy": 0.6786649227142334,
"num_tokens": 150940365.0,
"step": 254
},
{
"epoch": 1.2142857142857142,
"grad_norm": 0.6414262652397156,
"learning_rate": 1e-05,
"loss": 1.1454,
"mean_token_accuracy": 0.6734991073608398,
"num_tokens": 151517902.0,
"step": 255
},
{
"epoch": 1.2190476190476192,
"grad_norm": 0.7465854287147522,
"learning_rate": 1e-05,
"loss": 1.1225,
"mean_token_accuracy": 0.6790366172790527,
"num_tokens": 152093932.0,
"step": 256
},
{
"epoch": 1.223809523809524,
"grad_norm": 0.6045883297920227,
"learning_rate": 1e-05,
"loss": 1.1281,
"mean_token_accuracy": 0.6779497861862183,
"num_tokens": 152690003.0,
"step": 257
},
{
"epoch": 1.2285714285714286,
"grad_norm": 0.7717053890228271,
"learning_rate": 1e-05,
"loss": 1.1305,
"mean_token_accuracy": 0.6769629716873169,
"num_tokens": 153278014.0,
"step": 258
},
{
"epoch": 1.2333333333333334,
"grad_norm": 0.6217109560966492,
"learning_rate": 1e-05,
"loss": 1.1377,
"mean_token_accuracy": 0.6756360530853271,
"num_tokens": 153871781.0,
"step": 259
},
{
"epoch": 1.2380952380952381,
"grad_norm": 0.7101379632949829,
"learning_rate": 1e-05,
"loss": 1.1396,
"mean_token_accuracy": 0.6745343208312988,
"num_tokens": 154466124.0,
"step": 260
},
{
"epoch": 1.2428571428571429,
"grad_norm": 0.6611591577529907,
"learning_rate": 1e-05,
"loss": 1.1342,
"mean_token_accuracy": 0.675082802772522,
"num_tokens": 155073053.0,
"step": 261
},
{
"epoch": 1.2476190476190476,
"grad_norm": 0.7041805386543274,
"learning_rate": 1e-05,
"loss": 1.1612,
"mean_token_accuracy": 0.6703898906707764,
"num_tokens": 155680694.0,
"step": 262
},
{
"epoch": 1.2523809523809524,
"grad_norm": 0.6518973708152771,
"learning_rate": 1e-05,
"loss": 1.1492,
"mean_token_accuracy": 0.6719495058059692,
"num_tokens": 156279612.0,
"step": 263
},
{
"epoch": 1.2571428571428571,
"grad_norm": 0.6293846368789673,
"learning_rate": 1e-05,
"loss": 1.1381,
"mean_token_accuracy": 0.6761749982833862,
"num_tokens": 156898086.0,
"step": 264
},
{
"epoch": 1.2619047619047619,
"grad_norm": 0.5713494420051575,
"learning_rate": 1e-05,
"loss": 1.1527,
"mean_token_accuracy": 0.6716663837432861,
"num_tokens": 157502996.0,
"step": 265
},
{
"epoch": 1.2666666666666666,
"grad_norm": 0.6561734676361084,
"learning_rate": 1e-05,
"loss": 1.1544,
"mean_token_accuracy": 0.6708611845970154,
"num_tokens": 158107778.0,
"step": 266
},
{
"epoch": 1.2714285714285714,
"grad_norm": 0.5799586772918701,
"learning_rate": 1e-05,
"loss": 1.1177,
"mean_token_accuracy": 0.6797953844070435,
"num_tokens": 158713147.0,
"step": 267
},
{
"epoch": 1.276190476190476,
"grad_norm": 0.5941030979156494,
"learning_rate": 1e-05,
"loss": 1.1255,
"mean_token_accuracy": 0.6776763200759888,
"num_tokens": 159292006.0,
"step": 268
},
{
"epoch": 1.2809523809523808,
"grad_norm": 0.6683588624000549,
"learning_rate": 1e-05,
"loss": 1.1234,
"mean_token_accuracy": 0.6778484582901001,
"num_tokens": 159889197.0,
"step": 269
},
{
"epoch": 1.2857142857142856,
"grad_norm": 0.6561569571495056,
"learning_rate": 1e-05,
"loss": 1.1378,
"mean_token_accuracy": 0.6750425696372986,
"num_tokens": 160485304.0,
"step": 270
},
{
"epoch": 1.2904761904761906,
"grad_norm": 0.5719537138938904,
"learning_rate": 1e-05,
"loss": 1.1404,
"mean_token_accuracy": 0.6747204065322876,
"num_tokens": 161092433.0,
"step": 271
},
{
"epoch": 1.2952380952380953,
"grad_norm": 0.6006868481636047,
"learning_rate": 1e-05,
"loss": 1.1396,
"mean_token_accuracy": 0.6749382019042969,
"num_tokens": 161683555.0,
"step": 272
},
{
"epoch": 1.3,
"grad_norm": 0.6102608442306519,
"learning_rate": 1e-05,
"loss": 1.1293,
"mean_token_accuracy": 0.6775893568992615,
"num_tokens": 162278973.0,
"step": 273
},
{
"epoch": 1.3047619047619048,
"grad_norm": 0.6217197179794312,
"learning_rate": 1e-05,
"loss": 1.1366,
"mean_token_accuracy": 0.6764044165611267,
"num_tokens": 162885270.0,
"step": 274
},
{
"epoch": 1.3095238095238095,
"grad_norm": 0.6187546253204346,
"learning_rate": 1e-05,
"loss": 1.1315,
"mean_token_accuracy": 0.6765252351760864,
"num_tokens": 163476311.0,
"step": 275
},
{
"epoch": 1.3142857142857143,
"grad_norm": 0.5942601561546326,
"learning_rate": 1e-05,
"loss": 1.1455,
"mean_token_accuracy": 0.6730477213859558,
"num_tokens": 164071314.0,
"step": 276
},
{
"epoch": 1.319047619047619,
"grad_norm": 0.5942831635475159,
"learning_rate": 1e-05,
"loss": 1.1321,
"mean_token_accuracy": 0.6764451861381531,
"num_tokens": 164663415.0,
"step": 277
},
{
"epoch": 1.3238095238095238,
"grad_norm": 0.6232311129570007,
"learning_rate": 1e-05,
"loss": 1.1269,
"mean_token_accuracy": 0.6775949597358704,
"num_tokens": 165256997.0,
"step": 278
},
{
"epoch": 1.3285714285714285,
"grad_norm": 0.6126914024353027,
"learning_rate": 1e-05,
"loss": 1.1317,
"mean_token_accuracy": 0.676669716835022,
"num_tokens": 165847922.0,
"step": 279
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.6624312400817871,
"learning_rate": 1e-05,
"loss": 1.1337,
"mean_token_accuracy": 0.6758729815483093,
"num_tokens": 166444541.0,
"step": 280
},
{
"epoch": 1.3380952380952382,
"grad_norm": 0.6634590029716492,
"learning_rate": 1e-05,
"loss": 1.1246,
"mean_token_accuracy": 0.6781991124153137,
"num_tokens": 167028591.0,
"step": 281
},
{
"epoch": 1.342857142857143,
"grad_norm": 0.7142046093940735,
"learning_rate": 1e-05,
"loss": 1.1473,
"mean_token_accuracy": 0.6724534034729004,
"num_tokens": 167627132.0,
"step": 282
},
{
"epoch": 1.3476190476190477,
"grad_norm": 0.5835825800895691,
"learning_rate": 1e-05,
"loss": 1.119,
"mean_token_accuracy": 0.6801720857620239,
"num_tokens": 168226854.0,
"step": 283
},
{
"epoch": 1.3523809523809525,
"grad_norm": 0.7441895008087158,
"learning_rate": 1e-05,
"loss": 1.1508,
"mean_token_accuracy": 0.6721788048744202,
"num_tokens": 168833732.0,
"step": 284
},
{
"epoch": 1.3571428571428572,
"grad_norm": 0.613866925239563,
"learning_rate": 1e-05,
"loss": 1.1263,
"mean_token_accuracy": 0.6785060167312622,
"num_tokens": 169426970.0,
"step": 285
},
{
"epoch": 1.361904761904762,
"grad_norm": 0.7395045161247253,
"learning_rate": 1e-05,
"loss": 1.1504,
"mean_token_accuracy": 0.6733224391937256,
"num_tokens": 170025787.0,
"step": 286
},
{
"epoch": 1.3666666666666667,
"grad_norm": 0.7011858224868774,
"learning_rate": 1e-05,
"loss": 1.1457,
"mean_token_accuracy": 0.6723621487617493,
"num_tokens": 170621857.0,
"step": 287
},
{
"epoch": 1.3714285714285714,
"grad_norm": 0.6301146149635315,
"learning_rate": 1e-05,
"loss": 1.1428,
"mean_token_accuracy": 0.6736270189285278,
"num_tokens": 171220708.0,
"step": 288
},
{
"epoch": 1.3761904761904762,
"grad_norm": 0.6546505093574524,
"learning_rate": 1e-05,
"loss": 1.1527,
"mean_token_accuracy": 0.6708388924598694,
"num_tokens": 171812508.0,
"step": 289
},
{
"epoch": 1.380952380952381,
"grad_norm": 0.665846049785614,
"learning_rate": 1e-05,
"loss": 1.1239,
"mean_token_accuracy": 0.6771635413169861,
"num_tokens": 172401090.0,
"step": 290
},
{
"epoch": 1.3857142857142857,
"grad_norm": 0.6951489448547363,
"learning_rate": 1e-05,
"loss": 1.1303,
"mean_token_accuracy": 0.6767557263374329,
"num_tokens": 172989201.0,
"step": 291
},
{
"epoch": 1.3904761904761904,
"grad_norm": 0.6228903532028198,
"learning_rate": 1e-05,
"loss": 1.1316,
"mean_token_accuracy": 0.6754661798477173,
"num_tokens": 173563807.0,
"step": 292
},
{
"epoch": 1.3952380952380952,
"grad_norm": 0.7011890411376953,
"learning_rate": 1e-05,
"loss": 1.1303,
"mean_token_accuracy": 0.6768910884857178,
"num_tokens": 174159574.0,
"step": 293
},
{
"epoch": 1.4,
"grad_norm": 0.6298404932022095,
"learning_rate": 1e-05,
"loss": 1.1487,
"mean_token_accuracy": 0.672224223613739,
"num_tokens": 174744244.0,
"step": 294
},
{
"epoch": 1.4047619047619047,
"grad_norm": 0.6158511638641357,
"learning_rate": 1e-05,
"loss": 1.1315,
"mean_token_accuracy": 0.6756511926651001,
"num_tokens": 175341946.0,
"step": 295
},
{
"epoch": 1.4095238095238094,
"grad_norm": 0.6887179613113403,
"learning_rate": 1e-05,
"loss": 1.1019,
"mean_token_accuracy": 0.6828951239585876,
"num_tokens": 175904117.0,
"step": 296
},
{
"epoch": 1.4142857142857144,
"grad_norm": 0.64696204662323,
"learning_rate": 1e-05,
"loss": 1.1307,
"mean_token_accuracy": 0.6764581799507141,
"num_tokens": 176493621.0,
"step": 297
},
{
"epoch": 1.4190476190476191,
"grad_norm": 0.5804628133773804,
"learning_rate": 1e-05,
"loss": 1.1316,
"mean_token_accuracy": 0.6758260726928711,
"num_tokens": 177082157.0,
"step": 298
},
{
"epoch": 1.4238095238095239,
"grad_norm": 0.6294459104537964,
"learning_rate": 1e-05,
"loss": 1.1325,
"mean_token_accuracy": 0.6751164197921753,
"num_tokens": 177668681.0,
"step": 299
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.617782711982727,
"learning_rate": 1e-05,
"loss": 1.1352,
"mean_token_accuracy": 0.6748452186584473,
"num_tokens": 178256283.0,
"step": 300
},
{
"epoch": 1.4333333333333333,
"grad_norm": 0.6512781977653503,
"learning_rate": 1e-05,
"loss": 1.1468,
"mean_token_accuracy": 0.6721617579460144,
"num_tokens": 178850673.0,
"step": 301
},
{
"epoch": 1.438095238095238,
"grad_norm": 0.5774661898612976,
"learning_rate": 1e-05,
"loss": 1.1246,
"mean_token_accuracy": 0.6787533760070801,
"num_tokens": 179457871.0,
"step": 302
},
{
"epoch": 1.4428571428571428,
"grad_norm": 0.5992771983146667,
"learning_rate": 1e-05,
"loss": 1.1548,
"mean_token_accuracy": 0.6706414818763733,
"num_tokens": 180064071.0,
"step": 303
},
{
"epoch": 1.4476190476190476,
"grad_norm": 0.5943005681037903,
"learning_rate": 1e-05,
"loss": 1.1106,
"mean_token_accuracy": 0.6806790828704834,
"num_tokens": 180650796.0,
"step": 304
},
{
"epoch": 1.4523809523809523,
"grad_norm": 0.6455477476119995,
"learning_rate": 1e-05,
"loss": 1.1409,
"mean_token_accuracy": 0.6753484606742859,
"num_tokens": 181246825.0,
"step": 305
},
{
"epoch": 1.457142857142857,
"grad_norm": 0.5515779852867126,
"learning_rate": 1e-05,
"loss": 1.1429,
"mean_token_accuracy": 0.6737354397773743,
"num_tokens": 181855567.0,
"step": 306
},
{
"epoch": 1.461904761904762,
"grad_norm": 0.6088519096374512,
"learning_rate": 1e-05,
"loss": 1.1095,
"mean_token_accuracy": 0.680343508720398,
"num_tokens": 182433911.0,
"step": 307
},
{
"epoch": 1.4666666666666668,
"grad_norm": 0.6310312747955322,
"learning_rate": 1e-05,
"loss": 1.1307,
"mean_token_accuracy": 0.676478385925293,
"num_tokens": 183023144.0,
"step": 308
},
{
"epoch": 1.4714285714285715,
"grad_norm": 0.6333861947059631,
"learning_rate": 1e-05,
"loss": 1.1225,
"mean_token_accuracy": 0.6778949499130249,
"num_tokens": 183626514.0,
"step": 309
},
{
"epoch": 1.4761904761904763,
"grad_norm": 0.6410499811172485,
"learning_rate": 1e-05,
"loss": 1.1284,
"mean_token_accuracy": 0.6767443418502808,
"num_tokens": 184221439.0,
"step": 310
},
{
"epoch": 1.480952380952381,
"grad_norm": 0.6700615882873535,
"learning_rate": 1e-05,
"loss": 1.134,
"mean_token_accuracy": 0.6758592128753662,
"num_tokens": 184819506.0,
"step": 311
},
{
"epoch": 1.4857142857142858,
"grad_norm": 0.5785894989967346,
"learning_rate": 1e-05,
"loss": 1.1338,
"mean_token_accuracy": 0.6757279634475708,
"num_tokens": 185419019.0,
"step": 312
},
{
"epoch": 1.4904761904761905,
"grad_norm": 0.6253511309623718,
"learning_rate": 1e-05,
"loss": 1.1212,
"mean_token_accuracy": 0.6801990270614624,
"num_tokens": 186010772.0,
"step": 313
},
{
"epoch": 1.4952380952380953,
"grad_norm": 0.6034374237060547,
"learning_rate": 1e-05,
"loss": 1.1178,
"mean_token_accuracy": 0.6792829036712646,
"num_tokens": 186589243.0,
"step": 314
},
{
"epoch": 1.5,
"grad_norm": 0.6875804662704468,
"learning_rate": 1e-05,
"loss": 1.1165,
"mean_token_accuracy": 0.6799081563949585,
"num_tokens": 187182368.0,
"step": 315
},
{
"epoch": 1.5047619047619047,
"grad_norm": 0.5927019119262695,
"learning_rate": 1e-05,
"loss": 1.1179,
"mean_token_accuracy": 0.6792271733283997,
"num_tokens": 187763428.0,
"step": 316
},
{
"epoch": 1.5095238095238095,
"grad_norm": 0.5725839734077454,
"learning_rate": 1e-05,
"loss": 1.1129,
"mean_token_accuracy": 0.6808658838272095,
"num_tokens": 188359395.0,
"step": 317
},
{
"epoch": 1.5142857142857142,
"grad_norm": 0.6134579181671143,
"learning_rate": 1e-05,
"loss": 1.1329,
"mean_token_accuracy": 0.6752611398696899,
"num_tokens": 188952450.0,
"step": 318
},
{
"epoch": 1.519047619047619,
"grad_norm": 0.5980193018913269,
"learning_rate": 1e-05,
"loss": 1.1282,
"mean_token_accuracy": 0.6765316128730774,
"num_tokens": 189535853.0,
"step": 319
},
{
"epoch": 1.5238095238095237,
"grad_norm": 0.6418870091438293,
"learning_rate": 1e-05,
"loss": 1.1113,
"mean_token_accuracy": 0.6808905601501465,
"num_tokens": 190127386.0,
"step": 320
},
{
"epoch": 1.5285714285714285,
"grad_norm": 0.5932308435440063,
"learning_rate": 1e-05,
"loss": 1.1282,
"mean_token_accuracy": 0.6762252449989319,
"num_tokens": 190718877.0,
"step": 321
},
{
"epoch": 1.5333333333333332,
"grad_norm": 0.6508740782737732,
"learning_rate": 1e-05,
"loss": 1.1504,
"mean_token_accuracy": 0.6717185974121094,
"num_tokens": 191320553.0,
"step": 322
},
{
"epoch": 1.538095238095238,
"grad_norm": 0.6029355525970459,
"learning_rate": 1e-05,
"loss": 1.1219,
"mean_token_accuracy": 0.6790941953659058,
"num_tokens": 191911786.0,
"step": 323
},
{
"epoch": 1.5428571428571427,
"grad_norm": 0.5820804834365845,
"learning_rate": 1e-05,
"loss": 1.1483,
"mean_token_accuracy": 0.6729787588119507,
"num_tokens": 192517254.0,
"step": 324
},
{
"epoch": 1.5476190476190477,
"grad_norm": 0.6086446642875671,
"learning_rate": 1e-05,
"loss": 1.1438,
"mean_token_accuracy": 0.6730492115020752,
"num_tokens": 193113713.0,
"step": 325
},
{
"epoch": 1.5523809523809524,
"grad_norm": 0.6287596821784973,
"learning_rate": 1e-05,
"loss": 1.1255,
"mean_token_accuracy": 0.6779239177703857,
"num_tokens": 193718335.0,
"step": 326
},
{
"epoch": 1.5571428571428572,
"grad_norm": 0.6495358347892761,
"learning_rate": 1e-05,
"loss": 1.1267,
"mean_token_accuracy": 0.6764586567878723,
"num_tokens": 194303328.0,
"step": 327
},
{
"epoch": 1.561904761904762,
"grad_norm": 0.6034678816795349,
"learning_rate": 1e-05,
"loss": 1.1204,
"mean_token_accuracy": 0.6789346933364868,
"num_tokens": 194886509.0,
"step": 328
},
{
"epoch": 1.5666666666666667,
"grad_norm": 0.6537843346595764,
"learning_rate": 1e-05,
"loss": 1.1269,
"mean_token_accuracy": 0.678215742111206,
"num_tokens": 195456896.0,
"step": 329
},
{
"epoch": 1.5714285714285714,
"grad_norm": 0.5981965661048889,
"learning_rate": 1e-05,
"loss": 1.1237,
"mean_token_accuracy": 0.6771047115325928,
"num_tokens": 196053871.0,
"step": 330
},
{
"epoch": 1.5761904761904761,
"grad_norm": 0.7181389331817627,
"learning_rate": 1e-05,
"loss": 1.1236,
"mean_token_accuracy": 0.6774399280548096,
"num_tokens": 196654732.0,
"step": 331
},
{
"epoch": 1.580952380952381,
"grad_norm": 0.6066569089889526,
"learning_rate": 1e-05,
"loss": 1.1124,
"mean_token_accuracy": 0.6811067461967468,
"num_tokens": 197242864.0,
"step": 332
},
{
"epoch": 1.5857142857142859,
"grad_norm": 0.7779151797294617,
"learning_rate": 1e-05,
"loss": 1.1153,
"mean_token_accuracy": 0.6798511743545532,
"num_tokens": 197840214.0,
"step": 333
},
{
"epoch": 1.5904761904761906,
"grad_norm": 0.5971040725708008,
"learning_rate": 1e-05,
"loss": 1.1177,
"mean_token_accuracy": 0.6795299649238586,
"num_tokens": 198440572.0,
"step": 334
},
{
"epoch": 1.5952380952380953,
"grad_norm": 0.6526306867599487,
"learning_rate": 1e-05,
"loss": 1.1134,
"mean_token_accuracy": 0.6805366277694702,
"num_tokens": 199039184.0,
"step": 335
},
{
"epoch": 1.6,
"grad_norm": 0.622909426689148,
"learning_rate": 1e-05,
"loss": 1.1139,
"mean_token_accuracy": 0.6792494058609009,
"num_tokens": 199626548.0,
"step": 336
},
{
"epoch": 1.6047619047619048,
"grad_norm": 0.6684408187866211,
"learning_rate": 1e-05,
"loss": 1.128,
"mean_token_accuracy": 0.6774076819419861,
"num_tokens": 200222258.0,
"step": 337
},
{
"epoch": 1.6095238095238096,
"grad_norm": 0.5934977531433105,
"learning_rate": 1e-05,
"loss": 1.1203,
"mean_token_accuracy": 0.6792654991149902,
"num_tokens": 200819172.0,
"step": 338
},
{
"epoch": 1.6142857142857143,
"grad_norm": 0.6164219975471497,
"learning_rate": 1e-05,
"loss": 1.1314,
"mean_token_accuracy": 0.6759560704231262,
"num_tokens": 201413549.0,
"step": 339
},
{
"epoch": 1.619047619047619,
"grad_norm": 0.6061872839927673,
"learning_rate": 1e-05,
"loss": 1.1162,
"mean_token_accuracy": 0.6795899868011475,
"num_tokens": 202014069.0,
"step": 340
},
{
"epoch": 1.6238095238095238,
"grad_norm": 0.6192796230316162,
"learning_rate": 1e-05,
"loss": 1.1476,
"mean_token_accuracy": 0.6721718311309814,
"num_tokens": 202600379.0,
"step": 341
},
{
"epoch": 1.6285714285714286,
"grad_norm": 0.6233608722686768,
"learning_rate": 1e-05,
"loss": 1.1226,
"mean_token_accuracy": 0.6779032945632935,
"num_tokens": 203203233.0,
"step": 342
},
{
"epoch": 1.6333333333333333,
"grad_norm": 0.5831724405288696,
"learning_rate": 1e-05,
"loss": 1.1159,
"mean_token_accuracy": 0.6793074607849121,
"num_tokens": 203802554.0,
"step": 343
},
{
"epoch": 1.638095238095238,
"grad_norm": 0.6623408794403076,
"learning_rate": 1e-05,
"loss": 1.1296,
"mean_token_accuracy": 0.677479088306427,
"num_tokens": 204395560.0,
"step": 344
},
{
"epoch": 1.6428571428571428,
"grad_norm": 0.5827105045318604,
"learning_rate": 1e-05,
"loss": 1.113,
"mean_token_accuracy": 0.6808111071586609,
"num_tokens": 205001404.0,
"step": 345
},
{
"epoch": 1.6476190476190475,
"grad_norm": 0.5602775812149048,
"learning_rate": 1e-05,
"loss": 1.1066,
"mean_token_accuracy": 0.6823267936706543,
"num_tokens": 205599855.0,
"step": 346
},
{
"epoch": 1.6523809523809523,
"grad_norm": 0.6435489654541016,
"learning_rate": 1e-05,
"loss": 1.1124,
"mean_token_accuracy": 0.6803141832351685,
"num_tokens": 206163338.0,
"step": 347
},
{
"epoch": 1.657142857142857,
"grad_norm": 0.5933458209037781,
"learning_rate": 1e-05,
"loss": 1.137,
"mean_token_accuracy": 0.6748683452606201,
"num_tokens": 206741397.0,
"step": 348
},
{
"epoch": 1.6619047619047618,
"grad_norm": 0.5775367021560669,
"learning_rate": 1e-05,
"loss": 1.1298,
"mean_token_accuracy": 0.6758445501327515,
"num_tokens": 207323297.0,
"step": 349
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.5773342251777649,
"learning_rate": 1e-05,
"loss": 1.1294,
"mean_token_accuracy": 0.6764418482780457,
"num_tokens": 207908589.0,
"step": 350
},
{
"epoch": 1.6714285714285713,
"grad_norm": 0.6353156566619873,
"learning_rate": 1e-05,
"loss": 1.1403,
"mean_token_accuracy": 0.6732203960418701,
"num_tokens": 208500281.0,
"step": 351
},
{
"epoch": 1.6761904761904762,
"grad_norm": 0.5841516852378845,
"learning_rate": 1e-05,
"loss": 1.1201,
"mean_token_accuracy": 0.6789692640304565,
"num_tokens": 209097427.0,
"step": 352
},
{
"epoch": 1.680952380952381,
"grad_norm": 0.5935720205307007,
"learning_rate": 1e-05,
"loss": 1.1217,
"mean_token_accuracy": 0.6778074502944946,
"num_tokens": 209704225.0,
"step": 353
},
{
"epoch": 1.6857142857142857,
"grad_norm": 0.6088152527809143,
"learning_rate": 1e-05,
"loss": 1.1177,
"mean_token_accuracy": 0.6796123385429382,
"num_tokens": 210313267.0,
"step": 354
},
{
"epoch": 1.6904761904761905,
"grad_norm": 0.5818439722061157,
"learning_rate": 1e-05,
"loss": 1.1273,
"mean_token_accuracy": 0.6770058870315552,
"num_tokens": 210918588.0,
"step": 355
},
{
"epoch": 1.6952380952380952,
"grad_norm": 0.6217803955078125,
"learning_rate": 1e-05,
"loss": 1.1268,
"mean_token_accuracy": 0.6772897243499756,
"num_tokens": 211508221.0,
"step": 356
},
{
"epoch": 1.7,
"grad_norm": 0.5793229937553406,
"learning_rate": 1e-05,
"loss": 1.1359,
"mean_token_accuracy": 0.6748672127723694,
"num_tokens": 212108728.0,
"step": 357
},
{
"epoch": 1.704761904761905,
"grad_norm": 0.5839233994483948,
"learning_rate": 1e-05,
"loss": 1.1226,
"mean_token_accuracy": 0.6776269674301147,
"num_tokens": 212705437.0,
"step": 358
},
{
"epoch": 1.7095238095238097,
"grad_norm": 0.6158073544502258,
"learning_rate": 1e-05,
"loss": 1.1324,
"mean_token_accuracy": 0.6745504140853882,
"num_tokens": 213300176.0,
"step": 359
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.6093515753746033,
"learning_rate": 1e-05,
"loss": 1.132,
"mean_token_accuracy": 0.6751940250396729,
"num_tokens": 213890731.0,
"step": 360
},
{
"epoch": 1.7190476190476192,
"grad_norm": 0.629436194896698,
"learning_rate": 1e-05,
"loss": 1.1147,
"mean_token_accuracy": 0.6785677075386047,
"num_tokens": 214471137.0,
"step": 361
},
{
"epoch": 1.723809523809524,
"grad_norm": 0.6373199820518494,
"learning_rate": 1e-05,
"loss": 1.1169,
"mean_token_accuracy": 0.6792606115341187,
"num_tokens": 215062165.0,
"step": 362
},
{
"epoch": 1.7285714285714286,
"grad_norm": 0.5850217938423157,
"learning_rate": 1e-05,
"loss": 1.1264,
"mean_token_accuracy": 0.6766684055328369,
"num_tokens": 215662977.0,
"step": 363
},
{
"epoch": 1.7333333333333334,
"grad_norm": 0.676506757736206,
"learning_rate": 1e-05,
"loss": 1.1328,
"mean_token_accuracy": 0.6750730276107788,
"num_tokens": 216253942.0,
"step": 364
},
{
"epoch": 1.7380952380952381,
"grad_norm": 0.5996358394622803,
"learning_rate": 1e-05,
"loss": 1.1234,
"mean_token_accuracy": 0.6771166920661926,
"num_tokens": 216847247.0,
"step": 365
},
{
"epoch": 1.7428571428571429,
"grad_norm": 0.604375422000885,
"learning_rate": 1e-05,
"loss": 1.1111,
"mean_token_accuracy": 0.6807925701141357,
"num_tokens": 217427979.0,
"step": 366
},
{
"epoch": 1.7476190476190476,
"grad_norm": 0.6484256386756897,
"learning_rate": 1e-05,
"loss": 1.1149,
"mean_token_accuracy": 0.67896568775177,
"num_tokens": 218020622.0,
"step": 367
},
{
"epoch": 1.7523809523809524,
"grad_norm": 0.5445154905319214,
"learning_rate": 1e-05,
"loss": 1.1238,
"mean_token_accuracy": 0.677640438079834,
"num_tokens": 218613768.0,
"step": 368
},
{
"epoch": 1.7571428571428571,
"grad_norm": 0.5835940837860107,
"learning_rate": 1e-05,
"loss": 1.1352,
"mean_token_accuracy": 0.6746830940246582,
"num_tokens": 219217863.0,
"step": 369
},
{
"epoch": 1.7619047619047619,
"grad_norm": 0.6108807325363159,
"learning_rate": 1e-05,
"loss": 1.1245,
"mean_token_accuracy": 0.6771240234375,
"num_tokens": 219826128.0,
"step": 370
},
{
"epoch": 1.7666666666666666,
"grad_norm": 0.5301618576049805,
"learning_rate": 1e-05,
"loss": 1.1193,
"mean_token_accuracy": 0.6791725158691406,
"num_tokens": 220424737.0,
"step": 371
},
{
"epoch": 1.7714285714285714,
"grad_norm": 0.567722737789154,
"learning_rate": 1e-05,
"loss": 1.1241,
"mean_token_accuracy": 0.6771541833877563,
"num_tokens": 221010073.0,
"step": 372
},
{
"epoch": 1.776190476190476,
"grad_norm": 0.6946297883987427,
"learning_rate": 1e-05,
"loss": 1.1205,
"mean_token_accuracy": 0.678805410861969,
"num_tokens": 221614799.0,
"step": 373
},
{
"epoch": 1.7809523809523808,
"grad_norm": 0.5566631555557251,
"learning_rate": 1e-05,
"loss": 1.1185,
"mean_token_accuracy": 0.6782611012458801,
"num_tokens": 222215943.0,
"step": 374
},
{
"epoch": 1.7857142857142856,
"grad_norm": 0.5999249219894409,
"learning_rate": 1e-05,
"loss": 1.114,
"mean_token_accuracy": 0.6802798509597778,
"num_tokens": 222803822.0,
"step": 375
},
{
"epoch": 1.7904761904761903,
"grad_norm": 0.5825783014297485,
"learning_rate": 1e-05,
"loss": 1.1314,
"mean_token_accuracy": 0.6754652261734009,
"num_tokens": 223409541.0,
"step": 376
},
{
"epoch": 1.795238095238095,
"grad_norm": 0.5893160700798035,
"learning_rate": 1e-05,
"loss": 1.1183,
"mean_token_accuracy": 0.6782077550888062,
"num_tokens": 223996446.0,
"step": 377
},
{
"epoch": 1.8,
"grad_norm": 0.5960800051689148,
"learning_rate": 1e-05,
"loss": 1.1203,
"mean_token_accuracy": 0.678328275680542,
"num_tokens": 224599074.0,
"step": 378
},
{
"epoch": 1.8047619047619048,
"grad_norm": 0.5972325205802917,
"learning_rate": 1e-05,
"loss": 1.1122,
"mean_token_accuracy": 0.6802579760551453,
"num_tokens": 225184557.0,
"step": 379
},
{
"epoch": 1.8095238095238095,
"grad_norm": 0.597683310508728,
"learning_rate": 1e-05,
"loss": 1.1185,
"mean_token_accuracy": 0.6798061728477478,
"num_tokens": 225774197.0,
"step": 380
},
{
"epoch": 1.8142857142857143,
"grad_norm": 0.575453519821167,
"learning_rate": 1e-05,
"loss": 1.1072,
"mean_token_accuracy": 0.6810543537139893,
"num_tokens": 226359063.0,
"step": 381
},
{
"epoch": 1.819047619047619,
"grad_norm": 0.5560538172721863,
"learning_rate": 1e-05,
"loss": 1.1237,
"mean_token_accuracy": 0.6774187088012695,
"num_tokens": 226962202.0,
"step": 382
},
{
"epoch": 1.8238095238095238,
"grad_norm": 0.6427722573280334,
"learning_rate": 1e-05,
"loss": 1.1218,
"mean_token_accuracy": 0.677949070930481,
"num_tokens": 227541002.0,
"step": 383
},
{
"epoch": 1.8285714285714287,
"grad_norm": 0.6143935322761536,
"learning_rate": 1e-05,
"loss": 1.1221,
"mean_token_accuracy": 0.6778963804244995,
"num_tokens": 228134124.0,
"step": 384
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.6365751624107361,
"learning_rate": 1e-05,
"loss": 1.112,
"mean_token_accuracy": 0.6797761917114258,
"num_tokens": 228729717.0,
"step": 385
},
{
"epoch": 1.8380952380952382,
"grad_norm": 0.719041109085083,
"learning_rate": 1e-05,
"loss": 1.1178,
"mean_token_accuracy": 0.6780564785003662,
"num_tokens": 229318931.0,
"step": 386
},
{
"epoch": 1.842857142857143,
"grad_norm": 0.6031278967857361,
"learning_rate": 1e-05,
"loss": 1.1246,
"mean_token_accuracy": 0.6776800155639648,
"num_tokens": 229923675.0,
"step": 387
},
{
"epoch": 1.8476190476190477,
"grad_norm": 0.6627750396728516,
"learning_rate": 1e-05,
"loss": 1.1149,
"mean_token_accuracy": 0.6797564029693604,
"num_tokens": 230514254.0,
"step": 388
},
{
"epoch": 1.8523809523809525,
"grad_norm": 0.576654314994812,
"learning_rate": 1e-05,
"loss": 1.1228,
"mean_token_accuracy": 0.6780418157577515,
"num_tokens": 231113801.0,
"step": 389
},
{
"epoch": 1.8571428571428572,
"grad_norm": 0.6316273212432861,
"learning_rate": 1e-05,
"loss": 1.1119,
"mean_token_accuracy": 0.6792047023773193,
"num_tokens": 231709098.0,
"step": 390
},
{
"epoch": 1.861904761904762,
"grad_norm": 0.5546997785568237,
"learning_rate": 1e-05,
"loss": 1.1247,
"mean_token_accuracy": 0.6769775748252869,
"num_tokens": 232311276.0,
"step": 391
},
{
"epoch": 1.8666666666666667,
"grad_norm": 0.617088794708252,
"learning_rate": 1e-05,
"loss": 1.113,
"mean_token_accuracy": 0.6795423030853271,
"num_tokens": 232904607.0,
"step": 392
},
{
"epoch": 1.8714285714285714,
"grad_norm": 0.611702561378479,
"learning_rate": 1e-05,
"loss": 1.1057,
"mean_token_accuracy": 0.6821488738059998,
"num_tokens": 233493254.0,
"step": 393
},
{
"epoch": 1.8761904761904762,
"grad_norm": 0.6276193261146545,
"learning_rate": 1e-05,
"loss": 1.1154,
"mean_token_accuracy": 0.6800172328948975,
"num_tokens": 234085431.0,
"step": 394
},
{
"epoch": 1.880952380952381,
"grad_norm": 0.6570289731025696,
"learning_rate": 1e-05,
"loss": 1.1245,
"mean_token_accuracy": 0.676892876625061,
"num_tokens": 234685396.0,
"step": 395
},
{
"epoch": 1.8857142857142857,
"grad_norm": 0.6350821256637573,
"learning_rate": 1e-05,
"loss": 1.1253,
"mean_token_accuracy": 0.6770513653755188,
"num_tokens": 235280049.0,
"step": 396
},
{
"epoch": 1.8904761904761904,
"grad_norm": 0.6419028639793396,
"learning_rate": 1e-05,
"loss": 1.1258,
"mean_token_accuracy": 0.6772979497909546,
"num_tokens": 235867276.0,
"step": 397
},
{
"epoch": 1.8952380952380952,
"grad_norm": 0.6098426580429077,
"learning_rate": 1e-05,
"loss": 1.1126,
"mean_token_accuracy": 0.6804271936416626,
"num_tokens": 236448647.0,
"step": 398
},
{
"epoch": 1.9,
"grad_norm": 0.5854616165161133,
"learning_rate": 1e-05,
"loss": 1.1256,
"mean_token_accuracy": 0.6755622625350952,
"num_tokens": 237054180.0,
"step": 399
},
{
"epoch": 1.9047619047619047,
"grad_norm": 0.6416271328926086,
"learning_rate": 1e-05,
"loss": 1.1394,
"mean_token_accuracy": 0.6737023591995239,
"num_tokens": 237658218.0,
"step": 400
},
{
"epoch": 1.9095238095238094,
"grad_norm": 0.5833379626274109,
"learning_rate": 1e-05,
"loss": 1.1223,
"mean_token_accuracy": 0.6782907247543335,
"num_tokens": 238248393.0,
"step": 401
},
{
"epoch": 1.9142857142857141,
"grad_norm": 0.6798136830329895,
"learning_rate": 1e-05,
"loss": 1.1096,
"mean_token_accuracy": 0.6816190481185913,
"num_tokens": 238838548.0,
"step": 402
},
{
"epoch": 1.919047619047619,
"grad_norm": 0.5994821786880493,
"learning_rate": 1e-05,
"loss": 1.1154,
"mean_token_accuracy": 0.6799057722091675,
"num_tokens": 239442502.0,
"step": 403
},
{
"epoch": 1.9238095238095239,
"grad_norm": 0.6224843263626099,
"learning_rate": 1e-05,
"loss": 1.1273,
"mean_token_accuracy": 0.6760965585708618,
"num_tokens": 240029019.0,
"step": 404
},
{
"epoch": 1.9285714285714286,
"grad_norm": 0.6100861430168152,
"learning_rate": 1e-05,
"loss": 1.1134,
"mean_token_accuracy": 0.6803538799285889,
"num_tokens": 240623504.0,
"step": 405
},
{
"epoch": 1.9333333333333333,
"grad_norm": 0.6026962399482727,
"learning_rate": 1e-05,
"loss": 1.1022,
"mean_token_accuracy": 0.6817559599876404,
"num_tokens": 241217102.0,
"step": 406
},
{
"epoch": 1.938095238095238,
"grad_norm": 0.6529442667961121,
"learning_rate": 1e-05,
"loss": 1.1141,
"mean_token_accuracy": 0.6804797649383545,
"num_tokens": 241812222.0,
"step": 407
},
{
"epoch": 1.9428571428571428,
"grad_norm": 0.6519430875778198,
"learning_rate": 1e-05,
"loss": 1.1085,
"mean_token_accuracy": 0.680460512638092,
"num_tokens": 242388669.0,
"step": 408
},
{
"epoch": 1.9476190476190476,
"grad_norm": 0.7020300626754761,
"learning_rate": 1e-05,
"loss": 1.1111,
"mean_token_accuracy": 0.6802721619606018,
"num_tokens": 242965686.0,
"step": 409
},
{
"epoch": 1.9523809523809523,
"grad_norm": 0.6024628281593323,
"learning_rate": 1e-05,
"loss": 1.1183,
"mean_token_accuracy": 0.6792590022087097,
"num_tokens": 243553853.0,
"step": 410
},
{
"epoch": 1.9571428571428573,
"grad_norm": 0.7494162321090698,
"learning_rate": 1e-05,
"loss": 1.1135,
"mean_token_accuracy": 0.6788877844810486,
"num_tokens": 244139754.0,
"step": 411
},
{
"epoch": 1.961904761904762,
"grad_norm": 0.6602755188941956,
"learning_rate": 1e-05,
"loss": 1.1176,
"mean_token_accuracy": 0.6777335405349731,
"num_tokens": 244736423.0,
"step": 412
},
{
"epoch": 1.9666666666666668,
"grad_norm": 0.7016980051994324,
"learning_rate": 1e-05,
"loss": 1.1186,
"mean_token_accuracy": 0.6798810362815857,
"num_tokens": 245337682.0,
"step": 413
},
{
"epoch": 1.9714285714285715,
"grad_norm": 0.6483145356178284,
"learning_rate": 1e-05,
"loss": 1.1172,
"mean_token_accuracy": 0.67914879322052,
"num_tokens": 245952105.0,
"step": 414
},
{
"epoch": 1.9761904761904763,
"grad_norm": 0.678092896938324,
"learning_rate": 1e-05,
"loss": 1.1039,
"mean_token_accuracy": 0.6819472312927246,
"num_tokens": 246540999.0,
"step": 415
},
{
"epoch": 1.980952380952381,
"grad_norm": 0.7507527470588684,
"learning_rate": 1e-05,
"loss": 1.1103,
"mean_token_accuracy": 0.6814316511154175,
"num_tokens": 247142303.0,
"step": 416
},
{
"epoch": 1.9857142857142858,
"grad_norm": 0.625765323638916,
"learning_rate": 1e-05,
"loss": 1.108,
"mean_token_accuracy": 0.6812607049942017,
"num_tokens": 247732988.0,
"step": 417
},
{
"epoch": 1.9904761904761905,
"grad_norm": 0.6421918869018555,
"learning_rate": 1e-05,
"loss": 1.1022,
"mean_token_accuracy": 0.681933581829071,
"num_tokens": 248334744.0,
"step": 418
},
{
"epoch": 1.9952380952380953,
"grad_norm": 0.6160528659820557,
"learning_rate": 1e-05,
"loss": 1.1133,
"mean_token_accuracy": 0.6797480583190918,
"num_tokens": 248930347.0,
"step": 419
},
{
"epoch": 2.0,
"grad_norm": 0.703513503074646,
"learning_rate": 1e-05,
"loss": 1.129,
"mean_token_accuracy": 0.676598846912384,
"num_tokens": 249522093.0,
"step": 420
},
{
"epoch": 2.0047619047619047,
"grad_norm": 0.7784668207168579,
"learning_rate": 1e-05,
"loss": 1.0772,
"mean_token_accuracy": 0.687883734703064,
"num_tokens": 250112163.0,
"step": 421
},
{
"epoch": 2.0095238095238095,
"grad_norm": 0.7685954570770264,
"learning_rate": 1e-05,
"loss": 1.065,
"mean_token_accuracy": 0.6909651160240173,
"num_tokens": 250690466.0,
"step": 422
},
{
"epoch": 2.0142857142857142,
"grad_norm": 0.5822970867156982,
"learning_rate": 1e-05,
"loss": 1.0678,
"mean_token_accuracy": 0.68952476978302,
"num_tokens": 251279220.0,
"step": 423
},
{
"epoch": 2.019047619047619,
"grad_norm": 0.8003807663917542,
"learning_rate": 1e-05,
"loss": 1.071,
"mean_token_accuracy": 0.6891335248947144,
"num_tokens": 251871717.0,
"step": 424
},
{
"epoch": 2.0238095238095237,
"grad_norm": 0.6656951904296875,
"learning_rate": 1e-05,
"loss": 1.0907,
"mean_token_accuracy": 0.6843781471252441,
"num_tokens": 252474129.0,
"step": 425
},
{
"epoch": 2.0285714285714285,
"grad_norm": 0.662339448928833,
"learning_rate": 1e-05,
"loss": 1.0518,
"mean_token_accuracy": 0.693079948425293,
"num_tokens": 253069065.0,
"step": 426
},
{
"epoch": 2.033333333333333,
"grad_norm": 0.6397184729576111,
"learning_rate": 1e-05,
"loss": 1.079,
"mean_token_accuracy": 0.6863641738891602,
"num_tokens": 253654392.0,
"step": 427
},
{
"epoch": 2.038095238095238,
"grad_norm": 0.6415942907333374,
"learning_rate": 1e-05,
"loss": 1.0688,
"mean_token_accuracy": 0.6888935565948486,
"num_tokens": 254245168.0,
"step": 428
},
{
"epoch": 2.0428571428571427,
"grad_norm": 0.6560488939285278,
"learning_rate": 1e-05,
"loss": 1.0824,
"mean_token_accuracy": 0.685175895690918,
"num_tokens": 254841132.0,
"step": 429
},
{
"epoch": 2.0476190476190474,
"grad_norm": 0.5839130878448486,
"learning_rate": 1e-05,
"loss": 1.0676,
"mean_token_accuracy": 0.6896021366119385,
"num_tokens": 255433793.0,
"step": 430
},
{
"epoch": 2.052380952380952,
"grad_norm": 0.7360151410102844,
"learning_rate": 1e-05,
"loss": 1.0658,
"mean_token_accuracy": 0.6903011798858643,
"num_tokens": 256015225.0,
"step": 431
},
{
"epoch": 2.057142857142857,
"grad_norm": 0.633699893951416,
"learning_rate": 1e-05,
"loss": 1.0638,
"mean_token_accuracy": 0.6905348300933838,
"num_tokens": 256617333.0,
"step": 432
},
{
"epoch": 2.0619047619047617,
"grad_norm": 0.6784190535545349,
"learning_rate": 1e-05,
"loss": 1.0913,
"mean_token_accuracy": 0.6846885681152344,
"num_tokens": 257214120.0,
"step": 433
},
{
"epoch": 2.066666666666667,
"grad_norm": 0.6749794483184814,
"learning_rate": 1e-05,
"loss": 1.0707,
"mean_token_accuracy": 0.6884802579879761,
"num_tokens": 257807506.0,
"step": 434
},
{
"epoch": 2.0714285714285716,
"grad_norm": 0.6474000811576843,
"learning_rate": 1e-05,
"loss": 1.0873,
"mean_token_accuracy": 0.6844640970230103,
"num_tokens": 258414225.0,
"step": 435
},
{
"epoch": 2.0761904761904764,
"grad_norm": 0.6300811171531677,
"learning_rate": 1e-05,
"loss": 1.0684,
"mean_token_accuracy": 0.688113808631897,
"num_tokens": 259009305.0,
"step": 436
},
{
"epoch": 2.080952380952381,
"grad_norm": 0.6160655617713928,
"learning_rate": 1e-05,
"loss": 1.0626,
"mean_token_accuracy": 0.690530002117157,
"num_tokens": 259600283.0,
"step": 437
},
{
"epoch": 2.085714285714286,
"grad_norm": 0.5936851501464844,
"learning_rate": 1e-05,
"loss": 1.0876,
"mean_token_accuracy": 0.6846874952316284,
"num_tokens": 260203907.0,
"step": 438
},
{
"epoch": 2.0904761904761906,
"grad_norm": 0.6563723683357239,
"learning_rate": 1e-05,
"loss": 1.0823,
"mean_token_accuracy": 0.6850008964538574,
"num_tokens": 260795356.0,
"step": 439
},
{
"epoch": 2.0952380952380953,
"grad_norm": 0.6244327425956726,
"learning_rate": 1e-05,
"loss": 1.0595,
"mean_token_accuracy": 0.6918923854827881,
"num_tokens": 261376414.0,
"step": 440
},
{
"epoch": 2.1,
"grad_norm": 0.6768208146095276,
"learning_rate": 1e-05,
"loss": 1.0854,
"mean_token_accuracy": 0.6857748031616211,
"num_tokens": 261965206.0,
"step": 441
},
{
"epoch": 2.104761904761905,
"grad_norm": 0.6261032819747925,
"learning_rate": 1e-05,
"loss": 1.0792,
"mean_token_accuracy": 0.6868791580200195,
"num_tokens": 262561371.0,
"step": 442
},
{
"epoch": 2.1095238095238096,
"grad_norm": 0.6388991475105286,
"learning_rate": 1e-05,
"loss": 1.068,
"mean_token_accuracy": 0.689771294593811,
"num_tokens": 263159935.0,
"step": 443
},
{
"epoch": 2.1142857142857143,
"grad_norm": 0.6453383564949036,
"learning_rate": 1e-05,
"loss": 1.0803,
"mean_token_accuracy": 0.6851140260696411,
"num_tokens": 263754299.0,
"step": 444
},
{
"epoch": 2.119047619047619,
"grad_norm": 0.6248214244842529,
"learning_rate": 1e-05,
"loss": 1.0792,
"mean_token_accuracy": 0.686503529548645,
"num_tokens": 264354630.0,
"step": 445
},
{
"epoch": 2.123809523809524,
"grad_norm": 0.6909031271934509,
"learning_rate": 1e-05,
"loss": 1.0995,
"mean_token_accuracy": 0.681476891040802,
"num_tokens": 264964353.0,
"step": 446
},
{
"epoch": 2.1285714285714286,
"grad_norm": 0.6381927132606506,
"learning_rate": 1e-05,
"loss": 1.0816,
"mean_token_accuracy": 0.6861193180084229,
"num_tokens": 265561411.0,
"step": 447
},
{
"epoch": 2.1333333333333333,
"grad_norm": 0.669456958770752,
"learning_rate": 1e-05,
"loss": 1.073,
"mean_token_accuracy": 0.6876237392425537,
"num_tokens": 266155790.0,
"step": 448
},
{
"epoch": 2.138095238095238,
"grad_norm": 0.6266065239906311,
"learning_rate": 1e-05,
"loss": 1.0788,
"mean_token_accuracy": 0.6870714426040649,
"num_tokens": 266757269.0,
"step": 449
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.6428273916244507,
"learning_rate": 1e-05,
"loss": 1.0942,
"mean_token_accuracy": 0.6831685304641724,
"num_tokens": 267369143.0,
"step": 450
},
{
"epoch": 2.1476190476190475,
"grad_norm": 0.6169470548629761,
"learning_rate": 1e-05,
"loss": 1.0619,
"mean_token_accuracy": 0.6913155317306519,
"num_tokens": 267965437.0,
"step": 451
},
{
"epoch": 2.1523809523809523,
"grad_norm": 0.6351789832115173,
"learning_rate": 1e-05,
"loss": 1.0713,
"mean_token_accuracy": 0.6888561248779297,
"num_tokens": 268571463.0,
"step": 452
},
{
"epoch": 2.157142857142857,
"grad_norm": 0.6532635688781738,
"learning_rate": 1e-05,
"loss": 1.0698,
"mean_token_accuracy": 0.6889727115631104,
"num_tokens": 269157041.0,
"step": 453
},
{
"epoch": 2.1619047619047618,
"grad_norm": 0.5989878177642822,
"learning_rate": 1e-05,
"loss": 1.0682,
"mean_token_accuracy": 0.6890783309936523,
"num_tokens": 269758195.0,
"step": 454
},
{
"epoch": 2.1666666666666665,
"grad_norm": 0.6337672472000122,
"learning_rate": 1e-05,
"loss": 1.0969,
"mean_token_accuracy": 0.6822439432144165,
"num_tokens": 270349613.0,
"step": 455
},
{
"epoch": 2.1714285714285713,
"grad_norm": 0.5972429513931274,
"learning_rate": 1e-05,
"loss": 1.0831,
"mean_token_accuracy": 0.6840848326683044,
"num_tokens": 270947174.0,
"step": 456
},
{
"epoch": 2.176190476190476,
"grad_norm": 0.6298529505729675,
"learning_rate": 1e-05,
"loss": 1.0921,
"mean_token_accuracy": 0.6836713552474976,
"num_tokens": 271549889.0,
"step": 457
},
{
"epoch": 2.1809523809523808,
"grad_norm": 0.574796199798584,
"learning_rate": 1e-05,
"loss": 1.064,
"mean_token_accuracy": 0.6906387805938721,
"num_tokens": 272139782.0,
"step": 458
},
{
"epoch": 2.185714285714286,
"grad_norm": 0.6812316179275513,
"learning_rate": 1e-05,
"loss": 1.0762,
"mean_token_accuracy": 0.687111496925354,
"num_tokens": 272746279.0,
"step": 459
},
{
"epoch": 2.1904761904761907,
"grad_norm": 0.5981315970420837,
"learning_rate": 1e-05,
"loss": 1.0626,
"mean_token_accuracy": 0.6907453536987305,
"num_tokens": 273348449.0,
"step": 460
},
{
"epoch": 2.1952380952380954,
"grad_norm": 0.6438897252082825,
"learning_rate": 1e-05,
"loss": 1.0853,
"mean_token_accuracy": 0.6858918070793152,
"num_tokens": 273949928.0,
"step": 461
},
{
"epoch": 2.2,
"grad_norm": 0.6236709952354431,
"learning_rate": 1e-05,
"loss": 1.0733,
"mean_token_accuracy": 0.6877059936523438,
"num_tokens": 274548070.0,
"step": 462
},
{
"epoch": 2.204761904761905,
"grad_norm": 0.6749060153961182,
"learning_rate": 1e-05,
"loss": 1.0758,
"mean_token_accuracy": 0.6867290735244751,
"num_tokens": 275135656.0,
"step": 463
},
{
"epoch": 2.2095238095238097,
"grad_norm": 0.6628844738006592,
"learning_rate": 1e-05,
"loss": 1.0765,
"mean_token_accuracy": 0.6874538660049438,
"num_tokens": 275740663.0,
"step": 464
},
{
"epoch": 2.2142857142857144,
"grad_norm": 0.5728548169136047,
"learning_rate": 1e-05,
"loss": 1.0754,
"mean_token_accuracy": 0.6882718205451965,
"num_tokens": 276346207.0,
"step": 465
},
{
"epoch": 2.219047619047619,
"grad_norm": 0.6232889294624329,
"learning_rate": 1e-05,
"loss": 1.0752,
"mean_token_accuracy": 0.6872685551643372,
"num_tokens": 276940208.0,
"step": 466
},
{
"epoch": 2.223809523809524,
"grad_norm": 0.6447910070419312,
"learning_rate": 1e-05,
"loss": 1.091,
"mean_token_accuracy": 0.6836293339729309,
"num_tokens": 277539762.0,
"step": 467
},
{
"epoch": 2.2285714285714286,
"grad_norm": 0.6113771796226501,
"learning_rate": 1e-05,
"loss": 1.0865,
"mean_token_accuracy": 0.684094250202179,
"num_tokens": 278136526.0,
"step": 468
},
{
"epoch": 2.2333333333333334,
"grad_norm": 0.6344524025917053,
"learning_rate": 1e-05,
"loss": 1.0772,
"mean_token_accuracy": 0.6870338320732117,
"num_tokens": 278723575.0,
"step": 469
},
{
"epoch": 2.238095238095238,
"grad_norm": 0.6180852055549622,
"learning_rate": 1e-05,
"loss": 1.0544,
"mean_token_accuracy": 0.6927859783172607,
"num_tokens": 279313692.0,
"step": 470
},
{
"epoch": 2.242857142857143,
"grad_norm": 0.6375457644462585,
"learning_rate": 1e-05,
"loss": 1.0869,
"mean_token_accuracy": 0.6847492456436157,
"num_tokens": 279911596.0,
"step": 471
},
{
"epoch": 2.2476190476190476,
"grad_norm": 0.6032583117485046,
"learning_rate": 1e-05,
"loss": 1.0701,
"mean_token_accuracy": 0.6893506050109863,
"num_tokens": 280516626.0,
"step": 472
},
{
"epoch": 2.2523809523809524,
"grad_norm": 0.6571868062019348,
"learning_rate": 1e-05,
"loss": 1.0723,
"mean_token_accuracy": 0.6889193654060364,
"num_tokens": 281109826.0,
"step": 473
},
{
"epoch": 2.257142857142857,
"grad_norm": 0.5816087126731873,
"learning_rate": 1e-05,
"loss": 1.0783,
"mean_token_accuracy": 0.6873452663421631,
"num_tokens": 281705908.0,
"step": 474
},
{
"epoch": 2.261904761904762,
"grad_norm": 0.6110855340957642,
"learning_rate": 1e-05,
"loss": 1.0733,
"mean_token_accuracy": 0.6875293850898743,
"num_tokens": 282295646.0,
"step": 475
},
{
"epoch": 2.2666666666666666,
"grad_norm": 0.5722987055778503,
"learning_rate": 1e-05,
"loss": 1.064,
"mean_token_accuracy": 0.6898794174194336,
"num_tokens": 282882984.0,
"step": 476
},
{
"epoch": 2.2714285714285714,
"grad_norm": 0.5756980776786804,
"learning_rate": 1e-05,
"loss": 1.0705,
"mean_token_accuracy": 0.6888871192932129,
"num_tokens": 283470314.0,
"step": 477
},
{
"epoch": 2.276190476190476,
"grad_norm": 0.6090242862701416,
"learning_rate": 1e-05,
"loss": 1.0729,
"mean_token_accuracy": 0.6876958012580872,
"num_tokens": 284064822.0,
"step": 478
},
{
"epoch": 2.280952380952381,
"grad_norm": 0.551956295967102,
"learning_rate": 1e-05,
"loss": 1.0666,
"mean_token_accuracy": 0.6899924278259277,
"num_tokens": 284659143.0,
"step": 479
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.617386519908905,
"learning_rate": 1e-05,
"loss": 1.0789,
"mean_token_accuracy": 0.6873286366462708,
"num_tokens": 285260603.0,
"step": 480
},
{
"epoch": 2.2904761904761903,
"grad_norm": 0.5895305871963501,
"learning_rate": 1e-05,
"loss": 1.0668,
"mean_token_accuracy": 0.6887931823730469,
"num_tokens": 285858617.0,
"step": 481
},
{
"epoch": 2.295238095238095,
"grad_norm": 0.575018584728241,
"learning_rate": 1e-05,
"loss": 1.0733,
"mean_token_accuracy": 0.6886229515075684,
"num_tokens": 286462909.0,
"step": 482
},
{
"epoch": 2.3,
"grad_norm": 0.680483341217041,
"learning_rate": 1e-05,
"loss": 1.0686,
"mean_token_accuracy": 0.6894232034683228,
"num_tokens": 287057508.0,
"step": 483
},
{
"epoch": 2.3047619047619046,
"grad_norm": 0.6086472868919373,
"learning_rate": 1e-05,
"loss": 1.0784,
"mean_token_accuracy": 0.6863738298416138,
"num_tokens": 287647864.0,
"step": 484
},
{
"epoch": 2.3095238095238093,
"grad_norm": 0.6269891858100891,
"learning_rate": 1e-05,
"loss": 1.0803,
"mean_token_accuracy": 0.6864203810691833,
"num_tokens": 288244654.0,
"step": 485
},
{
"epoch": 2.314285714285714,
"grad_norm": 0.6842952370643616,
"learning_rate": 1e-05,
"loss": 1.0897,
"mean_token_accuracy": 0.684012770652771,
"num_tokens": 288833805.0,
"step": 486
},
{
"epoch": 2.319047619047619,
"grad_norm": 0.5772620439529419,
"learning_rate": 1e-05,
"loss": 1.0728,
"mean_token_accuracy": 0.6879225969314575,
"num_tokens": 289430249.0,
"step": 487
},
{
"epoch": 2.323809523809524,
"grad_norm": 0.6799498796463013,
"learning_rate": 1e-05,
"loss": 1.0737,
"mean_token_accuracy": 0.6892322897911072,
"num_tokens": 290017640.0,
"step": 488
},
{
"epoch": 2.3285714285714287,
"grad_norm": 0.63170325756073,
"learning_rate": 1e-05,
"loss": 1.0694,
"mean_token_accuracy": 0.6884621381759644,
"num_tokens": 290598414.0,
"step": 489
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.6786331534385681,
"learning_rate": 1e-05,
"loss": 1.061,
"mean_token_accuracy": 0.6906882524490356,
"num_tokens": 291181812.0,
"step": 490
},
{
"epoch": 2.3380952380952382,
"grad_norm": 0.6489508748054504,
"learning_rate": 1e-05,
"loss": 1.0747,
"mean_token_accuracy": 0.6877006888389587,
"num_tokens": 291764317.0,
"step": 491
},
{
"epoch": 2.342857142857143,
"grad_norm": 0.6271830797195435,
"learning_rate": 1e-05,
"loss": 1.0809,
"mean_token_accuracy": 0.6854414343833923,
"num_tokens": 292350504.0,
"step": 492
},
{
"epoch": 2.3476190476190477,
"grad_norm": 0.6458184123039246,
"learning_rate": 1e-05,
"loss": 1.0777,
"mean_token_accuracy": 0.6864031553268433,
"num_tokens": 292951776.0,
"step": 493
},
{
"epoch": 2.3523809523809525,
"grad_norm": 0.6648980379104614,
"learning_rate": 1e-05,
"loss": 1.0695,
"mean_token_accuracy": 0.6887059211730957,
"num_tokens": 293532488.0,
"step": 494
},
{
"epoch": 2.357142857142857,
"grad_norm": 0.6425085067749023,
"learning_rate": 1e-05,
"loss": 1.0575,
"mean_token_accuracy": 0.6918837428092957,
"num_tokens": 294121146.0,
"step": 495
},
{
"epoch": 2.361904761904762,
"grad_norm": 0.6645520329475403,
"learning_rate": 1e-05,
"loss": 1.0768,
"mean_token_accuracy": 0.6873211860656738,
"num_tokens": 294726732.0,
"step": 496
},
{
"epoch": 2.3666666666666667,
"grad_norm": 0.6538220047950745,
"learning_rate": 1e-05,
"loss": 1.0682,
"mean_token_accuracy": 0.6892035007476807,
"num_tokens": 295306697.0,
"step": 497
},
{
"epoch": 2.3714285714285714,
"grad_norm": 0.7154629230499268,
"learning_rate": 1e-05,
"loss": 1.0893,
"mean_token_accuracy": 0.6833094358444214,
"num_tokens": 295894972.0,
"step": 498
},
{
"epoch": 2.376190476190476,
"grad_norm": 0.6492322087287903,
"learning_rate": 1e-05,
"loss": 1.0831,
"mean_token_accuracy": 0.6853781938552856,
"num_tokens": 296505345.0,
"step": 499
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.7426714301109314,
"learning_rate": 1e-05,
"loss": 1.0664,
"mean_token_accuracy": 0.6893447637557983,
"num_tokens": 297100909.0,
"step": 500
},
{
"epoch": 2.3857142857142857,
"grad_norm": 0.6399804353713989,
"learning_rate": 1e-05,
"loss": 1.0743,
"mean_token_accuracy": 0.688417375087738,
"num_tokens": 297690267.0,
"step": 501
},
{
"epoch": 2.3904761904761904,
"grad_norm": 0.599839985370636,
"learning_rate": 1e-05,
"loss": 1.0646,
"mean_token_accuracy": 0.6897764801979065,
"num_tokens": 298283484.0,
"step": 502
},
{
"epoch": 2.395238095238095,
"grad_norm": 0.6296051740646362,
"learning_rate": 1e-05,
"loss": 1.0814,
"mean_token_accuracy": 0.685540497303009,
"num_tokens": 298880193.0,
"step": 503
},
{
"epoch": 2.4,
"grad_norm": 0.5922709107398987,
"learning_rate": 1e-05,
"loss": 1.058,
"mean_token_accuracy": 0.6912336349487305,
"num_tokens": 299479995.0,
"step": 504
},
{
"epoch": 2.4047619047619047,
"grad_norm": 0.608103334903717,
"learning_rate": 1e-05,
"loss": 1.0731,
"mean_token_accuracy": 0.6877481937408447,
"num_tokens": 300068384.0,
"step": 505
},
{
"epoch": 2.4095238095238094,
"grad_norm": 0.6003749966621399,
"learning_rate": 1e-05,
"loss": 1.083,
"mean_token_accuracy": 0.6847676038742065,
"num_tokens": 300687274.0,
"step": 506
},
{
"epoch": 2.414285714285714,
"grad_norm": 0.5747948884963989,
"learning_rate": 1e-05,
"loss": 1.075,
"mean_token_accuracy": 0.6867921352386475,
"num_tokens": 301288728.0,
"step": 507
},
{
"epoch": 2.419047619047619,
"grad_norm": 0.6287463307380676,
"learning_rate": 1e-05,
"loss": 1.0698,
"mean_token_accuracy": 0.6888238787651062,
"num_tokens": 301868926.0,
"step": 508
},
{
"epoch": 2.4238095238095236,
"grad_norm": 0.5455256104469299,
"learning_rate": 1e-05,
"loss": 1.0644,
"mean_token_accuracy": 0.690401017665863,
"num_tokens": 302467630.0,
"step": 509
},
{
"epoch": 2.4285714285714284,
"grad_norm": 0.6476891040802002,
"learning_rate": 1e-05,
"loss": 1.09,
"mean_token_accuracy": 0.6842606663703918,
"num_tokens": 303058965.0,
"step": 510
},
{
"epoch": 2.4333333333333336,
"grad_norm": 0.6696739792823792,
"learning_rate": 1e-05,
"loss": 1.091,
"mean_token_accuracy": 0.6829922795295715,
"num_tokens": 303639694.0,
"step": 511
},
{
"epoch": 2.4380952380952383,
"grad_norm": 0.5850697159767151,
"learning_rate": 1e-05,
"loss": 1.0651,
"mean_token_accuracy": 0.6903361082077026,
"num_tokens": 304234504.0,
"step": 512
},
{
"epoch": 2.442857142857143,
"grad_norm": 0.6123826503753662,
"learning_rate": 1e-05,
"loss": 1.0848,
"mean_token_accuracy": 0.6855412125587463,
"num_tokens": 304822484.0,
"step": 513
},
{
"epoch": 2.447619047619048,
"grad_norm": 0.6242313981056213,
"learning_rate": 1e-05,
"loss": 1.069,
"mean_token_accuracy": 0.6895902156829834,
"num_tokens": 305405226.0,
"step": 514
},
{
"epoch": 2.4523809523809526,
"grad_norm": 0.6153740286827087,
"learning_rate": 1e-05,
"loss": 1.0701,
"mean_token_accuracy": 0.6889458298683167,
"num_tokens": 306007153.0,
"step": 515
},
{
"epoch": 2.4571428571428573,
"grad_norm": 0.6674852967262268,
"learning_rate": 1e-05,
"loss": 1.0701,
"mean_token_accuracy": 0.6897221803665161,
"num_tokens": 306588836.0,
"step": 516
},
{
"epoch": 2.461904761904762,
"grad_norm": 0.6560084819793701,
"learning_rate": 1e-05,
"loss": 1.0804,
"mean_token_accuracy": 0.6861131191253662,
"num_tokens": 307200955.0,
"step": 517
},
{
"epoch": 2.466666666666667,
"grad_norm": 0.5911952257156372,
"learning_rate": 1e-05,
"loss": 1.0669,
"mean_token_accuracy": 0.6889474987983704,
"num_tokens": 307799028.0,
"step": 518
},
{
"epoch": 2.4714285714285715,
"grad_norm": 0.6963088512420654,
"learning_rate": 1e-05,
"loss": 1.0965,
"mean_token_accuracy": 0.6822454929351807,
"num_tokens": 308389748.0,
"step": 519
},
{
"epoch": 2.4761904761904763,
"grad_norm": 0.7166724801063538,
"learning_rate": 1e-05,
"loss": 1.0773,
"mean_token_accuracy": 0.6871429681777954,
"num_tokens": 308978715.0,
"step": 520
},
{
"epoch": 2.480952380952381,
"grad_norm": 0.598521888256073,
"learning_rate": 1e-05,
"loss": 1.0756,
"mean_token_accuracy": 0.6871167421340942,
"num_tokens": 309587298.0,
"step": 521
},
{
"epoch": 2.4857142857142858,
"grad_norm": 0.6383949518203735,
"learning_rate": 1e-05,
"loss": 1.0643,
"mean_token_accuracy": 0.6895929574966431,
"num_tokens": 310173585.0,
"step": 522
},
{
"epoch": 2.4904761904761905,
"grad_norm": 0.6667410731315613,
"learning_rate": 1e-05,
"loss": 1.0736,
"mean_token_accuracy": 0.6880219578742981,
"num_tokens": 310760313.0,
"step": 523
},
{
"epoch": 2.4952380952380953,
"grad_norm": 0.6218487620353699,
"learning_rate": 1e-05,
"loss": 1.0764,
"mean_token_accuracy": 0.6872262358665466,
"num_tokens": 311374002.0,
"step": 524
},
{
"epoch": 2.5,
"grad_norm": 0.6058824062347412,
"learning_rate": 1e-05,
"loss": 1.0701,
"mean_token_accuracy": 0.6883900165557861,
"num_tokens": 311952533.0,
"step": 525
},
{
"epoch": 2.5047619047619047,
"grad_norm": 0.6459484100341797,
"learning_rate": 1e-05,
"loss": 1.065,
"mean_token_accuracy": 0.6896857023239136,
"num_tokens": 312542383.0,
"step": 526
},
{
"epoch": 2.5095238095238095,
"grad_norm": 0.6192833781242371,
"learning_rate": 1e-05,
"loss": 1.0745,
"mean_token_accuracy": 0.686732828617096,
"num_tokens": 313136427.0,
"step": 527
},
{
"epoch": 2.5142857142857142,
"grad_norm": 0.602884829044342,
"learning_rate": 1e-05,
"loss": 1.0564,
"mean_token_accuracy": 0.6925665140151978,
"num_tokens": 313731115.0,
"step": 528
},
{
"epoch": 2.519047619047619,
"grad_norm": 0.5805109143257141,
"learning_rate": 1e-05,
"loss": 1.0644,
"mean_token_accuracy": 0.6895827651023865,
"num_tokens": 314316253.0,
"step": 529
},
{
"epoch": 2.5238095238095237,
"grad_norm": 0.6484024524688721,
"learning_rate": 1e-05,
"loss": 1.0634,
"mean_token_accuracy": 0.6902580857276917,
"num_tokens": 314906539.0,
"step": 530
},
{
"epoch": 2.5285714285714285,
"grad_norm": 0.6236498355865479,
"learning_rate": 1e-05,
"loss": 1.0611,
"mean_token_accuracy": 0.6907384991645813,
"num_tokens": 315491005.0,
"step": 531
},
{
"epoch": 2.533333333333333,
"grad_norm": 0.68634432554245,
"learning_rate": 1e-05,
"loss": 1.0759,
"mean_token_accuracy": 0.6861008405685425,
"num_tokens": 316086156.0,
"step": 532
},
{
"epoch": 2.538095238095238,
"grad_norm": 0.6483022570610046,
"learning_rate": 1e-05,
"loss": 1.0809,
"mean_token_accuracy": 0.6863186359405518,
"num_tokens": 316687284.0,
"step": 533
},
{
"epoch": 2.5428571428571427,
"grad_norm": 0.6313026547431946,
"learning_rate": 1e-05,
"loss": 1.065,
"mean_token_accuracy": 0.6903449296951294,
"num_tokens": 317280976.0,
"step": 534
},
{
"epoch": 2.5476190476190474,
"grad_norm": 0.7180777788162231,
"learning_rate": 1e-05,
"loss": 1.072,
"mean_token_accuracy": 0.6879873275756836,
"num_tokens": 317869704.0,
"step": 535
},
{
"epoch": 2.552380952380952,
"grad_norm": 0.6203593611717224,
"learning_rate": 1e-05,
"loss": 1.0754,
"mean_token_accuracy": 0.6873841285705566,
"num_tokens": 318453830.0,
"step": 536
},
{
"epoch": 2.557142857142857,
"grad_norm": 0.7294032573699951,
"learning_rate": 1e-05,
"loss": 1.0816,
"mean_token_accuracy": 0.6853822469711304,
"num_tokens": 319036628.0,
"step": 537
},
{
"epoch": 2.5619047619047617,
"grad_norm": 0.6315251588821411,
"learning_rate": 1e-05,
"loss": 1.0671,
"mean_token_accuracy": 0.6895589828491211,
"num_tokens": 319641680.0,
"step": 538
},
{
"epoch": 2.5666666666666664,
"grad_norm": 0.6481133699417114,
"learning_rate": 1e-05,
"loss": 1.0733,
"mean_token_accuracy": 0.6874011754989624,
"num_tokens": 320235018.0,
"step": 539
},
{
"epoch": 2.571428571428571,
"grad_norm": 0.6537102460861206,
"learning_rate": 1e-05,
"loss": 1.0782,
"mean_token_accuracy": 0.6865108609199524,
"num_tokens": 320839523.0,
"step": 540
},
{
"epoch": 2.576190476190476,
"grad_norm": 0.5990563631057739,
"learning_rate": 1e-05,
"loss": 1.0692,
"mean_token_accuracy": 0.6882610321044922,
"num_tokens": 321445469.0,
"step": 541
},
{
"epoch": 2.580952380952381,
"grad_norm": 0.7251924276351929,
"learning_rate": 1e-05,
"loss": 1.0769,
"mean_token_accuracy": 0.685580849647522,
"num_tokens": 322040382.0,
"step": 542
},
{
"epoch": 2.585714285714286,
"grad_norm": 0.5734168291091919,
"learning_rate": 1e-05,
"loss": 1.0731,
"mean_token_accuracy": 0.6879425048828125,
"num_tokens": 322631980.0,
"step": 543
},
{
"epoch": 2.5904761904761906,
"grad_norm": 0.6524589657783508,
"learning_rate": 1e-05,
"loss": 1.0715,
"mean_token_accuracy": 0.6874139308929443,
"num_tokens": 323217003.0,
"step": 544
},
{
"epoch": 2.5952380952380953,
"grad_norm": 0.6292608976364136,
"learning_rate": 1e-05,
"loss": 1.0751,
"mean_token_accuracy": 0.6870990991592407,
"num_tokens": 323797882.0,
"step": 545
},
{
"epoch": 2.6,
"grad_norm": 0.631439208984375,
"learning_rate": 1e-05,
"loss": 1.0783,
"mean_token_accuracy": 0.685767412185669,
"num_tokens": 324381508.0,
"step": 546
},
{
"epoch": 2.604761904761905,
"grad_norm": 0.621782124042511,
"learning_rate": 1e-05,
"loss": 1.0954,
"mean_token_accuracy": 0.6821581125259399,
"num_tokens": 324979976.0,
"step": 547
},
{
"epoch": 2.6095238095238096,
"grad_norm": 0.6306419372558594,
"learning_rate": 1e-05,
"loss": 1.0677,
"mean_token_accuracy": 0.6885519623756409,
"num_tokens": 325579147.0,
"step": 548
},
{
"epoch": 2.6142857142857143,
"grad_norm": 0.5700802206993103,
"learning_rate": 1e-05,
"loss": 1.0701,
"mean_token_accuracy": 0.6891588568687439,
"num_tokens": 326189724.0,
"step": 549
},
{
"epoch": 2.619047619047619,
"grad_norm": 0.5674880146980286,
"learning_rate": 1e-05,
"loss": 1.0723,
"mean_token_accuracy": 0.6874587535858154,
"num_tokens": 326781040.0,
"step": 550
},
{
"epoch": 2.623809523809524,
"grad_norm": 0.6210941076278687,
"learning_rate": 1e-05,
"loss": 1.066,
"mean_token_accuracy": 0.6903613805770874,
"num_tokens": 327384993.0,
"step": 551
},
{
"epoch": 2.6285714285714286,
"grad_norm": 0.5762701630592346,
"learning_rate": 1e-05,
"loss": 1.0541,
"mean_token_accuracy": 0.6926007866859436,
"num_tokens": 327967527.0,
"step": 552
},
{
"epoch": 2.6333333333333333,
"grad_norm": 0.5869442224502563,
"learning_rate": 1e-05,
"loss": 1.0602,
"mean_token_accuracy": 0.6907045841217041,
"num_tokens": 328556111.0,
"step": 553
},
{
"epoch": 2.638095238095238,
"grad_norm": 0.6561670303344727,
"learning_rate": 1e-05,
"loss": 1.067,
"mean_token_accuracy": 0.6888686418533325,
"num_tokens": 329156419.0,
"step": 554
},
{
"epoch": 2.642857142857143,
"grad_norm": 0.5729210376739502,
"learning_rate": 1e-05,
"loss": 1.0908,
"mean_token_accuracy": 0.6830568313598633,
"num_tokens": 329765795.0,
"step": 555
},
{
"epoch": 2.6476190476190475,
"grad_norm": 0.5583658218383789,
"learning_rate": 1e-05,
"loss": 1.0715,
"mean_token_accuracy": 0.6889873743057251,
"num_tokens": 330366805.0,
"step": 556
},
{
"epoch": 2.6523809523809523,
"grad_norm": 0.6156875491142273,
"learning_rate": 1e-05,
"loss": 1.0597,
"mean_token_accuracy": 0.6899721622467041,
"num_tokens": 330960683.0,
"step": 557
},
{
"epoch": 2.657142857142857,
"grad_norm": 0.5830056667327881,
"learning_rate": 1e-05,
"loss": 1.0766,
"mean_token_accuracy": 0.6871880292892456,
"num_tokens": 331566617.0,
"step": 558
},
{
"epoch": 2.6619047619047618,
"grad_norm": 0.6878387928009033,
"learning_rate": 1e-05,
"loss": 1.0606,
"mean_token_accuracy": 0.6908230781555176,
"num_tokens": 332146780.0,
"step": 559
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.6010000705718994,
"learning_rate": 1e-05,
"loss": 1.073,
"mean_token_accuracy": 0.6884846091270447,
"num_tokens": 332744565.0,
"step": 560
},
{
"epoch": 2.6714285714285713,
"grad_norm": 0.6257455348968506,
"learning_rate": 1e-05,
"loss": 1.0718,
"mean_token_accuracy": 0.6879858374595642,
"num_tokens": 333327246.0,
"step": 561
},
{
"epoch": 2.6761904761904765,
"grad_norm": 0.6111727356910706,
"learning_rate": 1e-05,
"loss": 1.0604,
"mean_token_accuracy": 0.6906289458274841,
"num_tokens": 333924279.0,
"step": 562
},
{
"epoch": 2.680952380952381,
"grad_norm": 0.6363468170166016,
"learning_rate": 1e-05,
"loss": 1.0628,
"mean_token_accuracy": 0.6897515058517456,
"num_tokens": 334526451.0,
"step": 563
},
{
"epoch": 2.685714285714286,
"grad_norm": 0.6247795820236206,
"learning_rate": 1e-05,
"loss": 1.0726,
"mean_token_accuracy": 0.6881762742996216,
"num_tokens": 335121296.0,
"step": 564
},
{
"epoch": 2.6904761904761907,
"grad_norm": 0.7256935238838196,
"learning_rate": 1e-05,
"loss": 1.069,
"mean_token_accuracy": 0.6878950595855713,
"num_tokens": 335705229.0,
"step": 565
},
{
"epoch": 2.6952380952380954,
"grad_norm": 0.6218934655189514,
"learning_rate": 1e-05,
"loss": 1.0745,
"mean_token_accuracy": 0.687312126159668,
"num_tokens": 336296515.0,
"step": 566
},
{
"epoch": 2.7,
"grad_norm": 0.64492267370224,
"learning_rate": 1e-05,
"loss": 1.0772,
"mean_token_accuracy": 0.6871779561042786,
"num_tokens": 336898581.0,
"step": 567
},
{
"epoch": 2.704761904761905,
"grad_norm": 0.6439410448074341,
"learning_rate": 1e-05,
"loss": 1.0849,
"mean_token_accuracy": 0.685498833656311,
"num_tokens": 337492720.0,
"step": 568
},
{
"epoch": 2.7095238095238097,
"grad_norm": 0.5982577204704285,
"learning_rate": 1e-05,
"loss": 1.0685,
"mean_token_accuracy": 0.6890565752983093,
"num_tokens": 338088080.0,
"step": 569
},
{
"epoch": 2.7142857142857144,
"grad_norm": 0.6382868885993958,
"learning_rate": 1e-05,
"loss": 1.0678,
"mean_token_accuracy": 0.6893150806427002,
"num_tokens": 338682863.0,
"step": 570
},
{
"epoch": 2.719047619047619,
"grad_norm": 0.5995696187019348,
"learning_rate": 1e-05,
"loss": 1.0737,
"mean_token_accuracy": 0.6885708570480347,
"num_tokens": 339274595.0,
"step": 571
},
{
"epoch": 2.723809523809524,
"grad_norm": 0.6478890180587769,
"learning_rate": 1e-05,
"loss": 1.0736,
"mean_token_accuracy": 0.687543511390686,
"num_tokens": 339857633.0,
"step": 572
},
{
"epoch": 2.7285714285714286,
"grad_norm": 0.6489014625549316,
"learning_rate": 1e-05,
"loss": 1.0564,
"mean_token_accuracy": 0.6918776035308838,
"num_tokens": 340451043.0,
"step": 573
},
{
"epoch": 2.7333333333333334,
"grad_norm": 0.6406450271606445,
"learning_rate": 1e-05,
"loss": 1.0801,
"mean_token_accuracy": 0.6862790584564209,
"num_tokens": 341042238.0,
"step": 574
},
{
"epoch": 2.738095238095238,
"grad_norm": 0.6261545419692993,
"learning_rate": 1e-05,
"loss": 1.0766,
"mean_token_accuracy": 0.6869131922721863,
"num_tokens": 341644006.0,
"step": 575
},
{
"epoch": 2.742857142857143,
"grad_norm": 0.5907791256904602,
"learning_rate": 1e-05,
"loss": 1.0666,
"mean_token_accuracy": 0.6884465217590332,
"num_tokens": 342238235.0,
"step": 576
},
{
"epoch": 2.7476190476190476,
"grad_norm": 0.638664186000824,
"learning_rate": 1e-05,
"loss": 1.0609,
"mean_token_accuracy": 0.6908861994743347,
"num_tokens": 342831260.0,
"step": 577
},
{
"epoch": 2.7523809523809524,
"grad_norm": 0.6344829797744751,
"learning_rate": 1e-05,
"loss": 1.0762,
"mean_token_accuracy": 0.687269389629364,
"num_tokens": 343427629.0,
"step": 578
},
{
"epoch": 2.757142857142857,
"grad_norm": 0.6150461435317993,
"learning_rate": 1e-05,
"loss": 1.0761,
"mean_token_accuracy": 0.6873693466186523,
"num_tokens": 344021401.0,
"step": 579
},
{
"epoch": 2.761904761904762,
"grad_norm": 0.6308332681655884,
"learning_rate": 1e-05,
"loss": 1.0618,
"mean_token_accuracy": 0.6911748647689819,
"num_tokens": 344610108.0,
"step": 580
},
{
"epoch": 2.7666666666666666,
"grad_norm": 0.55866539478302,
"learning_rate": 1e-05,
"loss": 1.0725,
"mean_token_accuracy": 0.6869944334030151,
"num_tokens": 345217241.0,
"step": 581
},
{
"epoch": 2.7714285714285714,
"grad_norm": 0.638909637928009,
"learning_rate": 1e-05,
"loss": 1.0658,
"mean_token_accuracy": 0.6890157461166382,
"num_tokens": 345804258.0,
"step": 582
},
{
"epoch": 2.776190476190476,
"grad_norm": 0.5688804984092712,
"learning_rate": 1e-05,
"loss": 1.0689,
"mean_token_accuracy": 0.6887319087982178,
"num_tokens": 346399481.0,
"step": 583
},
{
"epoch": 2.780952380952381,
"grad_norm": 0.6002762317657471,
"learning_rate": 1e-05,
"loss": 1.0563,
"mean_token_accuracy": 0.6915134191513062,
"num_tokens": 346997922.0,
"step": 584
},
{
"epoch": 2.7857142857142856,
"grad_norm": 0.6163663864135742,
"learning_rate": 1e-05,
"loss": 1.07,
"mean_token_accuracy": 0.6884576082229614,
"num_tokens": 347597863.0,
"step": 585
},
{
"epoch": 2.7904761904761903,
"grad_norm": 0.580531656742096,
"learning_rate": 1e-05,
"loss": 1.0638,
"mean_token_accuracy": 0.6888343095779419,
"num_tokens": 348201046.0,
"step": 586
},
{
"epoch": 2.795238095238095,
"grad_norm": 0.5918668508529663,
"learning_rate": 1e-05,
"loss": 1.0584,
"mean_token_accuracy": 0.6905962228775024,
"num_tokens": 348787326.0,
"step": 587
},
{
"epoch": 2.8,
"grad_norm": 0.6383691430091858,
"learning_rate": 1e-05,
"loss": 1.0689,
"mean_token_accuracy": 0.6883484125137329,
"num_tokens": 349380645.0,
"step": 588
},
{
"epoch": 2.8047619047619046,
"grad_norm": 0.6115639805793762,
"learning_rate": 1e-05,
"loss": 1.0654,
"mean_token_accuracy": 0.6897737979888916,
"num_tokens": 349983092.0,
"step": 589
},
{
"epoch": 2.8095238095238093,
"grad_norm": 0.6397126317024231,
"learning_rate": 1e-05,
"loss": 1.0617,
"mean_token_accuracy": 0.6903370022773743,
"num_tokens": 350576836.0,
"step": 590
},
{
"epoch": 2.814285714285714,
"grad_norm": 0.6862447261810303,
"learning_rate": 1e-05,
"loss": 1.0624,
"mean_token_accuracy": 0.691243588924408,
"num_tokens": 351172725.0,
"step": 591
},
{
"epoch": 2.819047619047619,
"grad_norm": 0.6518527269363403,
"learning_rate": 1e-05,
"loss": 1.0568,
"mean_token_accuracy": 0.6918639540672302,
"num_tokens": 351777142.0,
"step": 592
},
{
"epoch": 2.8238095238095235,
"grad_norm": 0.7507683634757996,
"learning_rate": 1e-05,
"loss": 1.0576,
"mean_token_accuracy": 0.6908581852912903,
"num_tokens": 352361834.0,
"step": 593
},
{
"epoch": 2.8285714285714287,
"grad_norm": 0.6769391298294067,
"learning_rate": 1e-05,
"loss": 1.0658,
"mean_token_accuracy": 0.6891970038414001,
"num_tokens": 352964892.0,
"step": 594
},
{
"epoch": 2.8333333333333335,
"grad_norm": 0.7207344770431519,
"learning_rate": 1e-05,
"loss": 1.0655,
"mean_token_accuracy": 0.688566267490387,
"num_tokens": 353563696.0,
"step": 595
},
{
"epoch": 2.8380952380952382,
"grad_norm": 0.6687008142471313,
"learning_rate": 1e-05,
"loss": 1.0663,
"mean_token_accuracy": 0.6889446377754211,
"num_tokens": 354162323.0,
"step": 596
},
{
"epoch": 2.842857142857143,
"grad_norm": 0.6510334610939026,
"learning_rate": 1e-05,
"loss": 1.0973,
"mean_token_accuracy": 0.6817850470542908,
"num_tokens": 354763224.0,
"step": 597
},
{
"epoch": 2.8476190476190477,
"grad_norm": 0.6164536476135254,
"learning_rate": 1e-05,
"loss": 1.0599,
"mean_token_accuracy": 0.6904336214065552,
"num_tokens": 355360011.0,
"step": 598
},
{
"epoch": 2.8523809523809525,
"grad_norm": 0.6652323603630066,
"learning_rate": 1e-05,
"loss": 1.0664,
"mean_token_accuracy": 0.6892472505569458,
"num_tokens": 355948770.0,
"step": 599
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.6170997619628906,
"learning_rate": 1e-05,
"loss": 1.0749,
"mean_token_accuracy": 0.686385989189148,
"num_tokens": 356555915.0,
"step": 600
},
{
"epoch": 2.861904761904762,
"grad_norm": 0.5823125839233398,
"learning_rate": 1e-05,
"loss": 1.0762,
"mean_token_accuracy": 0.6866952180862427,
"num_tokens": 357168089.0,
"step": 601
},
{
"epoch": 2.8666666666666667,
"grad_norm": 0.6084815859794617,
"learning_rate": 1e-05,
"loss": 1.0589,
"mean_token_accuracy": 0.6905912160873413,
"num_tokens": 357762209.0,
"step": 602
},
{
"epoch": 2.8714285714285714,
"grad_norm": 0.5347459316253662,
"learning_rate": 1e-05,
"loss": 1.065,
"mean_token_accuracy": 0.6898081302642822,
"num_tokens": 358365632.0,
"step": 603
},
{
"epoch": 2.876190476190476,
"grad_norm": 0.6211216449737549,
"learning_rate": 1e-05,
"loss": 1.0795,
"mean_token_accuracy": 0.6860474944114685,
"num_tokens": 358969038.0,
"step": 604
},
{
"epoch": 2.880952380952381,
"grad_norm": 0.6298102736473083,
"learning_rate": 1e-05,
"loss": 1.0616,
"mean_token_accuracy": 0.691013514995575,
"num_tokens": 359560638.0,
"step": 605
},
{
"epoch": 2.8857142857142857,
"grad_norm": 0.6150857210159302,
"learning_rate": 1e-05,
"loss": 1.0726,
"mean_token_accuracy": 0.6874991655349731,
"num_tokens": 360159047.0,
"step": 606
},
{
"epoch": 2.8904761904761904,
"grad_norm": 0.6256808638572693,
"learning_rate": 1e-05,
"loss": 1.0696,
"mean_token_accuracy": 0.6895867586135864,
"num_tokens": 360752550.0,
"step": 607
},
{
"epoch": 2.895238095238095,
"grad_norm": 0.6338992714881897,
"learning_rate": 1e-05,
"loss": 1.0706,
"mean_token_accuracy": 0.6878842115402222,
"num_tokens": 361348966.0,
"step": 608
},
{
"epoch": 2.9,
"grad_norm": 0.6074673533439636,
"learning_rate": 1e-05,
"loss": 1.0625,
"mean_token_accuracy": 0.690711498260498,
"num_tokens": 361933541.0,
"step": 609
},
{
"epoch": 2.9047619047619047,
"grad_norm": 0.6169112324714661,
"learning_rate": 1e-05,
"loss": 1.0679,
"mean_token_accuracy": 0.6893640756607056,
"num_tokens": 362522689.0,
"step": 610
},
{
"epoch": 2.9095238095238094,
"grad_norm": 0.6712765097618103,
"learning_rate": 1e-05,
"loss": 1.0481,
"mean_token_accuracy": 0.6935627460479736,
"num_tokens": 363107779.0,
"step": 611
},
{
"epoch": 2.914285714285714,
"grad_norm": 0.6030009388923645,
"learning_rate": 1e-05,
"loss": 1.0624,
"mean_token_accuracy": 0.6899582147598267,
"num_tokens": 363690809.0,
"step": 612
},
{
"epoch": 2.919047619047619,
"grad_norm": 0.6335533261299133,
"learning_rate": 1e-05,
"loss": 1.0632,
"mean_token_accuracy": 0.6892010569572449,
"num_tokens": 364279923.0,
"step": 613
},
{
"epoch": 2.923809523809524,
"grad_norm": 0.6299601793289185,
"learning_rate": 1e-05,
"loss": 1.0536,
"mean_token_accuracy": 0.6920279264450073,
"num_tokens": 364846929.0,
"step": 614
},
{
"epoch": 2.928571428571429,
"grad_norm": 0.6494601964950562,
"learning_rate": 1e-05,
"loss": 1.0797,
"mean_token_accuracy": 0.6871404051780701,
"num_tokens": 365427755.0,
"step": 615
},
{
"epoch": 2.9333333333333336,
"grad_norm": 0.6412233710289001,
"learning_rate": 1e-05,
"loss": 1.0618,
"mean_token_accuracy": 0.6902071833610535,
"num_tokens": 366022879.0,
"step": 616
},
{
"epoch": 2.9380952380952383,
"grad_norm": 0.5901429653167725,
"learning_rate": 1e-05,
"loss": 1.0639,
"mean_token_accuracy": 0.6891224384307861,
"num_tokens": 366608198.0,
"step": 617
},
{
"epoch": 2.942857142857143,
"grad_norm": 0.6606128811836243,
"learning_rate": 1e-05,
"loss": 1.0687,
"mean_token_accuracy": 0.6881773471832275,
"num_tokens": 367187170.0,
"step": 618
},
{
"epoch": 2.947619047619048,
"grad_norm": 0.6021740436553955,
"learning_rate": 1e-05,
"loss": 1.062,
"mean_token_accuracy": 0.6895371675491333,
"num_tokens": 367778542.0,
"step": 619
},
{
"epoch": 2.9523809523809526,
"grad_norm": 0.6304929852485657,
"learning_rate": 1e-05,
"loss": 1.0685,
"mean_token_accuracy": 0.6876203417778015,
"num_tokens": 368374361.0,
"step": 620
},
{
"epoch": 2.9571428571428573,
"grad_norm": 0.6775472164154053,
"learning_rate": 1e-05,
"loss": 1.0693,
"mean_token_accuracy": 0.688637375831604,
"num_tokens": 368975961.0,
"step": 621
},
{
"epoch": 2.961904761904762,
"grad_norm": 0.6188324689865112,
"learning_rate": 1e-05,
"loss": 1.0446,
"mean_token_accuracy": 0.69502854347229,
"num_tokens": 369565801.0,
"step": 622
},
{
"epoch": 2.966666666666667,
"grad_norm": 0.7237592339515686,
"learning_rate": 1e-05,
"loss": 1.069,
"mean_token_accuracy": 0.6882259845733643,
"num_tokens": 370147963.0,
"step": 623
},
{
"epoch": 2.9714285714285715,
"grad_norm": 0.5706875920295715,
"learning_rate": 1e-05,
"loss": 1.0661,
"mean_token_accuracy": 0.688866376876831,
"num_tokens": 370730337.0,
"step": 624
},
{
"epoch": 2.9761904761904763,
"grad_norm": 0.6157565712928772,
"learning_rate": 1e-05,
"loss": 1.0595,
"mean_token_accuracy": 0.6903921365737915,
"num_tokens": 371313464.0,
"step": 625
},
{
"epoch": 2.980952380952381,
"grad_norm": 0.5899333953857422,
"learning_rate": 1e-05,
"loss": 1.0634,
"mean_token_accuracy": 0.690090537071228,
"num_tokens": 371903211.0,
"step": 626
},
{
"epoch": 2.9857142857142858,
"grad_norm": 0.6269708275794983,
"learning_rate": 1e-05,
"loss": 1.0536,
"mean_token_accuracy": 0.6934218406677246,
"num_tokens": 372496314.0,
"step": 627
},
{
"epoch": 2.9904761904761905,
"grad_norm": 0.6969268321990967,
"learning_rate": 1e-05,
"loss": 1.0676,
"mean_token_accuracy": 0.688661515712738,
"num_tokens": 373096610.0,
"step": 628
},
{
"epoch": 2.9952380952380953,
"grad_norm": 0.5695185661315918,
"learning_rate": 1e-05,
"loss": 1.0693,
"mean_token_accuracy": 0.6875466108322144,
"num_tokens": 373694165.0,
"step": 629
},
{
"epoch": 3.0,
"grad_norm": 0.6636136174201965,
"learning_rate": 1e-05,
"loss": 1.0583,
"mean_token_accuracy": 0.6922066807746887,
"num_tokens": 374283247.0,
"step": 630
},
{
"epoch": 3.0,
"step": 630,
"total_flos": 2.1853937174671524e+18,
"train_loss": 1.1615400253780304,
"train_runtime": 1840.1375,
"train_samples_per_second": 175.286,
"train_steps_per_second": 0.342
}
],
"logging_steps": 1,
"max_steps": 630,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 315,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.1853937174671524e+18,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}