Files
openthoughts3_100k_qwen25_1…/trainer_state.json
ModelHub XC fab7e2bb35 初始化项目,由ModelHub XC社区提供模型
Model: mlfoundations-dev/openthoughts3_100k_qwen25_1b_bsz256_lr2e5_epochs7
Source: Original Platform
2026-06-11 18:32:12 +08:00

19202 lines
466 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.0,
"eval_steps": 500,
"global_step": 2737,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025575447570332483,
"grad_norm": 2.9635716191319874,
"learning_rate": 7.299270072992701e-08,
"loss": 1.5218,
"step": 1
},
{
"epoch": 0.005115089514066497,
"grad_norm": 2.9570039035815743,
"learning_rate": 1.4598540145985402e-07,
"loss": 1.4755,
"step": 2
},
{
"epoch": 0.0076726342710997444,
"grad_norm": 3.017937072257941,
"learning_rate": 2.1897810218978106e-07,
"loss": 1.4935,
"step": 3
},
{
"epoch": 0.010230179028132993,
"grad_norm": 2.960891290072101,
"learning_rate": 2.9197080291970804e-07,
"loss": 1.4813,
"step": 4
},
{
"epoch": 0.01278772378516624,
"grad_norm": 2.976019939455323,
"learning_rate": 3.6496350364963505e-07,
"loss": 1.4941,
"step": 5
},
{
"epoch": 0.015345268542199489,
"grad_norm": 3.0149006457959886,
"learning_rate": 4.379562043795621e-07,
"loss": 1.5066,
"step": 6
},
{
"epoch": 0.017902813299232736,
"grad_norm": 2.9237260833122214,
"learning_rate": 5.109489051094891e-07,
"loss": 1.478,
"step": 7
},
{
"epoch": 0.020460358056265986,
"grad_norm": 2.9640674426484077,
"learning_rate": 5.839416058394161e-07,
"loss": 1.4882,
"step": 8
},
{
"epoch": 0.023017902813299233,
"grad_norm": 2.883080870578686,
"learning_rate": 6.569343065693432e-07,
"loss": 1.5219,
"step": 9
},
{
"epoch": 0.02557544757033248,
"grad_norm": 2.8912016510708844,
"learning_rate": 7.299270072992701e-07,
"loss": 1.5149,
"step": 10
},
{
"epoch": 0.028132992327365727,
"grad_norm": 2.8525137837011734,
"learning_rate": 8.029197080291971e-07,
"loss": 1.5065,
"step": 11
},
{
"epoch": 0.030690537084398978,
"grad_norm": 2.6980401328828734,
"learning_rate": 8.759124087591242e-07,
"loss": 1.47,
"step": 12
},
{
"epoch": 0.03324808184143223,
"grad_norm": 2.6499759522230795,
"learning_rate": 9.489051094890511e-07,
"loss": 1.5126,
"step": 13
},
{
"epoch": 0.03580562659846547,
"grad_norm": 2.646192888612826,
"learning_rate": 1.0218978102189781e-06,
"loss": 1.4605,
"step": 14
},
{
"epoch": 0.03836317135549872,
"grad_norm": 2.584050631976731,
"learning_rate": 1.0948905109489052e-06,
"loss": 1.4985,
"step": 15
},
{
"epoch": 0.04092071611253197,
"grad_norm": 2.3627571129305425,
"learning_rate": 1.1678832116788322e-06,
"loss": 1.4523,
"step": 16
},
{
"epoch": 0.043478260869565216,
"grad_norm": 2.052553239445229,
"learning_rate": 1.2408759124087592e-06,
"loss": 1.4734,
"step": 17
},
{
"epoch": 0.04603580562659847,
"grad_norm": 2.0014770644457442,
"learning_rate": 1.3138686131386864e-06,
"loss": 1.479,
"step": 18
},
{
"epoch": 0.04859335038363171,
"grad_norm": 1.9847838678835794,
"learning_rate": 1.3868613138686132e-06,
"loss": 1.4702,
"step": 19
},
{
"epoch": 0.05115089514066496,
"grad_norm": 1.9111274600693329,
"learning_rate": 1.4598540145985402e-06,
"loss": 1.4617,
"step": 20
},
{
"epoch": 0.05370843989769821,
"grad_norm": 1.870897574722989,
"learning_rate": 1.5328467153284674e-06,
"loss": 1.4463,
"step": 21
},
{
"epoch": 0.056265984654731455,
"grad_norm": 1.4296640142109796,
"learning_rate": 1.6058394160583942e-06,
"loss": 1.4599,
"step": 22
},
{
"epoch": 0.058823529411764705,
"grad_norm": 1.4790607914654283,
"learning_rate": 1.6788321167883212e-06,
"loss": 1.4157,
"step": 23
},
{
"epoch": 0.061381074168797956,
"grad_norm": 1.6141927865863235,
"learning_rate": 1.7518248175182485e-06,
"loss": 1.4439,
"step": 24
},
{
"epoch": 0.0639386189258312,
"grad_norm": 1.599753856171314,
"learning_rate": 1.8248175182481753e-06,
"loss": 1.4218,
"step": 25
},
{
"epoch": 0.06649616368286446,
"grad_norm": 1.4847704184111228,
"learning_rate": 1.8978102189781023e-06,
"loss": 1.4269,
"step": 26
},
{
"epoch": 0.06905370843989769,
"grad_norm": 1.3521166489305316,
"learning_rate": 1.9708029197080293e-06,
"loss": 1.4158,
"step": 27
},
{
"epoch": 0.07161125319693094,
"grad_norm": 1.2579545228076663,
"learning_rate": 2.0437956204379563e-06,
"loss": 1.4405,
"step": 28
},
{
"epoch": 0.0741687979539642,
"grad_norm": 1.009619956209423,
"learning_rate": 2.1167883211678833e-06,
"loss": 1.4151,
"step": 29
},
{
"epoch": 0.07672634271099744,
"grad_norm": 1.1838282966029092,
"learning_rate": 2.1897810218978103e-06,
"loss": 1.419,
"step": 30
},
{
"epoch": 0.0792838874680307,
"grad_norm": 1.2384598412642265,
"learning_rate": 2.2627737226277373e-06,
"loss": 1.412,
"step": 31
},
{
"epoch": 0.08184143222506395,
"grad_norm": 1.1754182466507677,
"learning_rate": 2.3357664233576643e-06,
"loss": 1.3866,
"step": 32
},
{
"epoch": 0.08439897698209718,
"grad_norm": 1.0614055850869524,
"learning_rate": 2.4087591240875918e-06,
"loss": 1.4127,
"step": 33
},
{
"epoch": 0.08695652173913043,
"grad_norm": 1.0576160445761484,
"learning_rate": 2.4817518248175183e-06,
"loss": 1.4281,
"step": 34
},
{
"epoch": 0.08951406649616368,
"grad_norm": 1.0117252925259892,
"learning_rate": 2.5547445255474458e-06,
"loss": 1.3731,
"step": 35
},
{
"epoch": 0.09207161125319693,
"grad_norm": 0.9022593000895403,
"learning_rate": 2.627737226277373e-06,
"loss": 1.3866,
"step": 36
},
{
"epoch": 0.09462915601023018,
"grad_norm": 0.8340755212001483,
"learning_rate": 2.7007299270072994e-06,
"loss": 1.4026,
"step": 37
},
{
"epoch": 0.09718670076726342,
"grad_norm": 0.7261384916519003,
"learning_rate": 2.7737226277372264e-06,
"loss": 1.372,
"step": 38
},
{
"epoch": 0.09974424552429667,
"grad_norm": 0.6484685338282444,
"learning_rate": 2.8467153284671534e-06,
"loss": 1.3914,
"step": 39
},
{
"epoch": 0.10230179028132992,
"grad_norm": 0.5852202685330168,
"learning_rate": 2.9197080291970804e-06,
"loss": 1.328,
"step": 40
},
{
"epoch": 0.10485933503836317,
"grad_norm": 0.7534890308070339,
"learning_rate": 2.992700729927008e-06,
"loss": 1.3525,
"step": 41
},
{
"epoch": 0.10741687979539642,
"grad_norm": 0.851146761403294,
"learning_rate": 3.065693430656935e-06,
"loss": 1.3478,
"step": 42
},
{
"epoch": 0.10997442455242967,
"grad_norm": 0.7827817647570426,
"learning_rate": 3.1386861313868614e-06,
"loss": 1.3191,
"step": 43
},
{
"epoch": 0.11253196930946291,
"grad_norm": 0.664689408470926,
"learning_rate": 3.2116788321167884e-06,
"loss": 1.3222,
"step": 44
},
{
"epoch": 0.11508951406649616,
"grad_norm": 0.5490554622557167,
"learning_rate": 3.2846715328467155e-06,
"loss": 1.3238,
"step": 45
},
{
"epoch": 0.11764705882352941,
"grad_norm": 0.5108750790400686,
"learning_rate": 3.3576642335766425e-06,
"loss": 1.3436,
"step": 46
},
{
"epoch": 0.12020460358056266,
"grad_norm": 0.5445952611951665,
"learning_rate": 3.43065693430657e-06,
"loss": 1.3458,
"step": 47
},
{
"epoch": 0.12276214833759591,
"grad_norm": 0.5697581064671751,
"learning_rate": 3.503649635036497e-06,
"loss": 1.3132,
"step": 48
},
{
"epoch": 0.12531969309462915,
"grad_norm": 0.578411430323597,
"learning_rate": 3.576642335766424e-06,
"loss": 1.3268,
"step": 49
},
{
"epoch": 0.1278772378516624,
"grad_norm": 0.5601792557806415,
"learning_rate": 3.6496350364963505e-06,
"loss": 1.2966,
"step": 50
},
{
"epoch": 0.13043478260869565,
"grad_norm": 0.5306373264311374,
"learning_rate": 3.7226277372262775e-06,
"loss": 1.3004,
"step": 51
},
{
"epoch": 0.1329923273657289,
"grad_norm": 0.4661660429983145,
"learning_rate": 3.7956204379562045e-06,
"loss": 1.2812,
"step": 52
},
{
"epoch": 0.13554987212276215,
"grad_norm": 0.42244352277225405,
"learning_rate": 3.868613138686132e-06,
"loss": 1.2774,
"step": 53
},
{
"epoch": 0.13810741687979539,
"grad_norm": 0.39129018686480066,
"learning_rate": 3.9416058394160585e-06,
"loss": 1.3168,
"step": 54
},
{
"epoch": 0.14066496163682865,
"grad_norm": 0.3485115346190062,
"learning_rate": 4.014598540145986e-06,
"loss": 1.3283,
"step": 55
},
{
"epoch": 0.1432225063938619,
"grad_norm": 0.3976730412907507,
"learning_rate": 4.0875912408759126e-06,
"loss": 1.3135,
"step": 56
},
{
"epoch": 0.14578005115089515,
"grad_norm": 0.4153119646875293,
"learning_rate": 4.16058394160584e-06,
"loss": 1.2989,
"step": 57
},
{
"epoch": 0.1483375959079284,
"grad_norm": 0.42065859451204163,
"learning_rate": 4.233576642335767e-06,
"loss": 1.3137,
"step": 58
},
{
"epoch": 0.15089514066496162,
"grad_norm": 0.35014086468112804,
"learning_rate": 4.306569343065693e-06,
"loss": 1.2743,
"step": 59
},
{
"epoch": 0.1534526854219949,
"grad_norm": 0.32228235531527744,
"learning_rate": 4.379562043795621e-06,
"loss": 1.2987,
"step": 60
},
{
"epoch": 0.15601023017902813,
"grad_norm": 0.33710245284823415,
"learning_rate": 4.452554744525548e-06,
"loss": 1.2869,
"step": 61
},
{
"epoch": 0.1585677749360614,
"grad_norm": 0.34426470471374965,
"learning_rate": 4.525547445255475e-06,
"loss": 1.3199,
"step": 62
},
{
"epoch": 0.16112531969309463,
"grad_norm": 0.334431341569014,
"learning_rate": 4.598540145985402e-06,
"loss": 1.2972,
"step": 63
},
{
"epoch": 0.1636828644501279,
"grad_norm": 0.33024914298061436,
"learning_rate": 4.671532846715329e-06,
"loss": 1.2928,
"step": 64
},
{
"epoch": 0.16624040920716113,
"grad_norm": 0.3058316278280544,
"learning_rate": 4.744525547445255e-06,
"loss": 1.2861,
"step": 65
},
{
"epoch": 0.16879795396419436,
"grad_norm": 0.292869194083437,
"learning_rate": 4.8175182481751835e-06,
"loss": 1.2461,
"step": 66
},
{
"epoch": 0.17135549872122763,
"grad_norm": 0.24971695111221698,
"learning_rate": 4.89051094890511e-06,
"loss": 1.2661,
"step": 67
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.26954765363549843,
"learning_rate": 4.963503649635037e-06,
"loss": 1.2467,
"step": 68
},
{
"epoch": 0.17647058823529413,
"grad_norm": 0.25356010222488795,
"learning_rate": 5.036496350364964e-06,
"loss": 1.2303,
"step": 69
},
{
"epoch": 0.17902813299232737,
"grad_norm": 0.2339589024717998,
"learning_rate": 5.1094890510948916e-06,
"loss": 1.2399,
"step": 70
},
{
"epoch": 0.1815856777493606,
"grad_norm": 0.22823462929167784,
"learning_rate": 5.182481751824818e-06,
"loss": 1.2498,
"step": 71
},
{
"epoch": 0.18414322250639387,
"grad_norm": 0.24948571250389207,
"learning_rate": 5.255474452554746e-06,
"loss": 1.2643,
"step": 72
},
{
"epoch": 0.1867007672634271,
"grad_norm": 0.2298632960982471,
"learning_rate": 5.328467153284672e-06,
"loss": 1.2958,
"step": 73
},
{
"epoch": 0.18925831202046037,
"grad_norm": 0.22223759951095107,
"learning_rate": 5.401459854014599e-06,
"loss": 1.2422,
"step": 74
},
{
"epoch": 0.1918158567774936,
"grad_norm": 0.23124679789968172,
"learning_rate": 5.474452554744526e-06,
"loss": 1.2407,
"step": 75
},
{
"epoch": 0.19437340153452684,
"grad_norm": 0.2221181062125986,
"learning_rate": 5.547445255474453e-06,
"loss": 1.2456,
"step": 76
},
{
"epoch": 0.1969309462915601,
"grad_norm": 0.1998449044080008,
"learning_rate": 5.62043795620438e-06,
"loss": 1.2514,
"step": 77
},
{
"epoch": 0.19948849104859334,
"grad_norm": 0.19727362882566524,
"learning_rate": 5.693430656934307e-06,
"loss": 1.2335,
"step": 78
},
{
"epoch": 0.2020460358056266,
"grad_norm": 0.20659124094509168,
"learning_rate": 5.766423357664233e-06,
"loss": 1.2276,
"step": 79
},
{
"epoch": 0.20460358056265984,
"grad_norm": 0.22959713985782182,
"learning_rate": 5.839416058394161e-06,
"loss": 1.2435,
"step": 80
},
{
"epoch": 0.2071611253196931,
"grad_norm": 0.19904222253631854,
"learning_rate": 5.912408759124088e-06,
"loss": 1.2266,
"step": 81
},
{
"epoch": 0.20971867007672634,
"grad_norm": 0.19344151897086864,
"learning_rate": 5.985401459854016e-06,
"loss": 1.2261,
"step": 82
},
{
"epoch": 0.21227621483375958,
"grad_norm": 0.19302417663791685,
"learning_rate": 6.058394160583942e-06,
"loss": 1.2384,
"step": 83
},
{
"epoch": 0.21483375959079284,
"grad_norm": 0.21396454463521547,
"learning_rate": 6.13138686131387e-06,
"loss": 1.235,
"step": 84
},
{
"epoch": 0.21739130434782608,
"grad_norm": 0.1913859035516872,
"learning_rate": 6.204379562043796e-06,
"loss": 1.2838,
"step": 85
},
{
"epoch": 0.21994884910485935,
"grad_norm": 0.17510278677847252,
"learning_rate": 6.277372262773723e-06,
"loss": 1.2358,
"step": 86
},
{
"epoch": 0.22250639386189258,
"grad_norm": 0.19863525132725016,
"learning_rate": 6.35036496350365e-06,
"loss": 1.2419,
"step": 87
},
{
"epoch": 0.22506393861892582,
"grad_norm": 0.19478563516185365,
"learning_rate": 6.423357664233577e-06,
"loss": 1.2641,
"step": 88
},
{
"epoch": 0.22762148337595908,
"grad_norm": 0.17875499154062388,
"learning_rate": 6.496350364963504e-06,
"loss": 1.2239,
"step": 89
},
{
"epoch": 0.23017902813299232,
"grad_norm": 0.1751251099110654,
"learning_rate": 6.569343065693431e-06,
"loss": 1.2524,
"step": 90
},
{
"epoch": 0.23273657289002558,
"grad_norm": 0.1869390091762672,
"learning_rate": 6.6423357664233575e-06,
"loss": 1.2494,
"step": 91
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.17676974553290642,
"learning_rate": 6.715328467153285e-06,
"loss": 1.2537,
"step": 92
},
{
"epoch": 0.23785166240409208,
"grad_norm": 0.1806189007928041,
"learning_rate": 6.7883211678832115e-06,
"loss": 1.2349,
"step": 93
},
{
"epoch": 0.24040920716112532,
"grad_norm": 0.18193990233718968,
"learning_rate": 6.86131386861314e-06,
"loss": 1.2583,
"step": 94
},
{
"epoch": 0.24296675191815856,
"grad_norm": 0.19012671201766562,
"learning_rate": 6.934306569343066e-06,
"loss": 1.2029,
"step": 95
},
{
"epoch": 0.24552429667519182,
"grad_norm": 0.16857838785815454,
"learning_rate": 7.007299270072994e-06,
"loss": 1.2423,
"step": 96
},
{
"epoch": 0.24808184143222506,
"grad_norm": 0.18952785901605423,
"learning_rate": 7.08029197080292e-06,
"loss": 1.2394,
"step": 97
},
{
"epoch": 0.2506393861892583,
"grad_norm": 0.18078294692872968,
"learning_rate": 7.153284671532848e-06,
"loss": 1.2122,
"step": 98
},
{
"epoch": 0.2531969309462916,
"grad_norm": 0.17487368586515217,
"learning_rate": 7.2262773722627744e-06,
"loss": 1.2117,
"step": 99
},
{
"epoch": 0.2557544757033248,
"grad_norm": 0.17732077203789362,
"learning_rate": 7.299270072992701e-06,
"loss": 1.2041,
"step": 100
},
{
"epoch": 0.25831202046035806,
"grad_norm": 0.18421840800752218,
"learning_rate": 7.3722627737226285e-06,
"loss": 1.2231,
"step": 101
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.1768000076239069,
"learning_rate": 7.445255474452555e-06,
"loss": 1.2325,
"step": 102
},
{
"epoch": 0.26342710997442453,
"grad_norm": 0.16984854034130697,
"learning_rate": 7.5182481751824825e-06,
"loss": 1.2026,
"step": 103
},
{
"epoch": 0.2659846547314578,
"grad_norm": 0.16277787684968492,
"learning_rate": 7.591240875912409e-06,
"loss": 1.193,
"step": 104
},
{
"epoch": 0.26854219948849106,
"grad_norm": 0.17357111549131551,
"learning_rate": 7.664233576642336e-06,
"loss": 1.2009,
"step": 105
},
{
"epoch": 0.2710997442455243,
"grad_norm": 0.1800163972852127,
"learning_rate": 7.737226277372264e-06,
"loss": 1.1909,
"step": 106
},
{
"epoch": 0.27365728900255754,
"grad_norm": 0.1681574320113801,
"learning_rate": 7.810218978102191e-06,
"loss": 1.2194,
"step": 107
},
{
"epoch": 0.27621483375959077,
"grad_norm": 0.16885285400717157,
"learning_rate": 7.883211678832117e-06,
"loss": 1.1985,
"step": 108
},
{
"epoch": 0.27877237851662406,
"grad_norm": 0.17914067468814437,
"learning_rate": 7.956204379562045e-06,
"loss": 1.2218,
"step": 109
},
{
"epoch": 0.2813299232736573,
"grad_norm": 0.16706925568533235,
"learning_rate": 8.029197080291972e-06,
"loss": 1.222,
"step": 110
},
{
"epoch": 0.28388746803069054,
"grad_norm": 0.1641264132835115,
"learning_rate": 8.1021897810219e-06,
"loss": 1.2242,
"step": 111
},
{
"epoch": 0.2864450127877238,
"grad_norm": 0.18443514799994437,
"learning_rate": 8.175182481751825e-06,
"loss": 1.2118,
"step": 112
},
{
"epoch": 0.289002557544757,
"grad_norm": 0.17675822272527503,
"learning_rate": 8.248175182481753e-06,
"loss": 1.1849,
"step": 113
},
{
"epoch": 0.2915601023017903,
"grad_norm": 0.1880451995042565,
"learning_rate": 8.32116788321168e-06,
"loss": 1.2103,
"step": 114
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.16598375442205784,
"learning_rate": 8.394160583941606e-06,
"loss": 1.1937,
"step": 115
},
{
"epoch": 0.2966751918158568,
"grad_norm": 0.190898911263414,
"learning_rate": 8.467153284671533e-06,
"loss": 1.2028,
"step": 116
},
{
"epoch": 0.29923273657289,
"grad_norm": 0.18881369445042054,
"learning_rate": 8.54014598540146e-06,
"loss": 1.1976,
"step": 117
},
{
"epoch": 0.30179028132992325,
"grad_norm": 0.20907040258575316,
"learning_rate": 8.613138686131386e-06,
"loss": 1.2476,
"step": 118
},
{
"epoch": 0.30434782608695654,
"grad_norm": 0.1704000017989476,
"learning_rate": 8.686131386861315e-06,
"loss": 1.2087,
"step": 119
},
{
"epoch": 0.3069053708439898,
"grad_norm": 0.19455649517228424,
"learning_rate": 8.759124087591241e-06,
"loss": 1.214,
"step": 120
},
{
"epoch": 0.309462915601023,
"grad_norm": 0.18574238206663096,
"learning_rate": 8.832116788321169e-06,
"loss": 1.2276,
"step": 121
},
{
"epoch": 0.31202046035805625,
"grad_norm": 0.19290426166252228,
"learning_rate": 8.905109489051096e-06,
"loss": 1.1805,
"step": 122
},
{
"epoch": 0.3145780051150895,
"grad_norm": 0.1995598501375803,
"learning_rate": 8.978102189781024e-06,
"loss": 1.2007,
"step": 123
},
{
"epoch": 0.3171355498721228,
"grad_norm": 0.17673439222358,
"learning_rate": 9.05109489051095e-06,
"loss": 1.1966,
"step": 124
},
{
"epoch": 0.319693094629156,
"grad_norm": 0.1966681987874607,
"learning_rate": 9.124087591240877e-06,
"loss": 1.1739,
"step": 125
},
{
"epoch": 0.32225063938618925,
"grad_norm": 0.20745524723498263,
"learning_rate": 9.197080291970804e-06,
"loss": 1.2309,
"step": 126
},
{
"epoch": 0.3248081841432225,
"grad_norm": 0.20371417264487574,
"learning_rate": 9.27007299270073e-06,
"loss": 1.1718,
"step": 127
},
{
"epoch": 0.3273657289002558,
"grad_norm": 0.20142192992356361,
"learning_rate": 9.343065693430657e-06,
"loss": 1.1981,
"step": 128
},
{
"epoch": 0.329923273657289,
"grad_norm": 0.18157695452516256,
"learning_rate": 9.416058394160585e-06,
"loss": 1.187,
"step": 129
},
{
"epoch": 0.33248081841432225,
"grad_norm": 0.18405529622418393,
"learning_rate": 9.48905109489051e-06,
"loss": 1.2154,
"step": 130
},
{
"epoch": 0.3350383631713555,
"grad_norm": 0.18826966568044085,
"learning_rate": 9.56204379562044e-06,
"loss": 1.1823,
"step": 131
},
{
"epoch": 0.3375959079283887,
"grad_norm": 0.17870276101242044,
"learning_rate": 9.635036496350367e-06,
"loss": 1.2399,
"step": 132
},
{
"epoch": 0.340153452685422,
"grad_norm": 0.18386831261657108,
"learning_rate": 9.708029197080293e-06,
"loss": 1.2114,
"step": 133
},
{
"epoch": 0.34271099744245526,
"grad_norm": 0.1795896309939293,
"learning_rate": 9.78102189781022e-06,
"loss": 1.1832,
"step": 134
},
{
"epoch": 0.3452685421994885,
"grad_norm": 0.21827425129513728,
"learning_rate": 9.854014598540148e-06,
"loss": 1.2389,
"step": 135
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.1768309026825683,
"learning_rate": 9.927007299270073e-06,
"loss": 1.1965,
"step": 136
},
{
"epoch": 0.35038363171355497,
"grad_norm": 0.20302569863881262,
"learning_rate": 1e-05,
"loss": 1.2094,
"step": 137
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.19427846203063504,
"learning_rate": 1.0072992700729928e-05,
"loss": 1.1974,
"step": 138
},
{
"epoch": 0.3554987212276215,
"grad_norm": 0.17339331224519358,
"learning_rate": 1.0145985401459854e-05,
"loss": 1.1736,
"step": 139
},
{
"epoch": 0.35805626598465473,
"grad_norm": 0.2466539718194467,
"learning_rate": 1.0218978102189783e-05,
"loss": 1.2279,
"step": 140
},
{
"epoch": 0.36061381074168797,
"grad_norm": 0.21241110450455392,
"learning_rate": 1.0291970802919709e-05,
"loss": 1.1409,
"step": 141
},
{
"epoch": 0.3631713554987212,
"grad_norm": 0.18293508498426997,
"learning_rate": 1.0364963503649636e-05,
"loss": 1.1957,
"step": 142
},
{
"epoch": 0.3657289002557545,
"grad_norm": 0.19790775478208397,
"learning_rate": 1.0437956204379562e-05,
"loss": 1.2193,
"step": 143
},
{
"epoch": 0.36828644501278773,
"grad_norm": 0.20929660856991877,
"learning_rate": 1.0510948905109491e-05,
"loss": 1.1866,
"step": 144
},
{
"epoch": 0.37084398976982097,
"grad_norm": 0.1926018989518869,
"learning_rate": 1.0583941605839417e-05,
"loss": 1.2015,
"step": 145
},
{
"epoch": 0.3734015345268542,
"grad_norm": 0.19192914492955238,
"learning_rate": 1.0656934306569344e-05,
"loss": 1.1886,
"step": 146
},
{
"epoch": 0.37595907928388744,
"grad_norm": 0.20322534422512073,
"learning_rate": 1.072992700729927e-05,
"loss": 1.2199,
"step": 147
},
{
"epoch": 0.37851662404092073,
"grad_norm": 0.18947938981971202,
"learning_rate": 1.0802919708029198e-05,
"loss": 1.1829,
"step": 148
},
{
"epoch": 0.38107416879795397,
"grad_norm": 0.2154696847726249,
"learning_rate": 1.0875912408759123e-05,
"loss": 1.1655,
"step": 149
},
{
"epoch": 0.3836317135549872,
"grad_norm": 0.20859256059256231,
"learning_rate": 1.0948905109489052e-05,
"loss": 1.1815,
"step": 150
},
{
"epoch": 0.38618925831202044,
"grad_norm": 0.20565139563521717,
"learning_rate": 1.102189781021898e-05,
"loss": 1.1848,
"step": 151
},
{
"epoch": 0.3887468030690537,
"grad_norm": 0.21340531513272162,
"learning_rate": 1.1094890510948906e-05,
"loss": 1.188,
"step": 152
},
{
"epoch": 0.391304347826087,
"grad_norm": 0.22952365545919354,
"learning_rate": 1.1167883211678833e-05,
"loss": 1.1772,
"step": 153
},
{
"epoch": 0.3938618925831202,
"grad_norm": 0.21489457470648385,
"learning_rate": 1.124087591240876e-05,
"loss": 1.1807,
"step": 154
},
{
"epoch": 0.39641943734015345,
"grad_norm": 0.22932079381688553,
"learning_rate": 1.1313868613138688e-05,
"loss": 1.1949,
"step": 155
},
{
"epoch": 0.3989769820971867,
"grad_norm": 0.23209900752946952,
"learning_rate": 1.1386861313868614e-05,
"loss": 1.1996,
"step": 156
},
{
"epoch": 0.40153452685422,
"grad_norm": 0.22388173844283388,
"learning_rate": 1.1459854014598541e-05,
"loss": 1.2097,
"step": 157
},
{
"epoch": 0.4040920716112532,
"grad_norm": 0.21380373488801446,
"learning_rate": 1.1532846715328467e-05,
"loss": 1.2082,
"step": 158
},
{
"epoch": 0.40664961636828645,
"grad_norm": 0.21817873889647327,
"learning_rate": 1.1605839416058396e-05,
"loss": 1.1586,
"step": 159
},
{
"epoch": 0.4092071611253197,
"grad_norm": 0.2450535248536084,
"learning_rate": 1.1678832116788322e-05,
"loss": 1.1765,
"step": 160
},
{
"epoch": 0.4117647058823529,
"grad_norm": 0.24576894425899287,
"learning_rate": 1.1751824817518249e-05,
"loss": 1.1701,
"step": 161
},
{
"epoch": 0.4143222506393862,
"grad_norm": 0.2781533359151788,
"learning_rate": 1.1824817518248176e-05,
"loss": 1.1686,
"step": 162
},
{
"epoch": 0.41687979539641945,
"grad_norm": 0.23249844406377174,
"learning_rate": 1.1897810218978102e-05,
"loss": 1.169,
"step": 163
},
{
"epoch": 0.4194373401534527,
"grad_norm": 0.2425823032194627,
"learning_rate": 1.1970802919708031e-05,
"loss": 1.1821,
"step": 164
},
{
"epoch": 0.4219948849104859,
"grad_norm": 0.18932993548929591,
"learning_rate": 1.2043795620437957e-05,
"loss": 1.1538,
"step": 165
},
{
"epoch": 0.42455242966751916,
"grad_norm": 0.2884159065917926,
"learning_rate": 1.2116788321167885e-05,
"loss": 1.1787,
"step": 166
},
{
"epoch": 0.42710997442455245,
"grad_norm": 0.2667378207082784,
"learning_rate": 1.218978102189781e-05,
"loss": 1.1774,
"step": 167
},
{
"epoch": 0.4296675191815857,
"grad_norm": 0.24644746723371008,
"learning_rate": 1.226277372262774e-05,
"loss": 1.1823,
"step": 168
},
{
"epoch": 0.4322250639386189,
"grad_norm": 0.3049603900188157,
"learning_rate": 1.2335766423357665e-05,
"loss": 1.1808,
"step": 169
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.24091240924103605,
"learning_rate": 1.2408759124087593e-05,
"loss": 1.1646,
"step": 170
},
{
"epoch": 0.4373401534526854,
"grad_norm": 0.31462619972433453,
"learning_rate": 1.2481751824817518e-05,
"loss": 1.1742,
"step": 171
},
{
"epoch": 0.4398976982097187,
"grad_norm": 0.25976500149808457,
"learning_rate": 1.2554744525547446e-05,
"loss": 1.1741,
"step": 172
},
{
"epoch": 0.4424552429667519,
"grad_norm": 0.22869248627416927,
"learning_rate": 1.2627737226277371e-05,
"loss": 1.1927,
"step": 173
},
{
"epoch": 0.44501278772378516,
"grad_norm": 0.27204853892769404,
"learning_rate": 1.27007299270073e-05,
"loss": 1.199,
"step": 174
},
{
"epoch": 0.4475703324808184,
"grad_norm": 0.22922656795364751,
"learning_rate": 1.2773722627737228e-05,
"loss": 1.1742,
"step": 175
},
{
"epoch": 0.45012787723785164,
"grad_norm": 0.3018012418428905,
"learning_rate": 1.2846715328467154e-05,
"loss": 1.2027,
"step": 176
},
{
"epoch": 0.45268542199488493,
"grad_norm": 0.2578612414340434,
"learning_rate": 1.2919708029197083e-05,
"loss": 1.1757,
"step": 177
},
{
"epoch": 0.45524296675191817,
"grad_norm": 0.25636745613132944,
"learning_rate": 1.2992700729927009e-05,
"loss": 1.1716,
"step": 178
},
{
"epoch": 0.4578005115089514,
"grad_norm": 0.2715386790093217,
"learning_rate": 1.3065693430656936e-05,
"loss": 1.1583,
"step": 179
},
{
"epoch": 0.46035805626598464,
"grad_norm": 0.2891675384844315,
"learning_rate": 1.3138686131386862e-05,
"loss": 1.1657,
"step": 180
},
{
"epoch": 0.4629156010230179,
"grad_norm": 0.23385863111978508,
"learning_rate": 1.321167883211679e-05,
"loss": 1.1922,
"step": 181
},
{
"epoch": 0.46547314578005117,
"grad_norm": 0.22994123507129197,
"learning_rate": 1.3284671532846715e-05,
"loss": 1.1717,
"step": 182
},
{
"epoch": 0.4680306905370844,
"grad_norm": 0.23612727422353394,
"learning_rate": 1.3357664233576644e-05,
"loss": 1.1801,
"step": 183
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.2069010463077349,
"learning_rate": 1.343065693430657e-05,
"loss": 1.177,
"step": 184
},
{
"epoch": 0.4731457800511509,
"grad_norm": 0.2588170825534718,
"learning_rate": 1.3503649635036497e-05,
"loss": 1.1808,
"step": 185
},
{
"epoch": 0.47570332480818417,
"grad_norm": 0.2157790774775731,
"learning_rate": 1.3576642335766423e-05,
"loss": 1.1821,
"step": 186
},
{
"epoch": 0.4782608695652174,
"grad_norm": 0.23223470081294634,
"learning_rate": 1.3649635036496352e-05,
"loss": 1.1615,
"step": 187
},
{
"epoch": 0.48081841432225064,
"grad_norm": 0.21725466354040374,
"learning_rate": 1.372262773722628e-05,
"loss": 1.1912,
"step": 188
},
{
"epoch": 0.4833759590792839,
"grad_norm": 0.211538836700456,
"learning_rate": 1.3795620437956205e-05,
"loss": 1.1678,
"step": 189
},
{
"epoch": 0.4859335038363171,
"grad_norm": 0.25537726955126566,
"learning_rate": 1.3868613138686133e-05,
"loss": 1.1745,
"step": 190
},
{
"epoch": 0.4884910485933504,
"grad_norm": 0.28371208474889603,
"learning_rate": 1.3941605839416059e-05,
"loss": 1.1193,
"step": 191
},
{
"epoch": 0.49104859335038364,
"grad_norm": 0.26303907455029885,
"learning_rate": 1.4014598540145988e-05,
"loss": 1.1622,
"step": 192
},
{
"epoch": 0.4936061381074169,
"grad_norm": 0.2799114044156544,
"learning_rate": 1.4087591240875913e-05,
"loss": 1.136,
"step": 193
},
{
"epoch": 0.4961636828644501,
"grad_norm": 0.24139333187754325,
"learning_rate": 1.416058394160584e-05,
"loss": 1.1306,
"step": 194
},
{
"epoch": 0.49872122762148335,
"grad_norm": 0.2793729959544077,
"learning_rate": 1.4233576642335767e-05,
"loss": 1.2086,
"step": 195
},
{
"epoch": 0.5012787723785166,
"grad_norm": 0.27570376951402886,
"learning_rate": 1.4306569343065696e-05,
"loss": 1.1628,
"step": 196
},
{
"epoch": 0.5038363171355499,
"grad_norm": 0.32786685913286884,
"learning_rate": 1.4379562043795621e-05,
"loss": 1.1518,
"step": 197
},
{
"epoch": 0.5063938618925832,
"grad_norm": 0.45385237120867455,
"learning_rate": 1.4452554744525549e-05,
"loss": 1.1856,
"step": 198
},
{
"epoch": 0.5089514066496164,
"grad_norm": 0.41272427110721904,
"learning_rate": 1.4525547445255475e-05,
"loss": 1.1483,
"step": 199
},
{
"epoch": 0.5115089514066496,
"grad_norm": 0.2841480764999212,
"learning_rate": 1.4598540145985402e-05,
"loss": 1.1629,
"step": 200
},
{
"epoch": 0.5140664961636828,
"grad_norm": 0.27714909479279093,
"learning_rate": 1.4671532846715331e-05,
"loss": 1.1442,
"step": 201
},
{
"epoch": 0.5166240409207161,
"grad_norm": 0.403242161588326,
"learning_rate": 1.4744525547445257e-05,
"loss": 1.1385,
"step": 202
},
{
"epoch": 0.5191815856777494,
"grad_norm": 0.337013121025594,
"learning_rate": 1.4817518248175184e-05,
"loss": 1.171,
"step": 203
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.4040109170859878,
"learning_rate": 1.489051094890511e-05,
"loss": 1.1418,
"step": 204
},
{
"epoch": 0.5242966751918159,
"grad_norm": 0.48665453956547733,
"learning_rate": 1.4963503649635038e-05,
"loss": 1.164,
"step": 205
},
{
"epoch": 0.5268542199488491,
"grad_norm": 0.24722444184837292,
"learning_rate": 1.5036496350364965e-05,
"loss": 1.1535,
"step": 206
},
{
"epoch": 0.5294117647058824,
"grad_norm": 0.329077822667812,
"learning_rate": 1.5109489051094892e-05,
"loss": 1.1704,
"step": 207
},
{
"epoch": 0.5319693094629157,
"grad_norm": 0.41651469422399784,
"learning_rate": 1.5182481751824818e-05,
"loss": 1.1559,
"step": 208
},
{
"epoch": 0.5345268542199488,
"grad_norm": 0.32960667919190284,
"learning_rate": 1.5255474452554746e-05,
"loss": 1.1495,
"step": 209
},
{
"epoch": 0.5370843989769821,
"grad_norm": 0.4781321369544006,
"learning_rate": 1.5328467153284673e-05,
"loss": 1.1387,
"step": 210
},
{
"epoch": 0.5396419437340153,
"grad_norm": 0.43671817015361414,
"learning_rate": 1.54014598540146e-05,
"loss": 1.1607,
"step": 211
},
{
"epoch": 0.5421994884910486,
"grad_norm": 0.32190848339790007,
"learning_rate": 1.5474452554744528e-05,
"loss": 1.1286,
"step": 212
},
{
"epoch": 0.5447570332480819,
"grad_norm": 0.28497016967310845,
"learning_rate": 1.5547445255474454e-05,
"loss": 1.1701,
"step": 213
},
{
"epoch": 0.5473145780051151,
"grad_norm": 0.30316718930544045,
"learning_rate": 1.5620437956204383e-05,
"loss": 1.1236,
"step": 214
},
{
"epoch": 0.5498721227621484,
"grad_norm": 0.26835985072996216,
"learning_rate": 1.569343065693431e-05,
"loss": 1.1289,
"step": 215
},
{
"epoch": 0.5524296675191815,
"grad_norm": 0.3009095238514411,
"learning_rate": 1.5766423357664234e-05,
"loss": 1.1636,
"step": 216
},
{
"epoch": 0.5549872122762148,
"grad_norm": 0.3065933942839116,
"learning_rate": 1.583941605839416e-05,
"loss": 1.1368,
"step": 217
},
{
"epoch": 0.5575447570332481,
"grad_norm": 0.26109719009183135,
"learning_rate": 1.591240875912409e-05,
"loss": 1.1077,
"step": 218
},
{
"epoch": 0.5601023017902813,
"grad_norm": 0.3164778738084223,
"learning_rate": 1.5985401459854015e-05,
"loss": 1.1333,
"step": 219
},
{
"epoch": 0.5626598465473146,
"grad_norm": 0.35400248747839075,
"learning_rate": 1.6058394160583944e-05,
"loss": 1.1865,
"step": 220
},
{
"epoch": 0.5652173913043478,
"grad_norm": 0.28805686893200677,
"learning_rate": 1.613138686131387e-05,
"loss": 1.1293,
"step": 221
},
{
"epoch": 0.5677749360613811,
"grad_norm": 0.30523736515745126,
"learning_rate": 1.62043795620438e-05,
"loss": 1.1296,
"step": 222
},
{
"epoch": 0.5703324808184144,
"grad_norm": 0.4190076909483638,
"learning_rate": 1.6277372262773725e-05,
"loss": 1.1344,
"step": 223
},
{
"epoch": 0.5728900255754475,
"grad_norm": 0.42243425644304494,
"learning_rate": 1.635036496350365e-05,
"loss": 1.1665,
"step": 224
},
{
"epoch": 0.5754475703324808,
"grad_norm": 0.33398927440080145,
"learning_rate": 1.642335766423358e-05,
"loss": 1.1616,
"step": 225
},
{
"epoch": 0.578005115089514,
"grad_norm": 0.31042126932738984,
"learning_rate": 1.6496350364963505e-05,
"loss": 1.1346,
"step": 226
},
{
"epoch": 0.5805626598465473,
"grad_norm": 0.4022933069927679,
"learning_rate": 1.6569343065693434e-05,
"loss": 1.1474,
"step": 227
},
{
"epoch": 0.5831202046035806,
"grad_norm": 0.34778708873533665,
"learning_rate": 1.664233576642336e-05,
"loss": 1.1328,
"step": 228
},
{
"epoch": 0.5856777493606138,
"grad_norm": 0.35235801712692716,
"learning_rate": 1.6715328467153286e-05,
"loss": 1.1507,
"step": 229
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.3378264430318775,
"learning_rate": 1.678832116788321e-05,
"loss": 1.1556,
"step": 230
},
{
"epoch": 0.5907928388746803,
"grad_norm": 0.3260621828817585,
"learning_rate": 1.686131386861314e-05,
"loss": 1.152,
"step": 231
},
{
"epoch": 0.5933503836317136,
"grad_norm": 0.39226471807556507,
"learning_rate": 1.6934306569343066e-05,
"loss": 1.1398,
"step": 232
},
{
"epoch": 0.5959079283887468,
"grad_norm": 0.4562478952465355,
"learning_rate": 1.7007299270072995e-05,
"loss": 1.1447,
"step": 233
},
{
"epoch": 0.59846547314578,
"grad_norm": 0.3451241092677777,
"learning_rate": 1.708029197080292e-05,
"loss": 1.1005,
"step": 234
},
{
"epoch": 0.6010230179028133,
"grad_norm": 0.35647792283371854,
"learning_rate": 1.7153284671532847e-05,
"loss": 1.1227,
"step": 235
},
{
"epoch": 0.6035805626598465,
"grad_norm": 0.4594520420622475,
"learning_rate": 1.7226277372262773e-05,
"loss": 1.1505,
"step": 236
},
{
"epoch": 0.6061381074168798,
"grad_norm": 0.45224289985329424,
"learning_rate": 1.7299270072992702e-05,
"loss": 1.1308,
"step": 237
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.40418344343634116,
"learning_rate": 1.737226277372263e-05,
"loss": 1.1181,
"step": 238
},
{
"epoch": 0.6112531969309463,
"grad_norm": 0.3386408460236669,
"learning_rate": 1.7445255474452557e-05,
"loss": 1.1584,
"step": 239
},
{
"epoch": 0.6138107416879796,
"grad_norm": 0.26946506842987866,
"learning_rate": 1.7518248175182482e-05,
"loss": 1.1264,
"step": 240
},
{
"epoch": 0.6163682864450127,
"grad_norm": 0.36854128324837004,
"learning_rate": 1.7591240875912408e-05,
"loss": 1.1234,
"step": 241
},
{
"epoch": 0.618925831202046,
"grad_norm": 0.40766745885420824,
"learning_rate": 1.7664233576642337e-05,
"loss": 1.1473,
"step": 242
},
{
"epoch": 0.6214833759590793,
"grad_norm": 0.34418627419066,
"learning_rate": 1.7737226277372263e-05,
"loss": 1.1443,
"step": 243
},
{
"epoch": 0.6240409207161125,
"grad_norm": 0.3132419041181749,
"learning_rate": 1.7810218978102192e-05,
"loss": 1.1898,
"step": 244
},
{
"epoch": 0.6265984654731458,
"grad_norm": 0.3133703026128217,
"learning_rate": 1.7883211678832118e-05,
"loss": 1.1501,
"step": 245
},
{
"epoch": 0.629156010230179,
"grad_norm": 0.3441898164827929,
"learning_rate": 1.7956204379562047e-05,
"loss": 1.1452,
"step": 246
},
{
"epoch": 0.6317135549872123,
"grad_norm": 0.33750686928448953,
"learning_rate": 1.8029197080291973e-05,
"loss": 1.1359,
"step": 247
},
{
"epoch": 0.6342710997442456,
"grad_norm": 0.374020584176404,
"learning_rate": 1.81021897810219e-05,
"loss": 1.1823,
"step": 248
},
{
"epoch": 0.6368286445012787,
"grad_norm": 0.3514782831462071,
"learning_rate": 1.8175182481751824e-05,
"loss": 1.1632,
"step": 249
},
{
"epoch": 0.639386189258312,
"grad_norm": 0.3606450922286876,
"learning_rate": 1.8248175182481753e-05,
"loss": 1.1409,
"step": 250
},
{
"epoch": 0.6419437340153452,
"grad_norm": 0.261265823171208,
"learning_rate": 1.8321167883211683e-05,
"loss": 1.1499,
"step": 251
},
{
"epoch": 0.6445012787723785,
"grad_norm": 0.42167995133388064,
"learning_rate": 1.8394160583941608e-05,
"loss": 1.154,
"step": 252
},
{
"epoch": 0.6470588235294118,
"grad_norm": 0.3940819685714755,
"learning_rate": 1.8467153284671534e-05,
"loss": 1.1355,
"step": 253
},
{
"epoch": 0.649616368286445,
"grad_norm": 0.3265578920410715,
"learning_rate": 1.854014598540146e-05,
"loss": 1.1874,
"step": 254
},
{
"epoch": 0.6521739130434783,
"grad_norm": 0.39035015686633145,
"learning_rate": 1.861313868613139e-05,
"loss": 1.1374,
"step": 255
},
{
"epoch": 0.6547314578005116,
"grad_norm": 0.41589276832005634,
"learning_rate": 1.8686131386861315e-05,
"loss": 1.1289,
"step": 256
},
{
"epoch": 0.6572890025575447,
"grad_norm": 0.45228952583155346,
"learning_rate": 1.8759124087591244e-05,
"loss": 1.1646,
"step": 257
},
{
"epoch": 0.659846547314578,
"grad_norm": 0.5348752543777668,
"learning_rate": 1.883211678832117e-05,
"loss": 1.1268,
"step": 258
},
{
"epoch": 0.6624040920716112,
"grad_norm": 0.6021227056854751,
"learning_rate": 1.8905109489051095e-05,
"loss": 1.1593,
"step": 259
},
{
"epoch": 0.6649616368286445,
"grad_norm": 0.5171238656799629,
"learning_rate": 1.897810218978102e-05,
"loss": 1.1374,
"step": 260
},
{
"epoch": 0.6675191815856778,
"grad_norm": 0.4416261577215247,
"learning_rate": 1.905109489051095e-05,
"loss": 1.1093,
"step": 261
},
{
"epoch": 0.670076726342711,
"grad_norm": 0.569218554097933,
"learning_rate": 1.912408759124088e-05,
"loss": 1.1232,
"step": 262
},
{
"epoch": 0.6726342710997443,
"grad_norm": 0.6811617127901143,
"learning_rate": 1.9197080291970805e-05,
"loss": 1.1682,
"step": 263
},
{
"epoch": 0.6751918158567775,
"grad_norm": 0.749600012327492,
"learning_rate": 1.9270072992700734e-05,
"loss": 1.1484,
"step": 264
},
{
"epoch": 0.6777493606138107,
"grad_norm": 0.5547245978393044,
"learning_rate": 1.934306569343066e-05,
"loss": 1.1746,
"step": 265
},
{
"epoch": 0.680306905370844,
"grad_norm": 0.29516123217758117,
"learning_rate": 1.9416058394160586e-05,
"loss": 1.1414,
"step": 266
},
{
"epoch": 0.6828644501278772,
"grad_norm": 0.5616443320978407,
"learning_rate": 1.948905109489051e-05,
"loss": 1.096,
"step": 267
},
{
"epoch": 0.6854219948849105,
"grad_norm": 0.7110950485565922,
"learning_rate": 1.956204379562044e-05,
"loss": 1.1383,
"step": 268
},
{
"epoch": 0.6879795396419437,
"grad_norm": 0.5747864084575326,
"learning_rate": 1.9635036496350366e-05,
"loss": 1.1157,
"step": 269
},
{
"epoch": 0.690537084398977,
"grad_norm": 0.5551996423553552,
"learning_rate": 1.9708029197080295e-05,
"loss": 1.1569,
"step": 270
},
{
"epoch": 0.6930946291560103,
"grad_norm": 0.7165225607672224,
"learning_rate": 1.978102189781022e-05,
"loss": 1.1551,
"step": 271
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.7036255091082283,
"learning_rate": 1.9854014598540147e-05,
"loss": 1.1155,
"step": 272
},
{
"epoch": 0.6982097186700768,
"grad_norm": 0.37416829306334026,
"learning_rate": 1.9927007299270073e-05,
"loss": 1.1293,
"step": 273
},
{
"epoch": 0.7007672634271099,
"grad_norm": 0.5000491272477234,
"learning_rate": 2e-05,
"loss": 1.1495,
"step": 274
},
{
"epoch": 0.7033248081841432,
"grad_norm": 0.7162752485719868,
"learning_rate": 1.9999991865312627e-05,
"loss": 1.1267,
"step": 275
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.6356341002049819,
"learning_rate": 1.9999967461263736e-05,
"loss": 1.1469,
"step": 276
},
{
"epoch": 0.7084398976982097,
"grad_norm": 0.46429306768513406,
"learning_rate": 1.9999926787893038e-05,
"loss": 1.1605,
"step": 277
},
{
"epoch": 0.710997442455243,
"grad_norm": 0.42193730725900314,
"learning_rate": 1.99998698452667e-05,
"loss": 1.1291,
"step": 278
},
{
"epoch": 0.7135549872122762,
"grad_norm": 0.45111683276082,
"learning_rate": 1.999979663347736e-05,
"loss": 1.1594,
"step": 279
},
{
"epoch": 0.7161125319693095,
"grad_norm": 0.48963964069881194,
"learning_rate": 1.9999707152644143e-05,
"loss": 1.1245,
"step": 280
},
{
"epoch": 0.7186700767263428,
"grad_norm": 0.4979629650586617,
"learning_rate": 1.999960140291262e-05,
"loss": 1.119,
"step": 281
},
{
"epoch": 0.7212276214833759,
"grad_norm": 0.4664713962878264,
"learning_rate": 1.9999479384454838e-05,
"loss": 1.1468,
"step": 282
},
{
"epoch": 0.7237851662404092,
"grad_norm": 0.3844942432737082,
"learning_rate": 1.9999341097469313e-05,
"loss": 1.075,
"step": 283
},
{
"epoch": 0.7263427109974424,
"grad_norm": 0.3748435881073205,
"learning_rate": 1.9999186542181038e-05,
"loss": 1.1388,
"step": 284
},
{
"epoch": 0.7289002557544757,
"grad_norm": 0.37537611839818713,
"learning_rate": 1.9999015718841453e-05,
"loss": 1.1204,
"step": 285
},
{
"epoch": 0.731457800511509,
"grad_norm": 0.2604152551489964,
"learning_rate": 1.9998828627728483e-05,
"loss": 1.1441,
"step": 286
},
{
"epoch": 0.7340153452685422,
"grad_norm": 0.3500229133794647,
"learning_rate": 1.9998625269146515e-05,
"loss": 1.1418,
"step": 287
},
{
"epoch": 0.7365728900255755,
"grad_norm": 0.40870411685426555,
"learning_rate": 1.9998405643426398e-05,
"loss": 1.107,
"step": 288
},
{
"epoch": 0.7391304347826086,
"grad_norm": 0.4142193267583776,
"learning_rate": 1.999816975092545e-05,
"loss": 1.1386,
"step": 289
},
{
"epoch": 0.7416879795396419,
"grad_norm": 0.3984615533147621,
"learning_rate": 1.9997917592027455e-05,
"loss": 1.1478,
"step": 290
},
{
"epoch": 0.7442455242966752,
"grad_norm": 0.33486990703650343,
"learning_rate": 1.9997649167142654e-05,
"loss": 1.1322,
"step": 291
},
{
"epoch": 0.7468030690537084,
"grad_norm": 0.34307927675012156,
"learning_rate": 1.9997364476707765e-05,
"loss": 1.0975,
"step": 292
},
{
"epoch": 0.7493606138107417,
"grad_norm": 0.32862273663424796,
"learning_rate": 1.9997063521185956e-05,
"loss": 1.1234,
"step": 293
},
{
"epoch": 0.7519181585677749,
"grad_norm": 0.3832334389775187,
"learning_rate": 1.9996746301066867e-05,
"loss": 1.1204,
"step": 294
},
{
"epoch": 0.7544757033248082,
"grad_norm": 0.37651748057590684,
"learning_rate": 1.999641281686659e-05,
"loss": 1.1101,
"step": 295
},
{
"epoch": 0.7570332480818415,
"grad_norm": 0.3987512509485477,
"learning_rate": 1.999606306912769e-05,
"loss": 1.1182,
"step": 296
},
{
"epoch": 0.7595907928388747,
"grad_norm": 0.3135294282014092,
"learning_rate": 1.999569705841918e-05,
"loss": 1.1576,
"step": 297
},
{
"epoch": 0.7621483375959079,
"grad_norm": 0.310570536991235,
"learning_rate": 1.9995314785336534e-05,
"loss": 1.1329,
"step": 298
},
{
"epoch": 0.7647058823529411,
"grad_norm": 0.28886285275015344,
"learning_rate": 1.999491625050169e-05,
"loss": 1.1486,
"step": 299
},
{
"epoch": 0.7672634271099744,
"grad_norm": 0.2810916108747155,
"learning_rate": 1.9994501454563046e-05,
"loss": 1.1067,
"step": 300
},
{
"epoch": 0.7698209718670077,
"grad_norm": 0.2641826394714093,
"learning_rate": 1.9994070398195437e-05,
"loss": 1.1391,
"step": 301
},
{
"epoch": 0.7723785166240409,
"grad_norm": 0.23992392919351505,
"learning_rate": 1.999362308210017e-05,
"loss": 1.1387,
"step": 302
},
{
"epoch": 0.7749360613810742,
"grad_norm": 0.24856265925820004,
"learning_rate": 1.9993159507005e-05,
"loss": 1.1084,
"step": 303
},
{
"epoch": 0.7774936061381074,
"grad_norm": 0.22572823705824116,
"learning_rate": 1.9992679673664136e-05,
"loss": 1.1134,
"step": 304
},
{
"epoch": 0.7800511508951407,
"grad_norm": 0.27595626439843796,
"learning_rate": 1.9992183582858233e-05,
"loss": 1.1269,
"step": 305
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.33828817219220914,
"learning_rate": 1.9991671235394404e-05,
"loss": 1.1211,
"step": 306
},
{
"epoch": 0.7851662404092071,
"grad_norm": 0.23908198593915184,
"learning_rate": 1.9991142632106205e-05,
"loss": 1.0874,
"step": 307
},
{
"epoch": 0.7877237851662404,
"grad_norm": 0.32916775113793656,
"learning_rate": 1.999059777385364e-05,
"loss": 1.1189,
"step": 308
},
{
"epoch": 0.7902813299232737,
"grad_norm": 0.4164086722930908,
"learning_rate": 1.9990036661523162e-05,
"loss": 1.1368,
"step": 309
},
{
"epoch": 0.7928388746803069,
"grad_norm": 0.4356985530787425,
"learning_rate": 1.998945929602766e-05,
"loss": 1.1041,
"step": 310
},
{
"epoch": 0.7953964194373402,
"grad_norm": 0.32329800121359825,
"learning_rate": 1.9988865678306476e-05,
"loss": 1.1381,
"step": 311
},
{
"epoch": 0.7979539641943734,
"grad_norm": 0.28030048685966436,
"learning_rate": 1.998825580932539e-05,
"loss": 1.1505,
"step": 312
},
{
"epoch": 0.8005115089514067,
"grad_norm": 0.3736128210505236,
"learning_rate": 1.9987629690076615e-05,
"loss": 1.116,
"step": 313
},
{
"epoch": 0.80306905370844,
"grad_norm": 0.3711938440381308,
"learning_rate": 1.998698732157881e-05,
"loss": 1.1233,
"step": 314
},
{
"epoch": 0.8056265984654731,
"grad_norm": 0.283799635820317,
"learning_rate": 1.998632870487707e-05,
"loss": 1.1112,
"step": 315
},
{
"epoch": 0.8081841432225064,
"grad_norm": 0.29982174151777125,
"learning_rate": 1.9985653841042926e-05,
"loss": 1.1089,
"step": 316
},
{
"epoch": 0.8107416879795396,
"grad_norm": 0.33144242270715973,
"learning_rate": 1.9984962731174336e-05,
"loss": 1.1387,
"step": 317
},
{
"epoch": 0.8132992327365729,
"grad_norm": 0.33991853938376265,
"learning_rate": 1.998425537639569e-05,
"loss": 1.1292,
"step": 318
},
{
"epoch": 0.8158567774936062,
"grad_norm": 0.342802086067408,
"learning_rate": 1.9983531777857817e-05,
"loss": 1.0907,
"step": 319
},
{
"epoch": 0.8184143222506394,
"grad_norm": 0.3083367366680541,
"learning_rate": 1.998279193673796e-05,
"loss": 1.1157,
"step": 320
},
{
"epoch": 0.8209718670076727,
"grad_norm": 0.32536985414256364,
"learning_rate": 1.9982035854239793e-05,
"loss": 1.0971,
"step": 321
},
{
"epoch": 0.8235294117647058,
"grad_norm": 0.3810630836606236,
"learning_rate": 1.9981263531593422e-05,
"loss": 1.1236,
"step": 322
},
{
"epoch": 0.8260869565217391,
"grad_norm": 0.36452300722278047,
"learning_rate": 1.9980474970055367e-05,
"loss": 1.1438,
"step": 323
},
{
"epoch": 0.8286445012787724,
"grad_norm": 0.2795921565060519,
"learning_rate": 1.997967017090856e-05,
"loss": 1.1465,
"step": 324
},
{
"epoch": 0.8312020460358056,
"grad_norm": 0.2986081929713523,
"learning_rate": 1.9978849135462367e-05,
"loss": 1.1061,
"step": 325
},
{
"epoch": 0.8337595907928389,
"grad_norm": 0.3054440401423343,
"learning_rate": 1.9978011865052554e-05,
"loss": 1.1146,
"step": 326
},
{
"epoch": 0.8363171355498721,
"grad_norm": 0.32318950453837997,
"learning_rate": 1.9977158361041317e-05,
"loss": 1.1554,
"step": 327
},
{
"epoch": 0.8388746803069054,
"grad_norm": 0.30472902927491496,
"learning_rate": 1.997628862481725e-05,
"loss": 1.1274,
"step": 328
},
{
"epoch": 0.8414322250639387,
"grad_norm": 0.4042829285862421,
"learning_rate": 1.9975402657795355e-05,
"loss": 1.1669,
"step": 329
},
{
"epoch": 0.8439897698209718,
"grad_norm": 0.2804285578799784,
"learning_rate": 1.997450046141705e-05,
"loss": 1.1361,
"step": 330
},
{
"epoch": 0.8465473145780051,
"grad_norm": 0.3569177728816469,
"learning_rate": 1.997358203715015e-05,
"loss": 1.1095,
"step": 331
},
{
"epoch": 0.8491048593350383,
"grad_norm": 0.4230090216431553,
"learning_rate": 1.9972647386488873e-05,
"loss": 1.1016,
"step": 332
},
{
"epoch": 0.8516624040920716,
"grad_norm": 0.37021286913388013,
"learning_rate": 1.997169651095384e-05,
"loss": 1.1475,
"step": 333
},
{
"epoch": 0.8542199488491049,
"grad_norm": 0.3317123580055209,
"learning_rate": 1.9970729412092064e-05,
"loss": 1.0813,
"step": 334
},
{
"epoch": 0.8567774936061381,
"grad_norm": 0.273842287695835,
"learning_rate": 1.9969746091476955e-05,
"loss": 1.1067,
"step": 335
},
{
"epoch": 0.8593350383631714,
"grad_norm": 0.2673820670815786,
"learning_rate": 1.9968746550708313e-05,
"loss": 1.1069,
"step": 336
},
{
"epoch": 0.8618925831202046,
"grad_norm": 0.2979937548082758,
"learning_rate": 1.996773079141233e-05,
"loss": 1.1279,
"step": 337
},
{
"epoch": 0.8644501278772379,
"grad_norm": 0.37172355657034833,
"learning_rate": 1.9966698815241583e-05,
"loss": 1.1339,
"step": 338
},
{
"epoch": 0.8670076726342711,
"grad_norm": 0.506903869952954,
"learning_rate": 1.9965650623875034e-05,
"loss": 1.1039,
"step": 339
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.4279498776163848,
"learning_rate": 1.9964586219018018e-05,
"loss": 1.1425,
"step": 340
},
{
"epoch": 0.8721227621483376,
"grad_norm": 0.36753587770795587,
"learning_rate": 1.9963505602402263e-05,
"loss": 1.0978,
"step": 341
},
{
"epoch": 0.8746803069053708,
"grad_norm": 0.3648609772451092,
"learning_rate": 1.996240877578586e-05,
"loss": 1.1242,
"step": 342
},
{
"epoch": 0.8772378516624041,
"grad_norm": 0.37366918011434086,
"learning_rate": 1.996129574095328e-05,
"loss": 1.1191,
"step": 343
},
{
"epoch": 0.8797953964194374,
"grad_norm": 0.3879756302273747,
"learning_rate": 1.996016649971536e-05,
"loss": 1.1253,
"step": 344
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.35306903326209926,
"learning_rate": 1.9959021053909304e-05,
"loss": 1.1097,
"step": 345
},
{
"epoch": 0.8849104859335039,
"grad_norm": 0.3497813112371213,
"learning_rate": 1.995785940539868e-05,
"loss": 1.1751,
"step": 346
},
{
"epoch": 0.887468030690537,
"grad_norm": 0.31991011885719,
"learning_rate": 1.995668155607342e-05,
"loss": 1.06,
"step": 347
},
{
"epoch": 0.8900255754475703,
"grad_norm": 0.33100033800466955,
"learning_rate": 1.9955487507849815e-05,
"loss": 1.1217,
"step": 348
},
{
"epoch": 0.8925831202046036,
"grad_norm": 0.3302462532169077,
"learning_rate": 1.9954277262670497e-05,
"loss": 1.1016,
"step": 349
},
{
"epoch": 0.8951406649616368,
"grad_norm": 0.2988617813500731,
"learning_rate": 1.9953050822504466e-05,
"loss": 1.1259,
"step": 350
},
{
"epoch": 0.8976982097186701,
"grad_norm": 0.2467443109516983,
"learning_rate": 1.995180818934706e-05,
"loss": 1.1449,
"step": 351
},
{
"epoch": 0.9002557544757033,
"grad_norm": 0.2862819186333417,
"learning_rate": 1.995054936521997e-05,
"loss": 1.1,
"step": 352
},
{
"epoch": 0.9028132992327366,
"grad_norm": 0.3386935579478213,
"learning_rate": 1.9949274352171218e-05,
"loss": 1.1215,
"step": 353
},
{
"epoch": 0.9053708439897699,
"grad_norm": 0.377267345773294,
"learning_rate": 1.9947983152275175e-05,
"loss": 1.1151,
"step": 354
},
{
"epoch": 0.907928388746803,
"grad_norm": 0.26418004315541993,
"learning_rate": 1.9946675767632545e-05,
"loss": 1.0909,
"step": 355
},
{
"epoch": 0.9104859335038363,
"grad_norm": 0.3036950602266219,
"learning_rate": 1.9945352200370352e-05,
"loss": 1.1065,
"step": 356
},
{
"epoch": 0.9130434782608695,
"grad_norm": 0.2847990677396293,
"learning_rate": 1.9944012452641966e-05,
"loss": 1.1187,
"step": 357
},
{
"epoch": 0.9156010230179028,
"grad_norm": 0.3155239647410138,
"learning_rate": 1.994265652662707e-05,
"loss": 1.1402,
"step": 358
},
{
"epoch": 0.9181585677749361,
"grad_norm": 0.3011564965680371,
"learning_rate": 1.9941284424531668e-05,
"loss": 1.1232,
"step": 359
},
{
"epoch": 0.9207161125319693,
"grad_norm": 0.3119452115804441,
"learning_rate": 1.9939896148588086e-05,
"loss": 1.0879,
"step": 360
},
{
"epoch": 0.9232736572890026,
"grad_norm": 0.33133352515569403,
"learning_rate": 1.9938491701054965e-05,
"loss": 1.1384,
"step": 361
},
{
"epoch": 0.9258312020460358,
"grad_norm": 0.2085194934877816,
"learning_rate": 1.9937071084217254e-05,
"loss": 1.0616,
"step": 362
},
{
"epoch": 0.928388746803069,
"grad_norm": 0.27348539950003964,
"learning_rate": 1.99356343003862e-05,
"loss": 1.127,
"step": 363
},
{
"epoch": 0.9309462915601023,
"grad_norm": 0.314231043083254,
"learning_rate": 1.9934181351899365e-05,
"loss": 1.1075,
"step": 364
},
{
"epoch": 0.9335038363171355,
"grad_norm": 0.3354380584507947,
"learning_rate": 1.9932712241120606e-05,
"loss": 1.1272,
"step": 365
},
{
"epoch": 0.9360613810741688,
"grad_norm": 0.28703321632472045,
"learning_rate": 1.9931226970440075e-05,
"loss": 1.1469,
"step": 366
},
{
"epoch": 0.9386189258312021,
"grad_norm": 0.3426859912220677,
"learning_rate": 1.9929725542274215e-05,
"loss": 1.1278,
"step": 367
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.29299540193881474,
"learning_rate": 1.992820795906575e-05,
"loss": 1.1187,
"step": 368
},
{
"epoch": 0.9437340153452686,
"grad_norm": 0.39295341923846966,
"learning_rate": 1.99266742232837e-05,
"loss": 1.1126,
"step": 369
},
{
"epoch": 0.9462915601023018,
"grad_norm": 0.35353202277391543,
"learning_rate": 1.9925124337423356e-05,
"loss": 1.1139,
"step": 370
},
{
"epoch": 0.948849104859335,
"grad_norm": 0.3311467211582019,
"learning_rate": 1.9923558304006283e-05,
"loss": 1.138,
"step": 371
},
{
"epoch": 0.9514066496163683,
"grad_norm": 0.3816152174441759,
"learning_rate": 1.992197612558032e-05,
"loss": 1.1176,
"step": 372
},
{
"epoch": 0.9539641943734015,
"grad_norm": 0.36605913254516786,
"learning_rate": 1.9920377804719573e-05,
"loss": 1.1221,
"step": 373
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.36097755733897396,
"learning_rate": 1.991876334402441e-05,
"loss": 1.1198,
"step": 374
},
{
"epoch": 0.959079283887468,
"grad_norm": 0.34895670740815254,
"learning_rate": 1.9917132746121454e-05,
"loss": 1.1438,
"step": 375
},
{
"epoch": 0.9616368286445013,
"grad_norm": 0.2817987248252719,
"learning_rate": 1.9915486013663595e-05,
"loss": 1.0946,
"step": 376
},
{
"epoch": 0.9641943734015346,
"grad_norm": 0.2440543185648296,
"learning_rate": 1.9913823149329952e-05,
"loss": 1.1257,
"step": 377
},
{
"epoch": 0.9667519181585678,
"grad_norm": 0.29938424755141774,
"learning_rate": 1.9912144155825913e-05,
"loss": 1.1315,
"step": 378
},
{
"epoch": 0.969309462915601,
"grad_norm": 0.3042211939245891,
"learning_rate": 1.9910449035883086e-05,
"loss": 1.1005,
"step": 379
},
{
"epoch": 0.9718670076726342,
"grad_norm": 0.3662935173068649,
"learning_rate": 1.990873779225933e-05,
"loss": 1.0831,
"step": 380
},
{
"epoch": 0.9744245524296675,
"grad_norm": 0.34290782200372855,
"learning_rate": 1.990701042773873e-05,
"loss": 1.1116,
"step": 381
},
{
"epoch": 0.9769820971867008,
"grad_norm": 0.2659876511429978,
"learning_rate": 1.99052669451316e-05,
"loss": 1.1172,
"step": 382
},
{
"epoch": 0.979539641943734,
"grad_norm": 0.2656583663382276,
"learning_rate": 1.9903507347274478e-05,
"loss": 1.1243,
"step": 383
},
{
"epoch": 0.9820971867007673,
"grad_norm": 0.35197356004646674,
"learning_rate": 1.9901731637030123e-05,
"loss": 1.0751,
"step": 384
},
{
"epoch": 0.9846547314578005,
"grad_norm": 0.4123186710230891,
"learning_rate": 1.9899939817287494e-05,
"loss": 1.1572,
"step": 385
},
{
"epoch": 0.9872122762148338,
"grad_norm": 0.48886837110572706,
"learning_rate": 1.989813189096178e-05,
"loss": 1.1109,
"step": 386
},
{
"epoch": 0.989769820971867,
"grad_norm": 0.4200898181195607,
"learning_rate": 1.989630786099436e-05,
"loss": 1.1243,
"step": 387
},
{
"epoch": 0.9923273657289002,
"grad_norm": 0.36473186521348727,
"learning_rate": 1.9894467730352817e-05,
"loss": 1.1379,
"step": 388
},
{
"epoch": 0.9948849104859335,
"grad_norm": 0.33106729200219565,
"learning_rate": 1.9892611502030932e-05,
"loss": 1.1183,
"step": 389
},
{
"epoch": 0.9974424552429667,
"grad_norm": 0.28859949847448485,
"learning_rate": 1.9890739179048666e-05,
"loss": 1.1019,
"step": 390
},
{
"epoch": 1.0,
"grad_norm": 0.32343067044443596,
"learning_rate": 1.9888850764452177e-05,
"loss": 1.1315,
"step": 391
},
{
"epoch": 1.0025575447570332,
"grad_norm": 0.2946752191785302,
"learning_rate": 1.988694626131379e-05,
"loss": 1.1027,
"step": 392
},
{
"epoch": 1.0051150895140666,
"grad_norm": 0.2840956310037306,
"learning_rate": 1.9885025672732024e-05,
"loss": 1.1255,
"step": 393
},
{
"epoch": 1.0076726342710998,
"grad_norm": 0.3834929641779387,
"learning_rate": 1.9883089001831545e-05,
"loss": 1.0926,
"step": 394
},
{
"epoch": 1.010230179028133,
"grad_norm": 0.37119046465058125,
"learning_rate": 1.9881136251763203e-05,
"loss": 1.1024,
"step": 395
},
{
"epoch": 1.0127877237851663,
"grad_norm": 0.3481999615848297,
"learning_rate": 1.9879167425703998e-05,
"loss": 1.1177,
"step": 396
},
{
"epoch": 1.0153452685421995,
"grad_norm": 0.4174534154279672,
"learning_rate": 1.9877182526857086e-05,
"loss": 1.1194,
"step": 397
},
{
"epoch": 1.0179028132992327,
"grad_norm": 0.428283352237624,
"learning_rate": 1.9875181558451774e-05,
"loss": 1.1126,
"step": 398
},
{
"epoch": 1.020460358056266,
"grad_norm": 0.34788898984052513,
"learning_rate": 1.9873164523743517e-05,
"loss": 1.0826,
"step": 399
},
{
"epoch": 1.0230179028132993,
"grad_norm": 0.3235948349939345,
"learning_rate": 1.9871131426013894e-05,
"loss": 1.137,
"step": 400
},
{
"epoch": 1.0255754475703325,
"grad_norm": 0.3661886910233816,
"learning_rate": 1.9869082268570637e-05,
"loss": 1.1135,
"step": 401
},
{
"epoch": 1.0281329923273657,
"grad_norm": 0.3844357019706309,
"learning_rate": 1.9867017054747593e-05,
"loss": 1.1316,
"step": 402
},
{
"epoch": 1.030690537084399,
"grad_norm": 0.3351625771872402,
"learning_rate": 1.9864935787904734e-05,
"loss": 1.1009,
"step": 403
},
{
"epoch": 1.0332480818414322,
"grad_norm": 0.34602161255624664,
"learning_rate": 1.986283847142816e-05,
"loss": 1.1047,
"step": 404
},
{
"epoch": 1.0358056265984654,
"grad_norm": 0.3709821493330784,
"learning_rate": 1.9860725108730065e-05,
"loss": 1.1031,
"step": 405
},
{
"epoch": 1.0383631713554988,
"grad_norm": 0.37774483264562303,
"learning_rate": 1.9858595703248755e-05,
"loss": 1.137,
"step": 406
},
{
"epoch": 1.040920716112532,
"grad_norm": 0.3599825369273542,
"learning_rate": 1.985645025844865e-05,
"loss": 1.0707,
"step": 407
},
{
"epoch": 1.0434782608695652,
"grad_norm": 0.39966584857588405,
"learning_rate": 1.9854288777820246e-05,
"loss": 1.1033,
"step": 408
},
{
"epoch": 1.0460358056265984,
"grad_norm": 0.40289071310305824,
"learning_rate": 1.9852111264880145e-05,
"loss": 1.0806,
"step": 409
},
{
"epoch": 1.0485933503836318,
"grad_norm": 0.47128238325065436,
"learning_rate": 1.984991772317102e-05,
"loss": 1.0756,
"step": 410
},
{
"epoch": 1.051150895140665,
"grad_norm": 0.5298917118212448,
"learning_rate": 1.9847708156261622e-05,
"loss": 1.1055,
"step": 411
},
{
"epoch": 1.0537084398976981,
"grad_norm": 0.47297356768421134,
"learning_rate": 1.9845482567746783e-05,
"loss": 1.0836,
"step": 412
},
{
"epoch": 1.0562659846547315,
"grad_norm": 0.38344561089251955,
"learning_rate": 1.9843240961247398e-05,
"loss": 1.0904,
"step": 413
},
{
"epoch": 1.0588235294117647,
"grad_norm": 0.27676602705991193,
"learning_rate": 1.9840983340410414e-05,
"loss": 1.1402,
"step": 414
},
{
"epoch": 1.061381074168798,
"grad_norm": 0.4125473070163219,
"learning_rate": 1.9838709708908848e-05,
"loss": 1.1108,
"step": 415
},
{
"epoch": 1.0639386189258313,
"grad_norm": 0.39100913652365626,
"learning_rate": 1.983642007044175e-05,
"loss": 1.0894,
"step": 416
},
{
"epoch": 1.0664961636828645,
"grad_norm": 0.3635147529725554,
"learning_rate": 1.983411442873422e-05,
"loss": 1.0751,
"step": 417
},
{
"epoch": 1.0690537084398977,
"grad_norm": 0.3157457311508148,
"learning_rate": 1.983179278753739e-05,
"loss": 1.0867,
"step": 418
},
{
"epoch": 1.0716112531969308,
"grad_norm": 0.3380507668468239,
"learning_rate": 1.9829455150628432e-05,
"loss": 1.1428,
"step": 419
},
{
"epoch": 1.0741687979539642,
"grad_norm": 0.3531121689418475,
"learning_rate": 1.982710152181053e-05,
"loss": 1.0877,
"step": 420
},
{
"epoch": 1.0767263427109974,
"grad_norm": 0.2800940522052926,
"learning_rate": 1.982473190491289e-05,
"loss": 1.1025,
"step": 421
},
{
"epoch": 1.0792838874680306,
"grad_norm": 0.3045440051536889,
"learning_rate": 1.9822346303790732e-05,
"loss": 1.0954,
"step": 422
},
{
"epoch": 1.081841432225064,
"grad_norm": 0.2875179180998631,
"learning_rate": 1.9819944722325283e-05,
"loss": 1.0799,
"step": 423
},
{
"epoch": 1.0843989769820972,
"grad_norm": 0.3671466904640979,
"learning_rate": 1.981752716442376e-05,
"loss": 1.1239,
"step": 424
},
{
"epoch": 1.0869565217391304,
"grad_norm": 0.310905332933887,
"learning_rate": 1.9815093634019384e-05,
"loss": 1.0885,
"step": 425
},
{
"epoch": 1.0895140664961638,
"grad_norm": 0.34866191023824383,
"learning_rate": 1.9812644135071358e-05,
"loss": 1.0789,
"step": 426
},
{
"epoch": 1.092071611253197,
"grad_norm": 0.3670206738107968,
"learning_rate": 1.9810178671564853e-05,
"loss": 1.1051,
"step": 427
},
{
"epoch": 1.0946291560102301,
"grad_norm": 0.46475258100798056,
"learning_rate": 1.980769724751104e-05,
"loss": 1.0838,
"step": 428
},
{
"epoch": 1.0971867007672633,
"grad_norm": 0.3157024370545657,
"learning_rate": 1.9805199866947026e-05,
"loss": 1.114,
"step": 429
},
{
"epoch": 1.0997442455242967,
"grad_norm": 0.29958992335623563,
"learning_rate": 1.9802686533935903e-05,
"loss": 1.0909,
"step": 430
},
{
"epoch": 1.10230179028133,
"grad_norm": 0.3045539331442299,
"learning_rate": 1.9800157252566698e-05,
"loss": 1.119,
"step": 431
},
{
"epoch": 1.104859335038363,
"grad_norm": 0.35388881893166907,
"learning_rate": 1.97976120269544e-05,
"loss": 1.1357,
"step": 432
},
{
"epoch": 1.1074168797953965,
"grad_norm": 0.4072658855507119,
"learning_rate": 1.9795050861239932e-05,
"loss": 1.1153,
"step": 433
},
{
"epoch": 1.1099744245524297,
"grad_norm": 0.3515081652084557,
"learning_rate": 1.9792473759590148e-05,
"loss": 1.1051,
"step": 434
},
{
"epoch": 1.1125319693094629,
"grad_norm": 0.30513537117496636,
"learning_rate": 1.978988072619783e-05,
"loss": 1.0943,
"step": 435
},
{
"epoch": 1.1150895140664963,
"grad_norm": 0.5088746516427844,
"learning_rate": 1.9787271765281684e-05,
"loss": 1.0947,
"step": 436
},
{
"epoch": 1.1176470588235294,
"grad_norm": 0.6682126794134292,
"learning_rate": 1.9784646881086327e-05,
"loss": 1.0743,
"step": 437
},
{
"epoch": 1.1202046035805626,
"grad_norm": 0.5551640593749172,
"learning_rate": 1.9782006077882282e-05,
"loss": 1.0861,
"step": 438
},
{
"epoch": 1.1227621483375958,
"grad_norm": 0.3278866812808205,
"learning_rate": 1.9779349359965966e-05,
"loss": 1.1069,
"step": 439
},
{
"epoch": 1.1253196930946292,
"grad_norm": 0.38591224008325814,
"learning_rate": 1.9776676731659695e-05,
"loss": 1.0849,
"step": 440
},
{
"epoch": 1.1278772378516624,
"grad_norm": 0.35719651550677206,
"learning_rate": 1.977398819731167e-05,
"loss": 1.1053,
"step": 441
},
{
"epoch": 1.1304347826086956,
"grad_norm": 0.4232965403621678,
"learning_rate": 1.9771283761295966e-05,
"loss": 1.0848,
"step": 442
},
{
"epoch": 1.132992327365729,
"grad_norm": 0.2697343671368354,
"learning_rate": 1.9768563428012536e-05,
"loss": 1.1091,
"step": 443
},
{
"epoch": 1.1355498721227621,
"grad_norm": 0.3193367309932036,
"learning_rate": 1.9765827201887183e-05,
"loss": 1.0767,
"step": 444
},
{
"epoch": 1.1381074168797953,
"grad_norm": 0.36846576847881124,
"learning_rate": 1.9763075087371583e-05,
"loss": 1.0996,
"step": 445
},
{
"epoch": 1.1406649616368287,
"grad_norm": 0.31668666427159936,
"learning_rate": 1.9760307088943254e-05,
"loss": 1.0713,
"step": 446
},
{
"epoch": 1.143222506393862,
"grad_norm": 0.35150116619841826,
"learning_rate": 1.9757523211105555e-05,
"loss": 1.0564,
"step": 447
},
{
"epoch": 1.145780051150895,
"grad_norm": 0.429831549745095,
"learning_rate": 1.975472345838768e-05,
"loss": 1.0907,
"step": 448
},
{
"epoch": 1.1483375959079285,
"grad_norm": 0.44872565734771747,
"learning_rate": 1.9751907835344654e-05,
"loss": 1.0817,
"step": 449
},
{
"epoch": 1.1508951406649617,
"grad_norm": 0.33913236381932554,
"learning_rate": 1.9749076346557318e-05,
"loss": 1.129,
"step": 450
},
{
"epoch": 1.1534526854219949,
"grad_norm": 0.33115586128973074,
"learning_rate": 1.9746228996632326e-05,
"loss": 1.1034,
"step": 451
},
{
"epoch": 1.156010230179028,
"grad_norm": 0.3057185791661933,
"learning_rate": 1.974336579020214e-05,
"loss": 1.1076,
"step": 452
},
{
"epoch": 1.1585677749360614,
"grad_norm": 0.43316526036175457,
"learning_rate": 1.9740486731925022e-05,
"loss": 1.1224,
"step": 453
},
{
"epoch": 1.1611253196930946,
"grad_norm": 0.5066112837446138,
"learning_rate": 1.9737591826485013e-05,
"loss": 1.0962,
"step": 454
},
{
"epoch": 1.1636828644501278,
"grad_norm": 0.4014502906289108,
"learning_rate": 1.9734681078591943e-05,
"loss": 1.0905,
"step": 455
},
{
"epoch": 1.1662404092071612,
"grad_norm": 0.30247128311625804,
"learning_rate": 1.9731754492981423e-05,
"loss": 1.0812,
"step": 456
},
{
"epoch": 1.1687979539641944,
"grad_norm": 0.31145252945008656,
"learning_rate": 1.9728812074414822e-05,
"loss": 1.0729,
"step": 457
},
{
"epoch": 1.1713554987212276,
"grad_norm": 0.33968915375934183,
"learning_rate": 1.9725853827679266e-05,
"loss": 1.078,
"step": 458
},
{
"epoch": 1.1739130434782608,
"grad_norm": 0.27618072861680876,
"learning_rate": 1.9722879757587647e-05,
"loss": 1.0864,
"step": 459
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.28234315821124384,
"learning_rate": 1.9719889868978582e-05,
"loss": 1.1135,
"step": 460
},
{
"epoch": 1.1790281329923273,
"grad_norm": 0.29884726287169866,
"learning_rate": 1.971688416671644e-05,
"loss": 1.1363,
"step": 461
},
{
"epoch": 1.1815856777493605,
"grad_norm": 0.27600448666706423,
"learning_rate": 1.9713862655691302e-05,
"loss": 1.0791,
"step": 462
},
{
"epoch": 1.184143222506394,
"grad_norm": 0.2803813788615088,
"learning_rate": 1.971082534081899e-05,
"loss": 1.0718,
"step": 463
},
{
"epoch": 1.186700767263427,
"grad_norm": 0.2696501099289663,
"learning_rate": 1.970777222704101e-05,
"loss": 1.0961,
"step": 464
},
{
"epoch": 1.1892583120204603,
"grad_norm": 0.3010556872116562,
"learning_rate": 1.97047033193246e-05,
"loss": 1.1038,
"step": 465
},
{
"epoch": 1.1918158567774937,
"grad_norm": 0.28235325514025905,
"learning_rate": 1.970161862266268e-05,
"loss": 1.1054,
"step": 466
},
{
"epoch": 1.1943734015345269,
"grad_norm": 0.28808186970685423,
"learning_rate": 1.969851814207385e-05,
"loss": 1.0807,
"step": 467
},
{
"epoch": 1.19693094629156,
"grad_norm": 0.33258411208957883,
"learning_rate": 1.9695401882602406e-05,
"loss": 1.1296,
"step": 468
},
{
"epoch": 1.1994884910485935,
"grad_norm": 0.3318703383183081,
"learning_rate": 1.9692269849318303e-05,
"loss": 1.0936,
"step": 469
},
{
"epoch": 1.2020460358056266,
"grad_norm": 0.30178464518160203,
"learning_rate": 1.9689122047317166e-05,
"loss": 1.1155,
"step": 470
},
{
"epoch": 1.2046035805626598,
"grad_norm": 0.30521273043475255,
"learning_rate": 1.968595848172027e-05,
"loss": 1.0896,
"step": 471
},
{
"epoch": 1.207161125319693,
"grad_norm": 0.34614634138914463,
"learning_rate": 1.968277915767454e-05,
"loss": 1.0452,
"step": 472
},
{
"epoch": 1.2097186700767264,
"grad_norm": 0.32741746531886684,
"learning_rate": 1.9679584080352537e-05,
"loss": 1.1045,
"step": 473
},
{
"epoch": 1.2122762148337596,
"grad_norm": 0.2615489309131341,
"learning_rate": 1.967637325495245e-05,
"loss": 1.0855,
"step": 474
},
{
"epoch": 1.2148337595907928,
"grad_norm": 0.27476592859150684,
"learning_rate": 1.9673146686698093e-05,
"loss": 1.1001,
"step": 475
},
{
"epoch": 1.2173913043478262,
"grad_norm": 0.3421071933190777,
"learning_rate": 1.9669904380838892e-05,
"loss": 1.0729,
"step": 476
},
{
"epoch": 1.2199488491048593,
"grad_norm": 0.3598257915245131,
"learning_rate": 1.966664634264987e-05,
"loss": 1.1242,
"step": 477
},
{
"epoch": 1.2225063938618925,
"grad_norm": 0.32107570559715254,
"learning_rate": 1.9663372577431663e-05,
"loss": 1.1087,
"step": 478
},
{
"epoch": 1.2250639386189257,
"grad_norm": 0.341209086018264,
"learning_rate": 1.966008309051047e-05,
"loss": 1.1167,
"step": 479
},
{
"epoch": 1.227621483375959,
"grad_norm": 0.29733249941263845,
"learning_rate": 1.965677788723809e-05,
"loss": 1.07,
"step": 480
},
{
"epoch": 1.2301790281329923,
"grad_norm": 0.26502862394927407,
"learning_rate": 1.9653456972991877e-05,
"loss": 1.0775,
"step": 481
},
{
"epoch": 1.2327365728900257,
"grad_norm": 0.28986896788872485,
"learning_rate": 1.965012035317475e-05,
"loss": 1.0967,
"step": 482
},
{
"epoch": 1.2352941176470589,
"grad_norm": 0.33295845795202056,
"learning_rate": 1.9646768033215183e-05,
"loss": 1.0879,
"step": 483
},
{
"epoch": 1.237851662404092,
"grad_norm": 0.3705619524001342,
"learning_rate": 1.9643400018567195e-05,
"loss": 1.1019,
"step": 484
},
{
"epoch": 1.2404092071611252,
"grad_norm": 0.3266347911273673,
"learning_rate": 1.9640016314710323e-05,
"loss": 1.1084,
"step": 485
},
{
"epoch": 1.2429667519181586,
"grad_norm": 0.3761069051897771,
"learning_rate": 1.9636616927149655e-05,
"loss": 1.1029,
"step": 486
},
{
"epoch": 1.2455242966751918,
"grad_norm": 0.2621662577070755,
"learning_rate": 1.9633201861415773e-05,
"loss": 1.0735,
"step": 487
},
{
"epoch": 1.248081841432225,
"grad_norm": 0.266376960810325,
"learning_rate": 1.9629771123064784e-05,
"loss": 1.0948,
"step": 488
},
{
"epoch": 1.2506393861892584,
"grad_norm": 0.3408438115021644,
"learning_rate": 1.9626324717678275e-05,
"loss": 1.0984,
"step": 489
},
{
"epoch": 1.2531969309462916,
"grad_norm": 0.3255066954002719,
"learning_rate": 1.962286265086334e-05,
"loss": 1.1213,
"step": 490
},
{
"epoch": 1.2557544757033248,
"grad_norm": 0.3765758476751633,
"learning_rate": 1.961938492825254e-05,
"loss": 1.0909,
"step": 491
},
{
"epoch": 1.258312020460358,
"grad_norm": 0.3109670040308706,
"learning_rate": 1.9615891555503914e-05,
"loss": 1.1164,
"step": 492
},
{
"epoch": 1.2608695652173914,
"grad_norm": 0.28523527744811616,
"learning_rate": 1.961238253830096e-05,
"loss": 1.0834,
"step": 493
},
{
"epoch": 1.2634271099744245,
"grad_norm": 0.3472113617037474,
"learning_rate": 1.9608857882352636e-05,
"loss": 1.0823,
"step": 494
},
{
"epoch": 1.265984654731458,
"grad_norm": 0.45214384592951995,
"learning_rate": 1.9605317593393326e-05,
"loss": 1.1084,
"step": 495
},
{
"epoch": 1.2685421994884911,
"grad_norm": 0.3401855972965097,
"learning_rate": 1.9601761677182868e-05,
"loss": 1.0978,
"step": 496
},
{
"epoch": 1.2710997442455243,
"grad_norm": 0.3025957486994177,
"learning_rate": 1.959819013950651e-05,
"loss": 1.0889,
"step": 497
},
{
"epoch": 1.2736572890025575,
"grad_norm": 0.29140422941812544,
"learning_rate": 1.9594602986174923e-05,
"loss": 1.0792,
"step": 498
},
{
"epoch": 1.2762148337595907,
"grad_norm": 0.3620688439157377,
"learning_rate": 1.959100022302418e-05,
"loss": 1.092,
"step": 499
},
{
"epoch": 1.278772378516624,
"grad_norm": 0.3498507983384518,
"learning_rate": 1.9587381855915754e-05,
"loss": 1.0652,
"step": 500
},
{
"epoch": 1.2813299232736572,
"grad_norm": 0.34633148833870603,
"learning_rate": 1.95837478907365e-05,
"loss": 1.0859,
"step": 501
},
{
"epoch": 1.2838874680306906,
"grad_norm": 0.28466962730903933,
"learning_rate": 1.958009833339865e-05,
"loss": 1.0912,
"step": 502
},
{
"epoch": 1.2864450127877238,
"grad_norm": 0.26890207030009217,
"learning_rate": 1.9576433189839807e-05,
"loss": 1.1088,
"step": 503
},
{
"epoch": 1.289002557544757,
"grad_norm": 0.273263645379487,
"learning_rate": 1.957275246602293e-05,
"loss": 1.0837,
"step": 504
},
{
"epoch": 1.2915601023017902,
"grad_norm": 0.2716148540613851,
"learning_rate": 1.9569056167936332e-05,
"loss": 1.105,
"step": 505
},
{
"epoch": 1.2941176470588236,
"grad_norm": 0.24370260465489227,
"learning_rate": 1.956534430159365e-05,
"loss": 1.0726,
"step": 506
},
{
"epoch": 1.2966751918158568,
"grad_norm": 0.2620730046771573,
"learning_rate": 1.9561616873033867e-05,
"loss": 1.1079,
"step": 507
},
{
"epoch": 1.29923273657289,
"grad_norm": 0.3135544306790673,
"learning_rate": 1.955787388832127e-05,
"loss": 1.0697,
"step": 508
},
{
"epoch": 1.3017902813299234,
"grad_norm": 0.26135639483849105,
"learning_rate": 1.9554115353545464e-05,
"loss": 1.1016,
"step": 509
},
{
"epoch": 1.3043478260869565,
"grad_norm": 0.25771344651987327,
"learning_rate": 1.9550341274821348e-05,
"loss": 1.0727,
"step": 510
},
{
"epoch": 1.3069053708439897,
"grad_norm": 0.3167223084832456,
"learning_rate": 1.9546551658289113e-05,
"loss": 1.0792,
"step": 511
},
{
"epoch": 1.309462915601023,
"grad_norm": 0.37857845074967256,
"learning_rate": 1.954274651011423e-05,
"loss": 1.1143,
"step": 512
},
{
"epoch": 1.3120204603580563,
"grad_norm": 0.2580494189739856,
"learning_rate": 1.9538925836487436e-05,
"loss": 1.0674,
"step": 513
},
{
"epoch": 1.3145780051150895,
"grad_norm": 0.39297270925108346,
"learning_rate": 1.953508964362473e-05,
"loss": 1.0885,
"step": 514
},
{
"epoch": 1.317135549872123,
"grad_norm": 0.4568937813346712,
"learning_rate": 1.9531237937767352e-05,
"loss": 1.0807,
"step": 515
},
{
"epoch": 1.319693094629156,
"grad_norm": 0.4182414922758871,
"learning_rate": 1.9527370725181793e-05,
"loss": 1.0766,
"step": 516
},
{
"epoch": 1.3222506393861893,
"grad_norm": 0.4402863172879326,
"learning_rate": 1.9523488012159762e-05,
"loss": 1.0712,
"step": 517
},
{
"epoch": 1.3248081841432224,
"grad_norm": 0.3810424193074309,
"learning_rate": 1.9519589805018187e-05,
"loss": 1.0888,
"step": 518
},
{
"epoch": 1.3273657289002558,
"grad_norm": 0.4051938816832732,
"learning_rate": 1.951567611009922e-05,
"loss": 1.0801,
"step": 519
},
{
"epoch": 1.329923273657289,
"grad_norm": 0.3260440045944625,
"learning_rate": 1.9511746933770186e-05,
"loss": 1.1149,
"step": 520
},
{
"epoch": 1.3324808184143222,
"grad_norm": 0.31554258651135036,
"learning_rate": 1.9507802282423612e-05,
"loss": 1.1202,
"step": 521
},
{
"epoch": 1.3350383631713556,
"grad_norm": 0.2622342243824476,
"learning_rate": 1.9503842162477205e-05,
"loss": 1.1006,
"step": 522
},
{
"epoch": 1.3375959079283888,
"grad_norm": 0.3015423536266443,
"learning_rate": 1.9499866580373826e-05,
"loss": 1.0873,
"step": 523
},
{
"epoch": 1.340153452685422,
"grad_norm": 0.3920165036339574,
"learning_rate": 1.94958755425815e-05,
"loss": 1.1154,
"step": 524
},
{
"epoch": 1.3427109974424551,
"grad_norm": 0.2769409471650046,
"learning_rate": 1.9491869055593392e-05,
"loss": 1.0867,
"step": 525
},
{
"epoch": 1.3452685421994885,
"grad_norm": 0.30161940340621723,
"learning_rate": 1.9487847125927814e-05,
"loss": 1.1126,
"step": 526
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.41990580701086677,
"learning_rate": 1.948380976012819e-05,
"loss": 1.0625,
"step": 527
},
{
"epoch": 1.350383631713555,
"grad_norm": 0.3940286196901995,
"learning_rate": 1.9479756964763062e-05,
"loss": 1.1262,
"step": 528
},
{
"epoch": 1.3529411764705883,
"grad_norm": 0.3683443524857737,
"learning_rate": 1.9475688746426075e-05,
"loss": 1.0865,
"step": 529
},
{
"epoch": 1.3554987212276215,
"grad_norm": 0.2675607272032647,
"learning_rate": 1.9471605111735964e-05,
"loss": 1.0594,
"step": 530
},
{
"epoch": 1.3580562659846547,
"grad_norm": 0.30194225210114733,
"learning_rate": 1.9467506067336554e-05,
"loss": 1.0955,
"step": 531
},
{
"epoch": 1.3606138107416879,
"grad_norm": 0.32576735510414695,
"learning_rate": 1.946339161989672e-05,
"loss": 1.0824,
"step": 532
},
{
"epoch": 1.3631713554987213,
"grad_norm": 0.3598150497292756,
"learning_rate": 1.9459261776110426e-05,
"loss": 1.1215,
"step": 533
},
{
"epoch": 1.3657289002557544,
"grad_norm": 0.30585802865605916,
"learning_rate": 1.945511654269666e-05,
"loss": 1.086,
"step": 534
},
{
"epoch": 1.3682864450127878,
"grad_norm": 0.2832294529242309,
"learning_rate": 1.945095592639946e-05,
"loss": 1.0992,
"step": 535
},
{
"epoch": 1.370843989769821,
"grad_norm": 0.29056128095513195,
"learning_rate": 1.944677993398789e-05,
"loss": 1.1311,
"step": 536
},
{
"epoch": 1.3734015345268542,
"grad_norm": 0.2598885076655647,
"learning_rate": 1.944258857225603e-05,
"loss": 1.0869,
"step": 537
},
{
"epoch": 1.3759590792838874,
"grad_norm": 0.29819735030908995,
"learning_rate": 1.943838184802296e-05,
"loss": 1.1034,
"step": 538
},
{
"epoch": 1.3785166240409208,
"grad_norm": 0.27354562935410204,
"learning_rate": 1.9434159768132762e-05,
"loss": 1.0834,
"step": 539
},
{
"epoch": 1.381074168797954,
"grad_norm": 0.3164865864885613,
"learning_rate": 1.9429922339454486e-05,
"loss": 1.0952,
"step": 540
},
{
"epoch": 1.3836317135549872,
"grad_norm": 0.34458030079305596,
"learning_rate": 1.9425669568882175e-05,
"loss": 1.1195,
"step": 541
},
{
"epoch": 1.3861892583120206,
"grad_norm": 0.2973996932273863,
"learning_rate": 1.942140146333481e-05,
"loss": 1.1082,
"step": 542
},
{
"epoch": 1.3887468030690537,
"grad_norm": 0.41583952226086746,
"learning_rate": 1.9417118029756342e-05,
"loss": 1.0664,
"step": 543
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.33101469656406096,
"learning_rate": 1.9412819275115648e-05,
"loss": 1.087,
"step": 544
},
{
"epoch": 1.39386189258312,
"grad_norm": 0.2709972180594455,
"learning_rate": 1.9408505206406526e-05,
"loss": 1.078,
"step": 545
},
{
"epoch": 1.3964194373401535,
"grad_norm": 0.3358832525629651,
"learning_rate": 1.9404175830647703e-05,
"loss": 1.0549,
"step": 546
},
{
"epoch": 1.3989769820971867,
"grad_norm": 0.2987798463061033,
"learning_rate": 1.93998311548828e-05,
"loss": 1.0946,
"step": 547
},
{
"epoch": 1.40153452685422,
"grad_norm": 0.3337061384486843,
"learning_rate": 1.939547118618033e-05,
"loss": 1.0898,
"step": 548
},
{
"epoch": 1.4040920716112533,
"grad_norm": 0.3217064113312768,
"learning_rate": 1.9391095931633694e-05,
"loss": 1.1098,
"step": 549
},
{
"epoch": 1.4066496163682864,
"grad_norm": 0.2752108304141071,
"learning_rate": 1.9386705398361156e-05,
"loss": 1.0469,
"step": 550
},
{
"epoch": 1.4092071611253196,
"grad_norm": 0.25580623137423647,
"learning_rate": 1.938229959350584e-05,
"loss": 1.0616,
"step": 551
},
{
"epoch": 1.4117647058823528,
"grad_norm": 0.3326332518112022,
"learning_rate": 1.937787852423571e-05,
"loss": 1.1083,
"step": 552
},
{
"epoch": 1.4143222506393862,
"grad_norm": 0.28662569595039195,
"learning_rate": 1.937344219774358e-05,
"loss": 1.0908,
"step": 553
},
{
"epoch": 1.4168797953964194,
"grad_norm": 0.27173135593182157,
"learning_rate": 1.9368990621247062e-05,
"loss": 1.102,
"step": 554
},
{
"epoch": 1.4194373401534528,
"grad_norm": 0.2468084134139675,
"learning_rate": 1.9364523801988606e-05,
"loss": 1.1147,
"step": 555
},
{
"epoch": 1.421994884910486,
"grad_norm": 0.2709546209917836,
"learning_rate": 1.9360041747235437e-05,
"loss": 1.0962,
"step": 556
},
{
"epoch": 1.4245524296675192,
"grad_norm": 0.2653203619472685,
"learning_rate": 1.9355544464279587e-05,
"loss": 1.0864,
"step": 557
},
{
"epoch": 1.4271099744245523,
"grad_norm": 0.28467968268797966,
"learning_rate": 1.9351031960437848e-05,
"loss": 1.0747,
"step": 558
},
{
"epoch": 1.4296675191815857,
"grad_norm": 0.31847968792917525,
"learning_rate": 1.934650424305178e-05,
"loss": 1.0731,
"step": 559
},
{
"epoch": 1.432225063938619,
"grad_norm": 0.3091639351747145,
"learning_rate": 1.9341961319487704e-05,
"loss": 1.0598,
"step": 560
},
{
"epoch": 1.434782608695652,
"grad_norm": 0.26120102379692217,
"learning_rate": 1.9337403197136663e-05,
"loss": 1.0712,
"step": 561
},
{
"epoch": 1.4373401534526855,
"grad_norm": 0.283165316308832,
"learning_rate": 1.9332829883414444e-05,
"loss": 1.0883,
"step": 562
},
{
"epoch": 1.4398976982097187,
"grad_norm": 0.2767794060421261,
"learning_rate": 1.932824138576154e-05,
"loss": 1.1141,
"step": 563
},
{
"epoch": 1.4424552429667519,
"grad_norm": 0.3027787955580307,
"learning_rate": 1.9323637711643147e-05,
"loss": 1.1109,
"step": 564
},
{
"epoch": 1.445012787723785,
"grad_norm": 0.32071961002527666,
"learning_rate": 1.9319018868549165e-05,
"loss": 1.1192,
"step": 565
},
{
"epoch": 1.4475703324808185,
"grad_norm": 0.33467873672280385,
"learning_rate": 1.931438486399415e-05,
"loss": 1.0817,
"step": 566
},
{
"epoch": 1.4501278772378516,
"grad_norm": 0.30569240173237483,
"learning_rate": 1.930973570551735e-05,
"loss": 1.0607,
"step": 567
},
{
"epoch": 1.452685421994885,
"grad_norm": 0.298726423982734,
"learning_rate": 1.9305071400682644e-05,
"loss": 1.0914,
"step": 568
},
{
"epoch": 1.4552429667519182,
"grad_norm": 0.3038529339878212,
"learning_rate": 1.9300391957078564e-05,
"loss": 1.0834,
"step": 569
},
{
"epoch": 1.4578005115089514,
"grad_norm": 0.30563450154931243,
"learning_rate": 1.9295697382318286e-05,
"loss": 1.0733,
"step": 570
},
{
"epoch": 1.4603580562659846,
"grad_norm": 0.3808106030288731,
"learning_rate": 1.9290987684039576e-05,
"loss": 1.0955,
"step": 571
},
{
"epoch": 1.4629156010230178,
"grad_norm": 0.32964679230942334,
"learning_rate": 1.9286262869904827e-05,
"loss": 1.0977,
"step": 572
},
{
"epoch": 1.4654731457800512,
"grad_norm": 0.3576744350781661,
"learning_rate": 1.928152294760101e-05,
"loss": 1.0826,
"step": 573
},
{
"epoch": 1.4680306905370843,
"grad_norm": 0.3442477800849191,
"learning_rate": 1.9276767924839687e-05,
"loss": 1.0693,
"step": 574
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.4177409226360097,
"learning_rate": 1.927199780935698e-05,
"loss": 1.1031,
"step": 575
},
{
"epoch": 1.473145780051151,
"grad_norm": 0.5022744214347684,
"learning_rate": 1.926721260891357e-05,
"loss": 1.1081,
"step": 576
},
{
"epoch": 1.4757033248081841,
"grad_norm": 0.5089458782552098,
"learning_rate": 1.9262412331294677e-05,
"loss": 1.0984,
"step": 577
},
{
"epoch": 1.4782608695652173,
"grad_norm": 0.28913442828013464,
"learning_rate": 1.9257596984310055e-05,
"loss": 1.0907,
"step": 578
},
{
"epoch": 1.4808184143222507,
"grad_norm": 0.36385701502207274,
"learning_rate": 1.925276657579397e-05,
"loss": 1.0667,
"step": 579
},
{
"epoch": 1.4833759590792839,
"grad_norm": 0.39854637256040343,
"learning_rate": 1.9247921113605197e-05,
"loss": 1.0814,
"step": 580
},
{
"epoch": 1.485933503836317,
"grad_norm": 0.3421920326108303,
"learning_rate": 1.9243060605626995e-05,
"loss": 1.0984,
"step": 581
},
{
"epoch": 1.4884910485933505,
"grad_norm": 0.2806970145004491,
"learning_rate": 1.9238185059767116e-05,
"loss": 1.0903,
"step": 582
},
{
"epoch": 1.4910485933503836,
"grad_norm": 0.458875989536999,
"learning_rate": 1.9233294483957758e-05,
"loss": 1.1135,
"step": 583
},
{
"epoch": 1.4936061381074168,
"grad_norm": 0.5204446417118193,
"learning_rate": 1.922838888615559e-05,
"loss": 1.1228,
"step": 584
},
{
"epoch": 1.49616368286445,
"grad_norm": 0.4574878580551403,
"learning_rate": 1.922346827434171e-05,
"loss": 1.0595,
"step": 585
},
{
"epoch": 1.4987212276214834,
"grad_norm": 0.26814443608722427,
"learning_rate": 1.921853265652164e-05,
"loss": 1.0742,
"step": 586
},
{
"epoch": 1.5012787723785166,
"grad_norm": 0.4321843380909753,
"learning_rate": 1.9213582040725333e-05,
"loss": 1.0823,
"step": 587
},
{
"epoch": 1.50383631713555,
"grad_norm": 0.3998584041466985,
"learning_rate": 1.9208616435007124e-05,
"loss": 1.1113,
"step": 588
},
{
"epoch": 1.5063938618925832,
"grad_norm": 0.36340166424292447,
"learning_rate": 1.9203635847445743e-05,
"loss": 1.0495,
"step": 589
},
{
"epoch": 1.5089514066496164,
"grad_norm": 0.30341924814307153,
"learning_rate": 1.9198640286144296e-05,
"loss": 1.0778,
"step": 590
},
{
"epoch": 1.5115089514066495,
"grad_norm": 0.3549252043532506,
"learning_rate": 1.9193629759230252e-05,
"loss": 1.0526,
"step": 591
},
{
"epoch": 1.5140664961636827,
"grad_norm": 0.3706707482911529,
"learning_rate": 1.9188604274855417e-05,
"loss": 1.1082,
"step": 592
},
{
"epoch": 1.5166240409207161,
"grad_norm": 0.3221161365565599,
"learning_rate": 1.9183563841195948e-05,
"loss": 1.0358,
"step": 593
},
{
"epoch": 1.5191815856777495,
"grad_norm": 0.35561020647213454,
"learning_rate": 1.917850846645231e-05,
"loss": 1.1016,
"step": 594
},
{
"epoch": 1.5217391304347827,
"grad_norm": 0.3891453948051964,
"learning_rate": 1.917343815884929e-05,
"loss": 1.0723,
"step": 595
},
{
"epoch": 1.5242966751918159,
"grad_norm": 0.293218650160261,
"learning_rate": 1.9168352926635948e-05,
"loss": 1.0842,
"step": 596
},
{
"epoch": 1.526854219948849,
"grad_norm": 0.331624086856979,
"learning_rate": 1.9163252778085646e-05,
"loss": 1.0928,
"step": 597
},
{
"epoch": 1.5294117647058822,
"grad_norm": 0.36005628746389595,
"learning_rate": 1.9158137721496014e-05,
"loss": 1.0954,
"step": 598
},
{
"epoch": 1.5319693094629157,
"grad_norm": 0.25854576697363735,
"learning_rate": 1.9153007765188918e-05,
"loss": 1.0703,
"step": 599
},
{
"epoch": 1.5345268542199488,
"grad_norm": 0.3178892680337157,
"learning_rate": 1.914786291751048e-05,
"loss": 1.1178,
"step": 600
},
{
"epoch": 1.5370843989769822,
"grad_norm": 0.3276728285320476,
"learning_rate": 1.9142703186831044e-05,
"loss": 1.0711,
"step": 601
},
{
"epoch": 1.5396419437340154,
"grad_norm": 0.34402306746609335,
"learning_rate": 1.9137528581545172e-05,
"loss": 1.0669,
"step": 602
},
{
"epoch": 1.5421994884910486,
"grad_norm": 0.3658697294408855,
"learning_rate": 1.9132339110071623e-05,
"loss": 1.0738,
"step": 603
},
{
"epoch": 1.5447570332480818,
"grad_norm": 0.33272997926321957,
"learning_rate": 1.9127134780853343e-05,
"loss": 1.0891,
"step": 604
},
{
"epoch": 1.547314578005115,
"grad_norm": 0.26256059097959605,
"learning_rate": 1.9121915602357447e-05,
"loss": 1.0752,
"step": 605
},
{
"epoch": 1.5498721227621484,
"grad_norm": 0.29698212652722755,
"learning_rate": 1.9116681583075215e-05,
"loss": 1.0531,
"step": 606
},
{
"epoch": 1.5524296675191815,
"grad_norm": 0.3308461220455405,
"learning_rate": 1.9111432731522067e-05,
"loss": 1.0775,
"step": 607
},
{
"epoch": 1.554987212276215,
"grad_norm": 0.28434303668023103,
"learning_rate": 1.910616905623756e-05,
"loss": 1.0989,
"step": 608
},
{
"epoch": 1.5575447570332481,
"grad_norm": 0.2949610693246568,
"learning_rate": 1.910089056578536e-05,
"loss": 1.0942,
"step": 609
},
{
"epoch": 1.5601023017902813,
"grad_norm": 0.26028511630293355,
"learning_rate": 1.9095597268753243e-05,
"loss": 1.0639,
"step": 610
},
{
"epoch": 1.5626598465473145,
"grad_norm": 0.2736816450940113,
"learning_rate": 1.9090289173753077e-05,
"loss": 1.1013,
"step": 611
},
{
"epoch": 1.5652173913043477,
"grad_norm": 0.24169212652369965,
"learning_rate": 1.908496628942079e-05,
"loss": 1.0904,
"step": 612
},
{
"epoch": 1.567774936061381,
"grad_norm": 0.2790060046832418,
"learning_rate": 1.907962862441639e-05,
"loss": 1.0789,
"step": 613
},
{
"epoch": 1.5703324808184145,
"grad_norm": 0.25148763709880523,
"learning_rate": 1.9074276187423925e-05,
"loss": 1.083,
"step": 614
},
{
"epoch": 1.5728900255754477,
"grad_norm": 0.260089635225582,
"learning_rate": 1.906890898715147e-05,
"loss": 1.1052,
"step": 615
},
{
"epoch": 1.5754475703324808,
"grad_norm": 0.24239290344853867,
"learning_rate": 1.9063527032331128e-05,
"loss": 1.0587,
"step": 616
},
{
"epoch": 1.578005115089514,
"grad_norm": 0.31033949728422483,
"learning_rate": 1.9058130331719002e-05,
"loss": 1.0906,
"step": 617
},
{
"epoch": 1.5805626598465472,
"grad_norm": 0.29694640873919886,
"learning_rate": 1.9052718894095183e-05,
"loss": 1.0828,
"step": 618
},
{
"epoch": 1.5831202046035806,
"grad_norm": 0.268458744450183,
"learning_rate": 1.904729272826375e-05,
"loss": 1.0697,
"step": 619
},
{
"epoch": 1.5856777493606138,
"grad_norm": 0.3328538025026265,
"learning_rate": 1.9041851843052727e-05,
"loss": 1.0556,
"step": 620
},
{
"epoch": 1.5882352941176472,
"grad_norm": 0.4354576423430095,
"learning_rate": 1.90363962473141e-05,
"loss": 1.0888,
"step": 621
},
{
"epoch": 1.5907928388746804,
"grad_norm": 0.4488970201166202,
"learning_rate": 1.9030925949923777e-05,
"loss": 1.0991,
"step": 622
},
{
"epoch": 1.5933503836317136,
"grad_norm": 0.30850235477610843,
"learning_rate": 1.9025440959781593e-05,
"loss": 1.0721,
"step": 623
},
{
"epoch": 1.5959079283887467,
"grad_norm": 0.24306011770668454,
"learning_rate": 1.9019941285811284e-05,
"loss": 1.1146,
"step": 624
},
{
"epoch": 1.59846547314578,
"grad_norm": 0.31927732953474425,
"learning_rate": 1.9014426936960477e-05,
"loss": 1.1386,
"step": 625
},
{
"epoch": 1.6010230179028133,
"grad_norm": 0.30395309199867215,
"learning_rate": 1.900889792220067e-05,
"loss": 1.0651,
"step": 626
},
{
"epoch": 1.6035805626598465,
"grad_norm": 0.2641664347228699,
"learning_rate": 1.9003354250527225e-05,
"loss": 1.0737,
"step": 627
},
{
"epoch": 1.60613810741688,
"grad_norm": 0.2541673904415416,
"learning_rate": 1.899779593095935e-05,
"loss": 1.1093,
"step": 628
},
{
"epoch": 1.608695652173913,
"grad_norm": 0.248114384702292,
"learning_rate": 1.8992222972540083e-05,
"loss": 1.0631,
"step": 629
},
{
"epoch": 1.6112531969309463,
"grad_norm": 0.27098670487834897,
"learning_rate": 1.8986635384336275e-05,
"loss": 1.0684,
"step": 630
},
{
"epoch": 1.6138107416879794,
"grad_norm": 0.2707047290641469,
"learning_rate": 1.8981033175438593e-05,
"loss": 1.0793,
"step": 631
},
{
"epoch": 1.6163682864450126,
"grad_norm": 0.2248022175811438,
"learning_rate": 1.897541635496147e-05,
"loss": 1.0741,
"step": 632
},
{
"epoch": 1.618925831202046,
"grad_norm": 0.33046089699268805,
"learning_rate": 1.896978493204313e-05,
"loss": 1.0536,
"step": 633
},
{
"epoch": 1.6214833759590794,
"grad_norm": 0.2897890506100947,
"learning_rate": 1.896413891584554e-05,
"loss": 1.1041,
"step": 634
},
{
"epoch": 1.6240409207161126,
"grad_norm": 0.24423929651462964,
"learning_rate": 1.8958478315554414e-05,
"loss": 1.0554,
"step": 635
},
{
"epoch": 1.6265984654731458,
"grad_norm": 0.2824637389915044,
"learning_rate": 1.8952803140379198e-05,
"loss": 1.105,
"step": 636
},
{
"epoch": 1.629156010230179,
"grad_norm": 0.34172319194434536,
"learning_rate": 1.894711339955305e-05,
"loss": 1.0966,
"step": 637
},
{
"epoch": 1.6317135549872122,
"grad_norm": 0.2986624598202099,
"learning_rate": 1.8941409102332818e-05,
"loss": 1.0801,
"step": 638
},
{
"epoch": 1.6342710997442456,
"grad_norm": 0.35330551163337126,
"learning_rate": 1.893569025799904e-05,
"loss": 1.1168,
"step": 639
},
{
"epoch": 1.6368286445012787,
"grad_norm": 0.37997527154753075,
"learning_rate": 1.8929956875855913e-05,
"loss": 1.044,
"step": 640
},
{
"epoch": 1.6393861892583121,
"grad_norm": 0.3987670557181093,
"learning_rate": 1.89242089652313e-05,
"loss": 1.0678,
"step": 641
},
{
"epoch": 1.6419437340153453,
"grad_norm": 0.4164983853962145,
"learning_rate": 1.8918446535476683e-05,
"loss": 1.0713,
"step": 642
},
{
"epoch": 1.6445012787723785,
"grad_norm": 0.36634278907361967,
"learning_rate": 1.8912669595967182e-05,
"loss": 1.0845,
"step": 643
},
{
"epoch": 1.6470588235294117,
"grad_norm": 0.3377854105852521,
"learning_rate": 1.890687815610151e-05,
"loss": 1.1325,
"step": 644
},
{
"epoch": 1.6496163682864449,
"grad_norm": 0.2921364211079459,
"learning_rate": 1.8901072225301983e-05,
"loss": 1.0417,
"step": 645
},
{
"epoch": 1.6521739130434783,
"grad_norm": 0.40803324585389733,
"learning_rate": 1.8895251813014486e-05,
"loss": 1.0985,
"step": 646
},
{
"epoch": 1.6547314578005117,
"grad_norm": 0.4777584379650545,
"learning_rate": 1.8889416928708465e-05,
"loss": 1.0579,
"step": 647
},
{
"epoch": 1.6572890025575449,
"grad_norm": 0.4575863335013247,
"learning_rate": 1.8883567581876913e-05,
"loss": 1.075,
"step": 648
},
{
"epoch": 1.659846547314578,
"grad_norm": 0.44868767506108537,
"learning_rate": 1.887770378203635e-05,
"loss": 1.082,
"step": 649
},
{
"epoch": 1.6624040920716112,
"grad_norm": 0.3990360823870846,
"learning_rate": 1.8871825538726815e-05,
"loss": 1.0618,
"step": 650
},
{
"epoch": 1.6649616368286444,
"grad_norm": 0.384455268117493,
"learning_rate": 1.8865932861511836e-05,
"loss": 1.0883,
"step": 651
},
{
"epoch": 1.6675191815856778,
"grad_norm": 0.4308655650983798,
"learning_rate": 1.8860025759978436e-05,
"loss": 1.1136,
"step": 652
},
{
"epoch": 1.670076726342711,
"grad_norm": 0.5161027640726775,
"learning_rate": 1.8854104243737096e-05,
"loss": 1.0876,
"step": 653
},
{
"epoch": 1.6726342710997444,
"grad_norm": 0.5710337903727111,
"learning_rate": 1.8848168322421756e-05,
"loss": 1.0921,
"step": 654
},
{
"epoch": 1.6751918158567776,
"grad_norm": 0.4680011964164238,
"learning_rate": 1.884221800568979e-05,
"loss": 1.0817,
"step": 655
},
{
"epoch": 1.6777493606138107,
"grad_norm": 0.273509418810932,
"learning_rate": 1.8836253303221985e-05,
"loss": 1.0676,
"step": 656
},
{
"epoch": 1.680306905370844,
"grad_norm": 0.36238937602325755,
"learning_rate": 1.8830274224722544e-05,
"loss": 1.0694,
"step": 657
},
{
"epoch": 1.682864450127877,
"grad_norm": 0.4331370312585361,
"learning_rate": 1.8824280779919055e-05,
"loss": 1.0939,
"step": 658
},
{
"epoch": 1.6854219948849105,
"grad_norm": 0.42161084086226236,
"learning_rate": 1.8818272978562472e-05,
"loss": 1.0949,
"step": 659
},
{
"epoch": 1.6879795396419437,
"grad_norm": 0.42114600096809945,
"learning_rate": 1.8812250830427116e-05,
"loss": 1.1071,
"step": 660
},
{
"epoch": 1.690537084398977,
"grad_norm": 0.2580305989521523,
"learning_rate": 1.8806214345310648e-05,
"loss": 1.0884,
"step": 661
},
{
"epoch": 1.6930946291560103,
"grad_norm": 0.2790098578226022,
"learning_rate": 1.8800163533034048e-05,
"loss": 1.0786,
"step": 662
},
{
"epoch": 1.6956521739130435,
"grad_norm": 0.3952483114126335,
"learning_rate": 1.879409840344161e-05,
"loss": 1.1025,
"step": 663
},
{
"epoch": 1.6982097186700766,
"grad_norm": 0.34837002184241345,
"learning_rate": 1.8788018966400923e-05,
"loss": 1.0862,
"step": 664
},
{
"epoch": 1.7007672634271098,
"grad_norm": 0.23347425632455518,
"learning_rate": 1.878192523180285e-05,
"loss": 1.0903,
"step": 665
},
{
"epoch": 1.7033248081841432,
"grad_norm": 0.258084870513599,
"learning_rate": 1.877581720956151e-05,
"loss": 1.0659,
"step": 666
},
{
"epoch": 1.7058823529411766,
"grad_norm": 0.2955310030807304,
"learning_rate": 1.876969490961428e-05,
"loss": 1.0803,
"step": 667
},
{
"epoch": 1.7084398976982098,
"grad_norm": 0.34485101895191056,
"learning_rate": 1.8763558341921762e-05,
"loss": 1.0729,
"step": 668
},
{
"epoch": 1.710997442455243,
"grad_norm": 0.25932977011662367,
"learning_rate": 1.8757407516467762e-05,
"loss": 1.1017,
"step": 669
},
{
"epoch": 1.7135549872122762,
"grad_norm": 0.23771298856204617,
"learning_rate": 1.8751242443259286e-05,
"loss": 1.0771,
"step": 670
},
{
"epoch": 1.7161125319693094,
"grad_norm": 0.3403000739473665,
"learning_rate": 1.874506313232653e-05,
"loss": 1.0972,
"step": 671
},
{
"epoch": 1.7186700767263428,
"grad_norm": 0.36624614786635146,
"learning_rate": 1.873886959372284e-05,
"loss": 1.0948,
"step": 672
},
{
"epoch": 1.721227621483376,
"grad_norm": 0.23241780598609607,
"learning_rate": 1.8732661837524722e-05,
"loss": 1.0726,
"step": 673
},
{
"epoch": 1.7237851662404093,
"grad_norm": 0.27573330219222747,
"learning_rate": 1.8726439873831803e-05,
"loss": 1.1154,
"step": 674
},
{
"epoch": 1.7263427109974425,
"grad_norm": 0.3289571952505283,
"learning_rate": 1.8720203712766833e-05,
"loss": 1.0855,
"step": 675
},
{
"epoch": 1.7289002557544757,
"grad_norm": 0.26315983835648826,
"learning_rate": 1.8713953364475654e-05,
"loss": 1.0561,
"step": 676
},
{
"epoch": 1.7314578005115089,
"grad_norm": 0.2933737539222408,
"learning_rate": 1.8707688839127187e-05,
"loss": 1.0717,
"step": 677
},
{
"epoch": 1.734015345268542,
"grad_norm": 0.24075336640916348,
"learning_rate": 1.8701410146913427e-05,
"loss": 1.0733,
"step": 678
},
{
"epoch": 1.7365728900255755,
"grad_norm": 0.2969635924636881,
"learning_rate": 1.869511729804942e-05,
"loss": 1.0736,
"step": 679
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.2302120367596696,
"learning_rate": 1.8688810302773225e-05,
"loss": 1.0718,
"step": 680
},
{
"epoch": 1.741687979539642,
"grad_norm": 0.31123990252305606,
"learning_rate": 1.8682489171345942e-05,
"loss": 1.0633,
"step": 681
},
{
"epoch": 1.7442455242966752,
"grad_norm": 0.25671775642481637,
"learning_rate": 1.8676153914051648e-05,
"loss": 1.1055,
"step": 682
},
{
"epoch": 1.7468030690537084,
"grad_norm": 0.2731165902037635,
"learning_rate": 1.866980454119741e-05,
"loss": 1.1019,
"step": 683
},
{
"epoch": 1.7493606138107416,
"grad_norm": 0.29946202186623655,
"learning_rate": 1.8663441063113266e-05,
"loss": 1.0856,
"step": 684
},
{
"epoch": 1.7519181585677748,
"grad_norm": 0.2743108383298565,
"learning_rate": 1.8657063490152193e-05,
"loss": 1.0797,
"step": 685
},
{
"epoch": 1.7544757033248082,
"grad_norm": 0.2910690805954212,
"learning_rate": 1.8650671832690106e-05,
"loss": 1.1068,
"step": 686
},
{
"epoch": 1.7570332480818416,
"grad_norm": 0.25617556691443527,
"learning_rate": 1.864426610112583e-05,
"loss": 1.0801,
"step": 687
},
{
"epoch": 1.7595907928388748,
"grad_norm": 0.2446643852273966,
"learning_rate": 1.8637846305881092e-05,
"loss": 1.0712,
"step": 688
},
{
"epoch": 1.762148337595908,
"grad_norm": 0.24853300895824507,
"learning_rate": 1.8631412457400494e-05,
"loss": 1.0518,
"step": 689
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.2250526521940477,
"learning_rate": 1.862496456615151e-05,
"loss": 1.0802,
"step": 690
},
{
"epoch": 1.7672634271099743,
"grad_norm": 0.23033386861703295,
"learning_rate": 1.861850264262445e-05,
"loss": 1.0921,
"step": 691
},
{
"epoch": 1.7698209718670077,
"grad_norm": 0.22393185289398734,
"learning_rate": 1.8612026697332466e-05,
"loss": 1.0824,
"step": 692
},
{
"epoch": 1.772378516624041,
"grad_norm": 0.24371247518659098,
"learning_rate": 1.860553674081151e-05,
"loss": 1.0958,
"step": 693
},
{
"epoch": 1.7749360613810743,
"grad_norm": 0.21684995978781324,
"learning_rate": 1.859903278362034e-05,
"loss": 1.0511,
"step": 694
},
{
"epoch": 1.7774936061381075,
"grad_norm": 0.24359803588661344,
"learning_rate": 1.8592514836340485e-05,
"loss": 1.064,
"step": 695
},
{
"epoch": 1.7800511508951407,
"grad_norm": 0.2806613621237684,
"learning_rate": 1.8585982909576243e-05,
"loss": 1.0974,
"step": 696
},
{
"epoch": 1.7826086956521738,
"grad_norm": 0.2951317541501585,
"learning_rate": 1.857943701395464e-05,
"loss": 1.0745,
"step": 697
},
{
"epoch": 1.785166240409207,
"grad_norm": 0.2602691127905397,
"learning_rate": 1.857287716012545e-05,
"loss": 1.094,
"step": 698
},
{
"epoch": 1.7877237851662404,
"grad_norm": 0.2878865850607815,
"learning_rate": 1.8566303358761134e-05,
"loss": 1.0764,
"step": 699
},
{
"epoch": 1.7902813299232738,
"grad_norm": 0.25826524614522556,
"learning_rate": 1.8559715620556865e-05,
"loss": 1.095,
"step": 700
},
{
"epoch": 1.792838874680307,
"grad_norm": 0.3113734244197743,
"learning_rate": 1.855311395623048e-05,
"loss": 1.0636,
"step": 701
},
{
"epoch": 1.7953964194373402,
"grad_norm": 0.32545837268145317,
"learning_rate": 1.854649837652247e-05,
"loss": 1.0836,
"step": 702
},
{
"epoch": 1.7979539641943734,
"grad_norm": 0.285984682125429,
"learning_rate": 1.8539868892195972e-05,
"loss": 1.0848,
"step": 703
},
{
"epoch": 1.8005115089514065,
"grad_norm": 0.27758608852953665,
"learning_rate": 1.8533225514036742e-05,
"loss": 1.0663,
"step": 704
},
{
"epoch": 1.80306905370844,
"grad_norm": 0.27148772448252917,
"learning_rate": 1.852656825285314e-05,
"loss": 1.094,
"step": 705
},
{
"epoch": 1.8056265984654731,
"grad_norm": 0.30810009717755804,
"learning_rate": 1.8519897119476115e-05,
"loss": 1.0455,
"step": 706
},
{
"epoch": 1.8081841432225065,
"grad_norm": 0.2763175632842481,
"learning_rate": 1.8513212124759185e-05,
"loss": 1.0525,
"step": 707
},
{
"epoch": 1.8107416879795397,
"grad_norm": 0.2555077301269018,
"learning_rate": 1.8506513279578415e-05,
"loss": 1.0708,
"step": 708
},
{
"epoch": 1.813299232736573,
"grad_norm": 0.2861828394638753,
"learning_rate": 1.849980059483241e-05,
"loss": 1.0269,
"step": 709
},
{
"epoch": 1.815856777493606,
"grad_norm": 0.32694363610851984,
"learning_rate": 1.849307408144229e-05,
"loss": 1.0742,
"step": 710
},
{
"epoch": 1.8184143222506393,
"grad_norm": 0.33550420038638934,
"learning_rate": 1.8486333750351668e-05,
"loss": 1.1291,
"step": 711
},
{
"epoch": 1.8209718670076727,
"grad_norm": 0.30494475043620173,
"learning_rate": 1.8479579612526642e-05,
"loss": 1.0754,
"step": 712
},
{
"epoch": 1.8235294117647058,
"grad_norm": 0.2449819480488345,
"learning_rate": 1.8472811678955773e-05,
"loss": 1.083,
"step": 713
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.26042670531487994,
"learning_rate": 1.8466029960650066e-05,
"loss": 1.0749,
"step": 714
},
{
"epoch": 1.8286445012787724,
"grad_norm": 0.3057228350277353,
"learning_rate": 1.845923446864295e-05,
"loss": 1.0549,
"step": 715
},
{
"epoch": 1.8312020460358056,
"grad_norm": 0.2500852141764497,
"learning_rate": 1.845242521399027e-05,
"loss": 1.0721,
"step": 716
},
{
"epoch": 1.8337595907928388,
"grad_norm": 0.2675252870460311,
"learning_rate": 1.8445602207770254e-05,
"loss": 1.0449,
"step": 717
},
{
"epoch": 1.836317135549872,
"grad_norm": 0.2836719734304398,
"learning_rate": 1.8438765461083504e-05,
"loss": 1.0905,
"step": 718
},
{
"epoch": 1.8388746803069054,
"grad_norm": 0.34699165997108533,
"learning_rate": 1.843191498505299e-05,
"loss": 1.0901,
"step": 719
},
{
"epoch": 1.8414322250639388,
"grad_norm": 0.2722070954863811,
"learning_rate": 1.8425050790823994e-05,
"loss": 1.0964,
"step": 720
},
{
"epoch": 1.843989769820972,
"grad_norm": 0.258368289769939,
"learning_rate": 1.8418172889564145e-05,
"loss": 1.0962,
"step": 721
},
{
"epoch": 1.8465473145780051,
"grad_norm": 0.25936143701246717,
"learning_rate": 1.8411281292463345e-05,
"loss": 1.0545,
"step": 722
},
{
"epoch": 1.8491048593350383,
"grad_norm": 0.3060957581043503,
"learning_rate": 1.8404376010733802e-05,
"loss": 1.0815,
"step": 723
},
{
"epoch": 1.8516624040920715,
"grad_norm": 0.2815365945528782,
"learning_rate": 1.8397457055609973e-05,
"loss": 1.0759,
"step": 724
},
{
"epoch": 1.854219948849105,
"grad_norm": 0.2745951540225352,
"learning_rate": 1.8390524438348565e-05,
"loss": 1.1021,
"step": 725
},
{
"epoch": 1.856777493606138,
"grad_norm": 0.27846031555437806,
"learning_rate": 1.8383578170228514e-05,
"loss": 1.0248,
"step": 726
},
{
"epoch": 1.8593350383631715,
"grad_norm": 0.2938959273434096,
"learning_rate": 1.8376618262550966e-05,
"loss": 1.0528,
"step": 727
},
{
"epoch": 1.8618925831202047,
"grad_norm": 0.2993316558221603,
"learning_rate": 1.836964472663925e-05,
"loss": 1.058,
"step": 728
},
{
"epoch": 1.8644501278772379,
"grad_norm": 0.28817201575308804,
"learning_rate": 1.8362657573838874e-05,
"loss": 1.1157,
"step": 729
},
{
"epoch": 1.867007672634271,
"grad_norm": 0.22467467671098768,
"learning_rate": 1.8355656815517505e-05,
"loss": 1.0711,
"step": 730
},
{
"epoch": 1.8695652173913042,
"grad_norm": 0.29149108866988305,
"learning_rate": 1.8348642463064937e-05,
"loss": 1.0414,
"step": 731
},
{
"epoch": 1.8721227621483376,
"grad_norm": 0.39401431973372464,
"learning_rate": 1.8341614527893077e-05,
"loss": 1.0791,
"step": 732
},
{
"epoch": 1.8746803069053708,
"grad_norm": 0.4335182479065654,
"learning_rate": 1.833457302143594e-05,
"loss": 1.0878,
"step": 733
},
{
"epoch": 1.8772378516624042,
"grad_norm": 0.43497766670833005,
"learning_rate": 1.832751795514962e-05,
"loss": 1.0484,
"step": 734
},
{
"epoch": 1.8797953964194374,
"grad_norm": 0.2997553952148685,
"learning_rate": 1.832044934051226e-05,
"loss": 1.0762,
"step": 735
},
{
"epoch": 1.8823529411764706,
"grad_norm": 0.23441660095601177,
"learning_rate": 1.8313367189024065e-05,
"loss": 1.1082,
"step": 736
},
{
"epoch": 1.8849104859335037,
"grad_norm": 0.23816717696848114,
"learning_rate": 1.8306271512207242e-05,
"loss": 1.0834,
"step": 737
},
{
"epoch": 1.887468030690537,
"grad_norm": 0.29809886717421774,
"learning_rate": 1.829916232160602e-05,
"loss": 1.087,
"step": 738
},
{
"epoch": 1.8900255754475703,
"grad_norm": 0.36580006827207345,
"learning_rate": 1.829203962878661e-05,
"loss": 1.0718,
"step": 739
},
{
"epoch": 1.8925831202046037,
"grad_norm": 0.36472500474679165,
"learning_rate": 1.8284903445337184e-05,
"loss": 1.0435,
"step": 740
},
{
"epoch": 1.895140664961637,
"grad_norm": 0.2569898458683152,
"learning_rate": 1.8277753782867865e-05,
"loss": 1.0569,
"step": 741
},
{
"epoch": 1.89769820971867,
"grad_norm": 0.2807015519670205,
"learning_rate": 1.8270590653010706e-05,
"loss": 1.0623,
"step": 742
},
{
"epoch": 1.9002557544757033,
"grad_norm": 0.2706420270561887,
"learning_rate": 1.8263414067419676e-05,
"loss": 1.101,
"step": 743
},
{
"epoch": 1.9028132992327365,
"grad_norm": 0.28562929161394046,
"learning_rate": 1.8256224037770628e-05,
"loss": 1.0524,
"step": 744
},
{
"epoch": 1.9053708439897699,
"grad_norm": 0.2774733347803849,
"learning_rate": 1.824902057576129e-05,
"loss": 1.0511,
"step": 745
},
{
"epoch": 1.907928388746803,
"grad_norm": 0.22198709105225659,
"learning_rate": 1.8241803693111245e-05,
"loss": 1.075,
"step": 746
},
{
"epoch": 1.9104859335038364,
"grad_norm": 0.287788512970941,
"learning_rate": 1.8234573401561914e-05,
"loss": 1.0665,
"step": 747
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.2909301551397291,
"learning_rate": 1.8227329712876525e-05,
"loss": 1.0802,
"step": 748
},
{
"epoch": 1.9156010230179028,
"grad_norm": 0.25392349276614573,
"learning_rate": 1.8220072638840105e-05,
"loss": 1.1035,
"step": 749
},
{
"epoch": 1.918158567774936,
"grad_norm": 0.22821936416155694,
"learning_rate": 1.8212802191259465e-05,
"loss": 1.0571,
"step": 750
},
{
"epoch": 1.9207161125319692,
"grad_norm": 0.3130516886250542,
"learning_rate": 1.8205518381963165e-05,
"loss": 1.1095,
"step": 751
},
{
"epoch": 1.9232736572890026,
"grad_norm": 0.3857586516868388,
"learning_rate": 1.8198221222801506e-05,
"loss": 1.06,
"step": 752
},
{
"epoch": 1.9258312020460358,
"grad_norm": 0.315792024279407,
"learning_rate": 1.8190910725646512e-05,
"loss": 1.0772,
"step": 753
},
{
"epoch": 1.9283887468030692,
"grad_norm": 0.26686727973038904,
"learning_rate": 1.8183586902391905e-05,
"loss": 1.0708,
"step": 754
},
{
"epoch": 1.9309462915601023,
"grad_norm": 0.3669775155609857,
"learning_rate": 1.8176249764953088e-05,
"loss": 1.0393,
"step": 755
},
{
"epoch": 1.9335038363171355,
"grad_norm": 0.3411186812565117,
"learning_rate": 1.8168899325267122e-05,
"loss": 1.0777,
"step": 756
},
{
"epoch": 1.9360613810741687,
"grad_norm": 0.29525106020949826,
"learning_rate": 1.8161535595292717e-05,
"loss": 1.0738,
"step": 757
},
{
"epoch": 1.938618925831202,
"grad_norm": 0.2431416087312154,
"learning_rate": 1.8154158587010195e-05,
"loss": 1.0552,
"step": 758
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.2528824918629993,
"learning_rate": 1.8146768312421495e-05,
"loss": 1.1049,
"step": 759
},
{
"epoch": 1.9437340153452687,
"grad_norm": 0.27274199937217425,
"learning_rate": 1.8139364783550128e-05,
"loss": 1.11,
"step": 760
},
{
"epoch": 1.9462915601023019,
"grad_norm": 0.27694326525936447,
"learning_rate": 1.813194801244117e-05,
"loss": 1.1085,
"step": 761
},
{
"epoch": 1.948849104859335,
"grad_norm": 0.26284036778935943,
"learning_rate": 1.8124518011161246e-05,
"loss": 1.0817,
"step": 762
},
{
"epoch": 1.9514066496163682,
"grad_norm": 0.34628694859076536,
"learning_rate": 1.8117074791798503e-05,
"loss": 1.0723,
"step": 763
},
{
"epoch": 1.9539641943734014,
"grad_norm": 0.3205449398285809,
"learning_rate": 1.8109618366462597e-05,
"loss": 1.0878,
"step": 764
},
{
"epoch": 1.9565217391304348,
"grad_norm": 0.2930907660937919,
"learning_rate": 1.8102148747284662e-05,
"loss": 1.0194,
"step": 765
},
{
"epoch": 1.959079283887468,
"grad_norm": 0.3199378305398446,
"learning_rate": 1.8094665946417304e-05,
"loss": 1.0818,
"step": 766
},
{
"epoch": 1.9616368286445014,
"grad_norm": 0.3147442131814513,
"learning_rate": 1.8087169976034568e-05,
"loss": 1.0524,
"step": 767
},
{
"epoch": 1.9641943734015346,
"grad_norm": 0.29010540377698546,
"learning_rate": 1.807966084833193e-05,
"loss": 1.0804,
"step": 768
},
{
"epoch": 1.9667519181585678,
"grad_norm": 0.2830375710975825,
"learning_rate": 1.8072138575526277e-05,
"loss": 1.0876,
"step": 769
},
{
"epoch": 1.969309462915601,
"grad_norm": 0.29912181409924526,
"learning_rate": 1.806460316985587e-05,
"loss": 1.0674,
"step": 770
},
{
"epoch": 1.9718670076726341,
"grad_norm": 0.280637494020639,
"learning_rate": 1.8057054643580347e-05,
"loss": 1.059,
"step": 771
},
{
"epoch": 1.9744245524296675,
"grad_norm": 0.25437147169201857,
"learning_rate": 1.8049493008980685e-05,
"loss": 1.076,
"step": 772
},
{
"epoch": 1.976982097186701,
"grad_norm": 0.260015840044801,
"learning_rate": 1.8041918278359194e-05,
"loss": 1.0884,
"step": 773
},
{
"epoch": 1.979539641943734,
"grad_norm": 0.23338451398624144,
"learning_rate": 1.8034330464039485e-05,
"loss": 1.0564,
"step": 774
},
{
"epoch": 1.9820971867007673,
"grad_norm": 0.27240262637273416,
"learning_rate": 1.8026729578366457e-05,
"loss": 1.0653,
"step": 775
},
{
"epoch": 1.9846547314578005,
"grad_norm": 0.2658428330726454,
"learning_rate": 1.801911563370628e-05,
"loss": 1.0847,
"step": 776
},
{
"epoch": 1.9872122762148337,
"grad_norm": 0.24259844645380865,
"learning_rate": 1.801148864244636e-05,
"loss": 1.0617,
"step": 777
},
{
"epoch": 1.989769820971867,
"grad_norm": 0.274423591955145,
"learning_rate": 1.8003848616995333e-05,
"loss": 1.1046,
"step": 778
},
{
"epoch": 1.9923273657289002,
"grad_norm": 0.270074412347766,
"learning_rate": 1.7996195569783053e-05,
"loss": 1.0841,
"step": 779
},
{
"epoch": 1.9948849104859336,
"grad_norm": 0.32727342222060607,
"learning_rate": 1.798852951326054e-05,
"loss": 1.064,
"step": 780
},
{
"epoch": 1.9974424552429668,
"grad_norm": 0.28041604224998723,
"learning_rate": 1.7980850459899997e-05,
"loss": 1.0748,
"step": 781
},
{
"epoch": 2.0,
"grad_norm": 0.230649257113214,
"learning_rate": 1.7973158422194754e-05,
"loss": 1.0504,
"step": 782
},
{
"epoch": 2.002557544757033,
"grad_norm": 0.27721442928112094,
"learning_rate": 1.7965453412659284e-05,
"loss": 1.0561,
"step": 783
},
{
"epoch": 2.0051150895140664,
"grad_norm": 0.3484629274944669,
"learning_rate": 1.795773544382915e-05,
"loss": 1.0484,
"step": 784
},
{
"epoch": 2.0076726342710995,
"grad_norm": 0.35248757109292245,
"learning_rate": 1.795000452826101e-05,
"loss": 1.0494,
"step": 785
},
{
"epoch": 2.010230179028133,
"grad_norm": 0.31602726514395096,
"learning_rate": 1.794226067853257e-05,
"loss": 1.1343,
"step": 786
},
{
"epoch": 2.0127877237851663,
"grad_norm": 0.30632695925595954,
"learning_rate": 1.79345039072426e-05,
"loss": 1.0648,
"step": 787
},
{
"epoch": 2.0153452685421995,
"grad_norm": 0.33328827891250323,
"learning_rate": 1.7926734227010876e-05,
"loss": 1.0801,
"step": 788
},
{
"epoch": 2.0179028132992327,
"grad_norm": 0.35618373914463364,
"learning_rate": 1.7918951650478188e-05,
"loss": 1.0613,
"step": 789
},
{
"epoch": 2.020460358056266,
"grad_norm": 0.3085542598082131,
"learning_rate": 1.7911156190306296e-05,
"loss": 1.0476,
"step": 790
},
{
"epoch": 2.023017902813299,
"grad_norm": 0.22686489493321832,
"learning_rate": 1.7903347859177926e-05,
"loss": 1.0486,
"step": 791
},
{
"epoch": 2.0255754475703327,
"grad_norm": 0.2750201664093288,
"learning_rate": 1.7895526669796747e-05,
"loss": 1.0543,
"step": 792
},
{
"epoch": 2.028132992327366,
"grad_norm": 0.2998881689120612,
"learning_rate": 1.7887692634887345e-05,
"loss": 1.0434,
"step": 793
},
{
"epoch": 2.030690537084399,
"grad_norm": 0.260904922673988,
"learning_rate": 1.7879845767195204e-05,
"loss": 1.0443,
"step": 794
},
{
"epoch": 2.0332480818414322,
"grad_norm": 0.2465816351987358,
"learning_rate": 1.787198607948669e-05,
"loss": 1.0516,
"step": 795
},
{
"epoch": 2.0358056265984654,
"grad_norm": 0.23239060808440448,
"learning_rate": 1.786411358454902e-05,
"loss": 1.0588,
"step": 796
},
{
"epoch": 2.0383631713554986,
"grad_norm": 0.26101630597920855,
"learning_rate": 1.785622829519025e-05,
"loss": 1.0835,
"step": 797
},
{
"epoch": 2.040920716112532,
"grad_norm": 0.3040971752066545,
"learning_rate": 1.7848330224239256e-05,
"loss": 1.0563,
"step": 798
},
{
"epoch": 2.0434782608695654,
"grad_norm": 0.26487253530894395,
"learning_rate": 1.7840419384545706e-05,
"loss": 1.0579,
"step": 799
},
{
"epoch": 2.0460358056265986,
"grad_norm": 0.2689601096947907,
"learning_rate": 1.7832495788980035e-05,
"loss": 1.1015,
"step": 800
},
{
"epoch": 2.0485933503836318,
"grad_norm": 0.25525460785840065,
"learning_rate": 1.7824559450433446e-05,
"loss": 1.0537,
"step": 801
},
{
"epoch": 2.051150895140665,
"grad_norm": 0.345599384998098,
"learning_rate": 1.7816610381817864e-05,
"loss": 1.0604,
"step": 802
},
{
"epoch": 2.053708439897698,
"grad_norm": 0.3359389407416057,
"learning_rate": 1.780864859606592e-05,
"loss": 1.0664,
"step": 803
},
{
"epoch": 2.0562659846547313,
"grad_norm": 0.2813553104050823,
"learning_rate": 1.780067410613095e-05,
"loss": 1.0937,
"step": 804
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.2548220560875847,
"learning_rate": 1.7792686924986946e-05,
"loss": 1.0441,
"step": 805
},
{
"epoch": 2.061381074168798,
"grad_norm": 0.28792647000401994,
"learning_rate": 1.7784687065628554e-05,
"loss": 1.058,
"step": 806
},
{
"epoch": 2.0639386189258313,
"grad_norm": 0.2603601267230107,
"learning_rate": 1.777667454107104e-05,
"loss": 1.0992,
"step": 807
},
{
"epoch": 2.0664961636828645,
"grad_norm": 0.2583588654263776,
"learning_rate": 1.776864936435029e-05,
"loss": 1.0735,
"step": 808
},
{
"epoch": 2.0690537084398977,
"grad_norm": 0.30719716854376583,
"learning_rate": 1.7760611548522755e-05,
"loss": 1.0498,
"step": 809
},
{
"epoch": 2.071611253196931,
"grad_norm": 0.30807492892970295,
"learning_rate": 1.7752561106665463e-05,
"loss": 1.0548,
"step": 810
},
{
"epoch": 2.074168797953964,
"grad_norm": 0.3210704099635407,
"learning_rate": 1.7744498051875984e-05,
"loss": 1.077,
"step": 811
},
{
"epoch": 2.0767263427109977,
"grad_norm": 0.4282126010865939,
"learning_rate": 1.7736422397272396e-05,
"loss": 1.0494,
"step": 812
},
{
"epoch": 2.079283887468031,
"grad_norm": 0.4051125030459934,
"learning_rate": 1.772833415599329e-05,
"loss": 1.0511,
"step": 813
},
{
"epoch": 2.081841432225064,
"grad_norm": 0.2991528183767012,
"learning_rate": 1.7720233341197726e-05,
"loss": 1.1121,
"step": 814
},
{
"epoch": 2.084398976982097,
"grad_norm": 0.22783217071200507,
"learning_rate": 1.7712119966065225e-05,
"loss": 1.0383,
"step": 815
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.3516616820022178,
"learning_rate": 1.770399404379574e-05,
"loss": 1.0498,
"step": 816
},
{
"epoch": 2.0895140664961636,
"grad_norm": 0.2606641623626611,
"learning_rate": 1.7695855587609637e-05,
"loss": 1.0594,
"step": 817
},
{
"epoch": 2.0920716112531967,
"grad_norm": 0.269085192714615,
"learning_rate": 1.7687704610747676e-05,
"loss": 1.0419,
"step": 818
},
{
"epoch": 2.0946291560102304,
"grad_norm": 0.28768629596697776,
"learning_rate": 1.767954112647099e-05,
"loss": 1.0435,
"step": 819
},
{
"epoch": 2.0971867007672635,
"grad_norm": 0.27429737921035624,
"learning_rate": 1.7671365148061053e-05,
"loss": 1.0458,
"step": 820
},
{
"epoch": 2.0997442455242967,
"grad_norm": 0.29736519534073375,
"learning_rate": 1.7663176688819673e-05,
"loss": 1.0566,
"step": 821
},
{
"epoch": 2.10230179028133,
"grad_norm": 0.26021437570192907,
"learning_rate": 1.765497576206896e-05,
"loss": 1.0422,
"step": 822
},
{
"epoch": 2.104859335038363,
"grad_norm": 0.2783440308095714,
"learning_rate": 1.764676238115131e-05,
"loss": 1.0776,
"step": 823
},
{
"epoch": 2.1074168797953963,
"grad_norm": 0.3339846285282316,
"learning_rate": 1.763853655942938e-05,
"loss": 1.0674,
"step": 824
},
{
"epoch": 2.10997442455243,
"grad_norm": 0.2223362385153581,
"learning_rate": 1.7630298310286065e-05,
"loss": 1.0699,
"step": 825
},
{
"epoch": 2.112531969309463,
"grad_norm": 0.33059613735162624,
"learning_rate": 1.7622047647124488e-05,
"loss": 1.0634,
"step": 826
},
{
"epoch": 2.1150895140664963,
"grad_norm": 0.3414911305158879,
"learning_rate": 1.761378458336796e-05,
"loss": 1.0548,
"step": 827
},
{
"epoch": 2.1176470588235294,
"grad_norm": 0.32041930375116484,
"learning_rate": 1.760550913245996e-05,
"loss": 1.0621,
"step": 828
},
{
"epoch": 2.1202046035805626,
"grad_norm": 0.2971788267573472,
"learning_rate": 1.7597221307864142e-05,
"loss": 1.0704,
"step": 829
},
{
"epoch": 2.122762148337596,
"grad_norm": 0.27537162097267065,
"learning_rate": 1.7588921123064273e-05,
"loss": 1.0961,
"step": 830
},
{
"epoch": 2.125319693094629,
"grad_norm": 0.29232241446373336,
"learning_rate": 1.7580608591564233e-05,
"loss": 1.0916,
"step": 831
},
{
"epoch": 2.1278772378516626,
"grad_norm": 0.3815701080685027,
"learning_rate": 1.757228372688799e-05,
"loss": 1.0848,
"step": 832
},
{
"epoch": 2.130434782608696,
"grad_norm": 0.33830135607419565,
"learning_rate": 1.7563946542579584e-05,
"loss": 1.0824,
"step": 833
},
{
"epoch": 2.132992327365729,
"grad_norm": 0.26436755888688523,
"learning_rate": 1.7555597052203088e-05,
"loss": 1.0424,
"step": 834
},
{
"epoch": 2.135549872122762,
"grad_norm": 0.2204259325114956,
"learning_rate": 1.7547235269342602e-05,
"loss": 1.0749,
"step": 835
},
{
"epoch": 2.1381074168797953,
"grad_norm": 0.31500508880378464,
"learning_rate": 1.7538861207602225e-05,
"loss": 1.0871,
"step": 836
},
{
"epoch": 2.1406649616368285,
"grad_norm": 0.33104625224299034,
"learning_rate": 1.753047488060603e-05,
"loss": 1.0257,
"step": 837
},
{
"epoch": 2.1432225063938617,
"grad_norm": 0.2325551980906377,
"learning_rate": 1.7522076301998048e-05,
"loss": 1.0907,
"step": 838
},
{
"epoch": 2.1457800511508953,
"grad_norm": 0.2464976826758584,
"learning_rate": 1.7513665485442238e-05,
"loss": 1.067,
"step": 839
},
{
"epoch": 2.1483375959079285,
"grad_norm": 0.25290511781194314,
"learning_rate": 1.750524244462248e-05,
"loss": 1.0893,
"step": 840
},
{
"epoch": 2.1508951406649617,
"grad_norm": 0.3247901788745791,
"learning_rate": 1.7496807193242528e-05,
"loss": 1.0638,
"step": 841
},
{
"epoch": 2.153452685421995,
"grad_norm": 0.34958915516133227,
"learning_rate": 1.748835974502601e-05,
"loss": 1.0825,
"step": 842
},
{
"epoch": 2.156010230179028,
"grad_norm": 0.24243104695456325,
"learning_rate": 1.7479900113716398e-05,
"loss": 1.0537,
"step": 843
},
{
"epoch": 2.1585677749360612,
"grad_norm": 0.2734369268109971,
"learning_rate": 1.7471428313076984e-05,
"loss": 1.1031,
"step": 844
},
{
"epoch": 2.1611253196930944,
"grad_norm": 0.3380184912512867,
"learning_rate": 1.7462944356890853e-05,
"loss": 1.0589,
"step": 845
},
{
"epoch": 2.163682864450128,
"grad_norm": 0.3625402818137926,
"learning_rate": 1.7454448258960877e-05,
"loss": 1.0561,
"step": 846
},
{
"epoch": 2.166240409207161,
"grad_norm": 0.34638148620089215,
"learning_rate": 1.744594003310967e-05,
"loss": 1.0186,
"step": 847
},
{
"epoch": 2.1687979539641944,
"grad_norm": 0.24740728690176142,
"learning_rate": 1.743741969317959e-05,
"loss": 1.1099,
"step": 848
},
{
"epoch": 2.1713554987212276,
"grad_norm": 0.287155398140135,
"learning_rate": 1.7428887253032695e-05,
"loss": 1.0691,
"step": 849
},
{
"epoch": 2.1739130434782608,
"grad_norm": 0.3566062867329238,
"learning_rate": 1.7420342726550728e-05,
"loss": 1.0701,
"step": 850
},
{
"epoch": 2.176470588235294,
"grad_norm": 0.3096727205958978,
"learning_rate": 1.74117861276351e-05,
"loss": 1.0716,
"step": 851
},
{
"epoch": 2.1790281329923276,
"grad_norm": 0.25874536932280473,
"learning_rate": 1.740321747020687e-05,
"loss": 1.0893,
"step": 852
},
{
"epoch": 2.1815856777493607,
"grad_norm": 0.21538442833683963,
"learning_rate": 1.7394636768206702e-05,
"loss": 1.0266,
"step": 853
},
{
"epoch": 2.184143222506394,
"grad_norm": 0.2871943030157397,
"learning_rate": 1.738604403559486e-05,
"loss": 1.0085,
"step": 854
},
{
"epoch": 2.186700767263427,
"grad_norm": 0.2851621085345804,
"learning_rate": 1.7377439286351184e-05,
"loss": 1.0622,
"step": 855
},
{
"epoch": 2.1892583120204603,
"grad_norm": 0.26228336638762867,
"learning_rate": 1.736882253447506e-05,
"loss": 1.083,
"step": 856
},
{
"epoch": 2.1918158567774935,
"grad_norm": 0.26992050889733915,
"learning_rate": 1.736019379398542e-05,
"loss": 1.1006,
"step": 857
},
{
"epoch": 2.1943734015345266,
"grad_norm": 0.23555655653113924,
"learning_rate": 1.7351553078920665e-05,
"loss": 1.0914,
"step": 858
},
{
"epoch": 2.1969309462915603,
"grad_norm": 0.30209071932451825,
"learning_rate": 1.734290040333871e-05,
"loss": 1.0873,
"step": 859
},
{
"epoch": 2.1994884910485935,
"grad_norm": 0.23936877597438264,
"learning_rate": 1.733423578131691e-05,
"loss": 1.0835,
"step": 860
},
{
"epoch": 2.2020460358056266,
"grad_norm": 0.3366403647300894,
"learning_rate": 1.732555922695207e-05,
"loss": 1.0743,
"step": 861
},
{
"epoch": 2.20460358056266,
"grad_norm": 0.30248308613139313,
"learning_rate": 1.73168707543604e-05,
"loss": 1.0482,
"step": 862
},
{
"epoch": 2.207161125319693,
"grad_norm": 0.26759196361130394,
"learning_rate": 1.73081703776775e-05,
"loss": 1.0686,
"step": 863
},
{
"epoch": 2.209718670076726,
"grad_norm": 0.2424062745806639,
"learning_rate": 1.7299458111058336e-05,
"loss": 1.0738,
"step": 864
},
{
"epoch": 2.21227621483376,
"grad_norm": 0.24086304886593904,
"learning_rate": 1.7290733968677226e-05,
"loss": 1.0313,
"step": 865
},
{
"epoch": 2.214833759590793,
"grad_norm": 0.30184358263466177,
"learning_rate": 1.7281997964727803e-05,
"loss": 1.0602,
"step": 866
},
{
"epoch": 2.217391304347826,
"grad_norm": 0.2366294082979442,
"learning_rate": 1.7273250113423e-05,
"loss": 1.1046,
"step": 867
},
{
"epoch": 2.2199488491048593,
"grad_norm": 0.26905581826310315,
"learning_rate": 1.726449042899502e-05,
"loss": 1.0437,
"step": 868
},
{
"epoch": 2.2225063938618925,
"grad_norm": 0.36508543225667806,
"learning_rate": 1.725571892569533e-05,
"loss": 1.0809,
"step": 869
},
{
"epoch": 2.2250639386189257,
"grad_norm": 0.30221117179280654,
"learning_rate": 1.7246935617794608e-05,
"loss": 1.0664,
"step": 870
},
{
"epoch": 2.227621483375959,
"grad_norm": 0.2269380846996494,
"learning_rate": 1.723814051958275e-05,
"loss": 1.045,
"step": 871
},
{
"epoch": 2.2301790281329925,
"grad_norm": 0.3848192034817777,
"learning_rate": 1.7229333645368834e-05,
"loss": 1.0661,
"step": 872
},
{
"epoch": 2.2327365728900257,
"grad_norm": 0.4724477310420707,
"learning_rate": 1.722051500948109e-05,
"loss": 1.0846,
"step": 873
},
{
"epoch": 2.235294117647059,
"grad_norm": 0.3561338471365552,
"learning_rate": 1.7211684626266887e-05,
"loss": 1.0718,
"step": 874
},
{
"epoch": 2.237851662404092,
"grad_norm": 0.24533531015000096,
"learning_rate": 1.7202842510092706e-05,
"loss": 1.0428,
"step": 875
},
{
"epoch": 2.2404092071611252,
"grad_norm": 0.2999534454935499,
"learning_rate": 1.7193988675344125e-05,
"loss": 1.0598,
"step": 876
},
{
"epoch": 2.2429667519181584,
"grad_norm": 0.3931502655829081,
"learning_rate": 1.7185123136425775e-05,
"loss": 1.0486,
"step": 877
},
{
"epoch": 2.2455242966751916,
"grad_norm": 0.4099239641868052,
"learning_rate": 1.7176245907761327e-05,
"loss": 1.0567,
"step": 878
},
{
"epoch": 2.2480818414322252,
"grad_norm": 0.2859379832887241,
"learning_rate": 1.7167357003793485e-05,
"loss": 1.0567,
"step": 879
},
{
"epoch": 2.2506393861892584,
"grad_norm": 0.29262327466969734,
"learning_rate": 1.7158456438983934e-05,
"loss": 1.0299,
"step": 880
},
{
"epoch": 2.2531969309462916,
"grad_norm": 0.43158299248544585,
"learning_rate": 1.7149544227813343e-05,
"loss": 1.05,
"step": 881
},
{
"epoch": 2.2557544757033248,
"grad_norm": 0.3011090401640172,
"learning_rate": 1.7140620384781316e-05,
"loss": 1.0166,
"step": 882
},
{
"epoch": 2.258312020460358,
"grad_norm": 0.2826762526500697,
"learning_rate": 1.7131684924406392e-05,
"loss": 1.0561,
"step": 883
},
{
"epoch": 2.260869565217391,
"grad_norm": 0.40076272547936787,
"learning_rate": 1.7122737861226007e-05,
"loss": 1.0536,
"step": 884
},
{
"epoch": 2.2634271099744243,
"grad_norm": 0.3893952639906247,
"learning_rate": 1.711377920979647e-05,
"loss": 1.0717,
"step": 885
},
{
"epoch": 2.265984654731458,
"grad_norm": 0.2701415754560129,
"learning_rate": 1.7104808984692946e-05,
"loss": 1.0788,
"step": 886
},
{
"epoch": 2.268542199488491,
"grad_norm": 0.3118978955533469,
"learning_rate": 1.7095827200509436e-05,
"loss": 1.0358,
"step": 887
},
{
"epoch": 2.2710997442455243,
"grad_norm": 0.4681497183113763,
"learning_rate": 1.7086833871858735e-05,
"loss": 1.0405,
"step": 888
},
{
"epoch": 2.2736572890025575,
"grad_norm": 0.44886562710116457,
"learning_rate": 1.707782901337243e-05,
"loss": 1.0635,
"step": 889
},
{
"epoch": 2.2762148337595907,
"grad_norm": 0.24326783713209693,
"learning_rate": 1.7068812639700862e-05,
"loss": 1.0995,
"step": 890
},
{
"epoch": 2.2787723785166243,
"grad_norm": 0.34628521799460377,
"learning_rate": 1.7059784765513106e-05,
"loss": 1.0772,
"step": 891
},
{
"epoch": 2.2813299232736575,
"grad_norm": 0.3903166631143913,
"learning_rate": 1.705074540549695e-05,
"loss": 1.0609,
"step": 892
},
{
"epoch": 2.2838874680306906,
"grad_norm": 0.3263912141551758,
"learning_rate": 1.704169457435887e-05,
"loss": 1.0661,
"step": 893
},
{
"epoch": 2.286445012787724,
"grad_norm": 0.2566336981081094,
"learning_rate": 1.7032632286823995e-05,
"loss": 1.0853,
"step": 894
},
{
"epoch": 2.289002557544757,
"grad_norm": 0.36154048413903833,
"learning_rate": 1.702355855763611e-05,
"loss": 1.0723,
"step": 895
},
{
"epoch": 2.29156010230179,
"grad_norm": 0.2971617301340999,
"learning_rate": 1.70144734015576e-05,
"loss": 1.0619,
"step": 896
},
{
"epoch": 2.2941176470588234,
"grad_norm": 0.2572103383141402,
"learning_rate": 1.700537683336944e-05,
"loss": 1.0589,
"step": 897
},
{
"epoch": 2.296675191815857,
"grad_norm": 0.37750177979394905,
"learning_rate": 1.699626886787119e-05,
"loss": 1.0361,
"step": 898
},
{
"epoch": 2.29923273657289,
"grad_norm": 0.35765757522418873,
"learning_rate": 1.698714951988093e-05,
"loss": 1.071,
"step": 899
},
{
"epoch": 2.3017902813299234,
"grad_norm": 0.30989044748347006,
"learning_rate": 1.6978018804235278e-05,
"loss": 1.0555,
"step": 900
},
{
"epoch": 2.3043478260869565,
"grad_norm": 0.24476809290635856,
"learning_rate": 1.6968876735789326e-05,
"loss": 1.0483,
"step": 901
},
{
"epoch": 2.3069053708439897,
"grad_norm": 0.308551372008468,
"learning_rate": 1.695972332941666e-05,
"loss": 1.0551,
"step": 902
},
{
"epoch": 2.309462915601023,
"grad_norm": 0.37111491476604536,
"learning_rate": 1.695055860000929e-05,
"loss": 1.0743,
"step": 903
},
{
"epoch": 2.312020460358056,
"grad_norm": 0.29147416337800386,
"learning_rate": 1.6941382562477664e-05,
"loss": 1.0003,
"step": 904
},
{
"epoch": 2.3145780051150897,
"grad_norm": 0.26326878890729166,
"learning_rate": 1.6932195231750616e-05,
"loss": 1.0351,
"step": 905
},
{
"epoch": 2.317135549872123,
"grad_norm": 0.29839767577203885,
"learning_rate": 1.6922996622775363e-05,
"loss": 1.0445,
"step": 906
},
{
"epoch": 2.319693094629156,
"grad_norm": 0.23637128109675618,
"learning_rate": 1.691378675051747e-05,
"loss": 1.0519,
"step": 907
},
{
"epoch": 2.3222506393861893,
"grad_norm": 0.25442257071130125,
"learning_rate": 1.6904565629960814e-05,
"loss": 1.0902,
"step": 908
},
{
"epoch": 2.3248081841432224,
"grad_norm": 0.3303656891744051,
"learning_rate": 1.6895333276107588e-05,
"loss": 1.0265,
"step": 909
},
{
"epoch": 2.3273657289002556,
"grad_norm": 0.2612217404110996,
"learning_rate": 1.688608970397825e-05,
"loss": 1.1046,
"step": 910
},
{
"epoch": 2.329923273657289,
"grad_norm": 0.271721798226581,
"learning_rate": 1.6876834928611524e-05,
"loss": 1.0784,
"step": 911
},
{
"epoch": 2.3324808184143224,
"grad_norm": 0.22229862393309946,
"learning_rate": 1.6867568965064336e-05,
"loss": 1.0364,
"step": 912
},
{
"epoch": 2.3350383631713556,
"grad_norm": 0.23741009658476048,
"learning_rate": 1.685829182841184e-05,
"loss": 1.0707,
"step": 913
},
{
"epoch": 2.337595907928389,
"grad_norm": 0.28874176637750065,
"learning_rate": 1.684900353374735e-05,
"loss": 1.0702,
"step": 914
},
{
"epoch": 2.340153452685422,
"grad_norm": 0.30379227509184065,
"learning_rate": 1.683970409618235e-05,
"loss": 1.0689,
"step": 915
},
{
"epoch": 2.342710997442455,
"grad_norm": 0.2726310509927992,
"learning_rate": 1.683039353084644e-05,
"loss": 1.0905,
"step": 916
},
{
"epoch": 2.3452685421994883,
"grad_norm": 0.2713331067951481,
"learning_rate": 1.6821071852887322e-05,
"loss": 1.0317,
"step": 917
},
{
"epoch": 2.3478260869565215,
"grad_norm": 0.3293005148131402,
"learning_rate": 1.681173907747079e-05,
"loss": 1.0572,
"step": 918
},
{
"epoch": 2.350383631713555,
"grad_norm": 0.2660221814623652,
"learning_rate": 1.680239521978068e-05,
"loss": 1.0429,
"step": 919
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.2412158860005583,
"learning_rate": 1.679304029501887e-05,
"loss": 1.0452,
"step": 920
},
{
"epoch": 2.3554987212276215,
"grad_norm": 0.33605356950268017,
"learning_rate": 1.6783674318405233e-05,
"loss": 1.0496,
"step": 921
},
{
"epoch": 2.3580562659846547,
"grad_norm": 0.29348949393829404,
"learning_rate": 1.677429730517763e-05,
"loss": 1.0471,
"step": 922
},
{
"epoch": 2.360613810741688,
"grad_norm": 0.27205789977362044,
"learning_rate": 1.6764909270591875e-05,
"loss": 1.049,
"step": 923
},
{
"epoch": 2.363171355498721,
"grad_norm": 0.24380065073942686,
"learning_rate": 1.6755510229921713e-05,
"loss": 1.0568,
"step": 924
},
{
"epoch": 2.3657289002557547,
"grad_norm": 0.2607905003163443,
"learning_rate": 1.6746100198458795e-05,
"loss": 1.0447,
"step": 925
},
{
"epoch": 2.368286445012788,
"grad_norm": 0.25646849705097663,
"learning_rate": 1.673667919151266e-05,
"loss": 1.0213,
"step": 926
},
{
"epoch": 2.370843989769821,
"grad_norm": 0.24557852833345492,
"learning_rate": 1.6727247224410686e-05,
"loss": 1.079,
"step": 927
},
{
"epoch": 2.373401534526854,
"grad_norm": 0.2536896072712956,
"learning_rate": 1.67178043124981e-05,
"loss": 1.0864,
"step": 928
},
{
"epoch": 2.3759590792838874,
"grad_norm": 0.2921088303385537,
"learning_rate": 1.6708350471137927e-05,
"loss": 1.0564,
"step": 929
},
{
"epoch": 2.3785166240409206,
"grad_norm": 0.20366681064359315,
"learning_rate": 1.669888571571098e-05,
"loss": 1.0815,
"step": 930
},
{
"epoch": 2.381074168797954,
"grad_norm": 0.2708885776774786,
"learning_rate": 1.6689410061615823e-05,
"loss": 1.0453,
"step": 931
},
{
"epoch": 2.3836317135549874,
"grad_norm": 0.26422900568518476,
"learning_rate": 1.6679923524268748e-05,
"loss": 1.0691,
"step": 932
},
{
"epoch": 2.3861892583120206,
"grad_norm": 0.24062139672551194,
"learning_rate": 1.6670426119103762e-05,
"loss": 1.0527,
"step": 933
},
{
"epoch": 2.3887468030690537,
"grad_norm": 0.2440568759213169,
"learning_rate": 1.666091786157255e-05,
"loss": 1.039,
"step": 934
},
{
"epoch": 2.391304347826087,
"grad_norm": 0.24192631220648755,
"learning_rate": 1.6651398767144454e-05,
"loss": 1.0368,
"step": 935
},
{
"epoch": 2.39386189258312,
"grad_norm": 0.3094662604619502,
"learning_rate": 1.664186885130644e-05,
"loss": 1.0612,
"step": 936
},
{
"epoch": 2.3964194373401533,
"grad_norm": 0.22698815376801923,
"learning_rate": 1.6632328129563088e-05,
"loss": 1.0573,
"step": 937
},
{
"epoch": 2.398976982097187,
"grad_norm": 0.25713439762667506,
"learning_rate": 1.6622776617436556e-05,
"loss": 1.0689,
"step": 938
},
{
"epoch": 2.40153452685422,
"grad_norm": 0.21070288001877646,
"learning_rate": 1.6613214330466557e-05,
"loss": 1.0514,
"step": 939
},
{
"epoch": 2.4040920716112533,
"grad_norm": 0.2650104302111488,
"learning_rate": 1.6603641284210335e-05,
"loss": 1.0607,
"step": 940
},
{
"epoch": 2.4066496163682864,
"grad_norm": 0.24280091189228237,
"learning_rate": 1.6594057494242634e-05,
"loss": 1.0526,
"step": 941
},
{
"epoch": 2.4092071611253196,
"grad_norm": 0.2255724092281544,
"learning_rate": 1.6584462976155683e-05,
"loss": 1.0584,
"step": 942
},
{
"epoch": 2.411764705882353,
"grad_norm": 0.2704536970024839,
"learning_rate": 1.6574857745559168e-05,
"loss": 1.0621,
"step": 943
},
{
"epoch": 2.414322250639386,
"grad_norm": 0.29272610932834264,
"learning_rate": 1.656524181808019e-05,
"loss": 1.0625,
"step": 944
},
{
"epoch": 2.4168797953964196,
"grad_norm": 0.28911787491946217,
"learning_rate": 1.655561520936327e-05,
"loss": 1.0165,
"step": 945
},
{
"epoch": 2.419437340153453,
"grad_norm": 0.2532789709507061,
"learning_rate": 1.6545977935070293e-05,
"loss": 1.036,
"step": 946
},
{
"epoch": 2.421994884910486,
"grad_norm": 0.2522741919476773,
"learning_rate": 1.6536330010880502e-05,
"loss": 1.0879,
"step": 947
},
{
"epoch": 2.424552429667519,
"grad_norm": 0.2902148618078098,
"learning_rate": 1.652667145249047e-05,
"loss": 1.0447,
"step": 948
},
{
"epoch": 2.4271099744245523,
"grad_norm": 0.2266116217612757,
"learning_rate": 1.6517002275614062e-05,
"loss": 1.0603,
"step": 949
},
{
"epoch": 2.4296675191815855,
"grad_norm": 0.2855681782290051,
"learning_rate": 1.6507322495982433e-05,
"loss": 1.0415,
"step": 950
},
{
"epoch": 2.4322250639386187,
"grad_norm": 0.2666978671553076,
"learning_rate": 1.6497632129343964e-05,
"loss": 1.057,
"step": 951
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.25398223147396237,
"learning_rate": 1.6487931191464293e-05,
"loss": 1.0225,
"step": 952
},
{
"epoch": 2.4373401534526855,
"grad_norm": 0.27478774153195795,
"learning_rate": 1.647821969812623e-05,
"loss": 1.0743,
"step": 953
},
{
"epoch": 2.4398976982097187,
"grad_norm": 0.2548269730970245,
"learning_rate": 1.6468497665129767e-05,
"loss": 1.0753,
"step": 954
},
{
"epoch": 2.442455242966752,
"grad_norm": 0.2531646552603803,
"learning_rate": 1.645876510829205e-05,
"loss": 1.0502,
"step": 955
},
{
"epoch": 2.445012787723785,
"grad_norm": 0.2716259730414166,
"learning_rate": 1.6449022043447333e-05,
"loss": 1.0604,
"step": 956
},
{
"epoch": 2.4475703324808182,
"grad_norm": 0.2759652629992187,
"learning_rate": 1.6439268486446982e-05,
"loss": 1.0307,
"step": 957
},
{
"epoch": 2.4501278772378514,
"grad_norm": 0.284229730108131,
"learning_rate": 1.642950445315941e-05,
"loss": 1.0244,
"step": 958
},
{
"epoch": 2.452685421994885,
"grad_norm": 0.2857191939202473,
"learning_rate": 1.6419729959470107e-05,
"loss": 1.0475,
"step": 959
},
{
"epoch": 2.455242966751918,
"grad_norm": 0.24411876551827455,
"learning_rate": 1.6409945021281547e-05,
"loss": 1.0205,
"step": 960
},
{
"epoch": 2.4578005115089514,
"grad_norm": 0.2839219346381256,
"learning_rate": 1.6400149654513224e-05,
"loss": 1.0902,
"step": 961
},
{
"epoch": 2.4603580562659846,
"grad_norm": 0.290894600450773,
"learning_rate": 1.6390343875101582e-05,
"loss": 1.0655,
"step": 962
},
{
"epoch": 2.4629156010230178,
"grad_norm": 0.25018640254339125,
"learning_rate": 1.6380527699000012e-05,
"loss": 1.075,
"step": 963
},
{
"epoch": 2.4654731457800514,
"grad_norm": 0.314947984707885,
"learning_rate": 1.6370701142178815e-05,
"loss": 1.0802,
"step": 964
},
{
"epoch": 2.4680306905370846,
"grad_norm": 0.23513441288297676,
"learning_rate": 1.636086422062519e-05,
"loss": 1.0315,
"step": 965
},
{
"epoch": 2.4705882352941178,
"grad_norm": 0.26967522371119773,
"learning_rate": 1.635101695034319e-05,
"loss": 1.0454,
"step": 966
},
{
"epoch": 2.473145780051151,
"grad_norm": 0.2673917447835626,
"learning_rate": 1.6341159347353714e-05,
"loss": 1.0577,
"step": 967
},
{
"epoch": 2.475703324808184,
"grad_norm": 0.24623838061921519,
"learning_rate": 1.633129142769446e-05,
"loss": 1.0607,
"step": 968
},
{
"epoch": 2.4782608695652173,
"grad_norm": 0.5975989314807109,
"learning_rate": 1.6321413207419915e-05,
"loss": 1.0624,
"step": 969
},
{
"epoch": 2.4808184143222505,
"grad_norm": 0.2783985268403012,
"learning_rate": 1.6311524702601328e-05,
"loss": 1.0277,
"step": 970
},
{
"epoch": 2.483375959079284,
"grad_norm": 0.2948227168148377,
"learning_rate": 1.6301625929326682e-05,
"loss": 1.0509,
"step": 971
},
{
"epoch": 2.4859335038363173,
"grad_norm": 0.25464495418366273,
"learning_rate": 1.6291716903700657e-05,
"loss": 1.0743,
"step": 972
},
{
"epoch": 2.4884910485933505,
"grad_norm": 0.32267891042610297,
"learning_rate": 1.6281797641844615e-05,
"loss": 1.0528,
"step": 973
},
{
"epoch": 2.4910485933503836,
"grad_norm": 0.24461174022768228,
"learning_rate": 1.6271868159896583e-05,
"loss": 1.0536,
"step": 974
},
{
"epoch": 2.493606138107417,
"grad_norm": 0.3184259095166065,
"learning_rate": 1.6261928474011205e-05,
"loss": 1.0295,
"step": 975
},
{
"epoch": 2.49616368286445,
"grad_norm": 0.31223168542424856,
"learning_rate": 1.6251978600359727e-05,
"loss": 1.0611,
"step": 976
},
{
"epoch": 2.498721227621483,
"grad_norm": 0.24470883821957645,
"learning_rate": 1.6242018555129968e-05,
"loss": 1.0501,
"step": 977
},
{
"epoch": 2.501278772378517,
"grad_norm": 0.263841680832215,
"learning_rate": 1.6232048354526305e-05,
"loss": 1.0632,
"step": 978
},
{
"epoch": 2.50383631713555,
"grad_norm": 0.2799350053468126,
"learning_rate": 1.6222068014769626e-05,
"loss": 1.0669,
"step": 979
},
{
"epoch": 2.506393861892583,
"grad_norm": 0.23708656285849256,
"learning_rate": 1.6212077552097326e-05,
"loss": 1.0242,
"step": 980
},
{
"epoch": 2.5089514066496164,
"grad_norm": 0.32106303705514144,
"learning_rate": 1.6202076982763258e-05,
"loss": 1.038,
"step": 981
},
{
"epoch": 2.5115089514066495,
"grad_norm": 0.32641459248285415,
"learning_rate": 1.6192066323037723e-05,
"loss": 1.0192,
"step": 982
},
{
"epoch": 2.5140664961636827,
"grad_norm": 0.2374782294678397,
"learning_rate": 1.618204558920744e-05,
"loss": 1.0317,
"step": 983
},
{
"epoch": 2.516624040920716,
"grad_norm": 0.2669950742681541,
"learning_rate": 1.6172014797575512e-05,
"loss": 1.0604,
"step": 984
},
{
"epoch": 2.5191815856777495,
"grad_norm": 0.3289018657957539,
"learning_rate": 1.616197396446142e-05,
"loss": 1.0558,
"step": 985
},
{
"epoch": 2.5217391304347827,
"grad_norm": 0.30014120894320534,
"learning_rate": 1.6151923106200964e-05,
"loss": 1.0282,
"step": 986
},
{
"epoch": 2.524296675191816,
"grad_norm": 0.22934126760741957,
"learning_rate": 1.6141862239146263e-05,
"loss": 1.0442,
"step": 987
},
{
"epoch": 2.526854219948849,
"grad_norm": 0.3082443169061738,
"learning_rate": 1.613179137966572e-05,
"loss": 1.0671,
"step": 988
},
{
"epoch": 2.5294117647058822,
"grad_norm": 0.34264852115767747,
"learning_rate": 1.612171054414399e-05,
"loss": 1.0659,
"step": 989
},
{
"epoch": 2.531969309462916,
"grad_norm": 0.28840855857878017,
"learning_rate": 1.6111619748981967e-05,
"loss": 1.0757,
"step": 990
},
{
"epoch": 2.5345268542199486,
"grad_norm": 0.29679625325903564,
"learning_rate": 1.610151901059674e-05,
"loss": 1.0574,
"step": 991
},
{
"epoch": 2.5370843989769822,
"grad_norm": 0.2701305485919972,
"learning_rate": 1.6091408345421583e-05,
"loss": 1.076,
"step": 992
},
{
"epoch": 2.5396419437340154,
"grad_norm": 0.27772319714999755,
"learning_rate": 1.6081287769905914e-05,
"loss": 1.0557,
"step": 993
},
{
"epoch": 2.5421994884910486,
"grad_norm": 0.2575298835482317,
"learning_rate": 1.6071157300515274e-05,
"loss": 1.0371,
"step": 994
},
{
"epoch": 2.544757033248082,
"grad_norm": 0.2434229348885953,
"learning_rate": 1.6061016953731307e-05,
"loss": 1.0293,
"step": 995
},
{
"epoch": 2.547314578005115,
"grad_norm": 0.24931228820010734,
"learning_rate": 1.6050866746051722e-05,
"loss": 1.0497,
"step": 996
},
{
"epoch": 2.5498721227621486,
"grad_norm": 0.24970615225374868,
"learning_rate": 1.6040706693990272e-05,
"loss": 1.0507,
"step": 997
},
{
"epoch": 2.5524296675191813,
"grad_norm": 0.2705848075384666,
"learning_rate": 1.6030536814076722e-05,
"loss": 1.051,
"step": 998
},
{
"epoch": 2.554987212276215,
"grad_norm": 0.2645976951028759,
"learning_rate": 1.602035712285684e-05,
"loss": 1.044,
"step": 999
},
{
"epoch": 2.557544757033248,
"grad_norm": 0.25280588284501737,
"learning_rate": 1.6010167636892338e-05,
"loss": 1.0466,
"step": 1000
},
{
"epoch": 2.5601023017902813,
"grad_norm": 0.23309975174376094,
"learning_rate": 1.5999968372760882e-05,
"loss": 1.0503,
"step": 1001
},
{
"epoch": 2.5626598465473145,
"grad_norm": 0.24003131974818753,
"learning_rate": 1.5989759347056028e-05,
"loss": 1.0428,
"step": 1002
},
{
"epoch": 2.5652173913043477,
"grad_norm": 0.22803670250684518,
"learning_rate": 1.5979540576387226e-05,
"loss": 1.067,
"step": 1003
},
{
"epoch": 2.5677749360613813,
"grad_norm": 0.23366692767216873,
"learning_rate": 1.596931207737978e-05,
"loss": 1.0735,
"step": 1004
},
{
"epoch": 2.5703324808184145,
"grad_norm": 0.2514628572179653,
"learning_rate": 1.5959073866674812e-05,
"loss": 1.0683,
"step": 1005
},
{
"epoch": 2.5728900255754477,
"grad_norm": 0.2647695835957155,
"learning_rate": 1.594882596092926e-05,
"loss": 1.006,
"step": 1006
},
{
"epoch": 2.575447570332481,
"grad_norm": 0.2705206567562451,
"learning_rate": 1.5938568376815816e-05,
"loss": 1.0815,
"step": 1007
},
{
"epoch": 2.578005115089514,
"grad_norm": 0.26218100830771535,
"learning_rate": 1.5928301131022933e-05,
"loss": 1.0712,
"step": 1008
},
{
"epoch": 2.580562659846547,
"grad_norm": 0.24704018764157912,
"learning_rate": 1.5918024240254778e-05,
"loss": 1.069,
"step": 1009
},
{
"epoch": 2.5831202046035804,
"grad_norm": 0.3099818232532923,
"learning_rate": 1.5907737721231205e-05,
"loss": 1.0485,
"step": 1010
},
{
"epoch": 2.585677749360614,
"grad_norm": 0.2976698121714401,
"learning_rate": 1.5897441590687747e-05,
"loss": 1.0577,
"step": 1011
},
{
"epoch": 2.588235294117647,
"grad_norm": 0.25285713641828206,
"learning_rate": 1.5887135865375552e-05,
"loss": 1.0603,
"step": 1012
},
{
"epoch": 2.5907928388746804,
"grad_norm": 0.2526446484384057,
"learning_rate": 1.5876820562061402e-05,
"loss": 1.0433,
"step": 1013
},
{
"epoch": 2.5933503836317136,
"grad_norm": 0.29067294932967996,
"learning_rate": 1.586649569752765e-05,
"loss": 1.0616,
"step": 1014
},
{
"epoch": 2.5959079283887467,
"grad_norm": 0.282910218177146,
"learning_rate": 1.5856161288572195e-05,
"loss": 1.0413,
"step": 1015
},
{
"epoch": 2.59846547314578,
"grad_norm": 0.2268843181296163,
"learning_rate": 1.5845817352008485e-05,
"loss": 1.0407,
"step": 1016
},
{
"epoch": 2.601023017902813,
"grad_norm": 0.22762472803069236,
"learning_rate": 1.583546390466545e-05,
"loss": 1.0536,
"step": 1017
},
{
"epoch": 2.6035805626598467,
"grad_norm": 0.23603794648210832,
"learning_rate": 1.58251009633875e-05,
"loss": 1.0571,
"step": 1018
},
{
"epoch": 2.60613810741688,
"grad_norm": 0.2676423332930833,
"learning_rate": 1.5814728545034503e-05,
"loss": 1.0297,
"step": 1019
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.25371119273646303,
"learning_rate": 1.5804346666481728e-05,
"loss": 1.037,
"step": 1020
},
{
"epoch": 2.6112531969309463,
"grad_norm": 0.23765073500378178,
"learning_rate": 1.5793955344619846e-05,
"loss": 1.0493,
"step": 1021
},
{
"epoch": 2.6138107416879794,
"grad_norm": 0.28479895070770733,
"learning_rate": 1.5783554596354885e-05,
"loss": 1.0428,
"step": 1022
},
{
"epoch": 2.6163682864450126,
"grad_norm": 0.2610596840924324,
"learning_rate": 1.577314443860821e-05,
"loss": 1.0659,
"step": 1023
},
{
"epoch": 2.618925831202046,
"grad_norm": 0.24670717715351206,
"learning_rate": 1.57627248883165e-05,
"loss": 1.0434,
"step": 1024
},
{
"epoch": 2.6214833759590794,
"grad_norm": 0.22640840073229135,
"learning_rate": 1.575229596243171e-05,
"loss": 1.043,
"step": 1025
},
{
"epoch": 2.6240409207161126,
"grad_norm": 0.25314200985521523,
"learning_rate": 1.574185767792106e-05,
"loss": 1.0494,
"step": 1026
},
{
"epoch": 2.626598465473146,
"grad_norm": 0.21470094174624627,
"learning_rate": 1.573141005176697e-05,
"loss": 1.0568,
"step": 1027
},
{
"epoch": 2.629156010230179,
"grad_norm": 0.23151889692704267,
"learning_rate": 1.5720953100967085e-05,
"loss": 1.0648,
"step": 1028
},
{
"epoch": 2.631713554987212,
"grad_norm": 0.21397184877158426,
"learning_rate": 1.5710486842534206e-05,
"loss": 1.0663,
"step": 1029
},
{
"epoch": 2.634271099744246,
"grad_norm": 0.22192997813660584,
"learning_rate": 1.5700011293496285e-05,
"loss": 1.0534,
"step": 1030
},
{
"epoch": 2.6368286445012785,
"grad_norm": 0.21407356154899657,
"learning_rate": 1.568952647089638e-05,
"loss": 1.059,
"step": 1031
},
{
"epoch": 2.639386189258312,
"grad_norm": 0.21832618515669033,
"learning_rate": 1.5679032391792648e-05,
"loss": 1.0221,
"step": 1032
},
{
"epoch": 2.6419437340153453,
"grad_norm": 0.24431871394272658,
"learning_rate": 1.5668529073258298e-05,
"loss": 1.0858,
"step": 1033
},
{
"epoch": 2.6445012787723785,
"grad_norm": 0.31234951434869057,
"learning_rate": 1.5658016532381565e-05,
"loss": 1.06,
"step": 1034
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.2080542192295102,
"learning_rate": 1.5647494786265705e-05,
"loss": 1.0651,
"step": 1035
},
{
"epoch": 2.649616368286445,
"grad_norm": 0.24670278561413833,
"learning_rate": 1.5636963852028936e-05,
"loss": 1.0373,
"step": 1036
},
{
"epoch": 2.6521739130434785,
"grad_norm": 0.23750220801463004,
"learning_rate": 1.5626423746804433e-05,
"loss": 1.0426,
"step": 1037
},
{
"epoch": 2.6547314578005117,
"grad_norm": 0.24041568140574793,
"learning_rate": 1.5615874487740287e-05,
"loss": 1.0504,
"step": 1038
},
{
"epoch": 2.657289002557545,
"grad_norm": 0.2389633958150457,
"learning_rate": 1.560531609199948e-05,
"loss": 1.0572,
"step": 1039
},
{
"epoch": 2.659846547314578,
"grad_norm": 0.2770548151196396,
"learning_rate": 1.559474857675986e-05,
"loss": 1.068,
"step": 1040
},
{
"epoch": 2.662404092071611,
"grad_norm": 0.266725154908083,
"learning_rate": 1.5584171959214126e-05,
"loss": 1.0449,
"step": 1041
},
{
"epoch": 2.6649616368286444,
"grad_norm": 0.25482885945652345,
"learning_rate": 1.557358625656976e-05,
"loss": 1.0784,
"step": 1042
},
{
"epoch": 2.6675191815856776,
"grad_norm": 0.264472394184579,
"learning_rate": 1.5562991486049045e-05,
"loss": 1.0118,
"step": 1043
},
{
"epoch": 2.670076726342711,
"grad_norm": 0.2848797989882817,
"learning_rate": 1.555238766488901e-05,
"loss": 1.0555,
"step": 1044
},
{
"epoch": 2.6726342710997444,
"grad_norm": 0.24695033243914596,
"learning_rate": 1.5541774810341404e-05,
"loss": 1.0402,
"step": 1045
},
{
"epoch": 2.6751918158567776,
"grad_norm": 0.20315866222350132,
"learning_rate": 1.5531152939672683e-05,
"loss": 1.0251,
"step": 1046
},
{
"epoch": 2.6777493606138107,
"grad_norm": 0.2608581931242649,
"learning_rate": 1.5520522070163962e-05,
"loss": 1.0549,
"step": 1047
},
{
"epoch": 2.680306905370844,
"grad_norm": 0.3085807293166213,
"learning_rate": 1.550988221911101e-05,
"loss": 1.0586,
"step": 1048
},
{
"epoch": 2.682864450127877,
"grad_norm": 0.22686082652143869,
"learning_rate": 1.549923340382419e-05,
"loss": 1.0315,
"step": 1049
},
{
"epoch": 2.6854219948849103,
"grad_norm": 0.23840859030860576,
"learning_rate": 1.548857564162846e-05,
"loss": 1.0542,
"step": 1050
},
{
"epoch": 2.687979539641944,
"grad_norm": 0.2828144148836396,
"learning_rate": 1.5477908949863335e-05,
"loss": 1.0546,
"step": 1051
},
{
"epoch": 2.690537084398977,
"grad_norm": 0.24462451577997144,
"learning_rate": 1.5467233345882858e-05,
"loss": 1.05,
"step": 1052
},
{
"epoch": 2.6930946291560103,
"grad_norm": 0.2608389325913873,
"learning_rate": 1.5456548847055565e-05,
"loss": 1.0582,
"step": 1053
},
{
"epoch": 2.6956521739130435,
"grad_norm": 0.2341653521141245,
"learning_rate": 1.5445855470764467e-05,
"loss": 1.0227,
"step": 1054
},
{
"epoch": 2.6982097186700766,
"grad_norm": 0.2001748409496552,
"learning_rate": 1.5435153234407023e-05,
"loss": 1.0361,
"step": 1055
},
{
"epoch": 2.70076726342711,
"grad_norm": 0.24778418959062198,
"learning_rate": 1.5424442155395095e-05,
"loss": 1.0556,
"step": 1056
},
{
"epoch": 2.703324808184143,
"grad_norm": 0.23891064433631373,
"learning_rate": 1.5413722251154947e-05,
"loss": 1.0583,
"step": 1057
},
{
"epoch": 2.7058823529411766,
"grad_norm": 0.18730639273619554,
"learning_rate": 1.540299353912719e-05,
"loss": 1.0461,
"step": 1058
},
{
"epoch": 2.70843989769821,
"grad_norm": 0.22764007423409213,
"learning_rate": 1.5392256036766767e-05,
"loss": 1.0723,
"step": 1059
},
{
"epoch": 2.710997442455243,
"grad_norm": 0.2161337514937876,
"learning_rate": 1.5381509761542925e-05,
"loss": 1.0303,
"step": 1060
},
{
"epoch": 2.713554987212276,
"grad_norm": 0.23665490844389125,
"learning_rate": 1.537075473093918e-05,
"loss": 1.072,
"step": 1061
},
{
"epoch": 2.7161125319693094,
"grad_norm": 0.2171745194472315,
"learning_rate": 1.535999096245329e-05,
"loss": 1.0609,
"step": 1062
},
{
"epoch": 2.718670076726343,
"grad_norm": 0.27479490086390757,
"learning_rate": 1.5349218473597244e-05,
"loss": 1.0976,
"step": 1063
},
{
"epoch": 2.7212276214833757,
"grad_norm": 0.23802159891837593,
"learning_rate": 1.5338437281897196e-05,
"loss": 1.0561,
"step": 1064
},
{
"epoch": 2.7237851662404093,
"grad_norm": 0.23413108216980624,
"learning_rate": 1.532764740489348e-05,
"loss": 1.0249,
"step": 1065
},
{
"epoch": 2.7263427109974425,
"grad_norm": 0.23839123328370654,
"learning_rate": 1.5316848860140545e-05,
"loss": 1.0448,
"step": 1066
},
{
"epoch": 2.7289002557544757,
"grad_norm": 0.26889749126936374,
"learning_rate": 1.530604166520695e-05,
"loss": 1.0538,
"step": 1067
},
{
"epoch": 2.731457800511509,
"grad_norm": 0.23104275616772496,
"learning_rate": 1.529522583767533e-05,
"loss": 1.0709,
"step": 1068
},
{
"epoch": 2.734015345268542,
"grad_norm": 0.26947945752974595,
"learning_rate": 1.5284401395142356e-05,
"loss": 1.0476,
"step": 1069
},
{
"epoch": 2.7365728900255757,
"grad_norm": 0.2650970504236315,
"learning_rate": 1.5273568355218714e-05,
"loss": 1.0906,
"step": 1070
},
{
"epoch": 2.7391304347826084,
"grad_norm": 0.2426600100365933,
"learning_rate": 1.5262726735529096e-05,
"loss": 1.0421,
"step": 1071
},
{
"epoch": 2.741687979539642,
"grad_norm": 0.2565653498953779,
"learning_rate": 1.5251876553712129e-05,
"loss": 1.0714,
"step": 1072
},
{
"epoch": 2.7442455242966752,
"grad_norm": 0.2590844357725753,
"learning_rate": 1.5241017827420379e-05,
"loss": 1.0529,
"step": 1073
},
{
"epoch": 2.7468030690537084,
"grad_norm": 0.2661157616076656,
"learning_rate": 1.523015057432032e-05,
"loss": 1.0413,
"step": 1074
},
{
"epoch": 2.7493606138107416,
"grad_norm": 0.2316877382855349,
"learning_rate": 1.5219274812092297e-05,
"loss": 1.0965,
"step": 1075
},
{
"epoch": 2.7519181585677748,
"grad_norm": 0.281689753856549,
"learning_rate": 1.5208390558430486e-05,
"loss": 1.0506,
"step": 1076
},
{
"epoch": 2.7544757033248084,
"grad_norm": 0.25889609476509934,
"learning_rate": 1.5197497831042891e-05,
"loss": 1.0701,
"step": 1077
},
{
"epoch": 2.7570332480818416,
"grad_norm": 0.25370938447354224,
"learning_rate": 1.5186596647651299e-05,
"loss": 1.0344,
"step": 1078
},
{
"epoch": 2.7595907928388748,
"grad_norm": 0.21590996086487077,
"learning_rate": 1.5175687025991254e-05,
"loss": 1.0111,
"step": 1079
},
{
"epoch": 2.762148337595908,
"grad_norm": 0.25136209115240976,
"learning_rate": 1.5164768983812031e-05,
"loss": 1.0594,
"step": 1080
},
{
"epoch": 2.764705882352941,
"grad_norm": 0.2296309073317973,
"learning_rate": 1.5153842538876595e-05,
"loss": 1.0195,
"step": 1081
},
{
"epoch": 2.7672634271099743,
"grad_norm": 0.2188880236827278,
"learning_rate": 1.5142907708961594e-05,
"loss": 1.0563,
"step": 1082
},
{
"epoch": 2.7698209718670075,
"grad_norm": 0.29043124524993463,
"learning_rate": 1.5131964511857307e-05,
"loss": 1.0579,
"step": 1083
},
{
"epoch": 2.772378516624041,
"grad_norm": 0.23042976434473456,
"learning_rate": 1.512101296536764e-05,
"loss": 1.0594,
"step": 1084
},
{
"epoch": 2.7749360613810743,
"grad_norm": 0.3064542379695439,
"learning_rate": 1.5110053087310067e-05,
"loss": 1.0347,
"step": 1085
},
{
"epoch": 2.7774936061381075,
"grad_norm": 0.2990911954190306,
"learning_rate": 1.5099084895515633e-05,
"loss": 1.0872,
"step": 1086
},
{
"epoch": 2.7800511508951407,
"grad_norm": 0.30238830537129957,
"learning_rate": 1.5088108407828887e-05,
"loss": 1.0102,
"step": 1087
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.22800852447745912,
"learning_rate": 1.5077123642107901e-05,
"loss": 1.0373,
"step": 1088
},
{
"epoch": 2.785166240409207,
"grad_norm": 0.26466118290058793,
"learning_rate": 1.5066130616224194e-05,
"loss": 1.0601,
"step": 1089
},
{
"epoch": 2.78772378516624,
"grad_norm": 0.3134236905423725,
"learning_rate": 1.5055129348062733e-05,
"loss": 1.0282,
"step": 1090
},
{
"epoch": 2.790281329923274,
"grad_norm": 0.30040919493276264,
"learning_rate": 1.5044119855521899e-05,
"loss": 1.0028,
"step": 1091
},
{
"epoch": 2.792838874680307,
"grad_norm": 0.3018437088485077,
"learning_rate": 1.5033102156513442e-05,
"loss": 1.0642,
"step": 1092
},
{
"epoch": 2.79539641943734,
"grad_norm": 0.2594288455529522,
"learning_rate": 1.5022076268962474e-05,
"loss": 1.0651,
"step": 1093
},
{
"epoch": 2.7979539641943734,
"grad_norm": 0.2427672329241251,
"learning_rate": 1.5011042210807416e-05,
"loss": 1.0499,
"step": 1094
},
{
"epoch": 2.8005115089514065,
"grad_norm": 0.2753688016374087,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.0441,
"step": 1095
},
{
"epoch": 2.80306905370844,
"grad_norm": 0.333646004575826,
"learning_rate": 1.4988949654505212e-05,
"loss": 1.0954,
"step": 1096
},
{
"epoch": 2.805626598465473,
"grad_norm": 0.24884374092942535,
"learning_rate": 1.4977891192301266e-05,
"loss": 1.0616,
"step": 1097
},
{
"epoch": 2.8081841432225065,
"grad_norm": 0.25576802318021363,
"learning_rate": 1.4966824631379595e-05,
"loss": 1.0767,
"step": 1098
},
{
"epoch": 2.8107416879795397,
"grad_norm": 0.2726811004318987,
"learning_rate": 1.49557499897448e-05,
"loss": 1.0629,
"step": 1099
},
{
"epoch": 2.813299232736573,
"grad_norm": 0.2490020562964201,
"learning_rate": 1.4944667285414629e-05,
"loss": 1.0401,
"step": 1100
},
{
"epoch": 2.815856777493606,
"grad_norm": 0.230153454763048,
"learning_rate": 1.4933576536419951e-05,
"loss": 1.0681,
"step": 1101
},
{
"epoch": 2.8184143222506393,
"grad_norm": 0.29290021173573333,
"learning_rate": 1.492247776080472e-05,
"loss": 1.0478,
"step": 1102
},
{
"epoch": 2.820971867007673,
"grad_norm": 0.22373455728798555,
"learning_rate": 1.4911370976625951e-05,
"loss": 1.0646,
"step": 1103
},
{
"epoch": 2.8235294117647056,
"grad_norm": 0.2867670697761132,
"learning_rate": 1.4900256201953686e-05,
"loss": 1.0395,
"step": 1104
},
{
"epoch": 2.8260869565217392,
"grad_norm": 0.2580511336465639,
"learning_rate": 1.488913345487097e-05,
"loss": 1.0299,
"step": 1105
},
{
"epoch": 2.8286445012787724,
"grad_norm": 0.30823901300584283,
"learning_rate": 1.4878002753473814e-05,
"loss": 1.0588,
"step": 1106
},
{
"epoch": 2.8312020460358056,
"grad_norm": 0.26061529857491966,
"learning_rate": 1.486686411587118e-05,
"loss": 1.0544,
"step": 1107
},
{
"epoch": 2.833759590792839,
"grad_norm": 0.3411340236384177,
"learning_rate": 1.4855717560184925e-05,
"loss": 1.0673,
"step": 1108
},
{
"epoch": 2.836317135549872,
"grad_norm": 0.3112034427743734,
"learning_rate": 1.4844563104549808e-05,
"loss": 1.0702,
"step": 1109
},
{
"epoch": 2.8388746803069056,
"grad_norm": 0.26159448325094614,
"learning_rate": 1.4833400767113425e-05,
"loss": 1.0518,
"step": 1110
},
{
"epoch": 2.8414322250639388,
"grad_norm": 0.24843885045239295,
"learning_rate": 1.48222305660362e-05,
"loss": 1.0519,
"step": 1111
},
{
"epoch": 2.843989769820972,
"grad_norm": 0.34052436576940476,
"learning_rate": 1.4811052519491358e-05,
"loss": 1.0621,
"step": 1112
},
{
"epoch": 2.846547314578005,
"grad_norm": 0.25035667041534276,
"learning_rate": 1.4799866645664875e-05,
"loss": 1.0495,
"step": 1113
},
{
"epoch": 2.8491048593350383,
"grad_norm": 0.23950107492766087,
"learning_rate": 1.4788672962755474e-05,
"loss": 1.0474,
"step": 1114
},
{
"epoch": 2.8516624040920715,
"grad_norm": 0.2228748439468561,
"learning_rate": 1.4777471488974573e-05,
"loss": 1.056,
"step": 1115
},
{
"epoch": 2.8542199488491047,
"grad_norm": 0.21686894636285,
"learning_rate": 1.476626224254627e-05,
"loss": 1.0473,
"step": 1116
},
{
"epoch": 2.8567774936061383,
"grad_norm": 0.21336673271033718,
"learning_rate": 1.475504524170731e-05,
"loss": 1.0327,
"step": 1117
},
{
"epoch": 2.8593350383631715,
"grad_norm": 0.2412247096897979,
"learning_rate": 1.4743820504707054e-05,
"loss": 1.0603,
"step": 1118
},
{
"epoch": 2.8618925831202047,
"grad_norm": 0.20338495510222906,
"learning_rate": 1.4732588049807442e-05,
"loss": 1.0345,
"step": 1119
},
{
"epoch": 2.864450127877238,
"grad_norm": 0.2224056939046196,
"learning_rate": 1.4721347895282977e-05,
"loss": 1.0932,
"step": 1120
},
{
"epoch": 2.867007672634271,
"grad_norm": 0.21219190570803861,
"learning_rate": 1.4710100059420693e-05,
"loss": 1.0577,
"step": 1121
},
{
"epoch": 2.869565217391304,
"grad_norm": 0.23417177032958478,
"learning_rate": 1.4698844560520107e-05,
"loss": 1.04,
"step": 1122
},
{
"epoch": 2.8721227621483374,
"grad_norm": 0.21756710346483277,
"learning_rate": 1.4687581416893218e-05,
"loss": 1.0115,
"step": 1123
},
{
"epoch": 2.874680306905371,
"grad_norm": 0.27116811809019226,
"learning_rate": 1.4676310646864455e-05,
"loss": 1.0925,
"step": 1124
},
{
"epoch": 2.877237851662404,
"grad_norm": 0.20359779513752466,
"learning_rate": 1.4665032268770656e-05,
"loss": 1.0662,
"step": 1125
},
{
"epoch": 2.8797953964194374,
"grad_norm": 0.25086860996163834,
"learning_rate": 1.4653746300961037e-05,
"loss": 1.0615,
"step": 1126
},
{
"epoch": 2.8823529411764706,
"grad_norm": 0.21619154701357268,
"learning_rate": 1.4642452761797166e-05,
"loss": 1.028,
"step": 1127
},
{
"epoch": 2.8849104859335037,
"grad_norm": 0.23657771626030477,
"learning_rate": 1.4631151669652917e-05,
"loss": 1.0339,
"step": 1128
},
{
"epoch": 2.887468030690537,
"grad_norm": 0.25435410320469787,
"learning_rate": 1.4619843042914466e-05,
"loss": 1.0382,
"step": 1129
},
{
"epoch": 2.89002557544757,
"grad_norm": 0.3165858987447032,
"learning_rate": 1.4608526899980238e-05,
"loss": 1.0631,
"step": 1130
},
{
"epoch": 2.8925831202046037,
"grad_norm": 0.3059530735276844,
"learning_rate": 1.4597203259260893e-05,
"loss": 1.0742,
"step": 1131
},
{
"epoch": 2.895140664961637,
"grad_norm": 0.23231123365328338,
"learning_rate": 1.4585872139179284e-05,
"loss": 1.0108,
"step": 1132
},
{
"epoch": 2.89769820971867,
"grad_norm": 0.32159788413714113,
"learning_rate": 1.457453355817044e-05,
"loss": 1.0343,
"step": 1133
},
{
"epoch": 2.9002557544757033,
"grad_norm": 0.2624561212579556,
"learning_rate": 1.456318753468152e-05,
"loss": 1.0344,
"step": 1134
},
{
"epoch": 2.9028132992327365,
"grad_norm": 0.21340797781295,
"learning_rate": 1.455183408717179e-05,
"loss": 1.0582,
"step": 1135
},
{
"epoch": 2.90537084398977,
"grad_norm": 0.27498982896150626,
"learning_rate": 1.4540473234112607e-05,
"loss": 1.0319,
"step": 1136
},
{
"epoch": 2.907928388746803,
"grad_norm": 0.26787413886350847,
"learning_rate": 1.4529104993987364e-05,
"loss": 1.094,
"step": 1137
},
{
"epoch": 2.9104859335038364,
"grad_norm": 0.22411507204789752,
"learning_rate": 1.4517729385291479e-05,
"loss": 1.0289,
"step": 1138
},
{
"epoch": 2.9130434782608696,
"grad_norm": 0.3186727715150146,
"learning_rate": 1.4506346426532356e-05,
"loss": 1.0474,
"step": 1139
},
{
"epoch": 2.915601023017903,
"grad_norm": 0.23017658335190225,
"learning_rate": 1.4494956136229356e-05,
"loss": 1.0406,
"step": 1140
},
{
"epoch": 2.918158567774936,
"grad_norm": 0.2469732487522561,
"learning_rate": 1.448355853291377e-05,
"loss": 1.0545,
"step": 1141
},
{
"epoch": 2.920716112531969,
"grad_norm": 0.34257461951959434,
"learning_rate": 1.4472153635128787e-05,
"loss": 1.0649,
"step": 1142
},
{
"epoch": 2.923273657289003,
"grad_norm": 0.26582238607210484,
"learning_rate": 1.4460741461429457e-05,
"loss": 1.0643,
"step": 1143
},
{
"epoch": 2.9258312020460355,
"grad_norm": 0.238713886041743,
"learning_rate": 1.4449322030382681e-05,
"loss": 1.0375,
"step": 1144
},
{
"epoch": 2.928388746803069,
"grad_norm": 0.28544164960503227,
"learning_rate": 1.4437895360567156e-05,
"loss": 1.0459,
"step": 1145
},
{
"epoch": 2.9309462915601023,
"grad_norm": 0.30617216188801405,
"learning_rate": 1.4426461470573358e-05,
"loss": 1.0352,
"step": 1146
},
{
"epoch": 2.9335038363171355,
"grad_norm": 0.23250706835607923,
"learning_rate": 1.4415020379003513e-05,
"loss": 1.0547,
"step": 1147
},
{
"epoch": 2.9360613810741687,
"grad_norm": 0.23449213816934886,
"learning_rate": 1.4403572104471559e-05,
"loss": 1.0506,
"step": 1148
},
{
"epoch": 2.938618925831202,
"grad_norm": 0.26285727807721854,
"learning_rate": 1.4392116665603123e-05,
"loss": 1.067,
"step": 1149
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.25864228967500363,
"learning_rate": 1.4380654081035492e-05,
"loss": 1.0566,
"step": 1150
},
{
"epoch": 2.9437340153452687,
"grad_norm": 0.2197313587417355,
"learning_rate": 1.4369184369417573e-05,
"loss": 1.069,
"step": 1151
},
{
"epoch": 2.946291560102302,
"grad_norm": 0.22175625255078796,
"learning_rate": 1.4357707549409865e-05,
"loss": 1.0393,
"step": 1152
},
{
"epoch": 2.948849104859335,
"grad_norm": 0.20734806987916835,
"learning_rate": 1.4346223639684445e-05,
"loss": 1.0629,
"step": 1153
},
{
"epoch": 2.9514066496163682,
"grad_norm": 0.20844980678105798,
"learning_rate": 1.4334732658924906e-05,
"loss": 1.0683,
"step": 1154
},
{
"epoch": 2.9539641943734014,
"grad_norm": 0.1986457605182691,
"learning_rate": 1.4323234625826363e-05,
"loss": 1.082,
"step": 1155
},
{
"epoch": 2.9565217391304346,
"grad_norm": 0.2173974733436024,
"learning_rate": 1.4311729559095391e-05,
"loss": 1.0579,
"step": 1156
},
{
"epoch": 2.959079283887468,
"grad_norm": 0.23569051033252647,
"learning_rate": 1.430021747745002e-05,
"loss": 1.0501,
"step": 1157
},
{
"epoch": 2.9616368286445014,
"grad_norm": 0.1958953354096487,
"learning_rate": 1.4288698399619682e-05,
"loss": 1.0423,
"step": 1158
},
{
"epoch": 2.9641943734015346,
"grad_norm": 0.24550680925330018,
"learning_rate": 1.4277172344345203e-05,
"loss": 1.0429,
"step": 1159
},
{
"epoch": 2.9667519181585678,
"grad_norm": 0.22335624269922177,
"learning_rate": 1.4265639330378751e-05,
"loss": 1.0637,
"step": 1160
},
{
"epoch": 2.969309462915601,
"grad_norm": 0.19207777433952558,
"learning_rate": 1.4254099376483814e-05,
"loss": 1.032,
"step": 1161
},
{
"epoch": 2.971867007672634,
"grad_norm": 0.21933228277599973,
"learning_rate": 1.424255250143518e-05,
"loss": 1.0399,
"step": 1162
},
{
"epoch": 2.9744245524296673,
"grad_norm": 0.2042696972237095,
"learning_rate": 1.423099872401889e-05,
"loss": 1.082,
"step": 1163
},
{
"epoch": 2.976982097186701,
"grad_norm": 0.23521017440946976,
"learning_rate": 1.4219438063032223e-05,
"loss": 1.0337,
"step": 1164
},
{
"epoch": 2.979539641943734,
"grad_norm": 0.23773407464153606,
"learning_rate": 1.4207870537283645e-05,
"loss": 1.0464,
"step": 1165
},
{
"epoch": 2.9820971867007673,
"grad_norm": 0.19999456866670134,
"learning_rate": 1.4196296165592804e-05,
"loss": 1.0738,
"step": 1166
},
{
"epoch": 2.9846547314578005,
"grad_norm": 0.24196149952728568,
"learning_rate": 1.4184714966790472e-05,
"loss": 1.0515,
"step": 1167
},
{
"epoch": 2.9872122762148337,
"grad_norm": 0.2078635385282362,
"learning_rate": 1.4173126959718542e-05,
"loss": 1.0685,
"step": 1168
},
{
"epoch": 2.9897698209718673,
"grad_norm": 0.22519888128468324,
"learning_rate": 1.416153216322997e-05,
"loss": 1.0406,
"step": 1169
},
{
"epoch": 2.9923273657289,
"grad_norm": 0.23526057180385235,
"learning_rate": 1.4149930596188768e-05,
"loss": 1.0388,
"step": 1170
},
{
"epoch": 2.9948849104859336,
"grad_norm": 0.23228687433861023,
"learning_rate": 1.4138322277469962e-05,
"loss": 1.035,
"step": 1171
},
{
"epoch": 2.997442455242967,
"grad_norm": 0.23799687340205392,
"learning_rate": 1.412670722595956e-05,
"loss": 1.0798,
"step": 1172
},
{
"epoch": 3.0,
"grad_norm": 0.22605319189413042,
"learning_rate": 1.4115085460554524e-05,
"loss": 1.0724,
"step": 1173
},
{
"epoch": 3.002557544757033,
"grad_norm": 0.22583372556086656,
"learning_rate": 1.410345700016274e-05,
"loss": 1.0653,
"step": 1174
},
{
"epoch": 3.0051150895140664,
"grad_norm": 0.20810235633737204,
"learning_rate": 1.4091821863702983e-05,
"loss": 1.0641,
"step": 1175
},
{
"epoch": 3.0076726342710995,
"grad_norm": 0.20645828983892262,
"learning_rate": 1.4080180070104897e-05,
"loss": 1.0426,
"step": 1176
},
{
"epoch": 3.010230179028133,
"grad_norm": 0.20345366792505884,
"learning_rate": 1.406853163830895e-05,
"loss": 1.0849,
"step": 1177
},
{
"epoch": 3.0127877237851663,
"grad_norm": 0.21212291453565033,
"learning_rate": 1.4056876587266413e-05,
"loss": 1.0687,
"step": 1178
},
{
"epoch": 3.0153452685421995,
"grad_norm": 0.19908450369242628,
"learning_rate": 1.4045214935939323e-05,
"loss": 1.0193,
"step": 1179
},
{
"epoch": 3.0179028132992327,
"grad_norm": 0.22127953869549283,
"learning_rate": 1.4033546703300465e-05,
"loss": 1.027,
"step": 1180
},
{
"epoch": 3.020460358056266,
"grad_norm": 0.2284795334598278,
"learning_rate": 1.402187190833331e-05,
"loss": 1.041,
"step": 1181
},
{
"epoch": 3.023017902813299,
"grad_norm": 0.2062329065326131,
"learning_rate": 1.4010190570032034e-05,
"loss": 1.0371,
"step": 1182
},
{
"epoch": 3.0255754475703327,
"grad_norm": 0.19478100964489237,
"learning_rate": 1.3998502707401437e-05,
"loss": 1.0578,
"step": 1183
},
{
"epoch": 3.028132992327366,
"grad_norm": 0.22168971452412287,
"learning_rate": 1.398680833945694e-05,
"loss": 1.023,
"step": 1184
},
{
"epoch": 3.030690537084399,
"grad_norm": 0.2040809628837293,
"learning_rate": 1.3975107485224552e-05,
"loss": 1.0382,
"step": 1185
},
{
"epoch": 3.0332480818414322,
"grad_norm": 0.2051983553640489,
"learning_rate": 1.3963400163740828e-05,
"loss": 1.0186,
"step": 1186
},
{
"epoch": 3.0358056265984654,
"grad_norm": 0.2350671015231016,
"learning_rate": 1.395168639405285e-05,
"loss": 1.0455,
"step": 1187
},
{
"epoch": 3.0383631713554986,
"grad_norm": 0.22621448501355076,
"learning_rate": 1.3939966195218188e-05,
"loss": 1.0074,
"step": 1188
},
{
"epoch": 3.040920716112532,
"grad_norm": 0.23737640534776971,
"learning_rate": 1.3928239586304873e-05,
"loss": 1.0437,
"step": 1189
},
{
"epoch": 3.0434782608695654,
"grad_norm": 0.2323257168547048,
"learning_rate": 1.3916506586391364e-05,
"loss": 1.0327,
"step": 1190
},
{
"epoch": 3.0460358056265986,
"grad_norm": 0.22305161499533654,
"learning_rate": 1.390476721456652e-05,
"loss": 1.0099,
"step": 1191
},
{
"epoch": 3.0485933503836318,
"grad_norm": 0.23535858097990897,
"learning_rate": 1.3893021489929564e-05,
"loss": 1.051,
"step": 1192
},
{
"epoch": 3.051150895140665,
"grad_norm": 0.20326385048979087,
"learning_rate": 1.3881269431590052e-05,
"loss": 1.057,
"step": 1193
},
{
"epoch": 3.053708439897698,
"grad_norm": 0.21150204467244554,
"learning_rate": 1.3869511058667855e-05,
"loss": 1.0296,
"step": 1194
},
{
"epoch": 3.0562659846547313,
"grad_norm": 0.2227085234968232,
"learning_rate": 1.3857746390293106e-05,
"loss": 1.0342,
"step": 1195
},
{
"epoch": 3.0588235294117645,
"grad_norm": 0.24189335016155378,
"learning_rate": 1.3845975445606184e-05,
"loss": 1.0491,
"step": 1196
},
{
"epoch": 3.061381074168798,
"grad_norm": 0.21700679291777608,
"learning_rate": 1.383419824375768e-05,
"loss": 1.0458,
"step": 1197
},
{
"epoch": 3.0639386189258313,
"grad_norm": 0.2325789506958363,
"learning_rate": 1.382241480390837e-05,
"loss": 1.0451,
"step": 1198
},
{
"epoch": 3.0664961636828645,
"grad_norm": 0.21783084381710976,
"learning_rate": 1.3810625145229174e-05,
"loss": 1.0621,
"step": 1199
},
{
"epoch": 3.0690537084398977,
"grad_norm": 0.259978348441225,
"learning_rate": 1.3798829286901122e-05,
"loss": 1.0216,
"step": 1200
},
{
"epoch": 3.071611253196931,
"grad_norm": 0.2531231166315013,
"learning_rate": 1.3787027248115341e-05,
"loss": 1.0344,
"step": 1201
},
{
"epoch": 3.074168797953964,
"grad_norm": 0.25693037958499804,
"learning_rate": 1.3775219048073011e-05,
"loss": 1.0571,
"step": 1202
},
{
"epoch": 3.0767263427109977,
"grad_norm": 0.22329447917802453,
"learning_rate": 1.376340470598534e-05,
"loss": 1.0621,
"step": 1203
},
{
"epoch": 3.079283887468031,
"grad_norm": 0.24363305905238922,
"learning_rate": 1.3751584241073517e-05,
"loss": 1.0627,
"step": 1204
},
{
"epoch": 3.081841432225064,
"grad_norm": 0.252245006887946,
"learning_rate": 1.3739757672568703e-05,
"loss": 1.0619,
"step": 1205
},
{
"epoch": 3.084398976982097,
"grad_norm": 0.24187527332738293,
"learning_rate": 1.3727925019711981e-05,
"loss": 1.0324,
"step": 1206
},
{
"epoch": 3.0869565217391304,
"grad_norm": 0.2140650570738505,
"learning_rate": 1.3716086301754343e-05,
"loss": 1.0538,
"step": 1207
},
{
"epoch": 3.0895140664961636,
"grad_norm": 0.26828049735013604,
"learning_rate": 1.3704241537956643e-05,
"loss": 1.0806,
"step": 1208
},
{
"epoch": 3.0920716112531967,
"grad_norm": 0.20662196910585112,
"learning_rate": 1.3692390747589564e-05,
"loss": 1.0272,
"step": 1209
},
{
"epoch": 3.0946291560102304,
"grad_norm": 0.23564415225665816,
"learning_rate": 1.3680533949933607e-05,
"loss": 1.0499,
"step": 1210
},
{
"epoch": 3.0971867007672635,
"grad_norm": 0.20991526952221617,
"learning_rate": 1.3668671164279039e-05,
"loss": 1.0514,
"step": 1211
},
{
"epoch": 3.0997442455242967,
"grad_norm": 0.22870151484413298,
"learning_rate": 1.3656802409925874e-05,
"loss": 1.0134,
"step": 1212
},
{
"epoch": 3.10230179028133,
"grad_norm": 0.21877781759998727,
"learning_rate": 1.3644927706183824e-05,
"loss": 1.0851,
"step": 1213
},
{
"epoch": 3.104859335038363,
"grad_norm": 0.2327125173805525,
"learning_rate": 1.3633047072372301e-05,
"loss": 1.0311,
"step": 1214
},
{
"epoch": 3.1074168797953963,
"grad_norm": 0.22202571636713042,
"learning_rate": 1.3621160527820343e-05,
"loss": 1.0737,
"step": 1215
},
{
"epoch": 3.10997442455243,
"grad_norm": 0.2154525697553689,
"learning_rate": 1.3609268091866621e-05,
"loss": 1.0298,
"step": 1216
},
{
"epoch": 3.112531969309463,
"grad_norm": 0.24440602961960542,
"learning_rate": 1.3597369783859385e-05,
"loss": 1.0637,
"step": 1217
},
{
"epoch": 3.1150895140664963,
"grad_norm": 0.22947504540372743,
"learning_rate": 1.3585465623156434e-05,
"loss": 1.0358,
"step": 1218
},
{
"epoch": 3.1176470588235294,
"grad_norm": 0.20546693748205078,
"learning_rate": 1.3573555629125097e-05,
"loss": 1.0531,
"step": 1219
},
{
"epoch": 3.1202046035805626,
"grad_norm": 0.2376207772257609,
"learning_rate": 1.3561639821142187e-05,
"loss": 1.0422,
"step": 1220
},
{
"epoch": 3.122762148337596,
"grad_norm": 0.2075906124157621,
"learning_rate": 1.3549718218593982e-05,
"loss": 1.0373,
"step": 1221
},
{
"epoch": 3.125319693094629,
"grad_norm": 0.2710877805734423,
"learning_rate": 1.3537790840876179e-05,
"loss": 0.9867,
"step": 1222
},
{
"epoch": 3.1278772378516626,
"grad_norm": 0.21873389694947254,
"learning_rate": 1.3525857707393878e-05,
"loss": 1.0493,
"step": 1223
},
{
"epoch": 3.130434782608696,
"grad_norm": 0.23140954420047274,
"learning_rate": 1.3513918837561544e-05,
"loss": 1.0192,
"step": 1224
},
{
"epoch": 3.132992327365729,
"grad_norm": 0.21413826548960174,
"learning_rate": 1.3501974250802967e-05,
"loss": 1.0233,
"step": 1225
},
{
"epoch": 3.135549872122762,
"grad_norm": 0.2211593381832046,
"learning_rate": 1.3490023966551249e-05,
"loss": 1.0415,
"step": 1226
},
{
"epoch": 3.1381074168797953,
"grad_norm": 0.23108631631913867,
"learning_rate": 1.3478068004248747e-05,
"loss": 1.0399,
"step": 1227
},
{
"epoch": 3.1406649616368285,
"grad_norm": 0.22275279756167513,
"learning_rate": 1.346610638334707e-05,
"loss": 1.0596,
"step": 1228
},
{
"epoch": 3.1432225063938617,
"grad_norm": 0.2524231602837744,
"learning_rate": 1.3454139123307023e-05,
"loss": 1.065,
"step": 1229
},
{
"epoch": 3.1457800511508953,
"grad_norm": 0.2196098109454718,
"learning_rate": 1.3442166243598598e-05,
"loss": 1.0497,
"step": 1230
},
{
"epoch": 3.1483375959079285,
"grad_norm": 0.2392235318659055,
"learning_rate": 1.3430187763700914e-05,
"loss": 1.0579,
"step": 1231
},
{
"epoch": 3.1508951406649617,
"grad_norm": 0.2252882411678263,
"learning_rate": 1.341820370310221e-05,
"loss": 1.037,
"step": 1232
},
{
"epoch": 3.153452685421995,
"grad_norm": 0.21957606499611643,
"learning_rate": 1.3406214081299807e-05,
"loss": 1.077,
"step": 1233
},
{
"epoch": 3.156010230179028,
"grad_norm": 0.2158883136158835,
"learning_rate": 1.3394218917800064e-05,
"loss": 1.0576,
"step": 1234
},
{
"epoch": 3.1585677749360612,
"grad_norm": 0.23206630107006462,
"learning_rate": 1.3382218232118367e-05,
"loss": 1.046,
"step": 1235
},
{
"epoch": 3.1611253196930944,
"grad_norm": 0.22650165934718894,
"learning_rate": 1.3370212043779078e-05,
"loss": 1.0513,
"step": 1236
},
{
"epoch": 3.163682864450128,
"grad_norm": 0.2146494581025888,
"learning_rate": 1.335820037231552e-05,
"loss": 1.0418,
"step": 1237
},
{
"epoch": 3.166240409207161,
"grad_norm": 0.22693672785502703,
"learning_rate": 1.3346183237269925e-05,
"loss": 1.044,
"step": 1238
},
{
"epoch": 3.1687979539641944,
"grad_norm": 0.24944388113412067,
"learning_rate": 1.3334160658193425e-05,
"loss": 1.0085,
"step": 1239
},
{
"epoch": 3.1713554987212276,
"grad_norm": 0.2323240702756201,
"learning_rate": 1.3322132654646003e-05,
"loss": 1.0348,
"step": 1240
},
{
"epoch": 3.1739130434782608,
"grad_norm": 0.23314120380593967,
"learning_rate": 1.3310099246196466e-05,
"loss": 1.0255,
"step": 1241
},
{
"epoch": 3.176470588235294,
"grad_norm": 0.22959022702139156,
"learning_rate": 1.3298060452422421e-05,
"loss": 1.0303,
"step": 1242
},
{
"epoch": 3.1790281329923276,
"grad_norm": 0.1945764817333214,
"learning_rate": 1.3286016292910229e-05,
"loss": 1.0366,
"step": 1243
},
{
"epoch": 3.1815856777493607,
"grad_norm": 0.2049881448552149,
"learning_rate": 1.327396678725499e-05,
"loss": 1.0224,
"step": 1244
},
{
"epoch": 3.184143222506394,
"grad_norm": 0.245199876694944,
"learning_rate": 1.3261911955060493e-05,
"loss": 0.9968,
"step": 1245
},
{
"epoch": 3.186700767263427,
"grad_norm": 0.19541276884697034,
"learning_rate": 1.3249851815939197e-05,
"loss": 1.0502,
"step": 1246
},
{
"epoch": 3.1892583120204603,
"grad_norm": 0.22313066289223873,
"learning_rate": 1.3237786389512191e-05,
"loss": 1.0577,
"step": 1247
},
{
"epoch": 3.1918158567774935,
"grad_norm": 0.23691814508572034,
"learning_rate": 1.3225715695409171e-05,
"loss": 1.0407,
"step": 1248
},
{
"epoch": 3.1943734015345266,
"grad_norm": 0.19364764369376442,
"learning_rate": 1.3213639753268406e-05,
"loss": 1.0289,
"step": 1249
},
{
"epoch": 3.1969309462915603,
"grad_norm": 0.19636310287160377,
"learning_rate": 1.3201558582736693e-05,
"loss": 1.0389,
"step": 1250
},
{
"epoch": 3.1994884910485935,
"grad_norm": 0.1876664287484004,
"learning_rate": 1.3189472203469347e-05,
"loss": 1.0167,
"step": 1251
},
{
"epoch": 3.2020460358056266,
"grad_norm": 0.19365316134612506,
"learning_rate": 1.3177380635130144e-05,
"loss": 1.0522,
"step": 1252
},
{
"epoch": 3.20460358056266,
"grad_norm": 0.17412371216897868,
"learning_rate": 1.3165283897391315e-05,
"loss": 1.0125,
"step": 1253
},
{
"epoch": 3.207161125319693,
"grad_norm": 0.21377597350657065,
"learning_rate": 1.3153182009933495e-05,
"loss": 1.035,
"step": 1254
},
{
"epoch": 3.209718670076726,
"grad_norm": 0.18072951551049465,
"learning_rate": 1.3141074992445695e-05,
"loss": 1.0354,
"step": 1255
},
{
"epoch": 3.21227621483376,
"grad_norm": 0.21819804516231073,
"learning_rate": 1.3128962864625281e-05,
"loss": 1.0288,
"step": 1256
},
{
"epoch": 3.214833759590793,
"grad_norm": 0.22829327535687294,
"learning_rate": 1.3116845646177923e-05,
"loss": 1.0329,
"step": 1257
},
{
"epoch": 3.217391304347826,
"grad_norm": 0.22096551556124827,
"learning_rate": 1.3104723356817582e-05,
"loss": 1.0272,
"step": 1258
},
{
"epoch": 3.2199488491048593,
"grad_norm": 0.19427368545567542,
"learning_rate": 1.309259601626646e-05,
"loss": 1.0757,
"step": 1259
},
{
"epoch": 3.2225063938618925,
"grad_norm": 0.2517142880283656,
"learning_rate": 1.3080463644254986e-05,
"loss": 1.0449,
"step": 1260
},
{
"epoch": 3.2250639386189257,
"grad_norm": 0.21438511450639225,
"learning_rate": 1.3068326260521769e-05,
"loss": 1.0253,
"step": 1261
},
{
"epoch": 3.227621483375959,
"grad_norm": 0.23939604240119217,
"learning_rate": 1.3056183884813568e-05,
"loss": 1.0055,
"step": 1262
},
{
"epoch": 3.2301790281329925,
"grad_norm": 0.24913816729402657,
"learning_rate": 1.3044036536885284e-05,
"loss": 1.0305,
"step": 1263
},
{
"epoch": 3.2327365728900257,
"grad_norm": 0.22985968452270927,
"learning_rate": 1.3031884236499877e-05,
"loss": 1.0356,
"step": 1264
},
{
"epoch": 3.235294117647059,
"grad_norm": 0.2432127136491896,
"learning_rate": 1.3019727003428387e-05,
"loss": 1.0327,
"step": 1265
},
{
"epoch": 3.237851662404092,
"grad_norm": 0.21511626506563813,
"learning_rate": 1.300756485744987e-05,
"loss": 1.0351,
"step": 1266
},
{
"epoch": 3.2404092071611252,
"grad_norm": 0.21620331140589194,
"learning_rate": 1.2995397818351381e-05,
"loss": 1.0272,
"step": 1267
},
{
"epoch": 3.2429667519181584,
"grad_norm": 0.24918797088173247,
"learning_rate": 1.2983225905927924e-05,
"loss": 0.9923,
"step": 1268
},
{
"epoch": 3.2455242966751916,
"grad_norm": 0.2033868759774891,
"learning_rate": 1.2971049139982448e-05,
"loss": 1.0526,
"step": 1269
},
{
"epoch": 3.2480818414322252,
"grad_norm": 0.24065409839804014,
"learning_rate": 1.2958867540325785e-05,
"loss": 1.0283,
"step": 1270
},
{
"epoch": 3.2506393861892584,
"grad_norm": 0.23975735377063542,
"learning_rate": 1.294668112677664e-05,
"loss": 1.0467,
"step": 1271
},
{
"epoch": 3.2531969309462916,
"grad_norm": 0.20321738007355677,
"learning_rate": 1.2934489919161541e-05,
"loss": 1.0292,
"step": 1272
},
{
"epoch": 3.2557544757033248,
"grad_norm": 0.22563988593724132,
"learning_rate": 1.292229393731482e-05,
"loss": 1.0273,
"step": 1273
},
{
"epoch": 3.258312020460358,
"grad_norm": 0.2108784426288754,
"learning_rate": 1.2910093201078584e-05,
"loss": 1.041,
"step": 1274
},
{
"epoch": 3.260869565217391,
"grad_norm": 0.25182826531670705,
"learning_rate": 1.289788773030266e-05,
"loss": 1.0507,
"step": 1275
},
{
"epoch": 3.2634271099744243,
"grad_norm": 0.23260866121986465,
"learning_rate": 1.2885677544844592e-05,
"loss": 1.0073,
"step": 1276
},
{
"epoch": 3.265984654731458,
"grad_norm": 0.20778832907058722,
"learning_rate": 1.2873462664569583e-05,
"loss": 1.063,
"step": 1277
},
{
"epoch": 3.268542199488491,
"grad_norm": 0.24704017386773852,
"learning_rate": 1.2861243109350485e-05,
"loss": 1.0275,
"step": 1278
},
{
"epoch": 3.2710997442455243,
"grad_norm": 0.20143011397018976,
"learning_rate": 1.2849018899067746e-05,
"loss": 1.0786,
"step": 1279
},
{
"epoch": 3.2736572890025575,
"grad_norm": 0.19780957370773475,
"learning_rate": 1.2836790053609396e-05,
"loss": 1.0475,
"step": 1280
},
{
"epoch": 3.2762148337595907,
"grad_norm": 0.21001290371983408,
"learning_rate": 1.2824556592870993e-05,
"loss": 1.0544,
"step": 1281
},
{
"epoch": 3.2787723785166243,
"grad_norm": 0.2314545925289747,
"learning_rate": 1.2812318536755624e-05,
"loss": 1.0432,
"step": 1282
},
{
"epoch": 3.2813299232736575,
"grad_norm": 0.21988256589877733,
"learning_rate": 1.2800075905173834e-05,
"loss": 1.0432,
"step": 1283
},
{
"epoch": 3.2838874680306906,
"grad_norm": 0.26832633674704665,
"learning_rate": 1.2787828718043622e-05,
"loss": 1.0379,
"step": 1284
},
{
"epoch": 3.286445012787724,
"grad_norm": 0.2234222589374059,
"learning_rate": 1.2775576995290397e-05,
"loss": 1.0421,
"step": 1285
},
{
"epoch": 3.289002557544757,
"grad_norm": 0.20516563803916263,
"learning_rate": 1.276332075684694e-05,
"loss": 1.0392,
"step": 1286
},
{
"epoch": 3.29156010230179,
"grad_norm": 0.2404590656925125,
"learning_rate": 1.2751060022653393e-05,
"loss": 1.0283,
"step": 1287
},
{
"epoch": 3.2941176470588234,
"grad_norm": 0.19864113603292302,
"learning_rate": 1.2738794812657194e-05,
"loss": 1.0144,
"step": 1288
},
{
"epoch": 3.296675191815857,
"grad_norm": 0.2323436030300969,
"learning_rate": 1.2726525146813078e-05,
"loss": 1.0151,
"step": 1289
},
{
"epoch": 3.29923273657289,
"grad_norm": 0.24929371156784427,
"learning_rate": 1.2714251045083028e-05,
"loss": 1.0137,
"step": 1290
},
{
"epoch": 3.3017902813299234,
"grad_norm": 0.20413376158858587,
"learning_rate": 1.2701972527436235e-05,
"loss": 1.0233,
"step": 1291
},
{
"epoch": 3.3043478260869565,
"grad_norm": 0.21637513281635873,
"learning_rate": 1.2689689613849083e-05,
"loss": 1.0586,
"step": 1292
},
{
"epoch": 3.3069053708439897,
"grad_norm": 0.18194714637573692,
"learning_rate": 1.2677402324305099e-05,
"loss": 0.994,
"step": 1293
},
{
"epoch": 3.309462915601023,
"grad_norm": 0.19606411156722506,
"learning_rate": 1.266511067879494e-05,
"loss": 1.0283,
"step": 1294
},
{
"epoch": 3.312020460358056,
"grad_norm": 0.19517256802808283,
"learning_rate": 1.265281469731634e-05,
"loss": 1.0373,
"step": 1295
},
{
"epoch": 3.3145780051150897,
"grad_norm": 0.17867307264513901,
"learning_rate": 1.2640514399874095e-05,
"loss": 1.0517,
"step": 1296
},
{
"epoch": 3.317135549872123,
"grad_norm": 0.19814474828943063,
"learning_rate": 1.2628209806480024e-05,
"loss": 1.0068,
"step": 1297
},
{
"epoch": 3.319693094629156,
"grad_norm": 0.21270750338094424,
"learning_rate": 1.2615900937152923e-05,
"loss": 1.0236,
"step": 1298
},
{
"epoch": 3.3222506393861893,
"grad_norm": 0.21625825452151415,
"learning_rate": 1.2603587811918558e-05,
"loss": 1.0495,
"step": 1299
},
{
"epoch": 3.3248081841432224,
"grad_norm": 0.23776899893360745,
"learning_rate": 1.2591270450809612e-05,
"loss": 1.0741,
"step": 1300
},
{
"epoch": 3.3273657289002556,
"grad_norm": 0.22428186293001376,
"learning_rate": 1.2578948873865662e-05,
"loss": 1.0132,
"step": 1301
},
{
"epoch": 3.329923273657289,
"grad_norm": 0.20864902455184137,
"learning_rate": 1.2566623101133144e-05,
"loss": 1.0464,
"step": 1302
},
{
"epoch": 3.3324808184143224,
"grad_norm": 0.2685355350833958,
"learning_rate": 1.2554293152665316e-05,
"loss": 1.0247,
"step": 1303
},
{
"epoch": 3.3350383631713556,
"grad_norm": 0.2527986356697781,
"learning_rate": 1.2541959048522239e-05,
"loss": 1.0399,
"step": 1304
},
{
"epoch": 3.337595907928389,
"grad_norm": 0.22197339925214596,
"learning_rate": 1.2529620808770723e-05,
"loss": 1.0157,
"step": 1305
},
{
"epoch": 3.340153452685422,
"grad_norm": 0.3107261506811511,
"learning_rate": 1.251727845348432e-05,
"loss": 1.0495,
"step": 1306
},
{
"epoch": 3.342710997442455,
"grad_norm": 0.2643689123746537,
"learning_rate": 1.2504932002743262e-05,
"loss": 1.001,
"step": 1307
},
{
"epoch": 3.3452685421994883,
"grad_norm": 0.2364739279711792,
"learning_rate": 1.2492581476634458e-05,
"loss": 1.045,
"step": 1308
},
{
"epoch": 3.3478260869565215,
"grad_norm": 0.28136518049730547,
"learning_rate": 1.2480226895251439e-05,
"loss": 1.0285,
"step": 1309
},
{
"epoch": 3.350383631713555,
"grad_norm": 0.2523350080360508,
"learning_rate": 1.2467868278694342e-05,
"loss": 1.0658,
"step": 1310
},
{
"epoch": 3.3529411764705883,
"grad_norm": 0.20529584681597104,
"learning_rate": 1.245550564706986e-05,
"loss": 1.0372,
"step": 1311
},
{
"epoch": 3.3554987212276215,
"grad_norm": 0.26187724014211844,
"learning_rate": 1.2443139020491216e-05,
"loss": 1.0295,
"step": 1312
},
{
"epoch": 3.3580562659846547,
"grad_norm": 0.2759180573007528,
"learning_rate": 1.2430768419078143e-05,
"loss": 1.0312,
"step": 1313
},
{
"epoch": 3.360613810741688,
"grad_norm": 0.2020495956799633,
"learning_rate": 1.2418393862956837e-05,
"loss": 1.0419,
"step": 1314
},
{
"epoch": 3.363171355498721,
"grad_norm": 0.2369272520944126,
"learning_rate": 1.2406015372259925e-05,
"loss": 1.0122,
"step": 1315
},
{
"epoch": 3.3657289002557547,
"grad_norm": 0.2184979100214276,
"learning_rate": 1.2393632967126441e-05,
"loss": 1.0327,
"step": 1316
},
{
"epoch": 3.368286445012788,
"grad_norm": 0.23858603204557072,
"learning_rate": 1.2381246667701781e-05,
"loss": 1.0475,
"step": 1317
},
{
"epoch": 3.370843989769821,
"grad_norm": 0.26756479784593945,
"learning_rate": 1.236885649413768e-05,
"loss": 1.0426,
"step": 1318
},
{
"epoch": 3.373401534526854,
"grad_norm": 0.1892302039091279,
"learning_rate": 1.2356462466592177e-05,
"loss": 1.0412,
"step": 1319
},
{
"epoch": 3.3759590792838874,
"grad_norm": 0.29335988888765785,
"learning_rate": 1.2344064605229577e-05,
"loss": 1.0175,
"step": 1320
},
{
"epoch": 3.3785166240409206,
"grad_norm": 0.21447038773497848,
"learning_rate": 1.2331662930220424e-05,
"loss": 1.018,
"step": 1321
},
{
"epoch": 3.381074168797954,
"grad_norm": 0.24164773212365756,
"learning_rate": 1.2319257461741478e-05,
"loss": 1.029,
"step": 1322
},
{
"epoch": 3.3836317135549874,
"grad_norm": 0.23724415736018667,
"learning_rate": 1.2306848219975649e-05,
"loss": 1.017,
"step": 1323
},
{
"epoch": 3.3861892583120206,
"grad_norm": 0.2146728306264026,
"learning_rate": 1.2294435225112005e-05,
"loss": 1.0301,
"step": 1324
},
{
"epoch": 3.3887468030690537,
"grad_norm": 0.18212095256468025,
"learning_rate": 1.2282018497345705e-05,
"loss": 1.0361,
"step": 1325
},
{
"epoch": 3.391304347826087,
"grad_norm": 0.23148682510609303,
"learning_rate": 1.2269598056877996e-05,
"loss": 1.0385,
"step": 1326
},
{
"epoch": 3.39386189258312,
"grad_norm": 0.20473257376707585,
"learning_rate": 1.2257173923916154e-05,
"loss": 1.0208,
"step": 1327
},
{
"epoch": 3.3964194373401533,
"grad_norm": 0.20995062344103757,
"learning_rate": 1.2244746118673467e-05,
"loss": 1.0116,
"step": 1328
},
{
"epoch": 3.398976982097187,
"grad_norm": 0.23774156769953378,
"learning_rate": 1.22323146613692e-05,
"loss": 1.0742,
"step": 1329
},
{
"epoch": 3.40153452685422,
"grad_norm": 0.20830692559875352,
"learning_rate": 1.2219879572228555e-05,
"loss": 1.0565,
"step": 1330
},
{
"epoch": 3.4040920716112533,
"grad_norm": 0.2147028468697588,
"learning_rate": 1.2207440871482644e-05,
"loss": 1.0294,
"step": 1331
},
{
"epoch": 3.4066496163682864,
"grad_norm": 0.24756067918436106,
"learning_rate": 1.2194998579368451e-05,
"loss": 1.0479,
"step": 1332
},
{
"epoch": 3.4092071611253196,
"grad_norm": 0.2056045421373826,
"learning_rate": 1.2182552716128818e-05,
"loss": 1.0236,
"step": 1333
},
{
"epoch": 3.411764705882353,
"grad_norm": 0.2079215269898909,
"learning_rate": 1.2170103302012374e-05,
"loss": 1.0513,
"step": 1334
},
{
"epoch": 3.414322250639386,
"grad_norm": 0.19554068307435188,
"learning_rate": 1.2157650357273547e-05,
"loss": 1.0389,
"step": 1335
},
{
"epoch": 3.4168797953964196,
"grad_norm": 0.20840944979090947,
"learning_rate": 1.2145193902172496e-05,
"loss": 1.0355,
"step": 1336
},
{
"epoch": 3.419437340153453,
"grad_norm": 0.21130712097196197,
"learning_rate": 1.2132733956975093e-05,
"loss": 1.0322,
"step": 1337
},
{
"epoch": 3.421994884910486,
"grad_norm": 0.17958150894777242,
"learning_rate": 1.2120270541952892e-05,
"loss": 1.0227,
"step": 1338
},
{
"epoch": 3.424552429667519,
"grad_norm": 0.2225571229441682,
"learning_rate": 1.210780367738309e-05,
"loss": 1.0285,
"step": 1339
},
{
"epoch": 3.4271099744245523,
"grad_norm": 0.1885954682977986,
"learning_rate": 1.2095333383548495e-05,
"loss": 1.0812,
"step": 1340
},
{
"epoch": 3.4296675191815855,
"grad_norm": 0.2099948092443905,
"learning_rate": 1.2082859680737495e-05,
"loss": 1.0716,
"step": 1341
},
{
"epoch": 3.4322250639386187,
"grad_norm": 0.2256939428442792,
"learning_rate": 1.2070382589244026e-05,
"loss": 1.0311,
"step": 1342
},
{
"epoch": 3.4347826086956523,
"grad_norm": 0.23072791297771425,
"learning_rate": 1.2057902129367536e-05,
"loss": 1.0467,
"step": 1343
},
{
"epoch": 3.4373401534526855,
"grad_norm": 0.2057602125391487,
"learning_rate": 1.204541832141295e-05,
"loss": 1.028,
"step": 1344
},
{
"epoch": 3.4398976982097187,
"grad_norm": 0.2520074046407619,
"learning_rate": 1.2032931185690646e-05,
"loss": 1.0163,
"step": 1345
},
{
"epoch": 3.442455242966752,
"grad_norm": 0.2421964192866277,
"learning_rate": 1.202044074251641e-05,
"loss": 1.063,
"step": 1346
},
{
"epoch": 3.445012787723785,
"grad_norm": 0.20429551187516548,
"learning_rate": 1.2007947012211419e-05,
"loss": 1.0361,
"step": 1347
},
{
"epoch": 3.4475703324808182,
"grad_norm": 0.2520787216839294,
"learning_rate": 1.199545001510218e-05,
"loss": 1.054,
"step": 1348
},
{
"epoch": 3.4501278772378514,
"grad_norm": 0.24681543428956615,
"learning_rate": 1.1982949771520535e-05,
"loss": 1.0605,
"step": 1349
},
{
"epoch": 3.452685421994885,
"grad_norm": 0.20282034999970464,
"learning_rate": 1.1970446301803598e-05,
"loss": 1.0461,
"step": 1350
},
{
"epoch": 3.455242966751918,
"grad_norm": 0.22677677047988842,
"learning_rate": 1.1957939626293726e-05,
"loss": 1.0459,
"step": 1351
},
{
"epoch": 3.4578005115089514,
"grad_norm": 0.23929950706752162,
"learning_rate": 1.1945429765338507e-05,
"loss": 1.0531,
"step": 1352
},
{
"epoch": 3.4603580562659846,
"grad_norm": 0.2096490071983182,
"learning_rate": 1.1932916739290694e-05,
"loss": 1.0148,
"step": 1353
},
{
"epoch": 3.4629156010230178,
"grad_norm": 0.20618185619438542,
"learning_rate": 1.1920400568508201e-05,
"loss": 1.0375,
"step": 1354
},
{
"epoch": 3.4654731457800514,
"grad_norm": 0.23186283780985562,
"learning_rate": 1.1907881273354059e-05,
"loss": 1.0276,
"step": 1355
},
{
"epoch": 3.4680306905370846,
"grad_norm": 0.21691929515578598,
"learning_rate": 1.1895358874196377e-05,
"loss": 1.0368,
"step": 1356
},
{
"epoch": 3.4705882352941178,
"grad_norm": 0.20410519325755752,
"learning_rate": 1.188283339140831e-05,
"loss": 1.038,
"step": 1357
},
{
"epoch": 3.473145780051151,
"grad_norm": 0.22863334112386996,
"learning_rate": 1.1870304845368043e-05,
"loss": 1.0433,
"step": 1358
},
{
"epoch": 3.475703324808184,
"grad_norm": 0.2126661663430652,
"learning_rate": 1.1857773256458732e-05,
"loss": 1.0605,
"step": 1359
},
{
"epoch": 3.4782608695652173,
"grad_norm": 0.24272298207990836,
"learning_rate": 1.184523864506849e-05,
"loss": 1.0476,
"step": 1360
},
{
"epoch": 3.4808184143222505,
"grad_norm": 0.20098243757734405,
"learning_rate": 1.1832701031590345e-05,
"loss": 1.032,
"step": 1361
},
{
"epoch": 3.483375959079284,
"grad_norm": 0.2516527217412891,
"learning_rate": 1.1820160436422213e-05,
"loss": 1.0392,
"step": 1362
},
{
"epoch": 3.4859335038363173,
"grad_norm": 0.22312520765078486,
"learning_rate": 1.1807616879966856e-05,
"loss": 1.0549,
"step": 1363
},
{
"epoch": 3.4884910485933505,
"grad_norm": 0.23508194911007732,
"learning_rate": 1.1795070382631856e-05,
"loss": 1.0257,
"step": 1364
},
{
"epoch": 3.4910485933503836,
"grad_norm": 0.2056219883277526,
"learning_rate": 1.1782520964829583e-05,
"loss": 1.0616,
"step": 1365
},
{
"epoch": 3.493606138107417,
"grad_norm": 0.22297849379676427,
"learning_rate": 1.1769968646977148e-05,
"loss": 1.08,
"step": 1366
},
{
"epoch": 3.49616368286445,
"grad_norm": 0.1917605236627194,
"learning_rate": 1.1757413449496393e-05,
"loss": 1.0582,
"step": 1367
},
{
"epoch": 3.498721227621483,
"grad_norm": 0.22264832355995012,
"learning_rate": 1.174485539281384e-05,
"loss": 0.9999,
"step": 1368
},
{
"epoch": 3.501278772378517,
"grad_norm": 0.18053830121135175,
"learning_rate": 1.1732294497360658e-05,
"loss": 1.0481,
"step": 1369
},
{
"epoch": 3.50383631713555,
"grad_norm": 0.25413658020729973,
"learning_rate": 1.1719730783572645e-05,
"loss": 1.0526,
"step": 1370
},
{
"epoch": 3.506393861892583,
"grad_norm": 0.20438148687464178,
"learning_rate": 1.1707164271890168e-05,
"loss": 1.0465,
"step": 1371
},
{
"epoch": 3.5089514066496164,
"grad_norm": 0.27411869672391553,
"learning_rate": 1.1694594982758164e-05,
"loss": 1.0672,
"step": 1372
},
{
"epoch": 3.5115089514066495,
"grad_norm": 0.27020394951486204,
"learning_rate": 1.1682022936626076e-05,
"loss": 1.0249,
"step": 1373
},
{
"epoch": 3.5140664961636827,
"grad_norm": 0.20542313494356507,
"learning_rate": 1.166944815394784e-05,
"loss": 1.0444,
"step": 1374
},
{
"epoch": 3.516624040920716,
"grad_norm": 0.2696771035530231,
"learning_rate": 1.165687065518184e-05,
"loss": 1.0164,
"step": 1375
},
{
"epoch": 3.5191815856777495,
"grad_norm": 0.21834933315057503,
"learning_rate": 1.1644290460790879e-05,
"loss": 1.0231,
"step": 1376
},
{
"epoch": 3.5217391304347827,
"grad_norm": 0.25602165129241816,
"learning_rate": 1.163170759124215e-05,
"loss": 1.0499,
"step": 1377
},
{
"epoch": 3.524296675191816,
"grad_norm": 0.2466307590095287,
"learning_rate": 1.161912206700719e-05,
"loss": 1.0179,
"step": 1378
},
{
"epoch": 3.526854219948849,
"grad_norm": 0.1990877095514582,
"learning_rate": 1.1606533908561866e-05,
"loss": 1.0825,
"step": 1379
},
{
"epoch": 3.5294117647058822,
"grad_norm": 0.2262880860449741,
"learning_rate": 1.1593943136386316e-05,
"loss": 1.0239,
"step": 1380
},
{
"epoch": 3.531969309462916,
"grad_norm": 0.23639713675723853,
"learning_rate": 1.1581349770964946e-05,
"loss": 1.0797,
"step": 1381
},
{
"epoch": 3.5345268542199486,
"grad_norm": 0.19143592758217978,
"learning_rate": 1.1568753832786376e-05,
"loss": 1.0482,
"step": 1382
},
{
"epoch": 3.5370843989769822,
"grad_norm": 0.21395077968188803,
"learning_rate": 1.1556155342343405e-05,
"loss": 1.0341,
"step": 1383
},
{
"epoch": 3.5396419437340154,
"grad_norm": 0.20517427967195068,
"learning_rate": 1.154355432013299e-05,
"loss": 1.0657,
"step": 1384
},
{
"epoch": 3.5421994884910486,
"grad_norm": 0.19022344547536582,
"learning_rate": 1.1530950786656205e-05,
"loss": 1.0428,
"step": 1385
},
{
"epoch": 3.544757033248082,
"grad_norm": 0.24857892965208156,
"learning_rate": 1.1518344762418216e-05,
"loss": 1.0614,
"step": 1386
},
{
"epoch": 3.547314578005115,
"grad_norm": 0.17434032950673256,
"learning_rate": 1.150573626792823e-05,
"loss": 1.0119,
"step": 1387
},
{
"epoch": 3.5498721227621486,
"grad_norm": 0.221669736437551,
"learning_rate": 1.1493125323699486e-05,
"loss": 1.0325,
"step": 1388
},
{
"epoch": 3.5524296675191813,
"grad_norm": 0.19550877444868983,
"learning_rate": 1.1480511950249195e-05,
"loss": 1.0621,
"step": 1389
},
{
"epoch": 3.554987212276215,
"grad_norm": 0.20320983764425946,
"learning_rate": 1.1467896168098533e-05,
"loss": 1.0688,
"step": 1390
},
{
"epoch": 3.557544757033248,
"grad_norm": 0.21236236447911172,
"learning_rate": 1.1455277997772585e-05,
"loss": 0.9992,
"step": 1391
},
{
"epoch": 3.5601023017902813,
"grad_norm": 0.1946876189282923,
"learning_rate": 1.1442657459800323e-05,
"loss": 1.0298,
"step": 1392
},
{
"epoch": 3.5626598465473145,
"grad_norm": 0.20833695509734265,
"learning_rate": 1.143003457471458e-05,
"loss": 1.0481,
"step": 1393
},
{
"epoch": 3.5652173913043477,
"grad_norm": 0.19849397670530705,
"learning_rate": 1.1417409363051992e-05,
"loss": 1.0508,
"step": 1394
},
{
"epoch": 3.5677749360613813,
"grad_norm": 0.1862173592034928,
"learning_rate": 1.1404781845352999e-05,
"loss": 1.0586,
"step": 1395
},
{
"epoch": 3.5703324808184145,
"grad_norm": 0.20151362231655162,
"learning_rate": 1.1392152042161774e-05,
"loss": 1.0319,
"step": 1396
},
{
"epoch": 3.5728900255754477,
"grad_norm": 0.23404342439834142,
"learning_rate": 1.1379519974026226e-05,
"loss": 1.0151,
"step": 1397
},
{
"epoch": 3.575447570332481,
"grad_norm": 0.18584316354206787,
"learning_rate": 1.136688566149793e-05,
"loss": 1.0516,
"step": 1398
},
{
"epoch": 3.578005115089514,
"grad_norm": 0.2357364264338847,
"learning_rate": 1.1354249125132131e-05,
"loss": 1.0558,
"step": 1399
},
{
"epoch": 3.580562659846547,
"grad_norm": 0.255370311471337,
"learning_rate": 1.1341610385487677e-05,
"loss": 1.0159,
"step": 1400
},
{
"epoch": 3.5831202046035804,
"grad_norm": 0.2015566724373594,
"learning_rate": 1.1328969463127009e-05,
"loss": 1.0256,
"step": 1401
},
{
"epoch": 3.585677749360614,
"grad_norm": 0.2717588011458947,
"learning_rate": 1.1316326378616121e-05,
"loss": 1.0452,
"step": 1402
},
{
"epoch": 3.588235294117647,
"grad_norm": 0.226800697503035,
"learning_rate": 1.1303681152524514e-05,
"loss": 1.0417,
"step": 1403
},
{
"epoch": 3.5907928388746804,
"grad_norm": 0.20628829171202948,
"learning_rate": 1.129103380542519e-05,
"loss": 1.0483,
"step": 1404
},
{
"epoch": 3.5933503836317136,
"grad_norm": 0.2260665953032841,
"learning_rate": 1.1278384357894585e-05,
"loss": 1.0407,
"step": 1405
},
{
"epoch": 3.5959079283887467,
"grad_norm": 0.20513785218039995,
"learning_rate": 1.1265732830512561e-05,
"loss": 1.0391,
"step": 1406
},
{
"epoch": 3.59846547314578,
"grad_norm": 0.21444285296757887,
"learning_rate": 1.125307924386236e-05,
"loss": 1.0456,
"step": 1407
},
{
"epoch": 3.601023017902813,
"grad_norm": 0.2652819565444848,
"learning_rate": 1.1240423618530578e-05,
"loss": 1.0501,
"step": 1408
},
{
"epoch": 3.6035805626598467,
"grad_norm": 0.23632809050025924,
"learning_rate": 1.122776597510713e-05,
"loss": 1.0294,
"step": 1409
},
{
"epoch": 3.60613810741688,
"grad_norm": 0.2185806876530497,
"learning_rate": 1.1215106334185201e-05,
"loss": 1.0024,
"step": 1410
},
{
"epoch": 3.608695652173913,
"grad_norm": 0.24854116957417377,
"learning_rate": 1.1202444716361247e-05,
"loss": 1.0451,
"step": 1411
},
{
"epoch": 3.6112531969309463,
"grad_norm": 0.2045525689869136,
"learning_rate": 1.1189781142234917e-05,
"loss": 1.0635,
"step": 1412
},
{
"epoch": 3.6138107416879794,
"grad_norm": 0.2399433598230184,
"learning_rate": 1.1177115632409064e-05,
"loss": 1.0177,
"step": 1413
},
{
"epoch": 3.6163682864450126,
"grad_norm": 0.2415017313404832,
"learning_rate": 1.1164448207489673e-05,
"loss": 1.0379,
"step": 1414
},
{
"epoch": 3.618925831202046,
"grad_norm": 0.21319360249943278,
"learning_rate": 1.1151778888085856e-05,
"loss": 1.0179,
"step": 1415
},
{
"epoch": 3.6214833759590794,
"grad_norm": 0.24881166658392342,
"learning_rate": 1.1139107694809806e-05,
"loss": 1.0392,
"step": 1416
},
{
"epoch": 3.6240409207161126,
"grad_norm": 0.19415985264760977,
"learning_rate": 1.1126434648276756e-05,
"loss": 1.0124,
"step": 1417
},
{
"epoch": 3.626598465473146,
"grad_norm": 0.25642703103922565,
"learning_rate": 1.1113759769104965e-05,
"loss": 1.0496,
"step": 1418
},
{
"epoch": 3.629156010230179,
"grad_norm": 0.2492878689877881,
"learning_rate": 1.1101083077915667e-05,
"loss": 1.043,
"step": 1419
},
{
"epoch": 3.631713554987212,
"grad_norm": 0.1983125579481505,
"learning_rate": 1.1088404595333046e-05,
"loss": 1.0449,
"step": 1420
},
{
"epoch": 3.634271099744246,
"grad_norm": 0.21827713474511093,
"learning_rate": 1.1075724341984201e-05,
"loss": 1.0622,
"step": 1421
},
{
"epoch": 3.6368286445012785,
"grad_norm": 0.23619084555258635,
"learning_rate": 1.1063042338499113e-05,
"loss": 1.015,
"step": 1422
},
{
"epoch": 3.639386189258312,
"grad_norm": 0.20336660531825468,
"learning_rate": 1.1050358605510606e-05,
"loss": 1.0413,
"step": 1423
},
{
"epoch": 3.6419437340153453,
"grad_norm": 0.2421386235557971,
"learning_rate": 1.1037673163654321e-05,
"loss": 1.0307,
"step": 1424
},
{
"epoch": 3.6445012787723785,
"grad_norm": 0.22360499286457716,
"learning_rate": 1.1024986033568683e-05,
"loss": 1.0605,
"step": 1425
},
{
"epoch": 3.6470588235294117,
"grad_norm": 0.2378376933825962,
"learning_rate": 1.101229723589485e-05,
"loss": 1.0192,
"step": 1426
},
{
"epoch": 3.649616368286445,
"grad_norm": 0.22968460013912853,
"learning_rate": 1.099960679127671e-05,
"loss": 1.0349,
"step": 1427
},
{
"epoch": 3.6521739130434785,
"grad_norm": 0.23158540102865127,
"learning_rate": 1.0986914720360821e-05,
"loss": 1.0253,
"step": 1428
},
{
"epoch": 3.6547314578005117,
"grad_norm": 0.22013393117978197,
"learning_rate": 1.097422104379639e-05,
"loss": 1.018,
"step": 1429
},
{
"epoch": 3.657289002557545,
"grad_norm": 0.22220097208242998,
"learning_rate": 1.0961525782235233e-05,
"loss": 1.0473,
"step": 1430
},
{
"epoch": 3.659846547314578,
"grad_norm": 0.22194116899976712,
"learning_rate": 1.0948828956331752e-05,
"loss": 1.0424,
"step": 1431
},
{
"epoch": 3.662404092071611,
"grad_norm": 0.1983453396349903,
"learning_rate": 1.0936130586742881e-05,
"loss": 1.0453,
"step": 1432
},
{
"epoch": 3.6649616368286444,
"grad_norm": 0.2327743943604014,
"learning_rate": 1.0923430694128074e-05,
"loss": 1.0193,
"step": 1433
},
{
"epoch": 3.6675191815856776,
"grad_norm": 0.21867884439727386,
"learning_rate": 1.091072929914927e-05,
"loss": 1.0256,
"step": 1434
},
{
"epoch": 3.670076726342711,
"grad_norm": 0.23080732244405422,
"learning_rate": 1.0898026422470838e-05,
"loss": 1.0232,
"step": 1435
},
{
"epoch": 3.6726342710997444,
"grad_norm": 0.22857566907679472,
"learning_rate": 1.0885322084759566e-05,
"loss": 1.0536,
"step": 1436
},
{
"epoch": 3.6751918158567776,
"grad_norm": 0.2520804757587095,
"learning_rate": 1.0872616306684616e-05,
"loss": 1.0287,
"step": 1437
},
{
"epoch": 3.6777493606138107,
"grad_norm": 0.2469698171523125,
"learning_rate": 1.0859909108917497e-05,
"loss": 1.0909,
"step": 1438
},
{
"epoch": 3.680306905370844,
"grad_norm": 0.2327692634720372,
"learning_rate": 1.084720051213202e-05,
"loss": 1.0193,
"step": 1439
},
{
"epoch": 3.682864450127877,
"grad_norm": 0.23658961049768784,
"learning_rate": 1.0834490537004286e-05,
"loss": 1.0212,
"step": 1440
},
{
"epoch": 3.6854219948849103,
"grad_norm": 0.20942394628132058,
"learning_rate": 1.0821779204212623e-05,
"loss": 1.0249,
"step": 1441
},
{
"epoch": 3.687979539641944,
"grad_norm": 0.23145657493822064,
"learning_rate": 1.0809066534437576e-05,
"loss": 1.0179,
"step": 1442
},
{
"epoch": 3.690537084398977,
"grad_norm": 0.1999453161376075,
"learning_rate": 1.0796352548361863e-05,
"loss": 1.0026,
"step": 1443
},
{
"epoch": 3.6930946291560103,
"grad_norm": 0.22035660036843002,
"learning_rate": 1.0783637266670348e-05,
"loss": 1.0287,
"step": 1444
},
{
"epoch": 3.6956521739130435,
"grad_norm": 0.19317194516834582,
"learning_rate": 1.0770920710049997e-05,
"loss": 1.0507,
"step": 1445
},
{
"epoch": 3.6982097186700766,
"grad_norm": 0.2457010945328612,
"learning_rate": 1.0758202899189852e-05,
"loss": 1.0135,
"step": 1446
},
{
"epoch": 3.70076726342711,
"grad_norm": 0.18287871278152357,
"learning_rate": 1.0745483854780996e-05,
"loss": 1.0408,
"step": 1447
},
{
"epoch": 3.703324808184143,
"grad_norm": 0.23748668263508885,
"learning_rate": 1.073276359751652e-05,
"loss": 1.0642,
"step": 1448
},
{
"epoch": 3.7058823529411766,
"grad_norm": 0.22123508756316554,
"learning_rate": 1.0720042148091487e-05,
"loss": 1.0136,
"step": 1449
},
{
"epoch": 3.70843989769821,
"grad_norm": 0.23936061656812962,
"learning_rate": 1.0707319527202902e-05,
"loss": 1.0297,
"step": 1450
},
{
"epoch": 3.710997442455243,
"grad_norm": 0.27579723622779695,
"learning_rate": 1.0694595755549668e-05,
"loss": 1.0088,
"step": 1451
},
{
"epoch": 3.713554987212276,
"grad_norm": 0.2295449569053256,
"learning_rate": 1.0681870853832572e-05,
"loss": 1.0411,
"step": 1452
},
{
"epoch": 3.7161125319693094,
"grad_norm": 0.21165912842223478,
"learning_rate": 1.066914484275423e-05,
"loss": 1.0237,
"step": 1453
},
{
"epoch": 3.718670076726343,
"grad_norm": 0.22373624538155187,
"learning_rate": 1.0656417743019065e-05,
"loss": 1.0661,
"step": 1454
},
{
"epoch": 3.7212276214833757,
"grad_norm": 0.18604305862261736,
"learning_rate": 1.0643689575333276e-05,
"loss": 1.0205,
"step": 1455
},
{
"epoch": 3.7237851662404093,
"grad_norm": 0.22160309843387682,
"learning_rate": 1.0630960360404793e-05,
"loss": 1.0179,
"step": 1456
},
{
"epoch": 3.7263427109974425,
"grad_norm": 0.1910813020463846,
"learning_rate": 1.061823011894326e-05,
"loss": 1.0622,
"step": 1457
},
{
"epoch": 3.7289002557544757,
"grad_norm": 0.22862715748972842,
"learning_rate": 1.0605498871659974e-05,
"loss": 1.0185,
"step": 1458
},
{
"epoch": 3.731457800511509,
"grad_norm": 0.20341936295394042,
"learning_rate": 1.0592766639267885e-05,
"loss": 1.0534,
"step": 1459
},
{
"epoch": 3.734015345268542,
"grad_norm": 0.2403253522185079,
"learning_rate": 1.0580033442481532e-05,
"loss": 1.0384,
"step": 1460
},
{
"epoch": 3.7365728900255757,
"grad_norm": 0.22338961464147264,
"learning_rate": 1.0567299302017038e-05,
"loss": 1.0143,
"step": 1461
},
{
"epoch": 3.7391304347826084,
"grad_norm": 0.2117212049005623,
"learning_rate": 1.0554564238592051e-05,
"loss": 1.021,
"step": 1462
},
{
"epoch": 3.741687979539642,
"grad_norm": 0.2254372260082909,
"learning_rate": 1.0541828272925721e-05,
"loss": 1.0292,
"step": 1463
},
{
"epoch": 3.7442455242966752,
"grad_norm": 0.1922734992717323,
"learning_rate": 1.0529091425738669e-05,
"loss": 1.0489,
"step": 1464
},
{
"epoch": 3.7468030690537084,
"grad_norm": 0.21486062627786348,
"learning_rate": 1.0516353717752947e-05,
"loss": 1.0359,
"step": 1465
},
{
"epoch": 3.7493606138107416,
"grad_norm": 0.19407217948842267,
"learning_rate": 1.0503615169692012e-05,
"loss": 1.0342,
"step": 1466
},
{
"epoch": 3.7519181585677748,
"grad_norm": 0.1785805281257786,
"learning_rate": 1.0490875802280685e-05,
"loss": 1.0353,
"step": 1467
},
{
"epoch": 3.7544757033248084,
"grad_norm": 0.20291577459751503,
"learning_rate": 1.0478135636245122e-05,
"loss": 1.0306,
"step": 1468
},
{
"epoch": 3.7570332480818416,
"grad_norm": 0.1982096205595046,
"learning_rate": 1.046539469231277e-05,
"loss": 1.0548,
"step": 1469
},
{
"epoch": 3.7595907928388748,
"grad_norm": 0.20930042720158404,
"learning_rate": 1.0452652991212357e-05,
"loss": 1.0094,
"step": 1470
},
{
"epoch": 3.762148337595908,
"grad_norm": 0.19919273397375814,
"learning_rate": 1.0439910553673829e-05,
"loss": 1.0439,
"step": 1471
},
{
"epoch": 3.764705882352941,
"grad_norm": 0.22254826567261315,
"learning_rate": 1.0427167400428331e-05,
"loss": 1.0373,
"step": 1472
},
{
"epoch": 3.7672634271099743,
"grad_norm": 0.22854611711688827,
"learning_rate": 1.0414423552208184e-05,
"loss": 1.0199,
"step": 1473
},
{
"epoch": 3.7698209718670075,
"grad_norm": 0.3654589035727414,
"learning_rate": 1.0401679029746828e-05,
"loss": 1.0311,
"step": 1474
},
{
"epoch": 3.772378516624041,
"grad_norm": 0.19477682817923897,
"learning_rate": 1.038893385377881e-05,
"loss": 1.0445,
"step": 1475
},
{
"epoch": 3.7749360613810743,
"grad_norm": 0.2035068833502665,
"learning_rate": 1.0376188045039723e-05,
"loss": 1.035,
"step": 1476
},
{
"epoch": 3.7774936061381075,
"grad_norm": 0.20207740056727894,
"learning_rate": 1.0363441624266213e-05,
"loss": 1.0054,
"step": 1477
},
{
"epoch": 3.7800511508951407,
"grad_norm": 0.23108316839210677,
"learning_rate": 1.0350694612195905e-05,
"loss": 1.0299,
"step": 1478
},
{
"epoch": 3.782608695652174,
"grad_norm": 0.19921910618488686,
"learning_rate": 1.0337947029567388e-05,
"loss": 1.013,
"step": 1479
},
{
"epoch": 3.785166240409207,
"grad_norm": 0.19609376442655463,
"learning_rate": 1.0325198897120183e-05,
"loss": 1.0239,
"step": 1480
},
{
"epoch": 3.78772378516624,
"grad_norm": 0.2039103534692172,
"learning_rate": 1.0312450235594706e-05,
"loss": 1.0262,
"step": 1481
},
{
"epoch": 3.790281329923274,
"grad_norm": 0.19686683259289736,
"learning_rate": 1.0299701065732235e-05,
"loss": 1.0444,
"step": 1482
},
{
"epoch": 3.792838874680307,
"grad_norm": 0.2031103792356114,
"learning_rate": 1.0286951408274865e-05,
"loss": 1.0993,
"step": 1483
},
{
"epoch": 3.79539641943734,
"grad_norm": 0.2263801739639009,
"learning_rate": 1.0274201283965497e-05,
"loss": 1.0409,
"step": 1484
},
{
"epoch": 3.7979539641943734,
"grad_norm": 0.17572315424279408,
"learning_rate": 1.0261450713547785e-05,
"loss": 1.075,
"step": 1485
},
{
"epoch": 3.8005115089514065,
"grad_norm": 0.27023491274755906,
"learning_rate": 1.0248699717766107e-05,
"loss": 1.0679,
"step": 1486
},
{
"epoch": 3.80306905370844,
"grad_norm": 0.1713633148592625,
"learning_rate": 1.023594831736554e-05,
"loss": 1.0484,
"step": 1487
},
{
"epoch": 3.805626598465473,
"grad_norm": 0.2367623046752298,
"learning_rate": 1.0223196533091813e-05,
"loss": 1.0287,
"step": 1488
},
{
"epoch": 3.8081841432225065,
"grad_norm": 0.1984118987646221,
"learning_rate": 1.0210444385691282e-05,
"loss": 1.0373,
"step": 1489
},
{
"epoch": 3.8107416879795397,
"grad_norm": 0.19013291547902408,
"learning_rate": 1.0197691895910895e-05,
"loss": 1.0396,
"step": 1490
},
{
"epoch": 3.813299232736573,
"grad_norm": 0.2262690201508357,
"learning_rate": 1.0184939084498153e-05,
"loss": 1.0383,
"step": 1491
},
{
"epoch": 3.815856777493606,
"grad_norm": 0.21345095926753077,
"learning_rate": 1.0172185972201082e-05,
"loss": 1.0341,
"step": 1492
},
{
"epoch": 3.8184143222506393,
"grad_norm": 0.18180827453898485,
"learning_rate": 1.01594325797682e-05,
"loss": 1.0419,
"step": 1493
},
{
"epoch": 3.820971867007673,
"grad_norm": 0.23760325057681905,
"learning_rate": 1.0146678927948484e-05,
"loss": 1.0178,
"step": 1494
},
{
"epoch": 3.8235294117647056,
"grad_norm": 0.18084043730292876,
"learning_rate": 1.013392503749132e-05,
"loss": 1.0701,
"step": 1495
},
{
"epoch": 3.8260869565217392,
"grad_norm": 0.18619103410121773,
"learning_rate": 1.0121170929146493e-05,
"loss": 1.0359,
"step": 1496
},
{
"epoch": 3.8286445012787724,
"grad_norm": 0.1814058213229099,
"learning_rate": 1.0108416623664142e-05,
"loss": 1.0483,
"step": 1497
},
{
"epoch": 3.8312020460358056,
"grad_norm": 0.17659823284048892,
"learning_rate": 1.0095662141794725e-05,
"loss": 1.0167,
"step": 1498
},
{
"epoch": 3.833759590792839,
"grad_norm": 0.18093838446366517,
"learning_rate": 1.0082907504288977e-05,
"loss": 1.0271,
"step": 1499
},
{
"epoch": 3.836317135549872,
"grad_norm": 0.19401662423230362,
"learning_rate": 1.0070152731897911e-05,
"loss": 1.0525,
"step": 1500
},
{
"epoch": 3.8388746803069056,
"grad_norm": 0.17897896363370017,
"learning_rate": 1.0057397845372734e-05,
"loss": 1.0354,
"step": 1501
},
{
"epoch": 3.8414322250639388,
"grad_norm": 0.18581636595029996,
"learning_rate": 1.004464286546485e-05,
"loss": 1.0439,
"step": 1502
},
{
"epoch": 3.843989769820972,
"grad_norm": 0.17458922536736418,
"learning_rate": 1.0031887812925818e-05,
"loss": 1.0073,
"step": 1503
},
{
"epoch": 3.846547314578005,
"grad_norm": 0.18401279215992355,
"learning_rate": 1.0019132708507307e-05,
"loss": 1.0549,
"step": 1504
},
{
"epoch": 3.8491048593350383,
"grad_norm": 0.17886260918603583,
"learning_rate": 1.0006377572961075e-05,
"loss": 1.056,
"step": 1505
},
{
"epoch": 3.8516624040920715,
"grad_norm": 0.18640944420175584,
"learning_rate": 9.99362242703893e-06,
"loss": 1.0317,
"step": 1506
},
{
"epoch": 3.8542199488491047,
"grad_norm": 0.1724777242125077,
"learning_rate": 9.980867291492697e-06,
"loss": 1.0496,
"step": 1507
},
{
"epoch": 3.8567774936061383,
"grad_norm": 0.17736614296923925,
"learning_rate": 9.968112187074187e-06,
"loss": 1.0321,
"step": 1508
},
{
"epoch": 3.8593350383631715,
"grad_norm": 0.18919776197181185,
"learning_rate": 9.955357134535153e-06,
"loss": 1.0612,
"step": 1509
},
{
"epoch": 3.8618925831202047,
"grad_norm": 0.17013450287572257,
"learning_rate": 9.94260215462727e-06,
"loss": 1.0371,
"step": 1510
},
{
"epoch": 3.864450127877238,
"grad_norm": 0.1795391930284376,
"learning_rate": 9.929847268102092e-06,
"loss": 1.0116,
"step": 1511
},
{
"epoch": 3.867007672634271,
"grad_norm": 0.18010343872623125,
"learning_rate": 9.917092495711023e-06,
"loss": 0.9975,
"step": 1512
},
{
"epoch": 3.869565217391304,
"grad_norm": 0.2018143041172149,
"learning_rate": 9.904337858205282e-06,
"loss": 1.0261,
"step": 1513
},
{
"epoch": 3.8721227621483374,
"grad_norm": 0.20189193249637963,
"learning_rate": 9.891583376335861e-06,
"loss": 1.036,
"step": 1514
},
{
"epoch": 3.874680306905371,
"grad_norm": 0.18604316403857601,
"learning_rate": 9.87882907085351e-06,
"loss": 1.0353,
"step": 1515
},
{
"epoch": 3.877237851662404,
"grad_norm": 0.1764086076077849,
"learning_rate": 9.866074962508684e-06,
"loss": 1.048,
"step": 1516
},
{
"epoch": 3.8797953964194374,
"grad_norm": 0.18861859299069214,
"learning_rate": 9.85332107205152e-06,
"loss": 1.0719,
"step": 1517
},
{
"epoch": 3.8823529411764706,
"grad_norm": 0.1729886347071538,
"learning_rate": 9.840567420231802e-06,
"loss": 1.0436,
"step": 1518
},
{
"epoch": 3.8849104859335037,
"grad_norm": 0.20230041478663247,
"learning_rate": 9.82781402779892e-06,
"loss": 1.0611,
"step": 1519
},
{
"epoch": 3.887468030690537,
"grad_norm": 0.19599063188718716,
"learning_rate": 9.815060915501852e-06,
"loss": 1.0517,
"step": 1520
},
{
"epoch": 3.89002557544757,
"grad_norm": 0.20556197980895194,
"learning_rate": 9.802308104089109e-06,
"loss": 1.0249,
"step": 1521
},
{
"epoch": 3.8925831202046037,
"grad_norm": 0.21413593644142717,
"learning_rate": 9.789555614308721e-06,
"loss": 0.9947,
"step": 1522
},
{
"epoch": 3.895140664961637,
"grad_norm": 0.20287758208508144,
"learning_rate": 9.77680346690819e-06,
"loss": 1.0352,
"step": 1523
},
{
"epoch": 3.89769820971867,
"grad_norm": 0.19248950316327032,
"learning_rate": 9.764051682634462e-06,
"loss": 1.0275,
"step": 1524
},
{
"epoch": 3.9002557544757033,
"grad_norm": 0.22258046212032104,
"learning_rate": 9.751300282233895e-06,
"loss": 1.0534,
"step": 1525
},
{
"epoch": 3.9028132992327365,
"grad_norm": 0.21347571901775975,
"learning_rate": 9.738549286452218e-06,
"loss": 1.038,
"step": 1526
},
{
"epoch": 3.90537084398977,
"grad_norm": 0.2280185995042673,
"learning_rate": 9.725798716034507e-06,
"loss": 1.0286,
"step": 1527
},
{
"epoch": 3.907928388746803,
"grad_norm": 0.20202933779134605,
"learning_rate": 9.713048591725138e-06,
"loss": 1.0448,
"step": 1528
},
{
"epoch": 3.9104859335038364,
"grad_norm": 0.20920944736139577,
"learning_rate": 9.700298934267766e-06,
"loss": 1.0069,
"step": 1529
},
{
"epoch": 3.9130434782608696,
"grad_norm": 0.19240200507914293,
"learning_rate": 9.687549764405296e-06,
"loss": 1.0376,
"step": 1530
},
{
"epoch": 3.915601023017903,
"grad_norm": 0.20292905124684749,
"learning_rate": 9.674801102879817e-06,
"loss": 1.0274,
"step": 1531
},
{
"epoch": 3.918158567774936,
"grad_norm": 0.19062905855598355,
"learning_rate": 9.662052970432617e-06,
"loss": 1.0407,
"step": 1532
},
{
"epoch": 3.920716112531969,
"grad_norm": 0.21406493946615143,
"learning_rate": 9.6493053878041e-06,
"loss": 1.0401,
"step": 1533
},
{
"epoch": 3.923273657289003,
"grad_norm": 0.19190236583371453,
"learning_rate": 9.63655837573379e-06,
"loss": 1.0521,
"step": 1534
},
{
"epoch": 3.9258312020460355,
"grad_norm": 0.22868484745745557,
"learning_rate": 9.623811954960279e-06,
"loss": 1.0396,
"step": 1535
},
{
"epoch": 3.928388746803069,
"grad_norm": 0.1896213962401851,
"learning_rate": 9.611066146221192e-06,
"loss": 1.0272,
"step": 1536
},
{
"epoch": 3.9309462915601023,
"grad_norm": 0.208558000446644,
"learning_rate": 9.598320970253175e-06,
"loss": 1.0263,
"step": 1537
},
{
"epoch": 3.9335038363171355,
"grad_norm": 0.18215621037833685,
"learning_rate": 9.585576447791817e-06,
"loss": 1.044,
"step": 1538
},
{
"epoch": 3.9360613810741687,
"grad_norm": 0.17351304593560926,
"learning_rate": 9.572832599571674e-06,
"loss": 1.0268,
"step": 1539
},
{
"epoch": 3.938618925831202,
"grad_norm": 0.22389061474679745,
"learning_rate": 9.560089446326175e-06,
"loss": 1.0313,
"step": 1540
},
{
"epoch": 3.9411764705882355,
"grad_norm": 0.17547633776625562,
"learning_rate": 9.547347008787648e-06,
"loss": 1.0321,
"step": 1541
},
{
"epoch": 3.9437340153452687,
"grad_norm": 0.21231411571444475,
"learning_rate": 9.534605307687233e-06,
"loss": 1.0027,
"step": 1542
},
{
"epoch": 3.946291560102302,
"grad_norm": 0.1792239552721382,
"learning_rate": 9.52186436375488e-06,
"loss": 1.0272,
"step": 1543
},
{
"epoch": 3.948849104859335,
"grad_norm": 0.21595336710565813,
"learning_rate": 9.509124197719317e-06,
"loss": 1.0074,
"step": 1544
},
{
"epoch": 3.9514066496163682,
"grad_norm": 0.20310879984969743,
"learning_rate": 9.496384830307988e-06,
"loss": 1.0481,
"step": 1545
},
{
"epoch": 3.9539641943734014,
"grad_norm": 0.20949639165674833,
"learning_rate": 9.483646282247056e-06,
"loss": 1.0167,
"step": 1546
},
{
"epoch": 3.9565217391304346,
"grad_norm": 0.23427285497954728,
"learning_rate": 9.470908574261333e-06,
"loss": 1.0478,
"step": 1547
},
{
"epoch": 3.959079283887468,
"grad_norm": 0.1881836520862583,
"learning_rate": 9.458171727074284e-06,
"loss": 1.0257,
"step": 1548
},
{
"epoch": 3.9616368286445014,
"grad_norm": 0.22079043196824938,
"learning_rate": 9.44543576140795e-06,
"loss": 1.0904,
"step": 1549
},
{
"epoch": 3.9641943734015346,
"grad_norm": 0.18959168411837335,
"learning_rate": 9.432700697982962e-06,
"loss": 1.0562,
"step": 1550
},
{
"epoch": 3.9667519181585678,
"grad_norm": 0.1881932409897208,
"learning_rate": 9.419966557518472e-06,
"loss": 1.048,
"step": 1551
},
{
"epoch": 3.969309462915601,
"grad_norm": 0.20694575807793056,
"learning_rate": 9.407233360732119e-06,
"loss": 1.0453,
"step": 1552
},
{
"epoch": 3.971867007672634,
"grad_norm": 0.21141511803194477,
"learning_rate": 9.39450112834003e-06,
"loss": 1.0416,
"step": 1553
},
{
"epoch": 3.9744245524296673,
"grad_norm": 0.19924380600743072,
"learning_rate": 9.381769881056744e-06,
"loss": 1.0302,
"step": 1554
},
{
"epoch": 3.976982097186701,
"grad_norm": 0.18443702573710982,
"learning_rate": 9.36903963959521e-06,
"loss": 1.0509,
"step": 1555
},
{
"epoch": 3.979539641943734,
"grad_norm": 0.2130900807101153,
"learning_rate": 9.356310424666725e-06,
"loss": 1.0674,
"step": 1556
},
{
"epoch": 3.9820971867007673,
"grad_norm": 0.18076464736813797,
"learning_rate": 9.343582256980937e-06,
"loss": 1.0327,
"step": 1557
},
{
"epoch": 3.9846547314578005,
"grad_norm": 0.19770573119978005,
"learning_rate": 9.330855157245776e-06,
"loss": 1.049,
"step": 1558
},
{
"epoch": 3.9872122762148337,
"grad_norm": 0.18941088064084555,
"learning_rate": 9.318129146167432e-06,
"loss": 1.0285,
"step": 1559
},
{
"epoch": 3.9897698209718673,
"grad_norm": 0.21949442372495884,
"learning_rate": 9.305404244450337e-06,
"loss": 1.0447,
"step": 1560
},
{
"epoch": 3.9923273657289,
"grad_norm": 0.19665403880426255,
"learning_rate": 9.292680472797101e-06,
"loss": 1.0411,
"step": 1561
},
{
"epoch": 3.9948849104859336,
"grad_norm": 0.19058036356127872,
"learning_rate": 9.279957851908513e-06,
"loss": 1.0535,
"step": 1562
},
{
"epoch": 3.997442455242967,
"grad_norm": 0.18814319318672243,
"learning_rate": 9.267236402483482e-06,
"loss": 1.036,
"step": 1563
},
{
"epoch": 4.0,
"grad_norm": 0.1865356816625339,
"learning_rate": 9.254516145219006e-06,
"loss": 1.0435,
"step": 1564
},
{
"epoch": 4.002557544757034,
"grad_norm": 0.19230450271770366,
"learning_rate": 9.241797100810152e-06,
"loss": 1.0143,
"step": 1565
},
{
"epoch": 4.005115089514066,
"grad_norm": 0.19899721133072965,
"learning_rate": 9.229079289950005e-06,
"loss": 1.0249,
"step": 1566
},
{
"epoch": 4.0076726342711,
"grad_norm": 0.21185878359559354,
"learning_rate": 9.216362733329657e-06,
"loss": 0.9987,
"step": 1567
},
{
"epoch": 4.010230179028133,
"grad_norm": 0.1985629222033457,
"learning_rate": 9.203647451638138e-06,
"loss": 1.0198,
"step": 1568
},
{
"epoch": 4.012787723785166,
"grad_norm": 0.1930121039553769,
"learning_rate": 9.190933465562426e-06,
"loss": 1.0328,
"step": 1569
},
{
"epoch": 4.015345268542199,
"grad_norm": 0.2189356848452908,
"learning_rate": 9.17822079578738e-06,
"loss": 1.0358,
"step": 1570
},
{
"epoch": 4.017902813299233,
"grad_norm": 0.18197666560197398,
"learning_rate": 9.165509462995716e-06,
"loss": 1.0312,
"step": 1571
},
{
"epoch": 4.020460358056266,
"grad_norm": 0.22141370700870244,
"learning_rate": 9.152799487867981e-06,
"loss": 1.0167,
"step": 1572
},
{
"epoch": 4.023017902813299,
"grad_norm": 0.2061928144217363,
"learning_rate": 9.140090891082506e-06,
"loss": 1.0173,
"step": 1573
},
{
"epoch": 4.025575447570333,
"grad_norm": 0.1855420730525284,
"learning_rate": 9.127383693315387e-06,
"loss": 1.0122,
"step": 1574
},
{
"epoch": 4.028132992327365,
"grad_norm": 0.19054702381827276,
"learning_rate": 9.114677915240436e-06,
"loss": 1.0207,
"step": 1575
},
{
"epoch": 4.030690537084399,
"grad_norm": 0.17786433578081798,
"learning_rate": 9.101973577529164e-06,
"loss": 1.0339,
"step": 1576
},
{
"epoch": 4.033248081841432,
"grad_norm": 0.18910562787321678,
"learning_rate": 9.089270700850733e-06,
"loss": 1.0007,
"step": 1577
},
{
"epoch": 4.035805626598465,
"grad_norm": 0.18519350419636166,
"learning_rate": 9.076569305871926e-06,
"loss": 1.0314,
"step": 1578
},
{
"epoch": 4.038363171355499,
"grad_norm": 0.21754655747857035,
"learning_rate": 9.063869413257124e-06,
"loss": 1.0302,
"step": 1579
},
{
"epoch": 4.040920716112532,
"grad_norm": 0.18004679417947927,
"learning_rate": 9.051171043668251e-06,
"loss": 1.0476,
"step": 1580
},
{
"epoch": 4.043478260869565,
"grad_norm": 0.2168920363400877,
"learning_rate": 9.038474217764768e-06,
"loss": 1.025,
"step": 1581
},
{
"epoch": 4.046035805626598,
"grad_norm": 0.19274796431055907,
"learning_rate": 9.025778956203611e-06,
"loss": 1.0098,
"step": 1582
},
{
"epoch": 4.048593350383632,
"grad_norm": 0.19201028214018007,
"learning_rate": 9.013085279639178e-06,
"loss": 1.0017,
"step": 1583
},
{
"epoch": 4.051150895140665,
"grad_norm": 0.19629486524205142,
"learning_rate": 9.000393208723291e-06,
"loss": 1.0219,
"step": 1584
},
{
"epoch": 4.053708439897698,
"grad_norm": 0.19752451256428386,
"learning_rate": 8.987702764105151e-06,
"loss": 1.0177,
"step": 1585
},
{
"epoch": 4.056265984654732,
"grad_norm": 0.20166118830323768,
"learning_rate": 8.975013966431323e-06,
"loss": 1.0601,
"step": 1586
},
{
"epoch": 4.0588235294117645,
"grad_norm": 0.17326861120237855,
"learning_rate": 8.96232683634568e-06,
"loss": 0.9847,
"step": 1587
},
{
"epoch": 4.061381074168798,
"grad_norm": 0.1898245941021511,
"learning_rate": 8.949641394489399e-06,
"loss": 1.0099,
"step": 1588
},
{
"epoch": 4.063938618925831,
"grad_norm": 0.1700392821316134,
"learning_rate": 8.93695766150089e-06,
"loss": 1.0538,
"step": 1589
},
{
"epoch": 4.0664961636828645,
"grad_norm": 0.1682061615806585,
"learning_rate": 8.9242756580158e-06,
"loss": 1.0172,
"step": 1590
},
{
"epoch": 4.069053708439898,
"grad_norm": 0.19303997092308417,
"learning_rate": 8.911595404666957e-06,
"loss": 1.0546,
"step": 1591
},
{
"epoch": 4.071611253196931,
"grad_norm": 0.1654939906619837,
"learning_rate": 8.898916922084336e-06,
"loss": 1.0464,
"step": 1592
},
{
"epoch": 4.0741687979539645,
"grad_norm": 0.18143405806846177,
"learning_rate": 8.88624023089504e-06,
"loss": 1.0545,
"step": 1593
},
{
"epoch": 4.076726342710997,
"grad_norm": 0.20747010533584376,
"learning_rate": 8.873565351723249e-06,
"loss": 1.0589,
"step": 1594
},
{
"epoch": 4.079283887468031,
"grad_norm": 0.15953653305890375,
"learning_rate": 8.8608923051902e-06,
"loss": 1.0179,
"step": 1595
},
{
"epoch": 4.081841432225064,
"grad_norm": 0.2035902582767619,
"learning_rate": 8.848221111914147e-06,
"loss": 1.0447,
"step": 1596
},
{
"epoch": 4.084398976982097,
"grad_norm": 0.15347759439362155,
"learning_rate": 8.835551792510329e-06,
"loss": 1.0307,
"step": 1597
},
{
"epoch": 4.086956521739131,
"grad_norm": 0.20574769500088766,
"learning_rate": 8.822884367590941e-06,
"loss": 0.9952,
"step": 1598
},
{
"epoch": 4.089514066496164,
"grad_norm": 0.1835496415175651,
"learning_rate": 8.810218857765085e-06,
"loss": 1.0005,
"step": 1599
},
{
"epoch": 4.092071611253197,
"grad_norm": 0.20530099186755948,
"learning_rate": 8.79755528363876e-06,
"loss": 1.0361,
"step": 1600
},
{
"epoch": 4.09462915601023,
"grad_norm": 0.2026938929869877,
"learning_rate": 8.7848936658148e-06,
"loss": 1.0328,
"step": 1601
},
{
"epoch": 4.0971867007672635,
"grad_norm": 0.1907662170906002,
"learning_rate": 8.772234024892872e-06,
"loss": 1.0133,
"step": 1602
},
{
"epoch": 4.099744245524296,
"grad_norm": 0.19617684565754476,
"learning_rate": 8.759576381469425e-06,
"loss": 1.0027,
"step": 1603
},
{
"epoch": 4.10230179028133,
"grad_norm": 0.17534476994793663,
"learning_rate": 8.746920756137642e-06,
"loss": 1.0437,
"step": 1604
},
{
"epoch": 4.1048593350383635,
"grad_norm": 0.20521166727954332,
"learning_rate": 8.734267169487444e-06,
"loss": 1.0265,
"step": 1605
},
{
"epoch": 4.107416879795396,
"grad_norm": 0.17225400361630142,
"learning_rate": 8.721615642105417e-06,
"loss": 1.0338,
"step": 1606
},
{
"epoch": 4.10997442455243,
"grad_norm": 0.21382338032724127,
"learning_rate": 8.708966194574814e-06,
"loss": 1.0083,
"step": 1607
},
{
"epoch": 4.112531969309463,
"grad_norm": 0.16180422908572098,
"learning_rate": 8.696318847475487e-06,
"loss": 1.0169,
"step": 1608
},
{
"epoch": 4.115089514066496,
"grad_norm": 0.23650182130816144,
"learning_rate": 8.68367362138388e-06,
"loss": 1.0323,
"step": 1609
},
{
"epoch": 4.117647058823529,
"grad_norm": 0.18535588146645351,
"learning_rate": 8.671030536872995e-06,
"loss": 1.0299,
"step": 1610
},
{
"epoch": 4.120204603580563,
"grad_norm": 0.17955290128121904,
"learning_rate": 8.658389614512325e-06,
"loss": 1.0189,
"step": 1611
},
{
"epoch": 4.122762148337596,
"grad_norm": 0.1782288851096717,
"learning_rate": 8.645750874867876e-06,
"loss": 1.0134,
"step": 1612
},
{
"epoch": 4.125319693094629,
"grad_norm": 0.18693604034380645,
"learning_rate": 8.633114338502073e-06,
"loss": 1.0403,
"step": 1613
},
{
"epoch": 4.127877237851663,
"grad_norm": 0.18248123513699424,
"learning_rate": 8.62048002597378e-06,
"loss": 1.0288,
"step": 1614
},
{
"epoch": 4.130434782608695,
"grad_norm": 0.18165634630490243,
"learning_rate": 8.607847957838227e-06,
"loss": 1.0301,
"step": 1615
},
{
"epoch": 4.132992327365729,
"grad_norm": 0.1803487141905229,
"learning_rate": 8.595218154647001e-06,
"loss": 1.0301,
"step": 1616
},
{
"epoch": 4.135549872122763,
"grad_norm": 0.18173901474688528,
"learning_rate": 8.58259063694801e-06,
"loss": 1.0222,
"step": 1617
},
{
"epoch": 4.138107416879795,
"grad_norm": 0.18078862560079437,
"learning_rate": 8.56996542528542e-06,
"loss": 1.0235,
"step": 1618
},
{
"epoch": 4.140664961636829,
"grad_norm": 0.1803693056043885,
"learning_rate": 8.55734254019968e-06,
"loss": 0.9988,
"step": 1619
},
{
"epoch": 4.143222506393862,
"grad_norm": 0.1865048325076587,
"learning_rate": 8.544722002227417e-06,
"loss": 1.0538,
"step": 1620
},
{
"epoch": 4.145780051150895,
"grad_norm": 0.17978097814336544,
"learning_rate": 8.532103831901472e-06,
"loss": 1.035,
"step": 1621
},
{
"epoch": 4.148337595907928,
"grad_norm": 0.23624978152806544,
"learning_rate": 8.519488049750808e-06,
"loss": 1.0298,
"step": 1622
},
{
"epoch": 4.150895140664962,
"grad_norm": 0.16381055698474817,
"learning_rate": 8.506874676300514e-06,
"loss": 1.0485,
"step": 1623
},
{
"epoch": 4.153452685421995,
"grad_norm": 0.19963138199162672,
"learning_rate": 8.494263732071772e-06,
"loss": 1.0092,
"step": 1624
},
{
"epoch": 4.156010230179028,
"grad_norm": 0.19251260911612733,
"learning_rate": 8.481655237581785e-06,
"loss": 1.0209,
"step": 1625
},
{
"epoch": 4.158567774936062,
"grad_norm": 0.17091450724555518,
"learning_rate": 8.469049213343798e-06,
"loss": 1.0358,
"step": 1626
},
{
"epoch": 4.161125319693094,
"grad_norm": 0.18111441891291247,
"learning_rate": 8.456445679867013e-06,
"loss": 1.0235,
"step": 1627
},
{
"epoch": 4.163682864450128,
"grad_norm": 0.1742001195215167,
"learning_rate": 8.443844657656596e-06,
"loss": 1.0436,
"step": 1628
},
{
"epoch": 4.166240409207161,
"grad_norm": 0.17755175605855264,
"learning_rate": 8.431246167213627e-06,
"loss": 1.0444,
"step": 1629
},
{
"epoch": 4.168797953964194,
"grad_norm": 0.17719860198513576,
"learning_rate": 8.418650229035054e-06,
"loss": 1.0321,
"step": 1630
},
{
"epoch": 4.171355498721228,
"grad_norm": 0.1606826181735471,
"learning_rate": 8.406056863613689e-06,
"loss": 1.0539,
"step": 1631
},
{
"epoch": 4.173913043478261,
"grad_norm": 0.1739885726513299,
"learning_rate": 8.393466091438139e-06,
"loss": 1.0282,
"step": 1632
},
{
"epoch": 4.176470588235294,
"grad_norm": 0.18218865497775108,
"learning_rate": 8.380877932992815e-06,
"loss": 1.0239,
"step": 1633
},
{
"epoch": 4.179028132992327,
"grad_norm": 0.16523774532642985,
"learning_rate": 8.368292408757853e-06,
"loss": 1.02,
"step": 1634
},
{
"epoch": 4.181585677749361,
"grad_norm": 0.17345180693087728,
"learning_rate": 8.355709539209121e-06,
"loss": 1.0392,
"step": 1635
},
{
"epoch": 4.1841432225063935,
"grad_norm": 0.17255097246631376,
"learning_rate": 8.343129344818162e-06,
"loss": 1.0714,
"step": 1636
},
{
"epoch": 4.186700767263427,
"grad_norm": 0.1814224170983909,
"learning_rate": 8.33055184605216e-06,
"loss": 1.0217,
"step": 1637
},
{
"epoch": 4.189258312020461,
"grad_norm": 0.1748560906889792,
"learning_rate": 8.317977063373925e-06,
"loss": 1.0391,
"step": 1638
},
{
"epoch": 4.1918158567774935,
"grad_norm": 0.18435771096605524,
"learning_rate": 8.305405017241837e-06,
"loss": 1.0215,
"step": 1639
},
{
"epoch": 4.194373401534527,
"grad_norm": 0.16909940397166726,
"learning_rate": 8.292835728109835e-06,
"loss": 1.0141,
"step": 1640
},
{
"epoch": 4.19693094629156,
"grad_norm": 0.16864611479976394,
"learning_rate": 8.28026921642736e-06,
"loss": 0.995,
"step": 1641
},
{
"epoch": 4.1994884910485935,
"grad_norm": 0.1832641724885349,
"learning_rate": 8.267705502639342e-06,
"loss": 1.0443,
"step": 1642
},
{
"epoch": 4.202046035805626,
"grad_norm": 0.15678971891456242,
"learning_rate": 8.255144607186161e-06,
"loss": 0.9988,
"step": 1643
},
{
"epoch": 4.20460358056266,
"grad_norm": 0.17026684913571113,
"learning_rate": 8.242586550503607e-06,
"loss": 1.0413,
"step": 1644
},
{
"epoch": 4.207161125319693,
"grad_norm": 0.17089179054567286,
"learning_rate": 8.230031353022855e-06,
"loss": 1.0305,
"step": 1645
},
{
"epoch": 4.209718670076726,
"grad_norm": 0.17613488393658056,
"learning_rate": 8.217479035170422e-06,
"loss": 1.0075,
"step": 1646
},
{
"epoch": 4.21227621483376,
"grad_norm": 0.15804554349273428,
"learning_rate": 8.204929617368147e-06,
"loss": 1.0119,
"step": 1647
},
{
"epoch": 4.2148337595907925,
"grad_norm": 0.20718638597658195,
"learning_rate": 8.192383120033147e-06,
"loss": 1.0239,
"step": 1648
},
{
"epoch": 4.217391304347826,
"grad_norm": 0.1845223450299457,
"learning_rate": 8.179839563577789e-06,
"loss": 1.0044,
"step": 1649
},
{
"epoch": 4.21994884910486,
"grad_norm": 0.1740911877816002,
"learning_rate": 8.167298968409658e-06,
"loss": 1.0114,
"step": 1650
},
{
"epoch": 4.2225063938618925,
"grad_norm": 0.17787524858695802,
"learning_rate": 8.154761354931513e-06,
"loss": 1.0342,
"step": 1651
},
{
"epoch": 4.225063938618926,
"grad_norm": 0.17981590233123262,
"learning_rate": 8.142226743541273e-06,
"loss": 1.0196,
"step": 1652
},
{
"epoch": 4.227621483375959,
"grad_norm": 0.15945346875306546,
"learning_rate": 8.12969515463196e-06,
"loss": 1.0319,
"step": 1653
},
{
"epoch": 4.2301790281329925,
"grad_norm": 0.1782254652095104,
"learning_rate": 8.117166608591693e-06,
"loss": 1.027,
"step": 1654
},
{
"epoch": 4.232736572890025,
"grad_norm": 0.16769675527664904,
"learning_rate": 8.104641125803628e-06,
"loss": 1.0512,
"step": 1655
},
{
"epoch": 4.235294117647059,
"grad_norm": 0.17673772312426278,
"learning_rate": 8.092118726645943e-06,
"loss": 1.0289,
"step": 1656
},
{
"epoch": 4.2378516624040925,
"grad_norm": 0.17775412310787495,
"learning_rate": 8.0795994314918e-06,
"loss": 1.0134,
"step": 1657
},
{
"epoch": 4.240409207161125,
"grad_norm": 0.165083768711067,
"learning_rate": 8.067083260709309e-06,
"loss": 1.0482,
"step": 1658
},
{
"epoch": 4.242966751918159,
"grad_norm": 0.19604799862438058,
"learning_rate": 8.054570234661498e-06,
"loss": 1.0317,
"step": 1659
},
{
"epoch": 4.245524296675192,
"grad_norm": 0.16528010613818045,
"learning_rate": 8.042060373706275e-06,
"loss": 1.0348,
"step": 1660
},
{
"epoch": 4.248081841432225,
"grad_norm": 0.1804031281677697,
"learning_rate": 8.029553698196405e-06,
"loss": 1.0401,
"step": 1661
},
{
"epoch": 4.250639386189258,
"grad_norm": 0.176393933273107,
"learning_rate": 8.017050228479467e-06,
"loss": 1.0356,
"step": 1662
},
{
"epoch": 4.253196930946292,
"grad_norm": 0.19395943497159726,
"learning_rate": 8.004549984897822e-06,
"loss": 1.0191,
"step": 1663
},
{
"epoch": 4.255754475703325,
"grad_norm": 0.17246963598612605,
"learning_rate": 7.992052987788586e-06,
"loss": 1.0162,
"step": 1664
},
{
"epoch": 4.258312020460358,
"grad_norm": 0.18066442113845643,
"learning_rate": 7.979559257483591e-06,
"loss": 1.0229,
"step": 1665
},
{
"epoch": 4.260869565217392,
"grad_norm": 0.1680697165366633,
"learning_rate": 7.967068814309359e-06,
"loss": 1.0202,
"step": 1666
},
{
"epoch": 4.263427109974424,
"grad_norm": 0.17705957749246876,
"learning_rate": 7.954581678587054e-06,
"loss": 1.0324,
"step": 1667
},
{
"epoch": 4.265984654731458,
"grad_norm": 0.16130768348650035,
"learning_rate": 7.942097870632467e-06,
"loss": 0.9793,
"step": 1668
},
{
"epoch": 4.268542199488491,
"grad_norm": 0.17498237044992782,
"learning_rate": 7.929617410755977e-06,
"loss": 1.0249,
"step": 1669
},
{
"epoch": 4.271099744245524,
"grad_norm": 0.1925424733299812,
"learning_rate": 7.917140319262507e-06,
"loss": 1.0365,
"step": 1670
},
{
"epoch": 4.273657289002558,
"grad_norm": 0.18797309789320532,
"learning_rate": 7.90466661645151e-06,
"loss": 1.0118,
"step": 1671
},
{
"epoch": 4.276214833759591,
"grad_norm": 0.16573297446104532,
"learning_rate": 7.892196322616912e-06,
"loss": 1.0247,
"step": 1672
},
{
"epoch": 4.278772378516624,
"grad_norm": 0.1925991067748996,
"learning_rate": 7.879729458047111e-06,
"loss": 0.978,
"step": 1673
},
{
"epoch": 4.281329923273657,
"grad_norm": 0.1758834459188358,
"learning_rate": 7.86726604302491e-06,
"loss": 1.0175,
"step": 1674
},
{
"epoch": 4.283887468030691,
"grad_norm": 0.16487956982839647,
"learning_rate": 7.854806097827507e-06,
"loss": 1.0288,
"step": 1675
},
{
"epoch": 4.286445012787723,
"grad_norm": 0.1787793037572042,
"learning_rate": 7.842349642726458e-06,
"loss": 1.0166,
"step": 1676
},
{
"epoch": 4.289002557544757,
"grad_norm": 0.1841366036398648,
"learning_rate": 7.829896697987627e-06,
"loss": 1.0348,
"step": 1677
},
{
"epoch": 4.291560102301791,
"grad_norm": 0.1576001038888875,
"learning_rate": 7.817447283871187e-06,
"loss": 1.0342,
"step": 1678
},
{
"epoch": 4.294117647058823,
"grad_norm": 0.17981916810192364,
"learning_rate": 7.80500142063155e-06,
"loss": 1.0214,
"step": 1679
},
{
"epoch": 4.296675191815857,
"grad_norm": 0.17518421051117097,
"learning_rate": 7.792559128517363e-06,
"loss": 1.0404,
"step": 1680
},
{
"epoch": 4.29923273657289,
"grad_norm": 0.16823487687822244,
"learning_rate": 7.780120427771449e-06,
"loss": 1.0112,
"step": 1681
},
{
"epoch": 4.301790281329923,
"grad_norm": 0.16558738219755195,
"learning_rate": 7.7676853386308e-06,
"loss": 1.0605,
"step": 1682
},
{
"epoch": 4.304347826086957,
"grad_norm": 0.17794613732094552,
"learning_rate": 7.755253881326535e-06,
"loss": 1.0371,
"step": 1683
},
{
"epoch": 4.30690537084399,
"grad_norm": 0.19300577747925785,
"learning_rate": 7.742826076083848e-06,
"loss": 1.06,
"step": 1684
},
{
"epoch": 4.309462915601023,
"grad_norm": 0.16066023211525512,
"learning_rate": 7.730401943122007e-06,
"loss": 1.0084,
"step": 1685
},
{
"epoch": 4.312020460358056,
"grad_norm": 0.1947539405327399,
"learning_rate": 7.717981502654297e-06,
"loss": 1.0418,
"step": 1686
},
{
"epoch": 4.31457800511509,
"grad_norm": 0.16039175830465094,
"learning_rate": 7.705564774888001e-06,
"loss": 1.0039,
"step": 1687
},
{
"epoch": 4.3171355498721224,
"grad_norm": 0.18746085529738926,
"learning_rate": 7.693151780024354e-06,
"loss": 1.0041,
"step": 1688
},
{
"epoch": 4.319693094629156,
"grad_norm": 0.17014035483962622,
"learning_rate": 7.680742538258524e-06,
"loss": 1.0087,
"step": 1689
},
{
"epoch": 4.322250639386189,
"grad_norm": 0.19178845859382257,
"learning_rate": 7.668337069779577e-06,
"loss": 1.0716,
"step": 1690
},
{
"epoch": 4.324808184143222,
"grad_norm": 0.16691270419041054,
"learning_rate": 7.655935394770425e-06,
"loss": 1.0185,
"step": 1691
},
{
"epoch": 4.327365728900256,
"grad_norm": 0.17518851447109943,
"learning_rate": 7.643537533407828e-06,
"loss": 1.0173,
"step": 1692
},
{
"epoch": 4.329923273657289,
"grad_norm": 0.16145421958943196,
"learning_rate": 7.631143505862325e-06,
"loss": 1.0351,
"step": 1693
},
{
"epoch": 4.332480818414322,
"grad_norm": 0.37204295825399436,
"learning_rate": 7.618753332298219e-06,
"loss": 1.0303,
"step": 1694
},
{
"epoch": 4.335038363171355,
"grad_norm": 0.15830617963945456,
"learning_rate": 7.606367032873562e-06,
"loss": 1.0129,
"step": 1695
},
{
"epoch": 4.337595907928389,
"grad_norm": 0.18979652677231215,
"learning_rate": 7.593984627740075e-06,
"loss": 1.0526,
"step": 1696
},
{
"epoch": 4.340153452685422,
"grad_norm": 0.1876359842591056,
"learning_rate": 7.5816061370431674e-06,
"loss": 1.0181,
"step": 1697
},
{
"epoch": 4.342710997442455,
"grad_norm": 0.18251068037823034,
"learning_rate": 7.569231580921858e-06,
"loss": 0.996,
"step": 1698
},
{
"epoch": 4.345268542199489,
"grad_norm": 0.17542644898051862,
"learning_rate": 7.556860979508791e-06,
"loss": 1.0301,
"step": 1699
},
{
"epoch": 4.3478260869565215,
"grad_norm": 0.1927803590994827,
"learning_rate": 7.544494352930145e-06,
"loss": 1.03,
"step": 1700
},
{
"epoch": 4.350383631713555,
"grad_norm": 0.16917148556319608,
"learning_rate": 7.532131721305659e-06,
"loss": 0.9895,
"step": 1701
},
{
"epoch": 4.352941176470588,
"grad_norm": 0.18346223176780307,
"learning_rate": 7.519773104748562e-06,
"loss": 1.0428,
"step": 1702
},
{
"epoch": 4.3554987212276215,
"grad_norm": 0.1628922532881499,
"learning_rate": 7.507418523365542e-06,
"loss": 1.058,
"step": 1703
},
{
"epoch": 4.358056265984655,
"grad_norm": 0.1876763139643933,
"learning_rate": 7.495067997256742e-06,
"loss": 1.0112,
"step": 1704
},
{
"epoch": 4.360613810741688,
"grad_norm": 0.15693274545823557,
"learning_rate": 7.482721546515683e-06,
"loss": 1.0281,
"step": 1705
},
{
"epoch": 4.3631713554987215,
"grad_norm": 0.18630090934243648,
"learning_rate": 7.47037919122928e-06,
"loss": 1.0418,
"step": 1706
},
{
"epoch": 4.365728900255754,
"grad_norm": 0.16500214550907966,
"learning_rate": 7.458040951477763e-06,
"loss": 1.0279,
"step": 1707
},
{
"epoch": 4.368286445012788,
"grad_norm": 0.18494529984039387,
"learning_rate": 7.4457068473346836e-06,
"loss": 1.0155,
"step": 1708
},
{
"epoch": 4.370843989769821,
"grad_norm": 0.19216574362796557,
"learning_rate": 7.43337689886686e-06,
"loss": 1.0423,
"step": 1709
},
{
"epoch": 4.373401534526854,
"grad_norm": 0.16751025476175924,
"learning_rate": 7.42105112613434e-06,
"loss": 1.0317,
"step": 1710
},
{
"epoch": 4.375959079283888,
"grad_norm": 0.20151154222401438,
"learning_rate": 7.408729549190393e-06,
"loss": 1.0536,
"step": 1711
},
{
"epoch": 4.378516624040921,
"grad_norm": 0.18065737789912834,
"learning_rate": 7.3964121880814445e-06,
"loss": 1.0549,
"step": 1712
},
{
"epoch": 4.381074168797954,
"grad_norm": 0.17160881413407147,
"learning_rate": 7.3840990628470824e-06,
"loss": 1.0168,
"step": 1713
},
{
"epoch": 4.383631713554987,
"grad_norm": 0.1786512550850061,
"learning_rate": 7.371790193519979e-06,
"loss": 1.0435,
"step": 1714
},
{
"epoch": 4.3861892583120206,
"grad_norm": 0.19232717850899114,
"learning_rate": 7.359485600125904e-06,
"loss": 1.0389,
"step": 1715
},
{
"epoch": 4.388746803069053,
"grad_norm": 0.18440121677046997,
"learning_rate": 7.347185302683662e-06,
"loss": 1.0264,
"step": 1716
},
{
"epoch": 4.391304347826087,
"grad_norm": 0.19371415512946702,
"learning_rate": 7.334889321205063e-06,
"loss": 1.0622,
"step": 1717
},
{
"epoch": 4.3938618925831205,
"grad_norm": 0.19249478474991424,
"learning_rate": 7.322597675694904e-06,
"loss": 1.0029,
"step": 1718
},
{
"epoch": 4.396419437340153,
"grad_norm": 0.19009338152933727,
"learning_rate": 7.31031038615092e-06,
"loss": 1.0165,
"step": 1719
},
{
"epoch": 4.398976982097187,
"grad_norm": 0.18669974928276975,
"learning_rate": 7.298027472563768e-06,
"loss": 1.0357,
"step": 1720
},
{
"epoch": 4.40153452685422,
"grad_norm": 0.1650051526675111,
"learning_rate": 7.285748954916973e-06,
"loss": 1.0562,
"step": 1721
},
{
"epoch": 4.404092071611253,
"grad_norm": 0.1917534223305165,
"learning_rate": 7.273474853186922e-06,
"loss": 1.0409,
"step": 1722
},
{
"epoch": 4.406649616368286,
"grad_norm": 0.17737384077233112,
"learning_rate": 7.261205187342809e-06,
"loss": 1.0464,
"step": 1723
},
{
"epoch": 4.40920716112532,
"grad_norm": 0.17939864900718247,
"learning_rate": 7.248939977346612e-06,
"loss": 1.0153,
"step": 1724
},
{
"epoch": 4.411764705882353,
"grad_norm": 0.16822250340936998,
"learning_rate": 7.236679243153062e-06,
"loss": 1.0274,
"step": 1725
},
{
"epoch": 4.414322250639386,
"grad_norm": 0.2012483436702938,
"learning_rate": 7.224423004709607e-06,
"loss": 1.0302,
"step": 1726
},
{
"epoch": 4.41687979539642,
"grad_norm": 0.16437642340196237,
"learning_rate": 7.212171281956377e-06,
"loss": 1.0173,
"step": 1727
},
{
"epoch": 4.419437340153452,
"grad_norm": 0.18420316116672247,
"learning_rate": 7.199924094826167e-06,
"loss": 1.0154,
"step": 1728
},
{
"epoch": 4.421994884910486,
"grad_norm": 0.17063629208523548,
"learning_rate": 7.187681463244377e-06,
"loss": 1.0252,
"step": 1729
},
{
"epoch": 4.42455242966752,
"grad_norm": 0.2071747152600751,
"learning_rate": 7.175443407129008e-06,
"loss": 1.0643,
"step": 1730
},
{
"epoch": 4.427109974424552,
"grad_norm": 0.1596268627900996,
"learning_rate": 7.163209946390608e-06,
"loss": 1.0094,
"step": 1731
},
{
"epoch": 4.429667519181586,
"grad_norm": 0.17222832212411637,
"learning_rate": 7.1509811009322574e-06,
"loss": 1.0011,
"step": 1732
},
{
"epoch": 4.432225063938619,
"grad_norm": 0.18768984848570255,
"learning_rate": 7.138756890649516e-06,
"loss": 1.0344,
"step": 1733
},
{
"epoch": 4.434782608695652,
"grad_norm": 0.20394581557700622,
"learning_rate": 7.126537335430417e-06,
"loss": 1.0187,
"step": 1734
},
{
"epoch": 4.437340153452685,
"grad_norm": 0.1930227044611592,
"learning_rate": 7.1143224551554115e-06,
"loss": 1.0391,
"step": 1735
},
{
"epoch": 4.439897698209719,
"grad_norm": 0.19780011138369127,
"learning_rate": 7.102112269697341e-06,
"loss": 1.0599,
"step": 1736
},
{
"epoch": 4.442455242966752,
"grad_norm": 0.18641195549148987,
"learning_rate": 7.08990679892142e-06,
"loss": 1.0205,
"step": 1737
},
{
"epoch": 4.445012787723785,
"grad_norm": 0.1745033043017393,
"learning_rate": 7.077706062685181e-06,
"loss": 1.0254,
"step": 1738
},
{
"epoch": 4.447570332480819,
"grad_norm": 0.1875404190434515,
"learning_rate": 7.065510080838465e-06,
"loss": 1.0375,
"step": 1739
},
{
"epoch": 4.450127877237851,
"grad_norm": 0.17560201588299784,
"learning_rate": 7.053318873223365e-06,
"loss": 0.9962,
"step": 1740
},
{
"epoch": 4.452685421994885,
"grad_norm": 0.16337995441327988,
"learning_rate": 7.041132459674216e-06,
"loss": 1.0151,
"step": 1741
},
{
"epoch": 4.455242966751918,
"grad_norm": 0.17910647034147473,
"learning_rate": 7.028950860017555e-06,
"loss": 1.059,
"step": 1742
},
{
"epoch": 4.457800511508951,
"grad_norm": 0.1645714876947052,
"learning_rate": 7.016774094072077e-06,
"loss": 1.0151,
"step": 1743
},
{
"epoch": 4.460358056265985,
"grad_norm": 0.18052975261895468,
"learning_rate": 7.004602181648626e-06,
"loss": 1.0226,
"step": 1744
},
{
"epoch": 4.462915601023018,
"grad_norm": 0.15506591744701947,
"learning_rate": 6.992435142550133e-06,
"loss": 1.0315,
"step": 1745
},
{
"epoch": 4.465473145780051,
"grad_norm": 0.18883014972610887,
"learning_rate": 6.980272996571617e-06,
"loss": 1.035,
"step": 1746
},
{
"epoch": 4.468030690537084,
"grad_norm": 0.17244955302277767,
"learning_rate": 6.968115763500127e-06,
"loss": 1.0212,
"step": 1747
},
{
"epoch": 4.470588235294118,
"grad_norm": 0.17237420999484432,
"learning_rate": 6.95596346311472e-06,
"loss": 1.0262,
"step": 1748
},
{
"epoch": 4.4731457800511505,
"grad_norm": 0.18044664054131004,
"learning_rate": 6.943816115186432e-06,
"loss": 1.0285,
"step": 1749
},
{
"epoch": 4.475703324808184,
"grad_norm": 0.16838623145296286,
"learning_rate": 6.931673739478235e-06,
"loss": 1.0526,
"step": 1750
},
{
"epoch": 4.478260869565218,
"grad_norm": 0.16324922887275686,
"learning_rate": 6.919536355745018e-06,
"loss": 1.0174,
"step": 1751
},
{
"epoch": 4.4808184143222505,
"grad_norm": 0.16440559510930122,
"learning_rate": 6.907403983733543e-06,
"loss": 1.035,
"step": 1752
},
{
"epoch": 4.483375959079284,
"grad_norm": 0.15720327328308067,
"learning_rate": 6.895276643182423e-06,
"loss": 1.047,
"step": 1753
},
{
"epoch": 4.485933503836317,
"grad_norm": 0.16163765669193314,
"learning_rate": 6.883154353822079e-06,
"loss": 1.0465,
"step": 1754
},
{
"epoch": 4.4884910485933505,
"grad_norm": 0.17497015050920636,
"learning_rate": 6.871037135374722e-06,
"loss": 1.0184,
"step": 1755
},
{
"epoch": 4.491048593350383,
"grad_norm": 0.15908864283642854,
"learning_rate": 6.858925007554308e-06,
"loss": 1.0307,
"step": 1756
},
{
"epoch": 4.493606138107417,
"grad_norm": 0.18008191707505186,
"learning_rate": 6.8468179900665095e-06,
"loss": 1.0363,
"step": 1757
},
{
"epoch": 4.4961636828644505,
"grad_norm": 0.1854747706459379,
"learning_rate": 6.834716102608689e-06,
"loss": 1.0083,
"step": 1758
},
{
"epoch": 4.498721227621483,
"grad_norm": 0.1919413504278039,
"learning_rate": 6.8226193648698605e-06,
"loss": 0.996,
"step": 1759
},
{
"epoch": 4.501278772378517,
"grad_norm": 0.16472038994778412,
"learning_rate": 6.810527796530655e-06,
"loss": 1.0476,
"step": 1760
},
{
"epoch": 4.5038363171355495,
"grad_norm": 0.1877483916121461,
"learning_rate": 6.798441417263311e-06,
"loss": 1.042,
"step": 1761
},
{
"epoch": 4.506393861892583,
"grad_norm": 0.1524530347847294,
"learning_rate": 6.786360246731595e-06,
"loss": 1.0535,
"step": 1762
},
{
"epoch": 4.508951406649617,
"grad_norm": 0.16736289193940437,
"learning_rate": 6.774284304590832e-06,
"loss": 1.0384,
"step": 1763
},
{
"epoch": 4.5115089514066495,
"grad_norm": 0.1509168260512166,
"learning_rate": 6.762213610487813e-06,
"loss": 1.0124,
"step": 1764
},
{
"epoch": 4.514066496163683,
"grad_norm": 0.15987500159184168,
"learning_rate": 6.75014818406081e-06,
"loss": 1.0282,
"step": 1765
},
{
"epoch": 4.516624040920716,
"grad_norm": 0.16208604821494524,
"learning_rate": 6.7380880449395105e-06,
"loss": 1.017,
"step": 1766
},
{
"epoch": 4.5191815856777495,
"grad_norm": 0.1750240175749838,
"learning_rate": 6.726033212745009e-06,
"loss": 1.0448,
"step": 1767
},
{
"epoch": 4.521739130434782,
"grad_norm": 0.17627152188563489,
"learning_rate": 6.713983707089773e-06,
"loss": 1.0431,
"step": 1768
},
{
"epoch": 4.524296675191816,
"grad_norm": 0.172403571140956,
"learning_rate": 6.7019395475775805e-06,
"loss": 1.0014,
"step": 1769
},
{
"epoch": 4.526854219948849,
"grad_norm": 0.16551799261888245,
"learning_rate": 6.6899007538035376e-06,
"loss": 1.0277,
"step": 1770
},
{
"epoch": 4.529411764705882,
"grad_norm": 0.17935995088209722,
"learning_rate": 6.6778673453539984e-06,
"loss": 1.0214,
"step": 1771
},
{
"epoch": 4.531969309462916,
"grad_norm": 0.14762155206935834,
"learning_rate": 6.66583934180658e-06,
"loss": 1.0254,
"step": 1772
},
{
"epoch": 4.534526854219949,
"grad_norm": 0.18205952935739028,
"learning_rate": 6.653816762730079e-06,
"loss": 1.0128,
"step": 1773
},
{
"epoch": 4.537084398976982,
"grad_norm": 0.16531567285520288,
"learning_rate": 6.641799627684481e-06,
"loss": 1.0117,
"step": 1774
},
{
"epoch": 4.539641943734015,
"grad_norm": 0.1761641546294807,
"learning_rate": 6.629787956220924e-06,
"loss": 1.0047,
"step": 1775
},
{
"epoch": 4.542199488491049,
"grad_norm": 0.16044890357588265,
"learning_rate": 6.617781767881635e-06,
"loss": 1.0193,
"step": 1776
},
{
"epoch": 4.544757033248082,
"grad_norm": 0.159801416179025,
"learning_rate": 6.6057810821999406e-06,
"loss": 1.0344,
"step": 1777
},
{
"epoch": 4.547314578005115,
"grad_norm": 0.18194045342283055,
"learning_rate": 6.593785918700197e-06,
"loss": 1.046,
"step": 1778
},
{
"epoch": 4.549872122762149,
"grad_norm": 0.15701008924351048,
"learning_rate": 6.581796296897795e-06,
"loss": 1.0264,
"step": 1779
},
{
"epoch": 4.552429667519181,
"grad_norm": 0.16610935282488204,
"learning_rate": 6.569812236299089e-06,
"loss": 1.0207,
"step": 1780
},
{
"epoch": 4.554987212276215,
"grad_norm": 0.15940091408671517,
"learning_rate": 6.557833756401404e-06,
"loss": 1.049,
"step": 1781
},
{
"epoch": 4.557544757033249,
"grad_norm": 0.16618353240025538,
"learning_rate": 6.545860876692979e-06,
"loss": 1.0266,
"step": 1782
},
{
"epoch": 4.560102301790281,
"grad_norm": 0.17022750553375388,
"learning_rate": 6.533893616652932e-06,
"loss": 1.0791,
"step": 1783
},
{
"epoch": 4.562659846547315,
"grad_norm": 0.17223278557669286,
"learning_rate": 6.521931995751258e-06,
"loss": 1.001,
"step": 1784
},
{
"epoch": 4.565217391304348,
"grad_norm": 0.18588830803208972,
"learning_rate": 6.509976033448755e-06,
"loss": 1.0029,
"step": 1785
},
{
"epoch": 4.567774936061381,
"grad_norm": 0.15803052054999583,
"learning_rate": 6.498025749197036e-06,
"loss": 1.0085,
"step": 1786
},
{
"epoch": 4.570332480818414,
"grad_norm": 0.17758373561683846,
"learning_rate": 6.486081162438458e-06,
"loss": 1.0215,
"step": 1787
},
{
"epoch": 4.572890025575448,
"grad_norm": 0.1675050184516244,
"learning_rate": 6.4741422926061225e-06,
"loss": 1.0101,
"step": 1788
},
{
"epoch": 4.57544757033248,
"grad_norm": 0.1802049784719144,
"learning_rate": 6.462209159123825e-06,
"loss": 1.0594,
"step": 1789
},
{
"epoch": 4.578005115089514,
"grad_norm": 0.15407960949128488,
"learning_rate": 6.450281781406022e-06,
"loss": 1.0351,
"step": 1790
},
{
"epoch": 4.580562659846548,
"grad_norm": 0.17251700051840302,
"learning_rate": 6.438360178857818e-06,
"loss": 1.0237,
"step": 1791
},
{
"epoch": 4.58312020460358,
"grad_norm": 0.17736986767063925,
"learning_rate": 6.426444370874906e-06,
"loss": 1.0262,
"step": 1792
},
{
"epoch": 4.585677749360614,
"grad_norm": 0.18476336736016494,
"learning_rate": 6.414534376843566e-06,
"loss": 1.018,
"step": 1793
},
{
"epoch": 4.588235294117647,
"grad_norm": 0.17911429354068129,
"learning_rate": 6.402630216140618e-06,
"loss": 1.0286,
"step": 1794
},
{
"epoch": 4.59079283887468,
"grad_norm": 0.17311984725832297,
"learning_rate": 6.39073190813338e-06,
"loss": 1.0103,
"step": 1795
},
{
"epoch": 4.593350383631714,
"grad_norm": 0.1621278479186866,
"learning_rate": 6.37883947217966e-06,
"loss": 1.0228,
"step": 1796
},
{
"epoch": 4.595907928388747,
"grad_norm": 0.18444591716270403,
"learning_rate": 6.366952927627703e-06,
"loss": 1.0306,
"step": 1797
},
{
"epoch": 4.59846547314578,
"grad_norm": 0.1659804117379894,
"learning_rate": 6.355072293816178e-06,
"loss": 1.0072,
"step": 1798
},
{
"epoch": 4.601023017902813,
"grad_norm": 0.16571291930690385,
"learning_rate": 6.34319759007413e-06,
"loss": 1.0122,
"step": 1799
},
{
"epoch": 4.603580562659847,
"grad_norm": 0.1720471422264511,
"learning_rate": 6.331328835720961e-06,
"loss": 1.0465,
"step": 1800
},
{
"epoch": 4.6061381074168795,
"grad_norm": 0.16256427527474918,
"learning_rate": 6.319466050066395e-06,
"loss": 1.0069,
"step": 1801
},
{
"epoch": 4.608695652173913,
"grad_norm": 0.16289290458169317,
"learning_rate": 6.307609252410438e-06,
"loss": 0.9955,
"step": 1802
},
{
"epoch": 4.611253196930946,
"grad_norm": 0.16420344005471815,
"learning_rate": 6.295758462043362e-06,
"loss": 1.021,
"step": 1803
},
{
"epoch": 4.6138107416879794,
"grad_norm": 0.16431618715461352,
"learning_rate": 6.283913698245659e-06,
"loss": 0.9887,
"step": 1804
},
{
"epoch": 4.616368286445013,
"grad_norm": 0.162477757683666,
"learning_rate": 6.272074980288021e-06,
"loss": 1.0315,
"step": 1805
},
{
"epoch": 4.618925831202046,
"grad_norm": 0.1420949863331362,
"learning_rate": 6.2602423274313e-06,
"loss": 0.9946,
"step": 1806
},
{
"epoch": 4.621483375959079,
"grad_norm": 0.1617352765159284,
"learning_rate": 6.248415758926485e-06,
"loss": 1.0247,
"step": 1807
},
{
"epoch": 4.624040920716112,
"grad_norm": 0.14727458038419122,
"learning_rate": 6.236595294014662e-06,
"loss": 1.0695,
"step": 1808
},
{
"epoch": 4.626598465473146,
"grad_norm": 0.15513852752076332,
"learning_rate": 6.22478095192699e-06,
"loss": 1.0361,
"step": 1809
},
{
"epoch": 4.629156010230179,
"grad_norm": 0.15023148854538287,
"learning_rate": 6.212972751884663e-06,
"loss": 1.0263,
"step": 1810
},
{
"epoch": 4.631713554987212,
"grad_norm": 0.16087300720694614,
"learning_rate": 6.201170713098883e-06,
"loss": 1.0248,
"step": 1811
},
{
"epoch": 4.634271099744246,
"grad_norm": 0.15834981601790443,
"learning_rate": 6.189374854770832e-06,
"loss": 1.053,
"step": 1812
},
{
"epoch": 4.6368286445012785,
"grad_norm": 0.1573655447598696,
"learning_rate": 6.177585196091631e-06,
"loss": 0.9904,
"step": 1813
},
{
"epoch": 4.639386189258312,
"grad_norm": 0.158683133829273,
"learning_rate": 6.16580175624232e-06,
"loss": 1.0595,
"step": 1814
},
{
"epoch": 4.641943734015345,
"grad_norm": 0.1597812398342448,
"learning_rate": 6.15402455439382e-06,
"loss": 1.0517,
"step": 1815
},
{
"epoch": 4.6445012787723785,
"grad_norm": 0.15551450371650033,
"learning_rate": 6.142253609706898e-06,
"loss": 1.054,
"step": 1816
},
{
"epoch": 4.647058823529412,
"grad_norm": 0.19632917660508345,
"learning_rate": 6.130488941332151e-06,
"loss": 1.0512,
"step": 1817
},
{
"epoch": 4.649616368286445,
"grad_norm": 0.15643968941800954,
"learning_rate": 6.118730568409951e-06,
"loss": 1.039,
"step": 1818
},
{
"epoch": 4.6521739130434785,
"grad_norm": 0.20652844984094032,
"learning_rate": 6.106978510070443e-06,
"loss": 1.0129,
"step": 1819
},
{
"epoch": 4.654731457800511,
"grad_norm": 0.15097637750201956,
"learning_rate": 6.095232785433485e-06,
"loss": 1.0003,
"step": 1820
},
{
"epoch": 4.657289002557545,
"grad_norm": 0.20892906717171159,
"learning_rate": 6.083493413608639e-06,
"loss": 1.0032,
"step": 1821
},
{
"epoch": 4.659846547314578,
"grad_norm": 0.14676895460609313,
"learning_rate": 6.0717604136951315e-06,
"loss": 1.0575,
"step": 1822
},
{
"epoch": 4.662404092071611,
"grad_norm": 0.1744598380072282,
"learning_rate": 6.0600338047818155e-06,
"loss": 1.0012,
"step": 1823
},
{
"epoch": 4.664961636828645,
"grad_norm": 0.15898084906509888,
"learning_rate": 6.048313605947153e-06,
"loss": 1.0152,
"step": 1824
},
{
"epoch": 4.667519181585678,
"grad_norm": 0.18500242483627394,
"learning_rate": 6.036599836259175e-06,
"loss": 1.0202,
"step": 1825
},
{
"epoch": 4.670076726342711,
"grad_norm": 0.17586881973502083,
"learning_rate": 6.024892514775451e-06,
"loss": 1.0152,
"step": 1826
},
{
"epoch": 4.672634271099744,
"grad_norm": 0.1751917297897623,
"learning_rate": 6.013191660543063e-06,
"loss": 1.0185,
"step": 1827
},
{
"epoch": 4.675191815856778,
"grad_norm": 0.16539844174921248,
"learning_rate": 6.001497292598566e-06,
"loss": 1.0091,
"step": 1828
},
{
"epoch": 4.677749360613811,
"grad_norm": 0.16305138932194513,
"learning_rate": 5.98980942996797e-06,
"loss": 1.0171,
"step": 1829
},
{
"epoch": 4.680306905370844,
"grad_norm": 0.1978081666622713,
"learning_rate": 5.97812809166669e-06,
"loss": 1.04,
"step": 1830
},
{
"epoch": 4.6828644501278776,
"grad_norm": 0.14529737115947974,
"learning_rate": 5.966453296699541e-06,
"loss": 1.0219,
"step": 1831
},
{
"epoch": 4.68542199488491,
"grad_norm": 0.19132792503166993,
"learning_rate": 5.954785064060678e-06,
"loss": 1.0466,
"step": 1832
},
{
"epoch": 4.687979539641944,
"grad_norm": 0.14925809757481498,
"learning_rate": 5.943123412733587e-06,
"loss": 1.0168,
"step": 1833
},
{
"epoch": 4.690537084398977,
"grad_norm": 0.19480783069632648,
"learning_rate": 5.931468361691053e-06,
"loss": 1.074,
"step": 1834
},
{
"epoch": 4.69309462915601,
"grad_norm": 0.1597024405029427,
"learning_rate": 5.919819929895106e-06,
"loss": 1.0365,
"step": 1835
},
{
"epoch": 4.695652173913043,
"grad_norm": 0.179287834985346,
"learning_rate": 5.9081781362970205e-06,
"loss": 1.0461,
"step": 1836
},
{
"epoch": 4.698209718670077,
"grad_norm": 0.16882218098581764,
"learning_rate": 5.896542999837265e-06,
"loss": 1.0305,
"step": 1837
},
{
"epoch": 4.70076726342711,
"grad_norm": 0.14058129617791865,
"learning_rate": 5.8849145394454806e-06,
"loss": 0.9987,
"step": 1838
},
{
"epoch": 4.703324808184143,
"grad_norm": 0.18349693674349288,
"learning_rate": 5.873292774040442e-06,
"loss": 0.9943,
"step": 1839
},
{
"epoch": 4.705882352941177,
"grad_norm": 0.1610970199108,
"learning_rate": 5.861677722530037e-06,
"loss": 1.0579,
"step": 1840
},
{
"epoch": 4.708439897698209,
"grad_norm": 0.166987113555728,
"learning_rate": 5.850069403811235e-06,
"loss": 1.0181,
"step": 1841
},
{
"epoch": 4.710997442455243,
"grad_norm": 0.1677755864894642,
"learning_rate": 5.8384678367700325e-06,
"loss": 1.0125,
"step": 1842
},
{
"epoch": 4.713554987212277,
"grad_norm": 0.1779899568878102,
"learning_rate": 5.826873040281462e-06,
"loss": 1.0157,
"step": 1843
},
{
"epoch": 4.716112531969309,
"grad_norm": 0.16348039752545065,
"learning_rate": 5.81528503320953e-06,
"loss": 1.0343,
"step": 1844
},
{
"epoch": 4.718670076726343,
"grad_norm": 0.1670971620135551,
"learning_rate": 5.8037038344072e-06,
"loss": 1.0318,
"step": 1845
},
{
"epoch": 4.721227621483376,
"grad_norm": 0.18617223073968917,
"learning_rate": 5.792129462716355e-06,
"loss": 1.0219,
"step": 1846
},
{
"epoch": 4.723785166240409,
"grad_norm": 0.15449905092529612,
"learning_rate": 5.780561936967779e-06,
"loss": 1.0272,
"step": 1847
},
{
"epoch": 4.726342710997442,
"grad_norm": 0.1750868480359856,
"learning_rate": 5.769001275981112e-06,
"loss": 1.0565,
"step": 1848
},
{
"epoch": 4.728900255754476,
"grad_norm": 0.1663229129876114,
"learning_rate": 5.757447498564821e-06,
"loss": 1.0535,
"step": 1849
},
{
"epoch": 4.731457800511509,
"grad_norm": 0.15809631122185844,
"learning_rate": 5.745900623516189e-06,
"loss": 1.021,
"step": 1850
},
{
"epoch": 4.734015345268542,
"grad_norm": 0.16459750473842777,
"learning_rate": 5.734360669621255e-06,
"loss": 1.0248,
"step": 1851
},
{
"epoch": 4.736572890025576,
"grad_norm": 0.15287249372875325,
"learning_rate": 5.722827655654801e-06,
"loss": 1.0156,
"step": 1852
},
{
"epoch": 4.739130434782608,
"grad_norm": 0.1605211421637796,
"learning_rate": 5.711301600380317e-06,
"loss": 1.0569,
"step": 1853
},
{
"epoch": 4.741687979539642,
"grad_norm": 0.14939498740260876,
"learning_rate": 5.699782522549983e-06,
"loss": 1.0509,
"step": 1854
},
{
"epoch": 4.744245524296675,
"grad_norm": 0.16398542522125342,
"learning_rate": 5.688270440904613e-06,
"loss": 1.0273,
"step": 1855
},
{
"epoch": 4.746803069053708,
"grad_norm": 0.16733173044314129,
"learning_rate": 5.6767653741736405e-06,
"loss": 0.9938,
"step": 1856
},
{
"epoch": 4.749360613810742,
"grad_norm": 0.1505426061615439,
"learning_rate": 5.665267341075098e-06,
"loss": 1.0144,
"step": 1857
},
{
"epoch": 4.751918158567775,
"grad_norm": 0.1527851077672571,
"learning_rate": 5.653776360315562e-06,
"loss": 1.0478,
"step": 1858
},
{
"epoch": 4.754475703324808,
"grad_norm": 0.16913240191236387,
"learning_rate": 5.642292450590134e-06,
"loss": 1.0122,
"step": 1859
},
{
"epoch": 4.757033248081841,
"grad_norm": 0.158875356158748,
"learning_rate": 5.630815630582429e-06,
"loss": 1.0413,
"step": 1860
},
{
"epoch": 4.759590792838875,
"grad_norm": 0.14953756040104652,
"learning_rate": 5.61934591896451e-06,
"loss": 1.0337,
"step": 1861
},
{
"epoch": 4.762148337595908,
"grad_norm": 0.17219828313172605,
"learning_rate": 5.60788333439688e-06,
"loss": 1.0287,
"step": 1862
},
{
"epoch": 4.764705882352941,
"grad_norm": 0.1659776610530445,
"learning_rate": 5.596427895528443e-06,
"loss": 1.0443,
"step": 1863
},
{
"epoch": 4.767263427109975,
"grad_norm": 0.1676484186832149,
"learning_rate": 5.584979620996491e-06,
"loss": 1.0489,
"step": 1864
},
{
"epoch": 4.7698209718670075,
"grad_norm": 0.1623795959715509,
"learning_rate": 5.573538529426645e-06,
"loss": 1.0144,
"step": 1865
},
{
"epoch": 4.772378516624041,
"grad_norm": 0.16256260144035772,
"learning_rate": 5.562104639432845e-06,
"loss": 1.0427,
"step": 1866
},
{
"epoch": 4.774936061381074,
"grad_norm": 0.17175961986303814,
"learning_rate": 5.550677969617319e-06,
"loss": 1.0162,
"step": 1867
},
{
"epoch": 4.7774936061381075,
"grad_norm": 0.1542050330321217,
"learning_rate": 5.539258538570544e-06,
"loss": 1.0164,
"step": 1868
},
{
"epoch": 4.78005115089514,
"grad_norm": 0.15918533657676529,
"learning_rate": 5.527846364871219e-06,
"loss": 1.0309,
"step": 1869
},
{
"epoch": 4.782608695652174,
"grad_norm": 0.1403676241793028,
"learning_rate": 5.516441467086231e-06,
"loss": 1.0228,
"step": 1870
},
{
"epoch": 4.7851662404092075,
"grad_norm": 0.14773251181856192,
"learning_rate": 5.505043863770646e-06,
"loss": 1.0734,
"step": 1871
},
{
"epoch": 4.78772378516624,
"grad_norm": 0.16196858898805197,
"learning_rate": 5.493653573467647e-06,
"loss": 1.0048,
"step": 1872
},
{
"epoch": 4.790281329923274,
"grad_norm": 0.15355301379517172,
"learning_rate": 5.4822706147085205e-06,
"loss": 1.0125,
"step": 1873
},
{
"epoch": 4.792838874680307,
"grad_norm": 0.18982539717495267,
"learning_rate": 5.470895006012637e-06,
"loss": 0.9959,
"step": 1874
},
{
"epoch": 4.79539641943734,
"grad_norm": 0.1573171337655545,
"learning_rate": 5.459526765887397e-06,
"loss": 1.0297,
"step": 1875
},
{
"epoch": 4.797953964194374,
"grad_norm": 0.16351573968402464,
"learning_rate": 5.448165912828214e-06,
"loss": 0.9945,
"step": 1876
},
{
"epoch": 4.8005115089514065,
"grad_norm": 0.18629349709548856,
"learning_rate": 5.4368124653184835e-06,
"loss": 1.0363,
"step": 1877
},
{
"epoch": 4.80306905370844,
"grad_norm": 0.17008978855695026,
"learning_rate": 5.4254664418295634e-06,
"loss": 1.0273,
"step": 1878
},
{
"epoch": 4.805626598465473,
"grad_norm": 0.16524085689648021,
"learning_rate": 5.414127860820719e-06,
"loss": 1.0098,
"step": 1879
},
{
"epoch": 4.8081841432225065,
"grad_norm": 0.18739927868121126,
"learning_rate": 5.402796740739109e-06,
"loss": 1.0057,
"step": 1880
},
{
"epoch": 4.810741687979539,
"grad_norm": 0.17551431540439197,
"learning_rate": 5.391473100019767e-06,
"loss": 1.0378,
"step": 1881
},
{
"epoch": 4.813299232736573,
"grad_norm": 0.20076574431883742,
"learning_rate": 5.380156957085536e-06,
"loss": 1.0054,
"step": 1882
},
{
"epoch": 4.8158567774936065,
"grad_norm": 0.1633457331284817,
"learning_rate": 5.3688483303470895e-06,
"loss": 0.9945,
"step": 1883
},
{
"epoch": 4.818414322250639,
"grad_norm": 0.18981752589117254,
"learning_rate": 5.3575472382028386e-06,
"loss": 1.018,
"step": 1884
},
{
"epoch": 4.820971867007673,
"grad_norm": 0.1796254125656967,
"learning_rate": 5.346253699038966e-06,
"loss": 1.0175,
"step": 1885
},
{
"epoch": 4.823529411764706,
"grad_norm": 0.18612504881053146,
"learning_rate": 5.334967731229348e-06,
"loss": 1.0343,
"step": 1886
},
{
"epoch": 4.826086956521739,
"grad_norm": 0.1896503989682664,
"learning_rate": 5.323689353135546e-06,
"loss": 1.033,
"step": 1887
},
{
"epoch": 4.828644501278772,
"grad_norm": 0.17351769644886408,
"learning_rate": 5.312418583106784e-06,
"loss": 1.0341,
"step": 1888
},
{
"epoch": 4.831202046035806,
"grad_norm": 0.19813048664100952,
"learning_rate": 5.301155439479893e-06,
"loss": 1.0189,
"step": 1889
},
{
"epoch": 4.833759590792839,
"grad_norm": 0.17414587401870055,
"learning_rate": 5.289899940579315e-06,
"loss": 0.9979,
"step": 1890
},
{
"epoch": 4.836317135549872,
"grad_norm": 0.17954394790720937,
"learning_rate": 5.278652104717026e-06,
"loss": 1.033,
"step": 1891
},
{
"epoch": 4.838874680306906,
"grad_norm": 0.18225354012614833,
"learning_rate": 5.267411950192558e-06,
"loss": 1.0006,
"step": 1892
},
{
"epoch": 4.841432225063938,
"grad_norm": 0.19171250300846782,
"learning_rate": 5.256179495292953e-06,
"loss": 0.976,
"step": 1893
},
{
"epoch": 4.843989769820972,
"grad_norm": 0.16560762200333132,
"learning_rate": 5.244954758292691e-06,
"loss": 1.03,
"step": 1894
},
{
"epoch": 4.846547314578006,
"grad_norm": 0.17384349031638302,
"learning_rate": 5.233737757453733e-06,
"loss": 1.017,
"step": 1895
},
{
"epoch": 4.849104859335038,
"grad_norm": 0.18200737855014837,
"learning_rate": 5.222528511025429e-06,
"loss": 1.0544,
"step": 1896
},
{
"epoch": 4.851662404092072,
"grad_norm": 0.1674383880489774,
"learning_rate": 5.2113270372445334e-06,
"loss": 1.0199,
"step": 1897
},
{
"epoch": 4.854219948849105,
"grad_norm": 0.16206185822222566,
"learning_rate": 5.200133354335129e-06,
"loss": 1.0297,
"step": 1898
},
{
"epoch": 4.856777493606138,
"grad_norm": 0.16330979230562037,
"learning_rate": 5.188947480508644e-06,
"loss": 1.0618,
"step": 1899
},
{
"epoch": 4.859335038363171,
"grad_norm": 0.1641289208809162,
"learning_rate": 5.177769433963801e-06,
"loss": 1.0095,
"step": 1900
},
{
"epoch": 4.861892583120205,
"grad_norm": 0.16857653947800838,
"learning_rate": 5.166599232886579e-06,
"loss": 1.0132,
"step": 1901
},
{
"epoch": 4.864450127877237,
"grad_norm": 0.15123752972525892,
"learning_rate": 5.155436895450197e-06,
"loss": 1.0231,
"step": 1902
},
{
"epoch": 4.867007672634271,
"grad_norm": 0.18007827051394826,
"learning_rate": 5.144282439815075e-06,
"loss": 1.0299,
"step": 1903
},
{
"epoch": 4.869565217391305,
"grad_norm": 0.17145491388315698,
"learning_rate": 5.133135884128828e-06,
"loss": 1.0426,
"step": 1904
},
{
"epoch": 4.872122762148337,
"grad_norm": 0.15111451411798363,
"learning_rate": 5.121997246526188e-06,
"loss": 1.0335,
"step": 1905
},
{
"epoch": 4.874680306905371,
"grad_norm": 0.17562740075351813,
"learning_rate": 5.110866545129031e-06,
"loss": 1.0226,
"step": 1906
},
{
"epoch": 4.877237851662404,
"grad_norm": 0.14883986205754957,
"learning_rate": 5.099743798046315e-06,
"loss": 1.03,
"step": 1907
},
{
"epoch": 4.879795396419437,
"grad_norm": 0.16425606815927463,
"learning_rate": 5.088629023374052e-06,
"loss": 1.0524,
"step": 1908
},
{
"epoch": 4.882352941176471,
"grad_norm": 0.15699998164150683,
"learning_rate": 5.0775222391952826e-06,
"loss": 1.0598,
"step": 1909
},
{
"epoch": 4.884910485933504,
"grad_norm": 0.16747367530556498,
"learning_rate": 5.06642346358005e-06,
"loss": 1.0197,
"step": 1910
},
{
"epoch": 4.887468030690537,
"grad_norm": 0.19072243056188606,
"learning_rate": 5.055332714585372e-06,
"loss": 1.001,
"step": 1911
},
{
"epoch": 4.89002557544757,
"grad_norm": 0.16853967810789172,
"learning_rate": 5.044250010255202e-06,
"loss": 1.0432,
"step": 1912
},
{
"epoch": 4.892583120204604,
"grad_norm": 0.17828385119329374,
"learning_rate": 5.033175368620406e-06,
"loss": 1.0314,
"step": 1913
},
{
"epoch": 4.8951406649616365,
"grad_norm": 0.15062414843555882,
"learning_rate": 5.022108807698735e-06,
"loss": 1.0358,
"step": 1914
},
{
"epoch": 4.89769820971867,
"grad_norm": 0.17399854674836523,
"learning_rate": 5.0110503454947926e-06,
"loss": 1.0265,
"step": 1915
},
{
"epoch": 4.900255754475703,
"grad_norm": 0.16505478849391259,
"learning_rate": 5.000000000000003e-06,
"loss": 1.0495,
"step": 1916
},
{
"epoch": 4.9028132992327365,
"grad_norm": 0.1446909805445552,
"learning_rate": 4.988957789192583e-06,
"loss": 1.0044,
"step": 1917
},
{
"epoch": 4.90537084398977,
"grad_norm": 0.16047225013403066,
"learning_rate": 4.97792373103753e-06,
"loss": 0.977,
"step": 1918
},
{
"epoch": 4.907928388746803,
"grad_norm": 0.15267602057033672,
"learning_rate": 4.966897843486561e-06,
"loss": 1.0563,
"step": 1919
},
{
"epoch": 4.910485933503836,
"grad_norm": 0.14094891470116488,
"learning_rate": 4.955880144478101e-06,
"loss": 1.0172,
"step": 1920
},
{
"epoch": 4.913043478260869,
"grad_norm": 0.16225336285064607,
"learning_rate": 4.944870651937267e-06,
"loss": 1.0332,
"step": 1921
},
{
"epoch": 4.915601023017903,
"grad_norm": 0.15352807995544615,
"learning_rate": 4.933869383775809e-06,
"loss": 1.0285,
"step": 1922
},
{
"epoch": 4.918158567774936,
"grad_norm": 0.14893755036217834,
"learning_rate": 4.922876357892103e-06,
"loss": 1.0082,
"step": 1923
},
{
"epoch": 4.920716112531969,
"grad_norm": 0.17251988177114058,
"learning_rate": 4.911891592171113e-06,
"loss": 1.0131,
"step": 1924
},
{
"epoch": 4.923273657289003,
"grad_norm": 0.15340872718806947,
"learning_rate": 4.900915104484372e-06,
"loss": 1.0502,
"step": 1925
},
{
"epoch": 4.9258312020460355,
"grad_norm": 0.16259551968874744,
"learning_rate": 4.889946912689936e-06,
"loss": 1.0457,
"step": 1926
},
{
"epoch": 4.928388746803069,
"grad_norm": 0.15432669889294595,
"learning_rate": 4.878987034632361e-06,
"loss": 1.0491,
"step": 1927
},
{
"epoch": 4.930946291560103,
"grad_norm": 0.16399149074989694,
"learning_rate": 4.8680354881426935e-06,
"loss": 1.011,
"step": 1928
},
{
"epoch": 4.9335038363171355,
"grad_norm": 0.17537267004354543,
"learning_rate": 4.857092291038411e-06,
"loss": 1.0356,
"step": 1929
},
{
"epoch": 4.936061381074169,
"grad_norm": 0.15804425089068397,
"learning_rate": 4.846157461123411e-06,
"loss": 1.0556,
"step": 1930
},
{
"epoch": 4.938618925831202,
"grad_norm": 0.1644217524312441,
"learning_rate": 4.8352310161879724e-06,
"loss": 1.0521,
"step": 1931
},
{
"epoch": 4.9411764705882355,
"grad_norm": 0.166490586450367,
"learning_rate": 4.824312974008748e-06,
"loss": 1.0348,
"step": 1932
},
{
"epoch": 4.943734015345268,
"grad_norm": 0.15262614264530625,
"learning_rate": 4.813403352348703e-06,
"loss": 1.003,
"step": 1933
},
{
"epoch": 4.946291560102302,
"grad_norm": 0.16914604106371434,
"learning_rate": 4.8025021689571095e-06,
"loss": 1.0261,
"step": 1934
},
{
"epoch": 4.948849104859335,
"grad_norm": 0.14949420788516232,
"learning_rate": 4.791609441569517e-06,
"loss": 1.013,
"step": 1935
},
{
"epoch": 4.951406649616368,
"grad_norm": 0.18410232609002486,
"learning_rate": 4.780725187907707e-06,
"loss": 1.0211,
"step": 1936
},
{
"epoch": 4.953964194373402,
"grad_norm": 0.14300056243568887,
"learning_rate": 4.769849425679683e-06,
"loss": 1.0222,
"step": 1937
},
{
"epoch": 4.956521739130435,
"grad_norm": 0.17246451645014146,
"learning_rate": 4.758982172579621e-06,
"loss": 0.9967,
"step": 1938
},
{
"epoch": 4.959079283887468,
"grad_norm": 0.17259140226193048,
"learning_rate": 4.748123446287875e-06,
"loss": 1.0321,
"step": 1939
},
{
"epoch": 4.961636828644501,
"grad_norm": 1.1109363534677956,
"learning_rate": 4.737273264470909e-06,
"loss": 1.0923,
"step": 1940
},
{
"epoch": 4.964194373401535,
"grad_norm": 0.17074890567417172,
"learning_rate": 4.726431644781284e-06,
"loss": 1.0245,
"step": 1941
},
{
"epoch": 4.966751918158568,
"grad_norm": 0.15432050773937248,
"learning_rate": 4.715598604857648e-06,
"loss": 1.0378,
"step": 1942
},
{
"epoch": 4.969309462915601,
"grad_norm": 0.15888604747270782,
"learning_rate": 4.704774162324673e-06,
"loss": 1.0287,
"step": 1943
},
{
"epoch": 4.971867007672635,
"grad_norm": 0.17597082523498278,
"learning_rate": 4.6939583347930525e-06,
"loss": 1.0024,
"step": 1944
},
{
"epoch": 4.974424552429667,
"grad_norm": 0.15465610920055028,
"learning_rate": 4.6831511398594574e-06,
"loss": 1.0216,
"step": 1945
},
{
"epoch": 4.976982097186701,
"grad_norm": 0.16914400485984177,
"learning_rate": 4.672352595106525e-06,
"loss": 1.0595,
"step": 1946
},
{
"epoch": 4.979539641943734,
"grad_norm": 0.17772012019293779,
"learning_rate": 4.661562718102808e-06,
"loss": 1.0056,
"step": 1947
},
{
"epoch": 4.982097186700767,
"grad_norm": 0.14226899552306443,
"learning_rate": 4.65078152640276e-06,
"loss": 1.0221,
"step": 1948
},
{
"epoch": 4.9846547314578,
"grad_norm": 0.14866025187075746,
"learning_rate": 4.640009037546711e-06,
"loss": 1.0534,
"step": 1949
},
{
"epoch": 4.987212276214834,
"grad_norm": 0.18309163357147787,
"learning_rate": 4.629245269060826e-06,
"loss": 1.046,
"step": 1950
},
{
"epoch": 4.989769820971867,
"grad_norm": 0.14195791571684566,
"learning_rate": 4.61849023845708e-06,
"loss": 1.0119,
"step": 1951
},
{
"epoch": 4.9923273657289,
"grad_norm": 0.15240227847957083,
"learning_rate": 4.607743963233233e-06,
"loss": 1.0373,
"step": 1952
},
{
"epoch": 4.994884910485934,
"grad_norm": 0.1706260447764414,
"learning_rate": 4.5970064608728085e-06,
"loss": 0.9995,
"step": 1953
},
{
"epoch": 4.997442455242966,
"grad_norm": 0.16263531281395652,
"learning_rate": 4.586277748845056e-06,
"loss": 1.0053,
"step": 1954
},
{
"epoch": 5.0,
"grad_norm": 0.15411495644560275,
"learning_rate": 4.575557844604905e-06,
"loss": 1.0268,
"step": 1955
},
{
"epoch": 5.002557544757034,
"grad_norm": 0.15615925966080388,
"learning_rate": 4.5648467655929815e-06,
"loss": 1.0199,
"step": 1956
},
{
"epoch": 5.005115089514066,
"grad_norm": 0.16045903540647527,
"learning_rate": 4.554144529235537e-06,
"loss": 1.0277,
"step": 1957
},
{
"epoch": 5.0076726342711,
"grad_norm": 0.16031341969126212,
"learning_rate": 4.543451152944438e-06,
"loss": 1.0562,
"step": 1958
},
{
"epoch": 5.010230179028133,
"grad_norm": 0.1429706019310508,
"learning_rate": 4.532766654117146e-06,
"loss": 1.031,
"step": 1959
},
{
"epoch": 5.012787723785166,
"grad_norm": 0.15753846906492294,
"learning_rate": 4.5220910501366635e-06,
"loss": 1.0368,
"step": 1960
},
{
"epoch": 5.015345268542199,
"grad_norm": 0.14579202507979455,
"learning_rate": 4.511424358371544e-06,
"loss": 1.0358,
"step": 1961
},
{
"epoch": 5.017902813299233,
"grad_norm": 0.15694921661063782,
"learning_rate": 4.500766596175813e-06,
"loss": 1.0037,
"step": 1962
},
{
"epoch": 5.020460358056266,
"grad_norm": 0.16268209756361607,
"learning_rate": 4.490117780888994e-06,
"loss": 1.0191,
"step": 1963
},
{
"epoch": 5.023017902813299,
"grad_norm": 0.13601692002794843,
"learning_rate": 4.479477929836039e-06,
"loss": 1.0225,
"step": 1964
},
{
"epoch": 5.025575447570333,
"grad_norm": 0.1513485213042126,
"learning_rate": 4.4688470603273184e-06,
"loss": 0.9987,
"step": 1965
},
{
"epoch": 5.028132992327365,
"grad_norm": 0.14505501997147888,
"learning_rate": 4.458225189658598e-06,
"loss": 1.0244,
"step": 1966
},
{
"epoch": 5.030690537084399,
"grad_norm": 0.15866972934335427,
"learning_rate": 4.447612335110991e-06,
"loss": 1.0147,
"step": 1967
},
{
"epoch": 5.033248081841432,
"grad_norm": 0.15717036214065513,
"learning_rate": 4.43700851395096e-06,
"loss": 1.0056,
"step": 1968
},
{
"epoch": 5.035805626598465,
"grad_norm": 0.15634999112536652,
"learning_rate": 4.426413743430241e-06,
"loss": 1.0486,
"step": 1969
},
{
"epoch": 5.038363171355499,
"grad_norm": 0.1549586768650421,
"learning_rate": 4.415828040785877e-06,
"loss": 1.0046,
"step": 1970
},
{
"epoch": 5.040920716112532,
"grad_norm": 0.1643495461245206,
"learning_rate": 4.405251423240138e-06,
"loss": 1.0158,
"step": 1971
},
{
"epoch": 5.043478260869565,
"grad_norm": 0.14558675280550004,
"learning_rate": 4.3946839080005236e-06,
"loss": 1.0167,
"step": 1972
},
{
"epoch": 5.046035805626598,
"grad_norm": 0.16057769002475886,
"learning_rate": 4.384125512259718e-06,
"loss": 1.0412,
"step": 1973
},
{
"epoch": 5.048593350383632,
"grad_norm": 0.1589654545230765,
"learning_rate": 4.373576253195568e-06,
"loss": 1.0058,
"step": 1974
},
{
"epoch": 5.051150895140665,
"grad_norm": 0.14004326798784272,
"learning_rate": 4.363036147971069e-06,
"loss": 0.9958,
"step": 1975
},
{
"epoch": 5.053708439897698,
"grad_norm": 0.16704739125788623,
"learning_rate": 4.352505213734298e-06,
"loss": 1.0202,
"step": 1976
},
{
"epoch": 5.056265984654732,
"grad_norm": 0.15270263482532218,
"learning_rate": 4.3419834676184395e-06,
"loss": 1.0221,
"step": 1977
},
{
"epoch": 5.0588235294117645,
"grad_norm": 0.15264750560420307,
"learning_rate": 4.331470926741707e-06,
"loss": 1.0264,
"step": 1978
},
{
"epoch": 5.061381074168798,
"grad_norm": 0.1675831575968936,
"learning_rate": 4.320967608207354e-06,
"loss": 1.0256,
"step": 1979
},
{
"epoch": 5.063938618925831,
"grad_norm": 0.15506176173449848,
"learning_rate": 4.3104735291036214e-06,
"loss": 1.0246,
"step": 1980
},
{
"epoch": 5.0664961636828645,
"grad_norm": 0.147438074557832,
"learning_rate": 4.299988706503716e-06,
"loss": 0.9895,
"step": 1981
},
{
"epoch": 5.069053708439898,
"grad_norm": 0.13712823238173896,
"learning_rate": 4.289513157465796e-06,
"loss": 1.0069,
"step": 1982
},
{
"epoch": 5.071611253196931,
"grad_norm": 0.1530445973165712,
"learning_rate": 4.279046899032918e-06,
"loss": 1.028,
"step": 1983
},
{
"epoch": 5.0741687979539645,
"grad_norm": 0.1487111811647309,
"learning_rate": 4.268589948233034e-06,
"loss": 0.9806,
"step": 1984
},
{
"epoch": 5.076726342710997,
"grad_norm": 0.1536495899212468,
"learning_rate": 4.258142322078944e-06,
"loss": 1.0141,
"step": 1985
},
{
"epoch": 5.079283887468031,
"grad_norm": 0.1420705753526825,
"learning_rate": 4.247704037568289e-06,
"loss": 1.0484,
"step": 1986
},
{
"epoch": 5.081841432225064,
"grad_norm": 0.14854933088338998,
"learning_rate": 4.237275111683502e-06,
"loss": 1.0176,
"step": 1987
},
{
"epoch": 5.084398976982097,
"grad_norm": 0.15085396882702742,
"learning_rate": 4.226855561391792e-06,
"loss": 1.0241,
"step": 1988
},
{
"epoch": 5.086956521739131,
"grad_norm": 0.13480571166529362,
"learning_rate": 4.2164454036451185e-06,
"loss": 1.0105,
"step": 1989
},
{
"epoch": 5.089514066496164,
"grad_norm": 0.15439478858765343,
"learning_rate": 4.2060446553801585e-06,
"loss": 1.0571,
"step": 1990
},
{
"epoch": 5.092071611253197,
"grad_norm": 0.14887589003918353,
"learning_rate": 4.195653333518271e-06,
"loss": 1.0309,
"step": 1991
},
{
"epoch": 5.09462915601023,
"grad_norm": 0.14823587280930983,
"learning_rate": 4.1852714549654985e-06,
"loss": 1.0286,
"step": 1992
},
{
"epoch": 5.0971867007672635,
"grad_norm": 0.1502816473196306,
"learning_rate": 4.1748990366125005e-06,
"loss": 1.0092,
"step": 1993
},
{
"epoch": 5.099744245524296,
"grad_norm": 0.13426636004437947,
"learning_rate": 4.164536095334557e-06,
"loss": 1.0055,
"step": 1994
},
{
"epoch": 5.10230179028133,
"grad_norm": 0.14869672831898953,
"learning_rate": 4.154182647991519e-06,
"loss": 1.0492,
"step": 1995
},
{
"epoch": 5.1048593350383635,
"grad_norm": 0.15755018419795028,
"learning_rate": 4.143838711427808e-06,
"loss": 1.0103,
"step": 1996
},
{
"epoch": 5.107416879795396,
"grad_norm": 0.1503017786383216,
"learning_rate": 4.133504302472356e-06,
"loss": 1.0015,
"step": 1997
},
{
"epoch": 5.10997442455243,
"grad_norm": 0.14022700208845976,
"learning_rate": 4.123179437938596e-06,
"loss": 1.0394,
"step": 1998
},
{
"epoch": 5.112531969309463,
"grad_norm": 0.149747082086179,
"learning_rate": 4.112864134624447e-06,
"loss": 1.0406,
"step": 1999
},
{
"epoch": 5.115089514066496,
"grad_norm": 0.15174138196167658,
"learning_rate": 4.102558409312256e-06,
"loss": 1.022,
"step": 2000
},
{
"epoch": 5.117647058823529,
"grad_norm": 0.14846170493390945,
"learning_rate": 4.092262278768797e-06,
"loss": 1.0132,
"step": 2001
},
{
"epoch": 5.120204603580563,
"grad_norm": 0.14541949365283377,
"learning_rate": 4.0819757597452246e-06,
"loss": 1.0328,
"step": 2002
},
{
"epoch": 5.122762148337596,
"grad_norm": 0.16073985913183766,
"learning_rate": 4.0716988689770695e-06,
"loss": 1.0067,
"step": 2003
},
{
"epoch": 5.125319693094629,
"grad_norm": 0.14371815787004755,
"learning_rate": 4.061431623184188e-06,
"loss": 1.0289,
"step": 2004
},
{
"epoch": 5.127877237851663,
"grad_norm": 0.14339076964243316,
"learning_rate": 4.051174039070742e-06,
"loss": 0.9812,
"step": 2005
},
{
"epoch": 5.130434782608695,
"grad_norm": 0.1437711220903366,
"learning_rate": 4.040926133325188e-06,
"loss": 1.0059,
"step": 2006
},
{
"epoch": 5.132992327365729,
"grad_norm": 0.1432806446083087,
"learning_rate": 4.030687922620223e-06,
"loss": 1.0183,
"step": 2007
},
{
"epoch": 5.135549872122763,
"grad_norm": 0.14407049755074497,
"learning_rate": 4.020459423612777e-06,
"loss": 1.0328,
"step": 2008
},
{
"epoch": 5.138107416879795,
"grad_norm": 0.14311456671607106,
"learning_rate": 4.010240652943974e-06,
"loss": 1.0247,
"step": 2009
},
{
"epoch": 5.140664961636829,
"grad_norm": 0.14651674275116736,
"learning_rate": 4.000031627239123e-06,
"loss": 1.0271,
"step": 2010
},
{
"epoch": 5.143222506393862,
"grad_norm": 0.14244659447949104,
"learning_rate": 3.989832363107664e-06,
"loss": 0.9729,
"step": 2011
},
{
"epoch": 5.145780051150895,
"grad_norm": 0.1474525383109307,
"learning_rate": 3.9796428771431625e-06,
"loss": 1.0208,
"step": 2012
},
{
"epoch": 5.148337595907928,
"grad_norm": 0.14684653759057748,
"learning_rate": 3.96946318592328e-06,
"loss": 0.9944,
"step": 2013
},
{
"epoch": 5.150895140664962,
"grad_norm": 0.14793817657477276,
"learning_rate": 3.959293306009734e-06,
"loss": 1.0606,
"step": 2014
},
{
"epoch": 5.153452685421995,
"grad_norm": 0.13847357302909763,
"learning_rate": 3.949133253948284e-06,
"loss": 1.0035,
"step": 2015
},
{
"epoch": 5.156010230179028,
"grad_norm": 0.14747847539008258,
"learning_rate": 3.938983046268695e-06,
"loss": 0.9869,
"step": 2016
},
{
"epoch": 5.158567774936062,
"grad_norm": 0.14511374476416694,
"learning_rate": 3.9288426994847285e-06,
"loss": 1.0238,
"step": 2017
},
{
"epoch": 5.161125319693094,
"grad_norm": 0.15030414965811079,
"learning_rate": 3.918712230094091e-06,
"loss": 1.0521,
"step": 2018
},
{
"epoch": 5.163682864450128,
"grad_norm": 0.14420923408617164,
"learning_rate": 3.908591654578417e-06,
"loss": 0.9878,
"step": 2019
},
{
"epoch": 5.166240409207161,
"grad_norm": 0.1369795797536583,
"learning_rate": 3.89848098940326e-06,
"loss": 1.0203,
"step": 2020
},
{
"epoch": 5.168797953964194,
"grad_norm": 0.15862135307508646,
"learning_rate": 3.888380251018035e-06,
"loss": 1.0112,
"step": 2021
},
{
"epoch": 5.171355498721228,
"grad_norm": 0.13968732984433663,
"learning_rate": 3.878289455856013e-06,
"loss": 1.0589,
"step": 2022
},
{
"epoch": 5.173913043478261,
"grad_norm": 0.14444481777607088,
"learning_rate": 3.868208620334282e-06,
"loss": 1.0065,
"step": 2023
},
{
"epoch": 5.176470588235294,
"grad_norm": 0.14184611750434217,
"learning_rate": 3.858137760853737e-06,
"loss": 1.0189,
"step": 2024
},
{
"epoch": 5.179028132992327,
"grad_norm": 0.14923144029216218,
"learning_rate": 3.84807689379904e-06,
"loss": 1.0052,
"step": 2025
},
{
"epoch": 5.181585677749361,
"grad_norm": 0.15459564247502722,
"learning_rate": 3.838026035538581e-06,
"loss": 0.9946,
"step": 2026
},
{
"epoch": 5.1841432225063935,
"grad_norm": 0.1418795966374483,
"learning_rate": 3.827985202424488e-06,
"loss": 1.0234,
"step": 2027
},
{
"epoch": 5.186700767263427,
"grad_norm": 0.1553154903132494,
"learning_rate": 3.817954410792565e-06,
"loss": 1.0137,
"step": 2028
},
{
"epoch": 5.189258312020461,
"grad_norm": 0.14275503896178632,
"learning_rate": 3.8079336769622834e-06,
"loss": 1.0289,
"step": 2029
},
{
"epoch": 5.1918158567774935,
"grad_norm": 0.13897565956134958,
"learning_rate": 3.7979230172367453e-06,
"loss": 1.0148,
"step": 2030
},
{
"epoch": 5.194373401534527,
"grad_norm": 0.14252828284486727,
"learning_rate": 3.7879224479026745e-06,
"loss": 1.0068,
"step": 2031
},
{
"epoch": 5.19693094629156,
"grad_norm": 0.1517901716492953,
"learning_rate": 3.7779319852303766e-06,
"loss": 1.0572,
"step": 2032
},
{
"epoch": 5.1994884910485935,
"grad_norm": 0.1439259357160915,
"learning_rate": 3.7679516454736977e-06,
"loss": 1.0446,
"step": 2033
},
{
"epoch": 5.202046035805626,
"grad_norm": 0.1371345617669485,
"learning_rate": 3.757981444870035e-06,
"loss": 0.9957,
"step": 2034
},
{
"epoch": 5.20460358056266,
"grad_norm": 0.16004739713130242,
"learning_rate": 3.748021399640279e-06,
"loss": 1.0276,
"step": 2035
},
{
"epoch": 5.207161125319693,
"grad_norm": 0.1441426514349444,
"learning_rate": 3.7380715259888e-06,
"loss": 1.0344,
"step": 2036
},
{
"epoch": 5.209718670076726,
"grad_norm": 0.14152534835692054,
"learning_rate": 3.7281318401034183e-06,
"loss": 0.9949,
"step": 2037
},
{
"epoch": 5.21227621483376,
"grad_norm": 0.1481149663167974,
"learning_rate": 3.718202358155384e-06,
"loss": 1.0545,
"step": 2038
},
{
"epoch": 5.2148337595907925,
"grad_norm": 0.13716666870403715,
"learning_rate": 3.7082830962993497e-06,
"loss": 1.0388,
"step": 2039
},
{
"epoch": 5.217391304347826,
"grad_norm": 0.1427599492035968,
"learning_rate": 3.6983740706733207e-06,
"loss": 0.9945,
"step": 2040
},
{
"epoch": 5.21994884910486,
"grad_norm": 0.14437989757241948,
"learning_rate": 3.688475297398674e-06,
"loss": 1.037,
"step": 2041
},
{
"epoch": 5.2225063938618925,
"grad_norm": 0.1407689885502161,
"learning_rate": 3.6785867925800856e-06,
"loss": 1.0019,
"step": 2042
},
{
"epoch": 5.225063938618926,
"grad_norm": 0.1381622930416597,
"learning_rate": 3.668708572305546e-06,
"loss": 1.0384,
"step": 2043
},
{
"epoch": 5.227621483375959,
"grad_norm": 0.13975927307572164,
"learning_rate": 3.658840652646287e-06,
"loss": 1.0018,
"step": 2044
},
{
"epoch": 5.2301790281329925,
"grad_norm": 0.15578171256673842,
"learning_rate": 3.6489830496568067e-06,
"loss": 1.0221,
"step": 2045
},
{
"epoch": 5.232736572890025,
"grad_norm": 0.14587450260403836,
"learning_rate": 3.639135779374813e-06,
"loss": 1.0462,
"step": 2046
},
{
"epoch": 5.235294117647059,
"grad_norm": 0.14336907869458113,
"learning_rate": 3.6292988578211863e-06,
"loss": 1.0242,
"step": 2047
},
{
"epoch": 5.2378516624040925,
"grad_norm": 0.13614785911809554,
"learning_rate": 3.619472300999992e-06,
"loss": 1.002,
"step": 2048
},
{
"epoch": 5.240409207161125,
"grad_norm": 0.14654873047839187,
"learning_rate": 3.6096561248984186e-06,
"loss": 1.0365,
"step": 2049
},
{
"epoch": 5.242966751918159,
"grad_norm": 0.14832735168435557,
"learning_rate": 3.5998503454867807e-06,
"loss": 1.0206,
"step": 2050
},
{
"epoch": 5.245524296675192,
"grad_norm": 0.15182549845090051,
"learning_rate": 3.5900549787184534e-06,
"loss": 1.0086,
"step": 2051
},
{
"epoch": 5.248081841432225,
"grad_norm": 0.15218834374865772,
"learning_rate": 3.580270040529894e-06,
"loss": 1.0457,
"step": 2052
},
{
"epoch": 5.250639386189258,
"grad_norm": 0.1386445311628316,
"learning_rate": 3.570495546840591e-06,
"loss": 1.0316,
"step": 2053
},
{
"epoch": 5.253196930946292,
"grad_norm": 0.1415172130548022,
"learning_rate": 3.560731513553022e-06,
"loss": 1.033,
"step": 2054
},
{
"epoch": 5.255754475703325,
"grad_norm": 0.134688736061587,
"learning_rate": 3.5509779565526683e-06,
"loss": 1.0341,
"step": 2055
},
{
"epoch": 5.258312020460358,
"grad_norm": 0.14665953403303808,
"learning_rate": 3.5412348917079507e-06,
"loss": 1.0621,
"step": 2056
},
{
"epoch": 5.260869565217392,
"grad_norm": 0.13619183573807261,
"learning_rate": 3.5315023348702325e-06,
"loss": 1.0366,
"step": 2057
},
{
"epoch": 5.263427109974424,
"grad_norm": 0.13658849089622857,
"learning_rate": 3.521780301873773e-06,
"loss": 1.0008,
"step": 2058
},
{
"epoch": 5.265984654731458,
"grad_norm": 0.14630387436677678,
"learning_rate": 3.512068808535707e-06,
"loss": 1.0147,
"step": 2059
},
{
"epoch": 5.268542199488491,
"grad_norm": 0.13734073999332427,
"learning_rate": 3.502367870656035e-06,
"loss": 1.028,
"step": 2060
},
{
"epoch": 5.271099744245524,
"grad_norm": 0.1355644028489033,
"learning_rate": 3.492677504017573e-06,
"loss": 1.0026,
"step": 2061
},
{
"epoch": 5.273657289002558,
"grad_norm": 0.14119902993384847,
"learning_rate": 3.4829977243859414e-06,
"loss": 1.0093,
"step": 2062
},
{
"epoch": 5.276214833759591,
"grad_norm": 0.14118557253626327,
"learning_rate": 3.4733285475095324e-06,
"loss": 1.0255,
"step": 2063
},
{
"epoch": 5.278772378516624,
"grad_norm": 0.13630213438701977,
"learning_rate": 3.4636699891195e-06,
"loss": 1.0176,
"step": 2064
},
{
"epoch": 5.281329923273657,
"grad_norm": 0.1355438862392238,
"learning_rate": 3.454022064929711e-06,
"loss": 1.0355,
"step": 2065
},
{
"epoch": 5.283887468030691,
"grad_norm": 0.1335405410237401,
"learning_rate": 3.4443847906367313e-06,
"loss": 0.9999,
"step": 2066
},
{
"epoch": 5.286445012787723,
"grad_norm": 0.13568542243072879,
"learning_rate": 3.4347581819198095e-06,
"loss": 1.0069,
"step": 2067
},
{
"epoch": 5.289002557544757,
"grad_norm": 0.14279750042804518,
"learning_rate": 3.425142254440835e-06,
"loss": 1.0316,
"step": 2068
},
{
"epoch": 5.291560102301791,
"grad_norm": 0.1421562223189775,
"learning_rate": 3.4155370238443185e-06,
"loss": 0.9929,
"step": 2069
},
{
"epoch": 5.294117647058823,
"grad_norm": 0.13090998129388792,
"learning_rate": 3.405942505757367e-06,
"loss": 1.0235,
"step": 2070
},
{
"epoch": 5.296675191815857,
"grad_norm": 0.1447611334505954,
"learning_rate": 3.3963587157896694e-06,
"loss": 0.9883,
"step": 2071
},
{
"epoch": 5.29923273657289,
"grad_norm": 0.1486460622906693,
"learning_rate": 3.386785669533447e-06,
"loss": 1.0614,
"step": 2072
},
{
"epoch": 5.301790281329923,
"grad_norm": 0.13082209863415079,
"learning_rate": 3.377223382563446e-06,
"loss": 1.019,
"step": 2073
},
{
"epoch": 5.304347826086957,
"grad_norm": 0.14431855838963542,
"learning_rate": 3.367671870436915e-06,
"loss": 1.0744,
"step": 2074
},
{
"epoch": 5.30690537084399,
"grad_norm": 0.13501366283453947,
"learning_rate": 3.358131148693564e-06,
"loss": 1.0204,
"step": 2075
},
{
"epoch": 5.309462915601023,
"grad_norm": 0.13647498103708036,
"learning_rate": 3.3486012328555505e-06,
"loss": 1.0361,
"step": 2076
},
{
"epoch": 5.312020460358056,
"grad_norm": 0.13678423051822214,
"learning_rate": 3.33908213842745e-06,
"loss": 1.0416,
"step": 2077
},
{
"epoch": 5.31457800511509,
"grad_norm": 0.15117370323671084,
"learning_rate": 3.3295738808962388e-06,
"loss": 1.0398,
"step": 2078
},
{
"epoch": 5.3171355498721224,
"grad_norm": 0.13218102548293045,
"learning_rate": 3.3200764757312555e-06,
"loss": 1.0211,
"step": 2079
},
{
"epoch": 5.319693094629156,
"grad_norm": 0.13875158228376064,
"learning_rate": 3.310589938384179e-06,
"loss": 1.0246,
"step": 2080
},
{
"epoch": 5.322250639386189,
"grad_norm": 0.1390888027343779,
"learning_rate": 3.301114284289021e-06,
"loss": 1.0228,
"step": 2081
},
{
"epoch": 5.324808184143222,
"grad_norm": 0.14311106791965889,
"learning_rate": 3.291649528862074e-06,
"loss": 1.0366,
"step": 2082
},
{
"epoch": 5.327365728900256,
"grad_norm": 0.1329482436934704,
"learning_rate": 3.2821956875019045e-06,
"loss": 0.9983,
"step": 2083
},
{
"epoch": 5.329923273657289,
"grad_norm": 0.1353254341465528,
"learning_rate": 3.272752775589316e-06,
"loss": 1.0262,
"step": 2084
},
{
"epoch": 5.332480818414322,
"grad_norm": 0.14279181335598803,
"learning_rate": 3.2633208084873445e-06,
"loss": 1.0214,
"step": 2085
},
{
"epoch": 5.335038363171355,
"grad_norm": 0.14938681808695,
"learning_rate": 3.253899801541206e-06,
"loss": 1.0458,
"step": 2086
},
{
"epoch": 5.337595907928389,
"grad_norm": 0.13903091402439763,
"learning_rate": 3.244489770078286e-06,
"loss": 1.0699,
"step": 2087
},
{
"epoch": 5.340153452685422,
"grad_norm": 0.14447995472723943,
"learning_rate": 3.2350907294081258e-06,
"loss": 0.9936,
"step": 2088
},
{
"epoch": 5.342710997442455,
"grad_norm": 0.14276869094442168,
"learning_rate": 3.2257026948223726e-06,
"loss": 1.0565,
"step": 2089
},
{
"epoch": 5.345268542199489,
"grad_norm": 0.14335515694613532,
"learning_rate": 3.2163256815947674e-06,
"loss": 0.9993,
"step": 2090
},
{
"epoch": 5.3478260869565215,
"grad_norm": 0.14665513927933138,
"learning_rate": 3.206959704981133e-06,
"loss": 1.0555,
"step": 2091
},
{
"epoch": 5.350383631713555,
"grad_norm": 0.1322833527352921,
"learning_rate": 3.197604780219323e-06,
"loss": 0.9652,
"step": 2092
},
{
"epoch": 5.352941176470588,
"grad_norm": 0.13906561826785738,
"learning_rate": 3.188260922529215e-06,
"loss": 1.0432,
"step": 2093
},
{
"epoch": 5.3554987212276215,
"grad_norm": 0.14254937224329012,
"learning_rate": 3.1789281471126786e-06,
"loss": 1.0175,
"step": 2094
},
{
"epoch": 5.358056265984655,
"grad_norm": 0.14911195774932937,
"learning_rate": 3.1696064691535634e-06,
"loss": 1.0024,
"step": 2095
},
{
"epoch": 5.360613810741688,
"grad_norm": 0.1296333526942248,
"learning_rate": 3.1602959038176516e-06,
"loss": 1.016,
"step": 2096
},
{
"epoch": 5.3631713554987215,
"grad_norm": 0.14492528039945102,
"learning_rate": 3.1509964662526484e-06,
"loss": 1.0072,
"step": 2097
},
{
"epoch": 5.365728900255754,
"grad_norm": 0.14261896658846623,
"learning_rate": 3.1417081715881623e-06,
"loss": 0.997,
"step": 2098
},
{
"epoch": 5.368286445012788,
"grad_norm": 0.15062841301973245,
"learning_rate": 3.132431034935667e-06,
"loss": 1.0286,
"step": 2099
},
{
"epoch": 5.370843989769821,
"grad_norm": 0.14079332067477582,
"learning_rate": 3.1231650713884832e-06,
"loss": 1.0331,
"step": 2100
},
{
"epoch": 5.373401534526854,
"grad_norm": 0.13555419460898196,
"learning_rate": 3.1139102960217493e-06,
"loss": 1.0041,
"step": 2101
},
{
"epoch": 5.375959079283888,
"grad_norm": 0.13880524146849596,
"learning_rate": 3.1046667238924155e-06,
"loss": 1.0423,
"step": 2102
},
{
"epoch": 5.378516624040921,
"grad_norm": 0.1511402878049476,
"learning_rate": 3.0954343700391897e-06,
"loss": 1.0349,
"step": 2103
},
{
"epoch": 5.381074168797954,
"grad_norm": 0.14254863702344298,
"learning_rate": 3.0862132494825325e-06,
"loss": 1.026,
"step": 2104
},
{
"epoch": 5.383631713554987,
"grad_norm": 0.1352194409726658,
"learning_rate": 3.0770033772246376e-06,
"loss": 0.9938,
"step": 2105
},
{
"epoch": 5.3861892583120206,
"grad_norm": 0.14319029352124846,
"learning_rate": 3.067804768249386e-06,
"loss": 0.9968,
"step": 2106
},
{
"epoch": 5.388746803069053,
"grad_norm": 0.1348404188548053,
"learning_rate": 3.058617437522342e-06,
"loss": 1.0166,
"step": 2107
},
{
"epoch": 5.391304347826087,
"grad_norm": 0.14010852729827156,
"learning_rate": 3.0494413999907125e-06,
"loss": 1.0066,
"step": 2108
},
{
"epoch": 5.3938618925831205,
"grad_norm": 0.1351055036158788,
"learning_rate": 3.0402766705833455e-06,
"loss": 1.0052,
"step": 2109
},
{
"epoch": 5.396419437340153,
"grad_norm": 0.13186613064153313,
"learning_rate": 3.0311232642106768e-06,
"loss": 0.9969,
"step": 2110
},
{
"epoch": 5.398976982097187,
"grad_norm": 0.1408809630359071,
"learning_rate": 3.021981195764726e-06,
"loss": 1.0283,
"step": 2111
},
{
"epoch": 5.40153452685422,
"grad_norm": 0.12965889759923607,
"learning_rate": 3.0128504801190716e-06,
"loss": 1.0179,
"step": 2112
},
{
"epoch": 5.404092071611253,
"grad_norm": 0.13945206906826596,
"learning_rate": 3.003731132128811e-06,
"loss": 1.0099,
"step": 2113
},
{
"epoch": 5.406649616368286,
"grad_norm": 0.1400549514773388,
"learning_rate": 2.9946231666305627e-06,
"loss": 0.998,
"step": 2114
},
{
"epoch": 5.40920716112532,
"grad_norm": 0.13519306803227119,
"learning_rate": 2.9855265984424042e-06,
"loss": 1.0069,
"step": 2115
},
{
"epoch": 5.411764705882353,
"grad_norm": 0.12988356378358373,
"learning_rate": 2.976441442363893e-06,
"loss": 0.9928,
"step": 2116
},
{
"epoch": 5.414322250639386,
"grad_norm": 0.13225437647406532,
"learning_rate": 2.967367713176007e-06,
"loss": 1.0082,
"step": 2117
},
{
"epoch": 5.41687979539642,
"grad_norm": 0.13453763452834291,
"learning_rate": 2.9583054256411326e-06,
"loss": 0.9779,
"step": 2118
},
{
"epoch": 5.419437340153452,
"grad_norm": 0.13933174777230192,
"learning_rate": 2.9492545945030517e-06,
"loss": 0.9947,
"step": 2119
},
{
"epoch": 5.421994884910486,
"grad_norm": 0.13265772100907866,
"learning_rate": 2.940215234486894e-06,
"loss": 1.0304,
"step": 2120
},
{
"epoch": 5.42455242966752,
"grad_norm": 0.13461066684644984,
"learning_rate": 2.9311873602991435e-06,
"loss": 1.0265,
"step": 2121
},
{
"epoch": 5.427109974424552,
"grad_norm": 0.13302962430701365,
"learning_rate": 2.922170986627573e-06,
"loss": 0.9907,
"step": 2122
},
{
"epoch": 5.429667519181586,
"grad_norm": 0.1372156107097446,
"learning_rate": 2.913166128141265e-06,
"loss": 1.0362,
"step": 2123
},
{
"epoch": 5.432225063938619,
"grad_norm": 0.13526418969755188,
"learning_rate": 2.9041727994905686e-06,
"loss": 1.0335,
"step": 2124
},
{
"epoch": 5.434782608695652,
"grad_norm": 0.14056788233892892,
"learning_rate": 2.895191015307055e-06,
"loss": 0.9863,
"step": 2125
},
{
"epoch": 5.437340153452685,
"grad_norm": 0.13830914570568487,
"learning_rate": 2.8862207902035334e-06,
"loss": 1.0279,
"step": 2126
},
{
"epoch": 5.439897698209719,
"grad_norm": 0.13255464251905436,
"learning_rate": 2.877262138773994e-06,
"loss": 1.0074,
"step": 2127
},
{
"epoch": 5.442455242966752,
"grad_norm": 0.13094809127879986,
"learning_rate": 2.8683150755936107e-06,
"loss": 1.0007,
"step": 2128
},
{
"epoch": 5.445012787723785,
"grad_norm": 0.13969902391137623,
"learning_rate": 2.859379615218685e-06,
"loss": 1.0183,
"step": 2129
},
{
"epoch": 5.447570332480819,
"grad_norm": 0.13298200813066383,
"learning_rate": 2.850455772186658e-06,
"loss": 1.0553,
"step": 2130
},
{
"epoch": 5.450127877237851,
"grad_norm": 0.13752465215056384,
"learning_rate": 2.8415435610160667e-06,
"loss": 1.0029,
"step": 2131
},
{
"epoch": 5.452685421994885,
"grad_norm": 0.13776730476333435,
"learning_rate": 2.8326429962065184e-06,
"loss": 1.0591,
"step": 2132
},
{
"epoch": 5.455242966751918,
"grad_norm": 0.15290697841832607,
"learning_rate": 2.8237540922386764e-06,
"loss": 1.0234,
"step": 2133
},
{
"epoch": 5.457800511508951,
"grad_norm": 0.1435647245473299,
"learning_rate": 2.8148768635742286e-06,
"loss": 1.0408,
"step": 2134
},
{
"epoch": 5.460358056265985,
"grad_norm": 0.1348972282036283,
"learning_rate": 2.8060113246558783e-06,
"loss": 1.0582,
"step": 2135
},
{
"epoch": 5.462915601023018,
"grad_norm": 0.14312694503231538,
"learning_rate": 2.7971574899072938e-06,
"loss": 1.0557,
"step": 2136
},
{
"epoch": 5.465473145780051,
"grad_norm": 0.14626596664710145,
"learning_rate": 2.7883153737331136e-06,
"loss": 1.0213,
"step": 2137
},
{
"epoch": 5.468030690537084,
"grad_norm": 0.12723321182479033,
"learning_rate": 2.7794849905189138e-06,
"loss": 1.0258,
"step": 2138
},
{
"epoch": 5.470588235294118,
"grad_norm": 0.1297835067922189,
"learning_rate": 2.7706663546311705e-06,
"loss": 0.9791,
"step": 2139
},
{
"epoch": 5.4731457800511505,
"grad_norm": 0.14065052834912603,
"learning_rate": 2.761859480417255e-06,
"loss": 1.0364,
"step": 2140
},
{
"epoch": 5.475703324808184,
"grad_norm": 0.14903101964341123,
"learning_rate": 2.753064382205396e-06,
"loss": 1.046,
"step": 2141
},
{
"epoch": 5.478260869565218,
"grad_norm": 0.12884063957129957,
"learning_rate": 2.7442810743046742e-06,
"loss": 1.0377,
"step": 2142
},
{
"epoch": 5.4808184143222505,
"grad_norm": 0.13327063753076238,
"learning_rate": 2.735509571004982e-06,
"loss": 1.0095,
"step": 2143
},
{
"epoch": 5.483375959079284,
"grad_norm": 0.1571390786677921,
"learning_rate": 2.7267498865770005e-06,
"loss": 0.9769,
"step": 2144
},
{
"epoch": 5.485933503836317,
"grad_norm": 0.1320156220064998,
"learning_rate": 2.718002035272197e-06,
"loss": 1.0057,
"step": 2145
},
{
"epoch": 5.4884910485933505,
"grad_norm": 0.1360636747597633,
"learning_rate": 2.7092660313227748e-06,
"loss": 1.0064,
"step": 2146
},
{
"epoch": 5.491048593350383,
"grad_norm": 0.13394654726028757,
"learning_rate": 2.700541888941667e-06,
"loss": 1.0025,
"step": 2147
},
{
"epoch": 5.493606138107417,
"grad_norm": 0.1460012982176339,
"learning_rate": 2.6918296223225026e-06,
"loss": 1.0227,
"step": 2148
},
{
"epoch": 5.4961636828644505,
"grad_norm": 0.13049152775591077,
"learning_rate": 2.683129245639603e-06,
"loss": 1.0393,
"step": 2149
},
{
"epoch": 5.498721227621483,
"grad_norm": 0.15254103744247385,
"learning_rate": 2.6744407730479325e-06,
"loss": 1.0279,
"step": 2150
},
{
"epoch": 5.501278772378517,
"grad_norm": 0.1440023793657765,
"learning_rate": 2.66576421868309e-06,
"loss": 1.0295,
"step": 2151
},
{
"epoch": 5.5038363171355495,
"grad_norm": 0.13606809517331622,
"learning_rate": 2.6570995966612945e-06,
"loss": 1.0299,
"step": 2152
},
{
"epoch": 5.506393861892583,
"grad_norm": 0.13926181662872325,
"learning_rate": 2.6484469210793384e-06,
"loss": 1.037,
"step": 2153
},
{
"epoch": 5.508951406649617,
"grad_norm": 0.14473456019169403,
"learning_rate": 2.6398062060145867e-06,
"loss": 1.017,
"step": 2154
},
{
"epoch": 5.5115089514066495,
"grad_norm": 0.13272081994045937,
"learning_rate": 2.631177465524938e-06,
"loss": 1.0217,
"step": 2155
},
{
"epoch": 5.514066496163683,
"grad_norm": 0.14026203110310534,
"learning_rate": 2.6225607136488194e-06,
"loss": 1.0021,
"step": 2156
},
{
"epoch": 5.516624040920716,
"grad_norm": 0.13205919977316974,
"learning_rate": 2.613955964405146e-06,
"loss": 1.052,
"step": 2157
},
{
"epoch": 5.5191815856777495,
"grad_norm": 0.13360379756882199,
"learning_rate": 2.605363231793302e-06,
"loss": 1.0499,
"step": 2158
},
{
"epoch": 5.521739130434782,
"grad_norm": 0.14208435941220482,
"learning_rate": 2.5967825297931328e-06,
"loss": 1.0172,
"step": 2159
},
{
"epoch": 5.524296675191816,
"grad_norm": 0.13295870010362018,
"learning_rate": 2.5882138723649018e-06,
"loss": 1.0334,
"step": 2160
},
{
"epoch": 5.526854219948849,
"grad_norm": 0.12489034371588933,
"learning_rate": 2.5796572734492777e-06,
"loss": 1.0103,
"step": 2161
},
{
"epoch": 5.529411764705882,
"grad_norm": 0.13244599397256537,
"learning_rate": 2.571112746967309e-06,
"loss": 1.0218,
"step": 2162
},
{
"epoch": 5.531969309462916,
"grad_norm": 0.15003256070846932,
"learning_rate": 2.5625803068204126e-06,
"loss": 1.0759,
"step": 2163
},
{
"epoch": 5.534526854219949,
"grad_norm": 0.1356632599292978,
"learning_rate": 2.554059966890332e-06,
"loss": 1.0042,
"step": 2164
},
{
"epoch": 5.537084398976982,
"grad_norm": 0.15088785982749878,
"learning_rate": 2.545551741039125e-06,
"loss": 1.0084,
"step": 2165
},
{
"epoch": 5.539641943734015,
"grad_norm": 0.13549191741444538,
"learning_rate": 2.5370556431091486e-06,
"loss": 1.0447,
"step": 2166
},
{
"epoch": 5.542199488491049,
"grad_norm": 0.1345097927774657,
"learning_rate": 2.5285716869230192e-06,
"loss": 1.0352,
"step": 2167
},
{
"epoch": 5.544757033248082,
"grad_norm": 0.1377603438588639,
"learning_rate": 2.5200998862836044e-06,
"loss": 1.0456,
"step": 2168
},
{
"epoch": 5.547314578005115,
"grad_norm": 0.13719837282442893,
"learning_rate": 2.5116402549739904e-06,
"loss": 1.0111,
"step": 2169
},
{
"epoch": 5.549872122762149,
"grad_norm": 0.12784774794791698,
"learning_rate": 2.503192806757474e-06,
"loss": 1.0555,
"step": 2170
},
{
"epoch": 5.552429667519181,
"grad_norm": 0.1377625979101254,
"learning_rate": 2.494757555377524e-06,
"loss": 1.0217,
"step": 2171
},
{
"epoch": 5.554987212276215,
"grad_norm": 0.13849942681054245,
"learning_rate": 2.486334514557761e-06,
"loss": 1.0175,
"step": 2172
},
{
"epoch": 5.557544757033249,
"grad_norm": 0.14070221787371265,
"learning_rate": 2.477923698001955e-06,
"loss": 1.03,
"step": 2173
},
{
"epoch": 5.560102301790281,
"grad_norm": 0.12917105115289923,
"learning_rate": 2.469525119393974e-06,
"loss": 1.0316,
"step": 2174
},
{
"epoch": 5.562659846547315,
"grad_norm": 0.14393204543904917,
"learning_rate": 2.461138792397779e-06,
"loss": 1.0429,
"step": 2175
},
{
"epoch": 5.565217391304348,
"grad_norm": 0.1350830986575781,
"learning_rate": 2.4527647306574e-06,
"loss": 1.0005,
"step": 2176
},
{
"epoch": 5.567774936061381,
"grad_norm": 0.1272869817285887,
"learning_rate": 2.4444029477969157e-06,
"loss": 1.0083,
"step": 2177
},
{
"epoch": 5.570332480818414,
"grad_norm": 0.1329875176980315,
"learning_rate": 2.4360534574204196e-06,
"loss": 1.0064,
"step": 2178
},
{
"epoch": 5.572890025575448,
"grad_norm": 0.13284521850316935,
"learning_rate": 2.427716273112011e-06,
"loss": 1.026,
"step": 2179
},
{
"epoch": 5.57544757033248,
"grad_norm": 0.13655729094534802,
"learning_rate": 2.4193914084357708e-06,
"loss": 1.0311,
"step": 2180
},
{
"epoch": 5.578005115089514,
"grad_norm": 0.13249886049800538,
"learning_rate": 2.4110788769357305e-06,
"loss": 1.0245,
"step": 2181
},
{
"epoch": 5.580562659846548,
"grad_norm": 0.14032611666517894,
"learning_rate": 2.402778692135861e-06,
"loss": 1.0218,
"step": 2182
},
{
"epoch": 5.58312020460358,
"grad_norm": 0.13366091002172387,
"learning_rate": 2.394490867540039e-06,
"loss": 1.0275,
"step": 2183
},
{
"epoch": 5.585677749360614,
"grad_norm": 0.13700684117392312,
"learning_rate": 2.3862154166320417e-06,
"loss": 1.0055,
"step": 2184
},
{
"epoch": 5.588235294117647,
"grad_norm": 0.13884798487973146,
"learning_rate": 2.3779523528755143e-06,
"loss": 1.0298,
"step": 2185
},
{
"epoch": 5.59079283887468,
"grad_norm": 0.14068128211510497,
"learning_rate": 2.3697016897139345e-06,
"loss": 1.0568,
"step": 2186
},
{
"epoch": 5.593350383631714,
"grad_norm": 0.1367538445761975,
"learning_rate": 2.361463440570623e-06,
"loss": 1.0211,
"step": 2187
},
{
"epoch": 5.595907928388747,
"grad_norm": 0.137882423029852,
"learning_rate": 2.353237618848695e-06,
"loss": 1.0388,
"step": 2188
},
{
"epoch": 5.59846547314578,
"grad_norm": 0.13627762962811446,
"learning_rate": 2.3450242379310427e-06,
"loss": 1.0423,
"step": 2189
},
{
"epoch": 5.601023017902813,
"grad_norm": 0.13080557028764447,
"learning_rate": 2.3368233111803305e-06,
"loss": 1.0209,
"step": 2190
},
{
"epoch": 5.603580562659847,
"grad_norm": 0.13373365809565754,
"learning_rate": 2.328634851938949e-06,
"loss": 1.0548,
"step": 2191
},
{
"epoch": 5.6061381074168795,
"grad_norm": 0.14670903806258018,
"learning_rate": 2.3204588735290155e-06,
"loss": 1.0283,
"step": 2192
},
{
"epoch": 5.608695652173913,
"grad_norm": 0.1351316953465856,
"learning_rate": 2.312295389252326e-06,
"loss": 1.0253,
"step": 2193
},
{
"epoch": 5.611253196930946,
"grad_norm": 0.14536763822784776,
"learning_rate": 2.304144412390367e-06,
"loss": 1.0289,
"step": 2194
},
{
"epoch": 5.6138107416879794,
"grad_norm": 0.1373151541315976,
"learning_rate": 2.2960059562042647e-06,
"loss": 1.0227,
"step": 2195
},
{
"epoch": 5.616368286445013,
"grad_norm": 0.12983515898716327,
"learning_rate": 2.2878800339347763e-06,
"loss": 1.0256,
"step": 2196
},
{
"epoch": 5.618925831202046,
"grad_norm": 0.12825544867685706,
"learning_rate": 2.279766658802275e-06,
"loss": 1.0468,
"step": 2197
},
{
"epoch": 5.621483375959079,
"grad_norm": 0.14977773117762613,
"learning_rate": 2.2716658440067085e-06,
"loss": 1.0045,
"step": 2198
},
{
"epoch": 5.624040920716112,
"grad_norm": 0.163815240244753,
"learning_rate": 2.2635776027276056e-06,
"loss": 1.0211,
"step": 2199
},
{
"epoch": 5.626598465473146,
"grad_norm": 0.1311668589781632,
"learning_rate": 2.255501948124017e-06,
"loss": 1.0318,
"step": 2200
},
{
"epoch": 5.629156010230179,
"grad_norm": 0.13085196604157895,
"learning_rate": 2.247438893334537e-06,
"loss": 1.0219,
"step": 2201
},
{
"epoch": 5.631713554987212,
"grad_norm": 0.1273903714267332,
"learning_rate": 2.2393884514772457e-06,
"loss": 0.9929,
"step": 2202
},
{
"epoch": 5.634271099744246,
"grad_norm": 0.13914897324377146,
"learning_rate": 2.231350635649713e-06,
"loss": 1.0452,
"step": 2203
},
{
"epoch": 5.6368286445012785,
"grad_norm": 0.13636448611829766,
"learning_rate": 2.223325458928961e-06,
"loss": 1.0078,
"step": 2204
},
{
"epoch": 5.639386189258312,
"grad_norm": 0.13875063448351502,
"learning_rate": 2.2153129343714484e-06,
"loss": 1.044,
"step": 2205
},
{
"epoch": 5.641943734015345,
"grad_norm": 0.1268762090418032,
"learning_rate": 2.207313075013059e-06,
"loss": 1.021,
"step": 2206
},
{
"epoch": 5.6445012787723785,
"grad_norm": 0.14115564139986136,
"learning_rate": 2.1993258938690533e-06,
"loss": 0.9935,
"step": 2207
},
{
"epoch": 5.647058823529412,
"grad_norm": 0.13114159318824248,
"learning_rate": 2.191351403934082e-06,
"loss": 1.0314,
"step": 2208
},
{
"epoch": 5.649616368286445,
"grad_norm": 0.12884976286632582,
"learning_rate": 2.183389618182139e-06,
"loss": 1.0046,
"step": 2209
},
{
"epoch": 5.6521739130434785,
"grad_norm": 0.12995582182420992,
"learning_rate": 2.1754405495665553e-06,
"loss": 1.0373,
"step": 2210
},
{
"epoch": 5.654731457800511,
"grad_norm": 0.13421458626767693,
"learning_rate": 2.1675042110199664e-06,
"loss": 1.016,
"step": 2211
},
{
"epoch": 5.657289002557545,
"grad_norm": 0.13511795554454278,
"learning_rate": 2.1595806154542965e-06,
"loss": 1.0203,
"step": 2212
},
{
"epoch": 5.659846547314578,
"grad_norm": 0.12526718028345482,
"learning_rate": 2.1516697757607464e-06,
"loss": 1.048,
"step": 2213
},
{
"epoch": 5.662404092071611,
"grad_norm": 0.13609131915375153,
"learning_rate": 2.143771704809753e-06,
"loss": 1.0221,
"step": 2214
},
{
"epoch": 5.664961636828645,
"grad_norm": 0.13389453092548842,
"learning_rate": 2.1358864154509838e-06,
"loss": 0.995,
"step": 2215
},
{
"epoch": 5.667519181585678,
"grad_norm": 0.13120531247951384,
"learning_rate": 2.128013920513311e-06,
"loss": 1.002,
"step": 2216
},
{
"epoch": 5.670076726342711,
"grad_norm": 0.12595917765897047,
"learning_rate": 2.1201542328047965e-06,
"loss": 1.0307,
"step": 2217
},
{
"epoch": 5.672634271099744,
"grad_norm": 0.1327291524503786,
"learning_rate": 2.112307365112657e-06,
"loss": 1.0042,
"step": 2218
},
{
"epoch": 5.675191815856778,
"grad_norm": 0.14073038841763177,
"learning_rate": 2.1044733302032527e-06,
"loss": 1.0089,
"step": 2219
},
{
"epoch": 5.677749360613811,
"grad_norm": 0.13145348857222067,
"learning_rate": 2.0966521408220753e-06,
"loss": 1.0191,
"step": 2220
},
{
"epoch": 5.680306905370844,
"grad_norm": 0.13758179967194598,
"learning_rate": 2.088843809693708e-06,
"loss": 1.0389,
"step": 2221
},
{
"epoch": 5.6828644501278776,
"grad_norm": 0.12934306601192186,
"learning_rate": 2.081048349521814e-06,
"loss": 1.0386,
"step": 2222
},
{
"epoch": 5.68542199488491,
"grad_norm": 0.12132994106171455,
"learning_rate": 2.0732657729891236e-06,
"loss": 1.0237,
"step": 2223
},
{
"epoch": 5.687979539641944,
"grad_norm": 0.12639844337210293,
"learning_rate": 2.065496092757403e-06,
"loss": 1.0039,
"step": 2224
},
{
"epoch": 5.690537084398977,
"grad_norm": 0.1397408236378054,
"learning_rate": 2.0577393214674335e-06,
"loss": 1.0782,
"step": 2225
},
{
"epoch": 5.69309462915601,
"grad_norm": 0.12975569414651789,
"learning_rate": 2.049995471738995e-06,
"loss": 1.029,
"step": 2226
},
{
"epoch": 5.695652173913043,
"grad_norm": 0.13025784101096557,
"learning_rate": 2.042264556170853e-06,
"loss": 0.9846,
"step": 2227
},
{
"epoch": 5.698209718670077,
"grad_norm": 0.1282941793591346,
"learning_rate": 2.034546587340719e-06,
"loss": 1.0143,
"step": 2228
},
{
"epoch": 5.70076726342711,
"grad_norm": 0.13236558983338137,
"learning_rate": 2.026841577805245e-06,
"loss": 1.0534,
"step": 2229
},
{
"epoch": 5.703324808184143,
"grad_norm": 0.13423342295188723,
"learning_rate": 2.019149540100005e-06,
"loss": 1.0568,
"step": 2230
},
{
"epoch": 5.705882352941177,
"grad_norm": 0.13468947441049006,
"learning_rate": 2.0114704867394598e-06,
"loss": 1.014,
"step": 2231
},
{
"epoch": 5.708439897698209,
"grad_norm": 0.13388666927274886,
"learning_rate": 2.0038044302169492e-06,
"loss": 1.0246,
"step": 2232
},
{
"epoch": 5.710997442455243,
"grad_norm": 0.13458582769078975,
"learning_rate": 1.9961513830046663e-06,
"loss": 1.0335,
"step": 2233
},
{
"epoch": 5.713554987212277,
"grad_norm": 0.1334530516759338,
"learning_rate": 1.988511357553644e-06,
"loss": 1.0107,
"step": 2234
},
{
"epoch": 5.716112531969309,
"grad_norm": 0.13432155143391286,
"learning_rate": 1.980884366293725e-06,
"loss": 1.002,
"step": 2235
},
{
"epoch": 5.718670076726343,
"grad_norm": 0.1321302038455819,
"learning_rate": 1.973270421633543e-06,
"loss": 1.0281,
"step": 2236
},
{
"epoch": 5.721227621483376,
"grad_norm": 0.13482083547904436,
"learning_rate": 1.965669535960516e-06,
"loss": 1.0032,
"step": 2237
},
{
"epoch": 5.723785166240409,
"grad_norm": 0.1362582011621695,
"learning_rate": 1.9580817216408075e-06,
"loss": 1.0151,
"step": 2238
},
{
"epoch": 5.726342710997442,
"grad_norm": 0.13381683599607858,
"learning_rate": 1.9505069910193164e-06,
"loss": 0.9876,
"step": 2239
},
{
"epoch": 5.728900255754476,
"grad_norm": 0.12202356902109507,
"learning_rate": 1.9429453564196543e-06,
"loss": 1.0203,
"step": 2240
},
{
"epoch": 5.731457800511509,
"grad_norm": 0.12193705736628206,
"learning_rate": 1.9353968301441306e-06,
"loss": 0.9752,
"step": 2241
},
{
"epoch": 5.734015345268542,
"grad_norm": 0.1264989543927549,
"learning_rate": 1.927861424473726e-06,
"loss": 1.025,
"step": 2242
},
{
"epoch": 5.736572890025576,
"grad_norm": 0.14123473229613026,
"learning_rate": 1.920339151668069e-06,
"loss": 1.0125,
"step": 2243
},
{
"epoch": 5.739130434782608,
"grad_norm": 0.12538976213285152,
"learning_rate": 1.9128300239654353e-06,
"loss": 1.0103,
"step": 2244
},
{
"epoch": 5.741687979539642,
"grad_norm": 0.12777815103030538,
"learning_rate": 1.9053340535827004e-06,
"loss": 1.0365,
"step": 2245
},
{
"epoch": 5.744245524296675,
"grad_norm": 0.9983046758036718,
"learning_rate": 1.8978512527153414e-06,
"loss": 1.0208,
"step": 2246
},
{
"epoch": 5.746803069053708,
"grad_norm": 0.13869698166830857,
"learning_rate": 1.8903816335374048e-06,
"loss": 1.0092,
"step": 2247
},
{
"epoch": 5.749360613810742,
"grad_norm": 0.13909895674572456,
"learning_rate": 1.882925208201498e-06,
"loss": 0.9976,
"step": 2248
},
{
"epoch": 5.751918158567775,
"grad_norm": 0.13223029843900272,
"learning_rate": 1.8754819888387576e-06,
"loss": 1.0226,
"step": 2249
},
{
"epoch": 5.754475703324808,
"grad_norm": 0.1355611449623982,
"learning_rate": 1.868051987558832e-06,
"loss": 1.0547,
"step": 2250
},
{
"epoch": 5.757033248081841,
"grad_norm": 0.1335592612771471,
"learning_rate": 1.8606352164498754e-06,
"loss": 1.022,
"step": 2251
},
{
"epoch": 5.759590792838875,
"grad_norm": 0.13517321815446315,
"learning_rate": 1.8532316875785084e-06,
"loss": 1.059,
"step": 2252
},
{
"epoch": 5.762148337595908,
"grad_norm": 0.12900109188000092,
"learning_rate": 1.8458414129898072e-06,
"loss": 1.0121,
"step": 2253
},
{
"epoch": 5.764705882352941,
"grad_norm": 0.13164593690766663,
"learning_rate": 1.8384644047072864e-06,
"loss": 1.0363,
"step": 2254
},
{
"epoch": 5.767263427109975,
"grad_norm": 0.12836234729861262,
"learning_rate": 1.8311006747328775e-06,
"loss": 1.0342,
"step": 2255
},
{
"epoch": 5.7698209718670075,
"grad_norm": 0.13352486032417052,
"learning_rate": 1.8237502350469161e-06,
"loss": 1.028,
"step": 2256
},
{
"epoch": 5.772378516624041,
"grad_norm": 0.12666547237956713,
"learning_rate": 1.8164130976080962e-06,
"loss": 0.9998,
"step": 2257
},
{
"epoch": 5.774936061381074,
"grad_norm": 0.12597408036958038,
"learning_rate": 1.8090892743534904e-06,
"loss": 0.9861,
"step": 2258
},
{
"epoch": 5.7774936061381075,
"grad_norm": 0.13091969265184827,
"learning_rate": 1.8017787771984973e-06,
"loss": 1.0196,
"step": 2259
},
{
"epoch": 5.78005115089514,
"grad_norm": 0.1328229090332335,
"learning_rate": 1.7944816180368408e-06,
"loss": 1.0422,
"step": 2260
},
{
"epoch": 5.782608695652174,
"grad_norm": 0.12677176745235394,
"learning_rate": 1.7871978087405384e-06,
"loss": 1.0097,
"step": 2261
},
{
"epoch": 5.7851662404092075,
"grad_norm": 0.12437893059639113,
"learning_rate": 1.7799273611598943e-06,
"loss": 1.0121,
"step": 2262
},
{
"epoch": 5.78772378516624,
"grad_norm": 0.1251367564202301,
"learning_rate": 1.772670287123479e-06,
"loss": 0.9939,
"step": 2263
},
{
"epoch": 5.790281329923274,
"grad_norm": 0.1302978820127013,
"learning_rate": 1.765426598438088e-06,
"loss": 1.0377,
"step": 2264
},
{
"epoch": 5.792838874680307,
"grad_norm": 0.12296911765019702,
"learning_rate": 1.7581963068887554e-06,
"loss": 1.0082,
"step": 2265
},
{
"epoch": 5.79539641943734,
"grad_norm": 0.1310292740348814,
"learning_rate": 1.7509794242387135e-06,
"loss": 1.0455,
"step": 2266
},
{
"epoch": 5.797953964194374,
"grad_norm": 0.11962773068304663,
"learning_rate": 1.7437759622293771e-06,
"loss": 1.0301,
"step": 2267
},
{
"epoch": 5.8005115089514065,
"grad_norm": 0.1338997971252641,
"learning_rate": 1.7365859325803269e-06,
"loss": 1.028,
"step": 2268
},
{
"epoch": 5.80306905370844,
"grad_norm": 0.12161266269112997,
"learning_rate": 1.7294093469892948e-06,
"loss": 1.0253,
"step": 2269
},
{
"epoch": 5.805626598465473,
"grad_norm": 0.12194546591797659,
"learning_rate": 1.7222462171321397e-06,
"loss": 1.0112,
"step": 2270
},
{
"epoch": 5.8081841432225065,
"grad_norm": 0.12690399558973253,
"learning_rate": 1.7150965546628184e-06,
"loss": 1.0168,
"step": 2271
},
{
"epoch": 5.810741687979539,
"grad_norm": 0.1329159422591136,
"learning_rate": 1.7079603712133908e-06,
"loss": 0.9867,
"step": 2272
},
{
"epoch": 5.813299232736573,
"grad_norm": 0.12116530026113131,
"learning_rate": 1.7008376783939772e-06,
"loss": 1.0085,
"step": 2273
},
{
"epoch": 5.8158567774936065,
"grad_norm": 0.12935715986878404,
"learning_rate": 1.6937284877927596e-06,
"loss": 1.0162,
"step": 2274
},
{
"epoch": 5.818414322250639,
"grad_norm": 0.12690629229315065,
"learning_rate": 1.6866328109759377e-06,
"loss": 0.9794,
"step": 2275
},
{
"epoch": 5.820971867007673,
"grad_norm": 0.12407793133570494,
"learning_rate": 1.6795506594877388e-06,
"loss": 1.031,
"step": 2276
},
{
"epoch": 5.823529411764706,
"grad_norm": 0.12704984040936246,
"learning_rate": 1.6724820448503852e-06,
"loss": 1.0204,
"step": 2277
},
{
"epoch": 5.826086956521739,
"grad_norm": 0.13001027110393584,
"learning_rate": 1.6654269785640608e-06,
"loss": 1.0448,
"step": 2278
},
{
"epoch": 5.828644501278772,
"grad_norm": 0.11915860756194478,
"learning_rate": 1.658385472106926e-06,
"loss": 1.0146,
"step": 2279
},
{
"epoch": 5.831202046035806,
"grad_norm": 0.12897358959587038,
"learning_rate": 1.6513575369350654e-06,
"loss": 1.021,
"step": 2280
},
{
"epoch": 5.833759590792839,
"grad_norm": 0.13505425066582885,
"learning_rate": 1.6443431844824975e-06,
"loss": 1.0002,
"step": 2281
},
{
"epoch": 5.836317135549872,
"grad_norm": 0.12555260697675938,
"learning_rate": 1.637342426161126e-06,
"loss": 1.0013,
"step": 2282
},
{
"epoch": 5.838874680306906,
"grad_norm": 0.1276721077986895,
"learning_rate": 1.630355273360752e-06,
"loss": 1.0083,
"step": 2283
},
{
"epoch": 5.841432225063938,
"grad_norm": 0.12628248303483217,
"learning_rate": 1.623381737449038e-06,
"loss": 1.0495,
"step": 2284
},
{
"epoch": 5.843989769820972,
"grad_norm": 0.13396531513865312,
"learning_rate": 1.6164218297714884e-06,
"loss": 0.9778,
"step": 2285
},
{
"epoch": 5.846547314578006,
"grad_norm": 0.13405119018709796,
"learning_rate": 1.609475561651438e-06,
"loss": 0.9882,
"step": 2286
},
{
"epoch": 5.849104859335038,
"grad_norm": 0.11946775190358987,
"learning_rate": 1.6025429443900286e-06,
"loss": 1.0402,
"step": 2287
},
{
"epoch": 5.851662404092072,
"grad_norm": 0.1286546110791319,
"learning_rate": 1.5956239892661995e-06,
"loss": 1.0323,
"step": 2288
},
{
"epoch": 5.854219948849105,
"grad_norm": 0.12706067523411144,
"learning_rate": 1.588718707536656e-06,
"loss": 1.0153,
"step": 2289
},
{
"epoch": 5.856777493606138,
"grad_norm": 0.12632255275977317,
"learning_rate": 1.5818271104358574e-06,
"loss": 1.0359,
"step": 2290
},
{
"epoch": 5.859335038363171,
"grad_norm": 0.12022429130741803,
"learning_rate": 1.5749492091760054e-06,
"loss": 1.0272,
"step": 2291
},
{
"epoch": 5.861892583120205,
"grad_norm": 0.12754203390815988,
"learning_rate": 1.5680850149470139e-06,
"loss": 1.0141,
"step": 2292
},
{
"epoch": 5.864450127877237,
"grad_norm": 0.12789955923845803,
"learning_rate": 1.5612345389164974e-06,
"loss": 1.0213,
"step": 2293
},
{
"epoch": 5.867007672634271,
"grad_norm": 0.13105545311215508,
"learning_rate": 1.5543977922297494e-06,
"loss": 1.0203,
"step": 2294
},
{
"epoch": 5.869565217391305,
"grad_norm": 0.12692375648838364,
"learning_rate": 1.5475747860097335e-06,
"loss": 1.0175,
"step": 2295
},
{
"epoch": 5.872122762148337,
"grad_norm": 0.12758413074272634,
"learning_rate": 1.5407655313570525e-06,
"loss": 1.0187,
"step": 2296
},
{
"epoch": 5.874680306905371,
"grad_norm": 0.1347266986438743,
"learning_rate": 1.5339700393499357e-06,
"loss": 0.978,
"step": 2297
},
{
"epoch": 5.877237851662404,
"grad_norm": 0.1286412634763229,
"learning_rate": 1.5271883210442285e-06,
"loss": 1.0243,
"step": 2298
},
{
"epoch": 5.879795396419437,
"grad_norm": 0.13598473504010955,
"learning_rate": 1.5204203874733604e-06,
"loss": 1.0458,
"step": 2299
},
{
"epoch": 5.882352941176471,
"grad_norm": 0.12217909066335947,
"learning_rate": 1.5136662496483346e-06,
"loss": 1.0159,
"step": 2300
},
{
"epoch": 5.884910485933504,
"grad_norm": 0.13697298325476193,
"learning_rate": 1.5069259185577112e-06,
"loss": 1.0234,
"step": 2301
},
{
"epoch": 5.887468030690537,
"grad_norm": 0.12856950834935316,
"learning_rate": 1.5001994051675894e-06,
"loss": 1.0005,
"step": 2302
},
{
"epoch": 5.89002557544757,
"grad_norm": 0.12272037964597306,
"learning_rate": 1.4934867204215864e-06,
"loss": 1.0182,
"step": 2303
},
{
"epoch": 5.892583120204604,
"grad_norm": 0.12396363368680077,
"learning_rate": 1.486787875240816e-06,
"loss": 1.0023,
"step": 2304
},
{
"epoch": 5.8951406649616365,
"grad_norm": 0.12822276354353365,
"learning_rate": 1.480102880523886e-06,
"loss": 1.0114,
"step": 2305
},
{
"epoch": 5.89769820971867,
"grad_norm": 0.12823957750976692,
"learning_rate": 1.4734317471468618e-06,
"loss": 1.0279,
"step": 2306
},
{
"epoch": 5.900255754475703,
"grad_norm": 0.12481205791568802,
"learning_rate": 1.4667744859632615e-06,
"loss": 0.9748,
"step": 2307
},
{
"epoch": 5.9028132992327365,
"grad_norm": 0.12376259417000356,
"learning_rate": 1.4601311078040304e-06,
"loss": 1.0291,
"step": 2308
},
{
"epoch": 5.90537084398977,
"grad_norm": 0.12039082706987389,
"learning_rate": 1.4535016234775324e-06,
"loss": 0.9835,
"step": 2309
},
{
"epoch": 5.907928388746803,
"grad_norm": 0.1278580324817726,
"learning_rate": 1.4468860437695243e-06,
"loss": 1.0276,
"step": 2310
},
{
"epoch": 5.910485933503836,
"grad_norm": 0.12971723157693313,
"learning_rate": 1.4402843794431354e-06,
"loss": 1.0085,
"step": 2311
},
{
"epoch": 5.913043478260869,
"grad_norm": 0.12766208083651814,
"learning_rate": 1.4336966412388674e-06,
"loss": 1.0392,
"step": 2312
},
{
"epoch": 5.915601023017903,
"grad_norm": 0.12363722996422528,
"learning_rate": 1.4271228398745552e-06,
"loss": 1.0063,
"step": 2313
},
{
"epoch": 5.918158567774936,
"grad_norm": 0.12491762028888559,
"learning_rate": 1.4205629860453641e-06,
"loss": 1.0598,
"step": 2314
},
{
"epoch": 5.920716112531969,
"grad_norm": 0.12614418988739717,
"learning_rate": 1.4140170904237616e-06,
"loss": 1.0078,
"step": 2315
},
{
"epoch": 5.923273657289003,
"grad_norm": 0.12871200444350614,
"learning_rate": 1.4074851636595165e-06,
"loss": 0.9912,
"step": 2316
},
{
"epoch": 5.9258312020460355,
"grad_norm": 0.12176341068010405,
"learning_rate": 1.400967216379663e-06,
"loss": 1.0023,
"step": 2317
},
{
"epoch": 5.928388746803069,
"grad_norm": 0.12736989149935335,
"learning_rate": 1.394463259188491e-06,
"loss": 1.0097,
"step": 2318
},
{
"epoch": 5.930946291560103,
"grad_norm": 0.12401472625813548,
"learning_rate": 1.3879733026675367e-06,
"loss": 1.036,
"step": 2319
},
{
"epoch": 5.9335038363171355,
"grad_norm": 0.12937517228342466,
"learning_rate": 1.3814973573755518e-06,
"loss": 1.036,
"step": 2320
},
{
"epoch": 5.936061381074169,
"grad_norm": 0.127613205394154,
"learning_rate": 1.3750354338484916e-06,
"loss": 0.9881,
"step": 2321
},
{
"epoch": 5.938618925831202,
"grad_norm": 0.12739173803258835,
"learning_rate": 1.3685875425995064e-06,
"loss": 1.0191,
"step": 2322
},
{
"epoch": 5.9411764705882355,
"grad_norm": 0.13795008867321654,
"learning_rate": 1.3621536941189107e-06,
"loss": 1.0144,
"step": 2323
},
{
"epoch": 5.943734015345268,
"grad_norm": 0.12984194360371934,
"learning_rate": 1.355733898874173e-06,
"loss": 1.049,
"step": 2324
},
{
"epoch": 5.946291560102302,
"grad_norm": 0.13129623864662363,
"learning_rate": 1.3493281673098956e-06,
"loss": 1.015,
"step": 2325
},
{
"epoch": 5.948849104859335,
"grad_norm": 0.12793818903871373,
"learning_rate": 1.3429365098478087e-06,
"loss": 0.9981,
"step": 2326
},
{
"epoch": 5.951406649616368,
"grad_norm": 0.1255755665233896,
"learning_rate": 1.3365589368867371e-06,
"loss": 0.9794,
"step": 2327
},
{
"epoch": 5.953964194373402,
"grad_norm": 0.1279352390496069,
"learning_rate": 1.330195458802591e-06,
"loss": 1.0249,
"step": 2328
},
{
"epoch": 5.956521739130435,
"grad_norm": 0.128293917496119,
"learning_rate": 1.323846085948356e-06,
"loss": 0.9898,
"step": 2329
},
{
"epoch": 5.959079283887468,
"grad_norm": 0.12767639872018413,
"learning_rate": 1.3175108286540617e-06,
"loss": 1.0352,
"step": 2330
},
{
"epoch": 5.961636828644501,
"grad_norm": 0.12662645466299385,
"learning_rate": 1.3111896972267768e-06,
"loss": 1.0055,
"step": 2331
},
{
"epoch": 5.964194373401535,
"grad_norm": 0.12253304775794958,
"learning_rate": 1.3048827019505828e-06,
"loss": 0.9892,
"step": 2332
},
{
"epoch": 5.966751918158568,
"grad_norm": 0.13233724231669944,
"learning_rate": 1.2985898530865736e-06,
"loss": 0.9883,
"step": 2333
},
{
"epoch": 5.969309462915601,
"grad_norm": 0.12275354609893704,
"learning_rate": 1.2923111608728168e-06,
"loss": 1.0221,
"step": 2334
},
{
"epoch": 5.971867007672635,
"grad_norm": 0.13544461017695578,
"learning_rate": 1.2860466355243506e-06,
"loss": 1.0587,
"step": 2335
},
{
"epoch": 5.974424552429667,
"grad_norm": 0.125504059793445,
"learning_rate": 1.2797962872331693e-06,
"loss": 1.0096,
"step": 2336
},
{
"epoch": 5.976982097186701,
"grad_norm": 0.13226317160144294,
"learning_rate": 1.2735601261681985e-06,
"loss": 1.0489,
"step": 2337
},
{
"epoch": 5.979539641943734,
"grad_norm": 0.12803280744387227,
"learning_rate": 1.2673381624752813e-06,
"loss": 1.0307,
"step": 2338
},
{
"epoch": 5.982097186700767,
"grad_norm": 0.12863654527584692,
"learning_rate": 1.2611304062771613e-06,
"loss": 1.017,
"step": 2339
},
{
"epoch": 5.9846547314578,
"grad_norm": 0.12401870969986709,
"learning_rate": 1.254936867673474e-06,
"loss": 1.0056,
"step": 2340
},
{
"epoch": 5.987212276214834,
"grad_norm": 0.11891932350440772,
"learning_rate": 1.2487575567407184e-06,
"loss": 0.9998,
"step": 2341
},
{
"epoch": 5.989769820971867,
"grad_norm": 0.12341714944406178,
"learning_rate": 1.2425924835322422e-06,
"loss": 1.0247,
"step": 2342
},
{
"epoch": 5.9923273657289,
"grad_norm": 0.1229416512376773,
"learning_rate": 1.2364416580782413e-06,
"loss": 1.0195,
"step": 2343
},
{
"epoch": 5.994884910485934,
"grad_norm": 0.12303637728566778,
"learning_rate": 1.2303050903857195e-06,
"loss": 1.0156,
"step": 2344
},
{
"epoch": 5.997442455242966,
"grad_norm": 0.13561743214244987,
"learning_rate": 1.2241827904384928e-06,
"loss": 1.0304,
"step": 2345
},
{
"epoch": 6.0,
"grad_norm": 0.11664031093263695,
"learning_rate": 1.2180747681971539e-06,
"loss": 1.0047,
"step": 2346
},
{
"epoch": 6.002557544757034,
"grad_norm": 0.1230389316598828,
"learning_rate": 1.211981033599079e-06,
"loss": 1.0416,
"step": 2347
},
{
"epoch": 6.005115089514066,
"grad_norm": 0.12948288079807183,
"learning_rate": 1.2059015965583908e-06,
"loss": 1.0123,
"step": 2348
},
{
"epoch": 6.0076726342711,
"grad_norm": 0.1207876296019636,
"learning_rate": 1.1998364669659524e-06,
"loss": 0.9796,
"step": 2349
},
{
"epoch": 6.010230179028133,
"grad_norm": 0.1191785329656778,
"learning_rate": 1.1937856546893533e-06,
"loss": 0.9862,
"step": 2350
},
{
"epoch": 6.012787723785166,
"grad_norm": 0.12106597514269477,
"learning_rate": 1.1877491695728827e-06,
"loss": 1.0181,
"step": 2351
},
{
"epoch": 6.015345268542199,
"grad_norm": 0.12714775517717014,
"learning_rate": 1.181727021437531e-06,
"loss": 0.9901,
"step": 2352
},
{
"epoch": 6.017902813299233,
"grad_norm": 0.12314221662217836,
"learning_rate": 1.1757192200809487e-06,
"loss": 1.0139,
"step": 2353
},
{
"epoch": 6.020460358056266,
"grad_norm": 0.1205656248704543,
"learning_rate": 1.1697257752774581e-06,
"loss": 1.0064,
"step": 2354
},
{
"epoch": 6.023017902813299,
"grad_norm": 0.12375532206452915,
"learning_rate": 1.1637466967780186e-06,
"loss": 1.0055,
"step": 2355
},
{
"epoch": 6.025575447570333,
"grad_norm": 0.13727612152509278,
"learning_rate": 1.1577819943102132e-06,
"loss": 1.0334,
"step": 2356
},
{
"epoch": 6.028132992327365,
"grad_norm": 0.13743682672187252,
"learning_rate": 1.1518316775782456e-06,
"loss": 1.063,
"step": 2357
},
{
"epoch": 6.030690537084399,
"grad_norm": 0.1269152481030464,
"learning_rate": 1.1458957562629048e-06,
"loss": 1.0245,
"step": 2358
},
{
"epoch": 6.033248081841432,
"grad_norm": 0.12054742496527425,
"learning_rate": 1.1399742400215685e-06,
"loss": 1.016,
"step": 2359
},
{
"epoch": 6.035805626598465,
"grad_norm": 0.11563655740461991,
"learning_rate": 1.1340671384881664e-06,
"loss": 1.0034,
"step": 2360
},
{
"epoch": 6.038363171355499,
"grad_norm": 0.12654719374228424,
"learning_rate": 1.128174461273187e-06,
"loss": 1.0303,
"step": 2361
},
{
"epoch": 6.040920716112532,
"grad_norm": 0.13400791982749355,
"learning_rate": 1.122296217963651e-06,
"loss": 0.9908,
"step": 2362
},
{
"epoch": 6.043478260869565,
"grad_norm": 0.13721318190820386,
"learning_rate": 1.116432418123088e-06,
"loss": 1.0143,
"step": 2363
},
{
"epoch": 6.046035805626598,
"grad_norm": 0.1331473057560735,
"learning_rate": 1.1105830712915355e-06,
"loss": 1.0389,
"step": 2364
},
{
"epoch": 6.048593350383632,
"grad_norm": 0.12186052033355585,
"learning_rate": 1.1047481869855136e-06,
"loss": 0.9923,
"step": 2365
},
{
"epoch": 6.051150895140665,
"grad_norm": 0.130398414275441,
"learning_rate": 1.0989277746980186e-06,
"loss": 0.9989,
"step": 2366
},
{
"epoch": 6.053708439897698,
"grad_norm": 0.1212752348474763,
"learning_rate": 1.0931218438984903e-06,
"loss": 1.0002,
"step": 2367
},
{
"epoch": 6.056265984654732,
"grad_norm": 0.12066129403697316,
"learning_rate": 1.0873304040328193e-06,
"loss": 0.9855,
"step": 2368
},
{
"epoch": 6.0588235294117645,
"grad_norm": 0.12980745503624036,
"learning_rate": 1.0815534645233182e-06,
"loss": 1.0108,
"step": 2369
},
{
"epoch": 6.061381074168798,
"grad_norm": 0.12190895753762201,
"learning_rate": 1.075791034768704e-06,
"loss": 1.0134,
"step": 2370
},
{
"epoch": 6.063938618925831,
"grad_norm": 0.11736296572501317,
"learning_rate": 1.0700431241440888e-06,
"loss": 0.9819,
"step": 2371
},
{
"epoch": 6.0664961636828645,
"grad_norm": 0.11803134631202541,
"learning_rate": 1.064309742000963e-06,
"loss": 0.999,
"step": 2372
},
{
"epoch": 6.069053708439898,
"grad_norm": 0.12274428069266924,
"learning_rate": 1.0585908976671844e-06,
"loss": 1.0263,
"step": 2373
},
{
"epoch": 6.071611253196931,
"grad_norm": 0.1280904409678555,
"learning_rate": 1.052886600446954e-06,
"loss": 0.9989,
"step": 2374
},
{
"epoch": 6.0741687979539645,
"grad_norm": 0.13800491036101872,
"learning_rate": 1.0471968596208026e-06,
"loss": 1.0168,
"step": 2375
},
{
"epoch": 6.076726342710997,
"grad_norm": 0.125255996087832,
"learning_rate": 1.0415216844455889e-06,
"loss": 1.0016,
"step": 2376
},
{
"epoch": 6.079283887468031,
"grad_norm": 0.12500402095406113,
"learning_rate": 1.0358610841544657e-06,
"loss": 1.0207,
"step": 2377
},
{
"epoch": 6.081841432225064,
"grad_norm": 0.12102753345414748,
"learning_rate": 1.0302150679568745e-06,
"loss": 0.9889,
"step": 2378
},
{
"epoch": 6.084398976982097,
"grad_norm": 0.1263965580697967,
"learning_rate": 1.0245836450385304e-06,
"loss": 1.0278,
"step": 2379
},
{
"epoch": 6.086956521739131,
"grad_norm": 0.12426986420829644,
"learning_rate": 1.0189668245614092e-06,
"loss": 1.0024,
"step": 2380
},
{
"epoch": 6.089514066496164,
"grad_norm": 0.12124987678343191,
"learning_rate": 1.0133646156637244e-06,
"loss": 1.0346,
"step": 2381
},
{
"epoch": 6.092071611253197,
"grad_norm": 0.11760759251820775,
"learning_rate": 1.0077770274599187e-06,
"loss": 1.0176,
"step": 2382
},
{
"epoch": 6.09462915601023,
"grad_norm": 0.11882704515829542,
"learning_rate": 1.002204069040652e-06,
"loss": 0.9894,
"step": 2383
},
{
"epoch": 6.0971867007672635,
"grad_norm": 0.12369290549039276,
"learning_rate": 9.966457494727777e-07,
"loss": 1.04,
"step": 2384
},
{
"epoch": 6.099744245524296,
"grad_norm": 0.12345493397851956,
"learning_rate": 9.91102077799333e-07,
"loss": 1.0049,
"step": 2385
},
{
"epoch": 6.10230179028133,
"grad_norm": 0.12872126244712379,
"learning_rate": 9.855730630395244e-07,
"loss": 0.9933,
"step": 2386
},
{
"epoch": 6.1048593350383635,
"grad_norm": 0.11772835201472491,
"learning_rate": 9.800587141887173e-07,
"loss": 1.0285,
"step": 2387
},
{
"epoch": 6.107416879795396,
"grad_norm": 0.12252902927138364,
"learning_rate": 9.745590402184092e-07,
"loss": 1.0134,
"step": 2388
},
{
"epoch": 6.10997442455243,
"grad_norm": 0.12214679346044635,
"learning_rate": 9.690740500762241e-07,
"loss": 0.9778,
"step": 2389
},
{
"epoch": 6.112531969309463,
"grad_norm": 0.12270563199721099,
"learning_rate": 9.636037526859032e-07,
"loss": 1.0048,
"step": 2390
},
{
"epoch": 6.115089514066496,
"grad_norm": 0.13289561214559903,
"learning_rate": 9.58148156947276e-07,
"loss": 1.0355,
"step": 2391
},
{
"epoch": 6.117647058823529,
"grad_norm": 0.124015797218616,
"learning_rate": 9.52707271736254e-07,
"loss": 0.9894,
"step": 2392
},
{
"epoch": 6.120204603580563,
"grad_norm": 0.12869746602968873,
"learning_rate": 9.472811059048182e-07,
"loss": 1.034,
"step": 2393
},
{
"epoch": 6.122762148337596,
"grad_norm": 0.11502225665357182,
"learning_rate": 9.418696682810014e-07,
"loss": 1.0279,
"step": 2394
},
{
"epoch": 6.125319693094629,
"grad_norm": 0.12442843747682036,
"learning_rate": 9.364729676688755e-07,
"loss": 1.0346,
"step": 2395
},
{
"epoch": 6.127877237851663,
"grad_norm": 0.12203934311867798,
"learning_rate": 9.310910128485317e-07,
"loss": 1.0042,
"step": 2396
},
{
"epoch": 6.130434782608695,
"grad_norm": 0.13225053449453802,
"learning_rate": 9.257238125760781e-07,
"loss": 0.9979,
"step": 2397
},
{
"epoch": 6.132992327365729,
"grad_norm": 0.11626249473093271,
"learning_rate": 9.203713755836108e-07,
"loss": 1.0151,
"step": 2398
},
{
"epoch": 6.135549872122763,
"grad_norm": 0.12565196489418815,
"learning_rate": 9.150337105792129e-07,
"loss": 1.0003,
"step": 2399
},
{
"epoch": 6.138107416879795,
"grad_norm": 0.1176707888425743,
"learning_rate": 9.097108262469268e-07,
"loss": 1.0174,
"step": 2400
},
{
"epoch": 6.140664961636829,
"grad_norm": 0.1254506125476653,
"learning_rate": 9.044027312467574e-07,
"loss": 1.024,
"step": 2401
},
{
"epoch": 6.143222506393862,
"grad_norm": 0.12040306772801906,
"learning_rate": 8.991094342146423e-07,
"loss": 1.0238,
"step": 2402
},
{
"epoch": 6.145780051150895,
"grad_norm": 0.12003711394998114,
"learning_rate": 8.938309437624415e-07,
"loss": 1.0361,
"step": 2403
},
{
"epoch": 6.148337595907928,
"grad_norm": 0.1222116778211444,
"learning_rate": 8.885672684779345e-07,
"loss": 1.0195,
"step": 2404
},
{
"epoch": 6.150895140664962,
"grad_norm": 0.12213600424627216,
"learning_rate": 8.833184169247877e-07,
"loss": 1.0147,
"step": 2405
},
{
"epoch": 6.153452685421995,
"grad_norm": 0.11882499943476486,
"learning_rate": 8.780843976425568e-07,
"loss": 1.0443,
"step": 2406
},
{
"epoch": 6.156010230179028,
"grad_norm": 0.11944071935758879,
"learning_rate": 8.728652191466602e-07,
"loss": 1.0269,
"step": 2407
},
{
"epoch": 6.158567774936062,
"grad_norm": 0.12479032723786981,
"learning_rate": 8.676608899283789e-07,
"loss": 1.0407,
"step": 2408
},
{
"epoch": 6.161125319693094,
"grad_norm": 0.1232368778241773,
"learning_rate": 8.62471418454831e-07,
"loss": 0.998,
"step": 2409
},
{
"epoch": 6.163682864450128,
"grad_norm": 0.12380002645622601,
"learning_rate": 8.572968131689585e-07,
"loss": 1.0215,
"step": 2410
},
{
"epoch": 6.166240409207161,
"grad_norm": 0.11990258505813678,
"learning_rate": 8.521370824895236e-07,
"loss": 1.0362,
"step": 2411
},
{
"epoch": 6.168797953964194,
"grad_norm": 0.12763582460814127,
"learning_rate": 8.469922348110871e-07,
"loss": 1.0005,
"step": 2412
},
{
"epoch": 6.171355498721228,
"grad_norm": 0.12048771338001237,
"learning_rate": 8.41862278503991e-07,
"loss": 1.0154,
"step": 2413
},
{
"epoch": 6.173913043478261,
"grad_norm": 0.11110330026915051,
"learning_rate": 8.367472219143524e-07,
"loss": 0.9864,
"step": 2414
},
{
"epoch": 6.176470588235294,
"grad_norm": 0.12274015937027666,
"learning_rate": 8.316470733640525e-07,
"loss": 1.01,
"step": 2415
},
{
"epoch": 6.179028132992327,
"grad_norm": 0.11875414799502092,
"learning_rate": 8.265618411507148e-07,
"loss": 1.0349,
"step": 2416
},
{
"epoch": 6.181585677749361,
"grad_norm": 0.12112785116554001,
"learning_rate": 8.214915335476892e-07,
"loss": 1.0108,
"step": 2417
},
{
"epoch": 6.1841432225063935,
"grad_norm": 0.11843273179000395,
"learning_rate": 8.164361588040526e-07,
"loss": 1.0316,
"step": 2418
},
{
"epoch": 6.186700767263427,
"grad_norm": 0.12171206599055973,
"learning_rate": 8.113957251445837e-07,
"loss": 1.0181,
"step": 2419
},
{
"epoch": 6.189258312020461,
"grad_norm": 0.1332901069553243,
"learning_rate": 8.063702407697515e-07,
"loss": 1.0163,
"step": 2420
},
{
"epoch": 6.1918158567774935,
"grad_norm": 0.12665149802988054,
"learning_rate": 8.013597138557039e-07,
"loss": 1.0316,
"step": 2421
},
{
"epoch": 6.194373401534527,
"grad_norm": 0.11748240466353733,
"learning_rate": 7.963641525542564e-07,
"loss": 1.0295,
"step": 2422
},
{
"epoch": 6.19693094629156,
"grad_norm": 0.12263136155853388,
"learning_rate": 7.913835649928792e-07,
"loss": 1.0443,
"step": 2423
},
{
"epoch": 6.1994884910485935,
"grad_norm": 0.12057268564537553,
"learning_rate": 7.864179592746679e-07,
"loss": 0.9758,
"step": 2424
},
{
"epoch": 6.202046035805626,
"grad_norm": 0.11757878694680841,
"learning_rate": 7.814673434783604e-07,
"loss": 0.998,
"step": 2425
},
{
"epoch": 6.20460358056266,
"grad_norm": 0.18582779787648557,
"learning_rate": 7.765317256582949e-07,
"loss": 1.0115,
"step": 2426
},
{
"epoch": 6.207161125319693,
"grad_norm": 0.13582232353707813,
"learning_rate": 7.716111138444115e-07,
"loss": 1.0459,
"step": 2427
},
{
"epoch": 6.209718670076726,
"grad_norm": 0.13389475712289786,
"learning_rate": 7.667055160422432e-07,
"loss": 1.0274,
"step": 2428
},
{
"epoch": 6.21227621483376,
"grad_norm": 0.12673104354118297,
"learning_rate": 7.618149402328867e-07,
"loss": 1.0011,
"step": 2429
},
{
"epoch": 6.2148337595907925,
"grad_norm": 0.12765584122890725,
"learning_rate": 7.569393943730064e-07,
"loss": 1.0635,
"step": 2430
},
{
"epoch": 6.217391304347826,
"grad_norm": 0.11473857666105772,
"learning_rate": 7.52078886394807e-07,
"loss": 0.9878,
"step": 2431
},
{
"epoch": 6.21994884910486,
"grad_norm": 0.12228794360420046,
"learning_rate": 7.472334242060331e-07,
"loss": 1.0316,
"step": 2432
},
{
"epoch": 6.2225063938618925,
"grad_norm": 0.12426451417815787,
"learning_rate": 7.424030156899475e-07,
"loss": 1.0098,
"step": 2433
},
{
"epoch": 6.225063938618926,
"grad_norm": 0.11800919098475897,
"learning_rate": 7.375876687053252e-07,
"loss": 1.0508,
"step": 2434
},
{
"epoch": 6.227621483375959,
"grad_norm": 0.1309293626602563,
"learning_rate": 7.327873910864325e-07,
"loss": 1.0265,
"step": 2435
},
{
"epoch": 6.2301790281329925,
"grad_norm": 0.12364264713239634,
"learning_rate": 7.280021906430201e-07,
"loss": 1.038,
"step": 2436
},
{
"epoch": 6.232736572890025,
"grad_norm": 0.12731230734269985,
"learning_rate": 7.23232075160315e-07,
"loss": 0.9938,
"step": 2437
},
{
"epoch": 6.235294117647059,
"grad_norm": 0.11754730324986598,
"learning_rate": 7.184770523989904e-07,
"loss": 1.0209,
"step": 2438
},
{
"epoch": 6.2378516624040925,
"grad_norm": 0.12687711722398867,
"learning_rate": 7.137371300951746e-07,
"loss": 1.0369,
"step": 2439
},
{
"epoch": 6.240409207161125,
"grad_norm": 0.1226944492744433,
"learning_rate": 7.090123159604234e-07,
"loss": 1.0417,
"step": 2440
},
{
"epoch": 6.242966751918159,
"grad_norm": 0.11721843519340895,
"learning_rate": 7.043026176817158e-07,
"loss": 0.99,
"step": 2441
},
{
"epoch": 6.245524296675192,
"grad_norm": 0.12080675281454777,
"learning_rate": 6.996080429214347e-07,
"loss": 1.0065,
"step": 2442
},
{
"epoch": 6.248081841432225,
"grad_norm": 0.12010992913398671,
"learning_rate": 6.949285993173593e-07,
"loss": 1.0359,
"step": 2443
},
{
"epoch": 6.250639386189258,
"grad_norm": 0.11624614678372433,
"learning_rate": 6.902642944826544e-07,
"loss": 0.97,
"step": 2444
},
{
"epoch": 6.253196930946292,
"grad_norm": 0.12257573737475404,
"learning_rate": 6.856151360058505e-07,
"loss": 1.0192,
"step": 2445
},
{
"epoch": 6.255754475703325,
"grad_norm": 0.1201829684398593,
"learning_rate": 6.809811314508386e-07,
"loss": 1.0466,
"step": 2446
},
{
"epoch": 6.258312020460358,
"grad_norm": 0.12401967000820303,
"learning_rate": 6.763622883568521e-07,
"loss": 1.0356,
"step": 2447
},
{
"epoch": 6.260869565217392,
"grad_norm": 0.11778396980454381,
"learning_rate": 6.717586142384624e-07,
"loss": 1.036,
"step": 2448
},
{
"epoch": 6.263427109974424,
"grad_norm": 0.12185872889499474,
"learning_rate": 6.671701165855593e-07,
"loss": 1.0261,
"step": 2449
},
{
"epoch": 6.265984654731458,
"grad_norm": 0.1201489344194391,
"learning_rate": 6.625968028633389e-07,
"loss": 1.0119,
"step": 2450
},
{
"epoch": 6.268542199488491,
"grad_norm": 0.11988021977061444,
"learning_rate": 6.580386805122996e-07,
"loss": 1.021,
"step": 2451
},
{
"epoch": 6.271099744245524,
"grad_norm": 0.11792524228657224,
"learning_rate": 6.534957569482214e-07,
"loss": 1.0635,
"step": 2452
},
{
"epoch": 6.273657289002558,
"grad_norm": 0.11687466392592072,
"learning_rate": 6.489680395621556e-07,
"loss": 1.0129,
"step": 2453
},
{
"epoch": 6.276214833759591,
"grad_norm": 0.12220153331468454,
"learning_rate": 6.444555357204152e-07,
"loss": 0.9876,
"step": 2454
},
{
"epoch": 6.278772378516624,
"grad_norm": 0.11658584388896727,
"learning_rate": 6.39958252764562e-07,
"loss": 1.0258,
"step": 2455
},
{
"epoch": 6.281329923273657,
"grad_norm": 0.11595243705777233,
"learning_rate": 6.354761980113966e-07,
"loss": 1.0364,
"step": 2456
},
{
"epoch": 6.283887468030691,
"grad_norm": 0.11948349789713839,
"learning_rate": 6.31009378752937e-07,
"loss": 1.0295,
"step": 2457
},
{
"epoch": 6.286445012787723,
"grad_norm": 0.11578209417911318,
"learning_rate": 6.265578022564233e-07,
"loss": 1.003,
"step": 2458
},
{
"epoch": 6.289002557544757,
"grad_norm": 0.11954141892522423,
"learning_rate": 6.221214757642901e-07,
"loss": 1.0186,
"step": 2459
},
{
"epoch": 6.291560102301791,
"grad_norm": 0.1214032884466788,
"learning_rate": 6.177004064941616e-07,
"loss": 1.0325,
"step": 2460
},
{
"epoch": 6.294117647058823,
"grad_norm": 0.11798550854551848,
"learning_rate": 6.132946016388453e-07,
"loss": 1.0034,
"step": 2461
},
{
"epoch": 6.296675191815857,
"grad_norm": 0.12025821516068275,
"learning_rate": 6.089040683663083e-07,
"loss": 0.9823,
"step": 2462
},
{
"epoch": 6.29923273657289,
"grad_norm": 0.11951253909474888,
"learning_rate": 6.045288138196725e-07,
"loss": 1.0409,
"step": 2463
},
{
"epoch": 6.301790281329923,
"grad_norm": 0.11418311978255119,
"learning_rate": 6.001688451172027e-07,
"loss": 1.0022,
"step": 2464
},
{
"epoch": 6.304347826086957,
"grad_norm": 0.11934858308797691,
"learning_rate": 5.958241693522993e-07,
"loss": 1.0107,
"step": 2465
},
{
"epoch": 6.30690537084399,
"grad_norm": 0.12241414028875457,
"learning_rate": 5.914947935934756e-07,
"loss": 0.9971,
"step": 2466
},
{
"epoch": 6.309462915601023,
"grad_norm": 0.11903591318763888,
"learning_rate": 5.871807248843542e-07,
"loss": 1.0117,
"step": 2467
},
{
"epoch": 6.312020460358056,
"grad_norm": 0.11896713837542751,
"learning_rate": 5.828819702436573e-07,
"loss": 1.0199,
"step": 2468
},
{
"epoch": 6.31457800511509,
"grad_norm": 0.12256891371488562,
"learning_rate": 5.785985366651892e-07,
"loss": 1.003,
"step": 2469
},
{
"epoch": 6.3171355498721224,
"grad_norm": 0.1224791957117775,
"learning_rate": 5.743304311178289e-07,
"loss": 1.0067,
"step": 2470
},
{
"epoch": 6.319693094629156,
"grad_norm": 0.12119833550268867,
"learning_rate": 5.70077660545515e-07,
"loss": 1.0196,
"step": 2471
},
{
"epoch": 6.322250639386189,
"grad_norm": 0.11520605275376457,
"learning_rate": 5.658402318672418e-07,
"loss": 1.0127,
"step": 2472
},
{
"epoch": 6.324808184143222,
"grad_norm": 0.11525398133510434,
"learning_rate": 5.616181519770414e-07,
"loss": 1.0161,
"step": 2473
},
{
"epoch": 6.327365728900256,
"grad_norm": 0.12176149506861418,
"learning_rate": 5.574114277439702e-07,
"loss": 1.0216,
"step": 2474
},
{
"epoch": 6.329923273657289,
"grad_norm": 0.12541686899065785,
"learning_rate": 5.53220066012109e-07,
"loss": 1.0263,
"step": 2475
},
{
"epoch": 6.332480818414322,
"grad_norm": 0.12958665943781433,
"learning_rate": 5.490440736005397e-07,
"loss": 1.0737,
"step": 2476
},
{
"epoch": 6.335038363171355,
"grad_norm": 0.1273940622092984,
"learning_rate": 5.448834573033424e-07,
"loss": 1.028,
"step": 2477
},
{
"epoch": 6.337595907928389,
"grad_norm": 0.11799709709320902,
"learning_rate": 5.407382238895765e-07,
"loss": 0.9949,
"step": 2478
},
{
"epoch": 6.340153452685422,
"grad_norm": 0.1220634348791913,
"learning_rate": 5.366083801032806e-07,
"loss": 1.0422,
"step": 2479
},
{
"epoch": 6.342710997442455,
"grad_norm": 0.11889607141087616,
"learning_rate": 5.324939326634515e-07,
"loss": 1.0017,
"step": 2480
},
{
"epoch": 6.345268542199489,
"grad_norm": 0.12002156059223426,
"learning_rate": 5.283948882640355e-07,
"loss": 1.0181,
"step": 2481
},
{
"epoch": 6.3478260869565215,
"grad_norm": 0.11596540294437355,
"learning_rate": 5.24311253573927e-07,
"loss": 1.0346,
"step": 2482
},
{
"epoch": 6.350383631713555,
"grad_norm": 0.11502520531650343,
"learning_rate": 5.202430352369392e-07,
"loss": 1.0135,
"step": 2483
},
{
"epoch": 6.352941176470588,
"grad_norm": 0.12267491898314155,
"learning_rate": 5.161902398718121e-07,
"loss": 1.0435,
"step": 2484
},
{
"epoch": 6.3554987212276215,
"grad_norm": 0.12185761812901445,
"learning_rate": 5.121528740721871e-07,
"loss": 1.0377,
"step": 2485
},
{
"epoch": 6.358056265984655,
"grad_norm": 0.11976615175350093,
"learning_rate": 5.081309444066085e-07,
"loss": 1.034,
"step": 2486
},
{
"epoch": 6.360613810741688,
"grad_norm": 0.116555412280644,
"learning_rate": 5.041244574185056e-07,
"loss": 1.011,
"step": 2487
},
{
"epoch": 6.3631713554987215,
"grad_norm": 0.12515368166748755,
"learning_rate": 5.001334196261776e-07,
"loss": 0.9861,
"step": 2488
},
{
"epoch": 6.365728900255754,
"grad_norm": 0.11814447264484773,
"learning_rate": 4.961578375227982e-07,
"loss": 1.0146,
"step": 2489
},
{
"epoch": 6.368286445012788,
"grad_norm": 0.12245094109059326,
"learning_rate": 4.921977175763881e-07,
"loss": 1.0204,
"step": 2490
},
{
"epoch": 6.370843989769821,
"grad_norm": 0.12283694751475284,
"learning_rate": 4.882530662298168e-07,
"loss": 1.0313,
"step": 2491
},
{
"epoch": 6.373401534526854,
"grad_norm": 0.12224108783096758,
"learning_rate": 4.843238899007829e-07,
"loss": 1.032,
"step": 2492
},
{
"epoch": 6.375959079283888,
"grad_norm": 0.11751909048944272,
"learning_rate": 4.804101949818119e-07,
"loss": 1.0037,
"step": 2493
},
{
"epoch": 6.378516624040921,
"grad_norm": 0.1189722841334927,
"learning_rate": 4.765119878402424e-07,
"loss": 1.0218,
"step": 2494
},
{
"epoch": 6.381074168797954,
"grad_norm": 0.12188011601377355,
"learning_rate": 4.726292748182104e-07,
"loss": 1.0235,
"step": 2495
},
{
"epoch": 6.383631713554987,
"grad_norm": 0.11601162144284871,
"learning_rate": 4.687620622326505e-07,
"loss": 1.0095,
"step": 2496
},
{
"epoch": 6.3861892583120206,
"grad_norm": 0.11794823628283956,
"learning_rate": 4.6491035637527437e-07,
"loss": 1.0211,
"step": 2497
},
{
"epoch": 6.388746803069053,
"grad_norm": 0.12080963912657082,
"learning_rate": 4.6107416351256595e-07,
"loss": 0.996,
"step": 2498
},
{
"epoch": 6.391304347826087,
"grad_norm": 0.11852593163423941,
"learning_rate": 4.5725348988577057e-07,
"loss": 1.0473,
"step": 2499
},
{
"epoch": 6.3938618925831205,
"grad_norm": 0.1154582217572824,
"learning_rate": 4.5344834171088594e-07,
"loss": 0.9916,
"step": 2500
},
{
"epoch": 6.396419437340153,
"grad_norm": 0.12611349351005327,
"learning_rate": 4.496587251786544e-07,
"loss": 1.0537,
"step": 2501
},
{
"epoch": 6.398976982097187,
"grad_norm": 0.11841147140282605,
"learning_rate": 4.4588464645453856e-07,
"loss": 1.0354,
"step": 2502
},
{
"epoch": 6.40153452685422,
"grad_norm": 0.11761246404197793,
"learning_rate": 4.421261116787323e-07,
"loss": 1.0056,
"step": 2503
},
{
"epoch": 6.404092071611253,
"grad_norm": 0.116833267265145,
"learning_rate": 4.383831269661343e-07,
"loss": 0.9983,
"step": 2504
},
{
"epoch": 6.406649616368286,
"grad_norm": 0.12485584628194238,
"learning_rate": 4.3465569840635105e-07,
"loss": 1.0276,
"step": 2505
},
{
"epoch": 6.40920716112532,
"grad_norm": 0.11771747761741529,
"learning_rate": 4.309438320636705e-07,
"loss": 1.0119,
"step": 2506
},
{
"epoch": 6.411764705882353,
"grad_norm": 0.1167766752899283,
"learning_rate": 4.272475339770699e-07,
"loss": 1.0257,
"step": 2507
},
{
"epoch": 6.414322250639386,
"grad_norm": 0.11997899496687212,
"learning_rate": 4.235668101601964e-07,
"loss": 0.9887,
"step": 2508
},
{
"epoch": 6.41687979539642,
"grad_norm": 0.11897278858577053,
"learning_rate": 4.199016666013533e-07,
"loss": 1.0162,
"step": 2509
},
{
"epoch": 6.419437340153452,
"grad_norm": 0.1213013490317867,
"learning_rate": 4.1625210926350413e-07,
"loss": 1.0141,
"step": 2510
},
{
"epoch": 6.421994884910486,
"grad_norm": 0.12533002989447992,
"learning_rate": 4.1261814408424806e-07,
"loss": 1.0251,
"step": 2511
},
{
"epoch": 6.42455242966752,
"grad_norm": 0.12196478149472252,
"learning_rate": 4.089997769758225e-07,
"loss": 1.0365,
"step": 2512
},
{
"epoch": 6.427109974424552,
"grad_norm": 0.12143791187790264,
"learning_rate": 4.0539701382507847e-07,
"loss": 1.0032,
"step": 2513
},
{
"epoch": 6.429667519181586,
"grad_norm": 0.11682750481108217,
"learning_rate": 4.018098604934906e-07,
"loss": 1.0045,
"step": 2514
},
{
"epoch": 6.432225063938619,
"grad_norm": 0.11654420434670919,
"learning_rate": 3.982383228171338e-07,
"loss": 1.0122,
"step": 2515
},
{
"epoch": 6.434782608695652,
"grad_norm": 0.12087376970393812,
"learning_rate": 3.946824066066757e-07,
"loss": 1.0091,
"step": 2516
},
{
"epoch": 6.437340153452685,
"grad_norm": 0.11198028929740504,
"learning_rate": 3.9114211764736843e-07,
"loss": 0.9916,
"step": 2517
},
{
"epoch": 6.439897698209719,
"grad_norm": 0.117876547438714,
"learning_rate": 3.876174616990402e-07,
"loss": 0.9688,
"step": 2518
},
{
"epoch": 6.442455242966752,
"grad_norm": 0.11691097425539704,
"learning_rate": 3.8410844449608966e-07,
"loss": 1.0262,
"step": 2519
},
{
"epoch": 6.445012787723785,
"grad_norm": 0.12067476965271878,
"learning_rate": 3.8061507174746326e-07,
"loss": 1.0357,
"step": 2520
},
{
"epoch": 6.447570332480819,
"grad_norm": 0.11448044711242149,
"learning_rate": 3.7713734913666254e-07,
"loss": 1.0278,
"step": 2521
},
{
"epoch": 6.450127877237851,
"grad_norm": 0.11900503374045875,
"learning_rate": 3.73675282321726e-07,
"loss": 1.0293,
"step": 2522
},
{
"epoch": 6.452685421994885,
"grad_norm": 0.1237852363860751,
"learning_rate": 3.7022887693521914e-07,
"loss": 1.0432,
"step": 2523
},
{
"epoch": 6.455242966751918,
"grad_norm": 0.11395769439497158,
"learning_rate": 3.6679813858422673e-07,
"loss": 1.0451,
"step": 2524
},
{
"epoch": 6.457800511508951,
"grad_norm": 0.11755851431433859,
"learning_rate": 3.6338307285034626e-07,
"loss": 1.0166,
"step": 2525
},
{
"epoch": 6.460358056265985,
"grad_norm": 0.11537719335337888,
"learning_rate": 3.5998368528967764e-07,
"loss": 1.0221,
"step": 2526
},
{
"epoch": 6.462915601023018,
"grad_norm": 0.12098800578611382,
"learning_rate": 3.5659998143281027e-07,
"loss": 1.0474,
"step": 2527
},
{
"epoch": 6.465473145780051,
"grad_norm": 0.11989356063597686,
"learning_rate": 3.532319667848172e-07,
"loss": 1.0187,
"step": 2528
},
{
"epoch": 6.468030690537084,
"grad_norm": 0.1156244817453119,
"learning_rate": 3.498796468252508e-07,
"loss": 0.9894,
"step": 2529
},
{
"epoch": 6.470588235294118,
"grad_norm": 0.11213145863456157,
"learning_rate": 3.46543027008126e-07,
"loss": 1.0331,
"step": 2530
},
{
"epoch": 6.4731457800511505,
"grad_norm": 0.11707883319628067,
"learning_rate": 3.4322211276191176e-07,
"loss": 1.0259,
"step": 2531
},
{
"epoch": 6.475703324808184,
"grad_norm": 0.11350670721406404,
"learning_rate": 3.399169094895294e-07,
"loss": 1.0065,
"step": 2532
},
{
"epoch": 6.478260869565218,
"grad_norm": 0.11452239943111842,
"learning_rate": 3.366274225683397e-07,
"loss": 1.0382,
"step": 2533
},
{
"epoch": 6.4808184143222505,
"grad_norm": 0.11645854358551593,
"learning_rate": 3.3335365735012947e-07,
"loss": 0.9849,
"step": 2534
},
{
"epoch": 6.483375959079284,
"grad_norm": 0.1150643632230636,
"learning_rate": 3.3009561916111045e-07,
"loss": 1.0441,
"step": 2535
},
{
"epoch": 6.485933503836317,
"grad_norm": 0.11565843726243669,
"learning_rate": 3.2685331330190916e-07,
"loss": 1.0256,
"step": 2536
},
{
"epoch": 6.4884910485933505,
"grad_norm": 0.12172892123412701,
"learning_rate": 3.2362674504755385e-07,
"loss": 1.0006,
"step": 2537
},
{
"epoch": 6.491048593350383,
"grad_norm": 0.11416395245772691,
"learning_rate": 3.2041591964746767e-07,
"loss": 0.9981,
"step": 2538
},
{
"epoch": 6.493606138107417,
"grad_norm": 0.11099012627200047,
"learning_rate": 3.17220842325463e-07,
"loss": 0.9971,
"step": 2539
},
{
"epoch": 6.4961636828644505,
"grad_norm": 0.12666071845516697,
"learning_rate": 3.14041518279733e-07,
"loss": 1.019,
"step": 2540
},
{
"epoch": 6.498721227621483,
"grad_norm": 0.11694427326316041,
"learning_rate": 3.108779526828365e-07,
"loss": 1.048,
"step": 2541
},
{
"epoch": 6.501278772378517,
"grad_norm": 0.11663277776194486,
"learning_rate": 3.0773015068169876e-07,
"loss": 1.0205,
"step": 2542
},
{
"epoch": 6.5038363171355495,
"grad_norm": 0.11421370105035522,
"learning_rate": 3.045981173975965e-07,
"loss": 1.0062,
"step": 2543
},
{
"epoch": 6.506393861892583,
"grad_norm": 0.11416247400561318,
"learning_rate": 3.0148185792615137e-07,
"loss": 1.0221,
"step": 2544
},
{
"epoch": 6.508951406649617,
"grad_norm": 0.12004167269390631,
"learning_rate": 2.9838137733732343e-07,
"loss": 1.0336,
"step": 2545
},
{
"epoch": 6.5115089514066495,
"grad_norm": 0.12185027359479889,
"learning_rate": 2.9529668067539986e-07,
"loss": 1.0085,
"step": 2546
},
{
"epoch": 6.514066496163683,
"grad_norm": 0.11920181864869182,
"learning_rate": 2.922277729589906e-07,
"loss": 1.0212,
"step": 2547
},
{
"epoch": 6.516624040920716,
"grad_norm": 0.11457206340363568,
"learning_rate": 2.891746591810152e-07,
"loss": 1.0062,
"step": 2548
},
{
"epoch": 6.5191815856777495,
"grad_norm": 0.11396161204686395,
"learning_rate": 2.86137344308699e-07,
"loss": 1.0269,
"step": 2549
},
{
"epoch": 6.521739130434782,
"grad_norm": 0.11716042134956894,
"learning_rate": 2.8311583328356485e-07,
"loss": 1.0513,
"step": 2550
},
{
"epoch": 6.524296675191816,
"grad_norm": 0.11082138416428153,
"learning_rate": 2.801101310214205e-07,
"loss": 1.0133,
"step": 2551
},
{
"epoch": 6.526854219948849,
"grad_norm": 0.11831445098631707,
"learning_rate": 2.7712024241235757e-07,
"loss": 1.0184,
"step": 2552
},
{
"epoch": 6.529411764705882,
"grad_norm": 0.11918281125426747,
"learning_rate": 2.7414617232073505e-07,
"loss": 1.0344,
"step": 2553
},
{
"epoch": 6.531969309462916,
"grad_norm": 0.11681313613977624,
"learning_rate": 2.7118792558518237e-07,
"loss": 1.0219,
"step": 2554
},
{
"epoch": 6.534526854219949,
"grad_norm": 0.12570449518559115,
"learning_rate": 2.6824550701857966e-07,
"loss": 1.0192,
"step": 2555
},
{
"epoch": 6.537084398976982,
"grad_norm": 0.11631595597156608,
"learning_rate": 2.653189214080576e-07,
"loss": 0.9885,
"step": 2556
},
{
"epoch": 6.539641943734015,
"grad_norm": 0.11976742856004091,
"learning_rate": 2.624081735149897e-07,
"loss": 1.0225,
"step": 2557
},
{
"epoch": 6.542199488491049,
"grad_norm": 0.11687676414472607,
"learning_rate": 2.5951326807498123e-07,
"loss": 1.0051,
"step": 2558
},
{
"epoch": 6.544757033248082,
"grad_norm": 0.11626243542745685,
"learning_rate": 2.5663420979785915e-07,
"loss": 1.0256,
"step": 2559
},
{
"epoch": 6.547314578005115,
"grad_norm": 0.11473271542819383,
"learning_rate": 2.5377100336767547e-07,
"loss": 1.0134,
"step": 2560
},
{
"epoch": 6.549872122762149,
"grad_norm": 0.11617767916671155,
"learning_rate": 2.509236534426851e-07,
"loss": 1.0045,
"step": 2561
},
{
"epoch": 6.552429667519181,
"grad_norm": 0.11177045938404909,
"learning_rate": 2.4809216465534913e-07,
"loss": 1.0377,
"step": 2562
},
{
"epoch": 6.554987212276215,
"grad_norm": 0.11344781404055954,
"learning_rate": 2.4527654161232153e-07,
"loss": 1.0037,
"step": 2563
},
{
"epoch": 6.557544757033249,
"grad_norm": 0.12399390000812018,
"learning_rate": 2.424767888944468e-07,
"loss": 1.0462,
"step": 2564
},
{
"epoch": 6.560102301790281,
"grad_norm": 0.11847061868510626,
"learning_rate": 2.3969291105674805e-07,
"loss": 0.9959,
"step": 2565
},
{
"epoch": 6.562659846547315,
"grad_norm": 0.116920831153564,
"learning_rate": 2.3692491262841788e-07,
"loss": 0.9783,
"step": 2566
},
{
"epoch": 6.565217391304348,
"grad_norm": 0.12018087616989655,
"learning_rate": 2.3417279811281947e-07,
"loss": 0.9778,
"step": 2567
},
{
"epoch": 6.567774936061381,
"grad_norm": 0.11727845557913934,
"learning_rate": 2.3143657198746893e-07,
"loss": 1.042,
"step": 2568
},
{
"epoch": 6.570332480818414,
"grad_norm": 0.1156893274747709,
"learning_rate": 2.2871623870403649e-07,
"loss": 1.0302,
"step": 2569
},
{
"epoch": 6.572890025575448,
"grad_norm": 0.11720330890092409,
"learning_rate": 2.260118026883318e-07,
"loss": 1.0267,
"step": 2570
},
{
"epoch": 6.57544757033248,
"grad_norm": 0.11688767903985245,
"learning_rate": 2.233232683403075e-07,
"loss": 1.0292,
"step": 2571
},
{
"epoch": 6.578005115089514,
"grad_norm": 0.11603026043379294,
"learning_rate": 2.206506400340369e-07,
"loss": 1.0017,
"step": 2572
},
{
"epoch": 6.580562659846548,
"grad_norm": 0.11389458080146765,
"learning_rate": 2.1799392211772074e-07,
"loss": 1.0082,
"step": 2573
},
{
"epoch": 6.58312020460358,
"grad_norm": 0.1161474107114186,
"learning_rate": 2.1535311891367373e-07,
"loss": 1.0219,
"step": 2574
},
{
"epoch": 6.585677749360614,
"grad_norm": 0.11523869949699879,
"learning_rate": 2.1272823471831573e-07,
"loss": 1.0048,
"step": 2575
},
{
"epoch": 6.588235294117647,
"grad_norm": 0.11447790591214169,
"learning_rate": 2.101192738021718e-07,
"loss": 1.0116,
"step": 2576
},
{
"epoch": 6.59079283887468,
"grad_norm": 0.11643651666513412,
"learning_rate": 2.0752624040985436e-07,
"loss": 1.0117,
"step": 2577
},
{
"epoch": 6.593350383631714,
"grad_norm": 0.12040988081003166,
"learning_rate": 2.0494913876007105e-07,
"loss": 1.0255,
"step": 2578
},
{
"epoch": 6.595907928388747,
"grad_norm": 0.11872708662460554,
"learning_rate": 2.0238797304560243e-07,
"loss": 1.0241,
"step": 2579
},
{
"epoch": 6.59846547314578,
"grad_norm": 0.10983144316407795,
"learning_rate": 1.9984274743330424e-07,
"loss": 1.0106,
"step": 2580
},
{
"epoch": 6.601023017902813,
"grad_norm": 0.112895943367732,
"learning_rate": 1.9731346606410185e-07,
"loss": 1.0405,
"step": 2581
},
{
"epoch": 6.603580562659847,
"grad_norm": 0.11309181158689928,
"learning_rate": 1.9480013305297585e-07,
"loss": 1.0286,
"step": 2582
},
{
"epoch": 6.6061381074168795,
"grad_norm": 0.11579577875848088,
"learning_rate": 1.9230275248896425e-07,
"loss": 1.0137,
"step": 2583
},
{
"epoch": 6.608695652173913,
"grad_norm": 0.11932271374275923,
"learning_rate": 1.8982132843514577e-07,
"loss": 1.0352,
"step": 2584
},
{
"epoch": 6.611253196930946,
"grad_norm": 0.1187240263728754,
"learning_rate": 1.8735586492864556e-07,
"loss": 0.9899,
"step": 2585
},
{
"epoch": 6.6138107416879794,
"grad_norm": 0.12010362235501355,
"learning_rate": 1.8490636598061605e-07,
"loss": 1.0202,
"step": 2586
},
{
"epoch": 6.616368286445013,
"grad_norm": 0.11896072789581243,
"learning_rate": 1.8247283557624062e-07,
"loss": 1.0801,
"step": 2587
},
{
"epoch": 6.618925831202046,
"grad_norm": 0.11269695438058397,
"learning_rate": 1.8005527767471998e-07,
"loss": 1.0323,
"step": 2588
},
{
"epoch": 6.621483375959079,
"grad_norm": 0.11595014960172056,
"learning_rate": 1.7765369620926899e-07,
"loss": 1.0247,
"step": 2589
},
{
"epoch": 6.624040920716112,
"grad_norm": 0.11457210948093192,
"learning_rate": 1.752680950871144e-07,
"loss": 1.0561,
"step": 2590
},
{
"epoch": 6.626598465473146,
"grad_norm": 0.11577860483951284,
"learning_rate": 1.7289847818947492e-07,
"loss": 1.0182,
"step": 2591
},
{
"epoch": 6.629156010230179,
"grad_norm": 0.11240383490721378,
"learning_rate": 1.7054484937157112e-07,
"loss": 1.0255,
"step": 2592
},
{
"epoch": 6.631713554987212,
"grad_norm": 0.11631232042116323,
"learning_rate": 1.6820721246261106e-07,
"loss": 1.0299,
"step": 2593
},
{
"epoch": 6.634271099744246,
"grad_norm": 0.11273655621311057,
"learning_rate": 1.6588557126578365e-07,
"loss": 1.0407,
"step": 2594
},
{
"epoch": 6.6368286445012785,
"grad_norm": 0.11767164102993428,
"learning_rate": 1.6357992955825297e-07,
"loss": 1.0145,
"step": 2595
},
{
"epoch": 6.639386189258312,
"grad_norm": 0.11534695075999606,
"learning_rate": 1.6129029109115401e-07,
"loss": 1.0106,
"step": 2596
},
{
"epoch": 6.641943734015345,
"grad_norm": 0.11539400507669376,
"learning_rate": 1.59016659589587e-07,
"loss": 0.9862,
"step": 2597
},
{
"epoch": 6.6445012787723785,
"grad_norm": 0.11483047616375414,
"learning_rate": 1.567590387526041e-07,
"loss": 1.0301,
"step": 2598
},
{
"epoch": 6.647058823529412,
"grad_norm": 0.11260638212850177,
"learning_rate": 1.5451743225321726e-07,
"loss": 1.0088,
"step": 2599
},
{
"epoch": 6.649616368286445,
"grad_norm": 0.11619144848069289,
"learning_rate": 1.5229184373837912e-07,
"loss": 1.0117,
"step": 2600
},
{
"epoch": 6.6521739130434785,
"grad_norm": 0.12170161725444163,
"learning_rate": 1.5008227682898337e-07,
"loss": 1.0345,
"step": 2601
},
{
"epoch": 6.654731457800511,
"grad_norm": 0.11009879990340311,
"learning_rate": 1.4788873511985656e-07,
"loss": 1.0074,
"step": 2602
},
{
"epoch": 6.657289002557545,
"grad_norm": 0.11242257451547451,
"learning_rate": 1.4571122217975298e-07,
"loss": 1.0295,
"step": 2603
},
{
"epoch": 6.659846547314578,
"grad_norm": 0.11604613398078274,
"learning_rate": 1.4354974155135203e-07,
"loss": 1.0287,
"step": 2604
},
{
"epoch": 6.662404092071611,
"grad_norm": 0.11447891191608152,
"learning_rate": 1.4140429675124633e-07,
"loss": 1.0059,
"step": 2605
},
{
"epoch": 6.664961636828645,
"grad_norm": 0.11195548180186611,
"learning_rate": 1.3927489126993932e-07,
"loss": 1.0347,
"step": 2606
},
{
"epoch": 6.667519181585678,
"grad_norm": 0.11445065696070437,
"learning_rate": 1.3716152857184306e-07,
"loss": 1.012,
"step": 2607
},
{
"epoch": 6.670076726342711,
"grad_norm": 0.11614977059279803,
"learning_rate": 1.350642120952661e-07,
"loss": 0.9918,
"step": 2608
},
{
"epoch": 6.672634271099744,
"grad_norm": 0.11871269418863775,
"learning_rate": 1.3298294525241008e-07,
"loss": 1.0269,
"step": 2609
},
{
"epoch": 6.675191815856778,
"grad_norm": 0.10866128338893077,
"learning_rate": 1.3091773142936525e-07,
"loss": 1.0334,
"step": 2610
},
{
"epoch": 6.677749360613811,
"grad_norm": 0.12041795104852608,
"learning_rate": 1.2886857398610731e-07,
"loss": 0.9974,
"step": 2611
},
{
"epoch": 6.680306905370844,
"grad_norm": 0.11406194376177828,
"learning_rate": 1.2683547625648718e-07,
"loss": 1.0222,
"step": 2612
},
{
"epoch": 6.6828644501278776,
"grad_norm": 0.11240623577621248,
"learning_rate": 1.2481844154822565e-07,
"loss": 0.9952,
"step": 2613
},
{
"epoch": 6.68542199488491,
"grad_norm": 0.11514164915047609,
"learning_rate": 1.2281747314291437e-07,
"loss": 1.0026,
"step": 2614
},
{
"epoch": 6.687979539641944,
"grad_norm": 0.11222335726022206,
"learning_rate": 1.208325742960037e-07,
"loss": 1.0056,
"step": 2615
},
{
"epoch": 6.690537084398977,
"grad_norm": 0.11243016039454592,
"learning_rate": 1.1886374823679825e-07,
"loss": 1.0492,
"step": 2616
},
{
"epoch": 6.69309462915601,
"grad_norm": 0.11317201484958644,
"learning_rate": 1.1691099816845574e-07,
"loss": 1.0213,
"step": 2617
},
{
"epoch": 6.695652173913043,
"grad_norm": 0.1170626311837824,
"learning_rate": 1.149743272679793e-07,
"loss": 0.9974,
"step": 2618
},
{
"epoch": 6.698209718670077,
"grad_norm": 0.12262867677149476,
"learning_rate": 1.1305373868620961e-07,
"loss": 0.9967,
"step": 2619
},
{
"epoch": 6.70076726342711,
"grad_norm": 0.11396022297257247,
"learning_rate": 1.1114923554782608e-07,
"loss": 0.9956,
"step": 2620
},
{
"epoch": 6.703324808184143,
"grad_norm": 0.11735281558425238,
"learning_rate": 1.0926082095133572e-07,
"loss": 1.0193,
"step": 2621
},
{
"epoch": 6.705882352941177,
"grad_norm": 0.12029512917783149,
"learning_rate": 1.0738849796907091e-07,
"loss": 1.0473,
"step": 2622
},
{
"epoch": 6.708439897698209,
"grad_norm": 0.11312555151340069,
"learning_rate": 1.0553226964718277e-07,
"loss": 1.008,
"step": 2623
},
{
"epoch": 6.710997442455243,
"grad_norm": 0.11541322342299927,
"learning_rate": 1.0369213900564001e-07,
"loss": 1.0029,
"step": 2624
},
{
"epoch": 6.713554987212277,
"grad_norm": 0.11302071428638145,
"learning_rate": 1.0186810903822119e-07,
"loss": 0.9623,
"step": 2625
},
{
"epoch": 6.716112531969309,
"grad_norm": 0.11291140484686953,
"learning_rate": 1.0006018271250695e-07,
"loss": 1.0305,
"step": 2626
},
{
"epoch": 6.718670076726343,
"grad_norm": 0.11524487387426563,
"learning_rate": 9.826836296988107e-08,
"loss": 1.0596,
"step": 2627
},
{
"epoch": 6.721227621483376,
"grad_norm": 0.11543260535666969,
"learning_rate": 9.649265272552277e-08,
"loss": 1.0237,
"step": 2628
},
{
"epoch": 6.723785166240409,
"grad_norm": 0.11302904037284935,
"learning_rate": 9.473305486840112e-08,
"loss": 1.0177,
"step": 2629
},
{
"epoch": 6.726342710997442,
"grad_norm": 0.11210024116892857,
"learning_rate": 9.29895722612717e-08,
"loss": 1.0284,
"step": 2630
},
{
"epoch": 6.728900255754476,
"grad_norm": 0.11611360048557691,
"learning_rate": 9.126220774067218e-08,
"loss": 1.0313,
"step": 2631
},
{
"epoch": 6.731457800511509,
"grad_norm": 0.11281080704543008,
"learning_rate": 8.955096411691566e-08,
"loss": 1.0156,
"step": 2632
},
{
"epoch": 6.734015345268542,
"grad_norm": 0.11192307343079083,
"learning_rate": 8.785584417409065e-08,
"loss": 1.0173,
"step": 2633
},
{
"epoch": 6.736572890025576,
"grad_norm": 0.11483249975315203,
"learning_rate": 8.617685067004777e-08,
"loss": 1.0269,
"step": 2634
},
{
"epoch": 6.739130434782608,
"grad_norm": 0.11652633110386056,
"learning_rate": 8.451398633640861e-08,
"loss": 0.9978,
"step": 2635
},
{
"epoch": 6.741687979539642,
"grad_norm": 0.11193935061569056,
"learning_rate": 8.286725387854689e-08,
"loss": 1.0166,
"step": 2636
},
{
"epoch": 6.744245524296675,
"grad_norm": 0.1132575109344062,
"learning_rate": 8.123665597559393e-08,
"loss": 1.03,
"step": 2637
},
{
"epoch": 6.746803069053708,
"grad_norm": 0.10909141114205528,
"learning_rate": 7.962219528042991e-08,
"loss": 0.9843,
"step": 2638
},
{
"epoch": 6.749360613810742,
"grad_norm": 0.11510554903103819,
"learning_rate": 7.802387441968262e-08,
"loss": 1.0058,
"step": 2639
},
{
"epoch": 6.751918158567775,
"grad_norm": 0.1126125629269261,
"learning_rate": 7.644169599371975e-08,
"loss": 1.0451,
"step": 2640
},
{
"epoch": 6.754475703324808,
"grad_norm": 0.11361718582807691,
"learning_rate": 7.487566257664558e-08,
"loss": 1.0447,
"step": 2641
},
{
"epoch": 6.757033248081841,
"grad_norm": 0.11201362418480085,
"learning_rate": 7.332577671629982e-08,
"loss": 1.0003,
"step": 2642
},
{
"epoch": 6.759590792838875,
"grad_norm": 0.11250812055949669,
"learning_rate": 7.179204093424985e-08,
"loss": 1.0152,
"step": 2643
},
{
"epoch": 6.762148337595908,
"grad_norm": 0.11340595916397253,
"learning_rate": 7.027445772578856e-08,
"loss": 1.0136,
"step": 2644
},
{
"epoch": 6.764705882352941,
"grad_norm": 0.11043173067397596,
"learning_rate": 6.877302955992649e-08,
"loss": 1.0039,
"step": 2645
},
{
"epoch": 6.767263427109975,
"grad_norm": 0.11320152606971275,
"learning_rate": 6.72877588793952e-08,
"loss": 1.0263,
"step": 2646
},
{
"epoch": 6.7698209718670075,
"grad_norm": 0.11555065643180781,
"learning_rate": 6.581864810063732e-08,
"loss": 1.0095,
"step": 2647
},
{
"epoch": 6.772378516624041,
"grad_norm": 0.1114703182443358,
"learning_rate": 6.436569961380313e-08,
"loss": 1.0014,
"step": 2648
},
{
"epoch": 6.774936061381074,
"grad_norm": 0.11945044598900786,
"learning_rate": 6.292891578275063e-08,
"loss": 1.0308,
"step": 2649
},
{
"epoch": 6.7774936061381075,
"grad_norm": 0.11250868328242511,
"learning_rate": 6.150829894503662e-08,
"loss": 1.0107,
"step": 2650
},
{
"epoch": 6.78005115089514,
"grad_norm": 0.11491638958663465,
"learning_rate": 6.010385141191455e-08,
"loss": 1.0279,
"step": 2651
},
{
"epoch": 6.782608695652174,
"grad_norm": 0.1160903563132126,
"learning_rate": 5.8715575468333286e-08,
"loss": 1.0067,
"step": 2652
},
{
"epoch": 6.7851662404092075,
"grad_norm": 0.11673880519657757,
"learning_rate": 5.734347337293167e-08,
"loss": 1.0253,
"step": 2653
},
{
"epoch": 6.78772378516624,
"grad_norm": 0.11345092121417273,
"learning_rate": 5.598754735803513e-08,
"loss": 1.0256,
"step": 2654
},
{
"epoch": 6.790281329923274,
"grad_norm": 0.11245719320265857,
"learning_rate": 5.464779962964795e-08,
"loss": 1.023,
"step": 2655
},
{
"epoch": 6.792838874680307,
"grad_norm": 0.11318781711220266,
"learning_rate": 5.332423236745765e-08,
"loss": 0.9817,
"step": 2656
},
{
"epoch": 6.79539641943734,
"grad_norm": 0.11393255984182678,
"learning_rate": 5.201684772482507e-08,
"loss": 0.9919,
"step": 2657
},
{
"epoch": 6.797953964194374,
"grad_norm": 0.1114106983420887,
"learning_rate": 5.0725647828783196e-08,
"loss": 0.9949,
"step": 2658
},
{
"epoch": 6.8005115089514065,
"grad_norm": 0.11613702586163382,
"learning_rate": 4.945063478003276e-08,
"loss": 1.0246,
"step": 2659
},
{
"epoch": 6.80306905370844,
"grad_norm": 0.11426036986413816,
"learning_rate": 4.8191810652941096e-08,
"loss": 1.0434,
"step": 2660
},
{
"epoch": 6.805626598465473,
"grad_norm": 0.11654706791098739,
"learning_rate": 4.694917749553663e-08,
"loss": 1.0256,
"step": 2661
},
{
"epoch": 6.8081841432225065,
"grad_norm": 0.10999242921563646,
"learning_rate": 4.5722737329505495e-08,
"loss": 0.9802,
"step": 2662
},
{
"epoch": 6.810741687979539,
"grad_norm": 0.11948231260555445,
"learning_rate": 4.451249215018827e-08,
"loss": 1.0593,
"step": 2663
},
{
"epoch": 6.813299232736573,
"grad_norm": 0.11285924950704992,
"learning_rate": 4.331844392657991e-08,
"loss": 1.026,
"step": 2664
},
{
"epoch": 6.8158567774936065,
"grad_norm": 0.11299230638204774,
"learning_rate": 4.2140594601320915e-08,
"loss": 1.0162,
"step": 2665
},
{
"epoch": 6.818414322250639,
"grad_norm": 0.11595950299690573,
"learning_rate": 4.097894609069841e-08,
"loss": 0.9853,
"step": 2666
},
{
"epoch": 6.820971867007673,
"grad_norm": 0.1155751170348188,
"learning_rate": 3.983350028464283e-08,
"loss": 1.0022,
"step": 2667
},
{
"epoch": 6.823529411764706,
"grad_norm": 0.11385145480733656,
"learning_rate": 3.870425904672237e-08,
"loss": 1.0905,
"step": 2668
},
{
"epoch": 6.826086956521739,
"grad_norm": 0.11441498941945787,
"learning_rate": 3.7591224214141855e-08,
"loss": 1.032,
"step": 2669
},
{
"epoch": 6.828644501278772,
"grad_norm": 0.11622957694085463,
"learning_rate": 3.649439759773943e-08,
"loss": 1.0273,
"step": 2670
},
{
"epoch": 6.831202046035806,
"grad_norm": 0.11310752492427763,
"learning_rate": 3.541378098198323e-08,
"loss": 1.0202,
"step": 2671
},
{
"epoch": 6.833759590792839,
"grad_norm": 0.11205928985871322,
"learning_rate": 3.4349376124969136e-08,
"loss": 0.9919,
"step": 2672
},
{
"epoch": 6.836317135549872,
"grad_norm": 0.11055590937853152,
"learning_rate": 3.330118475841859e-08,
"loss": 1.019,
"step": 2673
},
{
"epoch": 6.838874680306906,
"grad_norm": 0.11098200209047006,
"learning_rate": 3.22692085876708e-08,
"loss": 0.9972,
"step": 2674
},
{
"epoch": 6.841432225063938,
"grad_norm": 0.11522340948350532,
"learning_rate": 3.125344929168828e-08,
"loss": 1.0004,
"step": 2675
},
{
"epoch": 6.843989769820972,
"grad_norm": 0.11422976235509531,
"learning_rate": 3.025390852304688e-08,
"loss": 1.0273,
"step": 2676
},
{
"epoch": 6.846547314578006,
"grad_norm": 0.11018216168196639,
"learning_rate": 2.927058790793802e-08,
"loss": 1.0102,
"step": 2677
},
{
"epoch": 6.849104859335038,
"grad_norm": 0.10995140569223621,
"learning_rate": 2.830348904616198e-08,
"loss": 0.991,
"step": 2678
},
{
"epoch": 6.851662404092072,
"grad_norm": 0.11543991907521552,
"learning_rate": 2.7352613511127946e-08,
"loss": 1.0338,
"step": 2679
},
{
"epoch": 6.854219948849105,
"grad_norm": 0.11129720513762761,
"learning_rate": 2.6417962849852875e-08,
"loss": 1.0094,
"step": 2680
},
{
"epoch": 6.856777493606138,
"grad_norm": 0.1097107046759256,
"learning_rate": 2.549953858295262e-08,
"loss": 1.0208,
"step": 2681
},
{
"epoch": 6.859335038363171,
"grad_norm": 0.1181695445768175,
"learning_rate": 2.459734220464638e-08,
"loss": 1.0015,
"step": 2682
},
{
"epoch": 6.861892583120205,
"grad_norm": 0.11107816598809478,
"learning_rate": 2.3711375182753347e-08,
"loss": 1.0261,
"step": 2683
},
{
"epoch": 6.864450127877237,
"grad_norm": 0.10839159774339671,
"learning_rate": 2.2841638958683855e-08,
"loss": 1.0135,
"step": 2684
},
{
"epoch": 6.867007672634271,
"grad_norm": 0.1121417586939987,
"learning_rate": 2.1988134947446004e-08,
"loss": 1.0035,
"step": 2685
},
{
"epoch": 6.869565217391305,
"grad_norm": 0.11209845991644457,
"learning_rate": 2.1150864537636817e-08,
"loss": 1.0321,
"step": 2686
},
{
"epoch": 6.872122762148337,
"grad_norm": 0.11303462530389491,
"learning_rate": 2.032982909144332e-08,
"loss": 1.012,
"step": 2687
},
{
"epoch": 6.874680306905371,
"grad_norm": 0.11117791002965544,
"learning_rate": 1.9525029944637008e-08,
"loss": 0.9929,
"step": 2688
},
{
"epoch": 6.877237851662404,
"grad_norm": 0.1089777437805983,
"learning_rate": 1.8736468406579388e-08,
"loss": 0.9931,
"step": 2689
},
{
"epoch": 6.879795396419437,
"grad_norm": 0.11251100033934079,
"learning_rate": 1.796414576020755e-08,
"loss": 1.0153,
"step": 2690
},
{
"epoch": 6.882352941176471,
"grad_norm": 0.11299998984552379,
"learning_rate": 1.720806326204305e-08,
"loss": 1.005,
"step": 2691
},
{
"epoch": 6.884910485933504,
"grad_norm": 0.11290626743296132,
"learning_rate": 1.646822214218524e-08,
"loss": 1.049,
"step": 2692
},
{
"epoch": 6.887468030690537,
"grad_norm": 0.11186130749976496,
"learning_rate": 1.5744623604310172e-08,
"loss": 1.003,
"step": 2693
},
{
"epoch": 6.89002557544757,
"grad_norm": 0.11028332749990057,
"learning_rate": 1.503726882566503e-08,
"loss": 0.9892,
"step": 2694
},
{
"epoch": 6.892583120204604,
"grad_norm": 0.11457205700764143,
"learning_rate": 1.4346158957073696e-08,
"loss": 1.0261,
"step": 2695
},
{
"epoch": 6.8951406649616365,
"grad_norm": 0.11434079231719742,
"learning_rate": 1.3671295122928974e-08,
"loss": 1.0118,
"step": 2696
},
{
"epoch": 6.89769820971867,
"grad_norm": 0.11590548541933458,
"learning_rate": 1.3012678421191471e-08,
"loss": 1.0397,
"step": 2697
},
{
"epoch": 6.900255754475703,
"grad_norm": 0.11241776946007812,
"learning_rate": 1.2370309923388501e-08,
"loss": 1.0214,
"step": 2698
},
{
"epoch": 6.9028132992327365,
"grad_norm": 0.11386908312296881,
"learning_rate": 1.1744190674614076e-08,
"loss": 1.0249,
"step": 2699
},
{
"epoch": 6.90537084398977,
"grad_norm": 0.1111155708841944,
"learning_rate": 1.1134321693525574e-08,
"loss": 1.0013,
"step": 2700
},
{
"epoch": 6.907928388746803,
"grad_norm": 0.11383791079341445,
"learning_rate": 1.0540703972341525e-08,
"loss": 1.0148,
"step": 2701
},
{
"epoch": 6.910485933503836,
"grad_norm": 0.11458774717785482,
"learning_rate": 9.963338476840501e-09,
"loss": 1.029,
"step": 2702
},
{
"epoch": 6.913043478260869,
"grad_norm": 0.11295695096599505,
"learning_rate": 9.402226146361104e-09,
"loss": 1.0136,
"step": 2703
},
{
"epoch": 6.915601023017903,
"grad_norm": 0.11389257052620162,
"learning_rate": 8.857367893796431e-09,
"loss": 0.9989,
"step": 2704
},
{
"epoch": 6.918158567774936,
"grad_norm": 0.11405136091559014,
"learning_rate": 8.328764605597395e-09,
"loss": 1.0239,
"step": 2705
},
{
"epoch": 6.920716112531969,
"grad_norm": 0.11514239271194625,
"learning_rate": 7.816417141768284e-09,
"loss": 1.041,
"step": 2706
},
{
"epoch": 6.923273657289003,
"grad_norm": 0.11236159186101047,
"learning_rate": 7.3203263358678775e-09,
"loss": 1.0297,
"step": 2707
},
{
"epoch": 6.9258312020460355,
"grad_norm": 0.112779013609661,
"learning_rate": 6.840492995002779e-09,
"loss": 1.0177,
"step": 2708
},
{
"epoch": 6.928388746803069,
"grad_norm": 0.11154163182583252,
"learning_rate": 6.376917899832968e-09,
"loss": 1.0262,
"step": 2709
},
{
"epoch": 6.930946291560103,
"grad_norm": 0.11358295898234577,
"learning_rate": 5.929601804566254e-09,
"loss": 1.0057,
"step": 2710
},
{
"epoch": 6.9335038363171355,
"grad_norm": 0.11003717187565273,
"learning_rate": 5.498545436957159e-09,
"loss": 1.0269,
"step": 2711
},
{
"epoch": 6.936061381074169,
"grad_norm": 0.10600474645039837,
"learning_rate": 5.0837494983091425e-09,
"loss": 0.9854,
"step": 2712
},
{
"epoch": 6.938618925831202,
"grad_norm": 0.10929642667614789,
"learning_rate": 4.6852146634668304e-09,
"loss": 1.0149,
"step": 2713
},
{
"epoch": 6.9411764705882355,
"grad_norm": 0.11582392789733863,
"learning_rate": 4.302941580823783e-09,
"loss": 0.9864,
"step": 2714
},
{
"epoch": 6.943734015345268,
"grad_norm": 0.11406855862931596,
"learning_rate": 3.936930872312506e-09,
"loss": 1.0296,
"step": 2715
},
{
"epoch": 6.946291560102302,
"grad_norm": 0.11629050448797144,
"learning_rate": 3.5871831334099992e-09,
"loss": 1.0319,
"step": 2716
},
{
"epoch": 6.948849104859335,
"grad_norm": 0.11235711633426523,
"learning_rate": 3.2536989331355406e-09,
"loss": 1.0061,
"step": 2717
},
{
"epoch": 6.951406649616368,
"grad_norm": 0.11339029722347495,
"learning_rate": 2.9364788140451296e-09,
"loss": 1.0558,
"step": 2718
},
{
"epoch": 6.953964194373402,
"grad_norm": 0.1122327401431765,
"learning_rate": 2.635523292237041e-09,
"loss": 1.043,
"step": 2719
},
{
"epoch": 6.956521739130435,
"grad_norm": 0.1150922652013077,
"learning_rate": 2.3508328573462745e-09,
"loss": 1.0157,
"step": 2720
},
{
"epoch": 6.959079283887468,
"grad_norm": 0.11034749878838018,
"learning_rate": 2.082407972547884e-09,
"loss": 1.0172,
"step": 2721
},
{
"epoch": 6.961636828644501,
"grad_norm": 0.11414568906111035,
"learning_rate": 1.8302490745503166e-09,
"loss": 1.0294,
"step": 2722
},
{
"epoch": 6.964194373401535,
"grad_norm": 0.11166620944982035,
"learning_rate": 1.5943565736020739e-09,
"loss": 1.0242,
"step": 2723
},
{
"epoch": 6.966751918158568,
"grad_norm": 0.11672921275884213,
"learning_rate": 1.3747308534850512e-09,
"loss": 1.0372,
"step": 2724
},
{
"epoch": 6.969309462915601,
"grad_norm": 0.11540312400728218,
"learning_rate": 1.1713722715167575e-09,
"loss": 1.0515,
"step": 2725
},
{
"epoch": 6.971867007672635,
"grad_norm": 0.11588267312835213,
"learning_rate": 9.84281158548095e-10,
"loss": 1.0291,
"step": 2726
},
{
"epoch": 6.974424552429667,
"grad_norm": 0.11642536438528109,
"learning_rate": 8.134578189644692e-10,
"loss": 1.013,
"step": 2727
},
{
"epoch": 6.976982097186701,
"grad_norm": 0.11741237126233431,
"learning_rate": 6.589025306869002e-10,
"loss": 1.0054,
"step": 2728
},
{
"epoch": 6.979539641943734,
"grad_norm": 0.1116075879721608,
"learning_rate": 5.206155451642491e-10,
"loss": 1.0299,
"step": 2729
},
{
"epoch": 6.982097186700767,
"grad_norm": 0.11444442287287329,
"learning_rate": 3.985970873821021e-10,
"loss": 1.0413,
"step": 2730
},
{
"epoch": 6.9846547314578,
"grad_norm": 0.12160291833827606,
"learning_rate": 2.928473558583278e-10,
"loss": 1.0317,
"step": 2731
},
{
"epoch": 6.987212276214834,
"grad_norm": 0.1124635627813877,
"learning_rate": 2.033665226386372e-10,
"loss": 1.0144,
"step": 2732
},
{
"epoch": 6.989769820971867,
"grad_norm": 0.11276149081438312,
"learning_rate": 1.301547333032449e-10,
"loss": 1.0007,
"step": 2733
},
{
"epoch": 6.9923273657289,
"grad_norm": 0.10984392228143453,
"learning_rate": 7.321210696464853e-11,
"loss": 0.9763,
"step": 2734
},
{
"epoch": 6.994884910485934,
"grad_norm": 0.11019543161726779,
"learning_rate": 3.253873626429816e-11,
"loss": 1.0013,
"step": 2735
},
{
"epoch": 6.997442455242966,
"grad_norm": 0.11197749059770203,
"learning_rate": 8.134687374816708e-12,
"loss": 1.0472,
"step": 2736
},
{
"epoch": 7.0,
"grad_norm": 0.11208987109779546,
"learning_rate": 0.0,
"loss": 0.9774,
"step": 2737
},
{
"epoch": 7.0,
"step": 2737,
"total_flos": 9969287656374272.0,
"train_loss": 1.063590354692078,
"train_runtime": 97730.0822,
"train_samples_per_second": 7.163,
"train_steps_per_second": 0.028
}
],
"logging_steps": 1.0,
"max_steps": 2737,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9969287656374272.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}