Files
ModelHub XC b7300314ac 初始化项目,由ModelHub XC社区提供模型
Model: boradorish/baseline-qwen3-4b-grounded_table
Source: Original Platform
2026-05-28 16:10:18 +08:00

4599 lines
112 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.376600899965386,
"eval_steps": 92,
"global_step": 644,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003692165685935156,
"grad_norm": 1.7612229585647583,
"learning_rate": 0.0,
"loss": 0.0643,
"step": 1
},
{
"epoch": 0.007384331371870312,
"grad_norm": 1.5242195129394531,
"learning_rate": 4.878048780487805e-07,
"loss": 0.0413,
"step": 2
},
{
"epoch": 0.01107649705780547,
"grad_norm": 1.9117767810821533,
"learning_rate": 9.75609756097561e-07,
"loss": 0.0658,
"step": 3
},
{
"epoch": 0.014768662743740625,
"grad_norm": 1.904267430305481,
"learning_rate": 1.4634146341463414e-06,
"loss": 0.063,
"step": 4
},
{
"epoch": 0.018460828429675783,
"grad_norm": 1.6038223505020142,
"learning_rate": 1.951219512195122e-06,
"loss": 0.0546,
"step": 5
},
{
"epoch": 0.02215299411561094,
"grad_norm": 0.9653654098510742,
"learning_rate": 2.4390243902439027e-06,
"loss": 0.0417,
"step": 6
},
{
"epoch": 0.025845159801546093,
"grad_norm": 0.8979856371879578,
"learning_rate": 2.926829268292683e-06,
"loss": 0.0398,
"step": 7
},
{
"epoch": 0.02953732548748125,
"grad_norm": 0.7607001066207886,
"learning_rate": 3.414634146341464e-06,
"loss": 0.0419,
"step": 8
},
{
"epoch": 0.033229491173416406,
"grad_norm": 0.7884329557418823,
"learning_rate": 3.902439024390244e-06,
"loss": 0.0267,
"step": 9
},
{
"epoch": 0.036921656859351566,
"grad_norm": 0.3715834617614746,
"learning_rate": 4.390243902439025e-06,
"loss": 0.0277,
"step": 10
},
{
"epoch": 0.04061382254528672,
"grad_norm": 0.31171751022338867,
"learning_rate": 4.8780487804878055e-06,
"loss": 0.0296,
"step": 11
},
{
"epoch": 0.04430598823122188,
"grad_norm": 0.41572287678718567,
"learning_rate": 5.365853658536586e-06,
"loss": 0.0184,
"step": 12
},
{
"epoch": 0.04799815391715703,
"grad_norm": 0.46749481558799744,
"learning_rate": 5.853658536585366e-06,
"loss": 0.0182,
"step": 13
},
{
"epoch": 0.051690319603092186,
"grad_norm": 0.2749096751213074,
"learning_rate": 6.341463414634147e-06,
"loss": 0.0248,
"step": 14
},
{
"epoch": 0.055382485289027346,
"grad_norm": 0.17462071776390076,
"learning_rate": 6.829268292682928e-06,
"loss": 0.0123,
"step": 15
},
{
"epoch": 0.0590746509749625,
"grad_norm": 0.18316905200481415,
"learning_rate": 7.317073170731707e-06,
"loss": 0.0145,
"step": 16
},
{
"epoch": 0.06276681666089766,
"grad_norm": 0.16834756731987,
"learning_rate": 7.804878048780489e-06,
"loss": 0.0194,
"step": 17
},
{
"epoch": 0.06645898234683281,
"grad_norm": 0.150247722864151,
"learning_rate": 8.292682926829268e-06,
"loss": 0.0117,
"step": 18
},
{
"epoch": 0.07015114803276797,
"grad_norm": 0.193730428814888,
"learning_rate": 8.78048780487805e-06,
"loss": 0.0208,
"step": 19
},
{
"epoch": 0.07384331371870313,
"grad_norm": 0.16806142032146454,
"learning_rate": 9.268292682926831e-06,
"loss": 0.0099,
"step": 20
},
{
"epoch": 0.07753547940463829,
"grad_norm": 0.2117680311203003,
"learning_rate": 9.756097560975611e-06,
"loss": 0.0368,
"step": 21
},
{
"epoch": 0.08122764509057344,
"grad_norm": 0.12042353302240372,
"learning_rate": 1.024390243902439e-05,
"loss": 0.013,
"step": 22
},
{
"epoch": 0.08491981077650859,
"grad_norm": 0.16888391971588135,
"learning_rate": 1.0731707317073172e-05,
"loss": 0.0137,
"step": 23
},
{
"epoch": 0.08861197646244376,
"grad_norm": 0.32111823558807373,
"learning_rate": 1.1219512195121953e-05,
"loss": 0.0248,
"step": 24
},
{
"epoch": 0.09230414214837891,
"grad_norm": 0.1346050500869751,
"learning_rate": 1.1707317073170731e-05,
"loss": 0.0256,
"step": 25
},
{
"epoch": 0.09599630783431407,
"grad_norm": 0.1447174847126007,
"learning_rate": 1.2195121951219513e-05,
"loss": 0.0124,
"step": 26
},
{
"epoch": 0.09968847352024922,
"grad_norm": 0.13269546627998352,
"learning_rate": 1.2682926829268294e-05,
"loss": 0.0096,
"step": 27
},
{
"epoch": 0.10338063920618437,
"grad_norm": 0.08903706818819046,
"learning_rate": 1.3170731707317076e-05,
"loss": 0.0076,
"step": 28
},
{
"epoch": 0.10707280489211954,
"grad_norm": 0.1776096373796463,
"learning_rate": 1.3658536585365855e-05,
"loss": 0.0181,
"step": 29
},
{
"epoch": 0.11076497057805469,
"grad_norm": 0.10876930505037308,
"learning_rate": 1.4146341463414635e-05,
"loss": 0.0111,
"step": 30
},
{
"epoch": 0.11445713626398984,
"grad_norm": 0.1793457418680191,
"learning_rate": 1.4634146341463415e-05,
"loss": 0.023,
"step": 31
},
{
"epoch": 0.118149301949925,
"grad_norm": 0.17572703957557678,
"learning_rate": 1.5121951219512196e-05,
"loss": 0.0162,
"step": 32
},
{
"epoch": 0.12184146763586017,
"grad_norm": 0.14067068696022034,
"learning_rate": 1.5609756097560978e-05,
"loss": 0.0071,
"step": 33
},
{
"epoch": 0.12553363332179532,
"grad_norm": 0.1744857132434845,
"learning_rate": 1.6097560975609757e-05,
"loss": 0.0153,
"step": 34
},
{
"epoch": 0.12922579900773049,
"grad_norm": 0.17557364702224731,
"learning_rate": 1.6585365853658537e-05,
"loss": 0.0227,
"step": 35
},
{
"epoch": 0.13291796469366562,
"grad_norm": 0.10515311360359192,
"learning_rate": 1.7073170731707317e-05,
"loss": 0.0152,
"step": 36
},
{
"epoch": 0.1366101303796008,
"grad_norm": 0.12522749602794647,
"learning_rate": 1.75609756097561e-05,
"loss": 0.0103,
"step": 37
},
{
"epoch": 0.14030229606553593,
"grad_norm": 0.10669893026351929,
"learning_rate": 1.804878048780488e-05,
"loss": 0.0161,
"step": 38
},
{
"epoch": 0.1439944617514711,
"grad_norm": 0.09148227423429489,
"learning_rate": 1.8536585365853663e-05,
"loss": 0.0072,
"step": 39
},
{
"epoch": 0.14768662743740626,
"grad_norm": 0.2032286524772644,
"learning_rate": 1.902439024390244e-05,
"loss": 0.0092,
"step": 40
},
{
"epoch": 0.1513787931233414,
"grad_norm": 0.10257123410701752,
"learning_rate": 1.9512195121951222e-05,
"loss": 0.007,
"step": 41
},
{
"epoch": 0.15507095880927657,
"grad_norm": 0.07334409654140472,
"learning_rate": 2e-05,
"loss": 0.0064,
"step": 42
},
{
"epoch": 0.1587631244952117,
"grad_norm": 0.09883085638284683,
"learning_rate": 2.048780487804878e-05,
"loss": 0.0089,
"step": 43
},
{
"epoch": 0.16245529018114688,
"grad_norm": 0.10087648034095764,
"learning_rate": 2.0975609756097564e-05,
"loss": 0.0065,
"step": 44
},
{
"epoch": 0.16614745586708204,
"grad_norm": 0.10451877117156982,
"learning_rate": 2.1463414634146344e-05,
"loss": 0.0102,
"step": 45
},
{
"epoch": 0.16983962155301718,
"grad_norm": 0.13105975091457367,
"learning_rate": 2.1951219512195124e-05,
"loss": 0.009,
"step": 46
},
{
"epoch": 0.17353178723895235,
"grad_norm": 0.2514360547065735,
"learning_rate": 2.2439024390243907e-05,
"loss": 0.0168,
"step": 47
},
{
"epoch": 0.17722395292488752,
"grad_norm": 0.11838189512491226,
"learning_rate": 2.2926829268292683e-05,
"loss": 0.0119,
"step": 48
},
{
"epoch": 0.18091611861082266,
"grad_norm": 0.16984423995018005,
"learning_rate": 2.3414634146341463e-05,
"loss": 0.0065,
"step": 49
},
{
"epoch": 0.18460828429675782,
"grad_norm": 0.11164893954992294,
"learning_rate": 2.3902439024390246e-05,
"loss": 0.0135,
"step": 50
},
{
"epoch": 0.18830044998269296,
"grad_norm": 0.09878280013799667,
"learning_rate": 2.4390243902439026e-05,
"loss": 0.0066,
"step": 51
},
{
"epoch": 0.19199261566862813,
"grad_norm": 0.09212549030780792,
"learning_rate": 2.4878048780487805e-05,
"loss": 0.0082,
"step": 52
},
{
"epoch": 0.1956847813545633,
"grad_norm": 0.09363257884979248,
"learning_rate": 2.536585365853659e-05,
"loss": 0.0099,
"step": 53
},
{
"epoch": 0.19937694704049844,
"grad_norm": 0.07449876517057419,
"learning_rate": 2.5853658536585368e-05,
"loss": 0.006,
"step": 54
},
{
"epoch": 0.2030691127264336,
"grad_norm": 0.07617678493261337,
"learning_rate": 2.634146341463415e-05,
"loss": 0.0076,
"step": 55
},
{
"epoch": 0.20676127841236874,
"grad_norm": 0.09494733065366745,
"learning_rate": 2.682926829268293e-05,
"loss": 0.0082,
"step": 56
},
{
"epoch": 0.2104534440983039,
"grad_norm": 0.10162504017353058,
"learning_rate": 2.731707317073171e-05,
"loss": 0.011,
"step": 57
},
{
"epoch": 0.21414560978423908,
"grad_norm": 0.16772620379924774,
"learning_rate": 2.7804878048780487e-05,
"loss": 0.0086,
"step": 58
},
{
"epoch": 0.21783777547017422,
"grad_norm": 0.10658068209886551,
"learning_rate": 2.829268292682927e-05,
"loss": 0.0067,
"step": 59
},
{
"epoch": 0.22152994115610938,
"grad_norm": 0.11568617820739746,
"learning_rate": 2.878048780487805e-05,
"loss": 0.0081,
"step": 60
},
{
"epoch": 0.22522210684204455,
"grad_norm": 0.11837513744831085,
"learning_rate": 2.926829268292683e-05,
"loss": 0.0086,
"step": 61
},
{
"epoch": 0.2289142725279797,
"grad_norm": 0.08128459751605988,
"learning_rate": 2.9756097560975613e-05,
"loss": 0.0047,
"step": 62
},
{
"epoch": 0.23260643821391486,
"grad_norm": 0.36463257670402527,
"learning_rate": 3.0243902439024392e-05,
"loss": 0.0099,
"step": 63
},
{
"epoch": 0.23629860389985,
"grad_norm": 0.08298784494400024,
"learning_rate": 3.073170731707317e-05,
"loss": 0.006,
"step": 64
},
{
"epoch": 0.23999076958578516,
"grad_norm": 0.2765791416168213,
"learning_rate": 3.1219512195121955e-05,
"loss": 0.015,
"step": 65
},
{
"epoch": 0.24368293527172033,
"grad_norm": 0.09410832822322845,
"learning_rate": 3.170731707317074e-05,
"loss": 0.007,
"step": 66
},
{
"epoch": 0.24737510095765547,
"grad_norm": 0.08687976747751236,
"learning_rate": 3.2195121951219514e-05,
"loss": 0.0054,
"step": 67
},
{
"epoch": 0.25106726664359064,
"grad_norm": 0.1658174693584442,
"learning_rate": 3.268292682926829e-05,
"loss": 0.0059,
"step": 68
},
{
"epoch": 0.2547594323295258,
"grad_norm": 0.16597941517829895,
"learning_rate": 3.3170731707317074e-05,
"loss": 0.0043,
"step": 69
},
{
"epoch": 0.25845159801546097,
"grad_norm": 0.14238758385181427,
"learning_rate": 3.365853658536586e-05,
"loss": 0.0202,
"step": 70
},
{
"epoch": 0.2621437637013961,
"grad_norm": 0.28217750787734985,
"learning_rate": 3.414634146341463e-05,
"loss": 0.0245,
"step": 71
},
{
"epoch": 0.26583592938733125,
"grad_norm": 0.08783560991287231,
"learning_rate": 3.4634146341463416e-05,
"loss": 0.0062,
"step": 72
},
{
"epoch": 0.2695280950732664,
"grad_norm": 0.13205395638942719,
"learning_rate": 3.51219512195122e-05,
"loss": 0.0091,
"step": 73
},
{
"epoch": 0.2732202607592016,
"grad_norm": 0.11077655106782913,
"learning_rate": 3.5609756097560976e-05,
"loss": 0.0068,
"step": 74
},
{
"epoch": 0.27691242644513675,
"grad_norm": 0.07510002702474594,
"learning_rate": 3.609756097560976e-05,
"loss": 0.0056,
"step": 75
},
{
"epoch": 0.28060459213107186,
"grad_norm": 0.1183568611741066,
"learning_rate": 3.658536585365854e-05,
"loss": 0.0107,
"step": 76
},
{
"epoch": 0.28429675781700703,
"grad_norm": 0.19435347616672516,
"learning_rate": 3.7073170731707325e-05,
"loss": 0.0194,
"step": 77
},
{
"epoch": 0.2879889235029422,
"grad_norm": 0.1364523470401764,
"learning_rate": 3.75609756097561e-05,
"loss": 0.0149,
"step": 78
},
{
"epoch": 0.29168108918887736,
"grad_norm": 0.20089037716388702,
"learning_rate": 3.804878048780488e-05,
"loss": 0.0211,
"step": 79
},
{
"epoch": 0.29537325487481253,
"grad_norm": 0.13373729586601257,
"learning_rate": 3.853658536585366e-05,
"loss": 0.0111,
"step": 80
},
{
"epoch": 0.29906542056074764,
"grad_norm": 0.08260685950517654,
"learning_rate": 3.9024390243902444e-05,
"loss": 0.0094,
"step": 81
},
{
"epoch": 0.3027575862466828,
"grad_norm": 0.15578778088092804,
"learning_rate": 3.951219512195122e-05,
"loss": 0.01,
"step": 82
},
{
"epoch": 0.306449751932618,
"grad_norm": 0.11668805778026581,
"learning_rate": 4e-05,
"loss": 0.0135,
"step": 83
},
{
"epoch": 0.31014191761855314,
"grad_norm": 0.08324134349822998,
"learning_rate": 3.999981530109401e-05,
"loss": 0.0086,
"step": 84
},
{
"epoch": 0.3138340833044883,
"grad_norm": 0.2406614124774933,
"learning_rate": 3.999926120778742e-05,
"loss": 0.0125,
"step": 85
},
{
"epoch": 0.3175262489904234,
"grad_norm": 0.1322041004896164,
"learning_rate": 3.9998337730314274e-05,
"loss": 0.0094,
"step": 86
},
{
"epoch": 0.3212184146763586,
"grad_norm": 0.18763262033462524,
"learning_rate": 3.999704488573108e-05,
"loss": 0.0123,
"step": 87
},
{
"epoch": 0.32491058036229375,
"grad_norm": 0.16090139746665955,
"learning_rate": 3.9995382697916555e-05,
"loss": 0.0084,
"step": 88
},
{
"epoch": 0.3286027460482289,
"grad_norm": 0.11704199016094208,
"learning_rate": 3.999335119757112e-05,
"loss": 0.0088,
"step": 89
},
{
"epoch": 0.3322949117341641,
"grad_norm": 0.13167352974414825,
"learning_rate": 3.9990950422216367e-05,
"loss": 0.0148,
"step": 90
},
{
"epoch": 0.3359870774200992,
"grad_norm": 0.09690048545598984,
"learning_rate": 3.998818041619435e-05,
"loss": 0.0077,
"step": 91
},
{
"epoch": 0.33967924310603437,
"grad_norm": 0.11628638207912445,
"learning_rate": 3.998504123066679e-05,
"loss": 0.007,
"step": 92
},
{
"epoch": 0.33967924310603437,
"eval_loss": 0.010080486536026001,
"eval_runtime": 89.886,
"eval_samples_per_second": 10.157,
"eval_steps_per_second": 5.084,
"step": 92
},
{
"epoch": 0.34337140879196953,
"grad_norm": 0.11528339982032776,
"learning_rate": 3.9981532923614074e-05,
"loss": 0.0082,
"step": 93
},
{
"epoch": 0.3470635744779047,
"grad_norm": 0.2657609283924103,
"learning_rate": 3.9977655559834275e-05,
"loss": 0.013,
"step": 94
},
{
"epoch": 0.35075574016383987,
"grad_norm": 0.11277999728918076,
"learning_rate": 3.9973409210941864e-05,
"loss": 0.0076,
"step": 95
},
{
"epoch": 0.35444790584977504,
"grad_norm": 5.088494777679443,
"learning_rate": 3.9968793955366445e-05,
"loss": 0.0289,
"step": 96
},
{
"epoch": 0.35814007153571015,
"grad_norm": 0.1601097732782364,
"learning_rate": 3.996380987835128e-05,
"loss": 0.0219,
"step": 97
},
{
"epoch": 0.3618322372216453,
"grad_norm": 0.20405948162078857,
"learning_rate": 3.995845707195173e-05,
"loss": 0.0162,
"step": 98
},
{
"epoch": 0.3655244029075805,
"grad_norm": 0.6011687517166138,
"learning_rate": 3.995273563503355e-05,
"loss": 0.0096,
"step": 99
},
{
"epoch": 0.36921656859351565,
"grad_norm": 0.2641352117061615,
"learning_rate": 3.9946645673271034e-05,
"loss": 0.0231,
"step": 100
},
{
"epoch": 0.3729087342794508,
"grad_norm": 0.18041729927062988,
"learning_rate": 3.9940187299145134e-05,
"loss": 0.0124,
"step": 101
},
{
"epoch": 0.3766008999653859,
"grad_norm": 0.13946884870529175,
"learning_rate": 3.9933360631941294e-05,
"loss": 0.0136,
"step": 102
},
{
"epoch": 0.3802930656513211,
"grad_norm": 0.14014215767383575,
"learning_rate": 3.992616579774732e-05,
"loss": 0.0123,
"step": 103
},
{
"epoch": 0.38398523133725626,
"grad_norm": 0.215935617685318,
"learning_rate": 3.9918602929451015e-05,
"loss": 0.0166,
"step": 104
},
{
"epoch": 0.38767739702319143,
"grad_norm": 0.18600742518901825,
"learning_rate": 3.991067216673772e-05,
"loss": 0.0249,
"step": 105
},
{
"epoch": 0.3913695627091266,
"grad_norm": 0.19758182764053345,
"learning_rate": 3.990237365608776e-05,
"loss": 0.0215,
"step": 106
},
{
"epoch": 0.3950617283950617,
"grad_norm": 0.09306969493627548,
"learning_rate": 3.9893707550773714e-05,
"loss": 0.0062,
"step": 107
},
{
"epoch": 0.3987538940809969,
"grad_norm": 0.15354327857494354,
"learning_rate": 3.988467401085761e-05,
"loss": 0.0153,
"step": 108
},
{
"epoch": 0.40244605976693204,
"grad_norm": 0.18650248646736145,
"learning_rate": 3.987527320318793e-05,
"loss": 0.0207,
"step": 109
},
{
"epoch": 0.4061382254528672,
"grad_norm": 0.11927786469459534,
"learning_rate": 3.986550530139657e-05,
"loss": 0.0071,
"step": 110
},
{
"epoch": 0.4098303911388024,
"grad_norm": 0.18260295689105988,
"learning_rate": 3.985537048589561e-05,
"loss": 0.0165,
"step": 111
},
{
"epoch": 0.4135225568247375,
"grad_norm": 0.13270637392997742,
"learning_rate": 3.9844868943873975e-05,
"loss": 0.0095,
"step": 112
},
{
"epoch": 0.41721472251067265,
"grad_norm": 0.10370643436908722,
"learning_rate": 3.9834000869294e-05,
"loss": 0.0075,
"step": 113
},
{
"epoch": 0.4209068881966078,
"grad_norm": 0.11767494678497314,
"learning_rate": 3.982276646288784e-05,
"loss": 0.0129,
"step": 114
},
{
"epoch": 0.424599053882543,
"grad_norm": 0.12992636859416962,
"learning_rate": 3.981116593215374e-05,
"loss": 0.0145,
"step": 115
},
{
"epoch": 0.42829121956847815,
"grad_norm": 0.27428507804870605,
"learning_rate": 3.9799199491352246e-05,
"loss": 0.0166,
"step": 116
},
{
"epoch": 0.43198338525441327,
"grad_norm": 0.1593196988105774,
"learning_rate": 3.978686736150221e-05,
"loss": 0.0102,
"step": 117
},
{
"epoch": 0.43567555094034843,
"grad_norm": 0.20442363619804382,
"learning_rate": 3.977416977037671e-05,
"loss": 0.0143,
"step": 118
},
{
"epoch": 0.4393677166262836,
"grad_norm": 0.11397796869277954,
"learning_rate": 3.9761106952498874e-05,
"loss": 0.0108,
"step": 119
},
{
"epoch": 0.44305988231221877,
"grad_norm": 0.12436648458242416,
"learning_rate": 3.974767914913751e-05,
"loss": 0.008,
"step": 120
},
{
"epoch": 0.44675204799815393,
"grad_norm": 0.10099371522665024,
"learning_rate": 3.973388660830269e-05,
"loss": 0.0071,
"step": 121
},
{
"epoch": 0.4504442136840891,
"grad_norm": 0.18873853981494904,
"learning_rate": 3.971972958474113e-05,
"loss": 0.0126,
"step": 122
},
{
"epoch": 0.4541363793700242,
"grad_norm": 0.14228899776935577,
"learning_rate": 3.97052083399315e-05,
"loss": 0.0108,
"step": 123
},
{
"epoch": 0.4578285450559594,
"grad_norm": 0.16871212422847748,
"learning_rate": 3.969032314207961e-05,
"loss": 0.0127,
"step": 124
},
{
"epoch": 0.46152071074189455,
"grad_norm": 0.10817577689886093,
"learning_rate": 3.967507426611344e-05,
"loss": 0.0166,
"step": 125
},
{
"epoch": 0.4652128764278297,
"grad_norm": 0.09942852705717087,
"learning_rate": 3.965946199367804e-05,
"loss": 0.0087,
"step": 126
},
{
"epoch": 0.4689050421137649,
"grad_norm": 0.08521483838558197,
"learning_rate": 3.96434866131304e-05,
"loss": 0.01,
"step": 127
},
{
"epoch": 0.4725972077997,
"grad_norm": 0.09523651003837585,
"learning_rate": 3.9627148419534026e-05,
"loss": 0.0172,
"step": 128
},
{
"epoch": 0.47628937348563516,
"grad_norm": 0.08050525188446045,
"learning_rate": 3.961044771465359e-05,
"loss": 0.0052,
"step": 129
},
{
"epoch": 0.4799815391715703,
"grad_norm": 0.13714231550693512,
"learning_rate": 3.9593384806949263e-05,
"loss": 0.0156,
"step": 130
},
{
"epoch": 0.4836737048575055,
"grad_norm": 0.08329559117555618,
"learning_rate": 3.9575960011571106e-05,
"loss": 0.014,
"step": 131
},
{
"epoch": 0.48736587054344066,
"grad_norm": 0.109232597053051,
"learning_rate": 3.955817365035316e-05,
"loss": 0.0081,
"step": 132
},
{
"epoch": 0.49105803622937577,
"grad_norm": 0.09475994855165482,
"learning_rate": 3.954002605180759e-05,
"loss": 0.0066,
"step": 133
},
{
"epoch": 0.49475020191531094,
"grad_norm": 0.1157636046409607,
"learning_rate": 3.952151755111855e-05,
"loss": 0.0226,
"step": 134
},
{
"epoch": 0.4984423676012461,
"grad_norm": 0.1317131668329239,
"learning_rate": 3.9502648490136016e-05,
"loss": 0.0097,
"step": 135
},
{
"epoch": 0.5021345332871813,
"grad_norm": 0.12749852240085602,
"learning_rate": 3.948341921736948e-05,
"loss": 0.0104,
"step": 136
},
{
"epoch": 0.5058266989731164,
"grad_norm": 0.08242950588464737,
"learning_rate": 3.946383008798152e-05,
"loss": 0.0054,
"step": 137
},
{
"epoch": 0.5095188646590516,
"grad_norm": 0.125824436545372,
"learning_rate": 3.94438814637812e-05,
"loss": 0.0173,
"step": 138
},
{
"epoch": 0.5132110303449867,
"grad_norm": 0.09344890713691711,
"learning_rate": 3.942357371321743e-05,
"loss": 0.0105,
"step": 139
},
{
"epoch": 0.5169031960309219,
"grad_norm": 0.11648397147655487,
"learning_rate": 3.940290721137214e-05,
"loss": 0.0211,
"step": 140
},
{
"epoch": 0.520595361716857,
"grad_norm": 0.10685181617736816,
"learning_rate": 3.938188233995336e-05,
"loss": 0.0105,
"step": 141
},
{
"epoch": 0.5242875274027922,
"grad_norm": 0.1415766328573227,
"learning_rate": 3.936049948728816e-05,
"loss": 0.0132,
"step": 142
},
{
"epoch": 0.5279796930887274,
"grad_norm": 0.20898067951202393,
"learning_rate": 3.933875904831551e-05,
"loss": 0.0163,
"step": 143
},
{
"epoch": 0.5316718587746625,
"grad_norm": 0.09275670349597931,
"learning_rate": 3.931666142457891e-05,
"loss": 0.0099,
"step": 144
},
{
"epoch": 0.5353640244605977,
"grad_norm": 0.08970851451158524,
"learning_rate": 3.929420702421907e-05,
"loss": 0.0064,
"step": 145
},
{
"epoch": 0.5390561901465328,
"grad_norm": 0.10725483298301697,
"learning_rate": 3.9271396261966305e-05,
"loss": 0.0061,
"step": 146
},
{
"epoch": 0.5427483558324679,
"grad_norm": 0.10847834497690201,
"learning_rate": 3.92482295591329e-05,
"loss": 0.0135,
"step": 147
},
{
"epoch": 0.5464405215184032,
"grad_norm": 0.08085348457098007,
"learning_rate": 3.9224707343605315e-05,
"loss": 0.0092,
"step": 148
},
{
"epoch": 0.5501326872043383,
"grad_norm": 0.10083277523517609,
"learning_rate": 3.92008300498363e-05,
"loss": 0.0107,
"step": 149
},
{
"epoch": 0.5538248528902735,
"grad_norm": 0.25901204347610474,
"learning_rate": 3.917659811883687e-05,
"loss": 0.0061,
"step": 150
},
{
"epoch": 0.5575170185762086,
"grad_norm": 0.06085089221596718,
"learning_rate": 3.915201199816812e-05,
"loss": 0.0065,
"step": 151
},
{
"epoch": 0.5612091842621437,
"grad_norm": 0.19471333920955658,
"learning_rate": 3.9127072141933025e-05,
"loss": 0.0159,
"step": 152
},
{
"epoch": 0.564901349948079,
"grad_norm": 1.377177119255066,
"learning_rate": 3.910177901076799e-05,
"loss": 0.0129,
"step": 153
},
{
"epoch": 0.5685935156340141,
"grad_norm": 0.12816794216632843,
"learning_rate": 3.907613307183439e-05,
"loss": 0.0145,
"step": 154
},
{
"epoch": 0.5722856813199493,
"grad_norm": 1.0884933471679688,
"learning_rate": 3.905013479880992e-05,
"loss": 0.0296,
"step": 155
},
{
"epoch": 0.5759778470058844,
"grad_norm": 0.19217349588871002,
"learning_rate": 3.902378467187981e-05,
"loss": 0.0066,
"step": 156
},
{
"epoch": 0.5796700126918195,
"grad_norm": 0.17874930799007416,
"learning_rate": 3.8997083177728044e-05,
"loss": 0.007,
"step": 157
},
{
"epoch": 0.5833621783777547,
"grad_norm": 0.17409397661685944,
"learning_rate": 3.897003080952828e-05,
"loss": 0.0131,
"step": 158
},
{
"epoch": 0.5870543440636898,
"grad_norm": 0.1562495231628418,
"learning_rate": 3.8942628066934826e-05,
"loss": 0.0086,
"step": 159
},
{
"epoch": 0.5907465097496251,
"grad_norm": 0.15565301477909088,
"learning_rate": 3.891487545607332e-05,
"loss": 0.0104,
"step": 160
},
{
"epoch": 0.5944386754355602,
"grad_norm": 0.14708881080150604,
"learning_rate": 3.888677348953145e-05,
"loss": 0.0097,
"step": 161
},
{
"epoch": 0.5981308411214953,
"grad_norm": 0.14112205803394318,
"learning_rate": 3.885832268634946e-05,
"loss": 0.0111,
"step": 162
},
{
"epoch": 0.6018230068074305,
"grad_norm": 0.3906686305999756,
"learning_rate": 3.8829523572010586e-05,
"loss": 0.0163,
"step": 163
},
{
"epoch": 0.6055151724933656,
"grad_norm": 0.33456534147262573,
"learning_rate": 3.880037667843131e-05,
"loss": 0.0183,
"step": 164
},
{
"epoch": 0.6092073381793008,
"grad_norm": 0.18434567749500275,
"learning_rate": 3.877088254395157e-05,
"loss": 0.0128,
"step": 165
},
{
"epoch": 0.612899503865236,
"grad_norm": 0.1495783030986786,
"learning_rate": 3.874104171332481e-05,
"loss": 0.0255,
"step": 166
},
{
"epoch": 0.6165916695511711,
"grad_norm": 0.17371267080307007,
"learning_rate": 3.871085473770789e-05,
"loss": 0.01,
"step": 167
},
{
"epoch": 0.6202838352371063,
"grad_norm": 0.11804357171058655,
"learning_rate": 3.868032217465097e-05,
"loss": 0.0089,
"step": 168
},
{
"epoch": 0.6239760009230414,
"grad_norm": 0.19819952547550201,
"learning_rate": 3.864944458808712e-05,
"loss": 0.0146,
"step": 169
},
{
"epoch": 0.6276681666089766,
"grad_norm": 0.14190199971199036,
"learning_rate": 3.861822254832201e-05,
"loss": 0.0106,
"step": 170
},
{
"epoch": 0.6313603322949117,
"grad_norm": 0.08863028138875961,
"learning_rate": 3.858665663202329e-05,
"loss": 0.0077,
"step": 171
},
{
"epoch": 0.6350524979808468,
"grad_norm": 0.2075955718755722,
"learning_rate": 3.855474742220998e-05,
"loss": 0.0097,
"step": 172
},
{
"epoch": 0.6387446636667821,
"grad_norm": 0.1362001597881317,
"learning_rate": 3.852249550824169e-05,
"loss": 0.0168,
"step": 173
},
{
"epoch": 0.6424368293527172,
"grad_norm": 0.8073941469192505,
"learning_rate": 3.848990148580776e-05,
"loss": 0.0298,
"step": 174
},
{
"epoch": 0.6461289950386524,
"grad_norm": 0.08775315433740616,
"learning_rate": 3.84569659569162e-05,
"loss": 0.0106,
"step": 175
},
{
"epoch": 0.6498211607245875,
"grad_norm": 0.3348487615585327,
"learning_rate": 3.8423689529882635e-05,
"loss": 0.0205,
"step": 176
},
{
"epoch": 0.6535133264105226,
"grad_norm": 0.1472369283437729,
"learning_rate": 3.839007281931902e-05,
"loss": 0.0249,
"step": 177
},
{
"epoch": 0.6572054920964578,
"grad_norm": 0.12712818384170532,
"learning_rate": 3.835611644612234e-05,
"loss": 0.0273,
"step": 178
},
{
"epoch": 0.660897657782393,
"grad_norm": 0.16857929527759552,
"learning_rate": 3.832182103746308e-05,
"loss": 0.0154,
"step": 179
},
{
"epoch": 0.6645898234683282,
"grad_norm": 0.11381746828556061,
"learning_rate": 3.828718722677369e-05,
"loss": 0.0072,
"step": 180
},
{
"epoch": 0.6682819891542633,
"grad_norm": 0.27607351541519165,
"learning_rate": 3.825221565373687e-05,
"loss": 0.0147,
"step": 181
},
{
"epoch": 0.6719741548401984,
"grad_norm": 0.09366216510534286,
"learning_rate": 3.821690696427373e-05,
"loss": 0.0075,
"step": 182
},
{
"epoch": 0.6756663205261336,
"grad_norm": 0.11064954102039337,
"learning_rate": 3.8181261810531926e-05,
"loss": 0.0068,
"step": 183
},
{
"epoch": 0.6793584862120687,
"grad_norm": 0.10137422382831573,
"learning_rate": 3.8145280850873524e-05,
"loss": 0.0078,
"step": 184
},
{
"epoch": 0.6793584862120687,
"eval_loss": 0.01165215577930212,
"eval_runtime": 90.6071,
"eval_samples_per_second": 10.076,
"eval_steps_per_second": 5.044,
"step": 184
},
{
"epoch": 0.683050651898004,
"grad_norm": 0.1427939236164093,
"learning_rate": 3.810896474986294e-05,
"loss": 0.0099,
"step": 185
},
{
"epoch": 0.6867428175839391,
"grad_norm": 0.11883988231420517,
"learning_rate": 3.8072314178254556e-05,
"loss": 0.0111,
"step": 186
},
{
"epoch": 0.6904349832698742,
"grad_norm": 0.10005701333284378,
"learning_rate": 3.803532981298044e-05,
"loss": 0.0066,
"step": 187
},
{
"epoch": 0.6941271489558094,
"grad_norm": 0.11374926567077637,
"learning_rate": 3.7998012337137765e-05,
"loss": 0.0115,
"step": 188
},
{
"epoch": 0.6978193146417445,
"grad_norm": 0.1569487750530243,
"learning_rate": 3.7960362439976234e-05,
"loss": 0.0086,
"step": 189
},
{
"epoch": 0.7015114803276797,
"grad_norm": 0.11175687611103058,
"learning_rate": 3.7922380816885323e-05,
"loss": 0.0112,
"step": 190
},
{
"epoch": 0.7052036460136148,
"grad_norm": 0.1092715710401535,
"learning_rate": 3.7884068169381454e-05,
"loss": 0.0077,
"step": 191
},
{
"epoch": 0.7088958116995501,
"grad_norm": 0.19558553397655487,
"learning_rate": 3.784542520509503e-05,
"loss": 0.0098,
"step": 192
},
{
"epoch": 0.7125879773854852,
"grad_norm": 0.13434702157974243,
"learning_rate": 3.78064526377574e-05,
"loss": 0.0151,
"step": 193
},
{
"epoch": 0.7162801430714203,
"grad_norm": 0.15444593131542206,
"learning_rate": 3.7767151187187586e-05,
"loss": 0.0106,
"step": 194
},
{
"epoch": 0.7199723087573555,
"grad_norm": 0.06894934922456741,
"learning_rate": 3.7727521579279095e-05,
"loss": 0.0049,
"step": 195
},
{
"epoch": 0.7236644744432906,
"grad_norm": 0.1396859586238861,
"learning_rate": 3.768756454598645e-05,
"loss": 0.0118,
"step": 196
},
{
"epoch": 0.7273566401292259,
"grad_norm": 0.1533813774585724,
"learning_rate": 3.764728082531169e-05,
"loss": 0.0104,
"step": 197
},
{
"epoch": 0.731048805815161,
"grad_norm": 0.1005004420876503,
"learning_rate": 3.760667116129072e-05,
"loss": 0.0041,
"step": 198
},
{
"epoch": 0.7347409715010961,
"grad_norm": 0.1538483053445816,
"learning_rate": 3.756573630397958e-05,
"loss": 0.0083,
"step": 199
},
{
"epoch": 0.7384331371870313,
"grad_norm": 0.09973620623350143,
"learning_rate": 3.752447700944064e-05,
"loss": 0.0067,
"step": 200
},
{
"epoch": 0.7421253028729664,
"grad_norm": 0.17357227206230164,
"learning_rate": 3.7482894039728525e-05,
"loss": 0.0165,
"step": 201
},
{
"epoch": 0.7458174685589016,
"grad_norm": 0.1083260253071785,
"learning_rate": 3.744098816287616e-05,
"loss": 0.0063,
"step": 202
},
{
"epoch": 0.7495096342448367,
"grad_norm": 0.0863720178604126,
"learning_rate": 3.7398760152880484e-05,
"loss": 0.0072,
"step": 203
},
{
"epoch": 0.7532017999307719,
"grad_norm": 0.1260920763015747,
"learning_rate": 3.735621078968823e-05,
"loss": 0.0123,
"step": 204
},
{
"epoch": 0.7568939656167071,
"grad_norm": 0.0873272716999054,
"learning_rate": 3.731334085918149e-05,
"loss": 0.0223,
"step": 205
},
{
"epoch": 0.7605861313026422,
"grad_norm": 0.0700017586350441,
"learning_rate": 3.7270151153163174e-05,
"loss": 0.0073,
"step": 206
},
{
"epoch": 0.7642782969885774,
"grad_norm": 0.07996451109647751,
"learning_rate": 3.722664246934244e-05,
"loss": 0.0061,
"step": 207
},
{
"epoch": 0.7679704626745125,
"grad_norm": 0.10587499290704727,
"learning_rate": 3.718281561131992e-05,
"loss": 0.0365,
"step": 208
},
{
"epoch": 0.7716626283604476,
"grad_norm": 0.14550632238388062,
"learning_rate": 3.713867138857288e-05,
"loss": 0.0151,
"step": 209
},
{
"epoch": 0.7753547940463829,
"grad_norm": 0.17798705399036407,
"learning_rate": 3.7094210616440284e-05,
"loss": 0.0116,
"step": 210
},
{
"epoch": 0.779046959732318,
"grad_norm": 0.08906543254852295,
"learning_rate": 3.704943411610774e-05,
"loss": 0.0076,
"step": 211
},
{
"epoch": 0.7827391254182532,
"grad_norm": 0.07332167774438858,
"learning_rate": 3.700434271459229e-05,
"loss": 0.0086,
"step": 212
},
{
"epoch": 0.7864312911041883,
"grad_norm": 0.11497493088245392,
"learning_rate": 3.69589372447272e-05,
"loss": 0.015,
"step": 213
},
{
"epoch": 0.7901234567901234,
"grad_norm": 0.08599186688661575,
"learning_rate": 3.6913218545146536e-05,
"loss": 0.0111,
"step": 214
},
{
"epoch": 0.7938156224760586,
"grad_norm": 0.0786437839269638,
"learning_rate": 3.686718746026967e-05,
"loss": 0.0084,
"step": 215
},
{
"epoch": 0.7975077881619937,
"grad_norm": 0.07181154191493988,
"learning_rate": 3.68208448402857e-05,
"loss": 0.008,
"step": 216
},
{
"epoch": 0.801199953847929,
"grad_norm": 0.11212016642093658,
"learning_rate": 3.677419154113776e-05,
"loss": 0.014,
"step": 217
},
{
"epoch": 0.8048921195338641,
"grad_norm": 0.0851934626698494,
"learning_rate": 3.672722842450717e-05,
"loss": 0.012,
"step": 218
},
{
"epoch": 0.8085842852197992,
"grad_norm": 0.053547583520412445,
"learning_rate": 3.667995635779756e-05,
"loss": 0.0048,
"step": 219
},
{
"epoch": 0.8122764509057344,
"grad_norm": 0.09233611822128296,
"learning_rate": 3.6632376214118836e-05,
"loss": 0.0082,
"step": 220
},
{
"epoch": 0.8159686165916695,
"grad_norm": 0.08250945061445236,
"learning_rate": 3.6584488872271035e-05,
"loss": 0.005,
"step": 221
},
{
"epoch": 0.8196607822776047,
"grad_norm": 0.06966373324394226,
"learning_rate": 3.6536295216728136e-05,
"loss": 0.0058,
"step": 222
},
{
"epoch": 0.8233529479635399,
"grad_norm": 0.08930462598800659,
"learning_rate": 3.648779613762167e-05,
"loss": 0.0098,
"step": 223
},
{
"epoch": 0.827045113649475,
"grad_norm": 0.11651594936847687,
"learning_rate": 3.643899253072433e-05,
"loss": 0.0149,
"step": 224
},
{
"epoch": 0.8307372793354102,
"grad_norm": 0.08121360838413239,
"learning_rate": 3.63898852974334e-05,
"loss": 0.0077,
"step": 225
},
{
"epoch": 0.8344294450213453,
"grad_norm": 0.0807429626584053,
"learning_rate": 3.634047534475409e-05,
"loss": 0.0071,
"step": 226
},
{
"epoch": 0.8381216107072805,
"grad_norm": 0.10255023092031479,
"learning_rate": 3.629076358528284e-05,
"loss": 0.0122,
"step": 227
},
{
"epoch": 0.8418137763932156,
"grad_norm": 0.12542816996574402,
"learning_rate": 3.62407509371904e-05,
"loss": 0.0093,
"step": 228
},
{
"epoch": 0.8455059420791508,
"grad_norm": 0.07941003143787384,
"learning_rate": 3.6190438324204905e-05,
"loss": 0.0059,
"step": 229
},
{
"epoch": 0.849198107765086,
"grad_norm": 0.0921979695558548,
"learning_rate": 3.613982667559483e-05,
"loss": 0.0083,
"step": 230
},
{
"epoch": 0.8528902734510211,
"grad_norm": 0.10604582726955414,
"learning_rate": 3.608891692615176e-05,
"loss": 0.0128,
"step": 231
},
{
"epoch": 0.8565824391369563,
"grad_norm": 0.09270962327718735,
"learning_rate": 3.603771001617322e-05,
"loss": 0.0141,
"step": 232
},
{
"epoch": 0.8602746048228914,
"grad_norm": 0.062396857887506485,
"learning_rate": 3.598620689144523e-05,
"loss": 0.0094,
"step": 233
},
{
"epoch": 0.8639667705088265,
"grad_norm": 0.07858982682228088,
"learning_rate": 3.5934408503224864e-05,
"loss": 0.0071,
"step": 234
},
{
"epoch": 0.8676589361947618,
"grad_norm": 0.07246547937393188,
"learning_rate": 3.588231580822269e-05,
"loss": 0.0057,
"step": 235
},
{
"epoch": 0.8713511018806969,
"grad_norm": 0.3656146824359894,
"learning_rate": 3.5829929768585086e-05,
"loss": 0.0103,
"step": 236
},
{
"epoch": 0.8750432675666321,
"grad_norm": 0.05800218507647514,
"learning_rate": 3.577725135187647e-05,
"loss": 0.0061,
"step": 237
},
{
"epoch": 0.8787354332525672,
"grad_norm": 0.08162401616573334,
"learning_rate": 3.5724281531061436e-05,
"loss": 0.0143,
"step": 238
},
{
"epoch": 0.8824275989385023,
"grad_norm": 0.07134553790092468,
"learning_rate": 3.567102128448678e-05,
"loss": 0.0054,
"step": 239
},
{
"epoch": 0.8861197646244375,
"grad_norm": 0.13681039214134216,
"learning_rate": 3.561747159586343e-05,
"loss": 0.0198,
"step": 240
},
{
"epoch": 0.8898119303103726,
"grad_norm": 0.05192103236913681,
"learning_rate": 3.5563633454248275e-05,
"loss": 0.0043,
"step": 241
},
{
"epoch": 0.8935040959963079,
"grad_norm": 0.08083774149417877,
"learning_rate": 3.550950785402591e-05,
"loss": 0.0062,
"step": 242
},
{
"epoch": 0.897196261682243,
"grad_norm": 0.17133909463882446,
"learning_rate": 3.5455095794890234e-05,
"loss": 0.0086,
"step": 243
},
{
"epoch": 0.9008884273681782,
"grad_norm": 0.05909837409853935,
"learning_rate": 3.540039828182604e-05,
"loss": 0.0052,
"step": 244
},
{
"epoch": 0.9045805930541133,
"grad_norm": 0.051370203495025635,
"learning_rate": 3.53454163250904e-05,
"loss": 0.0058,
"step": 245
},
{
"epoch": 0.9082727587400484,
"grad_norm": 0.14762379229068756,
"learning_rate": 3.529015094019405e-05,
"loss": 0.0182,
"step": 246
},
{
"epoch": 0.9119649244259836,
"grad_norm": 0.052717871963977814,
"learning_rate": 3.523460314788259e-05,
"loss": 0.004,
"step": 247
},
{
"epoch": 0.9156570901119188,
"grad_norm": 0.0905941054224968,
"learning_rate": 3.517877397411768e-05,
"loss": 0.0152,
"step": 248
},
{
"epoch": 0.919349255797854,
"grad_norm": 0.08593659847974777,
"learning_rate": 3.5122664450058044e-05,
"loss": 0.0089,
"step": 249
},
{
"epoch": 0.9230414214837891,
"grad_norm": 0.08689778298139572,
"learning_rate": 3.506627561204045e-05,
"loss": 0.017,
"step": 250
},
{
"epoch": 0.9267335871697242,
"grad_norm": 0.07897453010082245,
"learning_rate": 3.5009608501560585e-05,
"loss": 0.01,
"step": 251
},
{
"epoch": 0.9304257528556594,
"grad_norm": 0.12059691548347473,
"learning_rate": 3.495266416525376e-05,
"loss": 0.0279,
"step": 252
},
{
"epoch": 0.9341179185415945,
"grad_norm": 0.06975448876619339,
"learning_rate": 3.489544365487564e-05,
"loss": 0.0054,
"step": 253
},
{
"epoch": 0.9378100842275298,
"grad_norm": 0.11304624378681183,
"learning_rate": 3.48379480272828e-05,
"loss": 0.0117,
"step": 254
},
{
"epoch": 0.9415022499134649,
"grad_norm": 0.07075347006320953,
"learning_rate": 3.478017834441319e-05,
"loss": 0.0136,
"step": 255
},
{
"epoch": 0.9451944155994,
"grad_norm": 0.04639885574579239,
"learning_rate": 3.472213567326652e-05,
"loss": 0.0039,
"step": 256
},
{
"epoch": 0.9488865812853352,
"grad_norm": 0.048733506351709366,
"learning_rate": 3.4663821085884597e-05,
"loss": 0.0044,
"step": 257
},
{
"epoch": 0.9525787469712703,
"grad_norm": 0.06614992767572403,
"learning_rate": 3.460523565933145e-05,
"loss": 0.0044,
"step": 258
},
{
"epoch": 0.9562709126572055,
"grad_norm": 0.16428013145923615,
"learning_rate": 3.4546380475673514e-05,
"loss": 0.0079,
"step": 259
},
{
"epoch": 0.9599630783431407,
"grad_norm": 0.11396708339452744,
"learning_rate": 3.448725662195959e-05,
"loss": 0.024,
"step": 260
},
{
"epoch": 0.9636552440290758,
"grad_norm": 0.10393685102462769,
"learning_rate": 3.442786519020077e-05,
"loss": 0.0238,
"step": 261
},
{
"epoch": 0.967347409715011,
"grad_norm": 0.09232791513204575,
"learning_rate": 3.436820727735031e-05,
"loss": 0.0145,
"step": 262
},
{
"epoch": 0.9710395754009461,
"grad_norm": 0.1006862074136734,
"learning_rate": 3.430828398528336e-05,
"loss": 0.0166,
"step": 263
},
{
"epoch": 0.9747317410868813,
"grad_norm": 0.10068142414093018,
"learning_rate": 3.4248096420776536e-05,
"loss": 0.0088,
"step": 264
},
{
"epoch": 0.9784239067728164,
"grad_norm": 0.05003920570015907,
"learning_rate": 3.418764569548758e-05,
"loss": 0.005,
"step": 265
},
{
"epoch": 0.9821160724587515,
"grad_norm": 0.05870620906352997,
"learning_rate": 3.412693292593478e-05,
"loss": 0.006,
"step": 266
},
{
"epoch": 0.9858082381446868,
"grad_norm": 0.09036832302808762,
"learning_rate": 3.4065959233476334e-05,
"loss": 0.0097,
"step": 267
},
{
"epoch": 0.9895004038306219,
"grad_norm": 0.07893083244562149,
"learning_rate": 3.4004725744289685e-05,
"loss": 0.0068,
"step": 268
},
{
"epoch": 0.9931925695165571,
"grad_norm": 0.08609715849161148,
"learning_rate": 3.394323358935068e-05,
"loss": 0.0101,
"step": 269
},
{
"epoch": 0.9968847352024922,
"grad_norm": 0.07740162312984467,
"learning_rate": 3.3881483904412685e-05,
"loss": 0.0087,
"step": 270
},
{
"epoch": 1.0,
"grad_norm": 0.0650748685002327,
"learning_rate": 3.3819477829985624e-05,
"loss": 0.0091,
"step": 271
},
{
"epoch": 1.0036921656859352,
"grad_norm": 0.060567598789930344,
"learning_rate": 3.3757216511314915e-05,
"loss": 0.0158,
"step": 272
},
{
"epoch": 1.0073843313718702,
"grad_norm": 0.08802874386310577,
"learning_rate": 3.3694701098360295e-05,
"loss": 0.0063,
"step": 273
},
{
"epoch": 1.0110764970578054,
"grad_norm": 0.05286823958158493,
"learning_rate": 3.363193274577461e-05,
"loss": 0.0022,
"step": 274
},
{
"epoch": 1.0147686627437407,
"grad_norm": 0.059792377054691315,
"learning_rate": 3.356891261288247e-05,
"loss": 0.0094,
"step": 275
},
{
"epoch": 1.018460828429676,
"grad_norm": 0.03914966061711311,
"learning_rate": 3.350564186365882e-05,
"loss": 0.0027,
"step": 276
},
{
"epoch": 1.018460828429676,
"eval_loss": 0.008386676199734211,
"eval_runtime": 89.8024,
"eval_samples_per_second": 10.167,
"eval_steps_per_second": 5.089,
"step": 276
},
{
"epoch": 1.022152994115611,
"grad_norm": 0.05939140170812607,
"learning_rate": 3.344212166670748e-05,
"loss": 0.0046,
"step": 277
},
{
"epoch": 1.0258451598015461,
"grad_norm": 0.23599718511104584,
"learning_rate": 3.3378353195239546e-05,
"loss": 0.0088,
"step": 278
},
{
"epoch": 1.0295373254874813,
"grad_norm": 0.04266897588968277,
"learning_rate": 3.331433762705171e-05,
"loss": 0.0025,
"step": 279
},
{
"epoch": 1.0332294911734163,
"grad_norm": 0.07201959937810898,
"learning_rate": 3.32500761445045e-05,
"loss": 0.0092,
"step": 280
},
{
"epoch": 1.0369216568593516,
"grad_norm": 0.38548168540000916,
"learning_rate": 3.318556993450048e-05,
"loss": 0.0054,
"step": 281
},
{
"epoch": 1.0406138225452868,
"grad_norm": 0.09021361917257309,
"learning_rate": 3.312082018846229e-05,
"loss": 0.0038,
"step": 282
},
{
"epoch": 1.0443059882312218,
"grad_norm": 0.1946287751197815,
"learning_rate": 3.3055828102310656e-05,
"loss": 0.0054,
"step": 283
},
{
"epoch": 1.047998153917157,
"grad_norm": 0.08428189903497696,
"learning_rate": 3.299059487644229e-05,
"loss": 0.0153,
"step": 284
},
{
"epoch": 1.0516903196030922,
"grad_norm": 0.06667447090148926,
"learning_rate": 3.292512171570775e-05,
"loss": 0.0066,
"step": 285
},
{
"epoch": 1.0553824852890274,
"grad_norm": 0.04148027300834656,
"learning_rate": 3.2859409829389146e-05,
"loss": 0.0029,
"step": 286
},
{
"epoch": 1.0590746509749625,
"grad_norm": 0.06811388581991196,
"learning_rate": 3.2793460431177827e-05,
"loss": 0.0058,
"step": 287
},
{
"epoch": 1.0627668166608977,
"grad_norm": 0.04447786882519722,
"learning_rate": 3.272727473915197e-05,
"loss": 0.0034,
"step": 288
},
{
"epoch": 1.066458982346833,
"grad_norm": 0.22201436758041382,
"learning_rate": 3.266085397575406e-05,
"loss": 0.0123,
"step": 289
},
{
"epoch": 1.070151148032768,
"grad_norm": 0.0670245811343193,
"learning_rate": 3.259419936776833e-05,
"loss": 0.0088,
"step": 290
},
{
"epoch": 1.0738433137187031,
"grad_norm": 0.06902515143156052,
"learning_rate": 3.25273121462981e-05,
"loss": 0.0025,
"step": 291
},
{
"epoch": 1.0775354794046383,
"grad_norm": 0.1305130422115326,
"learning_rate": 3.246019354674303e-05,
"loss": 0.0093,
"step": 292
},
{
"epoch": 1.0812276450905733,
"grad_norm": 0.11023421585559845,
"learning_rate": 3.239284480877632e-05,
"loss": 0.0085,
"step": 293
},
{
"epoch": 1.0849198107765086,
"grad_norm": 0.07404778897762299,
"learning_rate": 3.232526717632178e-05,
"loss": 0.0045,
"step": 294
},
{
"epoch": 1.0886119764624438,
"grad_norm": 0.054306432604789734,
"learning_rate": 3.22574618975309e-05,
"loss": 0.0032,
"step": 295
},
{
"epoch": 1.092304142148379,
"grad_norm": 0.05058205872774124,
"learning_rate": 3.218943022475975e-05,
"loss": 0.0046,
"step": 296
},
{
"epoch": 1.095996307834314,
"grad_norm": 0.04942139610648155,
"learning_rate": 3.2121173414545886e-05,
"loss": 0.004,
"step": 297
},
{
"epoch": 1.0996884735202492,
"grad_norm": 0.04936928302049637,
"learning_rate": 3.205269272758513e-05,
"loss": 0.0038,
"step": 298
},
{
"epoch": 1.1033806392061845,
"grad_norm": 0.06841211020946503,
"learning_rate": 3.198398942870828e-05,
"loss": 0.005,
"step": 299
},
{
"epoch": 1.1070728048921195,
"grad_norm": 0.11358197033405304,
"learning_rate": 3.1915064786857745e-05,
"loss": 0.0189,
"step": 300
},
{
"epoch": 1.1107649705780547,
"grad_norm": 0.047055114060640335,
"learning_rate": 3.1845920075064115e-05,
"loss": 0.0022,
"step": 301
},
{
"epoch": 1.11445713626399,
"grad_norm": 0.06010272353887558,
"learning_rate": 3.177655657042266e-05,
"loss": 0.0042,
"step": 302
},
{
"epoch": 1.118149301949925,
"grad_norm": 0.24772150814533234,
"learning_rate": 3.170697555406972e-05,
"loss": 0.0104,
"step": 303
},
{
"epoch": 1.1218414676358601,
"grad_norm": 0.04649265855550766,
"learning_rate": 3.163717831115906e-05,
"loss": 0.0032,
"step": 304
},
{
"epoch": 1.1255336333217953,
"grad_norm": 0.0447462759912014,
"learning_rate": 3.156716613083811e-05,
"loss": 0.0038,
"step": 305
},
{
"epoch": 1.1292257990077306,
"grad_norm": 0.09475228935480118,
"learning_rate": 3.1496940306224185e-05,
"loss": 0.0107,
"step": 306
},
{
"epoch": 1.1329179646936656,
"grad_norm": 0.08427289873361588,
"learning_rate": 3.14265021343806e-05,
"loss": 0.0174,
"step": 307
},
{
"epoch": 1.1366101303796008,
"grad_norm": 0.05597711727023125,
"learning_rate": 3.1355852916292654e-05,
"loss": 0.0089,
"step": 308
},
{
"epoch": 1.140302296065536,
"grad_norm": 0.0892227441072464,
"learning_rate": 3.1284993956843685e-05,
"loss": 0.0135,
"step": 309
},
{
"epoch": 1.143994461751471,
"grad_norm": 0.05312884971499443,
"learning_rate": 3.121392656479094e-05,
"loss": 0.0043,
"step": 310
},
{
"epoch": 1.1476866274374062,
"grad_norm": 0.047065041959285736,
"learning_rate": 3.114265205274135e-05,
"loss": 0.0031,
"step": 311
},
{
"epoch": 1.1513787931233415,
"grad_norm": 0.05725245550274849,
"learning_rate": 3.1071171737127375e-05,
"loss": 0.0035,
"step": 312
},
{
"epoch": 1.1550709588092767,
"grad_norm": 0.04488014802336693,
"learning_rate": 3.0999486938182605e-05,
"loss": 0.003,
"step": 313
},
{
"epoch": 1.1587631244952117,
"grad_norm": 0.047577228397130966,
"learning_rate": 3.0927598979917454e-05,
"loss": 0.0031,
"step": 314
},
{
"epoch": 1.162455290181147,
"grad_norm": 0.0874548926949501,
"learning_rate": 3.085550919009464e-05,
"loss": 0.0102,
"step": 315
},
{
"epoch": 1.1661474558670821,
"grad_norm": 0.19847019016742706,
"learning_rate": 3.078321890020469e-05,
"loss": 0.008,
"step": 316
},
{
"epoch": 1.1698396215530171,
"grad_norm": 0.052011147141456604,
"learning_rate": 3.071072944544135e-05,
"loss": 0.0033,
"step": 317
},
{
"epoch": 1.1735317872389524,
"grad_norm": 0.12744103372097015,
"learning_rate": 3.0638042164676915e-05,
"loss": 0.0048,
"step": 318
},
{
"epoch": 1.1772239529248876,
"grad_norm": 0.0666290894150734,
"learning_rate": 3.0565158400437525e-05,
"loss": 0.0083,
"step": 319
},
{
"epoch": 1.1809161186108226,
"grad_norm": 0.06896385550498962,
"learning_rate": 3.0492079498878318e-05,
"loss": 0.0067,
"step": 320
},
{
"epoch": 1.1846082842967578,
"grad_norm": 0.07686945050954819,
"learning_rate": 3.041880680975861e-05,
"loss": 0.0046,
"step": 321
},
{
"epoch": 1.188300449982693,
"grad_norm": 0.08590448647737503,
"learning_rate": 3.0345341686416955e-05,
"loss": 0.0045,
"step": 322
},
{
"epoch": 1.191992615668628,
"grad_norm": 0.08735356479883194,
"learning_rate": 3.0271685485746154e-05,
"loss": 0.0051,
"step": 323
},
{
"epoch": 1.1956847813545632,
"grad_norm": 0.05842231586575508,
"learning_rate": 3.0197839568168167e-05,
"loss": 0.0035,
"step": 324
},
{
"epoch": 1.1993769470404985,
"grad_norm": 0.05802956968545914,
"learning_rate": 3.0123805297609005e-05,
"loss": 0.0065,
"step": 325
},
{
"epoch": 1.2030691127264337,
"grad_norm": 0.08226858079433441,
"learning_rate": 3.004958404147356e-05,
"loss": 0.0044,
"step": 326
},
{
"epoch": 1.2067612784123687,
"grad_norm": 0.04885147139430046,
"learning_rate": 2.9975177170620307e-05,
"loss": 0.0049,
"step": 327
},
{
"epoch": 1.210453444098304,
"grad_norm": 0.24907778203487396,
"learning_rate": 2.9900586059336008e-05,
"loss": 0.0248,
"step": 328
},
{
"epoch": 1.2141456097842391,
"grad_norm": 0.06446171551942825,
"learning_rate": 2.9825812085310327e-05,
"loss": 0.0036,
"step": 329
},
{
"epoch": 1.2178377754701741,
"grad_norm": 4.368242263793945,
"learning_rate": 2.975085662961039e-05,
"loss": 0.0103,
"step": 330
},
{
"epoch": 1.2215299411561094,
"grad_norm": 0.06198897585272789,
"learning_rate": 2.967572107665526e-05,
"loss": 0.0035,
"step": 331
},
{
"epoch": 1.2252221068420446,
"grad_norm": 0.13626688718795776,
"learning_rate": 2.960040681419039e-05,
"loss": 0.0053,
"step": 332
},
{
"epoch": 1.2289142725279798,
"grad_norm": 0.19492781162261963,
"learning_rate": 2.9524915233261944e-05,
"loss": 0.0084,
"step": 333
},
{
"epoch": 1.2326064382139148,
"grad_norm": 0.2215728908777237,
"learning_rate": 2.944924772819119e-05,
"loss": 0.0086,
"step": 334
},
{
"epoch": 1.23629860389985,
"grad_norm": 0.2793082892894745,
"learning_rate": 2.9373405696548656e-05,
"loss": 0.005,
"step": 335
},
{
"epoch": 1.2399907695857852,
"grad_norm": 0.04554106295108795,
"learning_rate": 2.9297390539128364e-05,
"loss": 0.004,
"step": 336
},
{
"epoch": 1.2436829352717202,
"grad_norm": 0.05815259367227554,
"learning_rate": 2.922120365992196e-05,
"loss": 0.0052,
"step": 337
},
{
"epoch": 1.2473751009576555,
"grad_norm": 0.07114019244909286,
"learning_rate": 2.9144846466092773e-05,
"loss": 0.0079,
"step": 338
},
{
"epoch": 1.2510672666435907,
"grad_norm": 0.0655088722705841,
"learning_rate": 2.9068320367949817e-05,
"loss": 0.0057,
"step": 339
},
{
"epoch": 1.254759432329526,
"grad_norm": 1.1501847505569458,
"learning_rate": 2.899162677892175e-05,
"loss": 0.0089,
"step": 340
},
{
"epoch": 1.258451598015461,
"grad_norm": 0.06778527051210403,
"learning_rate": 2.891476711553077e-05,
"loss": 0.0065,
"step": 341
},
{
"epoch": 1.2621437637013961,
"grad_norm": 0.12092837691307068,
"learning_rate": 2.8837742797366454e-05,
"loss": 0.0112,
"step": 342
},
{
"epoch": 1.2658359293873311,
"grad_norm": 0.08216589689254761,
"learning_rate": 2.876055524705953e-05,
"loss": 0.0085,
"step": 343
},
{
"epoch": 1.2695280950732664,
"grad_norm": 0.0777222067117691,
"learning_rate": 2.8683205890255613e-05,
"loss": 0.0037,
"step": 344
},
{
"epoch": 1.2732202607592016,
"grad_norm": 0.08206533640623093,
"learning_rate": 2.8605696155588855e-05,
"loss": 0.0061,
"step": 345
},
{
"epoch": 1.2769124264451368,
"grad_norm": 0.09279124438762665,
"learning_rate": 2.852802747465558e-05,
"loss": 0.0054,
"step": 346
},
{
"epoch": 1.2806045921310718,
"grad_norm": 0.16200777888298035,
"learning_rate": 2.845020128198782e-05,
"loss": 0.0099,
"step": 347
},
{
"epoch": 1.284296757817007,
"grad_norm": 0.2419300675392151,
"learning_rate": 2.837221901502685e-05,
"loss": 0.0171,
"step": 348
},
{
"epoch": 1.2879889235029423,
"grad_norm": 0.11904854327440262,
"learning_rate": 2.8294082114096607e-05,
"loss": 0.0187,
"step": 349
},
{
"epoch": 1.2916810891888773,
"grad_norm": 0.07375206053256989,
"learning_rate": 2.8215792022377092e-05,
"loss": 0.0154,
"step": 350
},
{
"epoch": 1.2953732548748125,
"grad_norm": 0.060616642236709595,
"learning_rate": 2.8137350185877744e-05,
"loss": 0.0031,
"step": 351
},
{
"epoch": 1.2990654205607477,
"grad_norm": 0.04948243498802185,
"learning_rate": 2.8058758053410704e-05,
"loss": 0.0025,
"step": 352
},
{
"epoch": 1.302757586246683,
"grad_norm": 0.07605982571840286,
"learning_rate": 2.7980017076564053e-05,
"loss": 0.0045,
"step": 353
},
{
"epoch": 1.306449751932618,
"grad_norm": 0.06741812080144882,
"learning_rate": 2.7901128709675025e-05,
"loss": 0.005,
"step": 354
},
{
"epoch": 1.3101419176185531,
"grad_norm": 0.09975893050432205,
"learning_rate": 2.782209440980312e-05,
"loss": 0.0067,
"step": 355
},
{
"epoch": 1.3138340833044884,
"grad_norm": 0.06588315218687057,
"learning_rate": 2.774291563670322e-05,
"loss": 0.0027,
"step": 356
},
{
"epoch": 1.3175262489904234,
"grad_norm": 0.11582572758197784,
"learning_rate": 2.766359385279859e-05,
"loss": 0.0047,
"step": 357
},
{
"epoch": 1.3212184146763586,
"grad_norm": 0.05676430091261864,
"learning_rate": 2.7584130523153906e-05,
"loss": 0.0022,
"step": 358
},
{
"epoch": 1.3249105803622938,
"grad_norm": 0.07599082589149475,
"learning_rate": 2.7504527115448176e-05,
"loss": 0.0047,
"step": 359
},
{
"epoch": 1.328602746048229,
"grad_norm": 0.053951650857925415,
"learning_rate": 2.742478509994763e-05,
"loss": 0.0031,
"step": 360
},
{
"epoch": 1.332294911734164,
"grad_norm": 0.05379689112305641,
"learning_rate": 2.7344905949478557e-05,
"loss": 0.0034,
"step": 361
},
{
"epoch": 1.3359870774200993,
"grad_norm": 0.08939212560653687,
"learning_rate": 2.7264891139400155e-05,
"loss": 0.0103,
"step": 362
},
{
"epoch": 1.3396792431060343,
"grad_norm": 0.05766845494508743,
"learning_rate": 2.718474214757719e-05,
"loss": 0.0036,
"step": 363
},
{
"epoch": 1.3433714087919695,
"grad_norm": 0.11903363466262817,
"learning_rate": 2.710446045435278e-05,
"loss": 0.0057,
"step": 364
},
{
"epoch": 1.3470635744779047,
"grad_norm": 0.07542143017053604,
"learning_rate": 2.7024047542521014e-05,
"loss": 0.0085,
"step": 365
},
{
"epoch": 1.35075574016384,
"grad_norm": 0.08536005765199661,
"learning_rate": 2.694350489729958e-05,
"loss": 0.0144,
"step": 366
},
{
"epoch": 1.3544479058497751,
"grad_norm": 0.09188759326934814,
"learning_rate": 2.6862834006302324e-05,
"loss": 0.0083,
"step": 367
},
{
"epoch": 1.3581400715357101,
"grad_norm": 0.1899387389421463,
"learning_rate": 2.678203635951177e-05,
"loss": 0.0084,
"step": 368
},
{
"epoch": 1.3581400715357101,
"eval_loss": 0.008687354624271393,
"eval_runtime": 90.5037,
"eval_samples_per_second": 10.088,
"eval_steps_per_second": 5.05,
"step": 368
},
{
"epoch": 1.3618322372216454,
"grad_norm": 0.046323299407958984,
"learning_rate": 2.6701113449251618e-05,
"loss": 0.0044,
"step": 369
},
{
"epoch": 1.3655244029075804,
"grad_norm": 0.06219512224197388,
"learning_rate": 2.6620066770159178e-05,
"loss": 0.0032,
"step": 370
},
{
"epoch": 1.3692165685935156,
"grad_norm": 0.1851065307855606,
"learning_rate": 2.6538897819157733e-05,
"loss": 0.005,
"step": 371
},
{
"epoch": 1.3729087342794508,
"grad_norm": 0.12302684038877487,
"learning_rate": 2.6457608095428925e-05,
"loss": 0.0056,
"step": 372
},
{
"epoch": 1.376600899965386,
"grad_norm": 0.06654980778694153,
"learning_rate": 2.6376199100385074e-05,
"loss": 0.0049,
"step": 373
},
{
"epoch": 1.380293065651321,
"grad_norm": 0.08494460582733154,
"learning_rate": 2.62946723376414e-05,
"loss": 0.011,
"step": 374
},
{
"epoch": 1.3839852313372563,
"grad_norm": 0.08226186037063599,
"learning_rate": 2.6213029312988294e-05,
"loss": 0.008,
"step": 375
},
{
"epoch": 1.3876773970231915,
"grad_norm": 0.06261271983385086,
"learning_rate": 2.6131271534363497e-05,
"loss": 0.0063,
"step": 376
},
{
"epoch": 1.3913695627091265,
"grad_norm": 0.040595002472400665,
"learning_rate": 2.604940051182422e-05,
"loss": 0.0029,
"step": 377
},
{
"epoch": 1.3950617283950617,
"grad_norm": 0.054169923067092896,
"learning_rate": 2.596741775751931e-05,
"loss": 0.0023,
"step": 378
},
{
"epoch": 1.398753894080997,
"grad_norm": 0.06638327986001968,
"learning_rate": 2.5885324785661263e-05,
"loss": 0.0059,
"step": 379
},
{
"epoch": 1.4024460597669322,
"grad_norm": 0.08735162764787674,
"learning_rate": 2.580312311249828e-05,
"loss": 0.0053,
"step": 380
},
{
"epoch": 1.4061382254528672,
"grad_norm": 0.03515574708580971,
"learning_rate": 2.572081425628628e-05,
"loss": 0.0026,
"step": 381
},
{
"epoch": 1.4098303911388024,
"grad_norm": 0.07844855636358261,
"learning_rate": 2.5638399737260837e-05,
"loss": 0.0071,
"step": 382
},
{
"epoch": 1.4135225568247374,
"grad_norm": 0.05010690912604332,
"learning_rate": 2.555588107760909e-05,
"loss": 0.0032,
"step": 383
},
{
"epoch": 1.4172147225106726,
"grad_norm": 0.048177916556596756,
"learning_rate": 2.5473259801441663e-05,
"loss": 0.0027,
"step": 384
},
{
"epoch": 1.4209068881966078,
"grad_norm": 0.073530413210392,
"learning_rate": 2.5390537434764483e-05,
"loss": 0.0066,
"step": 385
},
{
"epoch": 1.424599053882543,
"grad_norm": 0.13586187362670898,
"learning_rate": 2.530771550545061e-05,
"loss": 0.0111,
"step": 386
},
{
"epoch": 1.4282912195684783,
"grad_norm": 0.08986911922693253,
"learning_rate": 2.522479554321203e-05,
"loss": 0.003,
"step": 387
},
{
"epoch": 1.4319833852544133,
"grad_norm": 0.09543124586343765,
"learning_rate": 2.5141779079571366e-05,
"loss": 0.0058,
"step": 388
},
{
"epoch": 1.4356755509403485,
"grad_norm": 0.06117438152432442,
"learning_rate": 2.5058667647833615e-05,
"loss": 0.0031,
"step": 389
},
{
"epoch": 1.4393677166262835,
"grad_norm": 0.05431349202990532,
"learning_rate": 2.4975462783057837e-05,
"loss": 0.006,
"step": 390
},
{
"epoch": 1.4430598823122187,
"grad_norm": 0.04814140498638153,
"learning_rate": 2.4892166022028778e-05,
"loss": 0.0026,
"step": 391
},
{
"epoch": 1.446752047998154,
"grad_norm": 0.04245537519454956,
"learning_rate": 2.4808778903228506e-05,
"loss": 0.0024,
"step": 392
},
{
"epoch": 1.4504442136840892,
"grad_norm": 0.027589252218604088,
"learning_rate": 2.472530296680797e-05,
"loss": 0.0014,
"step": 393
},
{
"epoch": 1.4541363793700242,
"grad_norm": 0.053102582693099976,
"learning_rate": 2.4641739754558594e-05,
"loss": 0.0051,
"step": 394
},
{
"epoch": 1.4578285450559594,
"grad_norm": 0.04420861601829529,
"learning_rate": 2.4558090809883767e-05,
"loss": 0.0053,
"step": 395
},
{
"epoch": 1.4615207107418946,
"grad_norm": 0.07793322950601578,
"learning_rate": 2.4474357677770336e-05,
"loss": 0.013,
"step": 396
},
{
"epoch": 1.4652128764278296,
"grad_norm": 0.08467935770750046,
"learning_rate": 2.4390541904760105e-05,
"loss": 0.0059,
"step": 397
},
{
"epoch": 1.4689050421137648,
"grad_norm": 0.10742378234863281,
"learning_rate": 2.430664503892122e-05,
"loss": 0.0077,
"step": 398
},
{
"epoch": 1.4725972077997,
"grad_norm": 0.04297586902976036,
"learning_rate": 2.4222668629819622e-05,
"loss": 0.0021,
"step": 399
},
{
"epoch": 1.4762893734856353,
"grad_norm": 0.047660425305366516,
"learning_rate": 2.4138614228490395e-05,
"loss": 0.0048,
"step": 400
},
{
"epoch": 1.4799815391715703,
"grad_norm": 0.046939097344875336,
"learning_rate": 2.4054483387409135e-05,
"loss": 0.0024,
"step": 401
},
{
"epoch": 1.4836737048575055,
"grad_norm": 0.06429211050271988,
"learning_rate": 2.3970277660463275e-05,
"loss": 0.0032,
"step": 402
},
{
"epoch": 1.4873658705434407,
"grad_norm": 0.053521350026130676,
"learning_rate": 2.3885998602923387e-05,
"loss": 0.0033,
"step": 403
},
{
"epoch": 1.4910580362293757,
"grad_norm": 0.05599725991487503,
"learning_rate": 2.380164777141443e-05,
"loss": 0.0035,
"step": 404
},
{
"epoch": 1.494750201915311,
"grad_norm": 0.055872391909360886,
"learning_rate": 2.3717226723887037e-05,
"loss": 0.0041,
"step": 405
},
{
"epoch": 1.4984423676012462,
"grad_norm": 0.0680844634771347,
"learning_rate": 2.363273701958873e-05,
"loss": 0.0043,
"step": 406
},
{
"epoch": 1.5021345332871814,
"grad_norm": 0.14843975007534027,
"learning_rate": 2.35481802190351e-05,
"loss": 0.0105,
"step": 407
},
{
"epoch": 1.5058266989731164,
"grad_norm": 0.05503234639763832,
"learning_rate": 2.3463557883980995e-05,
"loss": 0.0046,
"step": 408
},
{
"epoch": 1.5095188646590516,
"grad_norm": 0.11701611429452896,
"learning_rate": 2.337887157739169e-05,
"loss": 0.0338,
"step": 409
},
{
"epoch": 1.5132110303449866,
"grad_norm": 0.08817101269960403,
"learning_rate": 2.3294122863414e-05,
"loss": 0.0129,
"step": 410
},
{
"epoch": 1.5169031960309218,
"grad_norm": 0.08257415890693665,
"learning_rate": 2.3209313307347413e-05,
"loss": 0.0104,
"step": 411
},
{
"epoch": 1.520595361716857,
"grad_norm": 0.05527732893824577,
"learning_rate": 2.312444447561514e-05,
"loss": 0.0038,
"step": 412
},
{
"epoch": 1.5242875274027923,
"grad_norm": 0.06380025297403336,
"learning_rate": 2.3039517935735215e-05,
"loss": 0.0051,
"step": 413
},
{
"epoch": 1.5279796930887275,
"grad_norm": 0.040247559547424316,
"learning_rate": 2.2954535256291554e-05,
"loss": 0.0027,
"step": 414
},
{
"epoch": 1.5316718587746625,
"grad_norm": 0.04606495052576065,
"learning_rate": 2.2869498006904934e-05,
"loss": 0.0043,
"step": 415
},
{
"epoch": 1.5353640244605977,
"grad_norm": 0.051601577550172806,
"learning_rate": 2.2784407758204054e-05,
"loss": 0.0039,
"step": 416
},
{
"epoch": 1.5390561901465327,
"grad_norm": 0.06773068755865097,
"learning_rate": 2.2699266081796493e-05,
"loss": 0.0042,
"step": 417
},
{
"epoch": 1.542748355832468,
"grad_norm": 0.06343487650156021,
"learning_rate": 2.2614074550239707e-05,
"loss": 0.0053,
"step": 418
},
{
"epoch": 1.5464405215184032,
"grad_norm": 0.04785839468240738,
"learning_rate": 2.2528834737011963e-05,
"loss": 0.0057,
"step": 419
},
{
"epoch": 1.5501326872043384,
"grad_norm": 0.06275342404842377,
"learning_rate": 2.2443548216483292e-05,
"loss": 0.0049,
"step": 420
},
{
"epoch": 1.5538248528902736,
"grad_norm": 0.06469978392124176,
"learning_rate": 2.235821656388638e-05,
"loss": 0.0049,
"step": 421
},
{
"epoch": 1.5575170185762086,
"grad_norm": 0.07403778284788132,
"learning_rate": 2.2272841355287526e-05,
"loss": 0.01,
"step": 422
},
{
"epoch": 1.5612091842621436,
"grad_norm": 0.041104815900325775,
"learning_rate": 2.2187424167557496e-05,
"loss": 0.0027,
"step": 423
},
{
"epoch": 1.5649013499480788,
"grad_norm": 0.09375158697366714,
"learning_rate": 2.210196657834239e-05,
"loss": 0.0037,
"step": 424
},
{
"epoch": 1.568593515634014,
"grad_norm": 0.043952830135822296,
"learning_rate": 2.2016470166034544e-05,
"loss": 0.0034,
"step": 425
},
{
"epoch": 1.5722856813199493,
"grad_norm": 0.07946529239416122,
"learning_rate": 2.193093650974334e-05,
"loss": 0.0103,
"step": 426
},
{
"epoch": 1.5759778470058845,
"grad_norm": 0.09605729579925537,
"learning_rate": 2.184536718926604e-05,
"loss": 0.0063,
"step": 427
},
{
"epoch": 1.5796700126918195,
"grad_norm": 0.04163216054439545,
"learning_rate": 2.175976378505865e-05,
"loss": 0.0036,
"step": 428
},
{
"epoch": 1.5833621783777547,
"grad_norm": 0.036647167056798935,
"learning_rate": 2.1674127878206664e-05,
"loss": 0.0037,
"step": 429
},
{
"epoch": 1.5870543440636897,
"grad_norm": 0.06975825875997543,
"learning_rate": 2.1588461050395918e-05,
"loss": 0.0077,
"step": 430
},
{
"epoch": 1.590746509749625,
"grad_norm": 0.06405274569988251,
"learning_rate": 2.1502764883883355e-05,
"loss": 0.0085,
"step": 431
},
{
"epoch": 1.5944386754355602,
"grad_norm": 0.09269712120294571,
"learning_rate": 2.141704096146779e-05,
"loss": 0.0192,
"step": 432
},
{
"epoch": 1.5981308411214954,
"grad_norm": 0.07872427254915237,
"learning_rate": 2.133129086646069e-05,
"loss": 0.0115,
"step": 433
},
{
"epoch": 1.6018230068074306,
"grad_norm": 0.053590744733810425,
"learning_rate": 2.1245516182656938e-05,
"loss": 0.0039,
"step": 434
},
{
"epoch": 1.6055151724933656,
"grad_norm": 0.05257750675082207,
"learning_rate": 2.1159718494305547e-05,
"loss": 0.005,
"step": 435
},
{
"epoch": 1.6092073381793008,
"grad_norm": 0.05436495319008827,
"learning_rate": 2.107389938608045e-05,
"loss": 0.0044,
"step": 436
},
{
"epoch": 1.6128995038652358,
"grad_norm": 0.063501738011837,
"learning_rate": 2.0988060443051165e-05,
"loss": 0.0059,
"step": 437
},
{
"epoch": 1.616591669551171,
"grad_norm": 0.06863530725240707,
"learning_rate": 2.0902203250653596e-05,
"loss": 0.0092,
"step": 438
},
{
"epoch": 1.6202838352371063,
"grad_norm": 0.06638474762439728,
"learning_rate": 2.0816329394660696e-05,
"loss": 0.0031,
"step": 439
},
{
"epoch": 1.6239760009230415,
"grad_norm": 0.03661806881427765,
"learning_rate": 2.0730440461153183e-05,
"loss": 0.0036,
"step": 440
},
{
"epoch": 1.6276681666089767,
"grad_norm": 0.05380409210920334,
"learning_rate": 2.0644538036490257e-05,
"loss": 0.0062,
"step": 441
},
{
"epoch": 1.6313603322949117,
"grad_norm": 0.0474727526307106,
"learning_rate": 2.0558623707280313e-05,
"loss": 0.0033,
"step": 442
},
{
"epoch": 1.6350524979808467,
"grad_norm": 0.07232918590307236,
"learning_rate": 2.0472699060351614e-05,
"loss": 0.0035,
"step": 443
},
{
"epoch": 1.638744663666782,
"grad_norm": 0.16932211816310883,
"learning_rate": 2.038676568272298e-05,
"loss": 0.0054,
"step": 444
},
{
"epoch": 1.6424368293527172,
"grad_norm": 0.08939804881811142,
"learning_rate": 2.03008251615745e-05,
"loss": 0.0061,
"step": 445
},
{
"epoch": 1.6461289950386524,
"grad_norm": 0.048010073602199554,
"learning_rate": 2.0214879084218193e-05,
"loss": 0.0033,
"step": 446
},
{
"epoch": 1.6498211607245876,
"grad_norm": 0.08143167942762375,
"learning_rate": 2.0128929038068716e-05,
"loss": 0.0123,
"step": 447
},
{
"epoch": 1.6535133264105226,
"grad_norm": 0.041366055607795715,
"learning_rate": 2.0042976610614006e-05,
"loss": 0.0022,
"step": 448
},
{
"epoch": 1.6572054920964578,
"grad_norm": 0.06036004796624184,
"learning_rate": 1.9957023389385998e-05,
"loss": 0.0031,
"step": 449
},
{
"epoch": 1.6608976577823928,
"grad_norm": 0.06090189516544342,
"learning_rate": 1.9871070961931294e-05,
"loss": 0.0046,
"step": 450
},
{
"epoch": 1.664589823468328,
"grad_norm": 0.06394810229539871,
"learning_rate": 1.9785120915781813e-05,
"loss": 0.0055,
"step": 451
},
{
"epoch": 1.6682819891542633,
"grad_norm": 0.05126480385661125,
"learning_rate": 1.9699174838425502e-05,
"loss": 0.0028,
"step": 452
},
{
"epoch": 1.6719741548401985,
"grad_norm": 0.16779179871082306,
"learning_rate": 1.961323431727703e-05,
"loss": 0.0138,
"step": 453
},
{
"epoch": 1.6756663205261337,
"grad_norm": 0.04399624094367027,
"learning_rate": 1.9527300939648396e-05,
"loss": 0.0028,
"step": 454
},
{
"epoch": 1.6793584862120687,
"grad_norm": 0.07073678821325302,
"learning_rate": 1.9441376292719687e-05,
"loss": 0.0042,
"step": 455
},
{
"epoch": 1.683050651898004,
"grad_norm": 0.06449951231479645,
"learning_rate": 1.935546196350975e-05,
"loss": 0.0047,
"step": 456
},
{
"epoch": 1.686742817583939,
"grad_norm": 0.05503733456134796,
"learning_rate": 1.9269559538846823e-05,
"loss": 0.0054,
"step": 457
},
{
"epoch": 1.6904349832698742,
"grad_norm": 0.0840897262096405,
"learning_rate": 1.9183670605339314e-05,
"loss": 0.0096,
"step": 458
},
{
"epoch": 1.6941271489558094,
"grad_norm": 0.06032564863562584,
"learning_rate": 1.909779674934641e-05,
"loss": 0.0039,
"step": 459
},
{
"epoch": 1.6978193146417446,
"grad_norm": 0.08506152033805847,
"learning_rate": 1.9011939556948835e-05,
"loss": 0.0061,
"step": 460
},
{
"epoch": 1.6978193146417446,
"eval_loss": 0.007821443490684032,
"eval_runtime": 90.0445,
"eval_samples_per_second": 10.139,
"eval_steps_per_second": 5.075,
"step": 460
},
{
"epoch": 1.7015114803276798,
"grad_norm": 0.09359995275735855,
"learning_rate": 1.8926100613919565e-05,
"loss": 0.0137,
"step": 461
},
{
"epoch": 1.7052036460136148,
"grad_norm": 0.050462689250707626,
"learning_rate": 1.884028150569446e-05,
"loss": 0.0026,
"step": 462
},
{
"epoch": 1.70889581169955,
"grad_norm": 0.05139093101024628,
"learning_rate": 1.8754483817343065e-05,
"loss": 0.0038,
"step": 463
},
{
"epoch": 1.712587977385485,
"grad_norm": 0.05640941858291626,
"learning_rate": 1.8668709133539316e-05,
"loss": 0.0048,
"step": 464
},
{
"epoch": 1.7162801430714203,
"grad_norm": 0.0617087222635746,
"learning_rate": 1.8582959038532216e-05,
"loss": 0.0066,
"step": 465
},
{
"epoch": 1.7199723087573555,
"grad_norm": 0.05858307704329491,
"learning_rate": 1.8497235116116656e-05,
"loss": 0.0042,
"step": 466
},
{
"epoch": 1.7236644744432907,
"grad_norm": 0.06570729613304138,
"learning_rate": 1.841153894960409e-05,
"loss": 0.0069,
"step": 467
},
{
"epoch": 1.727356640129226,
"grad_norm": 0.03923754021525383,
"learning_rate": 1.8325872121793343e-05,
"loss": 0.0038,
"step": 468
},
{
"epoch": 1.731048805815161,
"grad_norm": 0.04980519786477089,
"learning_rate": 1.824023621494136e-05,
"loss": 0.0033,
"step": 469
},
{
"epoch": 1.734740971501096,
"grad_norm": 0.0408487394452095,
"learning_rate": 1.815463281073396e-05,
"loss": 0.0025,
"step": 470
},
{
"epoch": 1.7384331371870312,
"grad_norm": 0.07105151563882828,
"learning_rate": 1.8069063490256668e-05,
"loss": 0.0055,
"step": 471
},
{
"epoch": 1.7421253028729664,
"grad_norm": 0.048975639045238495,
"learning_rate": 1.7983529833965463e-05,
"loss": 0.0036,
"step": 472
},
{
"epoch": 1.7458174685589016,
"grad_norm": 0.034441813826560974,
"learning_rate": 1.7898033421657616e-05,
"loss": 0.0029,
"step": 473
},
{
"epoch": 1.7495096342448369,
"grad_norm": 0.07011700421571732,
"learning_rate": 1.7812575832442518e-05,
"loss": 0.0097,
"step": 474
},
{
"epoch": 1.7532017999307719,
"grad_norm": 0.058152489364147186,
"learning_rate": 1.7727158644712484e-05,
"loss": 0.0092,
"step": 475
},
{
"epoch": 1.756893965616707,
"grad_norm": 0.049807388335466385,
"learning_rate": 1.764178343611363e-05,
"loss": 0.0036,
"step": 476
},
{
"epoch": 1.760586131302642,
"grad_norm": 0.05702248960733414,
"learning_rate": 1.755645178351672e-05,
"loss": 0.0091,
"step": 477
},
{
"epoch": 1.7642782969885773,
"grad_norm": 0.051516178995370865,
"learning_rate": 1.747116526298804e-05,
"loss": 0.0044,
"step": 478
},
{
"epoch": 1.7679704626745125,
"grad_norm": 0.09747370332479477,
"learning_rate": 1.7385925449760296e-05,
"loss": 0.0065,
"step": 479
},
{
"epoch": 1.7716626283604477,
"grad_norm": 0.05502758547663689,
"learning_rate": 1.7300733918203514e-05,
"loss": 0.0151,
"step": 480
},
{
"epoch": 1.775354794046383,
"grad_norm": 0.05905942991375923,
"learning_rate": 1.7215592241795956e-05,
"loss": 0.0029,
"step": 481
},
{
"epoch": 1.779046959732318,
"grad_norm": 0.07898251712322235,
"learning_rate": 1.7130501993095076e-05,
"loss": 0.0034,
"step": 482
},
{
"epoch": 1.7827391254182532,
"grad_norm": 0.0482512004673481,
"learning_rate": 1.7045464743708456e-05,
"loss": 0.0065,
"step": 483
},
{
"epoch": 1.7864312911041882,
"grad_norm": 0.07192537188529968,
"learning_rate": 1.6960482064264788e-05,
"loss": 0.0076,
"step": 484
},
{
"epoch": 1.7901234567901234,
"grad_norm": 0.08650217205286026,
"learning_rate": 1.687555552438487e-05,
"loss": 0.0209,
"step": 485
},
{
"epoch": 1.7938156224760586,
"grad_norm": 0.06443698704242706,
"learning_rate": 1.679068669265259e-05,
"loss": 0.0148,
"step": 486
},
{
"epoch": 1.7975077881619939,
"grad_norm": 0.04942217096686363,
"learning_rate": 1.6705877136586e-05,
"loss": 0.0048,
"step": 487
},
{
"epoch": 1.801199953847929,
"grad_norm": 0.04607919976115227,
"learning_rate": 1.6621128422608318e-05,
"loss": 0.0034,
"step": 488
},
{
"epoch": 1.804892119533864,
"grad_norm": 0.06549520045518875,
"learning_rate": 1.6536442116019012e-05,
"loss": 0.0067,
"step": 489
},
{
"epoch": 1.808584285219799,
"grad_norm": 0.1073460504412651,
"learning_rate": 1.6451819780964912e-05,
"loss": 0.0181,
"step": 490
},
{
"epoch": 1.8122764509057343,
"grad_norm": 0.03819148242473602,
"learning_rate": 1.6367262980411273e-05,
"loss": 0.0031,
"step": 491
},
{
"epoch": 1.8159686165916695,
"grad_norm": 0.05567432940006256,
"learning_rate": 1.6282773276112963e-05,
"loss": 0.0032,
"step": 492
},
{
"epoch": 1.8196607822776047,
"grad_norm": 0.0490952767431736,
"learning_rate": 1.619835222858558e-05,
"loss": 0.0022,
"step": 493
},
{
"epoch": 1.82335294796354,
"grad_norm": 0.06697966158390045,
"learning_rate": 1.6114001397076623e-05,
"loss": 0.0051,
"step": 494
},
{
"epoch": 1.827045113649475,
"grad_norm": 0.06809218227863312,
"learning_rate": 1.6029722339536725e-05,
"loss": 0.0065,
"step": 495
},
{
"epoch": 1.8307372793354102,
"grad_norm": 0.07054764032363892,
"learning_rate": 1.5945516612590872e-05,
"loss": 0.0051,
"step": 496
},
{
"epoch": 1.8344294450213452,
"grad_norm": 0.03432456776499748,
"learning_rate": 1.5861385771509612e-05,
"loss": 0.0021,
"step": 497
},
{
"epoch": 1.8381216107072804,
"grad_norm": 0.08520621806383133,
"learning_rate": 1.5777331370180388e-05,
"loss": 0.005,
"step": 498
},
{
"epoch": 1.8418137763932156,
"grad_norm": 0.05386526510119438,
"learning_rate": 1.5693354961078783e-05,
"loss": 0.0048,
"step": 499
},
{
"epoch": 1.8455059420791509,
"grad_norm": 0.07909571379423141,
"learning_rate": 1.56094580952399e-05,
"loss": 0.0068,
"step": 500
},
{
"epoch": 1.849198107765086,
"grad_norm": 0.048736322671175,
"learning_rate": 1.5525642322229667e-05,
"loss": 0.0082,
"step": 501
},
{
"epoch": 1.852890273451021,
"grad_norm": 0.05122867971658707,
"learning_rate": 1.5441909190116237e-05,
"loss": 0.0054,
"step": 502
},
{
"epoch": 1.8565824391369563,
"grad_norm": 0.043926313519477844,
"learning_rate": 1.535826024544141e-05,
"loss": 0.0022,
"step": 503
},
{
"epoch": 1.8602746048228913,
"grad_norm": 0.11847585439682007,
"learning_rate": 1.5274697033192033e-05,
"loss": 0.0228,
"step": 504
},
{
"epoch": 1.8639667705088265,
"grad_norm": 0.06619201600551605,
"learning_rate": 1.51912210967715e-05,
"loss": 0.0038,
"step": 505
},
{
"epoch": 1.8676589361947618,
"grad_norm": 0.11160582304000854,
"learning_rate": 1.5107833977971227e-05,
"loss": 0.0097,
"step": 506
},
{
"epoch": 1.871351101880697,
"grad_norm": 0.053367406129837036,
"learning_rate": 1.5024537216942166e-05,
"loss": 0.0046,
"step": 507
},
{
"epoch": 1.8750432675666322,
"grad_norm": 0.04304146394133568,
"learning_rate": 1.4941332352166385e-05,
"loss": 0.0038,
"step": 508
},
{
"epoch": 1.8787354332525672,
"grad_norm": 0.06712479144334793,
"learning_rate": 1.485822092042864e-05,
"loss": 0.0094,
"step": 509
},
{
"epoch": 1.8824275989385022,
"grad_norm": 0.07085831463336945,
"learning_rate": 1.4775204456787973e-05,
"loss": 0.0065,
"step": 510
},
{
"epoch": 1.8861197646244374,
"grad_norm": 0.06869763880968094,
"learning_rate": 1.469228449454939e-05,
"loss": 0.0062,
"step": 511
},
{
"epoch": 1.8898119303103726,
"grad_norm": 0.25166359543800354,
"learning_rate": 1.4609462565235524e-05,
"loss": 0.0039,
"step": 512
},
{
"epoch": 1.8935040959963079,
"grad_norm": 0.05382119119167328,
"learning_rate": 1.4526740198558345e-05,
"loss": 0.0076,
"step": 513
},
{
"epoch": 1.897196261682243,
"grad_norm": 0.03443971276283264,
"learning_rate": 1.4444118922390921e-05,
"loss": 0.0029,
"step": 514
},
{
"epoch": 1.9008884273681783,
"grad_norm": 0.04824664443731308,
"learning_rate": 1.4361600262739171e-05,
"loss": 0.0034,
"step": 515
},
{
"epoch": 1.9045805930541133,
"grad_norm": 0.05474744364619255,
"learning_rate": 1.4279185743713721e-05,
"loss": 0.0053,
"step": 516
},
{
"epoch": 1.9082727587400483,
"grad_norm": 0.05808331444859505,
"learning_rate": 1.419687688750173e-05,
"loss": 0.0036,
"step": 517
},
{
"epoch": 1.9119649244259835,
"grad_norm": 0.0522182323038578,
"learning_rate": 1.4114675214338745e-05,
"loss": 0.0029,
"step": 518
},
{
"epoch": 1.9156570901119188,
"grad_norm": 0.04176926612854004,
"learning_rate": 1.4032582242480692e-05,
"loss": 0.0069,
"step": 519
},
{
"epoch": 1.919349255797854,
"grad_norm": 0.3265964984893799,
"learning_rate": 1.3950599488175783e-05,
"loss": 0.0124,
"step": 520
},
{
"epoch": 1.9230414214837892,
"grad_norm": 0.05882977694272995,
"learning_rate": 1.3868728465636508e-05,
"loss": 0.0048,
"step": 521
},
{
"epoch": 1.9267335871697242,
"grad_norm": 0.04733727127313614,
"learning_rate": 1.3786970687011713e-05,
"loss": 0.0057,
"step": 522
},
{
"epoch": 1.9304257528556594,
"grad_norm": 0.07429318130016327,
"learning_rate": 1.3705327662358605e-05,
"loss": 0.0154,
"step": 523
},
{
"epoch": 1.9341179185415944,
"grad_norm": 0.04765889793634415,
"learning_rate": 1.362380089961493e-05,
"loss": 0.0025,
"step": 524
},
{
"epoch": 1.9378100842275297,
"grad_norm": 0.11108744144439697,
"learning_rate": 1.3542391904571082e-05,
"loss": 0.013,
"step": 525
},
{
"epoch": 1.9415022499134649,
"grad_norm": 0.05669174715876579,
"learning_rate": 1.3461102180842274e-05,
"loss": 0.0063,
"step": 526
},
{
"epoch": 1.9451944155994,
"grad_norm": 0.04899504780769348,
"learning_rate": 1.3379933229840827e-05,
"loss": 0.0061,
"step": 527
},
{
"epoch": 1.9488865812853353,
"grad_norm": 0.04838700219988823,
"learning_rate": 1.3298886550748387e-05,
"loss": 0.0059,
"step": 528
},
{
"epoch": 1.9525787469712703,
"grad_norm": 0.06490358710289001,
"learning_rate": 1.3217963640488232e-05,
"loss": 0.0032,
"step": 529
},
{
"epoch": 1.9562709126572055,
"grad_norm": 0.06235655024647713,
"learning_rate": 1.3137165993697687e-05,
"loss": 0.0052,
"step": 530
},
{
"epoch": 1.9599630783431405,
"grad_norm": 0.06066396087408066,
"learning_rate": 1.3056495102700426e-05,
"loss": 0.0082,
"step": 531
},
{
"epoch": 1.9636552440290758,
"grad_norm": 0.03929363191127777,
"learning_rate": 1.2975952457478986e-05,
"loss": 0.0035,
"step": 532
},
{
"epoch": 1.967347409715011,
"grad_norm": 0.1256752759218216,
"learning_rate": 1.2895539545647229e-05,
"loss": 0.0051,
"step": 533
},
{
"epoch": 1.9710395754009462,
"grad_norm": 0.04527255520224571,
"learning_rate": 1.2815257852422818e-05,
"loss": 0.0029,
"step": 534
},
{
"epoch": 1.9747317410868814,
"grad_norm": 0.08779493719339371,
"learning_rate": 1.2735108860599848e-05,
"loss": 0.0051,
"step": 535
},
{
"epoch": 1.9784239067728164,
"grad_norm": 0.061192456632852554,
"learning_rate": 1.2655094050521447e-05,
"loss": 0.0061,
"step": 536
},
{
"epoch": 1.9821160724587514,
"grad_norm": 0.03906107693910599,
"learning_rate": 1.2575214900052378e-05,
"loss": 0.0035,
"step": 537
},
{
"epoch": 1.9858082381446867,
"grad_norm": 0.05745285004377365,
"learning_rate": 1.2495472884551836e-05,
"loss": 0.0061,
"step": 538
},
{
"epoch": 1.9895004038306219,
"grad_norm": 0.0978095754981041,
"learning_rate": 1.2415869476846101e-05,
"loss": 0.0058,
"step": 539
},
{
"epoch": 1.993192569516557,
"grad_norm": 0.05764520913362503,
"learning_rate": 1.2336406147201411e-05,
"loss": 0.0038,
"step": 540
},
{
"epoch": 1.9968847352024923,
"grad_norm": 0.04506031796336174,
"learning_rate": 1.225708436329679e-05,
"loss": 0.0066,
"step": 541
},
{
"epoch": 2.0,
"grad_norm": 0.08254817128181458,
"learning_rate": 1.2177905590196884e-05,
"loss": 0.0048,
"step": 542
},
{
"epoch": 2.0036921656859352,
"grad_norm": 0.050131332129240036,
"learning_rate": 1.2098871290324974e-05,
"loss": 0.0011,
"step": 543
},
{
"epoch": 2.0073843313718704,
"grad_norm": 0.04068991169333458,
"learning_rate": 1.2019982923435954e-05,
"loss": 0.0048,
"step": 544
},
{
"epoch": 2.0110764970578057,
"grad_norm": 0.04712502658367157,
"learning_rate": 1.1941241946589299e-05,
"loss": 0.0016,
"step": 545
},
{
"epoch": 2.0147686627437404,
"grad_norm": 0.019259510561823845,
"learning_rate": 1.1862649814122263e-05,
"loss": 0.0013,
"step": 546
},
{
"epoch": 2.0184608284296757,
"grad_norm": 0.06433498114347458,
"learning_rate": 1.1784207977622914e-05,
"loss": 0.0021,
"step": 547
},
{
"epoch": 2.022152994115611,
"grad_norm": 0.054391320794820786,
"learning_rate": 1.1705917885903402e-05,
"loss": 0.0079,
"step": 548
},
{
"epoch": 2.025845159801546,
"grad_norm": 0.04541629180312157,
"learning_rate": 1.1627780984973153e-05,
"loss": 0.0029,
"step": 549
},
{
"epoch": 2.0295373254874813,
"grad_norm": 0.02488025464117527,
"learning_rate": 1.1549798718012184e-05,
"loss": 0.0013,
"step": 550
},
{
"epoch": 2.0332294911734166,
"grad_norm": 0.06702622771263123,
"learning_rate": 1.1471972525344421e-05,
"loss": 0.0067,
"step": 551
},
{
"epoch": 2.036921656859352,
"grad_norm": 0.04141972213983536,
"learning_rate": 1.139430384441115e-05,
"loss": 0.0037,
"step": 552
},
{
"epoch": 2.036921656859352,
"eval_loss": 0.0076590548269450665,
"eval_runtime": 89.9642,
"eval_samples_per_second": 10.148,
"eval_steps_per_second": 5.08,
"step": 552
},
{
"epoch": 2.0406138225452866,
"grad_norm": 0.09934539347887039,
"learning_rate": 1.1316794109744394e-05,
"loss": 0.0017,
"step": 553
},
{
"epoch": 2.044305988231222,
"grad_norm": 0.04188989847898483,
"learning_rate": 1.1239444752940477e-05,
"loss": 0.0017,
"step": 554
},
{
"epoch": 2.047998153917157,
"grad_norm": 0.03835677355527878,
"learning_rate": 1.1162257202633548e-05,
"loss": 0.0054,
"step": 555
},
{
"epoch": 2.0516903196030922,
"grad_norm": 0.04190275818109512,
"learning_rate": 1.1085232884469236e-05,
"loss": 0.0022,
"step": 556
},
{
"epoch": 2.0553824852890274,
"grad_norm": 0.03239237144589424,
"learning_rate": 1.1008373221078261e-05,
"loss": 0.0018,
"step": 557
},
{
"epoch": 2.0590746509749627,
"grad_norm": 0.022900836542248726,
"learning_rate": 1.0931679632050186e-05,
"loss": 0.0014,
"step": 558
},
{
"epoch": 2.0627668166608975,
"grad_norm": 0.04315221309661865,
"learning_rate": 1.085515353390723e-05,
"loss": 0.001,
"step": 559
},
{
"epoch": 2.0664589823468327,
"grad_norm": 0.03839712589979172,
"learning_rate": 1.0778796340078043e-05,
"loss": 0.0019,
"step": 560
},
{
"epoch": 2.070151148032768,
"grad_norm": 0.04536756873130798,
"learning_rate": 1.070260946087164e-05,
"loss": 0.005,
"step": 561
},
{
"epoch": 2.073843313718703,
"grad_norm": 0.0286678746342659,
"learning_rate": 1.0626594303451359e-05,
"loss": 0.0013,
"step": 562
},
{
"epoch": 2.0775354794046383,
"grad_norm": 0.04558982327580452,
"learning_rate": 1.0550752271808817e-05,
"loss": 0.0028,
"step": 563
},
{
"epoch": 2.0812276450905736,
"grad_norm": 0.08176779747009277,
"learning_rate": 1.0475084766738051e-05,
"loss": 0.0031,
"step": 564
},
{
"epoch": 2.084919810776509,
"grad_norm": 0.07253772020339966,
"learning_rate": 1.0399593185809625e-05,
"loss": 0.0081,
"step": 565
},
{
"epoch": 2.0886119764624436,
"grad_norm": 0.045510150492191315,
"learning_rate": 1.0324278923344741e-05,
"loss": 0.0022,
"step": 566
},
{
"epoch": 2.092304142148379,
"grad_norm": 0.04695519059896469,
"learning_rate": 1.0249143370389607e-05,
"loss": 0.0029,
"step": 567
},
{
"epoch": 2.095996307834314,
"grad_norm": 0.04093737155199051,
"learning_rate": 1.0174187914689681e-05,
"loss": 0.0038,
"step": 568
},
{
"epoch": 2.0996884735202492,
"grad_norm": 0.026440149173140526,
"learning_rate": 1.0099413940664e-05,
"loss": 0.0011,
"step": 569
},
{
"epoch": 2.1033806392061845,
"grad_norm": 0.040475890040397644,
"learning_rate": 1.0024822829379701e-05,
"loss": 0.0016,
"step": 570
},
{
"epoch": 2.1070728048921197,
"grad_norm": 0.07037003338336945,
"learning_rate": 9.950415958526449e-06,
"loss": 0.0018,
"step": 571
},
{
"epoch": 2.110764970578055,
"grad_norm": 0.04606116563081741,
"learning_rate": 9.876194702390998e-06,
"loss": 0.0018,
"step": 572
},
{
"epoch": 2.1144571362639897,
"grad_norm": 0.03653557226061821,
"learning_rate": 9.802160431831845e-06,
"loss": 0.0011,
"step": 573
},
{
"epoch": 2.118149301949925,
"grad_norm": 0.03161030635237694,
"learning_rate": 9.728314514253856e-06,
"loss": 0.0015,
"step": 574
},
{
"epoch": 2.12184146763586,
"grad_norm": 0.054874520748853683,
"learning_rate": 9.654658313583045e-06,
"loss": 0.0084,
"step": 575
},
{
"epoch": 2.1255336333217953,
"grad_norm": 0.07581143826246262,
"learning_rate": 9.581193190241398e-06,
"loss": 0.0031,
"step": 576
},
{
"epoch": 2.1292257990077306,
"grad_norm": 0.049715541303157806,
"learning_rate": 9.507920501121685e-06,
"loss": 0.0018,
"step": 577
},
{
"epoch": 2.132917964693666,
"grad_norm": 0.02939217910170555,
"learning_rate": 9.434841599562487e-06,
"loss": 0.0012,
"step": 578
},
{
"epoch": 2.1366101303796006,
"grad_norm": 0.02201433666050434,
"learning_rate": 9.361957835323088e-06,
"loss": 0.0005,
"step": 579
},
{
"epoch": 2.140302296065536,
"grad_norm": 0.03674834221601486,
"learning_rate": 9.289270554558651e-06,
"loss": 0.0018,
"step": 580
},
{
"epoch": 2.143994461751471,
"grad_norm": 0.061158619821071625,
"learning_rate": 9.216781099795322e-06,
"loss": 0.0056,
"step": 581
},
{
"epoch": 2.1476866274374062,
"grad_norm": 0.06538428366184235,
"learning_rate": 9.144490809905365e-06,
"loss": 0.0077,
"step": 582
},
{
"epoch": 2.1513787931233415,
"grad_norm": 0.03936760872602463,
"learning_rate": 9.072401020082542e-06,
"loss": 0.0021,
"step": 583
},
{
"epoch": 2.1550709588092767,
"grad_norm": 0.06418730318546295,
"learning_rate": 9.0005130618174e-06,
"loss": 0.0042,
"step": 584
},
{
"epoch": 2.158763124495212,
"grad_norm": 0.027600156143307686,
"learning_rate": 8.928828262872633e-06,
"loss": 0.0008,
"step": 585
},
{
"epoch": 2.1624552901811467,
"grad_norm": 0.02378762699663639,
"learning_rate": 8.857347947258657e-06,
"loss": 0.0008,
"step": 586
},
{
"epoch": 2.166147455867082,
"grad_norm": 0.05525508150458336,
"learning_rate": 8.786073435209072e-06,
"loss": 0.003,
"step": 587
},
{
"epoch": 2.169839621553017,
"grad_norm": 0.05668642744421959,
"learning_rate": 8.715006043156315e-06,
"loss": 0.0047,
"step": 588
},
{
"epoch": 2.1735317872389524,
"grad_norm": 0.6010801196098328,
"learning_rate": 8.644147083707354e-06,
"loss": 0.0111,
"step": 589
},
{
"epoch": 2.1772239529248876,
"grad_norm": 0.364096075296402,
"learning_rate": 8.573497865619414e-06,
"loss": 0.018,
"step": 590
},
{
"epoch": 2.180916118610823,
"grad_norm": 0.03992763161659241,
"learning_rate": 8.503059693775813e-06,
"loss": 0.0023,
"step": 591
},
{
"epoch": 2.184608284296758,
"grad_norm": 0.025853926315903664,
"learning_rate": 8.432833869161893e-06,
"loss": 0.0012,
"step": 592
},
{
"epoch": 2.188300449982693,
"grad_norm": 0.02981944940984249,
"learning_rate": 8.362821688840947e-06,
"loss": 0.0015,
"step": 593
},
{
"epoch": 2.191992615668628,
"grad_norm": 0.05791231989860535,
"learning_rate": 8.293024445930287e-06,
"loss": 0.0027,
"step": 594
},
{
"epoch": 2.1956847813545632,
"grad_norm": 0.04732658341526985,
"learning_rate": 8.223443429577343e-06,
"loss": 0.0033,
"step": 595
},
{
"epoch": 2.1993769470404985,
"grad_norm": 0.03762805834412575,
"learning_rate": 8.154079924935892e-06,
"loss": 0.0015,
"step": 596
},
{
"epoch": 2.2030691127264337,
"grad_norm": 0.07153934240341187,
"learning_rate": 8.084935213142269e-06,
"loss": 0.0022,
"step": 597
},
{
"epoch": 2.206761278412369,
"grad_norm": 0.04638079181313515,
"learning_rate": 8.016010571291725e-06,
"loss": 0.0018,
"step": 598
},
{
"epoch": 2.210453444098304,
"grad_norm": 0.030643608421087265,
"learning_rate": 7.947307272414874e-06,
"loss": 0.0029,
"step": 599
},
{
"epoch": 2.214145609784239,
"grad_norm": 0.02873793989419937,
"learning_rate": 7.878826585454122e-06,
"loss": 0.0021,
"step": 600
},
{
"epoch": 2.217837775470174,
"grad_norm": 0.04478053003549576,
"learning_rate": 7.810569775240257e-06,
"loss": 0.003,
"step": 601
},
{
"epoch": 2.2215299411561094,
"grad_norm": 0.046556588262319565,
"learning_rate": 7.742538102469111e-06,
"loss": 0.002,
"step": 602
},
{
"epoch": 2.2252221068420446,
"grad_norm": 0.039592090994119644,
"learning_rate": 7.674732823678228e-06,
"loss": 0.0036,
"step": 603
},
{
"epoch": 2.22891427252798,
"grad_norm": 0.031397104263305664,
"learning_rate": 7.607155191223683e-06,
"loss": 0.0013,
"step": 604
},
{
"epoch": 2.232606438213915,
"grad_norm": 0.035691820085048676,
"learning_rate": 7.539806453256973e-06,
"loss": 0.0014,
"step": 605
},
{
"epoch": 2.23629860389985,
"grad_norm": 0.0710548609495163,
"learning_rate": 7.472687853701908e-06,
"loss": 0.0078,
"step": 606
},
{
"epoch": 2.239990769585785,
"grad_norm": 0.04906422272324562,
"learning_rate": 7.405800632231672e-06,
"loss": 0.0025,
"step": 607
},
{
"epoch": 2.2436829352717202,
"grad_norm": 0.08130212128162384,
"learning_rate": 7.339146024245947e-06,
"loss": 0.006,
"step": 608
},
{
"epoch": 2.2473751009576555,
"grad_norm": 0.0357891283929348,
"learning_rate": 7.272725260848037e-06,
"loss": 0.0018,
"step": 609
},
{
"epoch": 2.2510672666435907,
"grad_norm": 0.05284830555319786,
"learning_rate": 7.206539568822179e-06,
"loss": 0.0017,
"step": 610
},
{
"epoch": 2.254759432329526,
"grad_norm": 0.03607559949159622,
"learning_rate": 7.140590170610857e-06,
"loss": 0.0023,
"step": 611
},
{
"epoch": 2.258451598015461,
"grad_norm": 0.043198052793741226,
"learning_rate": 7.0748782842922545e-06,
"loss": 0.0016,
"step": 612
},
{
"epoch": 2.262143763701396,
"grad_norm": 0.07694031298160553,
"learning_rate": 7.0094051235577155e-06,
"loss": 0.0025,
"step": 613
},
{
"epoch": 2.265835929387331,
"grad_norm": 0.040816958993673325,
"learning_rate": 6.944171897689349e-06,
"loss": 0.0043,
"step": 614
},
{
"epoch": 2.2695280950732664,
"grad_norm": 0.02998993545770645,
"learning_rate": 6.879179811537715e-06,
"loss": 0.001,
"step": 615
},
{
"epoch": 2.2732202607592016,
"grad_norm": 0.0377618744969368,
"learning_rate": 6.814430065499526e-06,
"loss": 0.0017,
"step": 616
},
{
"epoch": 2.276912426445137,
"grad_norm": 0.05830111727118492,
"learning_rate": 6.749923855495502e-06,
"loss": 0.0038,
"step": 617
},
{
"epoch": 2.280604592131072,
"grad_norm": 0.042703039944171906,
"learning_rate": 6.685662372948298e-06,
"loss": 0.004,
"step": 618
},
{
"epoch": 2.284296757817007,
"grad_norm": 0.0343906469643116,
"learning_rate": 6.62164680476046e-06,
"loss": 0.0024,
"step": 619
},
{
"epoch": 2.287988923502942,
"grad_norm": 0.030185390263795853,
"learning_rate": 6.55787833329252e-06,
"loss": 0.001,
"step": 620
},
{
"epoch": 2.2916810891888773,
"grad_norm": 0.03131229057908058,
"learning_rate": 6.4943581363411855e-06,
"loss": 0.0015,
"step": 621
},
{
"epoch": 2.2953732548748125,
"grad_norm": 0.03044010140001774,
"learning_rate": 6.431087387117538e-06,
"loss": 0.001,
"step": 622
},
{
"epoch": 2.2990654205607477,
"grad_norm": 0.028354087844491005,
"learning_rate": 6.368067254225387e-06,
"loss": 0.0009,
"step": 623
},
{
"epoch": 2.302757586246683,
"grad_norm": 0.06797152012586594,
"learning_rate": 6.305298901639704e-06,
"loss": 0.0081,
"step": 624
},
{
"epoch": 2.306449751932618,
"grad_norm": 0.035257063806056976,
"learning_rate": 6.242783488685091e-06,
"loss": 0.0013,
"step": 625
},
{
"epoch": 2.3101419176185534,
"grad_norm": 0.09349807351827621,
"learning_rate": 6.1805221700143844e-06,
"loss": 0.017,
"step": 626
},
{
"epoch": 2.313834083304488,
"grad_norm": 0.07014710456132889,
"learning_rate": 6.118516095587321e-06,
"loss": 0.0109,
"step": 627
},
{
"epoch": 2.3175262489904234,
"grad_norm": 0.044707559049129486,
"learning_rate": 6.056766410649329e-06,
"loss": 0.0013,
"step": 628
},
{
"epoch": 2.3212184146763586,
"grad_norm": 0.1256485879421234,
"learning_rate": 5.99527425571032e-06,
"loss": 0.0066,
"step": 629
},
{
"epoch": 2.324910580362294,
"grad_norm": 0.03149070963263512,
"learning_rate": 5.934040766523668e-06,
"loss": 0.0026,
"step": 630
},
{
"epoch": 2.328602746048229,
"grad_norm": 0.029719380661845207,
"learning_rate": 5.873067074065229e-06,
"loss": 0.001,
"step": 631
},
{
"epoch": 2.3322949117341643,
"grad_norm": 0.02905251644551754,
"learning_rate": 5.8123543045124285e-06,
"loss": 0.0015,
"step": 632
},
{
"epoch": 2.335987077420099,
"grad_norm": 0.030503999441862106,
"learning_rate": 5.751903579223468e-06,
"loss": 0.001,
"step": 633
},
{
"epoch": 2.3396792431060343,
"grad_norm": 0.039289508014917374,
"learning_rate": 5.6917160147166525e-06,
"loss": 0.0019,
"step": 634
},
{
"epoch": 2.3433714087919695,
"grad_norm": 0.04832150787115097,
"learning_rate": 5.6317927226496875e-06,
"loss": 0.0016,
"step": 635
},
{
"epoch": 2.3470635744779047,
"grad_norm": 0.03619923070073128,
"learning_rate": 5.572134809799235e-06,
"loss": 0.0023,
"step": 636
},
{
"epoch": 2.35075574016384,
"grad_norm": 0.036724645644426346,
"learning_rate": 5.512743378040428e-06,
"loss": 0.001,
"step": 637
},
{
"epoch": 2.354447905849775,
"grad_norm": 0.026730459183454514,
"learning_rate": 5.453619524326495e-06,
"loss": 0.0013,
"step": 638
},
{
"epoch": 2.3581400715357104,
"grad_norm": 0.036537256091833115,
"learning_rate": 5.39476434066855e-06,
"loss": 0.0011,
"step": 639
},
{
"epoch": 2.361832237221645,
"grad_norm": 0.0313183031976223,
"learning_rate": 5.3361789141154085e-06,
"loss": 0.0011,
"step": 640
},
{
"epoch": 2.3655244029075804,
"grad_norm": 0.036056943237781525,
"learning_rate": 5.277864326733484e-06,
"loss": 0.0015,
"step": 641
},
{
"epoch": 2.3692165685935156,
"grad_norm": 0.06026415154337883,
"learning_rate": 5.219821655586821e-06,
"loss": 0.0081,
"step": 642
},
{
"epoch": 2.372908734279451,
"grad_norm": 0.04079107567667961,
"learning_rate": 5.162051972717204e-06,
"loss": 0.0028,
"step": 643
},
{
"epoch": 2.376600899965386,
"grad_norm": 0.06928452104330063,
"learning_rate": 5.104556345124363e-06,
"loss": 0.0027,
"step": 644
},
{
"epoch": 2.376600899965386,
"eval_loss": 0.008511913008987904,
"eval_runtime": 89.9081,
"eval_samples_per_second": 10.155,
"eval_steps_per_second": 5.083,
"step": 644
}
],
"logging_steps": 1,
"max_steps": 813,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 92,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.275551525360632e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}