Files
LLM4Cov-Qwen3-4B-SFT-Stage1/trainer_state.json
ModelHub XC 72691d3a89 初始化项目,由ModelHub XC社区提供模型
Model: hez2024/LLM4Cov-Qwen3-4B-SFT-Stage1
Source: Original Platform
2026-06-01 05:39:19 +08:00

5091 lines
123 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 721,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013879250520471894,
"grad_norm": 0.2488352358341217,
"learning_rate": 0.0,
"loss": 0.0507,
"step": 1
},
{
"epoch": 0.002775850104094379,
"grad_norm": 0.2802581489086151,
"learning_rate": 4.5454545454545457e-07,
"loss": 0.0448,
"step": 2
},
{
"epoch": 0.004163775156141568,
"grad_norm": 0.20850145816802979,
"learning_rate": 9.090909090909091e-07,
"loss": 0.0382,
"step": 3
},
{
"epoch": 0.005551700208188758,
"grad_norm": 0.24074043333530426,
"learning_rate": 1.3636363636363636e-06,
"loss": 0.0455,
"step": 4
},
{
"epoch": 0.006939625260235947,
"grad_norm": 0.33691975474357605,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.0663,
"step": 5
},
{
"epoch": 0.008327550312283136,
"grad_norm": 0.24708178639411926,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.0407,
"step": 6
},
{
"epoch": 0.009715475364330326,
"grad_norm": 0.23655228316783905,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.0348,
"step": 7
},
{
"epoch": 0.011103400416377515,
"grad_norm": 0.2452535629272461,
"learning_rate": 3.181818181818182e-06,
"loss": 0.0507,
"step": 8
},
{
"epoch": 0.012491325468424705,
"grad_norm": 0.31181129813194275,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.0474,
"step": 9
},
{
"epoch": 0.013879250520471894,
"grad_norm": 0.256910115480423,
"learning_rate": 4.0909090909090915e-06,
"loss": 0.0445,
"step": 10
},
{
"epoch": 0.015267175572519083,
"grad_norm": 0.30851638317108154,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.0472,
"step": 11
},
{
"epoch": 0.016655100624566273,
"grad_norm": 0.31822746992111206,
"learning_rate": 5e-06,
"loss": 0.0611,
"step": 12
},
{
"epoch": 0.018043025676613464,
"grad_norm": 0.25003501772880554,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.0393,
"step": 13
},
{
"epoch": 0.01943095072866065,
"grad_norm": 0.2273981273174286,
"learning_rate": 5.90909090909091e-06,
"loss": 0.0328,
"step": 14
},
{
"epoch": 0.020818875780707843,
"grad_norm": 0.2772292196750641,
"learning_rate": 6.363636363636364e-06,
"loss": 0.0349,
"step": 15
},
{
"epoch": 0.02220680083275503,
"grad_norm": 0.36983752250671387,
"learning_rate": 6.818181818181818e-06,
"loss": 0.0476,
"step": 16
},
{
"epoch": 0.02359472588480222,
"grad_norm": 0.27299028635025024,
"learning_rate": 7.272727272727273e-06,
"loss": 0.04,
"step": 17
},
{
"epoch": 0.02498265093684941,
"grad_norm": 0.27781346440315247,
"learning_rate": 7.727272727272727e-06,
"loss": 0.0356,
"step": 18
},
{
"epoch": 0.0263705759888966,
"grad_norm": 0.313568651676178,
"learning_rate": 8.181818181818183e-06,
"loss": 0.0379,
"step": 19
},
{
"epoch": 0.027758501040943788,
"grad_norm": 0.2946470081806183,
"learning_rate": 8.636363636363637e-06,
"loss": 0.0388,
"step": 20
},
{
"epoch": 0.02914642609299098,
"grad_norm": 0.40478628873825073,
"learning_rate": 9.090909090909091e-06,
"loss": 0.036,
"step": 21
},
{
"epoch": 0.030534351145038167,
"grad_norm": 0.26556551456451416,
"learning_rate": 9.545454545454547e-06,
"loss": 0.0483,
"step": 22
},
{
"epoch": 0.031922276197085354,
"grad_norm": 0.24696266651153564,
"learning_rate": 1e-05,
"loss": 0.0408,
"step": 23
},
{
"epoch": 0.033310201249132546,
"grad_norm": 0.26831310987472534,
"learning_rate": 9.999949500779842e-06,
"loss": 0.0408,
"step": 24
},
{
"epoch": 0.03469812630117974,
"grad_norm": 0.27735695242881775,
"learning_rate": 9.999798004139435e-06,
"loss": 0.054,
"step": 25
},
{
"epoch": 0.03608605135322693,
"grad_norm": 0.32683494687080383,
"learning_rate": 9.999545513138964e-06,
"loss": 0.0455,
"step": 26
},
{
"epoch": 0.03747397640527411,
"grad_norm": 0.2764844596385956,
"learning_rate": 9.999192032878667e-06,
"loss": 0.0435,
"step": 27
},
{
"epoch": 0.0388619014573213,
"grad_norm": 0.2449122816324234,
"learning_rate": 9.998737570498737e-06,
"loss": 0.0437,
"step": 28
},
{
"epoch": 0.040249826509368494,
"grad_norm": 0.28722766041755676,
"learning_rate": 9.998182135179173e-06,
"loss": 0.0438,
"step": 29
},
{
"epoch": 0.041637751561415685,
"grad_norm": 0.3285958468914032,
"learning_rate": 9.997525738139595e-06,
"loss": 0.0503,
"step": 30
},
{
"epoch": 0.04302567661346287,
"grad_norm": 0.2966720163822174,
"learning_rate": 9.996768392639015e-06,
"loss": 0.045,
"step": 31
},
{
"epoch": 0.04441360166551006,
"grad_norm": 0.30295976996421814,
"learning_rate": 9.99591011397558e-06,
"loss": 0.0468,
"step": 32
},
{
"epoch": 0.04580152671755725,
"grad_norm": 0.2856103777885437,
"learning_rate": 9.994950919486248e-06,
"loss": 0.0467,
"step": 33
},
{
"epoch": 0.04718945176960444,
"grad_norm": 0.3122520446777344,
"learning_rate": 9.99389082854645e-06,
"loss": 0.0519,
"step": 34
},
{
"epoch": 0.048577376821651634,
"grad_norm": 0.34234896302223206,
"learning_rate": 9.992729862569694e-06,
"loss": 0.0539,
"step": 35
},
{
"epoch": 0.04996530187369882,
"grad_norm": 0.23981930315494537,
"learning_rate": 9.99146804500713e-06,
"loss": 0.046,
"step": 36
},
{
"epoch": 0.05135322692574601,
"grad_norm": 0.30006060004234314,
"learning_rate": 9.990105401347075e-06,
"loss": 0.0563,
"step": 37
},
{
"epoch": 0.0527411519777932,
"grad_norm": 0.23050066828727722,
"learning_rate": 9.988641959114512e-06,
"loss": 0.0419,
"step": 38
},
{
"epoch": 0.05412907702984039,
"grad_norm": 0.3269716501235962,
"learning_rate": 9.987077747870512e-06,
"loss": 0.0634,
"step": 39
},
{
"epoch": 0.055517002081887576,
"grad_norm": 0.25331613421440125,
"learning_rate": 9.985412799211658e-06,
"loss": 0.0446,
"step": 40
},
{
"epoch": 0.05690492713393477,
"grad_norm": 0.22769664227962494,
"learning_rate": 9.98364714676939e-06,
"loss": 0.0384,
"step": 41
},
{
"epoch": 0.05829285218598196,
"grad_norm": 0.290194034576416,
"learning_rate": 9.981780826209342e-06,
"loss": 0.0502,
"step": 42
},
{
"epoch": 0.05968077723802915,
"grad_norm": 0.2815046012401581,
"learning_rate": 9.979813875230604e-06,
"loss": 0.0472,
"step": 43
},
{
"epoch": 0.061068702290076333,
"grad_norm": 0.25529077649116516,
"learning_rate": 9.97774633356497e-06,
"loss": 0.0551,
"step": 44
},
{
"epoch": 0.062456627342123525,
"grad_norm": 0.24432392418384552,
"learning_rate": 9.97557824297614e-06,
"loss": 0.0376,
"step": 45
},
{
"epoch": 0.06384455239417071,
"grad_norm": 0.29393213987350464,
"learning_rate": 9.97330964725887e-06,
"loss": 0.056,
"step": 46
},
{
"epoch": 0.0652324774462179,
"grad_norm": 0.24784711003303528,
"learning_rate": 9.970940592238077e-06,
"loss": 0.0415,
"step": 47
},
{
"epoch": 0.06662040249826509,
"grad_norm": 0.21179543435573578,
"learning_rate": 9.968471125767942e-06,
"loss": 0.0415,
"step": 48
},
{
"epoch": 0.06800832755031229,
"grad_norm": 0.23930367827415466,
"learning_rate": 9.965901297730914e-06,
"loss": 0.0503,
"step": 49
},
{
"epoch": 0.06939625260235947,
"grad_norm": 0.24700821936130524,
"learning_rate": 9.963231160036716e-06,
"loss": 0.0391,
"step": 50
},
{
"epoch": 0.07078417765440666,
"grad_norm": 0.29365330934524536,
"learning_rate": 9.960460766621299e-06,
"loss": 0.0496,
"step": 51
},
{
"epoch": 0.07217210270645386,
"grad_norm": 0.2956799566745758,
"learning_rate": 9.957590173445746e-06,
"loss": 0.0451,
"step": 52
},
{
"epoch": 0.07356002775850104,
"grad_norm": 0.288196861743927,
"learning_rate": 9.954619438495142e-06,
"loss": 0.0512,
"step": 53
},
{
"epoch": 0.07494795281054822,
"grad_norm": 0.24770434200763702,
"learning_rate": 9.951548621777409e-06,
"loss": 0.0437,
"step": 54
},
{
"epoch": 0.07633587786259542,
"grad_norm": 0.23926012217998505,
"learning_rate": 9.948377785322082e-06,
"loss": 0.0325,
"step": 55
},
{
"epoch": 0.0777238029146426,
"grad_norm": 0.25855472683906555,
"learning_rate": 9.945106993179074e-06,
"loss": 0.0426,
"step": 56
},
{
"epoch": 0.0791117279666898,
"grad_norm": 0.3099120259284973,
"learning_rate": 9.941736311417362e-06,
"loss": 0.0474,
"step": 57
},
{
"epoch": 0.08049965301873699,
"grad_norm": 0.22296354174613953,
"learning_rate": 9.938265808123667e-06,
"loss": 0.0377,
"step": 58
},
{
"epoch": 0.08188757807078417,
"grad_norm": 0.2595604658126831,
"learning_rate": 9.934695553401076e-06,
"loss": 0.0492,
"step": 59
},
{
"epoch": 0.08327550312283137,
"grad_norm": 0.2444402575492859,
"learning_rate": 9.931025619367617e-06,
"loss": 0.0489,
"step": 60
},
{
"epoch": 0.08466342817487855,
"grad_norm": 0.2848093807697296,
"learning_rate": 9.927256080154813e-06,
"loss": 0.0601,
"step": 61
},
{
"epoch": 0.08605135322692574,
"grad_norm": 0.24992454051971436,
"learning_rate": 9.923387011906183e-06,
"loss": 0.0392,
"step": 62
},
{
"epoch": 0.08743927827897294,
"grad_norm": 0.28840649127960205,
"learning_rate": 9.919418492775694e-06,
"loss": 0.052,
"step": 63
},
{
"epoch": 0.08882720333102012,
"grad_norm": 0.2522892951965332,
"learning_rate": 9.915350602926198e-06,
"loss": 0.0499,
"step": 64
},
{
"epoch": 0.09021512838306732,
"grad_norm": 0.6776983737945557,
"learning_rate": 9.911183424527802e-06,
"loss": 0.0491,
"step": 65
},
{
"epoch": 0.0916030534351145,
"grad_norm": 0.2539517283439636,
"learning_rate": 9.906917041756208e-06,
"loss": 0.0456,
"step": 66
},
{
"epoch": 0.09299097848716169,
"grad_norm": 0.2568037509918213,
"learning_rate": 9.902551540791016e-06,
"loss": 0.0416,
"step": 67
},
{
"epoch": 0.09437890353920889,
"grad_norm": 0.2917027771472931,
"learning_rate": 9.898087009813985e-06,
"loss": 0.0559,
"step": 68
},
{
"epoch": 0.09576682859125607,
"grad_norm": 0.20621994137763977,
"learning_rate": 9.893523539007248e-06,
"loss": 0.0301,
"step": 69
},
{
"epoch": 0.09715475364330327,
"grad_norm": 0.2382073998451233,
"learning_rate": 9.888861220551494e-06,
"loss": 0.0457,
"step": 70
},
{
"epoch": 0.09854267869535045,
"grad_norm": 0.27149689197540283,
"learning_rate": 9.884100148624096e-06,
"loss": 0.0507,
"step": 71
},
{
"epoch": 0.09993060374739764,
"grad_norm": 0.30372723937034607,
"learning_rate": 9.879240419397227e-06,
"loss": 0.0604,
"step": 72
},
{
"epoch": 0.10131852879944483,
"grad_norm": 0.32783612608909607,
"learning_rate": 9.874282131035899e-06,
"loss": 0.0538,
"step": 73
},
{
"epoch": 0.10270645385149202,
"grad_norm": 0.2645081877708435,
"learning_rate": 9.86922538369599e-06,
"loss": 0.0486,
"step": 74
},
{
"epoch": 0.1040943789035392,
"grad_norm": 0.26425695419311523,
"learning_rate": 9.864070279522222e-06,
"loss": 0.0459,
"step": 75
},
{
"epoch": 0.1054823039555864,
"grad_norm": 0.25774726271629333,
"learning_rate": 9.858816922646088e-06,
"loss": 0.0461,
"step": 76
},
{
"epoch": 0.10687022900763359,
"grad_norm": 0.2583499550819397,
"learning_rate": 9.853465419183759e-06,
"loss": 0.0389,
"step": 77
},
{
"epoch": 0.10825815405968078,
"grad_norm": 0.28452831506729126,
"learning_rate": 9.848015877233935e-06,
"loss": 0.0569,
"step": 78
},
{
"epoch": 0.10964607911172797,
"grad_norm": 0.357303261756897,
"learning_rate": 9.842468406875665e-06,
"loss": 0.0636,
"step": 79
},
{
"epoch": 0.11103400416377515,
"grad_norm": 0.25684916973114014,
"learning_rate": 9.836823120166116e-06,
"loss": 0.0518,
"step": 80
},
{
"epoch": 0.11242192921582235,
"grad_norm": 0.2970696687698364,
"learning_rate": 9.831080131138325e-06,
"loss": 0.0517,
"step": 81
},
{
"epoch": 0.11380985426786953,
"grad_norm": 0.3112433850765228,
"learning_rate": 9.825239555798875e-06,
"loss": 0.0529,
"step": 82
},
{
"epoch": 0.11519777931991672,
"grad_norm": 0.23133604228496552,
"learning_rate": 9.819301512125565e-06,
"loss": 0.0418,
"step": 83
},
{
"epoch": 0.11658570437196392,
"grad_norm": 0.3827936053276062,
"learning_rate": 9.813266120065028e-06,
"loss": 0.0558,
"step": 84
},
{
"epoch": 0.1179736294240111,
"grad_norm": 0.4031021296977997,
"learning_rate": 9.807133501530297e-06,
"loss": 0.0582,
"step": 85
},
{
"epoch": 0.1193615544760583,
"grad_norm": 0.2524511218070984,
"learning_rate": 9.800903780398357e-06,
"loss": 0.0405,
"step": 86
},
{
"epoch": 0.12074947952810548,
"grad_norm": 0.25606897473335266,
"learning_rate": 9.794577082507631e-06,
"loss": 0.0517,
"step": 87
},
{
"epoch": 0.12213740458015267,
"grad_norm": 0.24462860822677612,
"learning_rate": 9.788153535655442e-06,
"loss": 0.0465,
"step": 88
},
{
"epoch": 0.12352532963219987,
"grad_norm": 0.2757568955421448,
"learning_rate": 9.781633269595432e-06,
"loss": 0.0439,
"step": 89
},
{
"epoch": 0.12491325468424705,
"grad_norm": 0.3216729462146759,
"learning_rate": 9.77501641603494e-06,
"loss": 0.0493,
"step": 90
},
{
"epoch": 0.12630117973629423,
"grad_norm": 0.3510143458843231,
"learning_rate": 9.76830310863235e-06,
"loss": 0.0427,
"step": 91
},
{
"epoch": 0.12768910478834142,
"grad_norm": 0.3293008506298065,
"learning_rate": 9.761493482994374e-06,
"loss": 0.0478,
"step": 92
},
{
"epoch": 0.12907702984038863,
"grad_norm": 0.24690201878547668,
"learning_rate": 9.754587676673323e-06,
"loss": 0.0428,
"step": 93
},
{
"epoch": 0.1304649548924358,
"grad_norm": 0.2786445915699005,
"learning_rate": 9.747585829164332e-06,
"loss": 0.0434,
"step": 94
},
{
"epoch": 0.131852879944483,
"grad_norm": 0.2033212035894394,
"learning_rate": 9.74048808190254e-06,
"loss": 0.0416,
"step": 95
},
{
"epoch": 0.13324080499653018,
"grad_norm": 0.2768020033836365,
"learning_rate": 9.733294578260224e-06,
"loss": 0.0495,
"step": 96
},
{
"epoch": 0.13462873004857737,
"grad_norm": 0.2745535671710968,
"learning_rate": 9.726005463543913e-06,
"loss": 0.0464,
"step": 97
},
{
"epoch": 0.13601665510062458,
"grad_norm": 0.2892036736011505,
"learning_rate": 9.718620884991455e-06,
"loss": 0.0471,
"step": 98
},
{
"epoch": 0.13740458015267176,
"grad_norm": 0.26574069261550903,
"learning_rate": 9.711140991769028e-06,
"loss": 0.0497,
"step": 99
},
{
"epoch": 0.13879250520471895,
"grad_norm": 0.23448492586612701,
"learning_rate": 9.703565934968146e-06,
"loss": 0.041,
"step": 100
},
{
"epoch": 0.14018043025676613,
"grad_norm": 0.21899428963661194,
"learning_rate": 9.695895867602591e-06,
"loss": 0.0425,
"step": 101
},
{
"epoch": 0.14156835530881332,
"grad_norm": 0.30496442317962646,
"learning_rate": 9.688130944605332e-06,
"loss": 0.048,
"step": 102
},
{
"epoch": 0.14295628036086053,
"grad_norm": 0.23567788302898407,
"learning_rate": 9.680271322825392e-06,
"loss": 0.0402,
"step": 103
},
{
"epoch": 0.1443442054129077,
"grad_norm": 0.2686285674571991,
"learning_rate": 9.672317161024679e-06,
"loss": 0.0511,
"step": 104
},
{
"epoch": 0.1457321304649549,
"grad_norm": 0.23937608301639557,
"learning_rate": 9.664268619874776e-06,
"loss": 0.0444,
"step": 105
},
{
"epoch": 0.14712005551700208,
"grad_norm": 0.2692619264125824,
"learning_rate": 9.656125861953711e-06,
"loss": 0.0472,
"step": 106
},
{
"epoch": 0.14850798056904926,
"grad_norm": 0.2656213641166687,
"learning_rate": 9.647889051742649e-06,
"loss": 0.0516,
"step": 107
},
{
"epoch": 0.14989590562109645,
"grad_norm": 0.2606026530265808,
"learning_rate": 9.639558355622589e-06,
"loss": 0.0456,
"step": 108
},
{
"epoch": 0.15128383067314366,
"grad_norm": 0.23198707401752472,
"learning_rate": 9.631133941870993e-06,
"loss": 0.05,
"step": 109
},
{
"epoch": 0.15267175572519084,
"grad_norm": 0.25558924674987793,
"learning_rate": 9.622615980658391e-06,
"loss": 0.0535,
"step": 110
},
{
"epoch": 0.15405968077723803,
"grad_norm": 0.2660232186317444,
"learning_rate": 9.614004644044943e-06,
"loss": 0.0481,
"step": 111
},
{
"epoch": 0.1554476058292852,
"grad_norm": 0.21960465610027313,
"learning_rate": 9.60530010597696e-06,
"loss": 0.0424,
"step": 112
},
{
"epoch": 0.1568355308813324,
"grad_norm": 0.26909562945365906,
"learning_rate": 9.596502542283399e-06,
"loss": 0.0621,
"step": 113
},
{
"epoch": 0.1582234559333796,
"grad_norm": 0.21745258569717407,
"learning_rate": 9.587612130672302e-06,
"loss": 0.038,
"step": 114
},
{
"epoch": 0.1596113809854268,
"grad_norm": 0.2411315143108368,
"learning_rate": 9.578629050727208e-06,
"loss": 0.0445,
"step": 115
},
{
"epoch": 0.16099930603747398,
"grad_norm": 0.2715134918689728,
"learning_rate": 9.569553483903531e-06,
"loss": 0.0526,
"step": 116
},
{
"epoch": 0.16238723108952116,
"grad_norm": 0.23721489310264587,
"learning_rate": 9.56038561352489e-06,
"loss": 0.0441,
"step": 117
},
{
"epoch": 0.16377515614156835,
"grad_norm": 0.23496368527412415,
"learning_rate": 9.551125624779407e-06,
"loss": 0.0353,
"step": 118
},
{
"epoch": 0.16516308119361556,
"grad_norm": 0.2572284936904907,
"learning_rate": 9.541773704715966e-06,
"loss": 0.0494,
"step": 119
},
{
"epoch": 0.16655100624566274,
"grad_norm": 0.27511200308799744,
"learning_rate": 9.532330042240434e-06,
"loss": 0.0412,
"step": 120
},
{
"epoch": 0.16793893129770993,
"grad_norm": 0.2772343158721924,
"learning_rate": 9.522794828111849e-06,
"loss": 0.0533,
"step": 121
},
{
"epoch": 0.1693268563497571,
"grad_norm": 0.19398753345012665,
"learning_rate": 9.51316825493856e-06,
"loss": 0.0358,
"step": 122
},
{
"epoch": 0.1707147814018043,
"grad_norm": 0.2360389232635498,
"learning_rate": 9.503450517174344e-06,
"loss": 0.0373,
"step": 123
},
{
"epoch": 0.17210270645385148,
"grad_norm": 0.24422723054885864,
"learning_rate": 9.493641811114472e-06,
"loss": 0.04,
"step": 124
},
{
"epoch": 0.1734906315058987,
"grad_norm": 0.2566449046134949,
"learning_rate": 9.483742334891747e-06,
"loss": 0.0455,
"step": 125
},
{
"epoch": 0.17487855655794587,
"grad_norm": 0.33392754197120667,
"learning_rate": 9.473752288472499e-06,
"loss": 0.0656,
"step": 126
},
{
"epoch": 0.17626648160999306,
"grad_norm": 0.23692674934864044,
"learning_rate": 9.463671873652551e-06,
"loss": 0.0462,
"step": 127
},
{
"epoch": 0.17765440666204024,
"grad_norm": 0.24718192219734192,
"learning_rate": 9.453501294053139e-06,
"loss": 0.0483,
"step": 128
},
{
"epoch": 0.17904233171408743,
"grad_norm": 0.24895285069942474,
"learning_rate": 9.443240755116797e-06,
"loss": 0.0392,
"step": 129
},
{
"epoch": 0.18043025676613464,
"grad_norm": 0.23605003952980042,
"learning_rate": 9.432890464103208e-06,
"loss": 0.043,
"step": 130
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.2124328911304474,
"learning_rate": 9.422450630085026e-06,
"loss": 0.0423,
"step": 131
},
{
"epoch": 0.183206106870229,
"grad_norm": 0.29214364290237427,
"learning_rate": 9.411921463943641e-06,
"loss": 0.0579,
"step": 132
},
{
"epoch": 0.1845940319222762,
"grad_norm": 0.26696211099624634,
"learning_rate": 9.401303178364923e-06,
"loss": 0.05,
"step": 133
},
{
"epoch": 0.18598195697432338,
"grad_norm": 0.37398970127105713,
"learning_rate": 9.39059598783493e-06,
"loss": 0.0428,
"step": 134
},
{
"epoch": 0.1873698820263706,
"grad_norm": 0.44276946783065796,
"learning_rate": 9.37980010863557e-06,
"loss": 0.0713,
"step": 135
},
{
"epoch": 0.18875780707841777,
"grad_norm": 0.21642902493476868,
"learning_rate": 9.368915758840235e-06,
"loss": 0.0328,
"step": 136
},
{
"epoch": 0.19014573213046496,
"grad_norm": 0.23396173119544983,
"learning_rate": 9.357943158309396e-06,
"loss": 0.0316,
"step": 137
},
{
"epoch": 0.19153365718251214,
"grad_norm": 0.29896441102027893,
"learning_rate": 9.346882528686159e-06,
"loss": 0.0547,
"step": 138
},
{
"epoch": 0.19292158223455932,
"grad_norm": 0.30207115411758423,
"learning_rate": 9.335734093391797e-06,
"loss": 0.0611,
"step": 139
},
{
"epoch": 0.19430950728660654,
"grad_norm": 0.21651242673397064,
"learning_rate": 9.32449807762122e-06,
"loss": 0.0363,
"step": 140
},
{
"epoch": 0.19569743233865372,
"grad_norm": 0.2455293834209442,
"learning_rate": 9.313174708338446e-06,
"loss": 0.0378,
"step": 141
},
{
"epoch": 0.1970853573907009,
"grad_norm": 0.27794840931892395,
"learning_rate": 9.301764214272e-06,
"loss": 0.0472,
"step": 142
},
{
"epoch": 0.1984732824427481,
"grad_norm": 0.4069722294807434,
"learning_rate": 9.2902668259103e-06,
"loss": 0.0336,
"step": 143
},
{
"epoch": 0.19986120749479527,
"grad_norm": 0.2580963671207428,
"learning_rate": 9.278682775497012e-06,
"loss": 0.0388,
"step": 144
},
{
"epoch": 0.20124913254684246,
"grad_norm": 0.25055116415023804,
"learning_rate": 9.267012297026334e-06,
"loss": 0.044,
"step": 145
},
{
"epoch": 0.20263705759888967,
"grad_norm": 0.20974084734916687,
"learning_rate": 9.255255626238295e-06,
"loss": 0.0403,
"step": 146
},
{
"epoch": 0.20402498265093685,
"grad_norm": 0.25812828540802,
"learning_rate": 9.243413000613974e-06,
"loss": 0.05,
"step": 147
},
{
"epoch": 0.20541290770298404,
"grad_norm": 0.2810615003108978,
"learning_rate": 9.231484659370717e-06,
"loss": 0.0475,
"step": 148
},
{
"epoch": 0.20680083275503122,
"grad_norm": 0.3402911126613617,
"learning_rate": 9.219470843457294e-06,
"loss": 0.0574,
"step": 149
},
{
"epoch": 0.2081887578070784,
"grad_norm": 0.25966858863830566,
"learning_rate": 9.207371795549043e-06,
"loss": 0.0465,
"step": 150
},
{
"epoch": 0.20957668285912562,
"grad_norm": 0.2727181911468506,
"learning_rate": 9.195187760042952e-06,
"loss": 0.0472,
"step": 151
},
{
"epoch": 0.2109646079111728,
"grad_norm": 0.27386289834976196,
"learning_rate": 9.182918983052743e-06,
"loss": 0.0526,
"step": 152
},
{
"epoch": 0.21235253296322,
"grad_norm": 0.2608488202095032,
"learning_rate": 9.17056571240388e-06,
"loss": 0.0593,
"step": 153
},
{
"epoch": 0.21374045801526717,
"grad_norm": 0.30400654673576355,
"learning_rate": 9.158128197628578e-06,
"loss": 0.0539,
"step": 154
},
{
"epoch": 0.21512838306731435,
"grad_norm": 0.23927266895771027,
"learning_rate": 9.145606689960756e-06,
"loss": 0.0465,
"step": 155
},
{
"epoch": 0.21651630811936157,
"grad_norm": 0.24992169439792633,
"learning_rate": 9.133001442330964e-06,
"loss": 0.0483,
"step": 156
},
{
"epoch": 0.21790423317140875,
"grad_norm": 0.2401110827922821,
"learning_rate": 9.120312709361271e-06,
"loss": 0.038,
"step": 157
},
{
"epoch": 0.21929215822345594,
"grad_norm": 0.516508936882019,
"learning_rate": 9.107540747360124e-06,
"loss": 0.0522,
"step": 158
},
{
"epoch": 0.22068008327550312,
"grad_norm": 0.23530790209770203,
"learning_rate": 9.094685814317174e-06,
"loss": 0.0392,
"step": 159
},
{
"epoch": 0.2220680083275503,
"grad_norm": 0.24144016206264496,
"learning_rate": 9.081748169898054e-06,
"loss": 0.0426,
"step": 160
},
{
"epoch": 0.22345593337959752,
"grad_norm": 0.2304311841726303,
"learning_rate": 9.068728075439153e-06,
"loss": 0.0429,
"step": 161
},
{
"epoch": 0.2248438584316447,
"grad_norm": 0.22649236023426056,
"learning_rate": 9.055625793942308e-06,
"loss": 0.0432,
"step": 162
},
{
"epoch": 0.22623178348369188,
"grad_norm": 0.2122083157300949,
"learning_rate": 9.042441590069526e-06,
"loss": 0.0423,
"step": 163
},
{
"epoch": 0.22761970853573907,
"grad_norm": 0.19142234325408936,
"learning_rate": 9.029175730137611e-06,
"loss": 0.0396,
"step": 164
},
{
"epoch": 0.22900763358778625,
"grad_norm": 0.29952624440193176,
"learning_rate": 9.015828482112793e-06,
"loss": 0.0662,
"step": 165
},
{
"epoch": 0.23039555863983344,
"grad_norm": 0.2410702407360077,
"learning_rate": 9.002400115605319e-06,
"loss": 0.0439,
"step": 166
},
{
"epoch": 0.23178348369188065,
"grad_norm": 0.2898229956626892,
"learning_rate": 8.988890901864006e-06,
"loss": 0.0488,
"step": 167
},
{
"epoch": 0.23317140874392783,
"grad_norm": 0.547232449054718,
"learning_rate": 8.975301113770756e-06,
"loss": 0.0487,
"step": 168
},
{
"epoch": 0.23455933379597502,
"grad_norm": 0.2565780282020569,
"learning_rate": 8.96163102583505e-06,
"loss": 0.0501,
"step": 169
},
{
"epoch": 0.2359472588480222,
"grad_norm": 0.31040069460868835,
"learning_rate": 8.947880914188397e-06,
"loss": 0.0602,
"step": 170
},
{
"epoch": 0.23733518390006939,
"grad_norm": 0.2812497913837433,
"learning_rate": 8.934051056578768e-06,
"loss": 0.0507,
"step": 171
},
{
"epoch": 0.2387231089521166,
"grad_norm": 0.2024613618850708,
"learning_rate": 8.920141732364971e-06,
"loss": 0.0346,
"step": 172
},
{
"epoch": 0.24011103400416378,
"grad_norm": 0.21827936172485352,
"learning_rate": 8.906153222511014e-06,
"loss": 0.0501,
"step": 173
},
{
"epoch": 0.24149895905621097,
"grad_norm": 0.222194641828537,
"learning_rate": 8.892085809580435e-06,
"loss": 0.0447,
"step": 174
},
{
"epoch": 0.24288688410825815,
"grad_norm": 0.2933279275894165,
"learning_rate": 8.877939777730585e-06,
"loss": 0.0561,
"step": 175
},
{
"epoch": 0.24427480916030533,
"grad_norm": 0.26495489478111267,
"learning_rate": 8.863715412706897e-06,
"loss": 0.0488,
"step": 176
},
{
"epoch": 0.24566273421235255,
"grad_norm": 0.25571662187576294,
"learning_rate": 8.849413001837105e-06,
"loss": 0.0469,
"step": 177
},
{
"epoch": 0.24705065926439973,
"grad_norm": 0.24550214409828186,
"learning_rate": 8.83503283402545e-06,
"loss": 0.0516,
"step": 178
},
{
"epoch": 0.24843858431644691,
"grad_norm": 0.25382155179977417,
"learning_rate": 8.820575199746835e-06,
"loss": 0.0531,
"step": 179
},
{
"epoch": 0.2498265093684941,
"grad_norm": 0.3099108338356018,
"learning_rate": 8.806040391040962e-06,
"loss": 0.0507,
"step": 180
},
{
"epoch": 0.2512144344205413,
"grad_norm": 0.25325724482536316,
"learning_rate": 8.791428701506433e-06,
"loss": 0.0473,
"step": 181
},
{
"epoch": 0.25260235947258847,
"grad_norm": 0.24286960065364838,
"learning_rate": 8.776740426294818e-06,
"loss": 0.0531,
"step": 182
},
{
"epoch": 0.2539902845246357,
"grad_norm": 0.2681328356266022,
"learning_rate": 8.761975862104694e-06,
"loss": 0.0506,
"step": 183
},
{
"epoch": 0.25537820957668284,
"grad_norm": 0.2976507544517517,
"learning_rate": 8.747135307175657e-06,
"loss": 0.0464,
"step": 184
},
{
"epoch": 0.25676613462873005,
"grad_norm": 0.2756042182445526,
"learning_rate": 8.73221906128228e-06,
"loss": 0.0454,
"step": 185
},
{
"epoch": 0.25815405968077726,
"grad_norm": 0.24677051603794098,
"learning_rate": 8.71722742572808e-06,
"loss": 0.0443,
"step": 186
},
{
"epoch": 0.2595419847328244,
"grad_norm": 0.2520540654659271,
"learning_rate": 8.702160703339422e-06,
"loss": 0.0411,
"step": 187
},
{
"epoch": 0.2609299097848716,
"grad_norm": 0.26869460940361023,
"learning_rate": 8.687019198459395e-06,
"loss": 0.0538,
"step": 188
},
{
"epoch": 0.2623178348369188,
"grad_norm": 0.2537235915660858,
"learning_rate": 8.671803216941674e-06,
"loss": 0.045,
"step": 189
},
{
"epoch": 0.263705759888966,
"grad_norm": 0.25532054901123047,
"learning_rate": 8.656513066144342e-06,
"loss": 0.0421,
"step": 190
},
{
"epoch": 0.2650936849410132,
"grad_norm": 0.23937132954597473,
"learning_rate": 8.641149054923673e-06,
"loss": 0.0488,
"step": 191
},
{
"epoch": 0.26648160999306036,
"grad_norm": 0.22795534133911133,
"learning_rate": 8.625711493627902e-06,
"loss": 0.047,
"step": 192
},
{
"epoch": 0.2678695350451076,
"grad_norm": 0.24898919463157654,
"learning_rate": 8.610200694090951e-06,
"loss": 0.0496,
"step": 193
},
{
"epoch": 0.26925746009715473,
"grad_norm": 0.26497387886047363,
"learning_rate": 8.594616969626134e-06,
"loss": 0.0485,
"step": 194
},
{
"epoch": 0.27064538514920194,
"grad_norm": 0.24637307226657867,
"learning_rate": 8.578960635019822e-06,
"loss": 0.0497,
"step": 195
},
{
"epoch": 0.27203331020124916,
"grad_norm": 0.3122495412826538,
"learning_rate": 8.563232006525093e-06,
"loss": 0.0512,
"step": 196
},
{
"epoch": 0.2734212352532963,
"grad_norm": 0.23246419429779053,
"learning_rate": 8.547431401855333e-06,
"loss": 0.0428,
"step": 197
},
{
"epoch": 0.2748091603053435,
"grad_norm": 0.24396541714668274,
"learning_rate": 8.531559140177828e-06,
"loss": 0.0487,
"step": 198
},
{
"epoch": 0.2761970853573907,
"grad_norm": 0.22116141021251678,
"learning_rate": 8.515615542107317e-06,
"loss": 0.0395,
"step": 199
},
{
"epoch": 0.2775850104094379,
"grad_norm": 0.28872185945510864,
"learning_rate": 8.499600929699501e-06,
"loss": 0.048,
"step": 200
},
{
"epoch": 0.2789729354614851,
"grad_norm": 0.2683909833431244,
"learning_rate": 8.48351562644456e-06,
"loss": 0.0462,
"step": 201
},
{
"epoch": 0.28036086051353226,
"grad_norm": 0.22411032021045685,
"learning_rate": 8.4673599572606e-06,
"loss": 0.0485,
"step": 202
},
{
"epoch": 0.2817487855655795,
"grad_norm": 0.27028921246528625,
"learning_rate": 8.4511342484871e-06,
"loss": 0.0591,
"step": 203
},
{
"epoch": 0.28313671061762663,
"grad_norm": 0.25318291783332825,
"learning_rate": 8.434838827878315e-06,
"loss": 0.0402,
"step": 204
},
{
"epoch": 0.28452463566967384,
"grad_norm": 0.2501783072948456,
"learning_rate": 8.418474024596659e-06,
"loss": 0.0456,
"step": 205
},
{
"epoch": 0.28591256072172105,
"grad_norm": 0.24944418668746948,
"learning_rate": 8.402040169206054e-06,
"loss": 0.0439,
"step": 206
},
{
"epoch": 0.2873004857737682,
"grad_norm": 0.33963543176651,
"learning_rate": 8.38553759366525e-06,
"loss": 0.0639,
"step": 207
},
{
"epoch": 0.2886884108258154,
"grad_norm": 0.32349342107772827,
"learning_rate": 8.36896663132113e-06,
"loss": 0.0541,
"step": 208
},
{
"epoch": 0.2900763358778626,
"grad_norm": 0.23536472022533417,
"learning_rate": 8.352327616901956e-06,
"loss": 0.0487,
"step": 209
},
{
"epoch": 0.2914642609299098,
"grad_norm": 0.23875856399536133,
"learning_rate": 8.335620886510637e-06,
"loss": 0.048,
"step": 210
},
{
"epoch": 0.29285218598195695,
"grad_norm": 0.25536614656448364,
"learning_rate": 8.318846777617913e-06,
"loss": 0.0538,
"step": 211
},
{
"epoch": 0.29424011103400416,
"grad_norm": 0.37551552057266235,
"learning_rate": 8.302005629055549e-06,
"loss": 0.0464,
"step": 212
},
{
"epoch": 0.29562803608605137,
"grad_norm": 0.25611814856529236,
"learning_rate": 8.285097781009497e-06,
"loss": 0.0451,
"step": 213
},
{
"epoch": 0.2970159611380985,
"grad_norm": 0.26617372035980225,
"learning_rate": 8.268123575013008e-06,
"loss": 0.0567,
"step": 214
},
{
"epoch": 0.29840388619014574,
"grad_norm": 0.2143125832080841,
"learning_rate": 8.251083353939752e-06,
"loss": 0.0481,
"step": 215
},
{
"epoch": 0.2997918112421929,
"grad_norm": 0.2626519203186035,
"learning_rate": 8.233977461996879e-06,
"loss": 0.0474,
"step": 216
},
{
"epoch": 0.3011797362942401,
"grad_norm": 0.20418819785118103,
"learning_rate": 8.216806244718068e-06,
"loss": 0.0396,
"step": 217
},
{
"epoch": 0.3025676613462873,
"grad_norm": 0.30222561955451965,
"learning_rate": 8.199570048956553e-06,
"loss": 0.0494,
"step": 218
},
{
"epoch": 0.3039555863983345,
"grad_norm": 0.37407004833221436,
"learning_rate": 8.182269222878112e-06,
"loss": 0.0536,
"step": 219
},
{
"epoch": 0.3053435114503817,
"grad_norm": 0.3121405839920044,
"learning_rate": 8.164904115954036e-06,
"loss": 0.0448,
"step": 220
},
{
"epoch": 0.30673143650242884,
"grad_norm": 0.31007713079452515,
"learning_rate": 8.147475078954067e-06,
"loss": 0.046,
"step": 221
},
{
"epoch": 0.30811936155447606,
"grad_norm": 0.2768557369709015,
"learning_rate": 8.129982463939313e-06,
"loss": 0.0517,
"step": 222
},
{
"epoch": 0.30950728660652327,
"grad_norm": 0.2517678141593933,
"learning_rate": 8.112426624255145e-06,
"loss": 0.0553,
"step": 223
},
{
"epoch": 0.3108952116585704,
"grad_norm": 0.2336336374282837,
"learning_rate": 8.094807914524048e-06,
"loss": 0.0403,
"step": 224
},
{
"epoch": 0.31228313671061764,
"grad_norm": 0.23134920001029968,
"learning_rate": 8.07712669063846e-06,
"loss": 0.043,
"step": 225
},
{
"epoch": 0.3136710617626648,
"grad_norm": 0.21085438132286072,
"learning_rate": 8.059383309753587e-06,
"loss": 0.0377,
"step": 226
},
{
"epoch": 0.315058986814712,
"grad_norm": 0.24013970792293549,
"learning_rate": 8.041578130280194e-06,
"loss": 0.0481,
"step": 227
},
{
"epoch": 0.3164469118667592,
"grad_norm": 0.24835489690303802,
"learning_rate": 8.023711511877347e-06,
"loss": 0.0463,
"step": 228
},
{
"epoch": 0.3178348369188064,
"grad_norm": 0.24152745306491852,
"learning_rate": 8.005783815445168e-06,
"loss": 0.0394,
"step": 229
},
{
"epoch": 0.3192227619708536,
"grad_norm": 0.259420245885849,
"learning_rate": 7.987795403117528e-06,
"loss": 0.0468,
"step": 230
},
{
"epoch": 0.32061068702290074,
"grad_norm": 0.2520180940628052,
"learning_rate": 7.96974663825475e-06,
"loss": 0.0474,
"step": 231
},
{
"epoch": 0.32199861207494795,
"grad_norm": 0.22478239238262177,
"learning_rate": 7.95163788543625e-06,
"loss": 0.0446,
"step": 232
},
{
"epoch": 0.32338653712699517,
"grad_norm": 0.3189745247364044,
"learning_rate": 7.933469510453189e-06,
"loss": 0.0621,
"step": 233
},
{
"epoch": 0.3247744621790423,
"grad_norm": 0.18363076448440552,
"learning_rate": 7.915241880301075e-06,
"loss": 0.034,
"step": 234
},
{
"epoch": 0.32616238723108953,
"grad_norm": 0.2726271450519562,
"learning_rate": 7.896955363172347e-06,
"loss": 0.0375,
"step": 235
},
{
"epoch": 0.3275503122831367,
"grad_norm": 0.26370498538017273,
"learning_rate": 7.878610328448948e-06,
"loss": 0.0464,
"step": 236
},
{
"epoch": 0.3289382373351839,
"grad_norm": 0.30265697836875916,
"learning_rate": 7.86020714669486e-06,
"loss": 0.061,
"step": 237
},
{
"epoch": 0.3303261623872311,
"grad_norm": 0.20978808403015137,
"learning_rate": 7.84174618964861e-06,
"loss": 0.0371,
"step": 238
},
{
"epoch": 0.33171408743927827,
"grad_norm": 0.22528868913650513,
"learning_rate": 7.823227830215776e-06,
"loss": 0.0432,
"step": 239
},
{
"epoch": 0.3331020124913255,
"grad_norm": 0.22642700374126434,
"learning_rate": 7.804652442461438e-06,
"loss": 0.0492,
"step": 240
},
{
"epoch": 0.33448993754337264,
"grad_norm": 0.25173112750053406,
"learning_rate": 7.786020401602638e-06,
"loss": 0.0489,
"step": 241
},
{
"epoch": 0.33587786259541985,
"grad_norm": 0.2896901071071625,
"learning_rate": 7.767332084000784e-06,
"loss": 0.0481,
"step": 242
},
{
"epoch": 0.33726578764746706,
"grad_norm": 0.2184886485338211,
"learning_rate": 7.748587867154068e-06,
"loss": 0.0356,
"step": 243
},
{
"epoch": 0.3386537126995142,
"grad_norm": 0.22447901964187622,
"learning_rate": 7.72978812968982e-06,
"loss": 0.0347,
"step": 244
},
{
"epoch": 0.34004163775156143,
"grad_norm": 0.20314303040504456,
"learning_rate": 7.71093325135687e-06,
"loss": 0.0431,
"step": 245
},
{
"epoch": 0.3414295628036086,
"grad_norm": 0.22147540748119354,
"learning_rate": 7.692023613017884e-06,
"loss": 0.0423,
"step": 246
},
{
"epoch": 0.3428174878556558,
"grad_norm": 0.2724509537220001,
"learning_rate": 7.673059596641657e-06,
"loss": 0.0414,
"step": 247
},
{
"epoch": 0.34420541290770296,
"grad_norm": 0.22747007012367249,
"learning_rate": 7.6540415852954e-06,
"loss": 0.04,
"step": 248
},
{
"epoch": 0.34559333795975017,
"grad_norm": 0.3057887852191925,
"learning_rate": 7.634969963137015e-06,
"loss": 0.0545,
"step": 249
},
{
"epoch": 0.3469812630117974,
"grad_norm": 0.3109971582889557,
"learning_rate": 7.615845115407316e-06,
"loss": 0.0475,
"step": 250
},
{
"epoch": 0.34836918806384454,
"grad_norm": 0.19809511303901672,
"learning_rate": 7.596667428422264e-06,
"loss": 0.0312,
"step": 251
},
{
"epoch": 0.34975711311589175,
"grad_norm": 0.26312026381492615,
"learning_rate": 7.5774372895651545e-06,
"loss": 0.0494,
"step": 252
},
{
"epoch": 0.3511450381679389,
"grad_norm": 0.23003171384334564,
"learning_rate": 7.558155087278791e-06,
"loss": 0.0542,
"step": 253
},
{
"epoch": 0.3525329632199861,
"grad_norm": 0.24017004668712616,
"learning_rate": 7.538821211057648e-06,
"loss": 0.0478,
"step": 254
},
{
"epoch": 0.35392088827203333,
"grad_norm": 0.20186010003089905,
"learning_rate": 7.519436051439991e-06,
"loss": 0.0326,
"step": 255
},
{
"epoch": 0.3553088133240805,
"grad_norm": 0.2597668170928955,
"learning_rate": 7.500000000000001e-06,
"loss": 0.0501,
"step": 256
},
{
"epoch": 0.3566967383761277,
"grad_norm": 0.23714697360992432,
"learning_rate": 7.480513449339851e-06,
"loss": 0.0483,
"step": 257
},
{
"epoch": 0.35808466342817485,
"grad_norm": 0.3031267821788788,
"learning_rate": 7.460976793081789e-06,
"loss": 0.0507,
"step": 258
},
{
"epoch": 0.35947258848022207,
"grad_norm": 0.2820730209350586,
"learning_rate": 7.441390425860172e-06,
"loss": 0.0502,
"step": 259
},
{
"epoch": 0.3608605135322693,
"grad_norm": 0.3192010223865509,
"learning_rate": 7.421754743313514e-06,
"loss": 0.0619,
"step": 260
},
{
"epoch": 0.36224843858431643,
"grad_norm": 0.2101418673992157,
"learning_rate": 7.402070142076475e-06,
"loss": 0.0411,
"step": 261
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.2577596604824066,
"learning_rate": 7.382337019771859e-06,
"loss": 0.0422,
"step": 262
},
{
"epoch": 0.3650242886884108,
"grad_norm": 0.20709669589996338,
"learning_rate": 7.36255577500258e-06,
"loss": 0.0382,
"step": 263
},
{
"epoch": 0.366412213740458,
"grad_norm": 0.22359001636505127,
"learning_rate": 7.342726807343615e-06,
"loss": 0.0318,
"step": 264
},
{
"epoch": 0.3678001387925052,
"grad_norm": 0.2126387655735016,
"learning_rate": 7.322850517333924e-06,
"loss": 0.0355,
"step": 265
},
{
"epoch": 0.3691880638445524,
"grad_norm": 0.252213716506958,
"learning_rate": 7.302927306468365e-06,
"loss": 0.0451,
"step": 266
},
{
"epoch": 0.3705759888965996,
"grad_norm": 0.21543753147125244,
"learning_rate": 7.282957577189581e-06,
"loss": 0.0531,
"step": 267
},
{
"epoch": 0.37196391394864675,
"grad_norm": 0.2670553922653198,
"learning_rate": 7.2629417328798755e-06,
"loss": 0.0508,
"step": 268
},
{
"epoch": 0.37335183900069396,
"grad_norm": 0.24284611642360687,
"learning_rate": 7.242880177853062e-06,
"loss": 0.0397,
"step": 269
},
{
"epoch": 0.3747397640527412,
"grad_norm": 0.3197598159313202,
"learning_rate": 7.222773317346291e-06,
"loss": 0.0577,
"step": 270
},
{
"epoch": 0.37612768910478833,
"grad_norm": 0.21058222651481628,
"learning_rate": 7.202621557511874e-06,
"loss": 0.0366,
"step": 271
},
{
"epoch": 0.37751561415683554,
"grad_norm": 0.33534789085388184,
"learning_rate": 7.1824253054090735e-06,
"loss": 0.0445,
"step": 272
},
{
"epoch": 0.3789035392088827,
"grad_norm": 0.26425862312316895,
"learning_rate": 7.162184968995882e-06,
"loss": 0.0513,
"step": 273
},
{
"epoch": 0.3802914642609299,
"grad_norm": 0.19292885065078735,
"learning_rate": 7.141900957120781e-06,
"loss": 0.0376,
"step": 274
},
{
"epoch": 0.3816793893129771,
"grad_norm": 0.25801414251327515,
"learning_rate": 7.121573679514484e-06,
"loss": 0.0468,
"step": 275
},
{
"epoch": 0.3830673143650243,
"grad_norm": 0.22623024880886078,
"learning_rate": 7.101203546781655e-06,
"loss": 0.043,
"step": 276
},
{
"epoch": 0.3844552394170715,
"grad_norm": 0.36003199219703674,
"learning_rate": 7.080790970392626e-06,
"loss": 0.0637,
"step": 277
},
{
"epoch": 0.38584316446911865,
"grad_norm": 0.22922679781913757,
"learning_rate": 7.060336362675069e-06,
"loss": 0.0504,
"step": 278
},
{
"epoch": 0.38723108952116586,
"grad_norm": 0.25158169865608215,
"learning_rate": 7.039840136805679e-06,
"loss": 0.039,
"step": 279
},
{
"epoch": 0.3886190145732131,
"grad_norm": 0.3726769685745239,
"learning_rate": 7.019302706801826e-06,
"loss": 0.0577,
"step": 280
},
{
"epoch": 0.39000693962526023,
"grad_norm": 0.2862564027309418,
"learning_rate": 6.998724487513191e-06,
"loss": 0.0446,
"step": 281
},
{
"epoch": 0.39139486467730744,
"grad_norm": 0.2609567642211914,
"learning_rate": 6.978105894613385e-06,
"loss": 0.0574,
"step": 282
},
{
"epoch": 0.3927827897293546,
"grad_norm": 0.3218703866004944,
"learning_rate": 6.9574473445915495e-06,
"loss": 0.0582,
"step": 283
},
{
"epoch": 0.3941707147814018,
"grad_norm": 0.23220689594745636,
"learning_rate": 6.936749254743951e-06,
"loss": 0.0499,
"step": 284
},
{
"epoch": 0.39555863983344897,
"grad_norm": 0.2209801822900772,
"learning_rate": 6.916012043165552e-06,
"loss": 0.0482,
"step": 285
},
{
"epoch": 0.3969465648854962,
"grad_norm": 0.2856426239013672,
"learning_rate": 6.895236128741554e-06,
"loss": 0.0426,
"step": 286
},
{
"epoch": 0.3983344899375434,
"grad_norm": 0.2667888402938843,
"learning_rate": 6.87442193113895e-06,
"loss": 0.0542,
"step": 287
},
{
"epoch": 0.39972241498959055,
"grad_norm": 0.2344602793455124,
"learning_rate": 6.8535698707980356e-06,
"loss": 0.0434,
"step": 288
},
{
"epoch": 0.40111034004163776,
"grad_norm": 0.24343138933181763,
"learning_rate": 6.83268036892393e-06,
"loss": 0.0508,
"step": 289
},
{
"epoch": 0.4024982650936849,
"grad_norm": 0.22512559592723846,
"learning_rate": 6.811753847478051e-06,
"loss": 0.0396,
"step": 290
},
{
"epoch": 0.4038861901457321,
"grad_norm": 0.1864270120859146,
"learning_rate": 6.790790729169604e-06,
"loss": 0.0388,
"step": 291
},
{
"epoch": 0.40527411519777934,
"grad_norm": 0.19488897919654846,
"learning_rate": 6.769791437447042e-06,
"loss": 0.035,
"step": 292
},
{
"epoch": 0.4066620402498265,
"grad_norm": 0.2777364253997803,
"learning_rate": 6.7487563964895066e-06,
"loss": 0.0519,
"step": 293
},
{
"epoch": 0.4080499653018737,
"grad_norm": 0.1941017359495163,
"learning_rate": 6.7276860311982614e-06,
"loss": 0.0361,
"step": 294
},
{
"epoch": 0.40943789035392086,
"grad_norm": 0.20925886929035187,
"learning_rate": 6.7065807671881155e-06,
"loss": 0.0353,
"step": 295
},
{
"epoch": 0.4108258154059681,
"grad_norm": 0.27289116382598877,
"learning_rate": 6.6854410307788175e-06,
"loss": 0.0533,
"step": 296
},
{
"epoch": 0.4122137404580153,
"grad_norm": 0.27907559275627136,
"learning_rate": 6.664267248986447e-06,
"loss": 0.0415,
"step": 297
},
{
"epoch": 0.41360166551006244,
"grad_norm": 0.24534979462623596,
"learning_rate": 6.643059849514795e-06,
"loss": 0.0445,
"step": 298
},
{
"epoch": 0.41498959056210966,
"grad_norm": 0.18391579389572144,
"learning_rate": 6.621819260746713e-06,
"loss": 0.0335,
"step": 299
},
{
"epoch": 0.4163775156141568,
"grad_norm": 0.24801869690418243,
"learning_rate": 6.600545911735468e-06,
"loss": 0.044,
"step": 300
},
{
"epoch": 0.417765440666204,
"grad_norm": 0.2516506314277649,
"learning_rate": 6.579240232196073e-06,
"loss": 0.0507,
"step": 301
},
{
"epoch": 0.41915336571825124,
"grad_norm": 0.2211104929447174,
"learning_rate": 6.5579026524966106e-06,
"loss": 0.0397,
"step": 302
},
{
"epoch": 0.4205412907702984,
"grad_norm": 0.3019264042377472,
"learning_rate": 6.536533603649536e-06,
"loss": 0.0554,
"step": 303
},
{
"epoch": 0.4219292158223456,
"grad_norm": 0.2607039213180542,
"learning_rate": 6.515133517302969e-06,
"loss": 0.0443,
"step": 304
},
{
"epoch": 0.42331714087439276,
"grad_norm": 0.26423439383506775,
"learning_rate": 6.493702825731977e-06,
"loss": 0.052,
"step": 305
},
{
"epoch": 0.42470506592644,
"grad_norm": 0.2512415647506714,
"learning_rate": 6.472241961829846e-06,
"loss": 0.0437,
"step": 306
},
{
"epoch": 0.4260929909784872,
"grad_norm": 0.2525016963481903,
"learning_rate": 6.450751359099332e-06,
"loss": 0.0372,
"step": 307
},
{
"epoch": 0.42748091603053434,
"grad_norm": 0.2588847577571869,
"learning_rate": 6.429231451643907e-06,
"loss": 0.0428,
"step": 308
},
{
"epoch": 0.42886884108258155,
"grad_norm": 0.27101561427116394,
"learning_rate": 6.407682674158988e-06,
"loss": 0.0497,
"step": 309
},
{
"epoch": 0.4302567661346287,
"grad_norm": 0.2133539319038391,
"learning_rate": 6.386105461923159e-06,
"loss": 0.0362,
"step": 310
},
{
"epoch": 0.4316446911866759,
"grad_norm": 0.2591789960861206,
"learning_rate": 6.364500250789375e-06,
"loss": 0.0434,
"step": 311
},
{
"epoch": 0.43303261623872313,
"grad_norm": 0.2357206642627716,
"learning_rate": 6.342867477176164e-06,
"loss": 0.0425,
"step": 312
},
{
"epoch": 0.4344205412907703,
"grad_norm": 0.22705315053462982,
"learning_rate": 6.321207578058803e-06,
"loss": 0.049,
"step": 313
},
{
"epoch": 0.4358084663428175,
"grad_norm": 0.22295016050338745,
"learning_rate": 6.299520990960497e-06,
"loss": 0.0455,
"step": 314
},
{
"epoch": 0.43719639139486466,
"grad_norm": 0.2628712058067322,
"learning_rate": 6.2778081539435436e-06,
"loss": 0.0481,
"step": 315
},
{
"epoch": 0.43858431644691187,
"grad_norm": 0.26134777069091797,
"learning_rate": 6.256069505600474e-06,
"loss": 0.0526,
"step": 316
},
{
"epoch": 0.4399722414989591,
"grad_norm": 0.16641436517238617,
"learning_rate": 6.234305485045205e-06,
"loss": 0.0376,
"step": 317
},
{
"epoch": 0.44136016655100624,
"grad_norm": 0.2876615822315216,
"learning_rate": 6.212516531904164e-06,
"loss": 0.0474,
"step": 318
},
{
"epoch": 0.44274809160305345,
"grad_norm": 0.2932173013687134,
"learning_rate": 6.1907030863074055e-06,
"loss": 0.0508,
"step": 319
},
{
"epoch": 0.4441360166551006,
"grad_norm": 0.2550623118877411,
"learning_rate": 6.16886558887973e-06,
"loss": 0.0457,
"step": 320
},
{
"epoch": 0.4455239417071478,
"grad_norm": 0.22620978951454163,
"learning_rate": 6.1470044807317695e-06,
"loss": 0.0445,
"step": 321
},
{
"epoch": 0.44691186675919503,
"grad_norm": 0.2603720724582672,
"learning_rate": 6.1251202034510905e-06,
"loss": 0.0519,
"step": 322
},
{
"epoch": 0.4482997918112422,
"grad_norm": 0.2070014923810959,
"learning_rate": 6.103213199093267e-06,
"loss": 0.0365,
"step": 323
},
{
"epoch": 0.4496877168632894,
"grad_norm": 0.2892493009567261,
"learning_rate": 6.081283910172956e-06,
"loss": 0.0586,
"step": 324
},
{
"epoch": 0.45107564191533656,
"grad_norm": 0.2348898947238922,
"learning_rate": 6.059332779654953e-06,
"loss": 0.0426,
"step": 325
},
{
"epoch": 0.45246356696738377,
"grad_norm": 0.24212618172168732,
"learning_rate": 6.037360250945243e-06,
"loss": 0.0378,
"step": 326
},
{
"epoch": 0.4538514920194309,
"grad_norm": 0.2537144422531128,
"learning_rate": 6.015366767882054e-06,
"loss": 0.0412,
"step": 327
},
{
"epoch": 0.45523941707147814,
"grad_norm": 0.21163740754127502,
"learning_rate": 5.993352774726885e-06,
"loss": 0.0358,
"step": 328
},
{
"epoch": 0.45662734212352535,
"grad_norm": 0.340040385723114,
"learning_rate": 5.97131871615553e-06,
"loss": 0.0428,
"step": 329
},
{
"epoch": 0.4580152671755725,
"grad_norm": 0.28573116660118103,
"learning_rate": 5.949265037249096e-06,
"loss": 0.0557,
"step": 330
},
{
"epoch": 0.4594031922276197,
"grad_norm": 0.2429424375295639,
"learning_rate": 5.927192183485023e-06,
"loss": 0.0444,
"step": 331
},
{
"epoch": 0.4607911172796669,
"grad_norm": 0.19028761982917786,
"learning_rate": 5.905100600728067e-06,
"loss": 0.0368,
"step": 332
},
{
"epoch": 0.4621790423317141,
"grad_norm": 0.24256815016269684,
"learning_rate": 5.882990735221312e-06,
"loss": 0.0501,
"step": 333
},
{
"epoch": 0.4635669673837613,
"grad_norm": 0.2468879520893097,
"learning_rate": 5.860863033577141e-06,
"loss": 0.0551,
"step": 334
},
{
"epoch": 0.46495489243580845,
"grad_norm": 0.26638200879096985,
"learning_rate": 5.8387179427682265e-06,
"loss": 0.038,
"step": 335
},
{
"epoch": 0.46634281748785567,
"grad_norm": 0.2620946764945984,
"learning_rate": 5.8165559101184955e-06,
"loss": 0.0596,
"step": 336
},
{
"epoch": 0.4677307425399028,
"grad_norm": 0.3298340439796448,
"learning_rate": 5.794377383294094e-06,
"loss": 0.0422,
"step": 337
},
{
"epoch": 0.46911866759195003,
"grad_norm": 0.2864164710044861,
"learning_rate": 5.7721828102943445e-06,
"loss": 0.0557,
"step": 338
},
{
"epoch": 0.47050659264399725,
"grad_norm": 0.19313201308250427,
"learning_rate": 5.749972639442698e-06,
"loss": 0.0352,
"step": 339
},
{
"epoch": 0.4718945176960444,
"grad_norm": 0.2098756730556488,
"learning_rate": 5.72774731937768e-06,
"loss": 0.0465,
"step": 340
},
{
"epoch": 0.4732824427480916,
"grad_norm": 0.23879243433475494,
"learning_rate": 5.705507299043822e-06,
"loss": 0.0407,
"step": 341
},
{
"epoch": 0.47467036780013877,
"grad_norm": 0.24368953704833984,
"learning_rate": 5.683253027682597e-06,
"loss": 0.047,
"step": 342
},
{
"epoch": 0.476058292852186,
"grad_norm": 0.22611577808856964,
"learning_rate": 5.660984954823342e-06,
"loss": 0.0444,
"step": 343
},
{
"epoch": 0.4774462179042332,
"grad_norm": 0.21530263125896454,
"learning_rate": 5.638703530274187e-06,
"loss": 0.0391,
"step": 344
},
{
"epoch": 0.47883414295628035,
"grad_norm": 0.2654690444469452,
"learning_rate": 5.6164092041129544e-06,
"loss": 0.0537,
"step": 345
},
{
"epoch": 0.48022206800832756,
"grad_norm": 0.24219410121440887,
"learning_rate": 5.594102426678082e-06,
"loss": 0.0469,
"step": 346
},
{
"epoch": 0.4816099930603747,
"grad_norm": 0.2516026496887207,
"learning_rate": 5.57178364855951e-06,
"loss": 0.0438,
"step": 347
},
{
"epoch": 0.48299791811242193,
"grad_norm": 0.18920086324214935,
"learning_rate": 5.549453320589598e-06,
"loss": 0.0383,
"step": 348
},
{
"epoch": 0.48438584316446914,
"grad_norm": 0.25243017077445984,
"learning_rate": 5.527111893834004e-06,
"loss": 0.0473,
"step": 349
},
{
"epoch": 0.4857737682165163,
"grad_norm": 0.18175047636032104,
"learning_rate": 5.504759819582581e-06,
"loss": 0.039,
"step": 350
},
{
"epoch": 0.4871616932685635,
"grad_norm": 0.262374609708786,
"learning_rate": 5.482397549340256e-06,
"loss": 0.0502,
"step": 351
},
{
"epoch": 0.48854961832061067,
"grad_norm": 0.22592027485370636,
"learning_rate": 5.460025534817911e-06,
"loss": 0.0463,
"step": 352
},
{
"epoch": 0.4899375433726579,
"grad_norm": 0.24996748566627502,
"learning_rate": 5.437644227923261e-06,
"loss": 0.0492,
"step": 353
},
{
"epoch": 0.4913254684247051,
"grad_norm": 0.2771015167236328,
"learning_rate": 5.415254080751725e-06,
"loss": 0.0441,
"step": 354
},
{
"epoch": 0.49271339347675225,
"grad_norm": 0.21063049137592316,
"learning_rate": 5.39285554557729e-06,
"loss": 0.041,
"step": 355
},
{
"epoch": 0.49410131852879946,
"grad_norm": 0.2659851014614105,
"learning_rate": 5.37044907484338e-06,
"loss": 0.051,
"step": 356
},
{
"epoch": 0.4954892435808466,
"grad_norm": 0.24859070777893066,
"learning_rate": 5.348035121153716e-06,
"loss": 0.0454,
"step": 357
},
{
"epoch": 0.49687716863289383,
"grad_norm": 0.2629816234111786,
"learning_rate": 5.32561413726317e-06,
"loss": 0.0433,
"step": 358
},
{
"epoch": 0.49826509368494104,
"grad_norm": 0.21318283677101135,
"learning_rate": 5.303186576068621e-06,
"loss": 0.0408,
"step": 359
},
{
"epoch": 0.4996530187369882,
"grad_norm": 0.28312963247299194,
"learning_rate": 5.28075289059981e-06,
"loss": 0.0464,
"step": 360
},
{
"epoch": 0.5010409437890354,
"grad_norm": 0.2362765371799469,
"learning_rate": 5.258313534010187e-06,
"loss": 0.0365,
"step": 361
},
{
"epoch": 0.5024288688410826,
"grad_norm": 0.17733339965343475,
"learning_rate": 5.235868959567755e-06,
"loss": 0.0348,
"step": 362
},
{
"epoch": 0.5038167938931297,
"grad_norm": 0.29030370712280273,
"learning_rate": 5.213419620645914e-06,
"loss": 0.0451,
"step": 363
},
{
"epoch": 0.5052047189451769,
"grad_norm": 0.26337236166000366,
"learning_rate": 5.1909659707143105e-06,
"loss": 0.0403,
"step": 364
},
{
"epoch": 0.5065926439972241,
"grad_norm": 0.27861809730529785,
"learning_rate": 5.1685084633296665e-06,
"loss": 0.0489,
"step": 365
},
{
"epoch": 0.5079805690492714,
"grad_norm": 0.26633739471435547,
"learning_rate": 5.14604755212663e-06,
"loss": 0.0539,
"step": 366
},
{
"epoch": 0.5093684941013186,
"grad_norm": 0.2497876137495041,
"learning_rate": 5.123583690808596e-06,
"loss": 0.0469,
"step": 367
},
{
"epoch": 0.5107564191533657,
"grad_norm": 0.2367750108242035,
"learning_rate": 5.101117333138558e-06,
"loss": 0.0455,
"step": 368
},
{
"epoch": 0.5121443442054129,
"grad_norm": 0.21793133020401,
"learning_rate": 5.078648932929933e-06,
"loss": 0.0441,
"step": 369
},
{
"epoch": 0.5135322692574601,
"grad_norm": 0.2375047653913498,
"learning_rate": 5.056178944037396e-06,
"loss": 0.0536,
"step": 370
},
{
"epoch": 0.5149201943095073,
"grad_norm": 0.24593202769756317,
"learning_rate": 5.033707820347715e-06,
"loss": 0.0488,
"step": 371
},
{
"epoch": 0.5163081193615545,
"grad_norm": 0.2014894336462021,
"learning_rate": 5.011236015770577e-06,
"loss": 0.0438,
"step": 372
},
{
"epoch": 0.5176960444136016,
"grad_norm": 0.2295396327972412,
"learning_rate": 4.988763984229425e-06,
"loss": 0.0426,
"step": 373
},
{
"epoch": 0.5190839694656488,
"grad_norm": 0.23237484693527222,
"learning_rate": 4.9662921796522856e-06,
"loss": 0.0432,
"step": 374
},
{
"epoch": 0.520471894517696,
"grad_norm": 0.18679746985435486,
"learning_rate": 4.9438210559626045e-06,
"loss": 0.0314,
"step": 375
},
{
"epoch": 0.5218598195697433,
"grad_norm": 0.2613310217857361,
"learning_rate": 4.921351067070068e-06,
"loss": 0.0517,
"step": 376
},
{
"epoch": 0.5232477446217905,
"grad_norm": 0.21653001010417938,
"learning_rate": 4.898882666861444e-06,
"loss": 0.0374,
"step": 377
},
{
"epoch": 0.5246356696738376,
"grad_norm": 0.23039276897907257,
"learning_rate": 4.876416309191406e-06,
"loss": 0.0353,
"step": 378
},
{
"epoch": 0.5260235947258848,
"grad_norm": 0.2714105248451233,
"learning_rate": 4.853952447873371e-06,
"loss": 0.0552,
"step": 379
},
{
"epoch": 0.527411519777932,
"grad_norm": 0.25004643201828003,
"learning_rate": 4.831491536670334e-06,
"loss": 0.0462,
"step": 380
},
{
"epoch": 0.5287994448299792,
"grad_norm": 0.22087691724300385,
"learning_rate": 4.809034029285691e-06,
"loss": 0.0358,
"step": 381
},
{
"epoch": 0.5301873698820264,
"grad_norm": 0.20914320647716522,
"learning_rate": 4.786580379354087e-06,
"loss": 0.0362,
"step": 382
},
{
"epoch": 0.5315752949340735,
"grad_norm": 0.1910157948732376,
"learning_rate": 4.7641310404322475e-06,
"loss": 0.0388,
"step": 383
},
{
"epoch": 0.5329632199861207,
"grad_norm": 0.2663300037384033,
"learning_rate": 4.741686465989814e-06,
"loss": 0.0458,
"step": 384
},
{
"epoch": 0.5343511450381679,
"grad_norm": 0.19584941864013672,
"learning_rate": 4.719247109400192e-06,
"loss": 0.042,
"step": 385
},
{
"epoch": 0.5357390700902152,
"grad_norm": 0.21342402696609497,
"learning_rate": 4.696813423931381e-06,
"loss": 0.0441,
"step": 386
},
{
"epoch": 0.5371269951422624,
"grad_norm": 0.2350674718618393,
"learning_rate": 4.674385862736832e-06,
"loss": 0.0479,
"step": 387
},
{
"epoch": 0.5385149201943095,
"grad_norm": 0.27498018741607666,
"learning_rate": 4.651964878846285e-06,
"loss": 0.0419,
"step": 388
},
{
"epoch": 0.5399028452463567,
"grad_norm": 0.19162392616271973,
"learning_rate": 4.62955092515662e-06,
"loss": 0.0396,
"step": 389
},
{
"epoch": 0.5412907702984039,
"grad_norm": 0.2587217688560486,
"learning_rate": 4.607144454422711e-06,
"loss": 0.048,
"step": 390
},
{
"epoch": 0.5426786953504511,
"grad_norm": 0.3030252456665039,
"learning_rate": 4.584745919248275e-06,
"loss": 0.0629,
"step": 391
},
{
"epoch": 0.5440666204024983,
"grad_norm": 0.27365297079086304,
"learning_rate": 4.56235577207674e-06,
"loss": 0.0356,
"step": 392
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.22301366925239563,
"learning_rate": 4.5399744651820915e-06,
"loss": 0.0399,
"step": 393
},
{
"epoch": 0.5468424705065926,
"grad_norm": 0.23638130724430084,
"learning_rate": 4.517602450659746e-06,
"loss": 0.0362,
"step": 394
},
{
"epoch": 0.5482303955586398,
"grad_norm": 0.23380909860134125,
"learning_rate": 4.49524018041742e-06,
"loss": 0.0439,
"step": 395
},
{
"epoch": 0.549618320610687,
"grad_norm": 0.1905248910188675,
"learning_rate": 4.472888106165995e-06,
"loss": 0.042,
"step": 396
},
{
"epoch": 0.5510062456627343,
"grad_norm": 0.25941696763038635,
"learning_rate": 4.450546679410403e-06,
"loss": 0.046,
"step": 397
},
{
"epoch": 0.5523941707147814,
"grad_norm": 0.25850844383239746,
"learning_rate": 4.428216351440492e-06,
"loss": 0.0457,
"step": 398
},
{
"epoch": 0.5537820957668286,
"grad_norm": 0.19940395653247833,
"learning_rate": 4.40589757332192e-06,
"loss": 0.0416,
"step": 399
},
{
"epoch": 0.5551700208188758,
"grad_norm": 0.2690625786781311,
"learning_rate": 4.383590795887046e-06,
"loss": 0.0546,
"step": 400
},
{
"epoch": 0.556557945870923,
"grad_norm": 0.21887092292308807,
"learning_rate": 4.361296469725813e-06,
"loss": 0.0458,
"step": 401
},
{
"epoch": 0.5579458709229702,
"grad_norm": 0.19797225296497345,
"learning_rate": 4.339015045176659e-06,
"loss": 0.038,
"step": 402
},
{
"epoch": 0.5593337959750173,
"grad_norm": 0.2749207317829132,
"learning_rate": 4.316746972317406e-06,
"loss": 0.0617,
"step": 403
},
{
"epoch": 0.5607217210270645,
"grad_norm": 0.26302123069763184,
"learning_rate": 4.2944927009561786e-06,
"loss": 0.0404,
"step": 404
},
{
"epoch": 0.5621096460791117,
"grad_norm": 0.2845047116279602,
"learning_rate": 4.272252680622321e-06,
"loss": 0.0537,
"step": 405
},
{
"epoch": 0.563497571131159,
"grad_norm": 0.20317070186138153,
"learning_rate": 4.250027360557302e-06,
"loss": 0.039,
"step": 406
},
{
"epoch": 0.5648854961832062,
"grad_norm": 0.3297349214553833,
"learning_rate": 4.227817189705657e-06,
"loss": 0.0659,
"step": 407
},
{
"epoch": 0.5662734212352533,
"grad_norm": 0.19175824522972107,
"learning_rate": 4.205622616705909e-06,
"loss": 0.0344,
"step": 408
},
{
"epoch": 0.5676613462873005,
"grad_norm": 0.21104075014591217,
"learning_rate": 4.183444089881506e-06,
"loss": 0.0403,
"step": 409
},
{
"epoch": 0.5690492713393477,
"grad_norm": 0.23763036727905273,
"learning_rate": 4.161282057231776e-06,
"loss": 0.0425,
"step": 410
},
{
"epoch": 0.5704371963913949,
"grad_norm": 0.27829548716545105,
"learning_rate": 4.13913696642286e-06,
"loss": 0.0379,
"step": 411
},
{
"epoch": 0.5718251214434421,
"grad_norm": 0.2803460359573364,
"learning_rate": 4.1170092647786895e-06,
"loss": 0.0566,
"step": 412
},
{
"epoch": 0.5732130464954892,
"grad_norm": 0.2148689329624176,
"learning_rate": 4.094899399271935e-06,
"loss": 0.0497,
"step": 413
},
{
"epoch": 0.5746009715475364,
"grad_norm": 0.25749555230140686,
"learning_rate": 4.072807816514978e-06,
"loss": 0.0543,
"step": 414
},
{
"epoch": 0.5759888965995836,
"grad_norm": 0.21719405055046082,
"learning_rate": 4.0507349627509045e-06,
"loss": 0.0368,
"step": 415
},
{
"epoch": 0.5773768216516308,
"grad_norm": 0.1985744833946228,
"learning_rate": 4.028681283844471e-06,
"loss": 0.0326,
"step": 416
},
{
"epoch": 0.5787647467036781,
"grad_norm": 0.33434179425239563,
"learning_rate": 4.006647225273116e-06,
"loss": 0.0477,
"step": 417
},
{
"epoch": 0.5801526717557252,
"grad_norm": 0.2449856400489807,
"learning_rate": 3.984633232117948e-06,
"loss": 0.0495,
"step": 418
},
{
"epoch": 0.5815405968077724,
"grad_norm": 0.2528287172317505,
"learning_rate": 3.96263974905476e-06,
"loss": 0.0463,
"step": 419
},
{
"epoch": 0.5829285218598196,
"grad_norm": 0.19862471520900726,
"learning_rate": 3.94066722034505e-06,
"loss": 0.0404,
"step": 420
},
{
"epoch": 0.5843164469118668,
"grad_norm": 0.22756393253803253,
"learning_rate": 3.9187160898270435e-06,
"loss": 0.0463,
"step": 421
},
{
"epoch": 0.5857043719639139,
"grad_norm": 0.23390746116638184,
"learning_rate": 3.896786800906734e-06,
"loss": 0.0478,
"step": 422
},
{
"epoch": 0.5870922970159611,
"grad_norm": 0.302847295999527,
"learning_rate": 3.87487979654891e-06,
"loss": 0.0481,
"step": 423
},
{
"epoch": 0.5884802220680083,
"grad_norm": 0.27103695273399353,
"learning_rate": 3.852995519268231e-06,
"loss": 0.0501,
"step": 424
},
{
"epoch": 0.5898681471200555,
"grad_norm": 0.2562784254550934,
"learning_rate": 3.831134411120273e-06,
"loss": 0.0443,
"step": 425
},
{
"epoch": 0.5912560721721027,
"grad_norm": 0.2507801949977875,
"learning_rate": 3.809296913692594e-06,
"loss": 0.0419,
"step": 426
},
{
"epoch": 0.5926439972241498,
"grad_norm": 0.2666143476963043,
"learning_rate": 3.787483468095838e-06,
"loss": 0.0338,
"step": 427
},
{
"epoch": 0.594031922276197,
"grad_norm": 0.20649617910385132,
"learning_rate": 3.765694514954796e-06,
"loss": 0.0495,
"step": 428
},
{
"epoch": 0.5954198473282443,
"grad_norm": 0.25489917397499084,
"learning_rate": 3.7439304943995274e-06,
"loss": 0.0466,
"step": 429
},
{
"epoch": 0.5968077723802915,
"grad_norm": 0.21290229260921478,
"learning_rate": 3.72219184605646e-06,
"loss": 0.0488,
"step": 430
},
{
"epoch": 0.5981956974323387,
"grad_norm": 0.25812432169914246,
"learning_rate": 3.7004790090395043e-06,
"loss": 0.0473,
"step": 431
},
{
"epoch": 0.5995836224843858,
"grad_norm": 0.2217107117176056,
"learning_rate": 3.678792421941199e-06,
"loss": 0.0415,
"step": 432
},
{
"epoch": 0.600971547536433,
"grad_norm": 0.21589906513690948,
"learning_rate": 3.657132522823837e-06,
"loss": 0.0395,
"step": 433
},
{
"epoch": 0.6023594725884802,
"grad_norm": 0.2054014950990677,
"learning_rate": 3.6354997492106258e-06,
"loss": 0.0478,
"step": 434
},
{
"epoch": 0.6037473976405274,
"grad_norm": 0.2601326107978821,
"learning_rate": 3.6138945380768442e-06,
"loss": 0.0418,
"step": 435
},
{
"epoch": 0.6051353226925746,
"grad_norm": 0.19384582340717316,
"learning_rate": 3.592317325841014e-06,
"loss": 0.0357,
"step": 436
},
{
"epoch": 0.6065232477446217,
"grad_norm": 0.22405052185058594,
"learning_rate": 3.5707685483560948e-06,
"loss": 0.0373,
"step": 437
},
{
"epoch": 0.607911172796669,
"grad_norm": 0.274188756942749,
"learning_rate": 3.5492486409006684e-06,
"loss": 0.0522,
"step": 438
},
{
"epoch": 0.6092990978487162,
"grad_norm": 0.21671554446220398,
"learning_rate": 3.5277580381701553e-06,
"loss": 0.0417,
"step": 439
},
{
"epoch": 0.6106870229007634,
"grad_norm": 0.2146933525800705,
"learning_rate": 3.5062971742680244e-06,
"loss": 0.04,
"step": 440
},
{
"epoch": 0.6120749479528106,
"grad_norm": 0.20991787314414978,
"learning_rate": 3.484866482697032e-06,
"loss": 0.0365,
"step": 441
},
{
"epoch": 0.6134628730048577,
"grad_norm": 0.26411962509155273,
"learning_rate": 3.4634663963504654e-06,
"loss": 0.0514,
"step": 442
},
{
"epoch": 0.6148507980569049,
"grad_norm": 0.19666975736618042,
"learning_rate": 3.4420973475033894e-06,
"loss": 0.0377,
"step": 443
},
{
"epoch": 0.6162387231089521,
"grad_norm": 0.21241791546344757,
"learning_rate": 3.4207597678039293e-06,
"loss": 0.0358,
"step": 444
},
{
"epoch": 0.6176266481609993,
"grad_norm": 0.24057012796401978,
"learning_rate": 3.3994540882645353e-06,
"loss": 0.0514,
"step": 445
},
{
"epoch": 0.6190145732130465,
"grad_norm": 0.24572890996932983,
"learning_rate": 3.3781807392532893e-06,
"loss": 0.0406,
"step": 446
},
{
"epoch": 0.6204024982650936,
"grad_norm": 0.1983802616596222,
"learning_rate": 3.3569401504852073e-06,
"loss": 0.0371,
"step": 447
},
{
"epoch": 0.6217904233171409,
"grad_norm": 0.25842270255088806,
"learning_rate": 3.335732751013553e-06,
"loss": 0.0476,
"step": 448
},
{
"epoch": 0.6231783483691881,
"grad_norm": 0.19981160759925842,
"learning_rate": 3.3145589692211837e-06,
"loss": 0.0378,
"step": 449
},
{
"epoch": 0.6245662734212353,
"grad_norm": 0.1566598266363144,
"learning_rate": 3.2934192328118866e-06,
"loss": 0.0295,
"step": 450
},
{
"epoch": 0.6259541984732825,
"grad_norm": 0.2110665738582611,
"learning_rate": 3.27231396880174e-06,
"loss": 0.0406,
"step": 451
},
{
"epoch": 0.6273421235253296,
"grad_norm": 0.21389417350292206,
"learning_rate": 3.2512436035104968e-06,
"loss": 0.0445,
"step": 452
},
{
"epoch": 0.6287300485773768,
"grad_norm": 0.2280310094356537,
"learning_rate": 3.2302085625529596e-06,
"loss": 0.0407,
"step": 453
},
{
"epoch": 0.630117973629424,
"grad_norm": 0.23333007097244263,
"learning_rate": 3.2092092708303973e-06,
"loss": 0.042,
"step": 454
},
{
"epoch": 0.6315058986814712,
"grad_norm": 0.2135223001241684,
"learning_rate": 3.18824615252195e-06,
"loss": 0.0394,
"step": 455
},
{
"epoch": 0.6328938237335184,
"grad_norm": 0.2351330667734146,
"learning_rate": 3.1673196310760723e-06,
"loss": 0.042,
"step": 456
},
{
"epoch": 0.6342817487855655,
"grad_norm": 0.30510368943214417,
"learning_rate": 3.146430129201965e-06,
"loss": 0.064,
"step": 457
},
{
"epoch": 0.6356696738376127,
"grad_norm": 0.232173889875412,
"learning_rate": 3.125578068861051e-06,
"loss": 0.0465,
"step": 458
},
{
"epoch": 0.63705759888966,
"grad_norm": 0.2363738715648651,
"learning_rate": 3.104763871258447e-06,
"loss": 0.043,
"step": 459
},
{
"epoch": 0.6384455239417072,
"grad_norm": 0.21220462024211884,
"learning_rate": 3.083987956834449e-06,
"loss": 0.0418,
"step": 460
},
{
"epoch": 0.6398334489937544,
"grad_norm": 0.2919938862323761,
"learning_rate": 3.06325074525605e-06,
"loss": 0.0583,
"step": 461
},
{
"epoch": 0.6412213740458015,
"grad_norm": 0.18480442464351654,
"learning_rate": 3.0425526554084526e-06,
"loss": 0.0376,
"step": 462
},
{
"epoch": 0.6426092990978487,
"grad_norm": 0.21944300830364227,
"learning_rate": 3.0218941053866167e-06,
"loss": 0.035,
"step": 463
},
{
"epoch": 0.6439972241498959,
"grad_norm": 0.2114299088716507,
"learning_rate": 3.00127551248681e-06,
"loss": 0.0374,
"step": 464
},
{
"epoch": 0.6453851492019431,
"grad_norm": 0.222478449344635,
"learning_rate": 2.980697293198174e-06,
"loss": 0.038,
"step": 465
},
{
"epoch": 0.6467730742539903,
"grad_norm": 0.2513335943222046,
"learning_rate": 2.960159863194322e-06,
"loss": 0.0433,
"step": 466
},
{
"epoch": 0.6481609993060374,
"grad_norm": 0.2263910174369812,
"learning_rate": 2.939663637324934e-06,
"loss": 0.0425,
"step": 467
},
{
"epoch": 0.6495489243580846,
"grad_norm": 0.2414645254611969,
"learning_rate": 2.9192090296073755e-06,
"loss": 0.0434,
"step": 468
},
{
"epoch": 0.6509368494101319,
"grad_norm": 0.31796714663505554,
"learning_rate": 2.8987964532183454e-06,
"loss": 0.056,
"step": 469
},
{
"epoch": 0.6523247744621791,
"grad_norm": 0.2518536448478699,
"learning_rate": 2.878426320485518e-06,
"loss": 0.0507,
"step": 470
},
{
"epoch": 0.6537126995142263,
"grad_norm": 0.24848508834838867,
"learning_rate": 2.8580990428792205e-06,
"loss": 0.0432,
"step": 471
},
{
"epoch": 0.6551006245662734,
"grad_norm": 0.2785174250602722,
"learning_rate": 2.8378150310041197e-06,
"loss": 0.0555,
"step": 472
},
{
"epoch": 0.6564885496183206,
"grad_norm": 0.48383262753486633,
"learning_rate": 2.8175746945909277e-06,
"loss": 0.0437,
"step": 473
},
{
"epoch": 0.6578764746703678,
"grad_norm": 0.2524990439414978,
"learning_rate": 2.7973784424881273e-06,
"loss": 0.0392,
"step": 474
},
{
"epoch": 0.659264399722415,
"grad_norm": 0.22584012150764465,
"learning_rate": 2.7772266826537103e-06,
"loss": 0.0398,
"step": 475
},
{
"epoch": 0.6606523247744622,
"grad_norm": 0.20362503826618195,
"learning_rate": 2.75711982214694e-06,
"loss": 0.0431,
"step": 476
},
{
"epoch": 0.6620402498265093,
"grad_norm": 0.2138269990682602,
"learning_rate": 2.7370582671201253e-06,
"loss": 0.0336,
"step": 477
},
{
"epoch": 0.6634281748785565,
"grad_norm": 0.1999903917312622,
"learning_rate": 2.7170424228104207e-06,
"loss": 0.0295,
"step": 478
},
{
"epoch": 0.6648160999306038,
"grad_norm": 0.2650030553340912,
"learning_rate": 2.697072693531637e-06,
"loss": 0.0418,
"step": 479
},
{
"epoch": 0.666204024982651,
"grad_norm": 0.26400989294052124,
"learning_rate": 2.6771494826660782e-06,
"loss": 0.0495,
"step": 480
},
{
"epoch": 0.6675919500346982,
"grad_norm": 0.3058173358440399,
"learning_rate": 2.6572731926563867e-06,
"loss": 0.0366,
"step": 481
},
{
"epoch": 0.6689798750867453,
"grad_norm": 0.26100462675094604,
"learning_rate": 2.6374442249974214e-06,
"loss": 0.0525,
"step": 482
},
{
"epoch": 0.6703678001387925,
"grad_norm": 0.3068617284297943,
"learning_rate": 2.617662980228144e-06,
"loss": 0.0448,
"step": 483
},
{
"epoch": 0.6717557251908397,
"grad_norm": 0.21236619353294373,
"learning_rate": 2.5979298579235276e-06,
"loss": 0.0395,
"step": 484
},
{
"epoch": 0.6731436502428869,
"grad_norm": 0.213504821062088,
"learning_rate": 2.578245256686488e-06,
"loss": 0.0329,
"step": 485
},
{
"epoch": 0.6745315752949341,
"grad_norm": 0.2812097668647766,
"learning_rate": 2.558609574139829e-06,
"loss": 0.0427,
"step": 486
},
{
"epoch": 0.6759195003469812,
"grad_norm": 0.23506538569927216,
"learning_rate": 2.539023206918212e-06,
"loss": 0.0422,
"step": 487
},
{
"epoch": 0.6773074253990284,
"grad_norm": 0.17870992422103882,
"learning_rate": 2.5194865506601507e-06,
"loss": 0.0388,
"step": 488
},
{
"epoch": 0.6786953504510757,
"grad_norm": 0.284327894449234,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.0488,
"step": 489
},
{
"epoch": 0.6800832755031229,
"grad_norm": 0.23880119621753693,
"learning_rate": 2.4805639485600087e-06,
"loss": 0.0412,
"step": 490
},
{
"epoch": 0.6814712005551701,
"grad_norm": 0.2482188642024994,
"learning_rate": 2.4611787889423546e-06,
"loss": 0.0404,
"step": 491
},
{
"epoch": 0.6828591256072172,
"grad_norm": 0.24093759059906006,
"learning_rate": 2.441844912721209e-06,
"loss": 0.0488,
"step": 492
},
{
"epoch": 0.6842470506592644,
"grad_norm": 0.2581704258918762,
"learning_rate": 2.422562710434848e-06,
"loss": 0.041,
"step": 493
},
{
"epoch": 0.6856349757113116,
"grad_norm": 0.29225507378578186,
"learning_rate": 2.403332571577738e-06,
"loss": 0.0562,
"step": 494
},
{
"epoch": 0.6870229007633588,
"grad_norm": 0.23954467475414276,
"learning_rate": 2.3841548845926844e-06,
"loss": 0.0468,
"step": 495
},
{
"epoch": 0.6884108258154059,
"grad_norm": 0.23937787115573883,
"learning_rate": 2.365030036862988e-06,
"loss": 0.0405,
"step": 496
},
{
"epoch": 0.6897987508674531,
"grad_norm": 0.28101256489753723,
"learning_rate": 2.3459584147046e-06,
"loss": 0.0555,
"step": 497
},
{
"epoch": 0.6911866759195003,
"grad_norm": 0.1975279599428177,
"learning_rate": 2.3269404033583443e-06,
"loss": 0.0409,
"step": 498
},
{
"epoch": 0.6925746009715475,
"grad_norm": 0.19648875296115875,
"learning_rate": 2.3079763869821176e-06,
"loss": 0.0317,
"step": 499
},
{
"epoch": 0.6939625260235948,
"grad_norm": 0.1975594460964203,
"learning_rate": 2.2890667486431296e-06,
"loss": 0.0346,
"step": 500
},
{
"epoch": 0.6953504510756419,
"grad_norm": 0.2302437722682953,
"learning_rate": 2.270211870310184e-06,
"loss": 0.0427,
"step": 501
},
{
"epoch": 0.6967383761276891,
"grad_norm": 0.2960100769996643,
"learning_rate": 2.251412132845933e-06,
"loss": 0.0535,
"step": 502
},
{
"epoch": 0.6981263011797363,
"grad_norm": 0.18378931283950806,
"learning_rate": 2.232667915999216e-06,
"loss": 0.0375,
"step": 503
},
{
"epoch": 0.6995142262317835,
"grad_norm": 0.20398057997226715,
"learning_rate": 2.2139795983973654e-06,
"loss": 0.0383,
"step": 504
},
{
"epoch": 0.7009021512838307,
"grad_norm": 0.6562597155570984,
"learning_rate": 2.1953475575385618e-06,
"loss": 0.0437,
"step": 505
},
{
"epoch": 0.7022900763358778,
"grad_norm": 0.2080502212047577,
"learning_rate": 2.1767721697842244e-06,
"loss": 0.0396,
"step": 506
},
{
"epoch": 0.703678001387925,
"grad_norm": 0.23737893998622894,
"learning_rate": 2.1582538103513896e-06,
"loss": 0.0446,
"step": 507
},
{
"epoch": 0.7050659264399722,
"grad_norm": 0.24636588990688324,
"learning_rate": 2.139792853305141e-06,
"loss": 0.0462,
"step": 508
},
{
"epoch": 0.7064538514920194,
"grad_norm": 0.2152203917503357,
"learning_rate": 2.121389671551054e-06,
"loss": 0.0492,
"step": 509
},
{
"epoch": 0.7078417765440667,
"grad_norm": 0.26464414596557617,
"learning_rate": 2.1030446368276547e-06,
"loss": 0.0534,
"step": 510
},
{
"epoch": 0.7092297015961138,
"grad_norm": 0.27359458804130554,
"learning_rate": 2.0847581196989277e-06,
"loss": 0.0506,
"step": 511
},
{
"epoch": 0.710617626648161,
"grad_norm": 0.20019769668579102,
"learning_rate": 2.0665304895468114e-06,
"loss": 0.0392,
"step": 512
},
{
"epoch": 0.7120055517002082,
"grad_norm": 0.4770808815956116,
"learning_rate": 2.04836211456375e-06,
"loss": 0.0518,
"step": 513
},
{
"epoch": 0.7133934767522554,
"grad_norm": 0.26947441697120667,
"learning_rate": 2.030253361745251e-06,
"loss": 0.0434,
"step": 514
},
{
"epoch": 0.7147814018043026,
"grad_norm": 0.2415596842765808,
"learning_rate": 2.012204596882472e-06,
"loss": 0.0425,
"step": 515
},
{
"epoch": 0.7161693268563497,
"grad_norm": 0.19842304289340973,
"learning_rate": 1.9942161845548334e-06,
"loss": 0.0361,
"step": 516
},
{
"epoch": 0.7175572519083969,
"grad_norm": 0.27983754873275757,
"learning_rate": 1.9762884881226535e-06,
"loss": 0.0447,
"step": 517
},
{
"epoch": 0.7189451769604441,
"grad_norm": 0.24323104321956635,
"learning_rate": 1.958421869719807e-06,
"loss": 0.0498,
"step": 518
},
{
"epoch": 0.7203331020124913,
"grad_norm": 0.18736103177070618,
"learning_rate": 1.9406166902464128e-06,
"loss": 0.028,
"step": 519
},
{
"epoch": 0.7217210270645386,
"grad_norm": 0.29117128252983093,
"learning_rate": 1.922873309361542e-06,
"loss": 0.0571,
"step": 520
},
{
"epoch": 0.7231089521165857,
"grad_norm": 0.4879210591316223,
"learning_rate": 1.9051920854759543e-06,
"loss": 0.0387,
"step": 521
},
{
"epoch": 0.7244968771686329,
"grad_norm": 0.18582020699977875,
"learning_rate": 1.887573375744856e-06,
"loss": 0.0302,
"step": 522
},
{
"epoch": 0.7258848022206801,
"grad_norm": 0.1959291249513626,
"learning_rate": 1.8700175360606882e-06,
"loss": 0.0339,
"step": 523
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.26895925402641296,
"learning_rate": 1.8525249210459345e-06,
"loss": 0.0473,
"step": 524
},
{
"epoch": 0.7286606523247745,
"grad_norm": 0.30201104283332825,
"learning_rate": 1.8350958840459665e-06,
"loss": 0.0491,
"step": 525
},
{
"epoch": 0.7300485773768216,
"grad_norm": 0.2387959212064743,
"learning_rate": 1.8177307771218894e-06,
"loss": 0.0358,
"step": 526
},
{
"epoch": 0.7314365024288688,
"grad_norm": 0.21746529638767242,
"learning_rate": 1.8004299510434493e-06,
"loss": 0.0377,
"step": 527
},
{
"epoch": 0.732824427480916,
"grad_norm": 0.23417864739894867,
"learning_rate": 1.7831937552819345e-06,
"loss": 0.0426,
"step": 528
},
{
"epoch": 0.7342123525329632,
"grad_norm": 0.18516947329044342,
"learning_rate": 1.766022538003122e-06,
"loss": 0.0279,
"step": 529
},
{
"epoch": 0.7356002775850105,
"grad_norm": 0.24591688811779022,
"learning_rate": 1.7489166460602496e-06,
"loss": 0.0451,
"step": 530
},
{
"epoch": 0.7369882026370576,
"grad_norm": 0.19787262380123138,
"learning_rate": 1.7318764249869934e-06,
"loss": 0.0354,
"step": 531
},
{
"epoch": 0.7383761276891048,
"grad_norm": 0.1984841376543045,
"learning_rate": 1.7149022189905041e-06,
"loss": 0.0411,
"step": 532
},
{
"epoch": 0.739764052741152,
"grad_norm": 0.26569145917892456,
"learning_rate": 1.697994370944452e-06,
"loss": 0.0555,
"step": 533
},
{
"epoch": 0.7411519777931992,
"grad_norm": 0.25187060236930847,
"learning_rate": 1.6811532223820875e-06,
"loss": 0.0423,
"step": 534
},
{
"epoch": 0.7425399028452464,
"grad_norm": 0.21106332540512085,
"learning_rate": 1.6643791134893644e-06,
"loss": 0.0368,
"step": 535
},
{
"epoch": 0.7439278278972935,
"grad_norm": 0.20370684564113617,
"learning_rate": 1.6476723830980451e-06,
"loss": 0.038,
"step": 536
},
{
"epoch": 0.7453157529493407,
"grad_norm": 0.2407875657081604,
"learning_rate": 1.631033368678872e-06,
"loss": 0.0431,
"step": 537
},
{
"epoch": 0.7467036780013879,
"grad_norm": 0.2957558333873749,
"learning_rate": 1.6144624063347514e-06,
"loss": 0.0308,
"step": 538
},
{
"epoch": 0.7480916030534351,
"grad_norm": 0.19274376332759857,
"learning_rate": 1.597959830793947e-06,
"loss": 0.0298,
"step": 539
},
{
"epoch": 0.7494795281054824,
"grad_norm": 0.2099481225013733,
"learning_rate": 1.5815259754033407e-06,
"loss": 0.0343,
"step": 540
},
{
"epoch": 0.7508674531575295,
"grad_norm": 0.2350345253944397,
"learning_rate": 1.5651611721216865e-06,
"loss": 0.0387,
"step": 541
},
{
"epoch": 0.7522553782095767,
"grad_norm": 0.23420676589012146,
"learning_rate": 1.5488657515129001e-06,
"loss": 0.0422,
"step": 542
},
{
"epoch": 0.7536433032616239,
"grad_norm": 0.26220807433128357,
"learning_rate": 1.5326400427394023e-06,
"loss": 0.0444,
"step": 543
},
{
"epoch": 0.7550312283136711,
"grad_norm": 0.19518734514713287,
"learning_rate": 1.5164843735554408e-06,
"loss": 0.0412,
"step": 544
},
{
"epoch": 0.7564191533657183,
"grad_norm": 0.2225373089313507,
"learning_rate": 1.5003990703004994e-06,
"loss": 0.0357,
"step": 545
},
{
"epoch": 0.7578070784177654,
"grad_norm": 0.16066500544548035,
"learning_rate": 1.4843844578926863e-06,
"loss": 0.0308,
"step": 546
},
{
"epoch": 0.7591950034698126,
"grad_norm": 0.27240243554115295,
"learning_rate": 1.4684408598221722e-06,
"loss": 0.0535,
"step": 547
},
{
"epoch": 0.7605829285218598,
"grad_norm": 0.27581334114074707,
"learning_rate": 1.452568598144668e-06,
"loss": 0.0387,
"step": 548
},
{
"epoch": 0.761970853573907,
"grad_norm": 0.20905457437038422,
"learning_rate": 1.4367679934749085e-06,
"loss": 0.0388,
"step": 549
},
{
"epoch": 0.7633587786259542,
"grad_norm": 0.20028021931648254,
"learning_rate": 1.421039364980178e-06,
"loss": 0.0328,
"step": 550
},
{
"epoch": 0.7647467036780013,
"grad_norm": 0.24100172519683838,
"learning_rate": 1.405383030373867e-06,
"loss": 0.0371,
"step": 551
},
{
"epoch": 0.7661346287300486,
"grad_norm": 0.233174666762352,
"learning_rate": 1.3897993059090492e-06,
"loss": 0.0402,
"step": 552
},
{
"epoch": 0.7675225537820958,
"grad_norm": 0.2091444879770279,
"learning_rate": 1.374288506372099e-06,
"loss": 0.0442,
"step": 553
},
{
"epoch": 0.768910478834143,
"grad_norm": 0.20137923955917358,
"learning_rate": 1.3588509450763281e-06,
"loss": 0.04,
"step": 554
},
{
"epoch": 0.7702984038861902,
"grad_norm": 0.24194765090942383,
"learning_rate": 1.3434869338556594e-06,
"loss": 0.0366,
"step": 555
},
{
"epoch": 0.7716863289382373,
"grad_norm": 0.24912679195404053,
"learning_rate": 1.3281967830583264e-06,
"loss": 0.0416,
"step": 556
},
{
"epoch": 0.7730742539902845,
"grad_norm": 0.6993520259857178,
"learning_rate": 1.3129808015406064e-06,
"loss": 0.0495,
"step": 557
},
{
"epoch": 0.7744621790423317,
"grad_norm": 0.2714241147041321,
"learning_rate": 1.297839296660579e-06,
"loss": 0.0429,
"step": 558
},
{
"epoch": 0.7758501040943789,
"grad_norm": 0.26390236616134644,
"learning_rate": 1.2827725742719205e-06,
"loss": 0.0403,
"step": 559
},
{
"epoch": 0.7772380291464261,
"grad_norm": 0.19193479418754578,
"learning_rate": 1.267780938717722e-06,
"loss": 0.0316,
"step": 560
},
{
"epoch": 0.7786259541984732,
"grad_norm": 0.31760939955711365,
"learning_rate": 1.252864692824346e-06,
"loss": 0.0506,
"step": 561
},
{
"epoch": 0.7800138792505205,
"grad_norm": 0.22748614847660065,
"learning_rate": 1.2380241378953067e-06,
"loss": 0.0465,
"step": 562
},
{
"epoch": 0.7814018043025677,
"grad_norm": 0.17882980406284332,
"learning_rate": 1.223259573705184e-06,
"loss": 0.0356,
"step": 563
},
{
"epoch": 0.7827897293546149,
"grad_norm": 0.20773129165172577,
"learning_rate": 1.2085712984935693e-06,
"loss": 0.0346,
"step": 564
},
{
"epoch": 0.7841776544066621,
"grad_norm": 0.2179643213748932,
"learning_rate": 1.1939596089590394e-06,
"loss": 0.0395,
"step": 565
},
{
"epoch": 0.7855655794587092,
"grad_norm": 0.19541744887828827,
"learning_rate": 1.1794248002531644e-06,
"loss": 0.0348,
"step": 566
},
{
"epoch": 0.7869535045107564,
"grad_norm": 0.24888668954372406,
"learning_rate": 1.1649671659745504e-06,
"loss": 0.0481,
"step": 567
},
{
"epoch": 0.7883414295628036,
"grad_norm": 0.20198383927345276,
"learning_rate": 1.1505869981628953e-06,
"loss": 0.0406,
"step": 568
},
{
"epoch": 0.7897293546148508,
"grad_norm": 0.2493666559457779,
"learning_rate": 1.1362845872931044e-06,
"loss": 0.037,
"step": 569
},
{
"epoch": 0.7911172796668979,
"grad_norm": 0.22709275782108307,
"learning_rate": 1.1220602222694166e-06,
"loss": 0.0426,
"step": 570
},
{
"epoch": 0.7925052047189451,
"grad_norm": 0.25838497281074524,
"learning_rate": 1.1079141904195662e-06,
"loss": 0.0486,
"step": 571
},
{
"epoch": 0.7938931297709924,
"grad_norm": 0.2124512940645218,
"learning_rate": 1.0938467774889883e-06,
"loss": 0.034,
"step": 572
},
{
"epoch": 0.7952810548230396,
"grad_norm": 0.22879303991794586,
"learning_rate": 1.0798582676350316e-06,
"loss": 0.0444,
"step": 573
},
{
"epoch": 0.7966689798750868,
"grad_norm": 0.19821453094482422,
"learning_rate": 1.0659489434212323e-06,
"loss": 0.0365,
"step": 574
},
{
"epoch": 0.7980569049271339,
"grad_norm": 0.184886172413826,
"learning_rate": 1.0521190858116042e-06,
"loss": 0.031,
"step": 575
},
{
"epoch": 0.7994448299791811,
"grad_norm": 0.22053441405296326,
"learning_rate": 1.0383689741649516e-06,
"loss": 0.0477,
"step": 576
},
{
"epoch": 0.8008327550312283,
"grad_norm": 0.21095865964889526,
"learning_rate": 1.0246988862292462e-06,
"loss": 0.0315,
"step": 577
},
{
"epoch": 0.8022206800832755,
"grad_norm": 0.21625402569770813,
"learning_rate": 1.0111090981359961e-06,
"loss": 0.044,
"step": 578
},
{
"epoch": 0.8036086051353227,
"grad_norm": 0.2409215271472931,
"learning_rate": 9.975998843946811e-07,
"loss": 0.0504,
"step": 579
},
{
"epoch": 0.8049965301873698,
"grad_norm": 0.18764494359493256,
"learning_rate": 9.841715178872092e-07,
"loss": 0.035,
"step": 580
},
{
"epoch": 0.806384455239417,
"grad_norm": 0.25301676988601685,
"learning_rate": 9.708242698623898e-07,
"loss": 0.0476,
"step": 581
},
{
"epoch": 0.8077723802914643,
"grad_norm": 0.23698049783706665,
"learning_rate": 9.575584099304735e-07,
"loss": 0.0334,
"step": 582
},
{
"epoch": 0.8091603053435115,
"grad_norm": 0.2794058918952942,
"learning_rate": 9.443742060576916e-07,
"loss": 0.0515,
"step": 583
},
{
"epoch": 0.8105482303955587,
"grad_norm": 0.31980571150779724,
"learning_rate": 9.312719245608487e-07,
"loss": 0.0617,
"step": 584
},
{
"epoch": 0.8119361554476058,
"grad_norm": 0.21514901518821716,
"learning_rate": 9.182518301019466e-07,
"loss": 0.0445,
"step": 585
},
{
"epoch": 0.813324080499653,
"grad_norm": 0.1985836774110794,
"learning_rate": 9.053141856828274e-07,
"loss": 0.036,
"step": 586
},
{
"epoch": 0.8147120055517002,
"grad_norm": 0.22017304599285126,
"learning_rate": 8.924592526398762e-07,
"loss": 0.0412,
"step": 587
},
{
"epoch": 0.8160999306037474,
"grad_norm": 0.18507330119609833,
"learning_rate": 8.796872906387299e-07,
"loss": 0.0374,
"step": 588
},
{
"epoch": 0.8174878556557946,
"grad_norm": 0.16561415791511536,
"learning_rate": 8.669985576690371e-07,
"loss": 0.0286,
"step": 589
},
{
"epoch": 0.8188757807078417,
"grad_norm": 0.2667452096939087,
"learning_rate": 8.54393310039246e-07,
"loss": 0.0429,
"step": 590
},
{
"epoch": 0.8202637057598889,
"grad_norm": 0.28538012504577637,
"learning_rate": 8.418718023714235e-07,
"loss": 0.0354,
"step": 591
},
{
"epoch": 0.8216516308119362,
"grad_norm": 0.25441908836364746,
"learning_rate": 8.29434287596122e-07,
"loss": 0.0485,
"step": 592
},
{
"epoch": 0.8230395558639834,
"grad_norm": 0.24638378620147705,
"learning_rate": 8.170810169472593e-07,
"loss": 0.0403,
"step": 593
},
{
"epoch": 0.8244274809160306,
"grad_norm": 0.19534482061862946,
"learning_rate": 8.04812239957049e-07,
"loss": 0.0365,
"step": 594
},
{
"epoch": 0.8258154059680777,
"grad_norm": 0.17373257875442505,
"learning_rate": 7.926282044509593e-07,
"loss": 0.0271,
"step": 595
},
{
"epoch": 0.8272033310201249,
"grad_norm": 0.2546538710594177,
"learning_rate": 7.805291565427065e-07,
"loss": 0.0451,
"step": 596
},
{
"epoch": 0.8285912560721721,
"grad_norm": 0.19341765344142914,
"learning_rate": 7.685153406292845e-07,
"loss": 0.0365,
"step": 597
},
{
"epoch": 0.8299791811242193,
"grad_norm": 0.20034624636173248,
"learning_rate": 7.56586999386027e-07,
"loss": 0.033,
"step": 598
},
{
"epoch": 0.8313671061762665,
"grad_norm": 0.201642245054245,
"learning_rate": 7.447443737617066e-07,
"loss": 0.0263,
"step": 599
},
{
"epoch": 0.8327550312283136,
"grad_norm": 0.2718977928161621,
"learning_rate": 7.329877029736665e-07,
"loss": 0.0461,
"step": 600
},
{
"epoch": 0.8341429562803608,
"grad_norm": 0.21784666180610657,
"learning_rate": 7.213172245029892e-07,
"loss": 0.0336,
"step": 601
},
{
"epoch": 0.835530881332408,
"grad_norm": 0.21657319366931915,
"learning_rate": 7.097331740896995e-07,
"loss": 0.0384,
"step": 602
},
{
"epoch": 0.8369188063844553,
"grad_norm": 0.17798687517642975,
"learning_rate": 6.98235785728002e-07,
"loss": 0.0325,
"step": 603
},
{
"epoch": 0.8383067314365025,
"grad_norm": 0.26031696796417236,
"learning_rate": 6.868252916615553e-07,
"loss": 0.0392,
"step": 604
},
{
"epoch": 0.8396946564885496,
"grad_norm": 0.26929226517677307,
"learning_rate": 6.755019223787807e-07,
"loss": 0.0454,
"step": 605
},
{
"epoch": 0.8410825815405968,
"grad_norm": 0.22964255511760712,
"learning_rate": 6.642659066082046e-07,
"loss": 0.0392,
"step": 606
},
{
"epoch": 0.842470506592644,
"grad_norm": 0.2255278080701828,
"learning_rate": 6.531174713138416e-07,
"loss": 0.0466,
"step": 607
},
{
"epoch": 0.8438584316446912,
"grad_norm": 0.25335291028022766,
"learning_rate": 6.420568416906059e-07,
"loss": 0.0422,
"step": 608
},
{
"epoch": 0.8452463566967384,
"grad_norm": 0.22428208589553833,
"learning_rate": 6.310842411597667e-07,
"loss": 0.0426,
"step": 609
},
{
"epoch": 0.8466342817487855,
"grad_norm": 0.20912961661815643,
"learning_rate": 6.201998913644319e-07,
"loss": 0.0356,
"step": 610
},
{
"epoch": 0.8480222068008327,
"grad_norm": 0.27327394485473633,
"learning_rate": 6.094040121650719e-07,
"loss": 0.0462,
"step": 611
},
{
"epoch": 0.84941013185288,
"grad_norm": 0.2542797029018402,
"learning_rate": 5.986968216350786e-07,
"loss": 0.0304,
"step": 612
},
{
"epoch": 0.8507980569049272,
"grad_norm": 0.18343311548233032,
"learning_rate": 5.880785360563596e-07,
"loss": 0.0357,
"step": 613
},
{
"epoch": 0.8521859819569744,
"grad_norm": 0.1442384272813797,
"learning_rate": 5.775493699149754e-07,
"loss": 0.0243,
"step": 614
},
{
"epoch": 0.8535739070090215,
"grad_norm": 0.23244093358516693,
"learning_rate": 5.671095358967926e-07,
"loss": 0.0391,
"step": 615
},
{
"epoch": 0.8549618320610687,
"grad_norm": 0.20659306645393372,
"learning_rate": 5.56759244883206e-07,
"loss": 0.0315,
"step": 616
},
{
"epoch": 0.8563497571131159,
"grad_norm": 0.2342347949743271,
"learning_rate": 5.464987059468629e-07,
"loss": 0.0483,
"step": 617
},
{
"epoch": 0.8577376821651631,
"grad_norm": 0.2304675132036209,
"learning_rate": 5.36328126347449e-07,
"loss": 0.047,
"step": 618
},
{
"epoch": 0.8591256072172103,
"grad_norm": 0.24737447500228882,
"learning_rate": 5.262477115275022e-07,
"loss": 0.0407,
"step": 619
},
{
"epoch": 0.8605135322692574,
"grad_norm": 0.22765293717384338,
"learning_rate": 5.162576651082541e-07,
"loss": 0.036,
"step": 620
},
{
"epoch": 0.8619014573213046,
"grad_norm": 0.23714007437229156,
"learning_rate": 5.063581888855285e-07,
"loss": 0.0502,
"step": 621
},
{
"epoch": 0.8632893823733518,
"grad_norm": 0.1933499425649643,
"learning_rate": 4.965494828256573e-07,
"loss": 0.0313,
"step": 622
},
{
"epoch": 0.864677307425399,
"grad_norm": 0.3374840021133423,
"learning_rate": 4.868317450614407e-07,
"loss": 0.042,
"step": 623
},
{
"epoch": 0.8660652324774463,
"grad_norm": 0.24019359052181244,
"learning_rate": 4.772051718881532e-07,
"loss": 0.0438,
"step": 624
},
{
"epoch": 0.8674531575294934,
"grad_norm": 0.17875275015830994,
"learning_rate": 4.676699577595667e-07,
"loss": 0.0375,
"step": 625
},
{
"epoch": 0.8688410825815406,
"grad_norm": 0.22650332748889923,
"learning_rate": 4.582262952840355e-07,
"loss": 0.0362,
"step": 626
},
{
"epoch": 0.8702290076335878,
"grad_norm": 0.19926731288433075,
"learning_rate": 4.4887437522059487e-07,
"loss": 0.0314,
"step": 627
},
{
"epoch": 0.871616932685635,
"grad_norm": 0.22209225594997406,
"learning_rate": 4.3961438647511066e-07,
"loss": 0.0325,
"step": 628
},
{
"epoch": 0.8730048577376822,
"grad_norm": 0.23820732533931732,
"learning_rate": 4.304465160964699e-07,
"loss": 0.0392,
"step": 629
},
{
"epoch": 0.8743927827897293,
"grad_norm": 0.1484634429216385,
"learning_rate": 4.2137094927279296e-07,
"loss": 0.0271,
"step": 630
},
{
"epoch": 0.8757807078417765,
"grad_norm": 0.19037142395973206,
"learning_rate": 4.1238786932769947e-07,
"loss": 0.0316,
"step": 631
},
{
"epoch": 0.8771686328938237,
"grad_norm": 0.25099167227745056,
"learning_rate": 4.0349745771660233e-07,
"loss": 0.039,
"step": 632
},
{
"epoch": 0.878556557945871,
"grad_norm": 0.28284958004951477,
"learning_rate": 3.946998940230401e-07,
"loss": 0.0475,
"step": 633
},
{
"epoch": 0.8799444829979182,
"grad_norm": 0.24289296567440033,
"learning_rate": 3.859953559550589e-07,
"loss": 0.0457,
"step": 634
},
{
"epoch": 0.8813324080499653,
"grad_norm": 0.17973592877388,
"learning_rate": 3.7738401934161006e-07,
"loss": 0.0342,
"step": 635
},
{
"epoch": 0.8827203331020125,
"grad_norm": 0.24480481445789337,
"learning_rate": 3.6886605812900766e-07,
"loss": 0.0502,
"step": 636
},
{
"epoch": 0.8841082581540597,
"grad_norm": 0.2745607793331146,
"learning_rate": 3.604416443774117e-07,
"loss": 0.0503,
"step": 637
},
{
"epoch": 0.8854961832061069,
"grad_norm": 0.2708338499069214,
"learning_rate": 3.5211094825735147e-07,
"loss": 0.0434,
"step": 638
},
{
"epoch": 0.8868841082581541,
"grad_norm": 0.2643817663192749,
"learning_rate": 3.4387413804628955e-07,
"loss": 0.0442,
"step": 639
},
{
"epoch": 0.8882720333102012,
"grad_norm": 0.17992961406707764,
"learning_rate": 3.357313801252238e-07,
"loss": 0.0319,
"step": 640
},
{
"epoch": 0.8896599583622484,
"grad_norm": 0.19240032136440277,
"learning_rate": 3.276828389753234e-07,
"loss": 0.0362,
"step": 641
},
{
"epoch": 0.8910478834142956,
"grad_norm": 0.2068304866552353,
"learning_rate": 3.197286771746094e-07,
"loss": 0.0326,
"step": 642
},
{
"epoch": 0.8924358084663429,
"grad_norm": 0.3868578374385834,
"learning_rate": 3.118690553946685e-07,
"loss": 0.0312,
"step": 643
},
{
"epoch": 0.8938237335183901,
"grad_norm": 0.1948038786649704,
"learning_rate": 3.041041323974098e-07,
"loss": 0.0374,
"step": 644
},
{
"epoch": 0.8952116585704372,
"grad_norm": 0.3001810908317566,
"learning_rate": 2.964340650318548e-07,
"loss": 0.0548,
"step": 645
},
{
"epoch": 0.8965995836224844,
"grad_norm": 0.20546355843544006,
"learning_rate": 2.8885900823097223e-07,
"loss": 0.0412,
"step": 646
},
{
"epoch": 0.8979875086745316,
"grad_norm": 0.19419418275356293,
"learning_rate": 2.813791150085454e-07,
"loss": 0.0401,
"step": 647
},
{
"epoch": 0.8993754337265788,
"grad_norm": 0.1783968210220337,
"learning_rate": 2.73994536456087e-07,
"loss": 0.0308,
"step": 648
},
{
"epoch": 0.9007633587786259,
"grad_norm": 0.22798137366771698,
"learning_rate": 2.6670542173977745e-07,
"loss": 0.0422,
"step": 649
},
{
"epoch": 0.9021512838306731,
"grad_norm": 0.28471091389656067,
"learning_rate": 2.5951191809746146e-07,
"loss": 0.0583,
"step": 650
},
{
"epoch": 0.9035392088827203,
"grad_norm": 0.239897683262825,
"learning_rate": 2.524141708356681e-07,
"loss": 0.0455,
"step": 651
},
{
"epoch": 0.9049271339347675,
"grad_norm": 0.21840114891529083,
"learning_rate": 2.454123233266781e-07,
"loss": 0.0396,
"step": 652
},
{
"epoch": 0.9063150589868147,
"grad_norm": 0.23024079203605652,
"learning_rate": 2.385065170056283e-07,
"loss": 0.0395,
"step": 653
},
{
"epoch": 0.9077029840388618,
"grad_norm": 0.23643670976161957,
"learning_rate": 2.3169689136765038e-07,
"loss": 0.0447,
"step": 654
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.24520286917686462,
"learning_rate": 2.249835839650588e-07,
"loss": 0.0476,
"step": 655
},
{
"epoch": 0.9104788341429563,
"grad_norm": 0.21127338707447052,
"learning_rate": 2.1836673040456947e-07,
"loss": 0.0388,
"step": 656
},
{
"epoch": 0.9118667591950035,
"grad_norm": 0.1968703418970108,
"learning_rate": 2.1184646434455947e-07,
"loss": 0.0328,
"step": 657
},
{
"epoch": 0.9132546842470507,
"grad_norm": 0.2115575224161148,
"learning_rate": 2.0542291749237053e-07,
"loss": 0.0352,
"step": 658
},
{
"epoch": 0.9146426092990978,
"grad_norm": 0.15520645678043365,
"learning_rate": 1.9909621960164382e-07,
"loss": 0.0296,
"step": 659
},
{
"epoch": 0.916030534351145,
"grad_norm": 0.6320406198501587,
"learning_rate": 1.9286649846970318e-07,
"loss": 0.0363,
"step": 660
},
{
"epoch": 0.9174184594031922,
"grad_norm": 0.2276618331670761,
"learning_rate": 1.8673387993497383e-07,
"loss": 0.0359,
"step": 661
},
{
"epoch": 0.9188063844552394,
"grad_norm": 0.2655383050441742,
"learning_rate": 1.8069848787443556e-07,
"loss": 0.0506,
"step": 662
},
{
"epoch": 0.9201943095072866,
"grad_norm": 0.22439704835414886,
"learning_rate": 1.7476044420112637e-07,
"loss": 0.0292,
"step": 663
},
{
"epoch": 0.9215822345593337,
"grad_norm": 0.18324856460094452,
"learning_rate": 1.689198688616761e-07,
"loss": 0.0285,
"step": 664
},
{
"epoch": 0.922970159611381,
"grad_norm": 0.2790418267250061,
"learning_rate": 1.631768798338834e-07,
"loss": 0.0417,
"step": 665
},
{
"epoch": 0.9243580846634282,
"grad_norm": 0.22626622021198273,
"learning_rate": 1.5753159312433762e-07,
"loss": 0.0432,
"step": 666
},
{
"epoch": 0.9257460097154754,
"grad_norm": 0.22337311506271362,
"learning_rate": 1.5198412276606622e-07,
"loss": 0.0379,
"step": 667
},
{
"epoch": 0.9271339347675226,
"grad_norm": 0.1884629875421524,
"learning_rate": 1.465345808162427e-07,
"loss": 0.0315,
"step": 668
},
{
"epoch": 0.9285218598195697,
"grad_norm": 0.18185679614543915,
"learning_rate": 1.4118307735391412e-07,
"loss": 0.0322,
"step": 669
},
{
"epoch": 0.9299097848716169,
"grad_norm": 0.20479874312877655,
"learning_rate": 1.3592972047777874e-07,
"loss": 0.0303,
"step": 670
},
{
"epoch": 0.9312977099236641,
"grad_norm": 0.3006023168563843,
"learning_rate": 1.3077461630400967e-07,
"loss": 0.055,
"step": 671
},
{
"epoch": 0.9326856349757113,
"grad_norm": 0.27527281641960144,
"learning_rate": 1.2571786896410144e-07,
"loss": 0.0491,
"step": 672
},
{
"epoch": 0.9340735600277585,
"grad_norm": 0.23598523437976837,
"learning_rate": 1.2075958060277394e-07,
"loss": 0.0412,
"step": 673
},
{
"epoch": 0.9354614850798056,
"grad_norm": 0.21559569239616394,
"learning_rate": 1.158998513759052e-07,
"loss": 0.04,
"step": 674
},
{
"epoch": 0.9368494101318529,
"grad_norm": 0.18777711689472198,
"learning_rate": 1.1113877944850804e-07,
"loss": 0.0287,
"step": 675
},
{
"epoch": 0.9382373351839001,
"grad_norm": 0.22608903050422668,
"learning_rate": 1.0647646099275267e-07,
"loss": 0.0501,
"step": 676
},
{
"epoch": 0.9396252602359473,
"grad_norm": 0.23553456366062164,
"learning_rate": 1.0191299018601608e-07,
"loss": 0.0369,
"step": 677
},
{
"epoch": 0.9410131852879945,
"grad_norm": 0.17608265578746796,
"learning_rate": 9.744845920898527e-08,
"loss": 0.0286,
"step": 678
},
{
"epoch": 0.9424011103400416,
"grad_norm": 0.26702162623405457,
"learning_rate": 9.308295824379365e-08,
"loss": 0.0421,
"step": 679
},
{
"epoch": 0.9437890353920888,
"grad_norm": 0.2531726360321045,
"learning_rate": 8.881657547219869e-08,
"loss": 0.0409,
"step": 680
},
{
"epoch": 0.945176960444136,
"grad_norm": 0.18312399089336395,
"learning_rate": 8.46493970738016e-08,
"loss": 0.0284,
"step": 681
},
{
"epoch": 0.9465648854961832,
"grad_norm": 0.20322586596012115,
"learning_rate": 8.058150722430658e-08,
"loss": 0.0353,
"step": 682
},
{
"epoch": 0.9479528105482304,
"grad_norm": 0.23744148015975952,
"learning_rate": 7.661298809381878e-08,
"loss": 0.0381,
"step": 683
},
{
"epoch": 0.9493407356002775,
"grad_norm": 0.17557460069656372,
"learning_rate": 7.274391984518736e-08,
"loss": 0.0311,
"step": 684
},
{
"epoch": 0.9507286606523248,
"grad_norm": 0.2498820275068283,
"learning_rate": 6.897438063238393e-08,
"loss": 0.0446,
"step": 685
},
{
"epoch": 0.952116585704372,
"grad_norm": 0.2372819185256958,
"learning_rate": 6.530444659892443e-08,
"loss": 0.0427,
"step": 686
},
{
"epoch": 0.9535045107564192,
"grad_norm": 0.18727374076843262,
"learning_rate": 6.173419187633201e-08,
"loss": 0.0341,
"step": 687
},
{
"epoch": 0.9548924358084664,
"grad_norm": 0.24864086508750916,
"learning_rate": 5.82636885826382e-08,
"loss": 0.045,
"step": 688
},
{
"epoch": 0.9562803608605135,
"grad_norm": 0.2669983506202698,
"learning_rate": 5.4893006820926355e-08,
"loss": 0.0377,
"step": 689
},
{
"epoch": 0.9576682859125607,
"grad_norm": 0.36765196919441223,
"learning_rate": 5.162221467791772e-08,
"loss": 0.0432,
"step": 690
},
{
"epoch": 0.9590562109646079,
"grad_norm": 0.18304842710494995,
"learning_rate": 4.8451378222592605e-08,
"loss": 0.0356,
"step": 691
},
{
"epoch": 0.9604441360166551,
"grad_norm": 0.24612656235694885,
"learning_rate": 4.5380561504858586e-08,
"loss": 0.0385,
"step": 692
},
{
"epoch": 0.9618320610687023,
"grad_norm": 0.2270457148551941,
"learning_rate": 4.240982655425552e-08,
"loss": 0.0505,
"step": 693
},
{
"epoch": 0.9632199861207494,
"grad_norm": 0.1908656507730484,
"learning_rate": 3.953923337870147e-08,
"loss": 0.0359,
"step": 694
},
{
"epoch": 0.9646079111727967,
"grad_norm": 0.1745605766773224,
"learning_rate": 3.6768839963285395e-08,
"loss": 0.0304,
"step": 695
},
{
"epoch": 0.9659958362248439,
"grad_norm": 0.23264414072036743,
"learning_rate": 3.409870226908863e-08,
"loss": 0.0437,
"step": 696
},
{
"epoch": 0.9673837612768911,
"grad_norm": 0.2552529573440552,
"learning_rate": 3.1528874232059635e-08,
"loss": 0.0507,
"step": 697
},
{
"epoch": 0.9687716863289383,
"grad_norm": 0.22192077338695526,
"learning_rate": 2.905940776192384e-08,
"loss": 0.0445,
"step": 698
},
{
"epoch": 0.9701596113809854,
"grad_norm": 0.23040442168712616,
"learning_rate": 2.669035274113274e-08,
"loss": 0.0415,
"step": 699
},
{
"epoch": 0.9715475364330326,
"grad_norm": 0.21938888728618622,
"learning_rate": 2.4421757023859737e-08,
"loss": 0.0369,
"step": 700
},
{
"epoch": 0.9729354614850798,
"grad_norm": 0.19337037205696106,
"learning_rate": 2.2253666435029797e-08,
"loss": 0.0311,
"step": 701
},
{
"epoch": 0.974323386537127,
"grad_norm": 0.20000146329402924,
"learning_rate": 2.0186124769396855e-08,
"loss": 0.0371,
"step": 702
},
{
"epoch": 0.9757113115891742,
"grad_norm": 0.22154293954372406,
"learning_rate": 1.8219173790658406e-08,
"loss": 0.0336,
"step": 703
},
{
"epoch": 0.9770992366412213,
"grad_norm": 0.234355166554451,
"learning_rate": 1.6352853230609534e-08,
"loss": 0.0438,
"step": 704
},
{
"epoch": 0.9784871616932685,
"grad_norm": 0.20534580945968628,
"learning_rate": 1.4587200788343524e-08,
"loss": 0.0301,
"step": 705
},
{
"epoch": 0.9798750867453158,
"grad_norm": 0.22104544937610626,
"learning_rate": 1.2922252129489165e-08,
"loss": 0.0442,
"step": 706
},
{
"epoch": 0.981263011797363,
"grad_norm": 0.23223792016506195,
"learning_rate": 1.1358040885490196e-08,
"loss": 0.0369,
"step": 707
},
{
"epoch": 0.9826509368494102,
"grad_norm": 0.1923191100358963,
"learning_rate": 9.894598652925857e-09,
"loss": 0.0356,
"step": 708
},
{
"epoch": 0.9840388619014573,
"grad_norm": 0.2507604956626892,
"learning_rate": 8.53195499287196e-09,
"loss": 0.0472,
"step": 709
},
{
"epoch": 0.9854267869535045,
"grad_norm": 0.2132437825202942,
"learning_rate": 7.2701374303063565e-09,
"loss": 0.0381,
"step": 710
},
{
"epoch": 0.9868147120055517,
"grad_norm": 0.16438357532024384,
"learning_rate": 6.109171453549944e-09,
"loss": 0.028,
"step": 711
},
{
"epoch": 0.9882026370575989,
"grad_norm": 0.24717937409877777,
"learning_rate": 5.049080513752636e-09,
"loss": 0.0439,
"step": 712
},
{
"epoch": 0.9895905621096461,
"grad_norm": 0.2316623479127884,
"learning_rate": 4.089886024421508e-09,
"loss": 0.0378,
"step": 713
},
{
"epoch": 0.9909784871616932,
"grad_norm": 0.2512715756893158,
"learning_rate": 3.2316073609856e-09,
"loss": 0.0378,
"step": 714
},
{
"epoch": 0.9923664122137404,
"grad_norm": 0.27887260913848877,
"learning_rate": 2.474261860406779e-09,
"loss": 0.0537,
"step": 715
},
{
"epoch": 0.9937543372657877,
"grad_norm": 0.19671159982681274,
"learning_rate": 1.817864820827242e-09,
"loss": 0.0277,
"step": 716
},
{
"epoch": 0.9951422623178349,
"grad_norm": 0.22438108921051025,
"learning_rate": 1.2624295012625409e-09,
"loss": 0.0389,
"step": 717
},
{
"epoch": 0.9965301873698821,
"grad_norm": 0.17432349920272827,
"learning_rate": 8.079671213334639e-10,
"loss": 0.0333,
"step": 718
},
{
"epoch": 0.9979181124219292,
"grad_norm": 0.24869811534881592,
"learning_rate": 4.5448686103732876e-10,
"loss": 0.0411,
"step": 719
},
{
"epoch": 0.9993060374739764,
"grad_norm": 0.24824249744415283,
"learning_rate": 2.0199586056590669e-10,
"loss": 0.0419,
"step": 720
},
{
"epoch": 1.0,
"grad_norm": 0.2819853723049164,
"learning_rate": 5.049922015887276e-11,
"loss": 0.0334,
"step": 721
},
{
"epoch": 1.0,
"step": 721,
"total_flos": 140605265387520.0,
"train_loss": 0.04385479942129488,
"train_runtime": 20865.6154,
"train_samples_per_second": 0.829,
"train_steps_per_second": 0.035
}
],
"logging_steps": 1,
"max_steps": 721,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 140605265387520.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}