{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.8,
"eval_steps": 500,
"global_step": 1200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004,
"grad_norm": 7.180798530578613,
"learning_rate": 0.0,
"loss": 1.4284,
"step": 1
},
{
"epoch": 0.008,
"grad_norm": 7.071687698364258,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.2964,
"step": 2
},
{
"epoch": 0.012,
"grad_norm": 8.010491371154785,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.5582,
"step": 3
},
{
"epoch": 0.016,
"grad_norm": 6.5002946853637695,
"learning_rate": 3e-06,
"loss": 1.3382,
"step": 4
},
{
"epoch": 0.02,
"grad_norm": 5.291814804077148,
"learning_rate": 4.000000000000001e-06,
"loss": 1.249,
"step": 5
},
{
"epoch": 0.024,
"grad_norm": 5.405974864959717,
"learning_rate": 5e-06,
"loss": 1.4124,
"step": 6
},
{
"epoch": 0.028,
"grad_norm": 3.7647454738616943,
"learning_rate": 6e-06,
"loss": 1.2449,
"step": 7
},
{
"epoch": 0.032,
"grad_norm": 4.103897571563721,
"learning_rate": 7e-06,
"loss": 1.1926,
"step": 8
},
{
"epoch": 0.036,
"grad_norm": 4.49144983291626,
"learning_rate": 8.000000000000001e-06,
"loss": 1.3012,
"step": 9
},
{
"epoch": 0.04,
"grad_norm": 4.010332107543945,
"learning_rate": 9e-06,
"loss": 1.1781,
"step": 10
},
{
"epoch": 0.044,
"grad_norm": 3.8404905796051025,
"learning_rate": 1e-05,
"loss": 1.2206,
"step": 11
},
{
"epoch": 0.048,
"grad_norm": 2.9078028202056885,
"learning_rate": 9.991935483870968e-06,
"loss": 1.0996,
"step": 12
},
{
"epoch": 0.052,
"grad_norm": 3.6060760021209717,
"learning_rate": 9.983870967741936e-06,
"loss": 1.4656,
"step": 13
},
{
"epoch": 0.056,
"grad_norm": 3.05355167388916,
"learning_rate": 9.975806451612904e-06,
"loss": 1.2291,
"step": 14
},
{
"epoch": 0.06,
"grad_norm": 3.198852062225342,
"learning_rate": 9.967741935483871e-06,
"loss": 1.2603,
"step": 15
},
{
"epoch": 0.064,
"grad_norm": 3.001765012741089,
"learning_rate": 9.959677419354839e-06,
"loss": 1.2455,
"step": 16
},
{
"epoch": 0.068,
"grad_norm": 2.4708402156829834,
"learning_rate": 9.951612903225807e-06,
"loss": 1.3416,
"step": 17
},
{
"epoch": 0.072,
"grad_norm": 2.4723620414733887,
"learning_rate": 9.943548387096776e-06,
"loss": 1.3001,
"step": 18
},
{
"epoch": 0.076,
"grad_norm": 2.3858444690704346,
"learning_rate": 9.935483870967742e-06,
"loss": 1.3496,
"step": 19
},
{
"epoch": 0.08,
"grad_norm": 2.5942535400390625,
"learning_rate": 9.927419354838711e-06,
"loss": 1.1709,
"step": 20
},
{
"epoch": 0.084,
"grad_norm": 2.6939024925231934,
"learning_rate": 9.919354838709679e-06,
"loss": 1.3068,
"step": 21
},
{
"epoch": 0.088,
"grad_norm": 2.307511568069458,
"learning_rate": 9.911290322580645e-06,
"loss": 1.0283,
"step": 22
},
{
"epoch": 0.092,
"grad_norm": 2.2905514240264893,
"learning_rate": 9.903225806451614e-06,
"loss": 1.3321,
"step": 23
},
{
"epoch": 0.096,
"grad_norm": 2.527710437774658,
"learning_rate": 9.895161290322582e-06,
"loss": 1.4547,
"step": 24
},
{
"epoch": 0.1,
"grad_norm": 2.5127201080322266,
"learning_rate": 9.88709677419355e-06,
"loss": 1.4558,
"step": 25
},
{
"epoch": 0.104,
"grad_norm": 2.472888708114624,
"learning_rate": 9.879032258064517e-06,
"loss": 1.2011,
"step": 26
},
{
"epoch": 0.108,
"grad_norm": 2.0180959701538086,
"learning_rate": 9.870967741935485e-06,
"loss": 1.1346,
"step": 27
},
{
"epoch": 0.112,
"grad_norm": 2.4276342391967773,
"learning_rate": 9.862903225806453e-06,
"loss": 1.4617,
"step": 28
},
{
"epoch": 0.116,
"grad_norm": 2.316664218902588,
"learning_rate": 9.85483870967742e-06,
"loss": 1.2723,
"step": 29
},
{
"epoch": 0.12,
"grad_norm": 2.4735896587371826,
"learning_rate": 9.846774193548388e-06,
"loss": 1.1424,
"step": 30
},
{
"epoch": 0.124,
"grad_norm": 1.9205927848815918,
"learning_rate": 9.838709677419356e-06,
"loss": 1.035,
"step": 31
},
{
"epoch": 0.128,
"grad_norm": 2.980786085128784,
"learning_rate": 9.830645161290323e-06,
"loss": 1.0171,
"step": 32
},
{
"epoch": 0.132,
"grad_norm": 2.645132303237915,
"learning_rate": 9.822580645161291e-06,
"loss": 1.5191,
"step": 33
},
{
"epoch": 0.136,
"grad_norm": 2.342191696166992,
"learning_rate": 9.814516129032259e-06,
"loss": 1.5577,
"step": 34
},
{
"epoch": 0.14,
"grad_norm": 2.1698765754699707,
"learning_rate": 9.806451612903226e-06,
"loss": 1.3242,
"step": 35
},
{
"epoch": 0.144,
"grad_norm": 2.1578097343444824,
"learning_rate": 9.798387096774194e-06,
"loss": 1.297,
"step": 36
},
{
"epoch": 0.148,
"grad_norm": 2.3967559337615967,
"learning_rate": 9.790322580645162e-06,
"loss": 1.4494,
"step": 37
},
{
"epoch": 0.152,
"grad_norm": 2.436760902404785,
"learning_rate": 9.782258064516131e-06,
"loss": 1.6418,
"step": 38
},
{
"epoch": 0.156,
"grad_norm": 2.287909507751465,
"learning_rate": 9.774193548387097e-06,
"loss": 1.3089,
"step": 39
},
{
"epoch": 0.16,
"grad_norm": 2.4291112422943115,
"learning_rate": 9.766129032258065e-06,
"loss": 1.1915,
"step": 40
},
{
"epoch": 0.164,
"grad_norm": 2.0664143562316895,
"learning_rate": 9.758064516129034e-06,
"loss": 1.3664,
"step": 41
},
{
"epoch": 0.168,
"grad_norm": 2.0575056076049805,
"learning_rate": 9.75e-06,
"loss": 1.0187,
"step": 42
},
{
"epoch": 0.172,
"grad_norm": 2.311537742614746,
"learning_rate": 9.74193548387097e-06,
"loss": 1.3643,
"step": 43
},
{
"epoch": 0.176,
"grad_norm": 2.620941400527954,
"learning_rate": 9.733870967741937e-06,
"loss": 1.4038,
"step": 44
},
{
"epoch": 0.18,
"grad_norm": 2.0609350204467773,
"learning_rate": 9.725806451612903e-06,
"loss": 1.0915,
"step": 45
},
{
"epoch": 0.184,
"grad_norm": 2.421088218688965,
"learning_rate": 9.717741935483872e-06,
"loss": 1.1979,
"step": 46
},
{
"epoch": 0.188,
"grad_norm": 2.348494291305542,
"learning_rate": 9.70967741935484e-06,
"loss": 1.2259,
"step": 47
},
{
"epoch": 0.192,
"grad_norm": 2.0419907569885254,
"learning_rate": 9.701612903225807e-06,
"loss": 1.3981,
"step": 48
},
{
"epoch": 0.196,
"grad_norm": 2.134453058242798,
"learning_rate": 9.693548387096775e-06,
"loss": 1.1282,
"step": 49
},
{
"epoch": 0.2,
"grad_norm": 2.2903339862823486,
"learning_rate": 9.685483870967743e-06,
"loss": 1.0865,
"step": 50
},
{
"epoch": 0.204,
"grad_norm": 1.951794981956482,
"learning_rate": 9.67741935483871e-06,
"loss": 1.2592,
"step": 51
},
{
"epoch": 0.208,
"grad_norm": 1.9836851358413696,
"learning_rate": 9.669354838709678e-06,
"loss": 0.9517,
"step": 52
},
{
"epoch": 0.212,
"grad_norm": 1.9208632707595825,
"learning_rate": 9.661290322580646e-06,
"loss": 1.2101,
"step": 53
},
{
"epoch": 0.216,
"grad_norm": 2.3081140518188477,
"learning_rate": 9.653225806451613e-06,
"loss": 1.4485,
"step": 54
},
{
"epoch": 0.22,
"grad_norm": 2.1603920459747314,
"learning_rate": 9.645161290322581e-06,
"loss": 1.2366,
"step": 55
},
{
"epoch": 0.224,
"grad_norm": 2.097900152206421,
"learning_rate": 9.637096774193549e-06,
"loss": 1.2874,
"step": 56
},
{
"epoch": 0.228,
"grad_norm": 2.3777639865875244,
"learning_rate": 9.629032258064516e-06,
"loss": 1.7109,
"step": 57
},
{
"epoch": 0.232,
"grad_norm": 2.0834310054779053,
"learning_rate": 9.620967741935484e-06,
"loss": 1.0803,
"step": 58
},
{
"epoch": 0.236,
"grad_norm": 2.084362030029297,
"learning_rate": 9.612903225806453e-06,
"loss": 1.2013,
"step": 59
},
{
"epoch": 0.24,
"grad_norm": 2.0952343940734863,
"learning_rate": 9.60483870967742e-06,
"loss": 1.0039,
"step": 60
},
{
"epoch": 0.244,
"grad_norm": 2.1214962005615234,
"learning_rate": 9.596774193548389e-06,
"loss": 1.2694,
"step": 61
},
{
"epoch": 0.248,
"grad_norm": 2.306159734725952,
"learning_rate": 9.588709677419356e-06,
"loss": 1.325,
"step": 62
},
{
"epoch": 0.252,
"grad_norm": 1.8500971794128418,
"learning_rate": 9.580645161290322e-06,
"loss": 1.0988,
"step": 63
},
{
"epoch": 0.256,
"grad_norm": 2.684373140335083,
"learning_rate": 9.572580645161292e-06,
"loss": 1.6293,
"step": 64
},
{
"epoch": 0.26,
"grad_norm": 2.1704890727996826,
"learning_rate": 9.56451612903226e-06,
"loss": 1.4308,
"step": 65
},
{
"epoch": 0.264,
"grad_norm": 2.252781629562378,
"learning_rate": 9.556451612903227e-06,
"loss": 1.3501,
"step": 66
},
{
"epoch": 0.268,
"grad_norm": 1.9889864921569824,
"learning_rate": 9.548387096774195e-06,
"loss": 1.4615,
"step": 67
},
{
"epoch": 0.272,
"grad_norm": 2.2946317195892334,
"learning_rate": 9.540322580645162e-06,
"loss": 1.3951,
"step": 68
},
{
"epoch": 0.276,
"grad_norm": 2.1460459232330322,
"learning_rate": 9.53225806451613e-06,
"loss": 1.1989,
"step": 69
},
{
"epoch": 0.28,
"grad_norm": 2.1480677127838135,
"learning_rate": 9.524193548387098e-06,
"loss": 1.3122,
"step": 70
},
{
"epoch": 0.284,
"grad_norm": 1.9918060302734375,
"learning_rate": 9.516129032258065e-06,
"loss": 0.8457,
"step": 71
},
{
"epoch": 0.288,
"grad_norm": 2.288792371749878,
"learning_rate": 9.508064516129033e-06,
"loss": 0.9798,
"step": 72
},
{
"epoch": 0.292,
"grad_norm": 1.8729608058929443,
"learning_rate": 9.5e-06,
"loss": 1.0014,
"step": 73
},
{
"epoch": 0.296,
"grad_norm": 2.185215473175049,
"learning_rate": 9.491935483870968e-06,
"loss": 1.1505,
"step": 74
},
{
"epoch": 0.3,
"grad_norm": 2.0389175415039062,
"learning_rate": 9.483870967741936e-06,
"loss": 1.1004,
"step": 75
},
{
"epoch": 0.304,
"grad_norm": 1.9789763689041138,
"learning_rate": 9.475806451612905e-06,
"loss": 1.2499,
"step": 76
},
{
"epoch": 0.308,
"grad_norm": 2.25372052192688,
"learning_rate": 9.467741935483871e-06,
"loss": 1.2954,
"step": 77
},
{
"epoch": 0.312,
"grad_norm": 2.0553982257843018,
"learning_rate": 9.459677419354839e-06,
"loss": 1.1647,
"step": 78
},
{
"epoch": 0.316,
"grad_norm": 2.0189993381500244,
"learning_rate": 9.451612903225808e-06,
"loss": 1.263,
"step": 79
},
{
"epoch": 0.32,
"grad_norm": 2.3836658000946045,
"learning_rate": 9.443548387096774e-06,
"loss": 1.2895,
"step": 80
},
{
"epoch": 0.324,
"grad_norm": 2.480907678604126,
"learning_rate": 9.435483870967743e-06,
"loss": 1.4312,
"step": 81
},
{
"epoch": 0.328,
"grad_norm": 2.28787899017334,
"learning_rate": 9.427419354838711e-06,
"loss": 1.2719,
"step": 82
},
{
"epoch": 0.332,
"grad_norm": 2.060723066329956,
"learning_rate": 9.419354838709677e-06,
"loss": 1.2143,
"step": 83
},
{
"epoch": 0.336,
"grad_norm": 1.9831432104110718,
"learning_rate": 9.411290322580646e-06,
"loss": 1.1807,
"step": 84
},
{
"epoch": 0.34,
"grad_norm": 2.140202283859253,
"learning_rate": 9.403225806451614e-06,
"loss": 1.4506,
"step": 85
},
{
"epoch": 0.344,
"grad_norm": 1.962363600730896,
"learning_rate": 9.395161290322582e-06,
"loss": 1.0856,
"step": 86
},
{
"epoch": 0.348,
"grad_norm": 2.1660656929016113,
"learning_rate": 9.38709677419355e-06,
"loss": 1.4112,
"step": 87
},
{
"epoch": 0.352,
"grad_norm": 2.1091177463531494,
"learning_rate": 9.379032258064517e-06,
"loss": 1.429,
"step": 88
},
{
"epoch": 0.356,
"grad_norm": 2.195801019668579,
"learning_rate": 9.370967741935485e-06,
"loss": 0.9717,
"step": 89
},
{
"epoch": 0.36,
"grad_norm": 2.0965685844421387,
"learning_rate": 9.362903225806452e-06,
"loss": 1.0021,
"step": 90
},
{
"epoch": 0.364,
"grad_norm": 2.0085933208465576,
"learning_rate": 9.35483870967742e-06,
"loss": 1.0799,
"step": 91
},
{
"epoch": 0.368,
"grad_norm": 2.3196582794189453,
"learning_rate": 9.346774193548388e-06,
"loss": 1.4787,
"step": 92
},
{
"epoch": 0.372,
"grad_norm": 1.9600956439971924,
"learning_rate": 9.338709677419355e-06,
"loss": 1.0858,
"step": 93
},
{
"epoch": 0.376,
"grad_norm": 1.9554862976074219,
"learning_rate": 9.330645161290323e-06,
"loss": 1.1764,
"step": 94
},
{
"epoch": 0.38,
"grad_norm": 2.0512537956237793,
"learning_rate": 9.32258064516129e-06,
"loss": 1.4589,
"step": 95
},
{
"epoch": 0.384,
"grad_norm": 2.0751116275787354,
"learning_rate": 9.314516129032258e-06,
"loss": 0.9747,
"step": 96
},
{
"epoch": 0.388,
"grad_norm": 2.3022332191467285,
"learning_rate": 9.306451612903226e-06,
"loss": 1.1592,
"step": 97
},
{
"epoch": 0.392,
"grad_norm": 1.7814546823501587,
"learning_rate": 9.298387096774194e-06,
"loss": 0.8474,
"step": 98
},
{
"epoch": 0.396,
"grad_norm": 2.174898862838745,
"learning_rate": 9.290322580645163e-06,
"loss": 1.2139,
"step": 99
},
{
"epoch": 0.4,
"grad_norm": 2.0458245277404785,
"learning_rate": 9.28225806451613e-06,
"loss": 1.427,
"step": 100
},
{
"epoch": 0.404,
"grad_norm": 1.9763966798782349,
"learning_rate": 9.274193548387097e-06,
"loss": 1.3954,
"step": 101
},
{
"epoch": 0.408,
"grad_norm": 2.1670455932617188,
"learning_rate": 9.266129032258066e-06,
"loss": 0.9883,
"step": 102
},
{
"epoch": 0.412,
"grad_norm": 1.9239592552185059,
"learning_rate": 9.258064516129034e-06,
"loss": 1.0499,
"step": 103
},
{
"epoch": 0.416,
"grad_norm": 2.060406446456909,
"learning_rate": 9.250000000000001e-06,
"loss": 1.2469,
"step": 104
},
{
"epoch": 0.42,
"grad_norm": 2.253674030303955,
"learning_rate": 9.241935483870969e-06,
"loss": 1.5057,
"step": 105
},
{
"epoch": 0.424,
"grad_norm": 2.1826705932617188,
"learning_rate": 9.233870967741937e-06,
"loss": 1.4926,
"step": 106
},
{
"epoch": 0.428,
"grad_norm": 2.0552401542663574,
"learning_rate": 9.225806451612904e-06,
"loss": 1.4567,
"step": 107
},
{
"epoch": 0.432,
"grad_norm": 2.0697274208068848,
"learning_rate": 9.217741935483872e-06,
"loss": 1.4088,
"step": 108
},
{
"epoch": 0.436,
"grad_norm": 1.7860722541809082,
"learning_rate": 9.20967741935484e-06,
"loss": 0.9367,
"step": 109
},
{
"epoch": 0.44,
"grad_norm": 2.535959243774414,
"learning_rate": 9.201612903225807e-06,
"loss": 0.9932,
"step": 110
},
{
"epoch": 0.444,
"grad_norm": 2.0369575023651123,
"learning_rate": 9.193548387096775e-06,
"loss": 1.2099,
"step": 111
},
{
"epoch": 0.448,
"grad_norm": 1.94306480884552,
"learning_rate": 9.185483870967742e-06,
"loss": 1.0501,
"step": 112
},
{
"epoch": 0.452,
"grad_norm": 2.0562283992767334,
"learning_rate": 9.17741935483871e-06,
"loss": 1.3824,
"step": 113
},
{
"epoch": 0.456,
"grad_norm": 1.9372371435165405,
"learning_rate": 9.16935483870968e-06,
"loss": 1.0479,
"step": 114
},
{
"epoch": 0.46,
"grad_norm": 2.2749247550964355,
"learning_rate": 9.161290322580645e-06,
"loss": 1.3404,
"step": 115
},
{
"epoch": 0.464,
"grad_norm": 2.251894950866699,
"learning_rate": 9.153225806451613e-06,
"loss": 1.5167,
"step": 116
},
{
"epoch": 0.468,
"grad_norm": 2.301532030105591,
"learning_rate": 9.145161290322582e-06,
"loss": 1.003,
"step": 117
},
{
"epoch": 0.472,
"grad_norm": 2.2831201553344727,
"learning_rate": 9.137096774193548e-06,
"loss": 1.304,
"step": 118
},
{
"epoch": 0.476,
"grad_norm": 2.0962982177734375,
"learning_rate": 9.129032258064518e-06,
"loss": 1.1256,
"step": 119
},
{
"epoch": 0.48,
"grad_norm": 1.9318597316741943,
"learning_rate": 9.120967741935485e-06,
"loss": 0.8632,
"step": 120
},
{
"epoch": 0.484,
"grad_norm": 2.1837871074676514,
"learning_rate": 9.112903225806451e-06,
"loss": 1.481,
"step": 121
},
{
"epoch": 0.488,
"grad_norm": 2.128391742706299,
"learning_rate": 9.10483870967742e-06,
"loss": 1.0906,
"step": 122
},
{
"epoch": 0.492,
"grad_norm": 1.8772141933441162,
"learning_rate": 9.096774193548388e-06,
"loss": 1.0969,
"step": 123
},
{
"epoch": 0.496,
"grad_norm": 2.020388603210449,
"learning_rate": 9.088709677419354e-06,
"loss": 1.1063,
"step": 124
},
{
"epoch": 0.5,
"grad_norm": 2.527641773223877,
"learning_rate": 9.080645161290324e-06,
"loss": 1.1885,
"step": 125
},
{
"epoch": 0.504,
"grad_norm": 2.061718702316284,
"learning_rate": 9.072580645161291e-06,
"loss": 1.2897,
"step": 126
},
{
"epoch": 0.508,
"grad_norm": 2.0752320289611816,
"learning_rate": 9.064516129032259e-06,
"loss": 1.1053,
"step": 127
},
{
"epoch": 0.512,
"grad_norm": 2.0922017097473145,
"learning_rate": 9.056451612903227e-06,
"loss": 1.1852,
"step": 128
},
{
"epoch": 0.516,
"grad_norm": 2.303179979324341,
"learning_rate": 9.048387096774194e-06,
"loss": 1.2175,
"step": 129
},
{
"epoch": 0.52,
"grad_norm": 1.902031660079956,
"learning_rate": 9.040322580645162e-06,
"loss": 0.9485,
"step": 130
},
{
"epoch": 0.524,
"grad_norm": 2.122629165649414,
"learning_rate": 9.03225806451613e-06,
"loss": 1.1103,
"step": 131
},
{
"epoch": 0.528,
"grad_norm": 2.079432725906372,
"learning_rate": 9.024193548387097e-06,
"loss": 1.3427,
"step": 132
},
{
"epoch": 0.532,
"grad_norm": 2.4613113403320312,
"learning_rate": 9.016129032258065e-06,
"loss": 1.3931,
"step": 133
},
{
"epoch": 0.536,
"grad_norm": 2.04841685295105,
"learning_rate": 9.008064516129033e-06,
"loss": 1.4206,
"step": 134
},
{
"epoch": 0.54,
"grad_norm": 2.021791934967041,
"learning_rate": 9e-06,
"loss": 1.2454,
"step": 135
},
{
"epoch": 0.544,
"grad_norm": 2.0248231887817383,
"learning_rate": 8.991935483870968e-06,
"loss": 1.1466,
"step": 136
},
{
"epoch": 0.548,
"grad_norm": 2.7430648803710938,
"learning_rate": 8.983870967741937e-06,
"loss": 1.1914,
"step": 137
},
{
"epoch": 0.552,
"grad_norm": 2.1386618614196777,
"learning_rate": 8.975806451612903e-06,
"loss": 1.1773,
"step": 138
},
{
"epoch": 0.556,
"grad_norm": 1.9300769567489624,
"learning_rate": 8.967741935483871e-06,
"loss": 1.0833,
"step": 139
},
{
"epoch": 0.56,
"grad_norm": 2.113100290298462,
"learning_rate": 8.95967741935484e-06,
"loss": 1.33,
"step": 140
},
{
"epoch": 0.564,
"grad_norm": 1.8950531482696533,
"learning_rate": 8.951612903225806e-06,
"loss": 0.961,
"step": 141
},
{
"epoch": 0.568,
"grad_norm": 2.254307985305786,
"learning_rate": 8.943548387096776e-06,
"loss": 1.0956,
"step": 142
},
{
"epoch": 0.572,
"grad_norm": 2.296546697616577,
"learning_rate": 8.935483870967743e-06,
"loss": 0.9973,
"step": 143
},
{
"epoch": 0.576,
"grad_norm": 2.0698964595794678,
"learning_rate": 8.92741935483871e-06,
"loss": 1.1144,
"step": 144
},
{
"epoch": 0.58,
"grad_norm": 1.9619230031967163,
"learning_rate": 8.919354838709678e-06,
"loss": 0.9931,
"step": 145
},
{
"epoch": 0.584,
"grad_norm": 1.7675387859344482,
"learning_rate": 8.911290322580646e-06,
"loss": 0.9522,
"step": 146
},
{
"epoch": 0.588,
"grad_norm": 1.7958853244781494,
"learning_rate": 8.903225806451614e-06,
"loss": 0.9351,
"step": 147
},
{
"epoch": 0.592,
"grad_norm": 2.1336379051208496,
"learning_rate": 8.895161290322581e-06,
"loss": 1.3449,
"step": 148
},
{
"epoch": 0.596,
"grad_norm": 1.9022713899612427,
"learning_rate": 8.887096774193549e-06,
"loss": 1.2237,
"step": 149
},
{
"epoch": 0.6,
"grad_norm": 1.8845552206039429,
"learning_rate": 8.879032258064517e-06,
"loss": 1.0805,
"step": 150
},
{
"epoch": 0.604,
"grad_norm": 1.8935421705245972,
"learning_rate": 8.870967741935484e-06,
"loss": 0.9688,
"step": 151
},
{
"epoch": 0.608,
"grad_norm": 2.0399153232574463,
"learning_rate": 8.862903225806452e-06,
"loss": 0.826,
"step": 152
},
{
"epoch": 0.612,
"grad_norm": 2.116387128829956,
"learning_rate": 8.85483870967742e-06,
"loss": 1.0629,
"step": 153
},
{
"epoch": 0.616,
"grad_norm": 2.244560480117798,
"learning_rate": 8.846774193548387e-06,
"loss": 1.2835,
"step": 154
},
{
"epoch": 0.62,
"grad_norm": 2.0261640548706055,
"learning_rate": 8.838709677419357e-06,
"loss": 0.8826,
"step": 155
},
{
"epoch": 0.624,
"grad_norm": 2.364264726638794,
"learning_rate": 8.830645161290323e-06,
"loss": 1.0676,
"step": 156
},
{
"epoch": 0.628,
"grad_norm": 2.4923110008239746,
"learning_rate": 8.82258064516129e-06,
"loss": 1.5691,
"step": 157
},
{
"epoch": 0.632,
"grad_norm": 1.8004636764526367,
"learning_rate": 8.81451612903226e-06,
"loss": 0.8545,
"step": 158
},
{
"epoch": 0.636,
"grad_norm": 2.0030720233917236,
"learning_rate": 8.806451612903226e-06,
"loss": 1.1769,
"step": 159
},
{
"epoch": 0.64,
"grad_norm": 1.9849048852920532,
"learning_rate": 8.798387096774195e-06,
"loss": 1.0444,
"step": 160
},
{
"epoch": 0.644,
"grad_norm": 1.9952068328857422,
"learning_rate": 8.790322580645163e-06,
"loss": 1.2118,
"step": 161
},
{
"epoch": 0.648,
"grad_norm": 2.0582053661346436,
"learning_rate": 8.782258064516129e-06,
"loss": 1.4597,
"step": 162
},
{
"epoch": 0.652,
"grad_norm": 1.863604187965393,
"learning_rate": 8.774193548387098e-06,
"loss": 1.3389,
"step": 163
},
{
"epoch": 0.656,
"grad_norm": 1.4981471300125122,
"learning_rate": 8.766129032258066e-06,
"loss": 0.7606,
"step": 164
},
{
"epoch": 0.66,
"grad_norm": 2.0630156993865967,
"learning_rate": 8.758064516129033e-06,
"loss": 0.778,
"step": 165
},
{
"epoch": 0.664,
"grad_norm": 2.324496030807495,
"learning_rate": 8.750000000000001e-06,
"loss": 1.713,
"step": 166
},
{
"epoch": 0.668,
"grad_norm": 1.8909348249435425,
"learning_rate": 8.741935483870969e-06,
"loss": 1.1013,
"step": 167
},
{
"epoch": 0.672,
"grad_norm": 2.0191423892974854,
"learning_rate": 8.733870967741936e-06,
"loss": 1.4583,
"step": 168
},
{
"epoch": 0.676,
"grad_norm": 2.192335605621338,
"learning_rate": 8.725806451612904e-06,
"loss": 1.0042,
"step": 169
},
{
"epoch": 0.68,
"grad_norm": 1.9449669122695923,
"learning_rate": 8.717741935483872e-06,
"loss": 1.2174,
"step": 170
},
{
"epoch": 0.684,
"grad_norm": 2.0404837131500244,
"learning_rate": 8.70967741935484e-06,
"loss": 1.4411,
"step": 171
},
{
"epoch": 0.688,
"grad_norm": 2.162858486175537,
"learning_rate": 8.701612903225807e-06,
"loss": 1.593,
"step": 172
},
{
"epoch": 0.692,
"grad_norm": 2.00299072265625,
"learning_rate": 8.693548387096775e-06,
"loss": 1.0078,
"step": 173
},
{
"epoch": 0.696,
"grad_norm": 1.987657070159912,
"learning_rate": 8.685483870967742e-06,
"loss": 1.2282,
"step": 174
},
{
"epoch": 0.7,
"grad_norm": 1.8024024963378906,
"learning_rate": 8.677419354838712e-06,
"loss": 0.9503,
"step": 175
},
{
"epoch": 0.704,
"grad_norm": 1.9623843431472778,
"learning_rate": 8.669354838709677e-06,
"loss": 1.1567,
"step": 176
},
{
"epoch": 0.708,
"grad_norm": 1.9090282917022705,
"learning_rate": 8.661290322580645e-06,
"loss": 0.9131,
"step": 177
},
{
"epoch": 0.712,
"grad_norm": 1.9411958456039429,
"learning_rate": 8.653225806451614e-06,
"loss": 1.0084,
"step": 178
},
{
"epoch": 0.716,
"grad_norm": 1.9970715045928955,
"learning_rate": 8.64516129032258e-06,
"loss": 1.0465,
"step": 179
},
{
"epoch": 0.72,
"grad_norm": 2.0260093212127686,
"learning_rate": 8.63709677419355e-06,
"loss": 1.1494,
"step": 180
},
{
"epoch": 0.724,
"grad_norm": 2.1397087574005127,
"learning_rate": 8.629032258064517e-06,
"loss": 1.14,
"step": 181
},
{
"epoch": 0.728,
"grad_norm": 2.3102598190307617,
"learning_rate": 8.620967741935483e-06,
"loss": 1.0327,
"step": 182
},
{
"epoch": 0.732,
"grad_norm": 1.8367112874984741,
"learning_rate": 8.612903225806453e-06,
"loss": 1.0707,
"step": 183
},
{
"epoch": 0.736,
"grad_norm": 1.896563172340393,
"learning_rate": 8.60483870967742e-06,
"loss": 1.1511,
"step": 184
},
{
"epoch": 0.74,
"grad_norm": 2.064681053161621,
"learning_rate": 8.596774193548388e-06,
"loss": 1.3135,
"step": 185
},
{
"epoch": 0.744,
"grad_norm": 2.307497024536133,
"learning_rate": 8.588709677419356e-06,
"loss": 1.0588,
"step": 186
},
{
"epoch": 0.748,
"grad_norm": 1.8731898069381714,
"learning_rate": 8.580645161290323e-06,
"loss": 1.0906,
"step": 187
},
{
"epoch": 0.752,
"grad_norm": 1.9699805974960327,
"learning_rate": 8.572580645161291e-06,
"loss": 1.1101,
"step": 188
},
{
"epoch": 0.756,
"grad_norm": 1.9156608581542969,
"learning_rate": 8.564516129032259e-06,
"loss": 0.9281,
"step": 189
},
{
"epoch": 0.76,
"grad_norm": 2.0008058547973633,
"learning_rate": 8.556451612903226e-06,
"loss": 1.2469,
"step": 190
},
{
"epoch": 0.764,
"grad_norm": 2.0837509632110596,
"learning_rate": 8.548387096774194e-06,
"loss": 1.1126,
"step": 191
},
{
"epoch": 0.768,
"grad_norm": 2.146651029586792,
"learning_rate": 8.540322580645162e-06,
"loss": 0.9677,
"step": 192
},
{
"epoch": 0.772,
"grad_norm": 1.7224905490875244,
"learning_rate": 8.53225806451613e-06,
"loss": 0.8243,
"step": 193
},
{
"epoch": 0.776,
"grad_norm": 2.0242717266082764,
"learning_rate": 8.524193548387097e-06,
"loss": 0.9788,
"step": 194
},
{
"epoch": 0.78,
"grad_norm": 2.278810739517212,
"learning_rate": 8.516129032258065e-06,
"loss": 1.7079,
"step": 195
},
{
"epoch": 0.784,
"grad_norm": 2.4467318058013916,
"learning_rate": 8.508064516129034e-06,
"loss": 1.1674,
"step": 196
},
{
"epoch": 0.788,
"grad_norm": 2.0715184211730957,
"learning_rate": 8.5e-06,
"loss": 1.1899,
"step": 197
},
{
"epoch": 0.792,
"grad_norm": 2.1980104446411133,
"learning_rate": 8.49193548387097e-06,
"loss": 1.3366,
"step": 198
},
{
"epoch": 0.796,
"grad_norm": 1.8685814142227173,
"learning_rate": 8.483870967741937e-06,
"loss": 0.9196,
"step": 199
},
{
"epoch": 0.8,
"grad_norm": 2.1090056896209717,
"learning_rate": 8.475806451612903e-06,
"loss": 1.1189,
"step": 200
},
{
"epoch": 0.804,
"grad_norm": 1.9667922258377075,
"learning_rate": 8.467741935483872e-06,
"loss": 0.9924,
"step": 201
},
{
"epoch": 0.808,
"grad_norm": 2.147761106491089,
"learning_rate": 8.45967741935484e-06,
"loss": 1.1266,
"step": 202
},
{
"epoch": 0.812,
"grad_norm": 1.9285451173782349,
"learning_rate": 8.451612903225808e-06,
"loss": 1.0698,
"step": 203
},
{
"epoch": 0.816,
"grad_norm": 1.980491280555725,
"learning_rate": 8.443548387096775e-06,
"loss": 1.3168,
"step": 204
},
{
"epoch": 0.82,
"grad_norm": 2.2818551063537598,
"learning_rate": 8.435483870967743e-06,
"loss": 1.0842,
"step": 205
},
{
"epoch": 0.824,
"grad_norm": 1.848507285118103,
"learning_rate": 8.42741935483871e-06,
"loss": 1.1319,
"step": 206
},
{
"epoch": 0.828,
"grad_norm": 2.248605489730835,
"learning_rate": 8.419354838709678e-06,
"loss": 1.3158,
"step": 207
},
{
"epoch": 0.832,
"grad_norm": 1.948811650276184,
"learning_rate": 8.411290322580646e-06,
"loss": 1.0943,
"step": 208
},
{
"epoch": 0.836,
"grad_norm": 1.8275185823440552,
"learning_rate": 8.403225806451613e-06,
"loss": 1.1819,
"step": 209
},
{
"epoch": 0.84,
"grad_norm": 2.0042636394500732,
"learning_rate": 8.395161290322581e-06,
"loss": 1.0532,
"step": 210
},
{
"epoch": 0.844,
"grad_norm": 1.974337100982666,
"learning_rate": 8.387096774193549e-06,
"loss": 1.2762,
"step": 211
},
{
"epoch": 0.848,
"grad_norm": 1.8739511966705322,
"learning_rate": 8.379032258064516e-06,
"loss": 0.9527,
"step": 212
},
{
"epoch": 0.852,
"grad_norm": 2.174652338027954,
"learning_rate": 8.370967741935484e-06,
"loss": 1.3532,
"step": 213
},
{
"epoch": 0.856,
"grad_norm": 2.062572956085205,
"learning_rate": 8.362903225806452e-06,
"loss": 0.9614,
"step": 214
},
{
"epoch": 0.86,
"grad_norm": 1.943835973739624,
"learning_rate": 8.35483870967742e-06,
"loss": 1.2107,
"step": 215
},
{
"epoch": 0.864,
"grad_norm": 1.712361454963684,
"learning_rate": 8.346774193548389e-06,
"loss": 0.9988,
"step": 216
},
{
"epoch": 0.868,
"grad_norm": 1.9365915060043335,
"learning_rate": 8.338709677419355e-06,
"loss": 1.3026,
"step": 217
},
{
"epoch": 0.872,
"grad_norm": 1.9738059043884277,
"learning_rate": 8.330645161290322e-06,
"loss": 1.5525,
"step": 218
},
{
"epoch": 0.876,
"grad_norm": 1.974971890449524,
"learning_rate": 8.322580645161292e-06,
"loss": 1.0492,
"step": 219
},
{
"epoch": 0.88,
"grad_norm": 1.9549391269683838,
"learning_rate": 8.314516129032258e-06,
"loss": 1.0535,
"step": 220
},
{
"epoch": 0.884,
"grad_norm": 1.8761863708496094,
"learning_rate": 8.306451612903227e-06,
"loss": 1.3403,
"step": 221
},
{
"epoch": 0.888,
"grad_norm": 1.9009919166564941,
"learning_rate": 8.298387096774195e-06,
"loss": 1.0114,
"step": 222
},
{
"epoch": 0.892,
"grad_norm": 1.8550859689712524,
"learning_rate": 8.29032258064516e-06,
"loss": 1.1675,
"step": 223
},
{
"epoch": 0.896,
"grad_norm": 1.727038860321045,
"learning_rate": 8.28225806451613e-06,
"loss": 0.9587,
"step": 224
},
{
"epoch": 0.9,
"grad_norm": 1.8912358283996582,
"learning_rate": 8.274193548387098e-06,
"loss": 1.101,
"step": 225
},
{
"epoch": 0.904,
"grad_norm": 1.8172194957733154,
"learning_rate": 8.266129032258065e-06,
"loss": 1.0145,
"step": 226
},
{
"epoch": 0.908,
"grad_norm": 2.0112392902374268,
"learning_rate": 8.258064516129033e-06,
"loss": 1.1499,
"step": 227
},
{
"epoch": 0.912,
"grad_norm": 2.0185554027557373,
"learning_rate": 8.25e-06,
"loss": 1.5163,
"step": 228
},
{
"epoch": 0.916,
"grad_norm": 1.9389221668243408,
"learning_rate": 8.241935483870968e-06,
"loss": 1.0711,
"step": 229
},
{
"epoch": 0.92,
"grad_norm": 2.015944004058838,
"learning_rate": 8.233870967741936e-06,
"loss": 1.1014,
"step": 230
},
{
"epoch": 0.924,
"grad_norm": 1.83272385597229,
"learning_rate": 8.225806451612904e-06,
"loss": 0.8464,
"step": 231
},
{
"epoch": 0.928,
"grad_norm": 1.9649468660354614,
"learning_rate": 8.217741935483871e-06,
"loss": 1.1034,
"step": 232
},
{
"epoch": 0.932,
"grad_norm": 2.1457502841949463,
"learning_rate": 8.209677419354839e-06,
"loss": 1.3939,
"step": 233
},
{
"epoch": 0.936,
"grad_norm": 1.770036220550537,
"learning_rate": 8.201612903225807e-06,
"loss": 0.9748,
"step": 234
},
{
"epoch": 0.94,
"grad_norm": 1.947691559791565,
"learning_rate": 8.193548387096774e-06,
"loss": 1.014,
"step": 235
},
{
"epoch": 0.944,
"grad_norm": 2.1747374534606934,
"learning_rate": 8.185483870967744e-06,
"loss": 1.2469,
"step": 236
},
{
"epoch": 0.948,
"grad_norm": 2.02323579788208,
"learning_rate": 8.177419354838711e-06,
"loss": 1.2203,
"step": 237
},
{
"epoch": 0.952,
"grad_norm": 1.9631866216659546,
"learning_rate": 8.169354838709677e-06,
"loss": 1.0833,
"step": 238
},
{
"epoch": 0.956,
"grad_norm": 2.2246932983398438,
"learning_rate": 8.161290322580647e-06,
"loss": 1.4365,
"step": 239
},
{
"epoch": 0.96,
"grad_norm": 1.8288980722427368,
"learning_rate": 8.153225806451614e-06,
"loss": 0.9002,
"step": 240
},
{
"epoch": 0.964,
"grad_norm": 2.1047258377075195,
"learning_rate": 8.145161290322582e-06,
"loss": 1.0669,
"step": 241
},
{
"epoch": 0.968,
"grad_norm": 2.1168065071105957,
"learning_rate": 8.13709677419355e-06,
"loss": 1.1497,
"step": 242
},
{
"epoch": 0.972,
"grad_norm": 2.0137243270874023,
"learning_rate": 8.129032258064517e-06,
"loss": 0.9738,
"step": 243
},
{
"epoch": 0.976,
"grad_norm": 1.6851848363876343,
"learning_rate": 8.120967741935485e-06,
"loss": 0.8467,
"step": 244
},
{
"epoch": 0.98,
"grad_norm": 1.9099082946777344,
"learning_rate": 8.112903225806452e-06,
"loss": 1.1605,
"step": 245
},
{
"epoch": 0.984,
"grad_norm": 2.069336414337158,
"learning_rate": 8.10483870967742e-06,
"loss": 0.9732,
"step": 246
},
{
"epoch": 0.988,
"grad_norm": 2.0151474475860596,
"learning_rate": 8.096774193548388e-06,
"loss": 1.0807,
"step": 247
},
{
"epoch": 0.992,
"grad_norm": 1.981397271156311,
"learning_rate": 8.088709677419355e-06,
"loss": 1.2777,
"step": 248
},
{
"epoch": 0.996,
"grad_norm": 1.761157751083374,
"learning_rate": 8.080645161290323e-06,
"loss": 0.8838,
"step": 249
},
{
"epoch": 1.0,
"grad_norm": 2.0894131660461426,
"learning_rate": 8.07258064516129e-06,
"loss": 1.2467,
"step": 250
},
{
"epoch": 1.004,
"grad_norm": 2.042200803756714,
"learning_rate": 8.064516129032258e-06,
"loss": 1.226,
"step": 251
},
{
"epoch": 1.008,
"grad_norm": 1.893397569656372,
"learning_rate": 8.056451612903226e-06,
"loss": 0.9192,
"step": 252
},
{
"epoch": 1.012,
"grad_norm": 1.9246549606323242,
"learning_rate": 8.048387096774194e-06,
"loss": 0.9392,
"step": 253
},
{
"epoch": 1.016,
"grad_norm": 1.8478291034698486,
"learning_rate": 8.040322580645163e-06,
"loss": 1.285,
"step": 254
},
{
"epoch": 1.02,
"grad_norm": 1.9581583738327026,
"learning_rate": 8.032258064516129e-06,
"loss": 1.1652,
"step": 255
},
{
"epoch": 1.024,
"grad_norm": 1.6915805339813232,
"learning_rate": 8.024193548387097e-06,
"loss": 0.7934,
"step": 256
},
{
"epoch": 1.028,
"grad_norm": 1.7810734510421753,
"learning_rate": 8.016129032258066e-06,
"loss": 1.1249,
"step": 257
},
{
"epoch": 1.032,
"grad_norm": 1.6987296342849731,
"learning_rate": 8.008064516129032e-06,
"loss": 0.8559,
"step": 258
},
{
"epoch": 1.036,
"grad_norm": 1.769662857055664,
"learning_rate": 8.000000000000001e-06,
"loss": 1.0913,
"step": 259
},
{
"epoch": 1.04,
"grad_norm": 1.8633121252059937,
"learning_rate": 7.991935483870969e-06,
"loss": 0.8986,
"step": 260
},
{
"epoch": 1.044,
"grad_norm": 2.1259944438934326,
"learning_rate": 7.983870967741935e-06,
"loss": 1.0982,
"step": 261
},
{
"epoch": 1.048,
"grad_norm": 1.9265049695968628,
"learning_rate": 7.975806451612904e-06,
"loss": 1.0264,
"step": 262
},
{
"epoch": 1.052,
"grad_norm": 1.9437800645828247,
"learning_rate": 7.967741935483872e-06,
"loss": 1.2013,
"step": 263
},
{
"epoch": 1.056,
"grad_norm": 1.837809681892395,
"learning_rate": 7.95967741935484e-06,
"loss": 1.3034,
"step": 264
},
{
"epoch": 1.06,
"grad_norm": 1.9760485887527466,
"learning_rate": 7.951612903225807e-06,
"loss": 1.2059,
"step": 265
},
{
"epoch": 1.064,
"grad_norm": 1.8548455238342285,
"learning_rate": 7.943548387096775e-06,
"loss": 0.807,
"step": 266
},
{
"epoch": 1.068,
"grad_norm": 1.6947693824768066,
"learning_rate": 7.935483870967743e-06,
"loss": 0.8937,
"step": 267
},
{
"epoch": 1.072,
"grad_norm": 1.9188228845596313,
"learning_rate": 7.92741935483871e-06,
"loss": 1.1405,
"step": 268
},
{
"epoch": 1.076,
"grad_norm": 1.9461287260055542,
"learning_rate": 7.919354838709678e-06,
"loss": 1.2617,
"step": 269
},
{
"epoch": 1.08,
"grad_norm": 1.8000714778900146,
"learning_rate": 7.911290322580646e-06,
"loss": 1.2302,
"step": 270
},
{
"epoch": 1.084,
"grad_norm": 1.9787355661392212,
"learning_rate": 7.903225806451613e-06,
"loss": 1.2263,
"step": 271
},
{
"epoch": 1.088,
"grad_norm": 1.7489923238754272,
"learning_rate": 7.895161290322581e-06,
"loss": 1.0783,
"step": 272
},
{
"epoch": 1.092,
"grad_norm": 1.8225479125976562,
"learning_rate": 7.887096774193549e-06,
"loss": 0.9199,
"step": 273
},
{
"epoch": 1.096,
"grad_norm": 1.7303674221038818,
"learning_rate": 7.879032258064518e-06,
"loss": 0.9423,
"step": 274
},
{
"epoch": 1.1,
"grad_norm": 1.911805510520935,
"learning_rate": 7.870967741935484e-06,
"loss": 0.9044,
"step": 275
},
{
"epoch": 1.104,
"grad_norm": 1.943996548652649,
"learning_rate": 7.862903225806451e-06,
"loss": 0.9731,
"step": 276
},
{
"epoch": 1.108,
"grad_norm": 2.116546154022217,
"learning_rate": 7.85483870967742e-06,
"loss": 1.3928,
"step": 277
},
{
"epoch": 1.112,
"grad_norm": 1.8358001708984375,
"learning_rate": 7.846774193548388e-06,
"loss": 1.0155,
"step": 278
},
{
"epoch": 1.116,
"grad_norm": 2.039339780807495,
"learning_rate": 7.838709677419354e-06,
"loss": 0.9361,
"step": 279
},
{
"epoch": 1.12,
"grad_norm": 1.9571505784988403,
"learning_rate": 7.830645161290324e-06,
"loss": 0.7762,
"step": 280
},
{
"epoch": 1.124,
"grad_norm": 2.0101685523986816,
"learning_rate": 7.822580645161291e-06,
"loss": 1.1705,
"step": 281
},
{
"epoch": 1.1280000000000001,
"grad_norm": 1.9054312705993652,
"learning_rate": 7.814516129032259e-06,
"loss": 0.9305,
"step": 282
},
{
"epoch": 1.1320000000000001,
"grad_norm": 1.6266229152679443,
"learning_rate": 7.806451612903227e-06,
"loss": 0.7991,
"step": 283
},
{
"epoch": 1.1360000000000001,
"grad_norm": 1.9757091999053955,
"learning_rate": 7.798387096774194e-06,
"loss": 0.9544,
"step": 284
},
{
"epoch": 1.1400000000000001,
"grad_norm": 1.9758026599884033,
"learning_rate": 7.790322580645162e-06,
"loss": 1.2189,
"step": 285
},
{
"epoch": 1.144,
"grad_norm": 1.8889524936676025,
"learning_rate": 7.78225806451613e-06,
"loss": 0.8184,
"step": 286
},
{
"epoch": 1.148,
"grad_norm": 1.9242223501205444,
"learning_rate": 7.774193548387097e-06,
"loss": 1.1267,
"step": 287
},
{
"epoch": 1.152,
"grad_norm": 1.8673722743988037,
"learning_rate": 7.766129032258065e-06,
"loss": 1.0659,
"step": 288
},
{
"epoch": 1.156,
"grad_norm": 2.0750553607940674,
"learning_rate": 7.758064516129033e-06,
"loss": 0.912,
"step": 289
},
{
"epoch": 1.16,
"grad_norm": 1.7121838331222534,
"learning_rate": 7.75e-06,
"loss": 0.7481,
"step": 290
},
{
"epoch": 1.164,
"grad_norm": 1.895327091217041,
"learning_rate": 7.741935483870968e-06,
"loss": 1.0651,
"step": 291
},
{
"epoch": 1.168,
"grad_norm": 1.9888561964035034,
"learning_rate": 7.733870967741937e-06,
"loss": 1.0139,
"step": 292
},
{
"epoch": 1.172,
"grad_norm": 2.134798049926758,
"learning_rate": 7.725806451612903e-06,
"loss": 1.116,
"step": 293
},
{
"epoch": 1.176,
"grad_norm": 2.063161611557007,
"learning_rate": 7.717741935483871e-06,
"loss": 1.1218,
"step": 294
},
{
"epoch": 1.18,
"grad_norm": 1.748466968536377,
"learning_rate": 7.70967741935484e-06,
"loss": 0.9331,
"step": 295
},
{
"epoch": 1.184,
"grad_norm": 1.7615900039672852,
"learning_rate": 7.701612903225806e-06,
"loss": 0.9822,
"step": 296
},
{
"epoch": 1.188,
"grad_norm": 1.980162501335144,
"learning_rate": 7.693548387096776e-06,
"loss": 1.1527,
"step": 297
},
{
"epoch": 1.192,
"grad_norm": 1.8901320695877075,
"learning_rate": 7.685483870967743e-06,
"loss": 1.0305,
"step": 298
},
{
"epoch": 1.196,
"grad_norm": 2.2169246673583984,
"learning_rate": 7.67741935483871e-06,
"loss": 1.2761,
"step": 299
},
{
"epoch": 1.2,
"grad_norm": 2.0211877822875977,
"learning_rate": 7.669354838709679e-06,
"loss": 1.1125,
"step": 300
},
{
"epoch": 1.204,
"grad_norm": 1.9308112859725952,
"learning_rate": 7.661290322580646e-06,
"loss": 1.1144,
"step": 301
},
{
"epoch": 1.208,
"grad_norm": 1.9019742012023926,
"learning_rate": 7.653225806451614e-06,
"loss": 1.0119,
"step": 302
},
{
"epoch": 1.212,
"grad_norm": 2.1698246002197266,
"learning_rate": 7.645161290322582e-06,
"loss": 1.0064,
"step": 303
},
{
"epoch": 1.216,
"grad_norm": 2.0672056674957275,
"learning_rate": 7.63709677419355e-06,
"loss": 1.3564,
"step": 304
},
{
"epoch": 1.22,
"grad_norm": 1.9018466472625732,
"learning_rate": 7.629032258064517e-06,
"loss": 1.1786,
"step": 305
},
{
"epoch": 1.224,
"grad_norm": 2.1040728092193604,
"learning_rate": 7.6209677419354845e-06,
"loss": 1.1445,
"step": 306
},
{
"epoch": 1.228,
"grad_norm": 2.0901339054107666,
"learning_rate": 7.612903225806451e-06,
"loss": 0.9766,
"step": 307
},
{
"epoch": 1.232,
"grad_norm": 2.0483977794647217,
"learning_rate": 7.60483870967742e-06,
"loss": 0.8798,
"step": 308
},
{
"epoch": 1.236,
"grad_norm": 1.9911725521087646,
"learning_rate": 7.5967741935483875e-06,
"loss": 1.1939,
"step": 309
},
{
"epoch": 1.24,
"grad_norm": 1.9746226072311401,
"learning_rate": 7.588709677419356e-06,
"loss": 1.2349,
"step": 310
},
{
"epoch": 1.244,
"grad_norm": 1.916457176208496,
"learning_rate": 7.580645161290323e-06,
"loss": 0.8643,
"step": 311
},
{
"epoch": 1.248,
"grad_norm": 1.877477765083313,
"learning_rate": 7.5725806451612904e-06,
"loss": 1.0597,
"step": 312
},
{
"epoch": 1.252,
"grad_norm": 1.820234775543213,
"learning_rate": 7.564516129032259e-06,
"loss": 0.8479,
"step": 313
},
{
"epoch": 1.256,
"grad_norm": 2.0701229572296143,
"learning_rate": 7.556451612903226e-06,
"loss": 0.87,
"step": 314
},
{
"epoch": 1.26,
"grad_norm": 1.964243769645691,
"learning_rate": 7.548387096774194e-06,
"loss": 0.9032,
"step": 315
},
{
"epoch": 1.264,
"grad_norm": 1.9548320770263672,
"learning_rate": 7.540322580645162e-06,
"loss": 1.1532,
"step": 316
},
{
"epoch": 1.268,
"grad_norm": 1.8334695100784302,
"learning_rate": 7.5322580645161296e-06,
"loss": 1.005,
"step": 317
},
{
"epoch": 1.272,
"grad_norm": 1.7740626335144043,
"learning_rate": 7.524193548387097e-06,
"loss": 0.6827,
"step": 318
},
{
"epoch": 1.276,
"grad_norm": 1.8981671333312988,
"learning_rate": 7.516129032258065e-06,
"loss": 1.0224,
"step": 319
},
{
"epoch": 1.28,
"grad_norm": 1.9381623268127441,
"learning_rate": 7.508064516129033e-06,
"loss": 1.1752,
"step": 320
},
{
"epoch": 1.284,
"grad_norm": 1.8517154455184937,
"learning_rate": 7.500000000000001e-06,
"loss": 1.17,
"step": 321
},
{
"epoch": 1.288,
"grad_norm": 1.7893662452697754,
"learning_rate": 7.491935483870968e-06,
"loss": 1.0283,
"step": 322
},
{
"epoch": 1.292,
"grad_norm": 2.0215885639190674,
"learning_rate": 7.483870967741936e-06,
"loss": 1.2029,
"step": 323
},
{
"epoch": 1.296,
"grad_norm": 1.9150351285934448,
"learning_rate": 7.475806451612904e-06,
"loss": 0.888,
"step": 324
},
{
"epoch": 1.3,
"grad_norm": 1.7969810962677002,
"learning_rate": 7.467741935483872e-06,
"loss": 0.9533,
"step": 325
},
{
"epoch": 1.304,
"grad_norm": 1.9353724718093872,
"learning_rate": 7.459677419354839e-06,
"loss": 0.8941,
"step": 326
},
{
"epoch": 1.308,
"grad_norm": 1.9007946252822876,
"learning_rate": 7.451612903225807e-06,
"loss": 0.88,
"step": 327
},
{
"epoch": 1.312,
"grad_norm": 1.9816163778305054,
"learning_rate": 7.4435483870967755e-06,
"loss": 0.9307,
"step": 328
},
{
"epoch": 1.316,
"grad_norm": 1.7767106294631958,
"learning_rate": 7.435483870967742e-06,
"loss": 0.8723,
"step": 329
},
{
"epoch": 1.32,
"grad_norm": 2.6831021308898926,
"learning_rate": 7.427419354838711e-06,
"loss": 0.9343,
"step": 330
},
{
"epoch": 1.324,
"grad_norm": 2.1091558933258057,
"learning_rate": 7.4193548387096784e-06,
"loss": 1.0657,
"step": 331
},
{
"epoch": 1.328,
"grad_norm": 1.727107048034668,
"learning_rate": 7.411290322580645e-06,
"loss": 0.7765,
"step": 332
},
{
"epoch": 1.332,
"grad_norm": 1.861849069595337,
"learning_rate": 7.403225806451614e-06,
"loss": 0.8598,
"step": 333
},
{
"epoch": 1.336,
"grad_norm": 1.8695261478424072,
"learning_rate": 7.395161290322581e-06,
"loss": 0.8469,
"step": 334
},
{
"epoch": 1.34,
"grad_norm": 1.9783046245574951,
"learning_rate": 7.38709677419355e-06,
"loss": 1.1789,
"step": 335
},
{
"epoch": 1.3439999999999999,
"grad_norm": 2.044293165206909,
"learning_rate": 7.379032258064517e-06,
"loss": 1.1059,
"step": 336
},
{
"epoch": 1.3479999999999999,
"grad_norm": 2.19840145111084,
"learning_rate": 7.370967741935484e-06,
"loss": 1.1622,
"step": 337
},
{
"epoch": 1.3519999999999999,
"grad_norm": 2.1134932041168213,
"learning_rate": 7.362903225806453e-06,
"loss": 1.3109,
"step": 338
},
{
"epoch": 1.3559999999999999,
"grad_norm": 2.0898921489715576,
"learning_rate": 7.35483870967742e-06,
"loss": 1.1852,
"step": 339
},
{
"epoch": 1.3599999999999999,
"grad_norm": 2.190387010574341,
"learning_rate": 7.346774193548387e-06,
"loss": 1.2016,
"step": 340
},
{
"epoch": 1.3639999999999999,
"grad_norm": 1.914016604423523,
"learning_rate": 7.338709677419356e-06,
"loss": 1.0764,
"step": 341
},
{
"epoch": 1.3679999999999999,
"grad_norm": 1.908305287361145,
"learning_rate": 7.330645161290323e-06,
"loss": 0.9787,
"step": 342
},
{
"epoch": 1.3719999999999999,
"grad_norm": 2.0489273071289062,
"learning_rate": 7.322580645161291e-06,
"loss": 1.37,
"step": 343
},
{
"epoch": 1.376,
"grad_norm": 1.885549783706665,
"learning_rate": 7.314516129032259e-06,
"loss": 0.8716,
"step": 344
},
{
"epoch": 1.38,
"grad_norm": 1.9682085514068604,
"learning_rate": 7.306451612903226e-06,
"loss": 1.0618,
"step": 345
},
{
"epoch": 1.384,
"grad_norm": 1.9825321435928345,
"learning_rate": 7.298387096774194e-06,
"loss": 0.9762,
"step": 346
},
{
"epoch": 1.388,
"grad_norm": 1.710522174835205,
"learning_rate": 7.290322580645162e-06,
"loss": 0.8213,
"step": 347
},
{
"epoch": 1.392,
"grad_norm": 1.8329979181289673,
"learning_rate": 7.28225806451613e-06,
"loss": 0.8985,
"step": 348
},
{
"epoch": 1.396,
"grad_norm": 1.7996221780776978,
"learning_rate": 7.274193548387097e-06,
"loss": 0.8497,
"step": 349
},
{
"epoch": 1.4,
"grad_norm": 2.049039363861084,
"learning_rate": 7.266129032258065e-06,
"loss": 1.0881,
"step": 350
},
{
"epoch": 1.404,
"grad_norm": 1.8585624694824219,
"learning_rate": 7.258064516129033e-06,
"loss": 0.7929,
"step": 351
},
{
"epoch": 1.408,
"grad_norm": 1.9661332368850708,
"learning_rate": 7.25e-06,
"loss": 1.1871,
"step": 352
},
{
"epoch": 1.412,
"grad_norm": 2.0757462978363037,
"learning_rate": 7.2419354838709685e-06,
"loss": 1.2421,
"step": 353
},
{
"epoch": 1.416,
"grad_norm": 1.9401495456695557,
"learning_rate": 7.233870967741936e-06,
"loss": 1.158,
"step": 354
},
{
"epoch": 1.42,
"grad_norm": 1.9338098764419556,
"learning_rate": 7.225806451612903e-06,
"loss": 1.0955,
"step": 355
},
{
"epoch": 1.424,
"grad_norm": 1.8598445653915405,
"learning_rate": 7.2177419354838715e-06,
"loss": 0.9375,
"step": 356
},
{
"epoch": 1.428,
"grad_norm": 2.073228359222412,
"learning_rate": 7.209677419354839e-06,
"loss": 0.99,
"step": 357
},
{
"epoch": 1.432,
"grad_norm": 1.7986657619476318,
"learning_rate": 7.201612903225808e-06,
"loss": 0.9106,
"step": 358
},
{
"epoch": 1.436,
"grad_norm": 1.9713162183761597,
"learning_rate": 7.1935483870967745e-06,
"loss": 0.7723,
"step": 359
},
{
"epoch": 1.44,
"grad_norm": 1.9422978162765503,
"learning_rate": 7.185483870967742e-06,
"loss": 0.8295,
"step": 360
},
{
"epoch": 1.444,
"grad_norm": 1.93070650100708,
"learning_rate": 7.177419354838711e-06,
"loss": 0.9505,
"step": 361
},
{
"epoch": 1.448,
"grad_norm": 1.8391205072402954,
"learning_rate": 7.1693548387096774e-06,
"loss": 1.0174,
"step": 362
},
{
"epoch": 1.452,
"grad_norm": 2.107405185699463,
"learning_rate": 7.161290322580646e-06,
"loss": 0.9669,
"step": 363
},
{
"epoch": 1.456,
"grad_norm": 1.9300522804260254,
"learning_rate": 7.153225806451614e-06,
"loss": 0.855,
"step": 364
},
{
"epoch": 1.46,
"grad_norm": 2.127452850341797,
"learning_rate": 7.145161290322581e-06,
"loss": 0.9638,
"step": 365
},
{
"epoch": 1.464,
"grad_norm": 2.0022165775299072,
"learning_rate": 7.137096774193549e-06,
"loss": 1.3054,
"step": 366
},
{
"epoch": 1.468,
"grad_norm": 2.113560199737549,
"learning_rate": 7.1290322580645166e-06,
"loss": 1.1549,
"step": 367
},
{
"epoch": 1.472,
"grad_norm": 1.8650003671646118,
"learning_rate": 7.120967741935484e-06,
"loss": 1.1583,
"step": 368
},
{
"epoch": 1.476,
"grad_norm": 2.0937418937683105,
"learning_rate": 7.112903225806453e-06,
"loss": 1.0378,
"step": 369
},
{
"epoch": 1.48,
"grad_norm": 1.8015251159667969,
"learning_rate": 7.1048387096774195e-06,
"loss": 1.0748,
"step": 370
},
{
"epoch": 1.484,
"grad_norm": 1.937143087387085,
"learning_rate": 7.096774193548388e-06,
"loss": 1.2231,
"step": 371
},
{
"epoch": 1.488,
"grad_norm": 1.769061803817749,
"learning_rate": 7.088709677419356e-06,
"loss": 0.9692,
"step": 372
},
{
"epoch": 1.492,
"grad_norm": 2.050584316253662,
"learning_rate": 7.0806451612903225e-06,
"loss": 1.2983,
"step": 373
},
{
"epoch": 1.496,
"grad_norm": 1.7325676679611206,
"learning_rate": 7.072580645161291e-06,
"loss": 0.8723,
"step": 374
},
{
"epoch": 1.5,
"grad_norm": 1.8762125968933105,
"learning_rate": 7.064516129032259e-06,
"loss": 0.9408,
"step": 375
},
{
"epoch": 1.504,
"grad_norm": 1.9299713373184204,
"learning_rate": 7.056451612903227e-06,
"loss": 1.0404,
"step": 376
},
{
"epoch": 1.508,
"grad_norm": 2.071428060531616,
"learning_rate": 7.048387096774194e-06,
"loss": 1.2014,
"step": 377
},
{
"epoch": 1.512,
"grad_norm": 2.0758373737335205,
"learning_rate": 7.040322580645162e-06,
"loss": 1.1974,
"step": 378
},
{
"epoch": 1.516,
"grad_norm": 2.0668601989746094,
"learning_rate": 7.03225806451613e-06,
"loss": 0.9456,
"step": 379
},
{
"epoch": 1.52,
"grad_norm": 1.9508721828460693,
"learning_rate": 7.024193548387097e-06,
"loss": 1.0519,
"step": 380
},
{
"epoch": 1.524,
"grad_norm": 1.8952667713165283,
"learning_rate": 7.0161290322580654e-06,
"loss": 0.8057,
"step": 381
},
{
"epoch": 1.528,
"grad_norm": 2.4796831607818604,
"learning_rate": 7.008064516129033e-06,
"loss": 0.8467,
"step": 382
},
{
"epoch": 1.532,
"grad_norm": 2.1569836139678955,
"learning_rate": 7e-06,
"loss": 1.0599,
"step": 383
},
{
"epoch": 1.536,
"grad_norm": 1.8130536079406738,
"learning_rate": 6.991935483870968e-06,
"loss": 1.0255,
"step": 384
},
{
"epoch": 1.54,
"grad_norm": 2.090113639831543,
"learning_rate": 6.983870967741936e-06,
"loss": 0.9335,
"step": 385
},
{
"epoch": 1.544,
"grad_norm": 1.9055856466293335,
"learning_rate": 6.9758064516129046e-06,
"loss": 1.0086,
"step": 386
},
{
"epoch": 1.548,
"grad_norm": 2.1749179363250732,
"learning_rate": 6.967741935483871e-06,
"loss": 0.9453,
"step": 387
},
{
"epoch": 1.552,
"grad_norm": 1.8622307777404785,
"learning_rate": 6.959677419354839e-06,
"loss": 1.0079,
"step": 388
},
{
"epoch": 1.556,
"grad_norm": 1.8603782653808594,
"learning_rate": 6.9516129032258075e-06,
"loss": 0.8453,
"step": 389
},
{
"epoch": 1.56,
"grad_norm": 2.0890860557556152,
"learning_rate": 6.943548387096774e-06,
"loss": 1.0772,
"step": 390
},
{
"epoch": 1.564,
"grad_norm": 1.6885607242584229,
"learning_rate": 6.935483870967743e-06,
"loss": 0.8288,
"step": 391
},
{
"epoch": 1.568,
"grad_norm": 1.810863733291626,
"learning_rate": 6.9274193548387105e-06,
"loss": 0.931,
"step": 392
},
{
"epoch": 1.572,
"grad_norm": 2.0588154792785645,
"learning_rate": 6.919354838709677e-06,
"loss": 0.9439,
"step": 393
},
{
"epoch": 1.576,
"grad_norm": 1.9780665636062622,
"learning_rate": 6.911290322580646e-06,
"loss": 0.8887,
"step": 394
},
{
"epoch": 1.58,
"grad_norm": 1.7643142938613892,
"learning_rate": 6.9032258064516135e-06,
"loss": 0.8651,
"step": 395
},
{
"epoch": 1.584,
"grad_norm": 1.984982967376709,
"learning_rate": 6.895161290322582e-06,
"loss": 0.976,
"step": 396
},
{
"epoch": 1.588,
"grad_norm": 1.7668675184249878,
"learning_rate": 6.887096774193549e-06,
"loss": 0.8342,
"step": 397
},
{
"epoch": 1.592,
"grad_norm": 2.10591983795166,
"learning_rate": 6.879032258064516e-06,
"loss": 1.1061,
"step": 398
},
{
"epoch": 1.596,
"grad_norm": 2.0348215103149414,
"learning_rate": 6.870967741935485e-06,
"loss": 0.9554,
"step": 399
},
{
"epoch": 1.6,
"grad_norm": 1.9119504690170288,
"learning_rate": 6.862903225806452e-06,
"loss": 1.005,
"step": 400
},
{
"epoch": 1.604,
"grad_norm": 2.0619728565216064,
"learning_rate": 6.854838709677419e-06,
"loss": 1.1878,
"step": 401
},
{
"epoch": 1.608,
"grad_norm": 1.6374893188476562,
"learning_rate": 6.846774193548388e-06,
"loss": 0.6809,
"step": 402
},
{
"epoch": 1.612,
"grad_norm": 1.9424902200698853,
"learning_rate": 6.838709677419355e-06,
"loss": 1.0557,
"step": 403
},
{
"epoch": 1.616,
"grad_norm": 2.002797842025757,
"learning_rate": 6.830645161290323e-06,
"loss": 0.861,
"step": 404
},
{
"epoch": 1.62,
"grad_norm": 1.8212698698043823,
"learning_rate": 6.822580645161291e-06,
"loss": 0.845,
"step": 405
},
{
"epoch": 1.624,
"grad_norm": 2.012608528137207,
"learning_rate": 6.8145161290322585e-06,
"loss": 1.0074,
"step": 406
},
{
"epoch": 1.6280000000000001,
"grad_norm": 1.9290971755981445,
"learning_rate": 6.806451612903226e-06,
"loss": 0.9082,
"step": 407
},
{
"epoch": 1.6320000000000001,
"grad_norm": 1.8718587160110474,
"learning_rate": 6.798387096774194e-06,
"loss": 0.927,
"step": 408
},
{
"epoch": 1.6360000000000001,
"grad_norm": 1.7788649797439575,
"learning_rate": 6.790322580645162e-06,
"loss": 0.8388,
"step": 409
},
{
"epoch": 1.6400000000000001,
"grad_norm": 1.988103985786438,
"learning_rate": 6.78225806451613e-06,
"loss": 0.8716,
"step": 410
},
{
"epoch": 1.6440000000000001,
"grad_norm": 2.083486318588257,
"learning_rate": 6.774193548387097e-06,
"loss": 1.3621,
"step": 411
},
{
"epoch": 1.6480000000000001,
"grad_norm": 1.8370341062545776,
"learning_rate": 6.766129032258065e-06,
"loss": 0.8775,
"step": 412
},
{
"epoch": 1.6520000000000001,
"grad_norm": 2.1198623180389404,
"learning_rate": 6.758064516129033e-06,
"loss": 1.1957,
"step": 413
},
{
"epoch": 1.6560000000000001,
"grad_norm": 1.887670636177063,
"learning_rate": 6.750000000000001e-06,
"loss": 0.8128,
"step": 414
},
{
"epoch": 1.6600000000000001,
"grad_norm": 1.776105284690857,
"learning_rate": 6.741935483870968e-06,
"loss": 1.0328,
"step": 415
},
{
"epoch": 1.6640000000000001,
"grad_norm": 2.0482587814331055,
"learning_rate": 6.733870967741936e-06,
"loss": 1.1532,
"step": 416
},
{
"epoch": 1.6680000000000001,
"grad_norm": 1.9450839757919312,
"learning_rate": 6.725806451612904e-06,
"loss": 0.8195,
"step": 417
},
{
"epoch": 1.6720000000000002,
"grad_norm": 1.8145220279693604,
"learning_rate": 6.717741935483871e-06,
"loss": 0.9485,
"step": 418
},
{
"epoch": 1.6760000000000002,
"grad_norm": 1.8135958909988403,
"learning_rate": 6.70967741935484e-06,
"loss": 0.7613,
"step": 419
},
{
"epoch": 1.6800000000000002,
"grad_norm": 1.711436152458191,
"learning_rate": 6.701612903225807e-06,
"loss": 0.8422,
"step": 420
},
{
"epoch": 1.6840000000000002,
"grad_norm": 1.8717050552368164,
"learning_rate": 6.693548387096774e-06,
"loss": 0.9559,
"step": 421
},
{
"epoch": 1.688,
"grad_norm": 2.074445962905884,
"learning_rate": 6.685483870967743e-06,
"loss": 1.0544,
"step": 422
},
{
"epoch": 1.692,
"grad_norm": 2.0965824127197266,
"learning_rate": 6.67741935483871e-06,
"loss": 1.0758,
"step": 423
},
{
"epoch": 1.696,
"grad_norm": 1.8185127973556519,
"learning_rate": 6.669354838709679e-06,
"loss": 0.8732,
"step": 424
},
{
"epoch": 1.7,
"grad_norm": 1.8362635374069214,
"learning_rate": 6.661290322580646e-06,
"loss": 0.7973,
"step": 425
},
{
"epoch": 1.704,
"grad_norm": 1.9787343740463257,
"learning_rate": 6.653225806451613e-06,
"loss": 1.4597,
"step": 426
},
{
"epoch": 1.708,
"grad_norm": 1.9990483522415161,
"learning_rate": 6.645161290322582e-06,
"loss": 1.0183,
"step": 427
},
{
"epoch": 1.712,
"grad_norm": 2.142268419265747,
"learning_rate": 6.637096774193549e-06,
"loss": 1.2538,
"step": 428
},
{
"epoch": 1.716,
"grad_norm": 1.8665509223937988,
"learning_rate": 6.629032258064517e-06,
"loss": 0.9783,
"step": 429
},
{
"epoch": 1.72,
"grad_norm": 2.159270763397217,
"learning_rate": 6.620967741935485e-06,
"loss": 0.8965,
"step": 430
},
{
"epoch": 1.724,
"grad_norm": 2.037621021270752,
"learning_rate": 6.612903225806452e-06,
"loss": 0.8806,
"step": 431
},
{
"epoch": 1.728,
"grad_norm": 1.7615220546722412,
"learning_rate": 6.60483870967742e-06,
"loss": 0.8228,
"step": 432
},
{
"epoch": 1.732,
"grad_norm": 2.1290621757507324,
"learning_rate": 6.596774193548388e-06,
"loss": 0.7951,
"step": 433
},
{
"epoch": 1.736,
"grad_norm": 1.9679354429244995,
"learning_rate": 6.5887096774193545e-06,
"loss": 0.8681,
"step": 434
},
{
"epoch": 1.74,
"grad_norm": 1.9964251518249512,
"learning_rate": 6.580645161290323e-06,
"loss": 0.8912,
"step": 435
},
{
"epoch": 1.744,
"grad_norm": 1.8060578107833862,
"learning_rate": 6.572580645161291e-06,
"loss": 0.8951,
"step": 436
},
{
"epoch": 1.748,
"grad_norm": 1.6865990161895752,
"learning_rate": 6.564516129032259e-06,
"loss": 0.7064,
"step": 437
},
{
"epoch": 1.752,
"grad_norm": 1.6390618085861206,
"learning_rate": 6.556451612903226e-06,
"loss": 0.658,
"step": 438
},
{
"epoch": 1.756,
"grad_norm": 1.9682906866073608,
"learning_rate": 6.548387096774194e-06,
"loss": 1.3055,
"step": 439
},
{
"epoch": 1.76,
"grad_norm": 2.312551498413086,
"learning_rate": 6.540322580645162e-06,
"loss": 1.2867,
"step": 440
},
{
"epoch": 1.764,
"grad_norm": 2.086641550064087,
"learning_rate": 6.532258064516129e-06,
"loss": 1.0897,
"step": 441
},
{
"epoch": 1.768,
"grad_norm": 2.1024136543273926,
"learning_rate": 6.5241935483870975e-06,
"loss": 0.9324,
"step": 442
},
{
"epoch": 1.772,
"grad_norm": 1.8934117555618286,
"learning_rate": 6.516129032258065e-06,
"loss": 0.8183,
"step": 443
},
{
"epoch": 1.776,
"grad_norm": 2.1037709712982178,
"learning_rate": 6.508064516129032e-06,
"loss": 1.3476,
"step": 444
},
{
"epoch": 1.78,
"grad_norm": 1.903442144393921,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.795,
"step": 445
},
{
"epoch": 1.784,
"grad_norm": 1.7604291439056396,
"learning_rate": 6.491935483870968e-06,
"loss": 0.8118,
"step": 446
},
{
"epoch": 1.788,
"grad_norm": 1.8407769203186035,
"learning_rate": 6.483870967741937e-06,
"loss": 0.8989,
"step": 447
},
{
"epoch": 1.792,
"grad_norm": 1.908974528312683,
"learning_rate": 6.475806451612903e-06,
"loss": 0.9908,
"step": 448
},
{
"epoch": 1.796,
"grad_norm": 1.6634641885757446,
"learning_rate": 6.467741935483871e-06,
"loss": 0.7673,
"step": 449
},
{
"epoch": 1.8,
"grad_norm": 2.033625602722168,
"learning_rate": 6.4596774193548396e-06,
"loss": 1.1297,
"step": 450
},
{
"epoch": 1.804,
"grad_norm": 1.7993324995040894,
"learning_rate": 6.451612903225806e-06,
"loss": 0.8454,
"step": 451
},
{
"epoch": 1.808,
"grad_norm": 1.9716646671295166,
"learning_rate": 6.443548387096775e-06,
"loss": 1.064,
"step": 452
},
{
"epoch": 1.812,
"grad_norm": 1.8964923620224,
"learning_rate": 6.4354838709677425e-06,
"loss": 1.0819,
"step": 453
},
{
"epoch": 1.8159999999999998,
"grad_norm": 1.8927526473999023,
"learning_rate": 6.42741935483871e-06,
"loss": 0.9889,
"step": 454
},
{
"epoch": 1.8199999999999998,
"grad_norm": 2.130237579345703,
"learning_rate": 6.419354838709678e-06,
"loss": 0.8271,
"step": 455
},
{
"epoch": 1.8239999999999998,
"grad_norm": 2.2165160179138184,
"learning_rate": 6.4112903225806455e-06,
"loss": 1.1792,
"step": 456
},
{
"epoch": 1.8279999999999998,
"grad_norm": 2.3909194469451904,
"learning_rate": 6.403225806451614e-06,
"loss": 1.1584,
"step": 457
},
{
"epoch": 1.8319999999999999,
"grad_norm": 1.752228856086731,
"learning_rate": 6.395161290322582e-06,
"loss": 0.9875,
"step": 458
},
{
"epoch": 1.8359999999999999,
"grad_norm": 1.9226669073104858,
"learning_rate": 6.3870967741935485e-06,
"loss": 1.0947,
"step": 459
},
{
"epoch": 1.8399999999999999,
"grad_norm": 1.792618751525879,
"learning_rate": 6.379032258064517e-06,
"loss": 0.82,
"step": 460
},
{
"epoch": 1.8439999999999999,
"grad_norm": 1.9961439371109009,
"learning_rate": 6.370967741935485e-06,
"loss": 0.8837,
"step": 461
},
{
"epoch": 1.8479999999999999,
"grad_norm": 1.900079369544983,
"learning_rate": 6.3629032258064514e-06,
"loss": 0.8935,
"step": 462
},
{
"epoch": 1.8519999999999999,
"grad_norm": 1.9829587936401367,
"learning_rate": 6.35483870967742e-06,
"loss": 0.9662,
"step": 463
},
{
"epoch": 1.8559999999999999,
"grad_norm": 2.161857843399048,
"learning_rate": 6.346774193548388e-06,
"loss": 1.0381,
"step": 464
},
{
"epoch": 1.8599999999999999,
"grad_norm": 2.085419178009033,
"learning_rate": 6.338709677419356e-06,
"loss": 0.8401,
"step": 465
},
{
"epoch": 1.8639999999999999,
"grad_norm": 1.981245756149292,
"learning_rate": 6.330645161290323e-06,
"loss": 1.2207,
"step": 466
},
{
"epoch": 1.8679999999999999,
"grad_norm": 2.1605920791625977,
"learning_rate": 6.3225806451612906e-06,
"loss": 1.1644,
"step": 467
},
{
"epoch": 1.8719999999999999,
"grad_norm": 1.9331471920013428,
"learning_rate": 6.314516129032259e-06,
"loss": 1.1664,
"step": 468
},
{
"epoch": 1.876,
"grad_norm": 1.9107038974761963,
"learning_rate": 6.306451612903226e-06,
"loss": 0.9665,
"step": 469
},
{
"epoch": 1.88,
"grad_norm": 1.829116940498352,
"learning_rate": 6.298387096774194e-06,
"loss": 0.6706,
"step": 470
},
{
"epoch": 1.884,
"grad_norm": 1.9585076570510864,
"learning_rate": 6.290322580645162e-06,
"loss": 0.9477,
"step": 471
},
{
"epoch": 1.888,
"grad_norm": 1.8251086473464966,
"learning_rate": 6.282258064516129e-06,
"loss": 1.0448,
"step": 472
},
{
"epoch": 1.892,
"grad_norm": 1.955357551574707,
"learning_rate": 6.274193548387097e-06,
"loss": 0.8644,
"step": 473
},
{
"epoch": 1.896,
"grad_norm": 1.7243505716323853,
"learning_rate": 6.266129032258065e-06,
"loss": 0.9741,
"step": 474
},
{
"epoch": 1.9,
"grad_norm": 1.7721489667892456,
"learning_rate": 6.2580645161290335e-06,
"loss": 1.0634,
"step": 475
},
{
"epoch": 1.904,
"grad_norm": 2.113557815551758,
"learning_rate": 6.25e-06,
"loss": 1.1883,
"step": 476
},
{
"epoch": 1.908,
"grad_norm": 2.1622400283813477,
"learning_rate": 6.241935483870968e-06,
"loss": 1.0856,
"step": 477
},
{
"epoch": 1.912,
"grad_norm": 1.9250800609588623,
"learning_rate": 6.2338709677419365e-06,
"loss": 0.9533,
"step": 478
},
{
"epoch": 1.916,
"grad_norm": 2.027179002761841,
"learning_rate": 6.225806451612903e-06,
"loss": 1.0671,
"step": 479
},
{
"epoch": 1.92,
"grad_norm": 2.1702287197113037,
"learning_rate": 6.217741935483872e-06,
"loss": 0.9522,
"step": 480
},
{
"epoch": 1.924,
"grad_norm": 1.8383008241653442,
"learning_rate": 6.209677419354839e-06,
"loss": 0.9636,
"step": 481
},
{
"epoch": 1.928,
"grad_norm": 1.9702820777893066,
"learning_rate": 6.201612903225806e-06,
"loss": 0.9917,
"step": 482
},
{
"epoch": 1.932,
"grad_norm": 1.850930094718933,
"learning_rate": 6.193548387096775e-06,
"loss": 1.208,
"step": 483
},
{
"epoch": 1.936,
"grad_norm": 1.9581983089447021,
"learning_rate": 6.185483870967742e-06,
"loss": 0.9504,
"step": 484
},
{
"epoch": 1.94,
"grad_norm": 2.117535352706909,
"learning_rate": 6.177419354838711e-06,
"loss": 0.8385,
"step": 485
},
{
"epoch": 1.944,
"grad_norm": 2.0964229106903076,
"learning_rate": 6.169354838709678e-06,
"loss": 1.1359,
"step": 486
},
{
"epoch": 1.948,
"grad_norm": 1.9881926774978638,
"learning_rate": 6.161290322580645e-06,
"loss": 1.1685,
"step": 487
},
{
"epoch": 1.952,
"grad_norm": 1.8612600564956665,
"learning_rate": 6.153225806451614e-06,
"loss": 1.0151,
"step": 488
},
{
"epoch": 1.956,
"grad_norm": 1.9227721691131592,
"learning_rate": 6.145161290322581e-06,
"loss": 0.9583,
"step": 489
},
{
"epoch": 1.96,
"grad_norm": 1.8418318033218384,
"learning_rate": 6.137096774193549e-06,
"loss": 0.9942,
"step": 490
},
{
"epoch": 1.964,
"grad_norm": 2.0368409156799316,
"learning_rate": 6.129032258064517e-06,
"loss": 1.1448,
"step": 491
},
{
"epoch": 1.968,
"grad_norm": 1.862594723701477,
"learning_rate": 6.120967741935484e-06,
"loss": 1.0607,
"step": 492
},
{
"epoch": 1.972,
"grad_norm": 2.114074468612671,
"learning_rate": 6.112903225806452e-06,
"loss": 1.1504,
"step": 493
},
{
"epoch": 1.976,
"grad_norm": 1.8414007425308228,
"learning_rate": 6.10483870967742e-06,
"loss": 0.9802,
"step": 494
},
{
"epoch": 1.98,
"grad_norm": 2.0259172916412354,
"learning_rate": 6.0967741935483874e-06,
"loss": 1.0735,
"step": 495
},
{
"epoch": 1.984,
"grad_norm": 1.812017560005188,
"learning_rate": 6.088709677419355e-06,
"loss": 0.9636,
"step": 496
},
{
"epoch": 1.988,
"grad_norm": 1.7785333395004272,
"learning_rate": 6.080645161290323e-06,
"loss": 0.8428,
"step": 497
},
{
"epoch": 1.992,
"grad_norm": 1.907761812210083,
"learning_rate": 6.072580645161291e-06,
"loss": 0.9633,
"step": 498
},
{
"epoch": 1.996,
"grad_norm": 2.5004708766937256,
"learning_rate": 6.064516129032259e-06,
"loss": 1.4453,
"step": 499
},
{
"epoch": 2.0,
"grad_norm": 1.9316829442977905,
"learning_rate": 6.056451612903226e-06,
"loss": 0.9325,
"step": 500
},
{
"epoch": 2.004,
"grad_norm": 1.7854198217391968,
"learning_rate": 6.048387096774194e-06,
"loss": 0.7874,
"step": 501
},
{
"epoch": 2.008,
"grad_norm": 2.056880235671997,
"learning_rate": 6.040322580645162e-06,
"loss": 0.8134,
"step": 502
},
{
"epoch": 2.012,
"grad_norm": 1.676098108291626,
"learning_rate": 6.0322580645161295e-06,
"loss": 0.5839,
"step": 503
},
{
"epoch": 2.016,
"grad_norm": 2.050013780593872,
"learning_rate": 6.024193548387097e-06,
"loss": 0.8682,
"step": 504
},
{
"epoch": 2.02,
"grad_norm": 1.8117666244506836,
"learning_rate": 6.016129032258065e-06,
"loss": 1.0753,
"step": 505
},
{
"epoch": 2.024,
"grad_norm": 1.8990579843521118,
"learning_rate": 6.008064516129033e-06,
"loss": 0.897,
"step": 506
},
{
"epoch": 2.028,
"grad_norm": 1.809313416481018,
"learning_rate": 6e-06,
"loss": 0.8301,
"step": 507
},
{
"epoch": 2.032,
"grad_norm": 1.8551738262176514,
"learning_rate": 5.991935483870969e-06,
"loss": 0.8942,
"step": 508
},
{
"epoch": 2.036,
"grad_norm": 1.9050978422164917,
"learning_rate": 5.983870967741936e-06,
"loss": 0.8722,
"step": 509
},
{
"epoch": 2.04,
"grad_norm": 2.03041934967041,
"learning_rate": 5.975806451612903e-06,
"loss": 0.9807,
"step": 510
},
{
"epoch": 2.044,
"grad_norm": 2.041501998901367,
"learning_rate": 5.967741935483872e-06,
"loss": 1.1012,
"step": 511
},
{
"epoch": 2.048,
"grad_norm": 1.8471384048461914,
"learning_rate": 5.959677419354839e-06,
"loss": 0.7939,
"step": 512
},
{
"epoch": 2.052,
"grad_norm": 2.0582258701324463,
"learning_rate": 5.951612903225808e-06,
"loss": 1.0515,
"step": 513
},
{
"epoch": 2.056,
"grad_norm": 2.1479275226593018,
"learning_rate": 5.943548387096775e-06,
"loss": 1.0338,
"step": 514
},
{
"epoch": 2.06,
"grad_norm": 2.1146697998046875,
"learning_rate": 5.935483870967742e-06,
"loss": 0.8127,
"step": 515
},
{
"epoch": 2.064,
"grad_norm": 2.1258368492126465,
"learning_rate": 5.927419354838711e-06,
"loss": 1.0184,
"step": 516
},
{
"epoch": 2.068,
"grad_norm": 1.7795977592468262,
"learning_rate": 5.9193548387096776e-06,
"loss": 1.0998,
"step": 517
},
{
"epoch": 2.072,
"grad_norm": 2.0097572803497314,
"learning_rate": 5.911290322580646e-06,
"loss": 0.8023,
"step": 518
},
{
"epoch": 2.076,
"grad_norm": 1.9408986568450928,
"learning_rate": 5.903225806451614e-06,
"loss": 0.799,
"step": 519
},
{
"epoch": 2.08,
"grad_norm": 1.8501501083374023,
"learning_rate": 5.8951612903225805e-06,
"loss": 0.6468,
"step": 520
},
{
"epoch": 2.084,
"grad_norm": 1.9509570598602295,
"learning_rate": 5.887096774193549e-06,
"loss": 0.8961,
"step": 521
},
{
"epoch": 2.088,
"grad_norm": 1.845482587814331,
"learning_rate": 5.879032258064517e-06,
"loss": 0.8763,
"step": 522
},
{
"epoch": 2.092,
"grad_norm": 1.9946309328079224,
"learning_rate": 5.8709677419354835e-06,
"loss": 0.9319,
"step": 523
},
{
"epoch": 2.096,
"grad_norm": 2.023287296295166,
"learning_rate": 5.862903225806452e-06,
"loss": 0.9106,
"step": 524
},
{
"epoch": 2.1,
"grad_norm": 1.8573815822601318,
"learning_rate": 5.85483870967742e-06,
"loss": 0.8133,
"step": 525
},
{
"epoch": 2.104,
"grad_norm": 1.9675424098968506,
"learning_rate": 5.846774193548388e-06,
"loss": 0.8816,
"step": 526
},
{
"epoch": 2.108,
"grad_norm": 2.4199588298797607,
"learning_rate": 5.838709677419355e-06,
"loss": 1.0439,
"step": 527
},
{
"epoch": 2.112,
"grad_norm": 1.9206243753433228,
"learning_rate": 5.830645161290323e-06,
"loss": 1.0431,
"step": 528
},
{
"epoch": 2.116,
"grad_norm": 2.0801823139190674,
"learning_rate": 5.822580645161291e-06,
"loss": 0.8906,
"step": 529
},
{
"epoch": 2.12,
"grad_norm": 2.001232385635376,
"learning_rate": 5.814516129032258e-06,
"loss": 0.7845,
"step": 530
},
{
"epoch": 2.124,
"grad_norm": 1.8495033979415894,
"learning_rate": 5.806451612903226e-06,
"loss": 0.8729,
"step": 531
},
{
"epoch": 2.128,
"grad_norm": 2.0924770832061768,
"learning_rate": 5.798387096774194e-06,
"loss": 0.7579,
"step": 532
},
{
"epoch": 2.132,
"grad_norm": 2.0251944065093994,
"learning_rate": 5.790322580645161e-06,
"loss": 0.8986,
"step": 533
},
{
"epoch": 2.136,
"grad_norm": 2.0584230422973633,
"learning_rate": 5.782258064516129e-06,
"loss": 0.9409,
"step": 534
},
{
"epoch": 2.14,
"grad_norm": 2.1257565021514893,
"learning_rate": 5.774193548387097e-06,
"loss": 1.0291,
"step": 535
},
{
"epoch": 2.144,
"grad_norm": 2.18398118019104,
"learning_rate": 5.7661290322580655e-06,
"loss": 1.0488,
"step": 536
},
{
"epoch": 2.148,
"grad_norm": 1.9207417964935303,
"learning_rate": 5.758064516129032e-06,
"loss": 0.7732,
"step": 537
},
{
"epoch": 2.152,
"grad_norm": 1.9380196332931519,
"learning_rate": 5.75e-06,
"loss": 0.874,
"step": 538
},
{
"epoch": 2.156,
"grad_norm": 1.9792423248291016,
"learning_rate": 5.7419354838709685e-06,
"loss": 0.6038,
"step": 539
},
{
"epoch": 2.16,
"grad_norm": 2.2578721046447754,
"learning_rate": 5.733870967741936e-06,
"loss": 1.0358,
"step": 540
},
{
"epoch": 2.164,
"grad_norm": 2.0607879161834717,
"learning_rate": 5.725806451612904e-06,
"loss": 1.0594,
"step": 541
},
{
"epoch": 2.168,
"grad_norm": 2.0992348194122314,
"learning_rate": 5.7177419354838715e-06,
"loss": 1.1117,
"step": 542
},
{
"epoch": 2.172,
"grad_norm": 1.9730112552642822,
"learning_rate": 5.709677419354839e-06,
"loss": 0.9292,
"step": 543
},
{
"epoch": 2.176,
"grad_norm": 1.9976557493209839,
"learning_rate": 5.701612903225807e-06,
"loss": 0.8501,
"step": 544
},
{
"epoch": 2.18,
"grad_norm": 2.0966856479644775,
"learning_rate": 5.6935483870967744e-06,
"loss": 1.0269,
"step": 545
},
{
"epoch": 2.184,
"grad_norm": 1.7158764600753784,
"learning_rate": 5.685483870967743e-06,
"loss": 0.5562,
"step": 546
},
{
"epoch": 2.188,
"grad_norm": 2.0837273597717285,
"learning_rate": 5.677419354838711e-06,
"loss": 1.1216,
"step": 547
},
{
"epoch": 2.192,
"grad_norm": 2.0325253009796143,
"learning_rate": 5.669354838709677e-06,
"loss": 1.1302,
"step": 548
},
{
"epoch": 2.196,
"grad_norm": 2.179825782775879,
"learning_rate": 5.661290322580646e-06,
"loss": 0.8691,
"step": 549
},
{
"epoch": 2.2,
"grad_norm": 2.073207378387451,
"learning_rate": 5.6532258064516136e-06,
"loss": 0.8533,
"step": 550
},
{
"epoch": 2.204,
"grad_norm": 1.89866304397583,
"learning_rate": 5.645161290322582e-06,
"loss": 0.6803,
"step": 551
},
{
"epoch": 2.208,
"grad_norm": 2.177705764770508,
"learning_rate": 5.637096774193549e-06,
"loss": 0.9427,
"step": 552
},
{
"epoch": 2.212,
"grad_norm": 1.882057547569275,
"learning_rate": 5.6290322580645165e-06,
"loss": 0.8539,
"step": 553
},
{
"epoch": 2.216,
"grad_norm": 2.101376533508301,
"learning_rate": 5.620967741935485e-06,
"loss": 1.0185,
"step": 554
},
{
"epoch": 2.22,
"grad_norm": 1.9109890460968018,
"learning_rate": 5.612903225806452e-06,
"loss": 0.8022,
"step": 555
},
{
"epoch": 2.224,
"grad_norm": 2.090585708618164,
"learning_rate": 5.6048387096774195e-06,
"loss": 1.149,
"step": 556
},
{
"epoch": 2.228,
"grad_norm": 2.002155065536499,
"learning_rate": 5.596774193548388e-06,
"loss": 0.8258,
"step": 557
},
{
"epoch": 2.232,
"grad_norm": 2.0641942024230957,
"learning_rate": 5.588709677419355e-06,
"loss": 0.7931,
"step": 558
},
{
"epoch": 2.2359999999999998,
"grad_norm": 1.8409854173660278,
"learning_rate": 5.580645161290323e-06,
"loss": 0.5935,
"step": 559
},
{
"epoch": 2.24,
"grad_norm": 1.7998849153518677,
"learning_rate": 5.572580645161291e-06,
"loss": 0.7823,
"step": 560
},
{
"epoch": 2.2439999999999998,
"grad_norm": 1.995986819267273,
"learning_rate": 5.564516129032258e-06,
"loss": 0.8117,
"step": 561
},
{
"epoch": 2.248,
"grad_norm": 2.903075695037842,
"learning_rate": 5.556451612903226e-06,
"loss": 0.8912,
"step": 562
},
{
"epoch": 2.252,
"grad_norm": 1.9949880838394165,
"learning_rate": 5.548387096774194e-06,
"loss": 0.9567,
"step": 563
},
{
"epoch": 2.2560000000000002,
"grad_norm": 2.518598794937134,
"learning_rate": 5.5403225806451624e-06,
"loss": 1.1984,
"step": 564
},
{
"epoch": 2.26,
"grad_norm": 1.9082916975021362,
"learning_rate": 5.532258064516129e-06,
"loss": 0.996,
"step": 565
},
{
"epoch": 2.2640000000000002,
"grad_norm": 2.1674211025238037,
"learning_rate": 5.524193548387097e-06,
"loss": 0.9543,
"step": 566
},
{
"epoch": 2.268,
"grad_norm": 2.0850236415863037,
"learning_rate": 5.516129032258065e-06,
"loss": 0.8417,
"step": 567
},
{
"epoch": 2.2720000000000002,
"grad_norm": 2.213880777359009,
"learning_rate": 5.508064516129032e-06,
"loss": 1.115,
"step": 568
},
{
"epoch": 2.276,
"grad_norm": 1.9851431846618652,
"learning_rate": 5.500000000000001e-06,
"loss": 0.9302,
"step": 569
},
{
"epoch": 2.2800000000000002,
"grad_norm": 2.029381513595581,
"learning_rate": 5.491935483870968e-06,
"loss": 0.7905,
"step": 570
},
{
"epoch": 2.284,
"grad_norm": 2.088749885559082,
"learning_rate": 5.483870967741935e-06,
"loss": 0.951,
"step": 571
},
{
"epoch": 2.288,
"grad_norm": 1.7397806644439697,
"learning_rate": 5.475806451612904e-06,
"loss": 0.705,
"step": 572
},
{
"epoch": 2.292,
"grad_norm": 2.039262294769287,
"learning_rate": 5.467741935483871e-06,
"loss": 0.852,
"step": 573
},
{
"epoch": 2.296,
"grad_norm": 1.953506350517273,
"learning_rate": 5.45967741935484e-06,
"loss": 0.8474,
"step": 574
},
{
"epoch": 2.3,
"grad_norm": 1.8871941566467285,
"learning_rate": 5.451612903225807e-06,
"loss": 0.745,
"step": 575
},
{
"epoch": 2.304,
"grad_norm": 1.7728246450424194,
"learning_rate": 5.443548387096774e-06,
"loss": 0.6626,
"step": 576
},
{
"epoch": 2.308,
"grad_norm": 1.984948754310608,
"learning_rate": 5.435483870967743e-06,
"loss": 0.8932,
"step": 577
},
{
"epoch": 2.312,
"grad_norm": 1.9669218063354492,
"learning_rate": 5.42741935483871e-06,
"loss": 0.933,
"step": 578
},
{
"epoch": 2.316,
"grad_norm": 2.025562286376953,
"learning_rate": 5.419354838709678e-06,
"loss": 0.825,
"step": 579
},
{
"epoch": 2.32,
"grad_norm": 2.116050958633423,
"learning_rate": 5.411290322580646e-06,
"loss": 1.0288,
"step": 580
},
{
"epoch": 2.324,
"grad_norm": 2.0228822231292725,
"learning_rate": 5.4032258064516126e-06,
"loss": 0.9068,
"step": 581
},
{
"epoch": 2.328,
"grad_norm": 1.9385783672332764,
"learning_rate": 5.395161290322581e-06,
"loss": 0.9609,
"step": 582
},
{
"epoch": 2.332,
"grad_norm": 2.0579097270965576,
"learning_rate": 5.387096774193549e-06,
"loss": 1.0191,
"step": 583
},
{
"epoch": 2.336,
"grad_norm": 1.9625400304794312,
"learning_rate": 5.379032258064517e-06,
"loss": 0.872,
"step": 584
},
{
"epoch": 2.34,
"grad_norm": 2.110098361968994,
"learning_rate": 5.370967741935484e-06,
"loss": 1.1903,
"step": 585
},
{
"epoch": 2.344,
"grad_norm": 2.013211250305176,
"learning_rate": 5.362903225806452e-06,
"loss": 0.71,
"step": 586
},
{
"epoch": 2.348,
"grad_norm": 1.926029920578003,
"learning_rate": 5.35483870967742e-06,
"loss": 0.785,
"step": 587
},
{
"epoch": 2.352,
"grad_norm": 1.944735050201416,
"learning_rate": 5.346774193548388e-06,
"loss": 0.9547,
"step": 588
},
{
"epoch": 2.356,
"grad_norm": 2.068157911300659,
"learning_rate": 5.338709677419355e-06,
"loss": 1.0035,
"step": 589
},
{
"epoch": 2.36,
"grad_norm": 1.9762252569198608,
"learning_rate": 5.330645161290323e-06,
"loss": 0.9913,
"step": 590
},
{
"epoch": 2.364,
"grad_norm": 2.0391647815704346,
"learning_rate": 5.322580645161291e-06,
"loss": 1.1314,
"step": 591
},
{
"epoch": 2.368,
"grad_norm": 2.0903215408325195,
"learning_rate": 5.3145161290322585e-06,
"loss": 0.9021,
"step": 592
},
{
"epoch": 2.372,
"grad_norm": 2.012575149536133,
"learning_rate": 5.306451612903226e-06,
"loss": 0.8438,
"step": 593
},
{
"epoch": 2.376,
"grad_norm": 3.0318682193756104,
"learning_rate": 5.298387096774194e-06,
"loss": 0.9652,
"step": 594
},
{
"epoch": 2.38,
"grad_norm": 2.0645689964294434,
"learning_rate": 5.290322580645162e-06,
"loss": 0.931,
"step": 595
},
{
"epoch": 2.384,
"grad_norm": 2.010096549987793,
"learning_rate": 5.282258064516129e-06,
"loss": 0.8592,
"step": 596
},
{
"epoch": 2.388,
"grad_norm": 2.2321157455444336,
"learning_rate": 5.274193548387098e-06,
"loss": 0.8352,
"step": 597
},
{
"epoch": 2.392,
"grad_norm": 2.0409233570098877,
"learning_rate": 5.266129032258065e-06,
"loss": 0.9383,
"step": 598
},
{
"epoch": 2.396,
"grad_norm": 1.9163116216659546,
"learning_rate": 5.258064516129032e-06,
"loss": 0.6861,
"step": 599
},
{
"epoch": 2.4,
"grad_norm": 1.9450281858444214,
"learning_rate": 5.2500000000000006e-06,
"loss": 0.748,
"step": 600
},
{
"epoch": 2.404,
"grad_norm": 1.9742106199264526,
"learning_rate": 5.241935483870968e-06,
"loss": 0.862,
"step": 601
},
{
"epoch": 2.408,
"grad_norm": 2.3256115913391113,
"learning_rate": 5.233870967741937e-06,
"loss": 0.8754,
"step": 602
},
{
"epoch": 2.412,
"grad_norm": 1.8236236572265625,
"learning_rate": 5.2258064516129035e-06,
"loss": 0.7874,
"step": 603
},
{
"epoch": 2.416,
"grad_norm": 2.0002152919769287,
"learning_rate": 5.217741935483871e-06,
"loss": 0.9769,
"step": 604
},
{
"epoch": 2.42,
"grad_norm": 1.9513111114501953,
"learning_rate": 5.20967741935484e-06,
"loss": 0.8151,
"step": 605
},
{
"epoch": 2.424,
"grad_norm": 1.953925371170044,
"learning_rate": 5.2016129032258065e-06,
"loss": 0.8143,
"step": 606
},
{
"epoch": 2.428,
"grad_norm": 2.4850172996520996,
"learning_rate": 5.193548387096775e-06,
"loss": 1.0706,
"step": 607
},
{
"epoch": 2.432,
"grad_norm": 1.8947373628616333,
"learning_rate": 5.185483870967743e-06,
"loss": 0.7941,
"step": 608
},
{
"epoch": 2.436,
"grad_norm": 2.0892701148986816,
"learning_rate": 5.1774193548387095e-06,
"loss": 0.9921,
"step": 609
},
{
"epoch": 2.44,
"grad_norm": 2.635343074798584,
"learning_rate": 5.169354838709678e-06,
"loss": 0.9391,
"step": 610
},
{
"epoch": 2.444,
"grad_norm": 2.0520694255828857,
"learning_rate": 5.161290322580646e-06,
"loss": 0.8278,
"step": 611
},
{
"epoch": 2.448,
"grad_norm": 2.127861499786377,
"learning_rate": 5.153225806451614e-06,
"loss": 1.1514,
"step": 612
},
{
"epoch": 2.452,
"grad_norm": 2.0440480709075928,
"learning_rate": 5.145161290322581e-06,
"loss": 0.7209,
"step": 613
},
{
"epoch": 2.456,
"grad_norm": 1.8232911825180054,
"learning_rate": 5.1370967741935486e-06,
"loss": 0.6083,
"step": 614
},
{
"epoch": 2.46,
"grad_norm": 2.0450599193573,
"learning_rate": 5.129032258064517e-06,
"loss": 0.824,
"step": 615
},
{
"epoch": 2.464,
"grad_norm": 2.1209301948547363,
"learning_rate": 5.120967741935484e-06,
"loss": 1.1781,
"step": 616
},
{
"epoch": 2.468,
"grad_norm": 2.2119786739349365,
"learning_rate": 5.1129032258064515e-06,
"loss": 1.1847,
"step": 617
},
{
"epoch": 2.472,
"grad_norm": 2.3321080207824707,
"learning_rate": 5.10483870967742e-06,
"loss": 0.8121,
"step": 618
},
{
"epoch": 2.476,
"grad_norm": 2.008375644683838,
"learning_rate": 5.096774193548387e-06,
"loss": 0.8741,
"step": 619
},
{
"epoch": 2.48,
"grad_norm": 2.2514259815216064,
"learning_rate": 5.088709677419355e-06,
"loss": 1.1202,
"step": 620
},
{
"epoch": 2.484,
"grad_norm": 1.8712433576583862,
"learning_rate": 5.080645161290323e-06,
"loss": 1.002,
"step": 621
},
{
"epoch": 2.488,
"grad_norm": 2.0136568546295166,
"learning_rate": 5.07258064516129e-06,
"loss": 0.9409,
"step": 622
},
{
"epoch": 2.492,
"grad_norm": 2.0185978412628174,
"learning_rate": 5.064516129032258e-06,
"loss": 0.7706,
"step": 623
},
{
"epoch": 2.496,
"grad_norm": 2.5203325748443604,
"learning_rate": 5.056451612903226e-06,
"loss": 0.8937,
"step": 624
},
{
"epoch": 2.5,
"grad_norm": 1.8861210346221924,
"learning_rate": 5.0483870967741945e-06,
"loss": 0.7429,
"step": 625
},
{
"epoch": 2.504,
"grad_norm": 2.0991008281707764,
"learning_rate": 5.040322580645161e-06,
"loss": 1.1311,
"step": 626
},
{
"epoch": 2.508,
"grad_norm": 1.919195532798767,
"learning_rate": 5.032258064516129e-06,
"loss": 0.8449,
"step": 627
},
{
"epoch": 2.512,
"grad_norm": 2.0639350414276123,
"learning_rate": 5.0241935483870974e-06,
"loss": 0.7829,
"step": 628
},
{
"epoch": 2.516,
"grad_norm": 2.0387368202209473,
"learning_rate": 5.016129032258065e-06,
"loss": 0.7948,
"step": 629
},
{
"epoch": 2.52,
"grad_norm": 2.121511459350586,
"learning_rate": 5.008064516129033e-06,
"loss": 0.8836,
"step": 630
},
{
"epoch": 2.524,
"grad_norm": 2.033421277999878,
"learning_rate": 5e-06,
"loss": 0.8941,
"step": 631
},
{
"epoch": 2.528,
"grad_norm": 1.9068338871002197,
"learning_rate": 4.991935483870968e-06,
"loss": 0.7699,
"step": 632
},
{
"epoch": 2.532,
"grad_norm": 2.0993237495422363,
"learning_rate": 4.983870967741936e-06,
"loss": 0.9509,
"step": 633
},
{
"epoch": 2.536,
"grad_norm": 1.8794467449188232,
"learning_rate": 4.975806451612903e-06,
"loss": 1.0481,
"step": 634
},
{
"epoch": 2.54,
"grad_norm": 2.022815704345703,
"learning_rate": 4.967741935483871e-06,
"loss": 0.8893,
"step": 635
},
{
"epoch": 2.544,
"grad_norm": 2.1620354652404785,
"learning_rate": 4.9596774193548395e-06,
"loss": 0.8449,
"step": 636
},
{
"epoch": 2.548,
"grad_norm": 2.199138641357422,
"learning_rate": 4.951612903225807e-06,
"loss": 1.099,
"step": 637
},
{
"epoch": 2.552,
"grad_norm": 1.8625229597091675,
"learning_rate": 4.943548387096775e-06,
"loss": 0.6892,
"step": 638
},
{
"epoch": 2.556,
"grad_norm": 1.9562162160873413,
"learning_rate": 4.9354838709677425e-06,
"loss": 0.832,
"step": 639
},
{
"epoch": 2.56,
"grad_norm": 2.2847657203674316,
"learning_rate": 4.92741935483871e-06,
"loss": 0.985,
"step": 640
},
{
"epoch": 2.564,
"grad_norm": 1.908446192741394,
"learning_rate": 4.919354838709678e-06,
"loss": 0.7687,
"step": 641
},
{
"epoch": 2.568,
"grad_norm": 2.076167106628418,
"learning_rate": 4.9112903225806455e-06,
"loss": 0.7979,
"step": 642
},
{
"epoch": 2.572,
"grad_norm": 1.9638773202896118,
"learning_rate": 4.903225806451613e-06,
"loss": 0.7731,
"step": 643
},
{
"epoch": 2.576,
"grad_norm": 1.938830018043518,
"learning_rate": 4.895161290322581e-06,
"loss": 0.6483,
"step": 644
},
{
"epoch": 2.58,
"grad_norm": 1.9963139295578003,
"learning_rate": 4.8870967741935484e-06,
"loss": 0.7376,
"step": 645
},
{
"epoch": 2.584,
"grad_norm": 2.0620100498199463,
"learning_rate": 4.879032258064517e-06,
"loss": 0.7429,
"step": 646
},
{
"epoch": 2.588,
"grad_norm": 2.256847858428955,
"learning_rate": 4.870967741935485e-06,
"loss": 1.0364,
"step": 647
},
{
"epoch": 2.592,
"grad_norm": 1.9625539779663086,
"learning_rate": 4.862903225806451e-06,
"loss": 0.7753,
"step": 648
},
{
"epoch": 2.596,
"grad_norm": 2.0241689682006836,
"learning_rate": 4.85483870967742e-06,
"loss": 0.7314,
"step": 649
},
{
"epoch": 2.6,
"grad_norm": 2.127464771270752,
"learning_rate": 4.8467741935483876e-06,
"loss": 0.9116,
"step": 650
},
{
"epoch": 2.604,
"grad_norm": 2.0825483798980713,
"learning_rate": 4.838709677419355e-06,
"loss": 0.892,
"step": 651
},
{
"epoch": 2.608,
"grad_norm": 1.9793882369995117,
"learning_rate": 4.830645161290323e-06,
"loss": 0.8634,
"step": 652
},
{
"epoch": 2.612,
"grad_norm": 2.1449151039123535,
"learning_rate": 4.8225806451612905e-06,
"loss": 0.8948,
"step": 653
},
{
"epoch": 2.616,
"grad_norm": 2.0711538791656494,
"learning_rate": 4.814516129032258e-06,
"loss": 0.9662,
"step": 654
},
{
"epoch": 2.62,
"grad_norm": 2.1446480751037598,
"learning_rate": 4.806451612903227e-06,
"loss": 1.1214,
"step": 655
},
{
"epoch": 2.624,
"grad_norm": 2.095132350921631,
"learning_rate": 4.798387096774194e-06,
"loss": 0.9048,
"step": 656
},
{
"epoch": 2.628,
"grad_norm": 2.2719180583953857,
"learning_rate": 4.790322580645161e-06,
"loss": 0.9439,
"step": 657
},
{
"epoch": 2.632,
"grad_norm": 2.134521961212158,
"learning_rate": 4.78225806451613e-06,
"loss": 0.9083,
"step": 658
},
{
"epoch": 2.636,
"grad_norm": 2.250610589981079,
"learning_rate": 4.774193548387097e-06,
"loss": 1.0183,
"step": 659
},
{
"epoch": 2.64,
"grad_norm": 2.1581926345825195,
"learning_rate": 4.766129032258065e-06,
"loss": 1.0784,
"step": 660
},
{
"epoch": 2.644,
"grad_norm": 1.8543059825897217,
"learning_rate": 4.758064516129033e-06,
"loss": 0.7357,
"step": 661
},
{
"epoch": 2.648,
"grad_norm": 2.0010993480682373,
"learning_rate": 4.75e-06,
"loss": 0.8364,
"step": 662
},
{
"epoch": 2.652,
"grad_norm": 2.1048102378845215,
"learning_rate": 4.741935483870968e-06,
"loss": 0.945,
"step": 663
},
{
"epoch": 2.656,
"grad_norm": 1.93734872341156,
"learning_rate": 4.7338709677419356e-06,
"loss": 0.9438,
"step": 664
},
{
"epoch": 2.66,
"grad_norm": 1.8726097345352173,
"learning_rate": 4.725806451612904e-06,
"loss": 0.7453,
"step": 665
},
{
"epoch": 2.664,
"grad_norm": 1.912177324295044,
"learning_rate": 4.717741935483872e-06,
"loss": 0.8832,
"step": 666
},
{
"epoch": 2.668,
"grad_norm": 2.0042824745178223,
"learning_rate": 4.7096774193548385e-06,
"loss": 0.9554,
"step": 667
},
{
"epoch": 2.672,
"grad_norm": 2.0271189212799072,
"learning_rate": 4.701612903225807e-06,
"loss": 0.8369,
"step": 668
},
{
"epoch": 2.676,
"grad_norm": 1.935368299484253,
"learning_rate": 4.693548387096775e-06,
"loss": 0.839,
"step": 669
},
{
"epoch": 2.68,
"grad_norm": 1.7850247621536255,
"learning_rate": 4.685483870967742e-06,
"loss": 0.6585,
"step": 670
},
{
"epoch": 2.684,
"grad_norm": 2.017695665359497,
"learning_rate": 4.67741935483871e-06,
"loss": 0.983,
"step": 671
},
{
"epoch": 2.6879999999999997,
"grad_norm": 2.1200356483459473,
"learning_rate": 4.669354838709678e-06,
"loss": 1.0346,
"step": 672
},
{
"epoch": 2.692,
"grad_norm": 1.8420408964157104,
"learning_rate": 4.661290322580645e-06,
"loss": 0.6386,
"step": 673
},
{
"epoch": 2.6959999999999997,
"grad_norm": 2.109278678894043,
"learning_rate": 4.653225806451613e-06,
"loss": 0.8037,
"step": 674
},
{
"epoch": 2.7,
"grad_norm": 2.1483943462371826,
"learning_rate": 4.6451612903225815e-06,
"loss": 0.9849,
"step": 675
},
{
"epoch": 2.7039999999999997,
"grad_norm": 2.1622354984283447,
"learning_rate": 4.637096774193548e-06,
"loss": 0.9104,
"step": 676
},
{
"epoch": 2.708,
"grad_norm": 2.178973913192749,
"learning_rate": 4.629032258064517e-06,
"loss": 0.9923,
"step": 677
},
{
"epoch": 2.7119999999999997,
"grad_norm": 2.0353808403015137,
"learning_rate": 4.6209677419354844e-06,
"loss": 0.9072,
"step": 678
},
{
"epoch": 2.716,
"grad_norm": 2.2480483055114746,
"learning_rate": 4.612903225806452e-06,
"loss": 1.1416,
"step": 679
},
{
"epoch": 2.7199999999999998,
"grad_norm": 2.0765092372894287,
"learning_rate": 4.60483870967742e-06,
"loss": 0.9671,
"step": 680
},
{
"epoch": 2.724,
"grad_norm": 2.0080316066741943,
"learning_rate": 4.596774193548387e-06,
"loss": 1.0271,
"step": 681
},
{
"epoch": 2.7279999999999998,
"grad_norm": 2.1100165843963623,
"learning_rate": 4.588709677419355e-06,
"loss": 0.8433,
"step": 682
},
{
"epoch": 2.732,
"grad_norm": 1.883931279182434,
"learning_rate": 4.580645161290323e-06,
"loss": 0.7499,
"step": 683
},
{
"epoch": 2.7359999999999998,
"grad_norm": 2.1434648036956787,
"learning_rate": 4.572580645161291e-06,
"loss": 0.8798,
"step": 684
},
{
"epoch": 2.74,
"grad_norm": 2.163625717163086,
"learning_rate": 4.564516129032259e-06,
"loss": 1.0695,
"step": 685
},
{
"epoch": 2.7439999999999998,
"grad_norm": 2.0244061946868896,
"learning_rate": 4.556451612903226e-06,
"loss": 1.1869,
"step": 686
},
{
"epoch": 2.748,
"grad_norm": 1.9727342128753662,
"learning_rate": 4.548387096774194e-06,
"loss": 0.8893,
"step": 687
},
{
"epoch": 2.752,
"grad_norm": 2.0887742042541504,
"learning_rate": 4.540322580645162e-06,
"loss": 1.0081,
"step": 688
},
{
"epoch": 2.7560000000000002,
"grad_norm": 1.9039736986160278,
"learning_rate": 4.5322580645161295e-06,
"loss": 0.7422,
"step": 689
},
{
"epoch": 2.76,
"grad_norm": 1.9305413961410522,
"learning_rate": 4.524193548387097e-06,
"loss": 0.946,
"step": 690
},
{
"epoch": 2.7640000000000002,
"grad_norm": 2.099938154220581,
"learning_rate": 4.516129032258065e-06,
"loss": 0.8641,
"step": 691
},
{
"epoch": 2.768,
"grad_norm": 2.146622657775879,
"learning_rate": 4.5080645161290325e-06,
"loss": 0.9026,
"step": 692
},
{
"epoch": 2.7720000000000002,
"grad_norm": 2.158890724182129,
"learning_rate": 4.5e-06,
"loss": 0.8535,
"step": 693
},
{
"epoch": 2.776,
"grad_norm": 1.977545976638794,
"learning_rate": 4.491935483870969e-06,
"loss": 0.8315,
"step": 694
},
{
"epoch": 2.7800000000000002,
"grad_norm": 2.205862522125244,
"learning_rate": 4.4838709677419354e-06,
"loss": 0.8047,
"step": 695
},
{
"epoch": 2.784,
"grad_norm": 2.0016543865203857,
"learning_rate": 4.475806451612903e-06,
"loss": 0.7461,
"step": 696
},
{
"epoch": 2.7880000000000003,
"grad_norm": 2.0579326152801514,
"learning_rate": 4.467741935483872e-06,
"loss": 0.9703,
"step": 697
},
{
"epoch": 2.792,
"grad_norm": 2.0946247577667236,
"learning_rate": 4.459677419354839e-06,
"loss": 0.9027,
"step": 698
},
{
"epoch": 2.7960000000000003,
"grad_norm": 2.0501272678375244,
"learning_rate": 4.451612903225807e-06,
"loss": 0.8489,
"step": 699
},
{
"epoch": 2.8,
"grad_norm": 2.2338743209838867,
"learning_rate": 4.4435483870967745e-06,
"loss": 1.0747,
"step": 700
},
{
"epoch": 2.8040000000000003,
"grad_norm": 1.8594715595245361,
"learning_rate": 4.435483870967742e-06,
"loss": 0.7034,
"step": 701
},
{
"epoch": 2.808,
"grad_norm": 2.1036953926086426,
"learning_rate": 4.42741935483871e-06,
"loss": 1.1334,
"step": 702
},
{
"epoch": 2.8120000000000003,
"grad_norm": 2.05660343170166,
"learning_rate": 4.419354838709678e-06,
"loss": 0.9696,
"step": 703
},
{
"epoch": 2.816,
"grad_norm": 1.8654855489730835,
"learning_rate": 4.411290322580645e-06,
"loss": 0.7639,
"step": 704
},
{
"epoch": 2.82,
"grad_norm": 2.0342979431152344,
"learning_rate": 4.403225806451613e-06,
"loss": 0.7359,
"step": 705
},
{
"epoch": 2.824,
"grad_norm": 2.1098721027374268,
"learning_rate": 4.395161290322581e-06,
"loss": 0.9963,
"step": 706
},
{
"epoch": 2.828,
"grad_norm": 2.033388614654541,
"learning_rate": 4.387096774193549e-06,
"loss": 1.0258,
"step": 707
},
{
"epoch": 2.832,
"grad_norm": 2.041229248046875,
"learning_rate": 4.379032258064517e-06,
"loss": 0.8926,
"step": 708
},
{
"epoch": 2.836,
"grad_norm": 2.04064679145813,
"learning_rate": 4.370967741935484e-06,
"loss": 0.9707,
"step": 709
},
{
"epoch": 2.84,
"grad_norm": 2.09248685836792,
"learning_rate": 4.362903225806452e-06,
"loss": 0.9311,
"step": 710
},
{
"epoch": 2.844,
"grad_norm": 1.9359509944915771,
"learning_rate": 4.35483870967742e-06,
"loss": 0.7429,
"step": 711
},
{
"epoch": 2.848,
"grad_norm": 1.8163801431655884,
"learning_rate": 4.346774193548387e-06,
"loss": 0.7459,
"step": 712
},
{
"epoch": 2.852,
"grad_norm": 2.0014097690582275,
"learning_rate": 4.338709677419356e-06,
"loss": 0.7952,
"step": 713
},
{
"epoch": 2.856,
"grad_norm": 1.8234827518463135,
"learning_rate": 4.3306451612903226e-06,
"loss": 0.7551,
"step": 714
},
{
"epoch": 2.86,
"grad_norm": 1.8983055353164673,
"learning_rate": 4.32258064516129e-06,
"loss": 0.8233,
"step": 715
},
{
"epoch": 2.864,
"grad_norm": 2.007643222808838,
"learning_rate": 4.314516129032259e-06,
"loss": 0.8136,
"step": 716
},
{
"epoch": 2.868,
"grad_norm": 2.030146598815918,
"learning_rate": 4.306451612903226e-06,
"loss": 0.9544,
"step": 717
},
{
"epoch": 2.872,
"grad_norm": 1.9575273990631104,
"learning_rate": 4.298387096774194e-06,
"loss": 0.8132,
"step": 718
},
{
"epoch": 2.876,
"grad_norm": 2.100543260574341,
"learning_rate": 4.290322580645162e-06,
"loss": 0.8235,
"step": 719
},
{
"epoch": 2.88,
"grad_norm": 2.001739501953125,
"learning_rate": 4.282258064516129e-06,
"loss": 0.8988,
"step": 720
},
{
"epoch": 2.884,
"grad_norm": 1.9895164966583252,
"learning_rate": 4.274193548387097e-06,
"loss": 0.8197,
"step": 721
},
{
"epoch": 2.888,
"grad_norm": 2.17423677444458,
"learning_rate": 4.266129032258065e-06,
"loss": 1.0093,
"step": 722
},
{
"epoch": 2.892,
"grad_norm": 2.095899820327759,
"learning_rate": 4.258064516129032e-06,
"loss": 0.8459,
"step": 723
},
{
"epoch": 2.896,
"grad_norm": 1.9749155044555664,
"learning_rate": 4.25e-06,
"loss": 0.728,
"step": 724
},
{
"epoch": 2.9,
"grad_norm": 2.1287543773651123,
"learning_rate": 4.2419354838709685e-06,
"loss": 0.893,
"step": 725
},
{
"epoch": 2.904,
"grad_norm": 2.2005615234375,
"learning_rate": 4.233870967741936e-06,
"loss": 0.9092,
"step": 726
},
{
"epoch": 2.908,
"grad_norm": 1.9065170288085938,
"learning_rate": 4.225806451612904e-06,
"loss": 0.7934,
"step": 727
},
{
"epoch": 2.912,
"grad_norm": 2.1782727241516113,
"learning_rate": 4.2177419354838714e-06,
"loss": 0.9975,
"step": 728
},
{
"epoch": 2.916,
"grad_norm": 1.943291425704956,
"learning_rate": 4.209677419354839e-06,
"loss": 0.6421,
"step": 729
},
{
"epoch": 2.92,
"grad_norm": 2.0664920806884766,
"learning_rate": 4.201612903225807e-06,
"loss": 0.9832,
"step": 730
},
{
"epoch": 2.924,
"grad_norm": 2.025261163711548,
"learning_rate": 4.193548387096774e-06,
"loss": 0.8664,
"step": 731
},
{
"epoch": 2.928,
"grad_norm": 2.236361503601074,
"learning_rate": 4.185483870967742e-06,
"loss": 0.9429,
"step": 732
},
{
"epoch": 2.932,
"grad_norm": 2.07635235786438,
"learning_rate": 4.17741935483871e-06,
"loss": 0.8334,
"step": 733
},
{
"epoch": 2.936,
"grad_norm": 2.529341697692871,
"learning_rate": 4.169354838709677e-06,
"loss": 1.3918,
"step": 734
},
{
"epoch": 2.94,
"grad_norm": 2.07926869392395,
"learning_rate": 4.161290322580646e-06,
"loss": 0.8891,
"step": 735
},
{
"epoch": 2.944,
"grad_norm": 2.050360679626465,
"learning_rate": 4.1532258064516135e-06,
"loss": 0.6847,
"step": 736
},
{
"epoch": 2.948,
"grad_norm": 1.9370849132537842,
"learning_rate": 4.14516129032258e-06,
"loss": 0.9969,
"step": 737
},
{
"epoch": 2.952,
"grad_norm": 2.014514923095703,
"learning_rate": 4.137096774193549e-06,
"loss": 0.9307,
"step": 738
},
{
"epoch": 2.956,
"grad_norm": 1.977842092514038,
"learning_rate": 4.1290322580645165e-06,
"loss": 0.7776,
"step": 739
},
{
"epoch": 2.96,
"grad_norm": 2.2410061359405518,
"learning_rate": 4.120967741935484e-06,
"loss": 0.7748,
"step": 740
},
{
"epoch": 2.964,
"grad_norm": 2.128809928894043,
"learning_rate": 4.112903225806452e-06,
"loss": 0.9865,
"step": 741
},
{
"epoch": 2.968,
"grad_norm": 2.3587141036987305,
"learning_rate": 4.1048387096774195e-06,
"loss": 1.0742,
"step": 742
},
{
"epoch": 2.972,
"grad_norm": 1.8111485242843628,
"learning_rate": 4.096774193548387e-06,
"loss": 0.7442,
"step": 743
},
{
"epoch": 2.976,
"grad_norm": 1.935282588005066,
"learning_rate": 4.088709677419356e-06,
"loss": 0.785,
"step": 744
},
{
"epoch": 2.98,
"grad_norm": 2.1059017181396484,
"learning_rate": 4.080645161290323e-06,
"loss": 0.9952,
"step": 745
},
{
"epoch": 2.984,
"grad_norm": 1.992270588874817,
"learning_rate": 4.072580645161291e-06,
"loss": 1.0289,
"step": 746
},
{
"epoch": 2.988,
"grad_norm": 2.0285916328430176,
"learning_rate": 4.064516129032259e-06,
"loss": 0.9602,
"step": 747
},
{
"epoch": 2.992,
"grad_norm": 2.2532975673675537,
"learning_rate": 4.056451612903226e-06,
"loss": 0.8776,
"step": 748
},
{
"epoch": 2.996,
"grad_norm": 2.032003879547119,
"learning_rate": 4.048387096774194e-06,
"loss": 0.8821,
"step": 749
},
{
"epoch": 3.0,
"grad_norm": 2.123439311981201,
"learning_rate": 4.0403225806451615e-06,
"loss": 1.1439,
"step": 750
},
{
"epoch": 3.004,
"grad_norm": 1.7719855308532715,
"learning_rate": 4.032258064516129e-06,
"loss": 0.7289,
"step": 751
},
{
"epoch": 3.008,
"grad_norm": 2.1318604946136475,
"learning_rate": 4.024193548387097e-06,
"loss": 0.8148,
"step": 752
},
{
"epoch": 3.012,
"grad_norm": 1.8814866542816162,
"learning_rate": 4.0161290322580645e-06,
"loss": 0.6389,
"step": 753
},
{
"epoch": 3.016,
"grad_norm": 1.8504319190979004,
"learning_rate": 4.008064516129033e-06,
"loss": 0.7059,
"step": 754
},
{
"epoch": 3.02,
"grad_norm": 1.9470055103302002,
"learning_rate": 4.000000000000001e-06,
"loss": 0.8993,
"step": 755
},
{
"epoch": 3.024,
"grad_norm": 2.0780255794525146,
"learning_rate": 3.9919354838709675e-06,
"loss": 0.9513,
"step": 756
},
{
"epoch": 3.028,
"grad_norm": 1.9300411939620972,
"learning_rate": 3.983870967741936e-06,
"loss": 0.9704,
"step": 757
},
{
"epoch": 3.032,
"grad_norm": 2.2161500453948975,
"learning_rate": 3.975806451612904e-06,
"loss": 0.9227,
"step": 758
},
{
"epoch": 3.036,
"grad_norm": 1.9605953693389893,
"learning_rate": 3.967741935483871e-06,
"loss": 0.7131,
"step": 759
},
{
"epoch": 3.04,
"grad_norm": 2.048816442489624,
"learning_rate": 3.959677419354839e-06,
"loss": 0.81,
"step": 760
},
{
"epoch": 3.044,
"grad_norm": 2.2346277236938477,
"learning_rate": 3.951612903225807e-06,
"loss": 1.105,
"step": 761
},
{
"epoch": 3.048,
"grad_norm": 2.1492414474487305,
"learning_rate": 3.943548387096774e-06,
"loss": 0.7878,
"step": 762
},
{
"epoch": 3.052,
"grad_norm": 2.0384480953216553,
"learning_rate": 3.935483870967742e-06,
"loss": 0.9071,
"step": 763
},
{
"epoch": 3.056,
"grad_norm": 1.9908150434494019,
"learning_rate": 3.92741935483871e-06,
"loss": 0.7479,
"step": 764
},
{
"epoch": 3.06,
"grad_norm": 2.333285093307495,
"learning_rate": 3.919354838709677e-06,
"loss": 0.872,
"step": 765
},
{
"epoch": 3.064,
"grad_norm": 2.2055699825286865,
"learning_rate": 3.911290322580646e-06,
"loss": 1.0272,
"step": 766
},
{
"epoch": 3.068,
"grad_norm": 2.220351219177246,
"learning_rate": 3.903225806451613e-06,
"loss": 0.805,
"step": 767
},
{
"epoch": 3.072,
"grad_norm": 2.3372137546539307,
"learning_rate": 3.895161290322581e-06,
"loss": 0.9424,
"step": 768
},
{
"epoch": 3.076,
"grad_norm": 1.9927171468734741,
"learning_rate": 3.887096774193549e-06,
"loss": 0.8291,
"step": 769
},
{
"epoch": 3.08,
"grad_norm": 2.0067288875579834,
"learning_rate": 3.879032258064516e-06,
"loss": 0.7085,
"step": 770
},
{
"epoch": 3.084,
"grad_norm": 2.0726656913757324,
"learning_rate": 3.870967741935484e-06,
"loss": 0.7554,
"step": 771
},
{
"epoch": 3.088,
"grad_norm": 2.2010180950164795,
"learning_rate": 3.862903225806452e-06,
"loss": 0.8685,
"step": 772
},
{
"epoch": 3.092,
"grad_norm": 1.9578466415405273,
"learning_rate": 3.85483870967742e-06,
"loss": 0.7926,
"step": 773
},
{
"epoch": 3.096,
"grad_norm": 2.350506067276001,
"learning_rate": 3.846774193548388e-06,
"loss": 0.8615,
"step": 774
},
{
"epoch": 3.1,
"grad_norm": 2.1344101428985596,
"learning_rate": 3.838709677419355e-06,
"loss": 0.7115,
"step": 775
},
{
"epoch": 3.104,
"grad_norm": 2.144338846206665,
"learning_rate": 3.830645161290323e-06,
"loss": 0.8837,
"step": 776
},
{
"epoch": 3.108,
"grad_norm": 2.0415239334106445,
"learning_rate": 3.822580645161291e-06,
"loss": 0.7773,
"step": 777
},
{
"epoch": 3.112,
"grad_norm": 2.1297361850738525,
"learning_rate": 3.8145161290322584e-06,
"loss": 0.7109,
"step": 778
},
{
"epoch": 3.116,
"grad_norm": 2.2194135189056396,
"learning_rate": 3.8064516129032257e-06,
"loss": 1.0434,
"step": 779
},
{
"epoch": 3.12,
"grad_norm": 2.218269109725952,
"learning_rate": 3.7983870967741937e-06,
"loss": 1.0466,
"step": 780
},
{
"epoch": 3.124,
"grad_norm": 2.140355110168457,
"learning_rate": 3.7903225806451614e-06,
"loss": 0.9349,
"step": 781
},
{
"epoch": 3.128,
"grad_norm": 2.2219574451446533,
"learning_rate": 3.7822580645161295e-06,
"loss": 0.9302,
"step": 782
},
{
"epoch": 3.132,
"grad_norm": 2.2847862243652344,
"learning_rate": 3.774193548387097e-06,
"loss": 0.8846,
"step": 783
},
{
"epoch": 3.136,
"grad_norm": 2.7046589851379395,
"learning_rate": 3.7661290322580648e-06,
"loss": 1.1264,
"step": 784
},
{
"epoch": 3.14,
"grad_norm": 2.2613577842712402,
"learning_rate": 3.7580645161290324e-06,
"loss": 0.8623,
"step": 785
},
{
"epoch": 3.144,
"grad_norm": 2.1418726444244385,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.708,
"step": 786
},
{
"epoch": 3.148,
"grad_norm": 2.306887149810791,
"learning_rate": 3.741935483870968e-06,
"loss": 0.8445,
"step": 787
},
{
"epoch": 3.152,
"grad_norm": 2.23420786857605,
"learning_rate": 3.733870967741936e-06,
"loss": 0.7293,
"step": 788
},
{
"epoch": 3.156,
"grad_norm": 2.2788777351379395,
"learning_rate": 3.7258064516129035e-06,
"loss": 0.8153,
"step": 789
},
{
"epoch": 3.16,
"grad_norm": 1.8298105001449585,
"learning_rate": 3.717741935483871e-06,
"loss": 0.5402,
"step": 790
},
{
"epoch": 3.164,
"grad_norm": 2.0371906757354736,
"learning_rate": 3.7096774193548392e-06,
"loss": 0.8256,
"step": 791
},
{
"epoch": 3.168,
"grad_norm": 2.2342934608459473,
"learning_rate": 3.701612903225807e-06,
"loss": 0.7587,
"step": 792
},
{
"epoch": 3.172,
"grad_norm": 1.977276086807251,
"learning_rate": 3.693548387096775e-06,
"loss": 0.6206,
"step": 793
},
{
"epoch": 3.176,
"grad_norm": 2.0874223709106445,
"learning_rate": 3.685483870967742e-06,
"loss": 0.7768,
"step": 794
},
{
"epoch": 3.18,
"grad_norm": 2.3658487796783447,
"learning_rate": 3.67741935483871e-06,
"loss": 1.0724,
"step": 795
},
{
"epoch": 3.184,
"grad_norm": 2.1156909465789795,
"learning_rate": 3.669354838709678e-06,
"loss": 0.7234,
"step": 796
},
{
"epoch": 3.188,
"grad_norm": 2.087329387664795,
"learning_rate": 3.6612903225806456e-06,
"loss": 0.7502,
"step": 797
},
{
"epoch": 3.192,
"grad_norm": 2.002694606781006,
"learning_rate": 3.653225806451613e-06,
"loss": 0.6678,
"step": 798
},
{
"epoch": 3.196,
"grad_norm": 2.2412350177764893,
"learning_rate": 3.645161290322581e-06,
"loss": 0.8916,
"step": 799
},
{
"epoch": 3.2,
"grad_norm": 2.1156818866729736,
"learning_rate": 3.6370967741935485e-06,
"loss": 0.8201,
"step": 800
},
{
"epoch": 3.204,
"grad_norm": 1.9646267890930176,
"learning_rate": 3.6290322580645166e-06,
"loss": 0.6466,
"step": 801
},
{
"epoch": 3.208,
"grad_norm": 2.306908369064331,
"learning_rate": 3.6209677419354843e-06,
"loss": 0.8978,
"step": 802
},
{
"epoch": 3.212,
"grad_norm": 2.0926380157470703,
"learning_rate": 3.6129032258064515e-06,
"loss": 0.8653,
"step": 803
},
{
"epoch": 3.216,
"grad_norm": 1.9711487293243408,
"learning_rate": 3.6048387096774196e-06,
"loss": 0.6135,
"step": 804
},
{
"epoch": 3.22,
"grad_norm": 1.9564828872680664,
"learning_rate": 3.5967741935483872e-06,
"loss": 0.5758,
"step": 805
},
{
"epoch": 3.224,
"grad_norm": 2.0477702617645264,
"learning_rate": 3.5887096774193553e-06,
"loss": 0.6318,
"step": 806
},
{
"epoch": 3.228,
"grad_norm": 2.380937337875366,
"learning_rate": 3.580645161290323e-06,
"loss": 0.7827,
"step": 807
},
{
"epoch": 3.232,
"grad_norm": 2.2055320739746094,
"learning_rate": 3.5725806451612906e-06,
"loss": 1.0348,
"step": 808
},
{
"epoch": 3.2359999999999998,
"grad_norm": 2.1511037349700928,
"learning_rate": 3.5645161290322583e-06,
"loss": 0.7221,
"step": 809
},
{
"epoch": 3.24,
"grad_norm": 2.1401073932647705,
"learning_rate": 3.5564516129032264e-06,
"loss": 0.7153,
"step": 810
},
{
"epoch": 3.2439999999999998,
"grad_norm": 2.51979660987854,
"learning_rate": 3.548387096774194e-06,
"loss": 0.6526,
"step": 811
},
{
"epoch": 3.248,
"grad_norm": 2.209287643432617,
"learning_rate": 3.5403225806451612e-06,
"loss": 0.9964,
"step": 812
},
{
"epoch": 3.252,
"grad_norm": 1.9091753959655762,
"learning_rate": 3.5322580645161293e-06,
"loss": 0.6887,
"step": 813
},
{
"epoch": 3.2560000000000002,
"grad_norm": 2.0265955924987793,
"learning_rate": 3.524193548387097e-06,
"loss": 0.8468,
"step": 814
},
{
"epoch": 3.26,
"grad_norm": 2.014711618423462,
"learning_rate": 3.516129032258065e-06,
"loss": 0.8604,
"step": 815
},
{
"epoch": 3.2640000000000002,
"grad_norm": 1.97525155544281,
"learning_rate": 3.5080645161290327e-06,
"loss": 0.6563,
"step": 816
},
{
"epoch": 3.268,
"grad_norm": 2.1765522956848145,
"learning_rate": 3.5e-06,
"loss": 0.9257,
"step": 817
},
{
"epoch": 3.2720000000000002,
"grad_norm": 2.2417075634002686,
"learning_rate": 3.491935483870968e-06,
"loss": 0.8021,
"step": 818
},
{
"epoch": 3.276,
"grad_norm": 2.251518487930298,
"learning_rate": 3.4838709677419357e-06,
"loss": 0.7227,
"step": 819
},
{
"epoch": 3.2800000000000002,
"grad_norm": 2.335770606994629,
"learning_rate": 3.4758064516129038e-06,
"loss": 0.8255,
"step": 820
},
{
"epoch": 3.284,
"grad_norm": 2.1436550617218018,
"learning_rate": 3.4677419354838714e-06,
"loss": 0.9514,
"step": 821
},
{
"epoch": 3.288,
"grad_norm": 2.09932804107666,
"learning_rate": 3.4596774193548386e-06,
"loss": 0.5868,
"step": 822
},
{
"epoch": 3.292,
"grad_norm": 2.198049306869507,
"learning_rate": 3.4516129032258067e-06,
"loss": 0.8944,
"step": 823
},
{
"epoch": 3.296,
"grad_norm": 2.0434253215789795,
"learning_rate": 3.4435483870967744e-06,
"loss": 0.8541,
"step": 824
},
{
"epoch": 3.3,
"grad_norm": 2.1087992191314697,
"learning_rate": 3.4354838709677425e-06,
"loss": 0.8743,
"step": 825
},
{
"epoch": 3.304,
"grad_norm": 2.2953779697418213,
"learning_rate": 3.4274193548387097e-06,
"loss": 0.8574,
"step": 826
},
{
"epoch": 3.308,
"grad_norm": 2.283322811126709,
"learning_rate": 3.4193548387096773e-06,
"loss": 0.8048,
"step": 827
},
{
"epoch": 3.312,
"grad_norm": 2.1171464920043945,
"learning_rate": 3.4112903225806454e-06,
"loss": 0.8447,
"step": 828
},
{
"epoch": 3.316,
"grad_norm": 2.184858560562134,
"learning_rate": 3.403225806451613e-06,
"loss": 0.8631,
"step": 829
},
{
"epoch": 3.32,
"grad_norm": 2.0850114822387695,
"learning_rate": 3.395161290322581e-06,
"loss": 0.7252,
"step": 830
},
{
"epoch": 3.324,
"grad_norm": 2.0298373699188232,
"learning_rate": 3.3870967741935484e-06,
"loss": 0.609,
"step": 831
},
{
"epoch": 3.328,
"grad_norm": 1.981493353843689,
"learning_rate": 3.3790322580645165e-06,
"loss": 0.6752,
"step": 832
},
{
"epoch": 3.332,
"grad_norm": 1.9498553276062012,
"learning_rate": 3.370967741935484e-06,
"loss": 0.7262,
"step": 833
},
{
"epoch": 3.336,
"grad_norm": 2.145020008087158,
"learning_rate": 3.362903225806452e-06,
"loss": 0.757,
"step": 834
},
{
"epoch": 3.34,
"grad_norm": 2.355727434158325,
"learning_rate": 3.35483870967742e-06,
"loss": 0.982,
"step": 835
},
{
"epoch": 3.344,
"grad_norm": 2.1901140213012695,
"learning_rate": 3.346774193548387e-06,
"loss": 1.0534,
"step": 836
},
{
"epoch": 3.348,
"grad_norm": 2.0761780738830566,
"learning_rate": 3.338709677419355e-06,
"loss": 0.668,
"step": 837
},
{
"epoch": 3.352,
"grad_norm": 1.9439619779586792,
"learning_rate": 3.330645161290323e-06,
"loss": 0.6646,
"step": 838
},
{
"epoch": 3.356,
"grad_norm": 2.3940210342407227,
"learning_rate": 3.322580645161291e-06,
"loss": 0.8633,
"step": 839
},
{
"epoch": 3.36,
"grad_norm": 2.194489002227783,
"learning_rate": 3.3145161290322586e-06,
"loss": 0.7908,
"step": 840
},
{
"epoch": 3.364,
"grad_norm": 2.1915009021759033,
"learning_rate": 3.306451612903226e-06,
"loss": 0.7874,
"step": 841
},
{
"epoch": 3.368,
"grad_norm": 2.230459451675415,
"learning_rate": 3.298387096774194e-06,
"loss": 0.8457,
"step": 842
},
{
"epoch": 3.372,
"grad_norm": 1.936184048652649,
"learning_rate": 3.2903225806451615e-06,
"loss": 0.5494,
"step": 843
},
{
"epoch": 3.376,
"grad_norm": 1.977290391921997,
"learning_rate": 3.2822580645161296e-06,
"loss": 0.5436,
"step": 844
},
{
"epoch": 3.38,
"grad_norm": 1.9874720573425293,
"learning_rate": 3.274193548387097e-06,
"loss": 0.6058,
"step": 845
},
{
"epoch": 3.384,
"grad_norm": 2.178884983062744,
"learning_rate": 3.2661290322580645e-06,
"loss": 0.786,
"step": 846
},
{
"epoch": 3.388,
"grad_norm": 2.1946234703063965,
"learning_rate": 3.2580645161290326e-06,
"loss": 0.8966,
"step": 847
},
{
"epoch": 3.392,
"grad_norm": 2.4029574394226074,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.8739,
"step": 848
},
{
"epoch": 3.396,
"grad_norm": 2.372209072113037,
"learning_rate": 3.2419354838709683e-06,
"loss": 0.8783,
"step": 849
},
{
"epoch": 3.4,
"grad_norm": 2.1974384784698486,
"learning_rate": 3.2338709677419355e-06,
"loss": 0.7697,
"step": 850
},
{
"epoch": 3.404,
"grad_norm": 2.1393015384674072,
"learning_rate": 3.225806451612903e-06,
"loss": 0.7115,
"step": 851
},
{
"epoch": 3.408,
"grad_norm": 2.1827282905578613,
"learning_rate": 3.2177419354838713e-06,
"loss": 0.6953,
"step": 852
},
{
"epoch": 3.412,
"grad_norm": 2.194448709487915,
"learning_rate": 3.209677419354839e-06,
"loss": 0.7056,
"step": 853
},
{
"epoch": 3.416,
"grad_norm": 2.071216106414795,
"learning_rate": 3.201612903225807e-06,
"loss": 0.6491,
"step": 854
},
{
"epoch": 3.42,
"grad_norm": 2.0821046829223633,
"learning_rate": 3.1935483870967742e-06,
"loss": 0.7056,
"step": 855
},
{
"epoch": 3.424,
"grad_norm": 2.3360350131988525,
"learning_rate": 3.1854838709677423e-06,
"loss": 0.8335,
"step": 856
},
{
"epoch": 3.428,
"grad_norm": 2.363644599914551,
"learning_rate": 3.17741935483871e-06,
"loss": 0.9493,
"step": 857
},
{
"epoch": 3.432,
"grad_norm": 2.0370776653289795,
"learning_rate": 3.169354838709678e-06,
"loss": 0.6045,
"step": 858
},
{
"epoch": 3.436,
"grad_norm": 1.980400562286377,
"learning_rate": 3.1612903225806453e-06,
"loss": 0.7546,
"step": 859
},
{
"epoch": 3.44,
"grad_norm": 2.0285377502441406,
"learning_rate": 3.153225806451613e-06,
"loss": 0.7595,
"step": 860
},
{
"epoch": 3.444,
"grad_norm": 2.0669169425964355,
"learning_rate": 3.145161290322581e-06,
"loss": 0.7445,
"step": 861
},
{
"epoch": 3.448,
"grad_norm": 2.302081346511841,
"learning_rate": 3.1370967741935487e-06,
"loss": 0.8515,
"step": 862
},
{
"epoch": 3.452,
"grad_norm": 2.183147668838501,
"learning_rate": 3.1290322580645167e-06,
"loss": 0.839,
"step": 863
},
{
"epoch": 3.456,
"grad_norm": 2.367276191711426,
"learning_rate": 3.120967741935484e-06,
"loss": 0.8589,
"step": 864
},
{
"epoch": 3.46,
"grad_norm": 2.2339820861816406,
"learning_rate": 3.1129032258064516e-06,
"loss": 0.756,
"step": 865
},
{
"epoch": 3.464,
"grad_norm": 2.2325429916381836,
"learning_rate": 3.1048387096774197e-06,
"loss": 0.7331,
"step": 866
},
{
"epoch": 3.468,
"grad_norm": 1.9906346797943115,
"learning_rate": 3.0967741935483874e-06,
"loss": 0.6628,
"step": 867
},
{
"epoch": 3.472,
"grad_norm": 1.958089828491211,
"learning_rate": 3.0887096774193554e-06,
"loss": 0.5923,
"step": 868
},
{
"epoch": 3.476,
"grad_norm": 2.2247402667999268,
"learning_rate": 3.0806451612903227e-06,
"loss": 0.8411,
"step": 869
},
{
"epoch": 3.48,
"grad_norm": 2.3002967834472656,
"learning_rate": 3.0725806451612903e-06,
"loss": 0.9978,
"step": 870
},
{
"epoch": 3.484,
"grad_norm": 2.2606053352355957,
"learning_rate": 3.0645161290322584e-06,
"loss": 0.9531,
"step": 871
},
{
"epoch": 3.488,
"grad_norm": 2.662911891937256,
"learning_rate": 3.056451612903226e-06,
"loss": 1.014,
"step": 872
},
{
"epoch": 3.492,
"grad_norm": 2.019650936126709,
"learning_rate": 3.0483870967741937e-06,
"loss": 0.6297,
"step": 873
},
{
"epoch": 3.496,
"grad_norm": 2.097616195678711,
"learning_rate": 3.0403225806451614e-06,
"loss": 0.7289,
"step": 874
},
{
"epoch": 3.5,
"grad_norm": 2.0663158893585205,
"learning_rate": 3.0322580645161295e-06,
"loss": 0.602,
"step": 875
},
{
"epoch": 3.504,
"grad_norm": 2.182816505432129,
"learning_rate": 3.024193548387097e-06,
"loss": 0.851,
"step": 876
},
{
"epoch": 3.508,
"grad_norm": 2.0033254623413086,
"learning_rate": 3.0161290322580648e-06,
"loss": 0.538,
"step": 877
},
{
"epoch": 3.512,
"grad_norm": 2.444193124771118,
"learning_rate": 3.0080645161290324e-06,
"loss": 0.8755,
"step": 878
},
{
"epoch": 3.516,
"grad_norm": 2.156662940979004,
"learning_rate": 3e-06,
"loss": 0.866,
"step": 879
},
{
"epoch": 3.52,
"grad_norm": 2.0604827404022217,
"learning_rate": 2.991935483870968e-06,
"loss": 0.7635,
"step": 880
},
{
"epoch": 3.524,
"grad_norm": 2.1980183124542236,
"learning_rate": 2.983870967741936e-06,
"loss": 0.9103,
"step": 881
},
{
"epoch": 3.528,
"grad_norm": 2.5254287719726562,
"learning_rate": 2.975806451612904e-06,
"loss": 1.0833,
"step": 882
},
{
"epoch": 3.532,
"grad_norm": 2.141465902328491,
"learning_rate": 2.967741935483871e-06,
"loss": 0.667,
"step": 883
},
{
"epoch": 3.536,
"grad_norm": 2.2055859565734863,
"learning_rate": 2.9596774193548388e-06,
"loss": 0.6874,
"step": 884
},
{
"epoch": 3.54,
"grad_norm": 2.136110544204712,
"learning_rate": 2.951612903225807e-06,
"loss": 0.7897,
"step": 885
},
{
"epoch": 3.544,
"grad_norm": 2.2698121070861816,
"learning_rate": 2.9435483870967745e-06,
"loss": 0.7304,
"step": 886
},
{
"epoch": 3.548,
"grad_norm": 2.3062589168548584,
"learning_rate": 2.9354838709677417e-06,
"loss": 1.0791,
"step": 887
},
{
"epoch": 3.552,
"grad_norm": 2.4121716022491455,
"learning_rate": 2.92741935483871e-06,
"loss": 0.9852,
"step": 888
},
{
"epoch": 3.556,
"grad_norm": 2.3910701274871826,
"learning_rate": 2.9193548387096775e-06,
"loss": 0.9276,
"step": 889
},
{
"epoch": 3.56,
"grad_norm": 2.2537155151367188,
"learning_rate": 2.9112903225806456e-06,
"loss": 0.9438,
"step": 890
},
{
"epoch": 3.564,
"grad_norm": 2.2412519454956055,
"learning_rate": 2.903225806451613e-06,
"loss": 0.6798,
"step": 891
},
{
"epoch": 3.568,
"grad_norm": 2.003661632537842,
"learning_rate": 2.8951612903225804e-06,
"loss": 0.6765,
"step": 892
},
{
"epoch": 3.572,
"grad_norm": 2.2614223957061768,
"learning_rate": 2.8870967741935485e-06,
"loss": 0.871,
"step": 893
},
{
"epoch": 3.576,
"grad_norm": 2.2518208026885986,
"learning_rate": 2.879032258064516e-06,
"loss": 0.829,
"step": 894
},
{
"epoch": 3.58,
"grad_norm": 2.1910247802734375,
"learning_rate": 2.8709677419354843e-06,
"loss": 0.7672,
"step": 895
},
{
"epoch": 3.584,
"grad_norm": 2.079641580581665,
"learning_rate": 2.862903225806452e-06,
"loss": 0.8007,
"step": 896
},
{
"epoch": 3.588,
"grad_norm": 2.3112521171569824,
"learning_rate": 2.8548387096774196e-06,
"loss": 0.8943,
"step": 897
},
{
"epoch": 3.592,
"grad_norm": 2.2150983810424805,
"learning_rate": 2.8467741935483872e-06,
"loss": 0.8935,
"step": 898
},
{
"epoch": 3.596,
"grad_norm": 2.2401678562164307,
"learning_rate": 2.8387096774193553e-06,
"loss": 0.834,
"step": 899
},
{
"epoch": 3.6,
"grad_norm": 2.1043715476989746,
"learning_rate": 2.830645161290323e-06,
"loss": 0.7918,
"step": 900
},
{
"epoch": 3.604,
"grad_norm": 2.1325125694274902,
"learning_rate": 2.822580645161291e-06,
"loss": 0.7057,
"step": 901
},
{
"epoch": 3.608,
"grad_norm": 2.4012649059295654,
"learning_rate": 2.8145161290322583e-06,
"loss": 0.856,
"step": 902
},
{
"epoch": 3.612,
"grad_norm": 2.29689884185791,
"learning_rate": 2.806451612903226e-06,
"loss": 0.6157,
"step": 903
},
{
"epoch": 3.616,
"grad_norm": 2.371718168258667,
"learning_rate": 2.798387096774194e-06,
"loss": 0.7604,
"step": 904
},
{
"epoch": 3.62,
"grad_norm": 2.0902979373931885,
"learning_rate": 2.7903225806451617e-06,
"loss": 0.639,
"step": 905
},
{
"epoch": 3.624,
"grad_norm": 2.0452613830566406,
"learning_rate": 2.782258064516129e-06,
"loss": 0.6567,
"step": 906
},
{
"epoch": 3.628,
"grad_norm": 2.4750518798828125,
"learning_rate": 2.774193548387097e-06,
"loss": 0.9825,
"step": 907
},
{
"epoch": 3.632,
"grad_norm": 2.1596009731292725,
"learning_rate": 2.7661290322580646e-06,
"loss": 0.7421,
"step": 908
},
{
"epoch": 3.636,
"grad_norm": 2.1303629875183105,
"learning_rate": 2.7580645161290327e-06,
"loss": 0.7502,
"step": 909
},
{
"epoch": 3.64,
"grad_norm": 2.1187257766723633,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.7001,
"step": 910
},
{
"epoch": 3.644,
"grad_norm": 2.2745378017425537,
"learning_rate": 2.7419354838709676e-06,
"loss": 1.0053,
"step": 911
},
{
"epoch": 3.648,
"grad_norm": 2.392577886581421,
"learning_rate": 2.7338709677419357e-06,
"loss": 0.842,
"step": 912
},
{
"epoch": 3.652,
"grad_norm": 2.352764844894409,
"learning_rate": 2.7258064516129033e-06,
"loss": 0.8263,
"step": 913
},
{
"epoch": 3.656,
"grad_norm": 2.2857632637023926,
"learning_rate": 2.7177419354838714e-06,
"loss": 0.9502,
"step": 914
},
{
"epoch": 3.66,
"grad_norm": 2.1214029788970947,
"learning_rate": 2.709677419354839e-06,
"loss": 0.7304,
"step": 915
},
{
"epoch": 3.664,
"grad_norm": 2.1727020740509033,
"learning_rate": 2.7016129032258063e-06,
"loss": 0.7385,
"step": 916
},
{
"epoch": 3.668,
"grad_norm": 2.2126569747924805,
"learning_rate": 2.6935483870967744e-06,
"loss": 0.8303,
"step": 917
},
{
"epoch": 3.672,
"grad_norm": 2.3064305782318115,
"learning_rate": 2.685483870967742e-06,
"loss": 0.9337,
"step": 918
},
{
"epoch": 3.676,
"grad_norm": 2.20000958442688,
"learning_rate": 2.67741935483871e-06,
"loss": 0.8639,
"step": 919
},
{
"epoch": 3.68,
"grad_norm": 2.3370859622955322,
"learning_rate": 2.6693548387096773e-06,
"loss": 0.9072,
"step": 920
},
{
"epoch": 3.684,
"grad_norm": 2.3687403202056885,
"learning_rate": 2.6612903225806454e-06,
"loss": 0.9368,
"step": 921
},
{
"epoch": 3.6879999999999997,
"grad_norm": 2.11373233795166,
"learning_rate": 2.653225806451613e-06,
"loss": 0.6405,
"step": 922
},
{
"epoch": 3.692,
"grad_norm": 2.117852210998535,
"learning_rate": 2.645161290322581e-06,
"loss": 0.9027,
"step": 923
},
{
"epoch": 3.6959999999999997,
"grad_norm": 2.4241926670074463,
"learning_rate": 2.637096774193549e-06,
"loss": 0.9222,
"step": 924
},
{
"epoch": 3.7,
"grad_norm": 2.1808900833129883,
"learning_rate": 2.629032258064516e-06,
"loss": 0.6647,
"step": 925
},
{
"epoch": 3.7039999999999997,
"grad_norm": 2.2304883003234863,
"learning_rate": 2.620967741935484e-06,
"loss": 0.8719,
"step": 926
},
{
"epoch": 3.708,
"grad_norm": 2.160715341567993,
"learning_rate": 2.6129032258064518e-06,
"loss": 0.838,
"step": 927
},
{
"epoch": 3.7119999999999997,
"grad_norm": 2.529524803161621,
"learning_rate": 2.60483870967742e-06,
"loss": 1.0631,
"step": 928
},
{
"epoch": 3.716,
"grad_norm": 2.2674691677093506,
"learning_rate": 2.5967741935483875e-06,
"loss": 0.704,
"step": 929
},
{
"epoch": 3.7199999999999998,
"grad_norm": 2.225947380065918,
"learning_rate": 2.5887096774193547e-06,
"loss": 0.6462,
"step": 930
},
{
"epoch": 3.724,
"grad_norm": 2.0101845264434814,
"learning_rate": 2.580645161290323e-06,
"loss": 0.6034,
"step": 931
},
{
"epoch": 3.7279999999999998,
"grad_norm": 2.166468858718872,
"learning_rate": 2.5725806451612905e-06,
"loss": 0.7711,
"step": 932
},
{
"epoch": 3.732,
"grad_norm": 2.0121777057647705,
"learning_rate": 2.5645161290322585e-06,
"loss": 0.7086,
"step": 933
},
{
"epoch": 3.7359999999999998,
"grad_norm": 2.2757959365844727,
"learning_rate": 2.5564516129032258e-06,
"loss": 0.8797,
"step": 934
},
{
"epoch": 3.74,
"grad_norm": 2.2584173679351807,
"learning_rate": 2.5483870967741934e-06,
"loss": 0.8662,
"step": 935
},
{
"epoch": 3.7439999999999998,
"grad_norm": 2.009277105331421,
"learning_rate": 2.5403225806451615e-06,
"loss": 0.5832,
"step": 936
},
{
"epoch": 3.748,
"grad_norm": 2.0112719535827637,
"learning_rate": 2.532258064516129e-06,
"loss": 0.7917,
"step": 937
},
{
"epoch": 3.752,
"grad_norm": 2.2253365516662598,
"learning_rate": 2.5241935483870972e-06,
"loss": 0.8194,
"step": 938
},
{
"epoch": 3.7560000000000002,
"grad_norm": 2.218528985977173,
"learning_rate": 2.5161290322580645e-06,
"loss": 0.6909,
"step": 939
},
{
"epoch": 3.76,
"grad_norm": 2.150059938430786,
"learning_rate": 2.5080645161290325e-06,
"loss": 0.7104,
"step": 940
},
{
"epoch": 3.7640000000000002,
"grad_norm": 2.401381254196167,
"learning_rate": 2.5e-06,
"loss": 0.9667,
"step": 941
},
{
"epoch": 3.768,
"grad_norm": 2.2755489349365234,
"learning_rate": 2.491935483870968e-06,
"loss": 0.7664,
"step": 942
},
{
"epoch": 3.7720000000000002,
"grad_norm": 2.206324577331543,
"learning_rate": 2.4838709677419355e-06,
"loss": 1.0714,
"step": 943
},
{
"epoch": 3.776,
"grad_norm": 2.0583205223083496,
"learning_rate": 2.4758064516129036e-06,
"loss": 0.8074,
"step": 944
},
{
"epoch": 3.7800000000000002,
"grad_norm": 2.0630557537078857,
"learning_rate": 2.4677419354838712e-06,
"loss": 0.7139,
"step": 945
},
{
"epoch": 3.784,
"grad_norm": 1.9153733253479004,
"learning_rate": 2.459677419354839e-06,
"loss": 0.6087,
"step": 946
},
{
"epoch": 3.7880000000000003,
"grad_norm": 2.1446893215179443,
"learning_rate": 2.4516129032258066e-06,
"loss": 0.651,
"step": 947
},
{
"epoch": 3.792,
"grad_norm": 2.325532913208008,
"learning_rate": 2.4435483870967742e-06,
"loss": 0.9991,
"step": 948
},
{
"epoch": 3.7960000000000003,
"grad_norm": 2.2939658164978027,
"learning_rate": 2.4354838709677423e-06,
"loss": 0.9107,
"step": 949
},
{
"epoch": 3.8,
"grad_norm": 1.944509506225586,
"learning_rate": 2.42741935483871e-06,
"loss": 0.677,
"step": 950
},
{
"epoch": 3.8040000000000003,
"grad_norm": 2.1296355724334717,
"learning_rate": 2.4193548387096776e-06,
"loss": 0.6326,
"step": 951
},
{
"epoch": 3.808,
"grad_norm": 2.449873447418213,
"learning_rate": 2.4112903225806453e-06,
"loss": 1.0493,
"step": 952
},
{
"epoch": 3.8120000000000003,
"grad_norm": 2.2936856746673584,
"learning_rate": 2.4032258064516133e-06,
"loss": 0.751,
"step": 953
},
{
"epoch": 3.816,
"grad_norm": 2.239792823791504,
"learning_rate": 2.3951612903225806e-06,
"loss": 0.8381,
"step": 954
},
{
"epoch": 3.82,
"grad_norm": 2.564072608947754,
"learning_rate": 2.3870967741935486e-06,
"loss": 0.9443,
"step": 955
},
{
"epoch": 3.824,
"grad_norm": 2.301388740539551,
"learning_rate": 2.3790322580645163e-06,
"loss": 0.9068,
"step": 956
},
{
"epoch": 3.828,
"grad_norm": 2.4470112323760986,
"learning_rate": 2.370967741935484e-06,
"loss": 0.834,
"step": 957
},
{
"epoch": 3.832,
"grad_norm": 2.2462210655212402,
"learning_rate": 2.362903225806452e-06,
"loss": 0.8344,
"step": 958
},
{
"epoch": 3.836,
"grad_norm": 2.14129900932312,
"learning_rate": 2.3548387096774193e-06,
"loss": 0.7379,
"step": 959
},
{
"epoch": 3.84,
"grad_norm": 2.263746976852417,
"learning_rate": 2.3467741935483873e-06,
"loss": 0.8391,
"step": 960
},
{
"epoch": 3.844,
"grad_norm": 2.0509262084960938,
"learning_rate": 2.338709677419355e-06,
"loss": 0.7171,
"step": 961
},
{
"epoch": 3.848,
"grad_norm": 2.1760873794555664,
"learning_rate": 2.3306451612903227e-06,
"loss": 0.8904,
"step": 962
},
{
"epoch": 3.852,
"grad_norm": 2.5136802196502686,
"learning_rate": 2.3225806451612907e-06,
"loss": 0.9318,
"step": 963
},
{
"epoch": 3.856,
"grad_norm": 2.2124810218811035,
"learning_rate": 2.3145161290322584e-06,
"loss": 0.8245,
"step": 964
},
{
"epoch": 3.86,
"grad_norm": 2.1728949546813965,
"learning_rate": 2.306451612903226e-06,
"loss": 0.8638,
"step": 965
},
{
"epoch": 3.864,
"grad_norm": 2.495039939880371,
"learning_rate": 2.2983870967741937e-06,
"loss": 0.9993,
"step": 966
},
{
"epoch": 3.868,
"grad_norm": 2.2015881538391113,
"learning_rate": 2.2903225806451614e-06,
"loss": 0.7732,
"step": 967
},
{
"epoch": 3.872,
"grad_norm": 2.163741111755371,
"learning_rate": 2.2822580645161294e-06,
"loss": 0.759,
"step": 968
},
{
"epoch": 3.876,
"grad_norm": 2.3122260570526123,
"learning_rate": 2.274193548387097e-06,
"loss": 1.0268,
"step": 969
},
{
"epoch": 3.88,
"grad_norm": 2.0754685401916504,
"learning_rate": 2.2661290322580647e-06,
"loss": 0.624,
"step": 970
},
{
"epoch": 3.884,
"grad_norm": 2.128565549850464,
"learning_rate": 2.2580645161290324e-06,
"loss": 0.7533,
"step": 971
},
{
"epoch": 3.888,
"grad_norm": 2.1690149307250977,
"learning_rate": 2.25e-06,
"loss": 0.6677,
"step": 972
},
{
"epoch": 3.892,
"grad_norm": 2.17118239402771,
"learning_rate": 2.2419354838709677e-06,
"loss": 0.7237,
"step": 973
},
{
"epoch": 3.896,
"grad_norm": 2.3318235874176025,
"learning_rate": 2.233870967741936e-06,
"loss": 0.7337,
"step": 974
},
{
"epoch": 3.9,
"grad_norm": 2.1360106468200684,
"learning_rate": 2.2258064516129034e-06,
"loss": 0.8524,
"step": 975
},
{
"epoch": 3.904,
"grad_norm": 2.201362371444702,
"learning_rate": 2.217741935483871e-06,
"loss": 0.8545,
"step": 976
},
{
"epoch": 3.908,
"grad_norm": 2.2761240005493164,
"learning_rate": 2.209677419354839e-06,
"loss": 0.8508,
"step": 977
},
{
"epoch": 3.912,
"grad_norm": 1.9397066831588745,
"learning_rate": 2.2016129032258064e-06,
"loss": 0.5631,
"step": 978
},
{
"epoch": 3.916,
"grad_norm": 2.3536376953125,
"learning_rate": 2.1935483870967745e-06,
"loss": 0.9373,
"step": 979
},
{
"epoch": 3.92,
"grad_norm": 2.205050230026245,
"learning_rate": 2.185483870967742e-06,
"loss": 0.7274,
"step": 980
},
{
"epoch": 3.924,
"grad_norm": 2.0966451168060303,
"learning_rate": 2.17741935483871e-06,
"loss": 0.5711,
"step": 981
},
{
"epoch": 3.928,
"grad_norm": 2.1973869800567627,
"learning_rate": 2.169354838709678e-06,
"loss": 0.8734,
"step": 982
},
{
"epoch": 3.932,
"grad_norm": 2.3555471897125244,
"learning_rate": 2.161290322580645e-06,
"loss": 0.9951,
"step": 983
},
{
"epoch": 3.936,
"grad_norm": 2.1399526596069336,
"learning_rate": 2.153225806451613e-06,
"loss": 0.8126,
"step": 984
},
{
"epoch": 3.94,
"grad_norm": 2.199490785598755,
"learning_rate": 2.145161290322581e-06,
"loss": 0.9233,
"step": 985
},
{
"epoch": 3.944,
"grad_norm": 2.1005666255950928,
"learning_rate": 2.1370967741935485e-06,
"loss": 0.6775,
"step": 986
},
{
"epoch": 3.948,
"grad_norm": 2.1148951053619385,
"learning_rate": 2.129032258064516e-06,
"loss": 0.621,
"step": 987
},
{
"epoch": 3.952,
"grad_norm": 2.326742649078369,
"learning_rate": 2.1209677419354842e-06,
"loss": 0.7486,
"step": 988
},
{
"epoch": 3.956,
"grad_norm": 2.4045190811157227,
"learning_rate": 2.112903225806452e-06,
"loss": 0.8569,
"step": 989
},
{
"epoch": 3.96,
"grad_norm": 2.2377429008483887,
"learning_rate": 2.1048387096774195e-06,
"loss": 0.8598,
"step": 990
},
{
"epoch": 3.964,
"grad_norm": 2.4663689136505127,
"learning_rate": 2.096774193548387e-06,
"loss": 0.8422,
"step": 991
},
{
"epoch": 3.968,
"grad_norm": 2.2024405002593994,
"learning_rate": 2.088709677419355e-06,
"loss": 0.7979,
"step": 992
},
{
"epoch": 3.972,
"grad_norm": 2.1740357875823975,
"learning_rate": 2.080645161290323e-06,
"loss": 0.6501,
"step": 993
},
{
"epoch": 3.976,
"grad_norm": 2.392676830291748,
"learning_rate": 2.07258064516129e-06,
"loss": 0.9717,
"step": 994
},
{
"epoch": 3.98,
"grad_norm": 2.417933225631714,
"learning_rate": 2.0645161290322582e-06,
"loss": 0.9765,
"step": 995
},
{
"epoch": 3.984,
"grad_norm": 2.3684704303741455,
"learning_rate": 2.056451612903226e-06,
"loss": 0.7818,
"step": 996
},
{
"epoch": 3.988,
"grad_norm": 2.265115737915039,
"learning_rate": 2.0483870967741936e-06,
"loss": 0.9738,
"step": 997
},
{
"epoch": 3.992,
"grad_norm": 2.0875535011291504,
"learning_rate": 2.0403225806451616e-06,
"loss": 0.6718,
"step": 998
},
{
"epoch": 3.996,
"grad_norm": 2.160306453704834,
"learning_rate": 2.0322580645161293e-06,
"loss": 0.647,
"step": 999
},
{
"epoch": 4.0,
"grad_norm": 2.2271740436553955,
"learning_rate": 2.024193548387097e-06,
"loss": 0.926,
"step": 1000
},
{
"epoch": 4.004,
"grad_norm": 2.406463623046875,
"learning_rate": 2.0161290322580646e-06,
"loss": 1.0294,
"step": 1001
},
{
"epoch": 4.008,
"grad_norm": 2.06563401222229,
"learning_rate": 2.0080645161290323e-06,
"loss": 0.7166,
"step": 1002
},
{
"epoch": 4.012,
"grad_norm": 2.054182529449463,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.7667,
"step": 1003
},
{
"epoch": 4.016,
"grad_norm": 2.4627397060394287,
"learning_rate": 1.991935483870968e-06,
"loss": 0.6641,
"step": 1004
},
{
"epoch": 4.02,
"grad_norm": 2.3262436389923096,
"learning_rate": 1.9838709677419356e-06,
"loss": 0.9825,
"step": 1005
},
{
"epoch": 4.024,
"grad_norm": 2.2590932846069336,
"learning_rate": 1.9758064516129033e-06,
"loss": 0.7796,
"step": 1006
},
{
"epoch": 4.028,
"grad_norm": 1.988510012626648,
"learning_rate": 1.967741935483871e-06,
"loss": 0.7545,
"step": 1007
},
{
"epoch": 4.032,
"grad_norm": 2.131132125854492,
"learning_rate": 1.9596774193548386e-06,
"loss": 0.7807,
"step": 1008
},
{
"epoch": 4.036,
"grad_norm": 2.2927346229553223,
"learning_rate": 1.9516129032258067e-06,
"loss": 0.9807,
"step": 1009
},
{
"epoch": 4.04,
"grad_norm": 2.2677903175354004,
"learning_rate": 1.9435483870967743e-06,
"loss": 0.6969,
"step": 1010
},
{
"epoch": 4.044,
"grad_norm": 2.0686841011047363,
"learning_rate": 1.935483870967742e-06,
"loss": 0.5789,
"step": 1011
},
{
"epoch": 4.048,
"grad_norm": 2.2940640449523926,
"learning_rate": 1.92741935483871e-06,
"loss": 0.6927,
"step": 1012
},
{
"epoch": 4.052,
"grad_norm": 2.3742549419403076,
"learning_rate": 1.9193548387096773e-06,
"loss": 0.7842,
"step": 1013
},
{
"epoch": 4.056,
"grad_norm": 2.1507906913757324,
"learning_rate": 1.9112903225806454e-06,
"loss": 0.777,
"step": 1014
},
{
"epoch": 4.06,
"grad_norm": 2.125037670135498,
"learning_rate": 1.9032258064516128e-06,
"loss": 0.715,
"step": 1015
},
{
"epoch": 4.064,
"grad_norm": 2.556920289993286,
"learning_rate": 1.8951612903225807e-06,
"loss": 0.9831,
"step": 1016
},
{
"epoch": 4.068,
"grad_norm": 2.2890443801879883,
"learning_rate": 1.8870967741935486e-06,
"loss": 0.8791,
"step": 1017
},
{
"epoch": 4.072,
"grad_norm": 2.3837790489196777,
"learning_rate": 1.8790322580645162e-06,
"loss": 0.7557,
"step": 1018
},
{
"epoch": 4.076,
"grad_norm": 2.2448434829711914,
"learning_rate": 1.870967741935484e-06,
"loss": 0.6021,
"step": 1019
},
{
"epoch": 4.08,
"grad_norm": 2.1938180923461914,
"learning_rate": 1.8629032258064517e-06,
"loss": 0.5916,
"step": 1020
},
{
"epoch": 4.084,
"grad_norm": 2.184678792953491,
"learning_rate": 1.8548387096774196e-06,
"loss": 0.6774,
"step": 1021
},
{
"epoch": 4.088,
"grad_norm": 2.091639518737793,
"learning_rate": 1.8467741935483875e-06,
"loss": 0.5811,
"step": 1022
},
{
"epoch": 4.092,
"grad_norm": 2.2382893562316895,
"learning_rate": 1.838709677419355e-06,
"loss": 0.7811,
"step": 1023
},
{
"epoch": 4.096,
"grad_norm": 2.4816744327545166,
"learning_rate": 1.8306451612903228e-06,
"loss": 0.7529,
"step": 1024
},
{
"epoch": 4.1,
"grad_norm": 2.398247718811035,
"learning_rate": 1.8225806451612904e-06,
"loss": 0.7077,
"step": 1025
},
{
"epoch": 4.104,
"grad_norm": 2.3608181476593018,
"learning_rate": 1.8145161290322583e-06,
"loss": 0.6565,
"step": 1026
},
{
"epoch": 4.108,
"grad_norm": 2.1144907474517822,
"learning_rate": 1.8064516129032258e-06,
"loss": 0.5724,
"step": 1027
},
{
"epoch": 4.112,
"grad_norm": 2.3261606693267822,
"learning_rate": 1.7983870967741936e-06,
"loss": 0.6656,
"step": 1028
},
{
"epoch": 4.116,
"grad_norm": 2.0781667232513428,
"learning_rate": 1.7903225806451615e-06,
"loss": 0.7156,
"step": 1029
},
{
"epoch": 4.12,
"grad_norm": 2.208777666091919,
"learning_rate": 1.7822580645161291e-06,
"loss": 0.7395,
"step": 1030
},
{
"epoch": 4.124,
"grad_norm": 2.323026180267334,
"learning_rate": 1.774193548387097e-06,
"loss": 0.6843,
"step": 1031
},
{
"epoch": 4.128,
"grad_norm": 2.2553014755249023,
"learning_rate": 1.7661290322580647e-06,
"loss": 0.7674,
"step": 1032
},
{
"epoch": 4.132,
"grad_norm": 2.1738624572753906,
"learning_rate": 1.7580645161290325e-06,
"loss": 0.7363,
"step": 1033
},
{
"epoch": 4.136,
"grad_norm": 2.306398630142212,
"learning_rate": 1.75e-06,
"loss": 0.8493,
"step": 1034
},
{
"epoch": 4.14,
"grad_norm": 2.269899845123291,
"learning_rate": 1.7419354838709678e-06,
"loss": 0.7735,
"step": 1035
},
{
"epoch": 4.144,
"grad_norm": 2.2679085731506348,
"learning_rate": 1.7338709677419357e-06,
"loss": 0.6658,
"step": 1036
},
{
"epoch": 4.148,
"grad_norm": 2.300226926803589,
"learning_rate": 1.7258064516129034e-06,
"loss": 0.7193,
"step": 1037
},
{
"epoch": 4.152,
"grad_norm": 2.3566718101501465,
"learning_rate": 1.7177419354838712e-06,
"loss": 0.786,
"step": 1038
},
{
"epoch": 4.156,
"grad_norm": 2.0547354221343994,
"learning_rate": 1.7096774193548387e-06,
"loss": 0.5577,
"step": 1039
},
{
"epoch": 4.16,
"grad_norm": 2.0804097652435303,
"learning_rate": 1.7016129032258065e-06,
"loss": 0.5129,
"step": 1040
},
{
"epoch": 4.164,
"grad_norm": 2.2978291511535645,
"learning_rate": 1.6935483870967742e-06,
"loss": 0.7825,
"step": 1041
},
{
"epoch": 4.168,
"grad_norm": 2.329310894012451,
"learning_rate": 1.685483870967742e-06,
"loss": 0.8602,
"step": 1042
},
{
"epoch": 4.172,
"grad_norm": 2.0429697036743164,
"learning_rate": 1.67741935483871e-06,
"loss": 0.6153,
"step": 1043
},
{
"epoch": 4.176,
"grad_norm": 2.251570463180542,
"learning_rate": 1.6693548387096776e-06,
"loss": 0.7544,
"step": 1044
},
{
"epoch": 4.18,
"grad_norm": 2.3504586219787598,
"learning_rate": 1.6612903225806455e-06,
"loss": 0.7068,
"step": 1045
},
{
"epoch": 4.184,
"grad_norm": 2.4925265312194824,
"learning_rate": 1.653225806451613e-06,
"loss": 0.8326,
"step": 1046
},
{
"epoch": 4.188,
"grad_norm": 2.126401424407959,
"learning_rate": 1.6451612903225808e-06,
"loss": 0.6162,
"step": 1047
},
{
"epoch": 4.192,
"grad_norm": 2.3744008541107178,
"learning_rate": 1.6370967741935484e-06,
"loss": 0.841,
"step": 1048
},
{
"epoch": 4.196,
"grad_norm": 2.235731840133667,
"learning_rate": 1.6290322580645163e-06,
"loss": 0.7612,
"step": 1049
},
{
"epoch": 4.2,
"grad_norm": 2.3301475048065186,
"learning_rate": 1.6209677419354842e-06,
"loss": 0.7748,
"step": 1050
},
{
"epoch": 4.204,
"grad_norm": 2.1939594745635986,
"learning_rate": 1.6129032258064516e-06,
"loss": 0.6976,
"step": 1051
},
{
"epoch": 4.208,
"grad_norm": 2.33828067779541,
"learning_rate": 1.6048387096774195e-06,
"loss": 0.8672,
"step": 1052
},
{
"epoch": 4.212,
"grad_norm": 2.3657424449920654,
"learning_rate": 1.5967741935483871e-06,
"loss": 0.7897,
"step": 1053
},
{
"epoch": 4.216,
"grad_norm": 2.33683705329895,
"learning_rate": 1.588709677419355e-06,
"loss": 0.8152,
"step": 1054
},
{
"epoch": 4.22,
"grad_norm": 2.3173093795776367,
"learning_rate": 1.5806451612903226e-06,
"loss": 0.7319,
"step": 1055
},
{
"epoch": 4.224,
"grad_norm": 2.067448139190674,
"learning_rate": 1.5725806451612905e-06,
"loss": 0.6357,
"step": 1056
},
{
"epoch": 4.228,
"grad_norm": 2.410370349884033,
"learning_rate": 1.5645161290322584e-06,
"loss": 0.6698,
"step": 1057
},
{
"epoch": 4.232,
"grad_norm": 2.0703933238983154,
"learning_rate": 1.5564516129032258e-06,
"loss": 0.5286,
"step": 1058
},
{
"epoch": 4.236,
"grad_norm": 2.5206849575042725,
"learning_rate": 1.5483870967741937e-06,
"loss": 0.9473,
"step": 1059
},
{
"epoch": 4.24,
"grad_norm": 2.193002939224243,
"learning_rate": 1.5403225806451613e-06,
"loss": 0.7604,
"step": 1060
},
{
"epoch": 4.244,
"grad_norm": 2.514152765274048,
"learning_rate": 1.5322580645161292e-06,
"loss": 0.8542,
"step": 1061
},
{
"epoch": 4.248,
"grad_norm": 2.1836607456207275,
"learning_rate": 1.5241935483870969e-06,
"loss": 0.6228,
"step": 1062
},
{
"epoch": 4.252,
"grad_norm": 2.289808511734009,
"learning_rate": 1.5161290322580647e-06,
"loss": 0.8151,
"step": 1063
},
{
"epoch": 4.256,
"grad_norm": 2.335754632949829,
"learning_rate": 1.5080645161290324e-06,
"loss": 0.7623,
"step": 1064
},
{
"epoch": 4.26,
"grad_norm": 2.4347617626190186,
"learning_rate": 1.5e-06,
"loss": 0.834,
"step": 1065
},
{
"epoch": 4.264,
"grad_norm": 2.4381332397460938,
"learning_rate": 1.491935483870968e-06,
"loss": 0.7373,
"step": 1066
},
{
"epoch": 4.268,
"grad_norm": 2.4642021656036377,
"learning_rate": 1.4838709677419356e-06,
"loss": 0.904,
"step": 1067
},
{
"epoch": 4.272,
"grad_norm": 2.3428714275360107,
"learning_rate": 1.4758064516129034e-06,
"loss": 0.67,
"step": 1068
},
{
"epoch": 4.276,
"grad_norm": 2.1284022331237793,
"learning_rate": 1.4677419354838709e-06,
"loss": 0.71,
"step": 1069
},
{
"epoch": 4.28,
"grad_norm": 2.2314751148223877,
"learning_rate": 1.4596774193548387e-06,
"loss": 0.5407,
"step": 1070
},
{
"epoch": 4.284,
"grad_norm": 2.235234022140503,
"learning_rate": 1.4516129032258066e-06,
"loss": 0.6042,
"step": 1071
},
{
"epoch": 4.288,
"grad_norm": 2.1681883335113525,
"learning_rate": 1.4435483870967743e-06,
"loss": 0.5758,
"step": 1072
},
{
"epoch": 4.292,
"grad_norm": 2.2013158798217773,
"learning_rate": 1.4354838709677421e-06,
"loss": 0.6864,
"step": 1073
},
{
"epoch": 4.296,
"grad_norm": 2.1779909133911133,
"learning_rate": 1.4274193548387098e-06,
"loss": 0.8579,
"step": 1074
},
{
"epoch": 4.3,
"grad_norm": 2.1225690841674805,
"learning_rate": 1.4193548387096776e-06,
"loss": 0.5255,
"step": 1075
},
{
"epoch": 4.304,
"grad_norm": 2.294562339782715,
"learning_rate": 1.4112903225806455e-06,
"loss": 0.6786,
"step": 1076
},
{
"epoch": 4.308,
"grad_norm": 2.2220890522003174,
"learning_rate": 1.403225806451613e-06,
"loss": 0.6386,
"step": 1077
},
{
"epoch": 4.312,
"grad_norm": 2.431384801864624,
"learning_rate": 1.3951612903225808e-06,
"loss": 0.6489,
"step": 1078
},
{
"epoch": 4.316,
"grad_norm": 2.2231733798980713,
"learning_rate": 1.3870967741935485e-06,
"loss": 0.6481,
"step": 1079
},
{
"epoch": 4.32,
"grad_norm": 2.2959814071655273,
"learning_rate": 1.3790322580645163e-06,
"loss": 0.8887,
"step": 1080
},
{
"epoch": 4.324,
"grad_norm": 2.6281895637512207,
"learning_rate": 1.3709677419354838e-06,
"loss": 0.8555,
"step": 1081
},
{
"epoch": 4.328,
"grad_norm": 2.2837817668914795,
"learning_rate": 1.3629032258064517e-06,
"loss": 0.7436,
"step": 1082
},
{
"epoch": 4.332,
"grad_norm": 2.278343439102173,
"learning_rate": 1.3548387096774195e-06,
"loss": 0.8137,
"step": 1083
},
{
"epoch": 4.336,
"grad_norm": 2.593653678894043,
"learning_rate": 1.3467741935483872e-06,
"loss": 0.9537,
"step": 1084
},
{
"epoch": 4.34,
"grad_norm": 2.2890312671661377,
"learning_rate": 1.338709677419355e-06,
"loss": 0.7581,
"step": 1085
},
{
"epoch": 4.344,
"grad_norm": 2.2410354614257812,
"learning_rate": 1.3306451612903227e-06,
"loss": 0.6705,
"step": 1086
},
{
"epoch": 4.348,
"grad_norm": 2.34249210357666,
"learning_rate": 1.3225806451612906e-06,
"loss": 0.6918,
"step": 1087
},
{
"epoch": 4.352,
"grad_norm": 2.224848508834839,
"learning_rate": 1.314516129032258e-06,
"loss": 0.661,
"step": 1088
},
{
"epoch": 4.356,
"grad_norm": 2.231247901916504,
"learning_rate": 1.3064516129032259e-06,
"loss": 0.6758,
"step": 1089
},
{
"epoch": 4.36,
"grad_norm": 2.2124154567718506,
"learning_rate": 1.2983870967741937e-06,
"loss": 0.691,
"step": 1090
},
{
"epoch": 4.364,
"grad_norm": 2.3312814235687256,
"learning_rate": 1.2903225806451614e-06,
"loss": 0.8702,
"step": 1091
},
{
"epoch": 4.368,
"grad_norm": 2.3130455017089844,
"learning_rate": 1.2822580645161293e-06,
"loss": 0.7612,
"step": 1092
},
{
"epoch": 4.372,
"grad_norm": 2.3148789405822754,
"learning_rate": 1.2741935483870967e-06,
"loss": 0.8986,
"step": 1093
},
{
"epoch": 4.376,
"grad_norm": 2.438842296600342,
"learning_rate": 1.2661290322580646e-06,
"loss": 1.0276,
"step": 1094
},
{
"epoch": 4.38,
"grad_norm": 2.1867611408233643,
"learning_rate": 1.2580645161290322e-06,
"loss": 0.6475,
"step": 1095
},
{
"epoch": 4.384,
"grad_norm": 2.132303476333618,
"learning_rate": 1.25e-06,
"loss": 0.7559,
"step": 1096
},
{
"epoch": 4.388,
"grad_norm": 2.495635509490967,
"learning_rate": 1.2419354838709678e-06,
"loss": 0.983,
"step": 1097
},
{
"epoch": 4.392,
"grad_norm": 2.5002808570861816,
"learning_rate": 1.2338709677419356e-06,
"loss": 0.933,
"step": 1098
},
{
"epoch": 4.396,
"grad_norm": 2.1121408939361572,
"learning_rate": 1.2258064516129033e-06,
"loss": 0.6312,
"step": 1099
},
{
"epoch": 4.4,
"grad_norm": 2.2128350734710693,
"learning_rate": 1.2177419354838711e-06,
"loss": 0.657,
"step": 1100
},
{
"epoch": 4.404,
"grad_norm": 2.482390880584717,
"learning_rate": 1.2096774193548388e-06,
"loss": 0.8101,
"step": 1101
},
{
"epoch": 4.408,
"grad_norm": 2.305540084838867,
"learning_rate": 1.2016129032258067e-06,
"loss": 0.7443,
"step": 1102
},
{
"epoch": 4.412,
"grad_norm": 2.619077682495117,
"learning_rate": 1.1935483870967743e-06,
"loss": 1.055,
"step": 1103
},
{
"epoch": 4.416,
"grad_norm": 2.0542643070220947,
"learning_rate": 1.185483870967742e-06,
"loss": 0.5384,
"step": 1104
},
{
"epoch": 4.42,
"grad_norm": 2.1693155765533447,
"learning_rate": 1.1774193548387096e-06,
"loss": 0.6142,
"step": 1105
},
{
"epoch": 4.424,
"grad_norm": 2.2738637924194336,
"learning_rate": 1.1693548387096775e-06,
"loss": 0.7541,
"step": 1106
},
{
"epoch": 4.428,
"grad_norm": 2.372004270553589,
"learning_rate": 1.1612903225806454e-06,
"loss": 0.815,
"step": 1107
},
{
"epoch": 4.432,
"grad_norm": 2.3350911140441895,
"learning_rate": 1.153225806451613e-06,
"loss": 0.9225,
"step": 1108
},
{
"epoch": 4.436,
"grad_norm": 2.01047682762146,
"learning_rate": 1.1451612903225807e-06,
"loss": 0.4809,
"step": 1109
},
{
"epoch": 4.44,
"grad_norm": 2.1741702556610107,
"learning_rate": 1.1370967741935485e-06,
"loss": 0.6214,
"step": 1110
},
{
"epoch": 4.444,
"grad_norm": 2.320693254470825,
"learning_rate": 1.1290322580645162e-06,
"loss": 0.8867,
"step": 1111
},
{
"epoch": 4.448,
"grad_norm": 2.3759713172912598,
"learning_rate": 1.1209677419354839e-06,
"loss": 0.7674,
"step": 1112
},
{
"epoch": 4.452,
"grad_norm": 2.2036936283111572,
"learning_rate": 1.1129032258064517e-06,
"loss": 0.6323,
"step": 1113
},
{
"epoch": 4.456,
"grad_norm": 2.4528908729553223,
"learning_rate": 1.1048387096774196e-06,
"loss": 0.8555,
"step": 1114
},
{
"epoch": 4.46,
"grad_norm": 2.1635193824768066,
"learning_rate": 1.0967741935483872e-06,
"loss": 0.7599,
"step": 1115
},
{
"epoch": 4.464,
"grad_norm": 2.2776708602905273,
"learning_rate": 1.088709677419355e-06,
"loss": 0.7247,
"step": 1116
},
{
"epoch": 4.468,
"grad_norm": 2.4202232360839844,
"learning_rate": 1.0806451612903226e-06,
"loss": 0.7726,
"step": 1117
},
{
"epoch": 4.4719999999999995,
"grad_norm": 2.226058006286621,
"learning_rate": 1.0725806451612904e-06,
"loss": 0.7561,
"step": 1118
},
{
"epoch": 4.476,
"grad_norm": 2.371166706085205,
"learning_rate": 1.064516129032258e-06,
"loss": 0.8106,
"step": 1119
},
{
"epoch": 4.48,
"grad_norm": 2.545086145401001,
"learning_rate": 1.056451612903226e-06,
"loss": 0.6927,
"step": 1120
},
{
"epoch": 4.484,
"grad_norm": 2.384808301925659,
"learning_rate": 1.0483870967741936e-06,
"loss": 0.8472,
"step": 1121
},
{
"epoch": 4.4879999999999995,
"grad_norm": 2.267472267150879,
"learning_rate": 1.0403225806451615e-06,
"loss": 0.6731,
"step": 1122
},
{
"epoch": 4.492,
"grad_norm": 2.5474860668182373,
"learning_rate": 1.0322580645161291e-06,
"loss": 1.0592,
"step": 1123
},
{
"epoch": 4.496,
"grad_norm": 2.423821210861206,
"learning_rate": 1.0241935483870968e-06,
"loss": 0.7909,
"step": 1124
},
{
"epoch": 4.5,
"grad_norm": 2.09659743309021,
"learning_rate": 1.0161290322580646e-06,
"loss": 0.6294,
"step": 1125
},
{
"epoch": 4.504,
"grad_norm": 2.1957030296325684,
"learning_rate": 1.0080645161290323e-06,
"loss": 0.5339,
"step": 1126
},
{
"epoch": 4.508,
"grad_norm": 2.3419601917266846,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.815,
"step": 1127
},
{
"epoch": 4.5120000000000005,
"grad_norm": 2.3392527103424072,
"learning_rate": 9.919354838709678e-07,
"loss": 0.7399,
"step": 1128
},
{
"epoch": 4.516,
"grad_norm": 2.591635227203369,
"learning_rate": 9.838709677419355e-07,
"loss": 0.8111,
"step": 1129
},
{
"epoch": 4.52,
"grad_norm": 2.3863515853881836,
"learning_rate": 9.758064516129033e-07,
"loss": 0.7086,
"step": 1130
},
{
"epoch": 4.524,
"grad_norm": 2.4174227714538574,
"learning_rate": 9.67741935483871e-07,
"loss": 0.7625,
"step": 1131
},
{
"epoch": 4.5280000000000005,
"grad_norm": 2.3676483631134033,
"learning_rate": 9.596774193548387e-07,
"loss": 0.7432,
"step": 1132
},
{
"epoch": 4.532,
"grad_norm": 2.2280187606811523,
"learning_rate": 9.516129032258064e-07,
"loss": 0.6567,
"step": 1133
},
{
"epoch": 4.536,
"grad_norm": 2.323503255844116,
"learning_rate": 9.435483870967743e-07,
"loss": 0.7162,
"step": 1134
},
{
"epoch": 4.54,
"grad_norm": 2.330273151397705,
"learning_rate": 9.35483870967742e-07,
"loss": 0.7492,
"step": 1135
},
{
"epoch": 4.5440000000000005,
"grad_norm": 2.3072502613067627,
"learning_rate": 9.274193548387098e-07,
"loss": 0.7717,
"step": 1136
},
{
"epoch": 4.548,
"grad_norm": 2.1893625259399414,
"learning_rate": 9.193548387096775e-07,
"loss": 0.7577,
"step": 1137
},
{
"epoch": 4.552,
"grad_norm": 2.299312114715576,
"learning_rate": 9.112903225806452e-07,
"loss": 0.6489,
"step": 1138
},
{
"epoch": 4.556,
"grad_norm": 2.2672600746154785,
"learning_rate": 9.032258064516129e-07,
"loss": 0.7582,
"step": 1139
},
{
"epoch": 4.5600000000000005,
"grad_norm": 2.35774302482605,
"learning_rate": 8.951612903225807e-07,
"loss": 0.6944,
"step": 1140
},
{
"epoch": 4.564,
"grad_norm": 2.4566967487335205,
"learning_rate": 8.870967741935485e-07,
"loss": 0.7874,
"step": 1141
},
{
"epoch": 4.568,
"grad_norm": 2.346771478652954,
"learning_rate": 8.790322580645163e-07,
"loss": 0.7592,
"step": 1142
},
{
"epoch": 4.572,
"grad_norm": 2.4383773803710938,
"learning_rate": 8.709677419354839e-07,
"loss": 0.7337,
"step": 1143
},
{
"epoch": 4.576,
"grad_norm": 2.19571852684021,
"learning_rate": 8.629032258064517e-07,
"loss": 0.6448,
"step": 1144
},
{
"epoch": 4.58,
"grad_norm": 2.2884631156921387,
"learning_rate": 8.548387096774193e-07,
"loss": 0.5722,
"step": 1145
},
{
"epoch": 4.584,
"grad_norm": 2.3079285621643066,
"learning_rate": 8.467741935483871e-07,
"loss": 0.7114,
"step": 1146
},
{
"epoch": 4.588,
"grad_norm": 2.3879995346069336,
"learning_rate": 8.38709677419355e-07,
"loss": 0.6822,
"step": 1147
},
{
"epoch": 4.592,
"grad_norm": 2.3414344787597656,
"learning_rate": 8.306451612903227e-07,
"loss": 0.8221,
"step": 1148
},
{
"epoch": 4.596,
"grad_norm": 2.401705026626587,
"learning_rate": 8.225806451612904e-07,
"loss": 0.8003,
"step": 1149
},
{
"epoch": 4.6,
"grad_norm": 2.3082618713378906,
"learning_rate": 8.145161290322581e-07,
"loss": 0.7632,
"step": 1150
},
{
"epoch": 4.604,
"grad_norm": 2.3123667240142822,
"learning_rate": 8.064516129032258e-07,
"loss": 0.6528,
"step": 1151
},
{
"epoch": 4.608,
"grad_norm": 2.3781580924987793,
"learning_rate": 7.983870967741936e-07,
"loss": 0.788,
"step": 1152
},
{
"epoch": 4.612,
"grad_norm": 2.3994123935699463,
"learning_rate": 7.903225806451613e-07,
"loss": 0.8394,
"step": 1153
},
{
"epoch": 4.616,
"grad_norm": 2.3815858364105225,
"learning_rate": 7.822580645161292e-07,
"loss": 1.0368,
"step": 1154
},
{
"epoch": 4.62,
"grad_norm": 1.8045047521591187,
"learning_rate": 7.741935483870968e-07,
"loss": 0.4503,
"step": 1155
},
{
"epoch": 4.624,
"grad_norm": 2.581266403198242,
"learning_rate": 7.661290322580646e-07,
"loss": 0.7589,
"step": 1156
},
{
"epoch": 4.628,
"grad_norm": 2.2751896381378174,
"learning_rate": 7.580645161290324e-07,
"loss": 0.6624,
"step": 1157
},
{
"epoch": 4.632,
"grad_norm": 2.2207417488098145,
"learning_rate": 7.5e-07,
"loss": 0.7476,
"step": 1158
},
{
"epoch": 4.636,
"grad_norm": 2.173737049102783,
"learning_rate": 7.419354838709678e-07,
"loss": 0.5195,
"step": 1159
},
{
"epoch": 4.64,
"grad_norm": 2.327514886856079,
"learning_rate": 7.338709677419354e-07,
"loss": 0.7557,
"step": 1160
},
{
"epoch": 4.644,
"grad_norm": 2.36411190032959,
"learning_rate": 7.258064516129033e-07,
"loss": 0.8697,
"step": 1161
},
{
"epoch": 4.648,
"grad_norm": 2.3163628578186035,
"learning_rate": 7.177419354838711e-07,
"loss": 0.742,
"step": 1162
},
{
"epoch": 4.652,
"grad_norm": 2.373682975769043,
"learning_rate": 7.096774193548388e-07,
"loss": 0.7396,
"step": 1163
},
{
"epoch": 4.656,
"grad_norm": 2.7890610694885254,
"learning_rate": 7.016129032258065e-07,
"loss": 0.9487,
"step": 1164
},
{
"epoch": 4.66,
"grad_norm": 1.9923303127288818,
"learning_rate": 6.935483870967742e-07,
"loss": 0.5078,
"step": 1165
},
{
"epoch": 4.664,
"grad_norm": 2.2962071895599365,
"learning_rate": 6.854838709677419e-07,
"loss": 0.6853,
"step": 1166
},
{
"epoch": 4.668,
"grad_norm": 2.216494083404541,
"learning_rate": 6.774193548387098e-07,
"loss": 0.5454,
"step": 1167
},
{
"epoch": 4.672,
"grad_norm": 2.3010871410369873,
"learning_rate": 6.693548387096775e-07,
"loss": 0.6566,
"step": 1168
},
{
"epoch": 4.676,
"grad_norm": 2.5437018871307373,
"learning_rate": 6.612903225806453e-07,
"loss": 1.0156,
"step": 1169
},
{
"epoch": 4.68,
"grad_norm": 2.4474453926086426,
"learning_rate": 6.532258064516129e-07,
"loss": 0.7806,
"step": 1170
},
{
"epoch": 4.684,
"grad_norm": 2.4709243774414062,
"learning_rate": 6.451612903225807e-07,
"loss": 0.7458,
"step": 1171
},
{
"epoch": 4.688,
"grad_norm": 2.0869922637939453,
"learning_rate": 6.370967741935484e-07,
"loss": 0.6425,
"step": 1172
},
{
"epoch": 4.692,
"grad_norm": 2.2414233684539795,
"learning_rate": 6.290322580645161e-07,
"loss": 0.7052,
"step": 1173
},
{
"epoch": 4.696,
"grad_norm": 2.1230409145355225,
"learning_rate": 6.209677419354839e-07,
"loss": 0.5901,
"step": 1174
},
{
"epoch": 4.7,
"grad_norm": 2.0766208171844482,
"learning_rate": 6.129032258064516e-07,
"loss": 0.627,
"step": 1175
},
{
"epoch": 4.704,
"grad_norm": 2.3263955116271973,
"learning_rate": 6.048387096774194e-07,
"loss": 0.6084,
"step": 1176
},
{
"epoch": 4.708,
"grad_norm": 2.28118634223938,
"learning_rate": 5.967741935483872e-07,
"loss": 0.7532,
"step": 1177
},
{
"epoch": 4.712,
"grad_norm": 2.4390621185302734,
"learning_rate": 5.887096774193548e-07,
"loss": 0.8747,
"step": 1178
},
{
"epoch": 4.716,
"grad_norm": 2.4335639476776123,
"learning_rate": 5.806451612903227e-07,
"loss": 0.7191,
"step": 1179
},
{
"epoch": 4.72,
"grad_norm": 2.284865617752075,
"learning_rate": 5.725806451612903e-07,
"loss": 0.6934,
"step": 1180
},
{
"epoch": 4.724,
"grad_norm": 2.382296085357666,
"learning_rate": 5.645161290322581e-07,
"loss": 0.8553,
"step": 1181
},
{
"epoch": 4.728,
"grad_norm": 2.481611967086792,
"learning_rate": 5.564516129032259e-07,
"loss": 0.7187,
"step": 1182
},
{
"epoch": 4.732,
"grad_norm": 2.524700880050659,
"learning_rate": 5.483870967741936e-07,
"loss": 0.8057,
"step": 1183
},
{
"epoch": 4.736,
"grad_norm": 2.0938427448272705,
"learning_rate": 5.403225806451613e-07,
"loss": 0.5949,
"step": 1184
},
{
"epoch": 4.74,
"grad_norm": 2.2317137718200684,
"learning_rate": 5.32258064516129e-07,
"loss": 0.5911,
"step": 1185
},
{
"epoch": 4.744,
"grad_norm": 2.4001145362854004,
"learning_rate": 5.241935483870968e-07,
"loss": 0.6954,
"step": 1186
},
{
"epoch": 4.748,
"grad_norm": 2.1287758350372314,
"learning_rate": 5.161290322580646e-07,
"loss": 0.6147,
"step": 1187
},
{
"epoch": 4.752,
"grad_norm": 2.6044185161590576,
"learning_rate": 5.080645161290323e-07,
"loss": 0.8149,
"step": 1188
},
{
"epoch": 4.756,
"grad_norm": 2.4292356967926025,
"learning_rate": 5.000000000000001e-07,
"loss": 0.9372,
"step": 1189
},
{
"epoch": 4.76,
"grad_norm": 2.4027163982391357,
"learning_rate": 4.919354838709677e-07,
"loss": 0.8758,
"step": 1190
},
{
"epoch": 4.764,
"grad_norm": 2.3395657539367676,
"learning_rate": 4.838709677419355e-07,
"loss": 0.8588,
"step": 1191
},
{
"epoch": 4.768,
"grad_norm": 2.1531972885131836,
"learning_rate": 4.758064516129032e-07,
"loss": 0.6956,
"step": 1192
},
{
"epoch": 4.772,
"grad_norm": 2.2789173126220703,
"learning_rate": 4.67741935483871e-07,
"loss": 0.594,
"step": 1193
},
{
"epoch": 4.776,
"grad_norm": 2.3256616592407227,
"learning_rate": 4.5967741935483873e-07,
"loss": 0.6754,
"step": 1194
},
{
"epoch": 4.78,
"grad_norm": 2.6827950477600098,
"learning_rate": 4.5161290322580644e-07,
"loss": 1.123,
"step": 1195
},
{
"epoch": 4.784,
"grad_norm": 2.393773078918457,
"learning_rate": 4.4354838709677425e-07,
"loss": 0.8604,
"step": 1196
},
{
"epoch": 4.788,
"grad_norm": 2.275068521499634,
"learning_rate": 4.3548387096774196e-07,
"loss": 0.6247,
"step": 1197
},
{
"epoch": 4.792,
"grad_norm": 2.2951745986938477,
"learning_rate": 4.2741935483870967e-07,
"loss": 0.7261,
"step": 1198
},
{
"epoch": 4.796,
"grad_norm": 2.144277811050415,
"learning_rate": 4.193548387096775e-07,
"loss": 0.7152,
"step": 1199
},
{
"epoch": 4.8,
"grad_norm": 2.2701258659362793,
"learning_rate": 4.112903225806452e-07,
"loss": 0.7496,
"step": 1200
}
],
"logging_steps": 1,
"max_steps": 1250,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.056150075029094e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}