Files
llama3-1B-sft/trainer_state.json
ModelHub XC 8462c4a571 初始化项目,由ModelHub XC社区提供模型
Model: boradorish/llama3-1B-sft
Source: Original Platform
2026-06-10 15:40:17 +08:00

10943 lines
265 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1557,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019277108433734939,
"grad_norm": 2.8518834114074707,
"learning_rate": 0.0,
"loss": 0.0891,
"step": 1
},
{
"epoch": 0.0038554216867469878,
"grad_norm": 1.8441249132156372,
"learning_rate": 2.564102564102564e-07,
"loss": 0.0539,
"step": 2
},
{
"epoch": 0.005783132530120482,
"grad_norm": 2.8263237476348877,
"learning_rate": 5.128205128205128e-07,
"loss": 0.099,
"step": 3
},
{
"epoch": 0.0077108433734939755,
"grad_norm": 2.5051236152648926,
"learning_rate": 7.692307692307694e-07,
"loss": 0.0789,
"step": 4
},
{
"epoch": 0.00963855421686747,
"grad_norm": 2.6903438568115234,
"learning_rate": 1.0256410256410257e-06,
"loss": 0.0881,
"step": 5
},
{
"epoch": 0.011566265060240964,
"grad_norm": 2.6205761432647705,
"learning_rate": 1.282051282051282e-06,
"loss": 0.0776,
"step": 6
},
{
"epoch": 0.013493975903614458,
"grad_norm": 2.6309337615966797,
"learning_rate": 1.5384615384615387e-06,
"loss": 0.0827,
"step": 7
},
{
"epoch": 0.015421686746987951,
"grad_norm": 1.5427855253219604,
"learning_rate": 1.794871794871795e-06,
"loss": 0.0577,
"step": 8
},
{
"epoch": 0.017349397590361446,
"grad_norm": 1.0973446369171143,
"learning_rate": 2.0512820512820513e-06,
"loss": 0.04,
"step": 9
},
{
"epoch": 0.01927710843373494,
"grad_norm": 1.3253350257873535,
"learning_rate": 2.307692307692308e-06,
"loss": 0.0506,
"step": 10
},
{
"epoch": 0.021204819277108433,
"grad_norm": 1.588739037513733,
"learning_rate": 2.564102564102564e-06,
"loss": 0.0874,
"step": 11
},
{
"epoch": 0.02313253012048193,
"grad_norm": 1.4987014532089233,
"learning_rate": 2.8205128205128207e-06,
"loss": 0.0597,
"step": 12
},
{
"epoch": 0.02506024096385542,
"grad_norm": 1.6571592092514038,
"learning_rate": 3.0769230769230774e-06,
"loss": 0.0559,
"step": 13
},
{
"epoch": 0.026987951807228915,
"grad_norm": 1.8860628604888916,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0688,
"step": 14
},
{
"epoch": 0.02891566265060241,
"grad_norm": 1.3202295303344727,
"learning_rate": 3.58974358974359e-06,
"loss": 0.0433,
"step": 15
},
{
"epoch": 0.030843373493975902,
"grad_norm": 1.5870612859725952,
"learning_rate": 3.846153846153847e-06,
"loss": 0.0695,
"step": 16
},
{
"epoch": 0.0327710843373494,
"grad_norm": 0.9192284345626831,
"learning_rate": 4.102564102564103e-06,
"loss": 0.0392,
"step": 17
},
{
"epoch": 0.03469879518072289,
"grad_norm": 0.7950155735015869,
"learning_rate": 4.358974358974359e-06,
"loss": 0.0351,
"step": 18
},
{
"epoch": 0.03662650602409639,
"grad_norm": 0.8854314684867859,
"learning_rate": 4.615384615384616e-06,
"loss": 0.0356,
"step": 19
},
{
"epoch": 0.03855421686746988,
"grad_norm": 0.9546788930892944,
"learning_rate": 4.871794871794872e-06,
"loss": 0.0427,
"step": 20
},
{
"epoch": 0.04048192771084337,
"grad_norm": 0.6315903663635254,
"learning_rate": 5.128205128205128e-06,
"loss": 0.0397,
"step": 21
},
{
"epoch": 0.042409638554216866,
"grad_norm": 0.9230924844741821,
"learning_rate": 5.384615384615385e-06,
"loss": 0.0481,
"step": 22
},
{
"epoch": 0.04433734939759036,
"grad_norm": 0.711546003818512,
"learning_rate": 5.641025641025641e-06,
"loss": 0.0479,
"step": 23
},
{
"epoch": 0.04626506024096386,
"grad_norm": 0.5288046598434448,
"learning_rate": 5.897435897435898e-06,
"loss": 0.0182,
"step": 24
},
{
"epoch": 0.04819277108433735,
"grad_norm": 0.9420496225357056,
"learning_rate": 6.153846153846155e-06,
"loss": 0.0389,
"step": 25
},
{
"epoch": 0.05012048192771084,
"grad_norm": 0.5001983046531677,
"learning_rate": 6.410256410256412e-06,
"loss": 0.0268,
"step": 26
},
{
"epoch": 0.052048192771084335,
"grad_norm": 0.8084653615951538,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0367,
"step": 27
},
{
"epoch": 0.05397590361445783,
"grad_norm": 0.7195103764533997,
"learning_rate": 6.923076923076923e-06,
"loss": 0.0251,
"step": 28
},
{
"epoch": 0.055903614457831326,
"grad_norm": 0.529958963394165,
"learning_rate": 7.17948717948718e-06,
"loss": 0.0289,
"step": 29
},
{
"epoch": 0.05783132530120482,
"grad_norm": 0.795376181602478,
"learning_rate": 7.435897435897437e-06,
"loss": 0.043,
"step": 30
},
{
"epoch": 0.059759036144578316,
"grad_norm": 0.6366249918937683,
"learning_rate": 7.692307692307694e-06,
"loss": 0.029,
"step": 31
},
{
"epoch": 0.061686746987951804,
"grad_norm": 0.5414115190505981,
"learning_rate": 7.948717948717949e-06,
"loss": 0.0365,
"step": 32
},
{
"epoch": 0.0636144578313253,
"grad_norm": 0.9350972175598145,
"learning_rate": 8.205128205128205e-06,
"loss": 0.0283,
"step": 33
},
{
"epoch": 0.0655421686746988,
"grad_norm": 0.5660741925239563,
"learning_rate": 8.461538461538462e-06,
"loss": 0.0234,
"step": 34
},
{
"epoch": 0.06746987951807229,
"grad_norm": 0.5623988509178162,
"learning_rate": 8.717948717948719e-06,
"loss": 0.0307,
"step": 35
},
{
"epoch": 0.06939759036144579,
"grad_norm": 0.5260195732116699,
"learning_rate": 8.974358974358976e-06,
"loss": 0.0264,
"step": 36
},
{
"epoch": 0.07132530120481928,
"grad_norm": 0.4934785068035126,
"learning_rate": 9.230769230769232e-06,
"loss": 0.0224,
"step": 37
},
{
"epoch": 0.07325301204819278,
"grad_norm": 0.4797322154045105,
"learning_rate": 9.487179487179487e-06,
"loss": 0.0163,
"step": 38
},
{
"epoch": 0.07518072289156627,
"grad_norm": 0.4739217460155487,
"learning_rate": 9.743589743589744e-06,
"loss": 0.0165,
"step": 39
},
{
"epoch": 0.07710843373493977,
"grad_norm": 0.4527677595615387,
"learning_rate": 1e-05,
"loss": 0.0163,
"step": 40
},
{
"epoch": 0.07903614457831325,
"grad_norm": 0.6241316795349121,
"learning_rate": 1.0256410256410256e-05,
"loss": 0.0302,
"step": 41
},
{
"epoch": 0.08096385542168674,
"grad_norm": 0.639043927192688,
"learning_rate": 1.0512820512820514e-05,
"loss": 0.0312,
"step": 42
},
{
"epoch": 0.08289156626506024,
"grad_norm": 0.5121409296989441,
"learning_rate": 1.076923076923077e-05,
"loss": 0.0256,
"step": 43
},
{
"epoch": 0.08481927710843373,
"grad_norm": 0.6340477466583252,
"learning_rate": 1.1025641025641028e-05,
"loss": 0.04,
"step": 44
},
{
"epoch": 0.08674698795180723,
"grad_norm": 0.5260409712791443,
"learning_rate": 1.1282051282051283e-05,
"loss": 0.0282,
"step": 45
},
{
"epoch": 0.08867469879518072,
"grad_norm": 0.6390711069107056,
"learning_rate": 1.1538461538461538e-05,
"loss": 0.0243,
"step": 46
},
{
"epoch": 0.09060240963855422,
"grad_norm": 0.46469295024871826,
"learning_rate": 1.1794871794871796e-05,
"loss": 0.0208,
"step": 47
},
{
"epoch": 0.09253012048192771,
"grad_norm": 0.8711516857147217,
"learning_rate": 1.2051282051282051e-05,
"loss": 0.0291,
"step": 48
},
{
"epoch": 0.09445783132530121,
"grad_norm": 0.9164300560951233,
"learning_rate": 1.230769230769231e-05,
"loss": 0.0342,
"step": 49
},
{
"epoch": 0.0963855421686747,
"grad_norm": 0.5401139259338379,
"learning_rate": 1.2564102564102565e-05,
"loss": 0.0185,
"step": 50
},
{
"epoch": 0.0983132530120482,
"grad_norm": 0.44393008947372437,
"learning_rate": 1.2820512820512823e-05,
"loss": 0.0228,
"step": 51
},
{
"epoch": 0.10024096385542168,
"grad_norm": 0.3855767846107483,
"learning_rate": 1.3076923076923078e-05,
"loss": 0.0176,
"step": 52
},
{
"epoch": 0.10216867469879518,
"grad_norm": 0.8561235070228577,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.0433,
"step": 53
},
{
"epoch": 0.10409638554216867,
"grad_norm": 0.768002450466156,
"learning_rate": 1.3589743589743592e-05,
"loss": 0.0245,
"step": 54
},
{
"epoch": 0.10602409638554217,
"grad_norm": 0.4559759497642517,
"learning_rate": 1.3846153846153847e-05,
"loss": 0.0224,
"step": 55
},
{
"epoch": 0.10795180722891566,
"grad_norm": 0.6203847527503967,
"learning_rate": 1.4102564102564105e-05,
"loss": 0.0296,
"step": 56
},
{
"epoch": 0.10987951807228916,
"grad_norm": 0.6651368141174316,
"learning_rate": 1.435897435897436e-05,
"loss": 0.0336,
"step": 57
},
{
"epoch": 0.11180722891566265,
"grad_norm": 0.377734512090683,
"learning_rate": 1.4615384615384615e-05,
"loss": 0.0196,
"step": 58
},
{
"epoch": 0.11373493975903615,
"grad_norm": 0.687568724155426,
"learning_rate": 1.4871794871794874e-05,
"loss": 0.0207,
"step": 59
},
{
"epoch": 0.11566265060240964,
"grad_norm": 0.7905604243278503,
"learning_rate": 1.5128205128205129e-05,
"loss": 0.047,
"step": 60
},
{
"epoch": 0.11759036144578314,
"grad_norm": 0.7938196063041687,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.0198,
"step": 61
},
{
"epoch": 0.11951807228915663,
"grad_norm": 0.41340553760528564,
"learning_rate": 1.5641025641025644e-05,
"loss": 0.0161,
"step": 62
},
{
"epoch": 0.12144578313253013,
"grad_norm": 0.5668172240257263,
"learning_rate": 1.5897435897435897e-05,
"loss": 0.0275,
"step": 63
},
{
"epoch": 0.12337349397590361,
"grad_norm": 0.48333367705345154,
"learning_rate": 1.6153846153846154e-05,
"loss": 0.0137,
"step": 64
},
{
"epoch": 0.12530120481927712,
"grad_norm": 0.6843933463096619,
"learning_rate": 1.641025641025641e-05,
"loss": 0.0294,
"step": 65
},
{
"epoch": 0.1272289156626506,
"grad_norm": 0.7789272665977478,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.0401,
"step": 66
},
{
"epoch": 0.1291566265060241,
"grad_norm": 0.6203492879867554,
"learning_rate": 1.6923076923076924e-05,
"loss": 0.0292,
"step": 67
},
{
"epoch": 0.1310843373493976,
"grad_norm": 0.5940662622451782,
"learning_rate": 1.717948717948718e-05,
"loss": 0.0178,
"step": 68
},
{
"epoch": 0.13301204819277107,
"grad_norm": 0.35504868626594543,
"learning_rate": 1.7435897435897438e-05,
"loss": 0.0129,
"step": 69
},
{
"epoch": 0.13493975903614458,
"grad_norm": 0.8796699643135071,
"learning_rate": 1.7692307692307694e-05,
"loss": 0.034,
"step": 70
},
{
"epoch": 0.13686746987951806,
"grad_norm": 0.967444896697998,
"learning_rate": 1.794871794871795e-05,
"loss": 0.0266,
"step": 71
},
{
"epoch": 0.13879518072289157,
"grad_norm": 0.4428526759147644,
"learning_rate": 1.8205128205128208e-05,
"loss": 0.0223,
"step": 72
},
{
"epoch": 0.14072289156626505,
"grad_norm": 0.42897751927375793,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.0187,
"step": 73
},
{
"epoch": 0.14265060240963856,
"grad_norm": 0.5100914835929871,
"learning_rate": 1.8717948717948718e-05,
"loss": 0.0164,
"step": 74
},
{
"epoch": 0.14457831325301204,
"grad_norm": 0.6028861999511719,
"learning_rate": 1.8974358974358975e-05,
"loss": 0.0164,
"step": 75
},
{
"epoch": 0.14650602409638555,
"grad_norm": 0.6187024116516113,
"learning_rate": 1.923076923076923e-05,
"loss": 0.0296,
"step": 76
},
{
"epoch": 0.14843373493975903,
"grad_norm": 0.4822489619255066,
"learning_rate": 1.9487179487179488e-05,
"loss": 0.0148,
"step": 77
},
{
"epoch": 0.15036144578313254,
"grad_norm": 0.7231149673461914,
"learning_rate": 1.9743589743589745e-05,
"loss": 0.0395,
"step": 78
},
{
"epoch": 0.15228915662650602,
"grad_norm": 0.8409642577171326,
"learning_rate": 2e-05,
"loss": 0.0446,
"step": 79
},
{
"epoch": 0.15421686746987953,
"grad_norm": 0.4883500039577484,
"learning_rate": 2.025641025641026e-05,
"loss": 0.0206,
"step": 80
},
{
"epoch": 0.156144578313253,
"grad_norm": 0.6287479400634766,
"learning_rate": 2.0512820512820512e-05,
"loss": 0.0333,
"step": 81
},
{
"epoch": 0.1580722891566265,
"grad_norm": 0.5041632652282715,
"learning_rate": 2.0769230769230772e-05,
"loss": 0.0414,
"step": 82
},
{
"epoch": 0.16,
"grad_norm": 0.5103405117988586,
"learning_rate": 2.102564102564103e-05,
"loss": 0.045,
"step": 83
},
{
"epoch": 0.16192771084337348,
"grad_norm": 0.493161678314209,
"learning_rate": 2.1282051282051285e-05,
"loss": 0.021,
"step": 84
},
{
"epoch": 0.163855421686747,
"grad_norm": 0.908843994140625,
"learning_rate": 2.153846153846154e-05,
"loss": 0.0389,
"step": 85
},
{
"epoch": 0.16578313253012048,
"grad_norm": 0.5067003965377808,
"learning_rate": 2.1794871794871795e-05,
"loss": 0.0272,
"step": 86
},
{
"epoch": 0.16771084337349398,
"grad_norm": 0.5791381597518921,
"learning_rate": 2.2051282051282056e-05,
"loss": 0.0368,
"step": 87
},
{
"epoch": 0.16963855421686747,
"grad_norm": 0.7056036591529846,
"learning_rate": 2.230769230769231e-05,
"loss": 0.0284,
"step": 88
},
{
"epoch": 0.17156626506024097,
"grad_norm": 0.6563822031021118,
"learning_rate": 2.2564102564102566e-05,
"loss": 0.0646,
"step": 89
},
{
"epoch": 0.17349397590361446,
"grad_norm": 0.9483286142349243,
"learning_rate": 2.2820512820512822e-05,
"loss": 0.0439,
"step": 90
},
{
"epoch": 0.17542168674698796,
"grad_norm": 0.370664119720459,
"learning_rate": 2.3076923076923076e-05,
"loss": 0.0109,
"step": 91
},
{
"epoch": 0.17734939759036145,
"grad_norm": 0.9776477813720703,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.0458,
"step": 92
},
{
"epoch": 0.17927710843373493,
"grad_norm": 0.45710092782974243,
"learning_rate": 2.3589743589743593e-05,
"loss": 0.0212,
"step": 93
},
{
"epoch": 0.18120481927710844,
"grad_norm": 0.8623896837234497,
"learning_rate": 2.384615384615385e-05,
"loss": 0.0215,
"step": 94
},
{
"epoch": 0.18313253012048192,
"grad_norm": 0.55814528465271,
"learning_rate": 2.4102564102564103e-05,
"loss": 0.0218,
"step": 95
},
{
"epoch": 0.18506024096385543,
"grad_norm": 0.49882641434669495,
"learning_rate": 2.435897435897436e-05,
"loss": 0.0268,
"step": 96
},
{
"epoch": 0.1869879518072289,
"grad_norm": 0.3508654534816742,
"learning_rate": 2.461538461538462e-05,
"loss": 0.0172,
"step": 97
},
{
"epoch": 0.18891566265060242,
"grad_norm": 0.601170003414154,
"learning_rate": 2.4871794871794873e-05,
"loss": 0.0208,
"step": 98
},
{
"epoch": 0.1908433734939759,
"grad_norm": 1.1748133897781372,
"learning_rate": 2.512820512820513e-05,
"loss": 0.0259,
"step": 99
},
{
"epoch": 0.1927710843373494,
"grad_norm": 0.46370384097099304,
"learning_rate": 2.5384615384615386e-05,
"loss": 0.0242,
"step": 100
},
{
"epoch": 0.1946987951807229,
"grad_norm": 0.525010883808136,
"learning_rate": 2.5641025641025646e-05,
"loss": 0.0188,
"step": 101
},
{
"epoch": 0.1966265060240964,
"grad_norm": 0.766501784324646,
"learning_rate": 2.58974358974359e-05,
"loss": 0.0584,
"step": 102
},
{
"epoch": 0.19855421686746988,
"grad_norm": 0.3572964370250702,
"learning_rate": 2.6153846153846157e-05,
"loss": 0.0131,
"step": 103
},
{
"epoch": 0.20048192771084336,
"grad_norm": 0.6467130780220032,
"learning_rate": 2.6410256410256413e-05,
"loss": 0.0231,
"step": 104
},
{
"epoch": 0.20240963855421687,
"grad_norm": 1.1852102279663086,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.027,
"step": 105
},
{
"epoch": 0.20433734939759035,
"grad_norm": 2.3659932613372803,
"learning_rate": 2.6923076923076927e-05,
"loss": 0.0224,
"step": 106
},
{
"epoch": 0.20626506024096386,
"grad_norm": 0.5343687534332275,
"learning_rate": 2.7179487179487183e-05,
"loss": 0.0198,
"step": 107
},
{
"epoch": 0.20819277108433734,
"grad_norm": 1.852160096168518,
"learning_rate": 2.7435897435897437e-05,
"loss": 0.032,
"step": 108
},
{
"epoch": 0.21012048192771085,
"grad_norm": 0.47291702032089233,
"learning_rate": 2.7692307692307694e-05,
"loss": 0.0117,
"step": 109
},
{
"epoch": 0.21204819277108433,
"grad_norm": 0.7623187899589539,
"learning_rate": 2.794871794871795e-05,
"loss": 0.0337,
"step": 110
},
{
"epoch": 0.21397590361445784,
"grad_norm": 0.5272570848464966,
"learning_rate": 2.820512820512821e-05,
"loss": 0.0131,
"step": 111
},
{
"epoch": 0.21590361445783132,
"grad_norm": 0.5568500757217407,
"learning_rate": 2.8461538461538464e-05,
"loss": 0.0233,
"step": 112
},
{
"epoch": 0.21783132530120483,
"grad_norm": 0.4008469879627228,
"learning_rate": 2.871794871794872e-05,
"loss": 0.0204,
"step": 113
},
{
"epoch": 0.2197590361445783,
"grad_norm": 0.4888612926006317,
"learning_rate": 2.8974358974358977e-05,
"loss": 0.016,
"step": 114
},
{
"epoch": 0.2216867469879518,
"grad_norm": 0.44903355836868286,
"learning_rate": 2.923076923076923e-05,
"loss": 0.0135,
"step": 115
},
{
"epoch": 0.2236144578313253,
"grad_norm": 0.9266762733459473,
"learning_rate": 2.948717948717949e-05,
"loss": 0.0233,
"step": 116
},
{
"epoch": 0.22554216867469878,
"grad_norm": 0.5352638959884644,
"learning_rate": 2.9743589743589747e-05,
"loss": 0.0198,
"step": 117
},
{
"epoch": 0.2274698795180723,
"grad_norm": 0.6051343679428101,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.0246,
"step": 118
},
{
"epoch": 0.22939759036144577,
"grad_norm": 0.9971133470535278,
"learning_rate": 3.0256410256410257e-05,
"loss": 0.025,
"step": 119
},
{
"epoch": 0.23132530120481928,
"grad_norm": 0.704236626625061,
"learning_rate": 3.0512820512820514e-05,
"loss": 0.031,
"step": 120
},
{
"epoch": 0.23325301204819276,
"grad_norm": 0.6137097477912903,
"learning_rate": 3.0769230769230774e-05,
"loss": 0.0519,
"step": 121
},
{
"epoch": 0.23518072289156627,
"grad_norm": 0.7396159768104553,
"learning_rate": 3.102564102564103e-05,
"loss": 0.0325,
"step": 122
},
{
"epoch": 0.23710843373493976,
"grad_norm": 1.3282053470611572,
"learning_rate": 3.128205128205129e-05,
"loss": 0.0252,
"step": 123
},
{
"epoch": 0.23903614457831326,
"grad_norm": 0.5220731496810913,
"learning_rate": 3.153846153846154e-05,
"loss": 0.0262,
"step": 124
},
{
"epoch": 0.24096385542168675,
"grad_norm": 0.5357242822647095,
"learning_rate": 3.1794871794871795e-05,
"loss": 0.0243,
"step": 125
},
{
"epoch": 0.24289156626506025,
"grad_norm": 0.48207753896713257,
"learning_rate": 3.205128205128206e-05,
"loss": 0.0178,
"step": 126
},
{
"epoch": 0.24481927710843374,
"grad_norm": 0.552988588809967,
"learning_rate": 3.230769230769231e-05,
"loss": 0.023,
"step": 127
},
{
"epoch": 0.24674698795180722,
"grad_norm": 1.7962840795516968,
"learning_rate": 3.2564102564102565e-05,
"loss": 0.032,
"step": 128
},
{
"epoch": 0.24867469879518073,
"grad_norm": 1.6404600143432617,
"learning_rate": 3.282051282051282e-05,
"loss": 0.0231,
"step": 129
},
{
"epoch": 0.25060240963855424,
"grad_norm": 0.39142486453056335,
"learning_rate": 3.307692307692308e-05,
"loss": 0.0147,
"step": 130
},
{
"epoch": 0.2525301204819277,
"grad_norm": 1.3272887468338013,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.0439,
"step": 131
},
{
"epoch": 0.2544578313253012,
"grad_norm": 1.5122811794281006,
"learning_rate": 3.358974358974359e-05,
"loss": 0.0282,
"step": 132
},
{
"epoch": 0.2563855421686747,
"grad_norm": 1.8542430400848389,
"learning_rate": 3.384615384615385e-05,
"loss": 0.0515,
"step": 133
},
{
"epoch": 0.2583132530120482,
"grad_norm": 4.059277534484863,
"learning_rate": 3.4102564102564105e-05,
"loss": 0.0781,
"step": 134
},
{
"epoch": 0.26024096385542167,
"grad_norm": 0.6206214427947998,
"learning_rate": 3.435897435897436e-05,
"loss": 0.0306,
"step": 135
},
{
"epoch": 0.2621686746987952,
"grad_norm": 0.4575510323047638,
"learning_rate": 3.461538461538462e-05,
"loss": 0.0154,
"step": 136
},
{
"epoch": 0.2640963855421687,
"grad_norm": 1.1556978225708008,
"learning_rate": 3.4871794871794875e-05,
"loss": 0.0235,
"step": 137
},
{
"epoch": 0.26602409638554214,
"grad_norm": 0.6975051760673523,
"learning_rate": 3.512820512820513e-05,
"loss": 0.0453,
"step": 138
},
{
"epoch": 0.26795180722891565,
"grad_norm": 0.8686623573303223,
"learning_rate": 3.538461538461539e-05,
"loss": 0.0427,
"step": 139
},
{
"epoch": 0.26987951807228916,
"grad_norm": 2.0681848526000977,
"learning_rate": 3.5641025641025646e-05,
"loss": 0.04,
"step": 140
},
{
"epoch": 0.27180722891566267,
"grad_norm": 0.4397984445095062,
"learning_rate": 3.58974358974359e-05,
"loss": 0.0188,
"step": 141
},
{
"epoch": 0.2737349397590361,
"grad_norm": 0.5871334075927734,
"learning_rate": 3.615384615384616e-05,
"loss": 0.0253,
"step": 142
},
{
"epoch": 0.27566265060240963,
"grad_norm": 1.1078568696975708,
"learning_rate": 3.6410256410256416e-05,
"loss": 0.0316,
"step": 143
},
{
"epoch": 0.27759036144578314,
"grad_norm": 0.5691841840744019,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.0266,
"step": 144
},
{
"epoch": 0.27951807228915665,
"grad_norm": 0.7896255254745483,
"learning_rate": 3.692307692307693e-05,
"loss": 0.0281,
"step": 145
},
{
"epoch": 0.2814457831325301,
"grad_norm": 0.9988337159156799,
"learning_rate": 3.7179487179487186e-05,
"loss": 0.0295,
"step": 146
},
{
"epoch": 0.2833734939759036,
"grad_norm": 0.9811834692955017,
"learning_rate": 3.7435897435897436e-05,
"loss": 0.0322,
"step": 147
},
{
"epoch": 0.2853012048192771,
"grad_norm": 0.6503105759620667,
"learning_rate": 3.769230769230769e-05,
"loss": 0.0266,
"step": 148
},
{
"epoch": 0.28722891566265063,
"grad_norm": 1.9164355993270874,
"learning_rate": 3.794871794871795e-05,
"loss": 0.0677,
"step": 149
},
{
"epoch": 0.2891566265060241,
"grad_norm": 1.1724557876586914,
"learning_rate": 3.820512820512821e-05,
"loss": 0.0324,
"step": 150
},
{
"epoch": 0.2910843373493976,
"grad_norm": 0.8482469916343689,
"learning_rate": 3.846153846153846e-05,
"loss": 0.0259,
"step": 151
},
{
"epoch": 0.2930120481927711,
"grad_norm": 0.8572830557823181,
"learning_rate": 3.871794871794872e-05,
"loss": 0.0358,
"step": 152
},
{
"epoch": 0.29493975903614456,
"grad_norm": 0.6630825400352478,
"learning_rate": 3.8974358974358976e-05,
"loss": 0.0447,
"step": 153
},
{
"epoch": 0.29686746987951806,
"grad_norm": 0.9197093844413757,
"learning_rate": 3.923076923076923e-05,
"loss": 0.0409,
"step": 154
},
{
"epoch": 0.2987951807228916,
"grad_norm": 0.6976819634437561,
"learning_rate": 3.948717948717949e-05,
"loss": 0.0317,
"step": 155
},
{
"epoch": 0.3007228915662651,
"grad_norm": 0.7353514432907104,
"learning_rate": 3.9743589743589747e-05,
"loss": 0.0306,
"step": 156
},
{
"epoch": 0.30265060240963854,
"grad_norm": 0.5730232000350952,
"learning_rate": 4e-05,
"loss": 0.0324,
"step": 157
},
{
"epoch": 0.30457831325301205,
"grad_norm": 0.7852078676223755,
"learning_rate": 3.999994971675547e-05,
"loss": 0.0354,
"step": 158
},
{
"epoch": 0.30650602409638555,
"grad_norm": 0.5924715399742126,
"learning_rate": 3.999979886727471e-05,
"loss": 0.0366,
"step": 159
},
{
"epoch": 0.30843373493975906,
"grad_norm": 0.7359845638275146,
"learning_rate": 3.999954745231624e-05,
"loss": 0.0437,
"step": 160
},
{
"epoch": 0.3103614457831325,
"grad_norm": 0.7866976857185364,
"learning_rate": 3.999919547314426e-05,
"loss": 0.0363,
"step": 161
},
{
"epoch": 0.312289156626506,
"grad_norm": 0.7425745129585266,
"learning_rate": 3.999874293152863e-05,
"loss": 0.0259,
"step": 162
},
{
"epoch": 0.31421686746987953,
"grad_norm": 1.8922245502471924,
"learning_rate": 3.9998189829744885e-05,
"loss": 0.0341,
"step": 163
},
{
"epoch": 0.316144578313253,
"grad_norm": 0.7908634543418884,
"learning_rate": 3.99975361705742e-05,
"loss": 0.0424,
"step": 164
},
{
"epoch": 0.3180722891566265,
"grad_norm": 2.047368049621582,
"learning_rate": 3.999678195730337e-05,
"loss": 0.0535,
"step": 165
},
{
"epoch": 0.32,
"grad_norm": 0.5702639222145081,
"learning_rate": 3.999592719372484e-05,
"loss": 0.0284,
"step": 166
},
{
"epoch": 0.3219277108433735,
"grad_norm": 0.45015648007392883,
"learning_rate": 3.9994971884136636e-05,
"loss": 0.0313,
"step": 167
},
{
"epoch": 0.32385542168674697,
"grad_norm": 4.094679355621338,
"learning_rate": 3.9993916033342355e-05,
"loss": 0.0524,
"step": 168
},
{
"epoch": 0.3257831325301205,
"grad_norm": 0.800846517086029,
"learning_rate": 3.999275964665117e-05,
"loss": 0.0282,
"step": 169
},
{
"epoch": 0.327710843373494,
"grad_norm": 0.47881078720092773,
"learning_rate": 3.999150272987776e-05,
"loss": 0.0293,
"step": 170
},
{
"epoch": 0.3296385542168675,
"grad_norm": 0.5716657638549805,
"learning_rate": 3.999014528934232e-05,
"loss": 0.0221,
"step": 171
},
{
"epoch": 0.33156626506024095,
"grad_norm": 0.6333311200141907,
"learning_rate": 3.998868733187048e-05,
"loss": 0.0302,
"step": 172
},
{
"epoch": 0.33349397590361446,
"grad_norm": 6.642521858215332,
"learning_rate": 3.998712886479335e-05,
"loss": 0.0364,
"step": 173
},
{
"epoch": 0.33542168674698797,
"grad_norm": 0.7515506148338318,
"learning_rate": 3.998546989594739e-05,
"loss": 0.0296,
"step": 174
},
{
"epoch": 0.3373493975903614,
"grad_norm": 1.0728015899658203,
"learning_rate": 3.998371043367445e-05,
"loss": 0.0549,
"step": 175
},
{
"epoch": 0.33927710843373493,
"grad_norm": 1.3025579452514648,
"learning_rate": 3.998185048682166e-05,
"loss": 0.0577,
"step": 176
},
{
"epoch": 0.34120481927710844,
"grad_norm": 1.0962958335876465,
"learning_rate": 3.997989006474144e-05,
"loss": 0.0313,
"step": 177
},
{
"epoch": 0.34313253012048195,
"grad_norm": 0.7064313292503357,
"learning_rate": 3.997782917729143e-05,
"loss": 0.0309,
"step": 178
},
{
"epoch": 0.3450602409638554,
"grad_norm": 0.43374207615852356,
"learning_rate": 3.997566783483445e-05,
"loss": 0.0166,
"step": 179
},
{
"epoch": 0.3469879518072289,
"grad_norm": 0.7236390113830566,
"learning_rate": 3.9973406048238413e-05,
"loss": 0.0254,
"step": 180
},
{
"epoch": 0.3489156626506024,
"grad_norm": 0.5041500926017761,
"learning_rate": 3.9971043828876334e-05,
"loss": 0.0239,
"step": 181
},
{
"epoch": 0.35084337349397593,
"grad_norm": 1.2744532823562622,
"learning_rate": 3.9968581188626204e-05,
"loss": 0.0404,
"step": 182
},
{
"epoch": 0.3527710843373494,
"grad_norm": 0.45845362544059753,
"learning_rate": 3.996601813987098e-05,
"loss": 0.0127,
"step": 183
},
{
"epoch": 0.3546987951807229,
"grad_norm": 0.4426881968975067,
"learning_rate": 3.996335469549852e-05,
"loss": 0.0176,
"step": 184
},
{
"epoch": 0.3566265060240964,
"grad_norm": 1.0030732154846191,
"learning_rate": 3.9960590868901465e-05,
"loss": 0.0457,
"step": 185
},
{
"epoch": 0.35855421686746985,
"grad_norm": 0.6428582668304443,
"learning_rate": 3.995772667397725e-05,
"loss": 0.0271,
"step": 186
},
{
"epoch": 0.36048192771084336,
"grad_norm": 0.5335744619369507,
"learning_rate": 3.995476212512795e-05,
"loss": 0.0297,
"step": 187
},
{
"epoch": 0.3624096385542169,
"grad_norm": 0.6995761394500732,
"learning_rate": 3.99516972372603e-05,
"loss": 0.0322,
"step": 188
},
{
"epoch": 0.3643373493975904,
"grad_norm": 0.765511155128479,
"learning_rate": 3.9948532025785546e-05,
"loss": 0.0253,
"step": 189
},
{
"epoch": 0.36626506024096384,
"grad_norm": 0.6165828108787537,
"learning_rate": 3.9945266506619403e-05,
"loss": 0.0355,
"step": 190
},
{
"epoch": 0.36819277108433734,
"grad_norm": 0.851970911026001,
"learning_rate": 3.994190069618195e-05,
"loss": 0.056,
"step": 191
},
{
"epoch": 0.37012048192771085,
"grad_norm": 0.9850023984909058,
"learning_rate": 3.993843461139757e-05,
"loss": 0.0415,
"step": 192
},
{
"epoch": 0.37204819277108436,
"grad_norm": 0.7455295324325562,
"learning_rate": 3.9934868269694886e-05,
"loss": 0.0379,
"step": 193
},
{
"epoch": 0.3739759036144578,
"grad_norm": 1.159469723701477,
"learning_rate": 3.9931201689006595e-05,
"loss": 0.0237,
"step": 194
},
{
"epoch": 0.3759036144578313,
"grad_norm": 0.5490080118179321,
"learning_rate": 3.992743488776947e-05,
"loss": 0.024,
"step": 195
},
{
"epoch": 0.37783132530120483,
"grad_norm": 1.279831886291504,
"learning_rate": 3.992356788492421e-05,
"loss": 0.0273,
"step": 196
},
{
"epoch": 0.3797590361445783,
"grad_norm": 0.859104335308075,
"learning_rate": 3.9919600699915355e-05,
"loss": 0.0411,
"step": 197
},
{
"epoch": 0.3816867469879518,
"grad_norm": 1.2525300979614258,
"learning_rate": 3.991553335269119e-05,
"loss": 0.0857,
"step": 198
},
{
"epoch": 0.3836144578313253,
"grad_norm": 0.4924193024635315,
"learning_rate": 3.991136586370367e-05,
"loss": 0.0294,
"step": 199
},
{
"epoch": 0.3855421686746988,
"grad_norm": 1.417190670967102,
"learning_rate": 3.990709825390828e-05,
"loss": 0.0395,
"step": 200
},
{
"epoch": 0.38746987951807227,
"grad_norm": 0.6172056198120117,
"learning_rate": 3.9902730544763936e-05,
"loss": 0.0194,
"step": 201
},
{
"epoch": 0.3893975903614458,
"grad_norm": 0.7292149662971497,
"learning_rate": 3.989826275823291e-05,
"loss": 0.0381,
"step": 202
},
{
"epoch": 0.3913253012048193,
"grad_norm": 0.5949816107749939,
"learning_rate": 3.989369491678067e-05,
"loss": 0.0254,
"step": 203
},
{
"epoch": 0.3932530120481928,
"grad_norm": 0.6012582182884216,
"learning_rate": 3.988902704337582e-05,
"loss": 0.048,
"step": 204
},
{
"epoch": 0.39518072289156625,
"grad_norm": 0.6273590922355652,
"learning_rate": 3.9884259161489936e-05,
"loss": 0.0268,
"step": 205
},
{
"epoch": 0.39710843373493976,
"grad_norm": 0.9615244269371033,
"learning_rate": 3.987939129509746e-05,
"loss": 0.0192,
"step": 206
},
{
"epoch": 0.39903614457831327,
"grad_norm": 0.6009241342544556,
"learning_rate": 3.9874423468675624e-05,
"loss": 0.0362,
"step": 207
},
{
"epoch": 0.4009638554216867,
"grad_norm": 0.411335289478302,
"learning_rate": 3.9869355707204266e-05,
"loss": 0.017,
"step": 208
},
{
"epoch": 0.40289156626506023,
"grad_norm": 0.6151527166366577,
"learning_rate": 3.986418803616573e-05,
"loss": 0.0283,
"step": 209
},
{
"epoch": 0.40481927710843374,
"grad_norm": 0.33808204531669617,
"learning_rate": 3.985892048154474e-05,
"loss": 0.0158,
"step": 210
},
{
"epoch": 0.40674698795180725,
"grad_norm": 0.5464187860488892,
"learning_rate": 3.9853553069828284e-05,
"loss": 0.0292,
"step": 211
},
{
"epoch": 0.4086746987951807,
"grad_norm": 0.6658390760421753,
"learning_rate": 3.984808582800543e-05,
"loss": 0.0281,
"step": 212
},
{
"epoch": 0.4106024096385542,
"grad_norm": 0.4253764748573303,
"learning_rate": 3.984251878356726e-05,
"loss": 0.031,
"step": 213
},
{
"epoch": 0.4125301204819277,
"grad_norm": 0.32309481501579285,
"learning_rate": 3.983685196450667e-05,
"loss": 0.0166,
"step": 214
},
{
"epoch": 0.41445783132530123,
"grad_norm": 0.43756410479545593,
"learning_rate": 3.9831085399318265e-05,
"loss": 0.0326,
"step": 215
},
{
"epoch": 0.4163855421686747,
"grad_norm": 0.264046847820282,
"learning_rate": 3.982521911699822e-05,
"loss": 0.0118,
"step": 216
},
{
"epoch": 0.4183132530120482,
"grad_norm": 0.8630897402763367,
"learning_rate": 3.9819253147044084e-05,
"loss": 0.0246,
"step": 217
},
{
"epoch": 0.4202409638554217,
"grad_norm": 0.6923379898071289,
"learning_rate": 3.98131875194547e-05,
"loss": 0.036,
"step": 218
},
{
"epoch": 0.42216867469879515,
"grad_norm": 0.5874778628349304,
"learning_rate": 3.9807022264730024e-05,
"loss": 0.0255,
"step": 219
},
{
"epoch": 0.42409638554216866,
"grad_norm": 0.394336074590683,
"learning_rate": 3.980075741387094e-05,
"loss": 0.0187,
"step": 220
},
{
"epoch": 0.4260240963855422,
"grad_norm": 0.6300327777862549,
"learning_rate": 3.979439299837915e-05,
"loss": 0.0214,
"step": 221
},
{
"epoch": 0.4279518072289157,
"grad_norm": 0.5200467109680176,
"learning_rate": 3.978792905025702e-05,
"loss": 0.0628,
"step": 222
},
{
"epoch": 0.42987951807228914,
"grad_norm": 0.5713880062103271,
"learning_rate": 3.978136560200735e-05,
"loss": 0.0302,
"step": 223
},
{
"epoch": 0.43180722891566264,
"grad_norm": 0.5345383286476135,
"learning_rate": 3.977470268663331e-05,
"loss": 0.0125,
"step": 224
},
{
"epoch": 0.43373493975903615,
"grad_norm": 0.5378350019454956,
"learning_rate": 3.976794033763819e-05,
"loss": 0.0246,
"step": 225
},
{
"epoch": 0.43566265060240966,
"grad_norm": 0.5554935336112976,
"learning_rate": 3.9761078589025276e-05,
"loss": 0.0212,
"step": 226
},
{
"epoch": 0.4375903614457831,
"grad_norm": 0.2832634747028351,
"learning_rate": 3.9754117475297664e-05,
"loss": 0.0125,
"step": 227
},
{
"epoch": 0.4395180722891566,
"grad_norm": 1.2910150289535522,
"learning_rate": 3.97470570314581e-05,
"loss": 0.0364,
"step": 228
},
{
"epoch": 0.44144578313253013,
"grad_norm": 0.3731018602848053,
"learning_rate": 3.973989729300878e-05,
"loss": 0.0128,
"step": 229
},
{
"epoch": 0.4433734939759036,
"grad_norm": 0.9433871507644653,
"learning_rate": 3.9732638295951195e-05,
"loss": 0.0367,
"step": 230
},
{
"epoch": 0.4453012048192771,
"grad_norm": 1.0779197216033936,
"learning_rate": 3.972528007678594e-05,
"loss": 0.0667,
"step": 231
},
{
"epoch": 0.4472289156626506,
"grad_norm": 1.7009105682373047,
"learning_rate": 3.9717822672512516e-05,
"loss": 0.0655,
"step": 232
},
{
"epoch": 0.4491566265060241,
"grad_norm": 0.5646032094955444,
"learning_rate": 3.971026612062919e-05,
"loss": 0.064,
"step": 233
},
{
"epoch": 0.45108433734939757,
"grad_norm": 0.44474121928215027,
"learning_rate": 3.970261045913274e-05,
"loss": 0.0206,
"step": 234
},
{
"epoch": 0.4530120481927711,
"grad_norm": 1.3969277143478394,
"learning_rate": 3.969485572651833e-05,
"loss": 0.0486,
"step": 235
},
{
"epoch": 0.4549397590361446,
"grad_norm": 0.6401994228363037,
"learning_rate": 3.968700196177925e-05,
"loss": 0.0262,
"step": 236
},
{
"epoch": 0.4568674698795181,
"grad_norm": 0.7091913223266602,
"learning_rate": 3.96790492044068e-05,
"loss": 0.014,
"step": 237
},
{
"epoch": 0.45879518072289155,
"grad_norm": 0.6561547517776489,
"learning_rate": 3.967099749439002e-05,
"loss": 0.0482,
"step": 238
},
{
"epoch": 0.46072289156626506,
"grad_norm": 0.6924155354499817,
"learning_rate": 3.966284687221551e-05,
"loss": 0.0289,
"step": 239
},
{
"epoch": 0.46265060240963857,
"grad_norm": 0.5868663787841797,
"learning_rate": 3.9654597378867256e-05,
"loss": 0.0331,
"step": 240
},
{
"epoch": 0.464578313253012,
"grad_norm": 0.7930939793586731,
"learning_rate": 3.964624905582637e-05,
"loss": 0.0925,
"step": 241
},
{
"epoch": 0.46650602409638553,
"grad_norm": 0.4888836145401001,
"learning_rate": 3.9637801945070944e-05,
"loss": 0.015,
"step": 242
},
{
"epoch": 0.46843373493975904,
"grad_norm": 0.7820287346839905,
"learning_rate": 3.962925608907579e-05,
"loss": 0.0382,
"step": 243
},
{
"epoch": 0.47036144578313255,
"grad_norm": 0.4914316236972809,
"learning_rate": 3.962061153081224e-05,
"loss": 0.0257,
"step": 244
},
{
"epoch": 0.472289156626506,
"grad_norm": 0.5681505799293518,
"learning_rate": 3.961186831374793e-05,
"loss": 0.0551,
"step": 245
},
{
"epoch": 0.4742168674698795,
"grad_norm": 0.5049723386764526,
"learning_rate": 3.9603026481846616e-05,
"loss": 0.0186,
"step": 246
},
{
"epoch": 0.476144578313253,
"grad_norm": 0.5034119486808777,
"learning_rate": 3.959408607956787e-05,
"loss": 0.024,
"step": 247
},
{
"epoch": 0.47807228915662653,
"grad_norm": 0.4543336033821106,
"learning_rate": 3.958504715186695e-05,
"loss": 0.0256,
"step": 248
},
{
"epoch": 0.48,
"grad_norm": 0.5595743656158447,
"learning_rate": 3.957590974419452e-05,
"loss": 0.0222,
"step": 249
},
{
"epoch": 0.4819277108433735,
"grad_norm": 0.5701581239700317,
"learning_rate": 3.956667390249642e-05,
"loss": 0.0334,
"step": 250
},
{
"epoch": 0.483855421686747,
"grad_norm": 0.53755784034729,
"learning_rate": 3.9557339673213474e-05,
"loss": 0.0345,
"step": 251
},
{
"epoch": 0.4857831325301205,
"grad_norm": 0.4368877112865448,
"learning_rate": 3.95479071032812e-05,
"loss": 0.0183,
"step": 252
},
{
"epoch": 0.48771084337349396,
"grad_norm": 0.7972906827926636,
"learning_rate": 3.953837624012963e-05,
"loss": 0.0337,
"step": 253
},
{
"epoch": 0.48963855421686747,
"grad_norm": 0.6148451566696167,
"learning_rate": 3.9528747131683023e-05,
"loss": 0.0524,
"step": 254
},
{
"epoch": 0.491566265060241,
"grad_norm": 0.500840961933136,
"learning_rate": 3.9519019826359676e-05,
"loss": 0.0248,
"step": 255
},
{
"epoch": 0.49349397590361443,
"grad_norm": 0.5536255240440369,
"learning_rate": 3.9509194373071624e-05,
"loss": 0.0219,
"step": 256
},
{
"epoch": 0.49542168674698794,
"grad_norm": 0.6873176097869873,
"learning_rate": 3.9499270821224444e-05,
"loss": 0.0312,
"step": 257
},
{
"epoch": 0.49734939759036145,
"grad_norm": 0.37207168340682983,
"learning_rate": 3.9489249220716974e-05,
"loss": 0.0149,
"step": 258
},
{
"epoch": 0.49927710843373496,
"grad_norm": 0.4458799660205841,
"learning_rate": 3.947912962194107e-05,
"loss": 0.0214,
"step": 259
},
{
"epoch": 0.5012048192771085,
"grad_norm": 0.4272724390029907,
"learning_rate": 3.9468912075781345e-05,
"loss": 0.0263,
"step": 260
},
{
"epoch": 0.503132530120482,
"grad_norm": 0.5245792269706726,
"learning_rate": 3.945859663361496e-05,
"loss": 0.0103,
"step": 261
},
{
"epoch": 0.5050602409638554,
"grad_norm": 0.8799260854721069,
"learning_rate": 3.9448183347311284e-05,
"loss": 0.0292,
"step": 262
},
{
"epoch": 0.5069879518072289,
"grad_norm": 0.5996833443641663,
"learning_rate": 3.943767226923171e-05,
"loss": 0.0306,
"step": 263
},
{
"epoch": 0.5089156626506024,
"grad_norm": 0.6044682860374451,
"learning_rate": 3.942706345222935e-05,
"loss": 0.0218,
"step": 264
},
{
"epoch": 0.5108433734939759,
"grad_norm": 0.4770200848579407,
"learning_rate": 3.941635694964878e-05,
"loss": 0.0226,
"step": 265
},
{
"epoch": 0.5127710843373494,
"grad_norm": 0.5605704188346863,
"learning_rate": 3.940555281532576e-05,
"loss": 0.0354,
"step": 266
},
{
"epoch": 0.5146987951807229,
"grad_norm": 0.46532443165779114,
"learning_rate": 3.939465110358699e-05,
"loss": 0.0223,
"step": 267
},
{
"epoch": 0.5166265060240964,
"grad_norm": 0.5190595388412476,
"learning_rate": 3.93836518692498e-05,
"loss": 0.0219,
"step": 268
},
{
"epoch": 0.5185542168674698,
"grad_norm": 0.5767757892608643,
"learning_rate": 3.937255516762193e-05,
"loss": 0.0294,
"step": 269
},
{
"epoch": 0.5204819277108433,
"grad_norm": 0.4543164372444153,
"learning_rate": 3.936136105450119e-05,
"loss": 0.0244,
"step": 270
},
{
"epoch": 0.5224096385542168,
"grad_norm": 0.4155154526233673,
"learning_rate": 3.9350069586175195e-05,
"loss": 0.02,
"step": 271
},
{
"epoch": 0.5243373493975904,
"grad_norm": 0.5470768213272095,
"learning_rate": 3.933868081942113e-05,
"loss": 0.0187,
"step": 272
},
{
"epoch": 0.5262650602409639,
"grad_norm": 0.9491772651672363,
"learning_rate": 3.9327194811505406e-05,
"loss": 0.0337,
"step": 273
},
{
"epoch": 0.5281927710843374,
"grad_norm": 0.9313873052597046,
"learning_rate": 3.93156116201834e-05,
"loss": 0.0573,
"step": 274
},
{
"epoch": 0.5301204819277109,
"grad_norm": 0.7181005477905273,
"learning_rate": 3.930393130369915e-05,
"loss": 0.0405,
"step": 275
},
{
"epoch": 0.5320481927710843,
"grad_norm": 0.34231385588645935,
"learning_rate": 3.9292153920785076e-05,
"loss": 0.0153,
"step": 276
},
{
"epoch": 0.5339759036144578,
"grad_norm": 0.6899610161781311,
"learning_rate": 3.928027953066168e-05,
"loss": 0.0338,
"step": 277
},
{
"epoch": 0.5359036144578313,
"grad_norm": 0.7509781718254089,
"learning_rate": 3.926830819303726e-05,
"loss": 0.0416,
"step": 278
},
{
"epoch": 0.5378313253012048,
"grad_norm": 0.6326774954795837,
"learning_rate": 3.925623996810757e-05,
"loss": 0.0293,
"step": 279
},
{
"epoch": 0.5397590361445783,
"grad_norm": 0.5543203353881836,
"learning_rate": 3.924407491655557e-05,
"loss": 0.0263,
"step": 280
},
{
"epoch": 0.5416867469879518,
"grad_norm": 0.5367572903633118,
"learning_rate": 3.9231813099551086e-05,
"loss": 0.0276,
"step": 281
},
{
"epoch": 0.5436144578313253,
"grad_norm": 0.3143869638442993,
"learning_rate": 3.921945457875051e-05,
"loss": 0.0146,
"step": 282
},
{
"epoch": 0.5455421686746988,
"grad_norm": 0.47403043508529663,
"learning_rate": 3.920699941629649e-05,
"loss": 0.0267,
"step": 283
},
{
"epoch": 0.5474698795180722,
"grad_norm": 0.5082595348358154,
"learning_rate": 3.919444767481763e-05,
"loss": 0.0183,
"step": 284
},
{
"epoch": 0.5493975903614458,
"grad_norm": 0.747949481010437,
"learning_rate": 3.918179941742816e-05,
"loss": 0.0412,
"step": 285
},
{
"epoch": 0.5513253012048193,
"grad_norm": 0.6553886532783508,
"learning_rate": 3.916905470772762e-05,
"loss": 0.0505,
"step": 286
},
{
"epoch": 0.5532530120481928,
"grad_norm": 0.3838176131248474,
"learning_rate": 3.9156213609800545e-05,
"loss": 0.0156,
"step": 287
},
{
"epoch": 0.5551807228915663,
"grad_norm": 0.7427731156349182,
"learning_rate": 3.914327618821614e-05,
"loss": 0.0278,
"step": 288
},
{
"epoch": 0.5571084337349398,
"grad_norm": 0.2612821161746979,
"learning_rate": 3.913024250802796e-05,
"loss": 0.0101,
"step": 289
},
{
"epoch": 0.5590361445783133,
"grad_norm": 0.3799416124820709,
"learning_rate": 3.911711263477357e-05,
"loss": 0.0168,
"step": 290
},
{
"epoch": 0.5609638554216867,
"grad_norm": 0.5053854584693909,
"learning_rate": 3.910388663447425e-05,
"loss": 0.0249,
"step": 291
},
{
"epoch": 0.5628915662650602,
"grad_norm": 0.38095012307167053,
"learning_rate": 3.909056457363461e-05,
"loss": 0.0156,
"step": 292
},
{
"epoch": 0.5648192771084337,
"grad_norm": 0.4477892220020294,
"learning_rate": 3.907714651924229e-05,
"loss": 0.0309,
"step": 293
},
{
"epoch": 0.5667469879518072,
"grad_norm": 0.5875864624977112,
"learning_rate": 3.906363253876763e-05,
"loss": 0.0287,
"step": 294
},
{
"epoch": 0.5686746987951807,
"grad_norm": 0.522990882396698,
"learning_rate": 3.90500227001633e-05,
"loss": 0.0318,
"step": 295
},
{
"epoch": 0.5706024096385542,
"grad_norm": 0.4153876304626465,
"learning_rate": 3.9036317071863994e-05,
"loss": 0.0192,
"step": 296
},
{
"epoch": 0.5725301204819278,
"grad_norm": 0.4675769507884979,
"learning_rate": 3.902251572278605e-05,
"loss": 0.067,
"step": 297
},
{
"epoch": 0.5744578313253013,
"grad_norm": 0.35778650641441345,
"learning_rate": 3.900861872232713e-05,
"loss": 0.0197,
"step": 298
},
{
"epoch": 0.5763855421686747,
"grad_norm": 0.7382330894470215,
"learning_rate": 3.899462614036587e-05,
"loss": 0.0283,
"step": 299
},
{
"epoch": 0.5783132530120482,
"grad_norm": 0.41268599033355713,
"learning_rate": 3.89805380472615e-05,
"loss": 0.0207,
"step": 300
},
{
"epoch": 0.5802409638554217,
"grad_norm": 1.2013020515441895,
"learning_rate": 3.8966354513853535e-05,
"loss": 0.0301,
"step": 301
},
{
"epoch": 0.5821686746987952,
"grad_norm": 0.424757719039917,
"learning_rate": 3.895207561146137e-05,
"loss": 0.022,
"step": 302
},
{
"epoch": 0.5840963855421687,
"grad_norm": 0.4196677505970001,
"learning_rate": 3.893770141188396e-05,
"loss": 0.0424,
"step": 303
},
{
"epoch": 0.5860240963855422,
"grad_norm": 0.8644190430641174,
"learning_rate": 3.892323198739946e-05,
"loss": 0.08,
"step": 304
},
{
"epoch": 0.5879518072289157,
"grad_norm": 0.5645135045051575,
"learning_rate": 3.890866741076482e-05,
"loss": 0.0152,
"step": 305
},
{
"epoch": 0.5898795180722891,
"grad_norm": 0.5218387246131897,
"learning_rate": 3.889400775521545e-05,
"loss": 0.0205,
"step": 306
},
{
"epoch": 0.5918072289156626,
"grad_norm": 0.39709413051605225,
"learning_rate": 3.8879253094464865e-05,
"loss": 0.0233,
"step": 307
},
{
"epoch": 0.5937349397590361,
"grad_norm": 0.3572910726070404,
"learning_rate": 3.8864403502704285e-05,
"loss": 0.0198,
"step": 308
},
{
"epoch": 0.5956626506024096,
"grad_norm": 0.382709264755249,
"learning_rate": 3.8849459054602274e-05,
"loss": 0.0176,
"step": 309
},
{
"epoch": 0.5975903614457831,
"grad_norm": 3.4527227878570557,
"learning_rate": 3.883441982530436e-05,
"loss": 0.0239,
"step": 310
},
{
"epoch": 0.5995180722891567,
"grad_norm": 0.4467569589614868,
"learning_rate": 3.8819285890432674e-05,
"loss": 0.0284,
"step": 311
},
{
"epoch": 0.6014457831325302,
"grad_norm": 0.44513460993766785,
"learning_rate": 3.880405732608555e-05,
"loss": 0.0233,
"step": 312
},
{
"epoch": 0.6033734939759036,
"grad_norm": 0.8029689192771912,
"learning_rate": 3.8788734208837155e-05,
"loss": 0.0433,
"step": 313
},
{
"epoch": 0.6053012048192771,
"grad_norm": 0.7291454076766968,
"learning_rate": 3.877331661573709e-05,
"loss": 0.043,
"step": 314
},
{
"epoch": 0.6072289156626506,
"grad_norm": 0.6050467491149902,
"learning_rate": 3.8757804624310006e-05,
"loss": 0.0377,
"step": 315
},
{
"epoch": 0.6091566265060241,
"grad_norm": 0.6714366674423218,
"learning_rate": 3.874219831255524e-05,
"loss": 0.046,
"step": 316
},
{
"epoch": 0.6110843373493976,
"grad_norm": 0.336037278175354,
"learning_rate": 3.8726497758946394e-05,
"loss": 0.0149,
"step": 317
},
{
"epoch": 0.6130120481927711,
"grad_norm": 0.3057402968406677,
"learning_rate": 3.871070304243094e-05,
"loss": 0.014,
"step": 318
},
{
"epoch": 0.6149397590361446,
"grad_norm": 0.4537644684314728,
"learning_rate": 3.8694814242429834e-05,
"loss": 0.0503,
"step": 319
},
{
"epoch": 0.6168674698795181,
"grad_norm": 0.45573824644088745,
"learning_rate": 3.8678831438837116e-05,
"loss": 0.021,
"step": 320
},
{
"epoch": 0.6187951807228915,
"grad_norm": 0.30729591846466064,
"learning_rate": 3.866275471201952e-05,
"loss": 0.0163,
"step": 321
},
{
"epoch": 0.620722891566265,
"grad_norm": 0.7614850401878357,
"learning_rate": 3.8646584142816036e-05,
"loss": 0.0347,
"step": 322
},
{
"epoch": 0.6226506024096385,
"grad_norm": 0.5323611497879028,
"learning_rate": 3.863031981253754e-05,
"loss": 0.0201,
"step": 323
},
{
"epoch": 0.624578313253012,
"grad_norm": 0.34426453709602356,
"learning_rate": 3.861396180296635e-05,
"loss": 0.0243,
"step": 324
},
{
"epoch": 0.6265060240963856,
"grad_norm": 0.621636152267456,
"learning_rate": 3.859751019635585e-05,
"loss": 0.0166,
"step": 325
},
{
"epoch": 0.6284337349397591,
"grad_norm": 0.549324095249176,
"learning_rate": 3.858096507543006e-05,
"loss": 0.0274,
"step": 326
},
{
"epoch": 0.6303614457831326,
"grad_norm": 0.358426570892334,
"learning_rate": 3.8564326523383214e-05,
"loss": 0.0207,
"step": 327
},
{
"epoch": 0.632289156626506,
"grad_norm": 0.3639723062515259,
"learning_rate": 3.8547594623879346e-05,
"loss": 0.0297,
"step": 328
},
{
"epoch": 0.6342168674698795,
"grad_norm": 0.3402212858200073,
"learning_rate": 3.853076946105188e-05,
"loss": 0.0258,
"step": 329
},
{
"epoch": 0.636144578313253,
"grad_norm": 0.4083027243614197,
"learning_rate": 3.85138511195032e-05,
"loss": 0.0351,
"step": 330
},
{
"epoch": 0.6380722891566265,
"grad_norm": 0.43532121181488037,
"learning_rate": 3.84968396843042e-05,
"loss": 0.0388,
"step": 331
},
{
"epoch": 0.64,
"grad_norm": 0.35353463888168335,
"learning_rate": 3.8479735240993904e-05,
"loss": 0.0203,
"step": 332
},
{
"epoch": 0.6419277108433735,
"grad_norm": 0.350149929523468,
"learning_rate": 3.846253787557901e-05,
"loss": 0.0261,
"step": 333
},
{
"epoch": 0.643855421686747,
"grad_norm": 0.7665389180183411,
"learning_rate": 3.844524767453344e-05,
"loss": 0.0108,
"step": 334
},
{
"epoch": 0.6457831325301204,
"grad_norm": 0.44621360301971436,
"learning_rate": 3.842786472479795e-05,
"loss": 0.0282,
"step": 335
},
{
"epoch": 0.6477108433734939,
"grad_norm": 0.7787201404571533,
"learning_rate": 3.841038911377962e-05,
"loss": 0.0216,
"step": 336
},
{
"epoch": 0.6496385542168674,
"grad_norm": 0.48260653018951416,
"learning_rate": 3.839282092935153e-05,
"loss": 0.0234,
"step": 337
},
{
"epoch": 0.651566265060241,
"grad_norm": 0.4987852871417999,
"learning_rate": 3.837516025985219e-05,
"loss": 0.0515,
"step": 338
},
{
"epoch": 0.6534939759036145,
"grad_norm": 0.9030266404151917,
"learning_rate": 3.835740719408517e-05,
"loss": 0.0508,
"step": 339
},
{
"epoch": 0.655421686746988,
"grad_norm": 0.6381701231002808,
"learning_rate": 3.833956182131867e-05,
"loss": 0.0405,
"step": 340
},
{
"epoch": 0.6573493975903615,
"grad_norm": 0.42828986048698425,
"learning_rate": 3.832162423128499e-05,
"loss": 0.024,
"step": 341
},
{
"epoch": 0.659277108433735,
"grad_norm": 0.38725873827934265,
"learning_rate": 3.8303594514180164e-05,
"loss": 0.0199,
"step": 342
},
{
"epoch": 0.6612048192771084,
"grad_norm": 0.23280498385429382,
"learning_rate": 3.828547276066346e-05,
"loss": 0.0101,
"step": 343
},
{
"epoch": 0.6631325301204819,
"grad_norm": 0.7298216819763184,
"learning_rate": 3.8267259061856925e-05,
"loss": 0.0455,
"step": 344
},
{
"epoch": 0.6650602409638554,
"grad_norm": 0.5975687503814697,
"learning_rate": 3.824895350934496e-05,
"loss": 0.0372,
"step": 345
},
{
"epoch": 0.6669879518072289,
"grad_norm": 0.6295403242111206,
"learning_rate": 3.823055619517381e-05,
"loss": 0.0362,
"step": 346
},
{
"epoch": 0.6689156626506024,
"grad_norm": 0.5086020827293396,
"learning_rate": 3.821206721185115e-05,
"loss": 0.0368,
"step": 347
},
{
"epoch": 0.6708433734939759,
"grad_norm": 0.34506168961524963,
"learning_rate": 3.819348665234557e-05,
"loss": 0.0178,
"step": 348
},
{
"epoch": 0.6727710843373494,
"grad_norm": 1.309940218925476,
"learning_rate": 3.817481461008617e-05,
"loss": 0.024,
"step": 349
},
{
"epoch": 0.6746987951807228,
"grad_norm": 0.4074770510196686,
"learning_rate": 3.815605117896204e-05,
"loss": 0.0262,
"step": 350
},
{
"epoch": 0.6766265060240964,
"grad_norm": 0.48525840044021606,
"learning_rate": 3.8137196453321775e-05,
"loss": 0.0209,
"step": 351
},
{
"epoch": 0.6785542168674699,
"grad_norm": 0.7199739217758179,
"learning_rate": 3.811825052797308e-05,
"loss": 0.0396,
"step": 352
},
{
"epoch": 0.6804819277108434,
"grad_norm": 0.519540011882782,
"learning_rate": 3.8099213498182196e-05,
"loss": 0.0453,
"step": 353
},
{
"epoch": 0.6824096385542169,
"grad_norm": 0.9738391041755676,
"learning_rate": 3.808008545967349e-05,
"loss": 0.0317,
"step": 354
},
{
"epoch": 0.6843373493975904,
"grad_norm": 1.888344407081604,
"learning_rate": 3.8060866508628953e-05,
"loss": 0.0452,
"step": 355
},
{
"epoch": 0.6862650602409639,
"grad_norm": 0.48989811539649963,
"learning_rate": 3.8041556741687695e-05,
"loss": 0.0315,
"step": 356
},
{
"epoch": 0.6881927710843373,
"grad_norm": 0.3764645457267761,
"learning_rate": 3.8022156255945496e-05,
"loss": 0.0269,
"step": 357
},
{
"epoch": 0.6901204819277108,
"grad_norm": 0.46409738063812256,
"learning_rate": 3.800266514895429e-05,
"loss": 0.0171,
"step": 358
},
{
"epoch": 0.6920481927710843,
"grad_norm": 0.41091030836105347,
"learning_rate": 3.7983083518721695e-05,
"loss": 0.0167,
"step": 359
},
{
"epoch": 0.6939759036144578,
"grad_norm": 0.8375523090362549,
"learning_rate": 3.79634114637105e-05,
"loss": 0.0342,
"step": 360
},
{
"epoch": 0.6959036144578313,
"grad_norm": 1.7053394317626953,
"learning_rate": 3.794364908283817e-05,
"loss": 0.02,
"step": 361
},
{
"epoch": 0.6978313253012048,
"grad_norm": 0.4163115918636322,
"learning_rate": 3.792379647547637e-05,
"loss": 0.0138,
"step": 362
},
{
"epoch": 0.6997590361445784,
"grad_norm": 0.388751745223999,
"learning_rate": 3.790385374145046e-05,
"loss": 0.0172,
"step": 363
},
{
"epoch": 0.7016867469879519,
"grad_norm": 0.5584064722061157,
"learning_rate": 3.7883820981038966e-05,
"loss": 0.0254,
"step": 364
},
{
"epoch": 0.7036144578313253,
"grad_norm": 1.394264817237854,
"learning_rate": 3.7863698294973114e-05,
"loss": 0.037,
"step": 365
},
{
"epoch": 0.7055421686746988,
"grad_norm": 0.46280744671821594,
"learning_rate": 3.78434857844363e-05,
"loss": 0.0234,
"step": 366
},
{
"epoch": 0.7074698795180723,
"grad_norm": 0.39548924565315247,
"learning_rate": 3.782318355106358e-05,
"loss": 0.0164,
"step": 367
},
{
"epoch": 0.7093975903614458,
"grad_norm": 0.7307773232460022,
"learning_rate": 3.780279169694118e-05,
"loss": 0.0192,
"step": 368
},
{
"epoch": 0.7113253012048193,
"grad_norm": 0.28035807609558105,
"learning_rate": 3.778231032460594e-05,
"loss": 0.0131,
"step": 369
},
{
"epoch": 0.7132530120481928,
"grad_norm": 0.8376953601837158,
"learning_rate": 3.776173953704486e-05,
"loss": 0.0291,
"step": 370
},
{
"epoch": 0.7151807228915663,
"grad_norm": 0.7356843948364258,
"learning_rate": 3.774107943769454e-05,
"loss": 0.0214,
"step": 371
},
{
"epoch": 0.7171084337349397,
"grad_norm": 0.41503390669822693,
"learning_rate": 3.772033013044064e-05,
"loss": 0.0221,
"step": 372
},
{
"epoch": 0.7190361445783132,
"grad_norm": 0.35732385516166687,
"learning_rate": 3.7699491719617436e-05,
"loss": 0.015,
"step": 373
},
{
"epoch": 0.7209638554216867,
"grad_norm": 0.283778578042984,
"learning_rate": 3.76785643100072e-05,
"loss": 0.0146,
"step": 374
},
{
"epoch": 0.7228915662650602,
"grad_norm": 0.3219413459300995,
"learning_rate": 3.765754800683974e-05,
"loss": 0.015,
"step": 375
},
{
"epoch": 0.7248192771084337,
"grad_norm": 0.610431432723999,
"learning_rate": 3.7636442915791856e-05,
"loss": 0.0326,
"step": 376
},
{
"epoch": 0.7267469879518073,
"grad_norm": 4.944870948791504,
"learning_rate": 3.7615249142986784e-05,
"loss": 0.0432,
"step": 377
},
{
"epoch": 0.7286746987951808,
"grad_norm": 0.4894593060016632,
"learning_rate": 3.7593966794993696e-05,
"loss": 0.0174,
"step": 378
},
{
"epoch": 0.7306024096385542,
"grad_norm": 0.4211325943470001,
"learning_rate": 3.757259597882714e-05,
"loss": 0.023,
"step": 379
},
{
"epoch": 0.7325301204819277,
"grad_norm": 0.33621737360954285,
"learning_rate": 3.755113680194651e-05,
"loss": 0.0201,
"step": 380
},
{
"epoch": 0.7344578313253012,
"grad_norm": 0.5799694657325745,
"learning_rate": 3.7529589372255514e-05,
"loss": 0.0173,
"step": 381
},
{
"epoch": 0.7363855421686747,
"grad_norm": 0.5172572731971741,
"learning_rate": 3.750795379810162e-05,
"loss": 0.0284,
"step": 382
},
{
"epoch": 0.7383132530120482,
"grad_norm": 0.5715453028678894,
"learning_rate": 3.748623018827552e-05,
"loss": 0.0194,
"step": 383
},
{
"epoch": 0.7402409638554217,
"grad_norm": 0.5284178256988525,
"learning_rate": 3.746441865201056e-05,
"loss": 0.0247,
"step": 384
},
{
"epoch": 0.7421686746987952,
"grad_norm": 0.37828654050827026,
"learning_rate": 3.744251929898223e-05,
"loss": 0.0097,
"step": 385
},
{
"epoch": 0.7440963855421687,
"grad_norm": 0.3252779543399811,
"learning_rate": 3.742053223930758e-05,
"loss": 0.0238,
"step": 386
},
{
"epoch": 0.7460240963855421,
"grad_norm": 0.6031543612480164,
"learning_rate": 3.7398457583544674e-05,
"loss": 0.0332,
"step": 387
},
{
"epoch": 0.7479518072289156,
"grad_norm": 0.23846614360809326,
"learning_rate": 3.737629544269206e-05,
"loss": 0.0122,
"step": 388
},
{
"epoch": 0.7498795180722891,
"grad_norm": 0.5274029970169067,
"learning_rate": 3.7354045928188155e-05,
"loss": 0.0324,
"step": 389
},
{
"epoch": 0.7518072289156627,
"grad_norm": 0.4672217071056366,
"learning_rate": 3.733170915191075e-05,
"loss": 0.0196,
"step": 390
},
{
"epoch": 0.7537349397590362,
"grad_norm": 0.29819396138191223,
"learning_rate": 3.730928522617639e-05,
"loss": 0.0131,
"step": 391
},
{
"epoch": 0.7556626506024097,
"grad_norm": 0.43824997544288635,
"learning_rate": 3.7286774263739855e-05,
"loss": 0.0238,
"step": 392
},
{
"epoch": 0.7575903614457832,
"grad_norm": 0.2822072505950928,
"learning_rate": 3.726417637779357e-05,
"loss": 0.0314,
"step": 393
},
{
"epoch": 0.7595180722891566,
"grad_norm": 0.43815648555755615,
"learning_rate": 3.7241491681967044e-05,
"loss": 0.0144,
"step": 394
},
{
"epoch": 0.7614457831325301,
"grad_norm": 0.37194815278053284,
"learning_rate": 3.721872029032628e-05,
"loss": 0.0286,
"step": 395
},
{
"epoch": 0.7633734939759036,
"grad_norm": 0.7319737672805786,
"learning_rate": 3.719586231737322e-05,
"loss": 0.0427,
"step": 396
},
{
"epoch": 0.7653012048192771,
"grad_norm": 0.5870066285133362,
"learning_rate": 3.717291787804517e-05,
"loss": 0.0138,
"step": 397
},
{
"epoch": 0.7672289156626506,
"grad_norm": 0.6574277281761169,
"learning_rate": 3.7149887087714225e-05,
"loss": 0.061,
"step": 398
},
{
"epoch": 0.7691566265060241,
"grad_norm": 0.5467348694801331,
"learning_rate": 3.712677006218666e-05,
"loss": 0.022,
"step": 399
},
{
"epoch": 0.7710843373493976,
"grad_norm": 0.3589288890361786,
"learning_rate": 3.710356691770238e-05,
"loss": 0.0161,
"step": 400
},
{
"epoch": 0.7730120481927711,
"grad_norm": 0.574630618095398,
"learning_rate": 3.708027777093433e-05,
"loss": 0.0285,
"step": 401
},
{
"epoch": 0.7749397590361445,
"grad_norm": 0.39048445224761963,
"learning_rate": 3.70569027389879e-05,
"loss": 0.012,
"step": 402
},
{
"epoch": 0.776867469879518,
"grad_norm": 0.34803536534309387,
"learning_rate": 3.703344193940032e-05,
"loss": 0.0155,
"step": 403
},
{
"epoch": 0.7787951807228916,
"grad_norm": 1.188948392868042,
"learning_rate": 3.700989549014011e-05,
"loss": 0.0617,
"step": 404
},
{
"epoch": 0.7807228915662651,
"grad_norm": 0.473157674074173,
"learning_rate": 3.698626350960646e-05,
"loss": 0.0298,
"step": 405
},
{
"epoch": 0.7826506024096386,
"grad_norm": 0.42009076476097107,
"learning_rate": 3.6962546116628634e-05,
"loss": 0.03,
"step": 406
},
{
"epoch": 0.7845783132530121,
"grad_norm": 0.6334308981895447,
"learning_rate": 3.693874343046537e-05,
"loss": 0.0107,
"step": 407
},
{
"epoch": 0.7865060240963856,
"grad_norm": 0.35594677925109863,
"learning_rate": 3.6914855570804314e-05,
"loss": 0.0174,
"step": 408
},
{
"epoch": 0.788433734939759,
"grad_norm": 0.28985708951950073,
"learning_rate": 3.689088265776136e-05,
"loss": 0.0149,
"step": 409
},
{
"epoch": 0.7903614457831325,
"grad_norm": 0.3981950581073761,
"learning_rate": 3.686682481188011e-05,
"loss": 0.019,
"step": 410
},
{
"epoch": 0.792289156626506,
"grad_norm": 0.48819583654403687,
"learning_rate": 3.6842682154131193e-05,
"loss": 0.0217,
"step": 411
},
{
"epoch": 0.7942168674698795,
"grad_norm": 0.42819952964782715,
"learning_rate": 3.681845480591174e-05,
"loss": 0.0198,
"step": 412
},
{
"epoch": 0.796144578313253,
"grad_norm": 0.48591694235801697,
"learning_rate": 3.6794142889044727e-05,
"loss": 0.0253,
"step": 413
},
{
"epoch": 0.7980722891566265,
"grad_norm": 0.4730607271194458,
"learning_rate": 3.676974652577835e-05,
"loss": 0.0329,
"step": 414
},
{
"epoch": 0.8,
"grad_norm": 0.5390865802764893,
"learning_rate": 3.6745265838785434e-05,
"loss": 0.0479,
"step": 415
},
{
"epoch": 0.8019277108433734,
"grad_norm": 0.6377891302108765,
"learning_rate": 3.672070095116283e-05,
"loss": 0.019,
"step": 416
},
{
"epoch": 0.803855421686747,
"grad_norm": 0.8984615206718445,
"learning_rate": 3.669605198643075e-05,
"loss": 0.0444,
"step": 417
},
{
"epoch": 0.8057831325301205,
"grad_norm": 0.4913877546787262,
"learning_rate": 3.667131906853219e-05,
"loss": 0.031,
"step": 418
},
{
"epoch": 0.807710843373494,
"grad_norm": 0.37894028425216675,
"learning_rate": 3.664650232183229e-05,
"loss": 0.0195,
"step": 419
},
{
"epoch": 0.8096385542168675,
"grad_norm": 0.3644949495792389,
"learning_rate": 3.66216018711177e-05,
"loss": 0.018,
"step": 420
},
{
"epoch": 0.811566265060241,
"grad_norm": 0.414440393447876,
"learning_rate": 3.659661784159597e-05,
"loss": 0.0188,
"step": 421
},
{
"epoch": 0.8134939759036145,
"grad_norm": 0.49220341444015503,
"learning_rate": 3.65715503588949e-05,
"loss": 0.016,
"step": 422
},
{
"epoch": 0.815421686746988,
"grad_norm": 1.0939836502075195,
"learning_rate": 3.654639954906193e-05,
"loss": 0.0758,
"step": 423
},
{
"epoch": 0.8173493975903614,
"grad_norm": 0.43222442269325256,
"learning_rate": 3.652116553856349e-05,
"loss": 0.0308,
"step": 424
},
{
"epoch": 0.8192771084337349,
"grad_norm": 0.5081896185874939,
"learning_rate": 3.649584845428438e-05,
"loss": 0.0493,
"step": 425
},
{
"epoch": 0.8212048192771084,
"grad_norm": 0.9811948537826538,
"learning_rate": 3.64704484235271e-05,
"loss": 0.019,
"step": 426
},
{
"epoch": 0.8231325301204819,
"grad_norm": 0.31656572222709656,
"learning_rate": 3.6444965574011255e-05,
"loss": 0.0135,
"step": 427
},
{
"epoch": 0.8250602409638554,
"grad_norm": 0.7844433188438416,
"learning_rate": 3.641940003387289e-05,
"loss": 0.0402,
"step": 428
},
{
"epoch": 0.826987951807229,
"grad_norm": 0.3353273570537567,
"learning_rate": 3.6393751931663814e-05,
"loss": 0.0132,
"step": 429
},
{
"epoch": 0.8289156626506025,
"grad_norm": 0.7253058552742004,
"learning_rate": 3.6368021396351015e-05,
"loss": 0.0296,
"step": 430
},
{
"epoch": 0.8308433734939759,
"grad_norm": 0.45300304889678955,
"learning_rate": 3.634220855731598e-05,
"loss": 0.0258,
"step": 431
},
{
"epoch": 0.8327710843373494,
"grad_norm": 0.3480473458766937,
"learning_rate": 3.631631354435403e-05,
"loss": 0.0099,
"step": 432
},
{
"epoch": 0.8346987951807229,
"grad_norm": 2.1114516258239746,
"learning_rate": 3.62903364876737e-05,
"loss": 0.0457,
"step": 433
},
{
"epoch": 0.8366265060240964,
"grad_norm": 0.5649561882019043,
"learning_rate": 3.626427751789606e-05,
"loss": 0.0444,
"step": 434
},
{
"epoch": 0.8385542168674699,
"grad_norm": 0.3864995539188385,
"learning_rate": 3.623813676605405e-05,
"loss": 0.0223,
"step": 435
},
{
"epoch": 0.8404819277108434,
"grad_norm": 1.2134298086166382,
"learning_rate": 3.621191436359186e-05,
"loss": 0.0353,
"step": 436
},
{
"epoch": 0.8424096385542169,
"grad_norm": 0.4403415024280548,
"learning_rate": 3.6185610442364246e-05,
"loss": 0.0216,
"step": 437
},
{
"epoch": 0.8443373493975903,
"grad_norm": 0.6050297021865845,
"learning_rate": 3.6159225134635846e-05,
"loss": 0.0433,
"step": 438
},
{
"epoch": 0.8462650602409638,
"grad_norm": 0.7951678037643433,
"learning_rate": 3.6132758573080556e-05,
"loss": 0.031,
"step": 439
},
{
"epoch": 0.8481927710843373,
"grad_norm": 0.4991949796676636,
"learning_rate": 3.6106210890780834e-05,
"loss": 0.0313,
"step": 440
},
{
"epoch": 0.8501204819277108,
"grad_norm": 0.47951385378837585,
"learning_rate": 3.607958222122704e-05,
"loss": 0.0218,
"step": 441
},
{
"epoch": 0.8520481927710843,
"grad_norm": 0.7345194220542908,
"learning_rate": 3.6052872698316755e-05,
"loss": 0.0239,
"step": 442
},
{
"epoch": 0.8539759036144579,
"grad_norm": 1.4814884662628174,
"learning_rate": 3.602608245635414e-05,
"loss": 0.0127,
"step": 443
},
{
"epoch": 0.8559036144578314,
"grad_norm": 2.4240877628326416,
"learning_rate": 3.599921163004922e-05,
"loss": 0.0618,
"step": 444
},
{
"epoch": 0.8578313253012049,
"grad_norm": 0.41523510217666626,
"learning_rate": 3.5972260354517216e-05,
"loss": 0.0283,
"step": 445
},
{
"epoch": 0.8597590361445783,
"grad_norm": 0.5577677488327026,
"learning_rate": 3.594522876527791e-05,
"loss": 0.0271,
"step": 446
},
{
"epoch": 0.8616867469879518,
"grad_norm": 0.5829064846038818,
"learning_rate": 3.591811699825487e-05,
"loss": 0.0169,
"step": 447
},
{
"epoch": 0.8636144578313253,
"grad_norm": 0.4478822350502014,
"learning_rate": 3.5890925189774886e-05,
"loss": 0.0239,
"step": 448
},
{
"epoch": 0.8655421686746988,
"grad_norm": 0.3498048782348633,
"learning_rate": 3.586365347656718e-05,
"loss": 0.0137,
"step": 449
},
{
"epoch": 0.8674698795180723,
"grad_norm": 0.6571130156517029,
"learning_rate": 3.583630199576278e-05,
"loss": 0.027,
"step": 450
},
{
"epoch": 0.8693975903614458,
"grad_norm": 0.344970166683197,
"learning_rate": 3.58088708848938e-05,
"loss": 0.0167,
"step": 451
},
{
"epoch": 0.8713253012048193,
"grad_norm": 0.34611570835113525,
"learning_rate": 3.5781360281892775e-05,
"loss": 0.0468,
"step": 452
},
{
"epoch": 0.8732530120481927,
"grad_norm": 0.66157066822052,
"learning_rate": 3.575377032509194e-05,
"loss": 0.0344,
"step": 453
},
{
"epoch": 0.8751807228915662,
"grad_norm": 0.3676326870918274,
"learning_rate": 3.5726101153222534e-05,
"loss": 0.0366,
"step": 454
},
{
"epoch": 0.8771084337349397,
"grad_norm": 0.5958423018455505,
"learning_rate": 3.569835290541414e-05,
"loss": 0.0382,
"step": 455
},
{
"epoch": 0.8790361445783132,
"grad_norm": 0.36787471175193787,
"learning_rate": 3.567052572119397e-05,
"loss": 0.018,
"step": 456
},
{
"epoch": 0.8809638554216868,
"grad_norm": 0.9478234052658081,
"learning_rate": 3.564261974048611e-05,
"loss": 0.0179,
"step": 457
},
{
"epoch": 0.8828915662650603,
"grad_norm": 0.3337579369544983,
"learning_rate": 3.56146351036109e-05,
"loss": 0.0147,
"step": 458
},
{
"epoch": 0.8848192771084338,
"grad_norm": 0.4984932243824005,
"learning_rate": 3.558657195128416e-05,
"loss": 0.0224,
"step": 459
},
{
"epoch": 0.8867469879518072,
"grad_norm": 0.36718735098838806,
"learning_rate": 3.555843042461653e-05,
"loss": 0.0202,
"step": 460
},
{
"epoch": 0.8886746987951807,
"grad_norm": 0.4081745445728302,
"learning_rate": 3.553021066511274e-05,
"loss": 0.0288,
"step": 461
},
{
"epoch": 0.8906024096385542,
"grad_norm": 0.3233242332935333,
"learning_rate": 3.55019128146709e-05,
"loss": 0.0362,
"step": 462
},
{
"epoch": 0.8925301204819277,
"grad_norm": 0.6560158729553223,
"learning_rate": 3.547353701558178e-05,
"loss": 0.038,
"step": 463
},
{
"epoch": 0.8944578313253012,
"grad_norm": 0.47668641805648804,
"learning_rate": 3.544508341052811e-05,
"loss": 0.0399,
"step": 464
},
{
"epoch": 0.8963855421686747,
"grad_norm": 0.45512664318084717,
"learning_rate": 3.541655214258383e-05,
"loss": 0.022,
"step": 465
},
{
"epoch": 0.8983132530120482,
"grad_norm": 0.8410730361938477,
"learning_rate": 3.538794335521343e-05,
"loss": 0.0315,
"step": 466
},
{
"epoch": 0.9002409638554217,
"grad_norm": 0.4872909486293793,
"learning_rate": 3.535925719227117e-05,
"loss": 0.0152,
"step": 467
},
{
"epoch": 0.9021686746987951,
"grad_norm": 0.45623311400413513,
"learning_rate": 3.533049379800038e-05,
"loss": 0.0305,
"step": 468
},
{
"epoch": 0.9040963855421686,
"grad_norm": 0.43087029457092285,
"learning_rate": 3.530165331703275e-05,
"loss": 0.0131,
"step": 469
},
{
"epoch": 0.9060240963855422,
"grad_norm": 0.4610525369644165,
"learning_rate": 3.527273589438756e-05,
"loss": 0.0187,
"step": 470
},
{
"epoch": 0.9079518072289157,
"grad_norm": 0.3356114327907562,
"learning_rate": 3.5243741675471006e-05,
"loss": 0.0185,
"step": 471
},
{
"epoch": 0.9098795180722892,
"grad_norm": 0.9065960049629211,
"learning_rate": 3.5214670806075426e-05,
"loss": 0.0433,
"step": 472
},
{
"epoch": 0.9118072289156627,
"grad_norm": 0.3652578294277191,
"learning_rate": 3.518552343237858e-05,
"loss": 0.02,
"step": 473
},
{
"epoch": 0.9137349397590362,
"grad_norm": 0.32377883791923523,
"learning_rate": 3.5156299700942916e-05,
"loss": 0.0165,
"step": 474
},
{
"epoch": 0.9156626506024096,
"grad_norm": 0.2431817352771759,
"learning_rate": 3.512699975871485e-05,
"loss": 0.0172,
"step": 475
},
{
"epoch": 0.9175903614457831,
"grad_norm": 0.6390707492828369,
"learning_rate": 3.509762375302399e-05,
"loss": 0.0356,
"step": 476
},
{
"epoch": 0.9195180722891566,
"grad_norm": 0.2283092886209488,
"learning_rate": 3.506817183158243e-05,
"loss": 0.0088,
"step": 477
},
{
"epoch": 0.9214457831325301,
"grad_norm": 0.5053914189338684,
"learning_rate": 3.5038644142483966e-05,
"loss": 0.0389,
"step": 478
},
{
"epoch": 0.9233734939759036,
"grad_norm": 0.2567576467990875,
"learning_rate": 3.500904083420342e-05,
"loss": 0.0155,
"step": 479
},
{
"epoch": 0.9253012048192771,
"grad_norm": 0.6852384209632874,
"learning_rate": 3.497936205559583e-05,
"loss": 0.0247,
"step": 480
},
{
"epoch": 0.9272289156626506,
"grad_norm": 0.36403414607048035,
"learning_rate": 3.494960795589572e-05,
"loss": 0.023,
"step": 481
},
{
"epoch": 0.929156626506024,
"grad_norm": 0.506554901599884,
"learning_rate": 3.491977868471635e-05,
"loss": 0.0273,
"step": 482
},
{
"epoch": 0.9310843373493976,
"grad_norm": 0.38329923152923584,
"learning_rate": 3.4889874392048985e-05,
"loss": 0.0169,
"step": 483
},
{
"epoch": 0.9330120481927711,
"grad_norm": 0.2805836498737335,
"learning_rate": 3.48598952282621e-05,
"loss": 0.0105,
"step": 484
},
{
"epoch": 0.9349397590361446,
"grad_norm": 0.6315302848815918,
"learning_rate": 3.482984134410067e-05,
"loss": 0.0289,
"step": 485
},
{
"epoch": 0.9368674698795181,
"grad_norm": 0.6431388854980469,
"learning_rate": 3.479971289068537e-05,
"loss": 0.0311,
"step": 486
},
{
"epoch": 0.9387951807228916,
"grad_norm": 0.9794723391532898,
"learning_rate": 3.476951001951184e-05,
"loss": 0.0452,
"step": 487
},
{
"epoch": 0.9407228915662651,
"grad_norm": 0.7984824180603027,
"learning_rate": 3.473923288244991e-05,
"loss": 0.0689,
"step": 488
},
{
"epoch": 0.9426506024096386,
"grad_norm": 0.46362006664276123,
"learning_rate": 3.470888163174286e-05,
"loss": 0.0241,
"step": 489
},
{
"epoch": 0.944578313253012,
"grad_norm": 0.5051195025444031,
"learning_rate": 3.467845642000661e-05,
"loss": 0.0228,
"step": 490
},
{
"epoch": 0.9465060240963855,
"grad_norm": 0.3082812428474426,
"learning_rate": 3.4647957400229004e-05,
"loss": 0.0144,
"step": 491
},
{
"epoch": 0.948433734939759,
"grad_norm": 0.2691391110420227,
"learning_rate": 3.461738472576902e-05,
"loss": 0.0167,
"step": 492
},
{
"epoch": 0.9503614457831325,
"grad_norm": 0.5627671480178833,
"learning_rate": 3.458673855035597e-05,
"loss": 0.031,
"step": 493
},
{
"epoch": 0.952289156626506,
"grad_norm": 0.4571435749530792,
"learning_rate": 3.455601902808876e-05,
"loss": 0.0191,
"step": 494
},
{
"epoch": 0.9542168674698795,
"grad_norm": 1.0117709636688232,
"learning_rate": 3.452522631343515e-05,
"loss": 0.0192,
"step": 495
},
{
"epoch": 0.9561445783132531,
"grad_norm": 0.28375712037086487,
"learning_rate": 3.449436056123086e-05,
"loss": 0.0159,
"step": 496
},
{
"epoch": 0.9580722891566265,
"grad_norm": 0.26381856203079224,
"learning_rate": 3.446342192667893e-05,
"loss": 0.0113,
"step": 497
},
{
"epoch": 0.96,
"grad_norm": 0.49317577481269836,
"learning_rate": 3.443241056534884e-05,
"loss": 0.0332,
"step": 498
},
{
"epoch": 0.9619277108433735,
"grad_norm": 0.28884485363960266,
"learning_rate": 3.440132663317579e-05,
"loss": 0.0117,
"step": 499
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.36255285143852234,
"learning_rate": 3.4370170286459864e-05,
"loss": 0.0169,
"step": 500
},
{
"epoch": 0.9657831325301205,
"grad_norm": 0.4265049993991852,
"learning_rate": 3.433894168186529e-05,
"loss": 0.0217,
"step": 501
},
{
"epoch": 0.967710843373494,
"grad_norm": 0.8169426321983337,
"learning_rate": 3.430764097641962e-05,
"loss": 0.0207,
"step": 502
},
{
"epoch": 0.9696385542168675,
"grad_norm": 1.866077184677124,
"learning_rate": 3.427626832751296e-05,
"loss": 0.0381,
"step": 503
},
{
"epoch": 0.971566265060241,
"grad_norm": 0.33124980330467224,
"learning_rate": 3.424482389289716e-05,
"loss": 0.0245,
"step": 504
},
{
"epoch": 0.9734939759036144,
"grad_norm": 0.37479540705680847,
"learning_rate": 3.4213307830685055e-05,
"loss": 0.0164,
"step": 505
},
{
"epoch": 0.9754216867469879,
"grad_norm": 0.39738863706588745,
"learning_rate": 3.4181720299349615e-05,
"loss": 0.0297,
"step": 506
},
{
"epoch": 0.9773493975903614,
"grad_norm": 0.2567287087440491,
"learning_rate": 3.4150061457723205e-05,
"loss": 0.0102,
"step": 507
},
{
"epoch": 0.9792771084337349,
"grad_norm": 0.6230517029762268,
"learning_rate": 3.411833146499675e-05,
"loss": 0.0243,
"step": 508
},
{
"epoch": 0.9812048192771085,
"grad_norm": 0.44843971729278564,
"learning_rate": 3.408653048071894e-05,
"loss": 0.0357,
"step": 509
},
{
"epoch": 0.983132530120482,
"grad_norm": 1.0569655895233154,
"learning_rate": 3.405465866479546e-05,
"loss": 0.037,
"step": 510
},
{
"epoch": 0.9850602409638555,
"grad_norm": 0.29000964760780334,
"learning_rate": 3.402271617748812e-05,
"loss": 0.0129,
"step": 511
},
{
"epoch": 0.9869879518072289,
"grad_norm": 2.1627447605133057,
"learning_rate": 3.399070317941413e-05,
"loss": 0.0442,
"step": 512
},
{
"epoch": 0.9889156626506024,
"grad_norm": 0.27371272444725037,
"learning_rate": 3.395861983154522e-05,
"loss": 0.0119,
"step": 513
},
{
"epoch": 0.9908433734939759,
"grad_norm": 0.4117226302623749,
"learning_rate": 3.392646629520688e-05,
"loss": 0.0455,
"step": 514
},
{
"epoch": 0.9927710843373494,
"grad_norm": 0.5098996758460999,
"learning_rate": 3.389424273207752e-05,
"loss": 0.0203,
"step": 515
},
{
"epoch": 0.9946987951807229,
"grad_norm": 0.5192157626152039,
"learning_rate": 3.386194930418767e-05,
"loss": 0.0329,
"step": 516
},
{
"epoch": 0.9966265060240964,
"grad_norm": 0.18757697939872742,
"learning_rate": 3.382958617391915e-05,
"loss": 0.0065,
"step": 517
},
{
"epoch": 0.9985542168674699,
"grad_norm": 0.3334413170814514,
"learning_rate": 3.3797153504004296e-05,
"loss": 0.0266,
"step": 518
},
{
"epoch": 1.0,
"grad_norm": 0.4152225852012634,
"learning_rate": 3.3764651457525095e-05,
"loss": 0.0169,
"step": 519
},
{
"epoch": 1.0019277108433735,
"grad_norm": 0.43535247445106506,
"learning_rate": 3.373208019791237e-05,
"loss": 0.0221,
"step": 520
},
{
"epoch": 1.003855421686747,
"grad_norm": 0.39292722940444946,
"learning_rate": 3.3699439888945e-05,
"loss": 0.0211,
"step": 521
},
{
"epoch": 1.0057831325301205,
"grad_norm": 0.19566713273525238,
"learning_rate": 3.366673069474904e-05,
"loss": 0.0069,
"step": 522
},
{
"epoch": 1.007710843373494,
"grad_norm": 0.5101853609085083,
"learning_rate": 3.3633952779796914e-05,
"loss": 0.0191,
"step": 523
},
{
"epoch": 1.0096385542168675,
"grad_norm": 0.999434769153595,
"learning_rate": 3.360110630890664e-05,
"loss": 0.0196,
"step": 524
},
{
"epoch": 1.011566265060241,
"grad_norm": 0.4646223783493042,
"learning_rate": 3.356819144724092e-05,
"loss": 0.0328,
"step": 525
},
{
"epoch": 1.0134939759036146,
"grad_norm": 0.3132480978965759,
"learning_rate": 3.3535208360306354e-05,
"loss": 0.0203,
"step": 526
},
{
"epoch": 1.0154216867469879,
"grad_norm": 0.3038032352924347,
"learning_rate": 3.350215721395261e-05,
"loss": 0.0122,
"step": 527
},
{
"epoch": 1.0173493975903614,
"grad_norm": 0.45082882046699524,
"learning_rate": 3.346903817437157e-05,
"loss": 0.0437,
"step": 528
},
{
"epoch": 1.0192771084337349,
"grad_norm": 0.26917046308517456,
"learning_rate": 3.343585140809651e-05,
"loss": 0.013,
"step": 529
},
{
"epoch": 1.0212048192771084,
"grad_norm": 0.23869264125823975,
"learning_rate": 3.3402597082001276e-05,
"loss": 0.008,
"step": 530
},
{
"epoch": 1.0231325301204819,
"grad_norm": 0.31315353512763977,
"learning_rate": 3.3369275363299394e-05,
"loss": 0.0078,
"step": 531
},
{
"epoch": 1.0250602409638554,
"grad_norm": 0.4780346751213074,
"learning_rate": 3.333588641954327e-05,
"loss": 0.0225,
"step": 532
},
{
"epoch": 1.026987951807229,
"grad_norm": 0.2920368015766144,
"learning_rate": 3.330243041862336e-05,
"loss": 0.0118,
"step": 533
},
{
"epoch": 1.0289156626506024,
"grad_norm": 0.543669581413269,
"learning_rate": 3.326890752876728e-05,
"loss": 0.0338,
"step": 534
},
{
"epoch": 1.030843373493976,
"grad_norm": 0.4288000464439392,
"learning_rate": 3.323531791853901e-05,
"loss": 0.0341,
"step": 535
},
{
"epoch": 1.0327710843373494,
"grad_norm": 0.26600322127342224,
"learning_rate": 3.3201661756838e-05,
"loss": 0.0184,
"step": 536
},
{
"epoch": 1.034698795180723,
"grad_norm": 0.290937602519989,
"learning_rate": 3.316793921289835e-05,
"loss": 0.0152,
"step": 537
},
{
"epoch": 1.0366265060240965,
"grad_norm": 0.7621443271636963,
"learning_rate": 3.313415045628795e-05,
"loss": 0.0326,
"step": 538
},
{
"epoch": 1.03855421686747,
"grad_norm": 0.5581283569335938,
"learning_rate": 3.3100295656907646e-05,
"loss": 0.0164,
"step": 539
},
{
"epoch": 1.0404819277108435,
"grad_norm": 0.20930901169776917,
"learning_rate": 3.306637498499034e-05,
"loss": 0.0091,
"step": 540
},
{
"epoch": 1.0424096385542168,
"grad_norm": 0.46212059259414673,
"learning_rate": 3.303238861110018e-05,
"loss": 0.0118,
"step": 541
},
{
"epoch": 1.0443373493975903,
"grad_norm": 0.38259151577949524,
"learning_rate": 3.299833670613168e-05,
"loss": 0.0081,
"step": 542
},
{
"epoch": 1.0462650602409638,
"grad_norm": 0.4888618290424347,
"learning_rate": 3.2964219441308865e-05,
"loss": 0.0138,
"step": 543
},
{
"epoch": 1.0481927710843373,
"grad_norm": 0.32103127241134644,
"learning_rate": 3.2930036988184425e-05,
"loss": 0.0171,
"step": 544
},
{
"epoch": 1.0501204819277108,
"grad_norm": 0.27787327766418457,
"learning_rate": 3.28957895186388e-05,
"loss": 0.0106,
"step": 545
},
{
"epoch": 1.0520481927710843,
"grad_norm": 0.35597777366638184,
"learning_rate": 3.2861477204879395e-05,
"loss": 0.0123,
"step": 546
},
{
"epoch": 1.0539759036144578,
"grad_norm": 0.3619804084300995,
"learning_rate": 3.2827100219439656e-05,
"loss": 0.0088,
"step": 547
},
{
"epoch": 1.0559036144578313,
"grad_norm": 0.2525513470172882,
"learning_rate": 3.279265873517822e-05,
"loss": 0.0179,
"step": 548
},
{
"epoch": 1.0578313253012048,
"grad_norm": 0.3910020887851715,
"learning_rate": 3.275815292527804e-05,
"loss": 0.0142,
"step": 549
},
{
"epoch": 1.0597590361445783,
"grad_norm": 0.30515050888061523,
"learning_rate": 3.2723582963245526e-05,
"loss": 0.0123,
"step": 550
},
{
"epoch": 1.0616867469879518,
"grad_norm": 0.21708644926548004,
"learning_rate": 3.2688949022909665e-05,
"loss": 0.0098,
"step": 551
},
{
"epoch": 1.0636144578313254,
"grad_norm": 0.23307719826698303,
"learning_rate": 3.265425127842114e-05,
"loss": 0.0097,
"step": 552
},
{
"epoch": 1.0655421686746989,
"grad_norm": 0.676654577255249,
"learning_rate": 3.261948990425147e-05,
"loss": 0.0227,
"step": 553
},
{
"epoch": 1.0674698795180724,
"grad_norm": 0.4593975841999054,
"learning_rate": 3.258466507519213e-05,
"loss": 0.047,
"step": 554
},
{
"epoch": 1.0693975903614459,
"grad_norm": 0.19405829906463623,
"learning_rate": 3.254977696635366e-05,
"loss": 0.0314,
"step": 555
},
{
"epoch": 1.0713253012048192,
"grad_norm": 0.14563389122486115,
"learning_rate": 3.2514825753164774e-05,
"loss": 0.0046,
"step": 556
},
{
"epoch": 1.0732530120481927,
"grad_norm": 0.2642340064048767,
"learning_rate": 3.247981161137153e-05,
"loss": 0.022,
"step": 557
},
{
"epoch": 1.0751807228915662,
"grad_norm": 0.17274761199951172,
"learning_rate": 3.2444734717036386e-05,
"loss": 0.0134,
"step": 558
},
{
"epoch": 1.0771084337349397,
"grad_norm": 0.44354626536369324,
"learning_rate": 3.240959524653735e-05,
"loss": 0.0211,
"step": 559
},
{
"epoch": 1.0790361445783132,
"grad_norm": 0.2806888818740845,
"learning_rate": 3.237439337656708e-05,
"loss": 0.0141,
"step": 560
},
{
"epoch": 1.0809638554216867,
"grad_norm": 0.21679501235485077,
"learning_rate": 3.2339129284131994e-05,
"loss": 0.019,
"step": 561
},
{
"epoch": 1.0828915662650602,
"grad_norm": 0.3040260076522827,
"learning_rate": 3.2303803146551386e-05,
"loss": 0.0249,
"step": 562
},
{
"epoch": 1.0848192771084337,
"grad_norm": 0.2793775200843811,
"learning_rate": 3.226841514145656e-05,
"loss": 0.0088,
"step": 563
},
{
"epoch": 1.0867469879518072,
"grad_norm": 0.149955615401268,
"learning_rate": 3.223296544678987e-05,
"loss": 0.0054,
"step": 564
},
{
"epoch": 1.0886746987951808,
"grad_norm": 0.22166767716407776,
"learning_rate": 3.219745424080389e-05,
"loss": 0.0109,
"step": 565
},
{
"epoch": 1.0906024096385543,
"grad_norm": 0.22399431467056274,
"learning_rate": 3.2161881702060476e-05,
"loss": 0.0106,
"step": 566
},
{
"epoch": 1.0925301204819278,
"grad_norm": 0.18537986278533936,
"learning_rate": 3.2126248009429905e-05,
"loss": 0.0077,
"step": 567
},
{
"epoch": 1.0944578313253013,
"grad_norm": 0.24511495232582092,
"learning_rate": 3.2090553342089935e-05,
"loss": 0.0093,
"step": 568
},
{
"epoch": 1.0963855421686748,
"grad_norm": 0.4766045808792114,
"learning_rate": 3.205479787952494e-05,
"loss": 0.036,
"step": 569
},
{
"epoch": 1.0983132530120483,
"grad_norm": 0.1425715535879135,
"learning_rate": 3.201898180152499e-05,
"loss": 0.0085,
"step": 570
},
{
"epoch": 1.1002409638554216,
"grad_norm": 0.1909666359424591,
"learning_rate": 3.1983105288184945e-05,
"loss": 0.0081,
"step": 571
},
{
"epoch": 1.102168674698795,
"grad_norm": 0.44077104330062866,
"learning_rate": 3.194716851990355e-05,
"loss": 0.017,
"step": 572
},
{
"epoch": 1.1040963855421686,
"grad_norm": 0.5757400989532471,
"learning_rate": 3.191117167738253e-05,
"loss": 0.021,
"step": 573
},
{
"epoch": 1.106024096385542,
"grad_norm": 0.1977701038122177,
"learning_rate": 3.1875114941625705e-05,
"loss": 0.0096,
"step": 574
},
{
"epoch": 1.1079518072289156,
"grad_norm": 0.3524581491947174,
"learning_rate": 3.1838998493938026e-05,
"loss": 0.0118,
"step": 575
},
{
"epoch": 1.1098795180722891,
"grad_norm": 0.3301331698894501,
"learning_rate": 3.180282251592472e-05,
"loss": 0.0094,
"step": 576
},
{
"epoch": 1.1118072289156626,
"grad_norm": 0.2774488925933838,
"learning_rate": 3.1766587189490336e-05,
"loss": 0.0131,
"step": 577
},
{
"epoch": 1.1137349397590361,
"grad_norm": 1.732595443725586,
"learning_rate": 3.173029269683785e-05,
"loss": 0.0445,
"step": 578
},
{
"epoch": 1.1156626506024097,
"grad_norm": 0.28746843338012695,
"learning_rate": 3.169393922046776e-05,
"loss": 0.0116,
"step": 579
},
{
"epoch": 1.1175903614457832,
"grad_norm": 0.2952995002269745,
"learning_rate": 3.165752694317713e-05,
"loss": 0.0116,
"step": 580
},
{
"epoch": 1.1195180722891567,
"grad_norm": 0.2938575744628906,
"learning_rate": 3.16210560480587e-05,
"loss": 0.013,
"step": 581
},
{
"epoch": 1.1214457831325302,
"grad_norm": 0.22283495962619781,
"learning_rate": 3.158452671849998e-05,
"loss": 0.0052,
"step": 582
},
{
"epoch": 1.1233734939759037,
"grad_norm": 0.6272858381271362,
"learning_rate": 3.154793913818226e-05,
"loss": 0.0182,
"step": 583
},
{
"epoch": 1.1253012048192772,
"grad_norm": 0.479753702878952,
"learning_rate": 3.1511293491079804e-05,
"loss": 0.0146,
"step": 584
},
{
"epoch": 1.1272289156626507,
"grad_norm": 0.31104400753974915,
"learning_rate": 3.1474589961458786e-05,
"loss": 0.0139,
"step": 585
},
{
"epoch": 1.129156626506024,
"grad_norm": 0.4932832419872284,
"learning_rate": 3.1437828733876477e-05,
"loss": 0.0236,
"step": 586
},
{
"epoch": 1.1310843373493975,
"grad_norm": 0.222808837890625,
"learning_rate": 3.140100999318025e-05,
"loss": 0.0084,
"step": 587
},
{
"epoch": 1.133012048192771,
"grad_norm": 0.4515356719493866,
"learning_rate": 3.136413392450668e-05,
"loss": 0.0215,
"step": 588
},
{
"epoch": 1.1349397590361445,
"grad_norm": 0.39302268624305725,
"learning_rate": 3.132720071328061e-05,
"loss": 0.0154,
"step": 589
},
{
"epoch": 1.136867469879518,
"grad_norm": 0.43382835388183594,
"learning_rate": 3.1290210545214205e-05,
"loss": 0.0088,
"step": 590
},
{
"epoch": 1.1387951807228915,
"grad_norm": 0.18707136809825897,
"learning_rate": 3.125316360630602e-05,
"loss": 0.0126,
"step": 591
},
{
"epoch": 1.140722891566265,
"grad_norm": 0.5688219666481018,
"learning_rate": 3.121606008284011e-05,
"loss": 0.0147,
"step": 592
},
{
"epoch": 1.1426506024096386,
"grad_norm": 0.3321833312511444,
"learning_rate": 3.1178900161385005e-05,
"loss": 0.0119,
"step": 593
},
{
"epoch": 1.144578313253012,
"grad_norm": 0.3738424777984619,
"learning_rate": 3.114168402879286e-05,
"loss": 0.0158,
"step": 594
},
{
"epoch": 1.1465060240963856,
"grad_norm": 0.2386978417634964,
"learning_rate": 3.110441187219846e-05,
"loss": 0.0107,
"step": 595
},
{
"epoch": 1.148433734939759,
"grad_norm": 0.2165699452161789,
"learning_rate": 3.10670838790183e-05,
"loss": 0.0079,
"step": 596
},
{
"epoch": 1.1503614457831326,
"grad_norm": 0.25952696800231934,
"learning_rate": 3.102970023694965e-05,
"loss": 0.0147,
"step": 597
},
{
"epoch": 1.152289156626506,
"grad_norm": 0.21448305249214172,
"learning_rate": 3.099226113396959e-05,
"loss": 0.0099,
"step": 598
},
{
"epoch": 1.1542168674698796,
"grad_norm": 0.37226060032844543,
"learning_rate": 3.095476675833405e-05,
"loss": 0.0214,
"step": 599
},
{
"epoch": 1.1561445783132531,
"grad_norm": 0.29637983441352844,
"learning_rate": 3.0917217298576955e-05,
"loss": 0.0118,
"step": 600
},
{
"epoch": 1.1580722891566264,
"grad_norm": 0.18535609543323517,
"learning_rate": 3.0879612943509154e-05,
"loss": 0.0086,
"step": 601
},
{
"epoch": 1.16,
"grad_norm": 0.25874125957489014,
"learning_rate": 3.0841953882217536e-05,
"loss": 0.0088,
"step": 602
},
{
"epoch": 1.1619277108433734,
"grad_norm": 0.46092745661735535,
"learning_rate": 3.08042403040641e-05,
"loss": 0.0241,
"step": 603
},
{
"epoch": 1.163855421686747,
"grad_norm": 0.27023249864578247,
"learning_rate": 3.076647239868494e-05,
"loss": 0.0154,
"step": 604
},
{
"epoch": 1.1657831325301204,
"grad_norm": 0.445157527923584,
"learning_rate": 3.072865035598933e-05,
"loss": 0.0197,
"step": 605
},
{
"epoch": 1.167710843373494,
"grad_norm": 0.18097272515296936,
"learning_rate": 3.06907743661588e-05,
"loss": 0.0093,
"step": 606
},
{
"epoch": 1.1696385542168675,
"grad_norm": 0.22469942271709442,
"learning_rate": 3.065284461964609e-05,
"loss": 0.0171,
"step": 607
},
{
"epoch": 1.171566265060241,
"grad_norm": 0.20190906524658203,
"learning_rate": 3.061486130717428e-05,
"loss": 0.008,
"step": 608
},
{
"epoch": 1.1734939759036145,
"grad_norm": 0.18294145166873932,
"learning_rate": 3.057682461973579e-05,
"loss": 0.0155,
"step": 609
},
{
"epoch": 1.175421686746988,
"grad_norm": 0.34203943610191345,
"learning_rate": 3.053873474859143e-05,
"loss": 0.0212,
"step": 610
},
{
"epoch": 1.1773493975903615,
"grad_norm": 0.49073582887649536,
"learning_rate": 3.050059188526942e-05,
"loss": 0.019,
"step": 611
},
{
"epoch": 1.179277108433735,
"grad_norm": 0.3537680506706238,
"learning_rate": 3.046239622156446e-05,
"loss": 0.0147,
"step": 612
},
{
"epoch": 1.1812048192771085,
"grad_norm": 0.2584632635116577,
"learning_rate": 3.042414794953674e-05,
"loss": 0.0088,
"step": 613
},
{
"epoch": 1.1831325301204818,
"grad_norm": 0.3529360890388489,
"learning_rate": 3.0385847261510975e-05,
"loss": 0.0187,
"step": 614
},
{
"epoch": 1.1850602409638555,
"grad_norm": 0.3331570327281952,
"learning_rate": 3.0347494350075465e-05,
"loss": 0.0124,
"step": 615
},
{
"epoch": 1.1869879518072288,
"grad_norm": 0.2223527580499649,
"learning_rate": 3.0309089408081074e-05,
"loss": 0.01,
"step": 616
},
{
"epoch": 1.1889156626506023,
"grad_norm": 0.21985746920108795,
"learning_rate": 3.027063262864032e-05,
"loss": 0.0087,
"step": 617
},
{
"epoch": 1.1908433734939758,
"grad_norm": 0.2989653944969177,
"learning_rate": 3.023212420512637e-05,
"loss": 0.0137,
"step": 618
},
{
"epoch": 1.1927710843373494,
"grad_norm": 0.17423275113105774,
"learning_rate": 3.0193564331172074e-05,
"loss": 0.0056,
"step": 619
},
{
"epoch": 1.1946987951807229,
"grad_norm": 1.0992127656936646,
"learning_rate": 3.0154953200668976e-05,
"loss": 0.0274,
"step": 620
},
{
"epoch": 1.1966265060240964,
"grad_norm": 0.21641989052295685,
"learning_rate": 3.011629100776638e-05,
"loss": 0.0151,
"step": 621
},
{
"epoch": 1.1985542168674699,
"grad_norm": 0.4558199644088745,
"learning_rate": 3.007757794687033e-05,
"loss": 0.0424,
"step": 622
},
{
"epoch": 1.2004819277108434,
"grad_norm": 0.42380189895629883,
"learning_rate": 3.003881421264266e-05,
"loss": 0.0079,
"step": 623
},
{
"epoch": 1.202409638554217,
"grad_norm": 0.28791171312332153,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.0142,
"step": 624
},
{
"epoch": 1.2043373493975904,
"grad_norm": 0.3906581997871399,
"learning_rate": 2.996113550411281e-05,
"loss": 0.0251,
"step": 625
},
{
"epoch": 1.206265060240964,
"grad_norm": 0.47848746180534363,
"learning_rate": 2.9922220920404375e-05,
"loss": 0.0137,
"step": 626
},
{
"epoch": 1.2081927710843374,
"grad_norm": 0.22666941583156586,
"learning_rate": 2.9883256444549862e-05,
"loss": 0.0105,
"step": 627
},
{
"epoch": 1.210120481927711,
"grad_norm": 0.18968136608600616,
"learning_rate": 2.984424227247529e-05,
"loss": 0.0089,
"step": 628
},
{
"epoch": 1.2120481927710842,
"grad_norm": 0.28732606768608093,
"learning_rate": 2.980517860035656e-05,
"loss": 0.0253,
"step": 629
},
{
"epoch": 1.213975903614458,
"grad_norm": 0.21131543815135956,
"learning_rate": 2.9766065624618518e-05,
"loss": 0.0134,
"step": 630
},
{
"epoch": 1.2159036144578312,
"grad_norm": 0.7594877481460571,
"learning_rate": 2.972690354193388e-05,
"loss": 0.0157,
"step": 631
},
{
"epoch": 1.2178313253012047,
"grad_norm": 0.730291485786438,
"learning_rate": 2.96876925492223e-05,
"loss": 0.0204,
"step": 632
},
{
"epoch": 1.2197590361445783,
"grad_norm": 0.20333674550056458,
"learning_rate": 2.9648432843649382e-05,
"loss": 0.0114,
"step": 633
},
{
"epoch": 1.2216867469879518,
"grad_norm": 0.5680793523788452,
"learning_rate": 2.960912462262566e-05,
"loss": 0.0146,
"step": 634
},
{
"epoch": 1.2236144578313253,
"grad_norm": 0.4591079354286194,
"learning_rate": 2.9569768083805618e-05,
"loss": 0.0112,
"step": 635
},
{
"epoch": 1.2255421686746988,
"grad_norm": 0.3793511390686035,
"learning_rate": 2.953036342508671e-05,
"loss": 0.0377,
"step": 636
},
{
"epoch": 1.2274698795180723,
"grad_norm": 1.118723750114441,
"learning_rate": 2.9490910844608346e-05,
"loss": 0.0432,
"step": 637
},
{
"epoch": 1.2293975903614458,
"grad_norm": 0.36990776658058167,
"learning_rate": 2.9451410540750887e-05,
"loss": 0.0203,
"step": 638
},
{
"epoch": 1.2313253012048193,
"grad_norm": 0.930397629737854,
"learning_rate": 2.94118627121347e-05,
"loss": 0.0311,
"step": 639
},
{
"epoch": 1.2332530120481928,
"grad_norm": 0.2347625195980072,
"learning_rate": 2.9372267557619075e-05,
"loss": 0.0168,
"step": 640
},
{
"epoch": 1.2351807228915663,
"grad_norm": 0.3720332384109497,
"learning_rate": 2.933262527630131e-05,
"loss": 0.0136,
"step": 641
},
{
"epoch": 1.2371084337349398,
"grad_norm": 0.4871984124183655,
"learning_rate": 2.929293606751565e-05,
"loss": 0.0339,
"step": 642
},
{
"epoch": 1.2390361445783133,
"grad_norm": 0.35853689908981323,
"learning_rate": 2.9253200130832322e-05,
"loss": 0.0095,
"step": 643
},
{
"epoch": 1.2409638554216866,
"grad_norm": 0.42003703117370605,
"learning_rate": 2.92134176660565e-05,
"loss": 0.0142,
"step": 644
},
{
"epoch": 1.2428915662650604,
"grad_norm": 0.3854500651359558,
"learning_rate": 2.9173588873227338e-05,
"loss": 0.0209,
"step": 645
},
{
"epoch": 1.2448192771084337,
"grad_norm": 0.24665917456150055,
"learning_rate": 2.913371395261691e-05,
"loss": 0.0087,
"step": 646
},
{
"epoch": 1.2467469879518072,
"grad_norm": 0.41571593284606934,
"learning_rate": 2.9093793104729268e-05,
"loss": 0.0164,
"step": 647
},
{
"epoch": 1.2486746987951807,
"grad_norm": 0.4597891569137573,
"learning_rate": 2.9053826530299377e-05,
"loss": 0.0138,
"step": 648
},
{
"epoch": 1.2506024096385542,
"grad_norm": 0.43345385789871216,
"learning_rate": 2.901381443029215e-05,
"loss": 0.0353,
"step": 649
},
{
"epoch": 1.2525301204819277,
"grad_norm": 0.3706768751144409,
"learning_rate": 2.897375700590141e-05,
"loss": 0.007,
"step": 650
},
{
"epoch": 1.2544578313253012,
"grad_norm": 0.30305296182632446,
"learning_rate": 2.8933654458548873e-05,
"loss": 0.0123,
"step": 651
},
{
"epoch": 1.2563855421686747,
"grad_norm": 0.2042127549648285,
"learning_rate": 2.8893506989883167e-05,
"loss": 0.0099,
"step": 652
},
{
"epoch": 1.2583132530120482,
"grad_norm": 0.20524422824382782,
"learning_rate": 2.8853314801778784e-05,
"loss": 0.0097,
"step": 653
},
{
"epoch": 1.2602409638554217,
"grad_norm": 0.2351921945810318,
"learning_rate": 2.8813078096335093e-05,
"loss": 0.0091,
"step": 654
},
{
"epoch": 1.2621686746987952,
"grad_norm": 0.34547340869903564,
"learning_rate": 2.87727970758753e-05,
"loss": 0.0088,
"step": 655
},
{
"epoch": 1.2640963855421687,
"grad_norm": 0.35163217782974243,
"learning_rate": 2.8732471942945443e-05,
"loss": 0.0145,
"step": 656
},
{
"epoch": 1.266024096385542,
"grad_norm": 1.715137243270874,
"learning_rate": 2.8692102900313378e-05,
"loss": 0.0198,
"step": 657
},
{
"epoch": 1.2679518072289158,
"grad_norm": 0.2860178053379059,
"learning_rate": 2.8651690150967748e-05,
"loss": 0.0085,
"step": 658
},
{
"epoch": 1.269879518072289,
"grad_norm": 0.21175967156887054,
"learning_rate": 2.8611233898116967e-05,
"loss": 0.0071,
"step": 659
},
{
"epoch": 1.2718072289156628,
"grad_norm": 0.33726972341537476,
"learning_rate": 2.85707343451882e-05,
"loss": 0.012,
"step": 660
},
{
"epoch": 1.273734939759036,
"grad_norm": 0.2138456553220749,
"learning_rate": 2.853019169582635e-05,
"loss": 0.0092,
"step": 661
},
{
"epoch": 1.2756626506024096,
"grad_norm": 0.2304934412240982,
"learning_rate": 2.8489606153892997e-05,
"loss": 0.0144,
"step": 662
},
{
"epoch": 1.277590361445783,
"grad_norm": 0.2691061794757843,
"learning_rate": 2.8448977923465425e-05,
"loss": 0.0121,
"step": 663
},
{
"epoch": 1.2795180722891566,
"grad_norm": 0.35254305601119995,
"learning_rate": 2.840830720883555e-05,
"loss": 0.0125,
"step": 664
},
{
"epoch": 1.28144578313253,
"grad_norm": 0.36552608013153076,
"learning_rate": 2.836759421450893e-05,
"loss": 0.021,
"step": 665
},
{
"epoch": 1.2833734939759036,
"grad_norm": 0.37177154421806335,
"learning_rate": 2.83268391452037e-05,
"loss": 0.0216,
"step": 666
},
{
"epoch": 1.2853012048192771,
"grad_norm": 0.20932547748088837,
"learning_rate": 2.828604220584958e-05,
"loss": 0.0077,
"step": 667
},
{
"epoch": 1.2872289156626506,
"grad_norm": 0.5158557295799255,
"learning_rate": 2.824520360158681e-05,
"loss": 0.0394,
"step": 668
},
{
"epoch": 1.2891566265060241,
"grad_norm": 0.22623969614505768,
"learning_rate": 2.820432353776515e-05,
"loss": 0.0087,
"step": 669
},
{
"epoch": 1.2910843373493976,
"grad_norm": 0.2996046245098114,
"learning_rate": 2.8163402219942822e-05,
"loss": 0.01,
"step": 670
},
{
"epoch": 1.2930120481927712,
"grad_norm": 0.24957989156246185,
"learning_rate": 2.8122439853885488e-05,
"loss": 0.0127,
"step": 671
},
{
"epoch": 1.2949397590361444,
"grad_norm": 0.2636559307575226,
"learning_rate": 2.8081436645565216e-05,
"loss": 0.0128,
"step": 672
},
{
"epoch": 1.2968674698795182,
"grad_norm": 0.3531591296195984,
"learning_rate": 2.804039280115944e-05,
"loss": 0.0199,
"step": 673
},
{
"epoch": 1.2987951807228915,
"grad_norm": 0.3682299852371216,
"learning_rate": 2.7999308527049927e-05,
"loss": 0.0088,
"step": 674
},
{
"epoch": 1.3007228915662652,
"grad_norm": 0.19555217027664185,
"learning_rate": 2.795818402982174e-05,
"loss": 0.0084,
"step": 675
},
{
"epoch": 1.3026506024096385,
"grad_norm": 0.2864912450313568,
"learning_rate": 2.7917019516262186e-05,
"loss": 0.0154,
"step": 676
},
{
"epoch": 1.304578313253012,
"grad_norm": 0.2211237996816635,
"learning_rate": 2.78758151933598e-05,
"loss": 0.0078,
"step": 677
},
{
"epoch": 1.3065060240963855,
"grad_norm": 0.13646945357322693,
"learning_rate": 2.7834571268303294e-05,
"loss": 0.0058,
"step": 678
},
{
"epoch": 1.308433734939759,
"grad_norm": 0.16530285775661469,
"learning_rate": 2.779328794848049e-05,
"loss": 0.007,
"step": 679
},
{
"epoch": 1.3103614457831325,
"grad_norm": 0.2145693302154541,
"learning_rate": 2.7751965441477325e-05,
"loss": 0.0203,
"step": 680
},
{
"epoch": 1.312289156626506,
"grad_norm": 0.24273739755153656,
"learning_rate": 2.771060395507677e-05,
"loss": 0.0106,
"step": 681
},
{
"epoch": 1.3142168674698795,
"grad_norm": 0.20430618524551392,
"learning_rate": 2.7669203697257794e-05,
"loss": 0.0122,
"step": 682
},
{
"epoch": 1.316144578313253,
"grad_norm": 0.2502615749835968,
"learning_rate": 2.7627764876194335e-05,
"loss": 0.0101,
"step": 683
},
{
"epoch": 1.3180722891566266,
"grad_norm": 0.287239670753479,
"learning_rate": 2.7586287700254214e-05,
"loss": 0.0203,
"step": 684
},
{
"epoch": 1.32,
"grad_norm": 0.16239754855632782,
"learning_rate": 2.7544772377998147e-05,
"loss": 0.0084,
"step": 685
},
{
"epoch": 1.3219277108433736,
"grad_norm": 0.27174142003059387,
"learning_rate": 2.7503219118178636e-05,
"loss": 0.008,
"step": 686
},
{
"epoch": 1.3238554216867469,
"grad_norm": 0.12878240644931793,
"learning_rate": 2.7461628129738954e-05,
"loss": 0.0053,
"step": 687
},
{
"epoch": 1.3257831325301206,
"grad_norm": 0.16112515330314636,
"learning_rate": 2.7419999621812086e-05,
"loss": 0.0059,
"step": 688
},
{
"epoch": 1.3277108433734939,
"grad_norm": 0.2398834228515625,
"learning_rate": 2.7378333803719672e-05,
"loss": 0.0095,
"step": 689
},
{
"epoch": 1.3296385542168676,
"grad_norm": 0.18516193330287933,
"learning_rate": 2.733663088497097e-05,
"loss": 0.0071,
"step": 690
},
{
"epoch": 1.331566265060241,
"grad_norm": 0.2974924147129059,
"learning_rate": 2.7294891075261785e-05,
"loss": 0.0227,
"step": 691
},
{
"epoch": 1.3334939759036144,
"grad_norm": 0.12931054830551147,
"learning_rate": 2.7253114584473418e-05,
"loss": 0.0039,
"step": 692
},
{
"epoch": 1.335421686746988,
"grad_norm": 0.16319474577903748,
"learning_rate": 2.7211301622671623e-05,
"loss": 0.008,
"step": 693
},
{
"epoch": 1.3373493975903614,
"grad_norm": 0.27622169256210327,
"learning_rate": 2.7169452400105533e-05,
"loss": 0.0238,
"step": 694
},
{
"epoch": 1.339277108433735,
"grad_norm": 0.45309779047966003,
"learning_rate": 2.712756712720663e-05,
"loss": 0.0439,
"step": 695
},
{
"epoch": 1.3412048192771084,
"grad_norm": 0.2469855099916458,
"learning_rate": 2.708564601458765e-05,
"loss": 0.0085,
"step": 696
},
{
"epoch": 1.343132530120482,
"grad_norm": 0.4245856702327728,
"learning_rate": 2.7043689273041535e-05,
"loss": 0.0097,
"step": 697
},
{
"epoch": 1.3450602409638555,
"grad_norm": 0.26796087622642517,
"learning_rate": 2.7001697113540414e-05,
"loss": 0.0119,
"step": 698
},
{
"epoch": 1.346987951807229,
"grad_norm": 0.3569283187389374,
"learning_rate": 2.6959669747234482e-05,
"loss": 0.0096,
"step": 699
},
{
"epoch": 1.3489156626506025,
"grad_norm": 0.7038524150848389,
"learning_rate": 2.6917607385450973e-05,
"loss": 0.0317,
"step": 700
},
{
"epoch": 1.350843373493976,
"grad_norm": 0.23568563163280487,
"learning_rate": 2.687551023969308e-05,
"loss": 0.0112,
"step": 701
},
{
"epoch": 1.3527710843373493,
"grad_norm": 0.20338499546051025,
"learning_rate": 2.6833378521638935e-05,
"loss": 0.0092,
"step": 702
},
{
"epoch": 1.354698795180723,
"grad_norm": 4.22187614440918,
"learning_rate": 2.679121244314046e-05,
"loss": 0.0314,
"step": 703
},
{
"epoch": 1.3566265060240963,
"grad_norm": 0.2542206048965454,
"learning_rate": 2.674901221622239e-05,
"loss": 0.0158,
"step": 704
},
{
"epoch": 1.3585542168674698,
"grad_norm": 0.49705010652542114,
"learning_rate": 2.670677805308116e-05,
"loss": 0.0162,
"step": 705
},
{
"epoch": 1.3604819277108433,
"grad_norm": 0.17502115666866302,
"learning_rate": 2.666451016608383e-05,
"loss": 0.0074,
"step": 706
},
{
"epoch": 1.3624096385542168,
"grad_norm": 0.21738742291927338,
"learning_rate": 2.6622208767767075e-05,
"loss": 0.0135,
"step": 707
},
{
"epoch": 1.3643373493975903,
"grad_norm": 0.3309847414493561,
"learning_rate": 2.6579874070836032e-05,
"loss": 0.0107,
"step": 708
},
{
"epoch": 1.3662650602409638,
"grad_norm": 0.10706827789545059,
"learning_rate": 2.6537506288163303e-05,
"loss": 0.0043,
"step": 709
},
{
"epoch": 1.3681927710843373,
"grad_norm": 0.173640176653862,
"learning_rate": 2.6495105632787835e-05,
"loss": 0.0092,
"step": 710
},
{
"epoch": 1.3701204819277109,
"grad_norm": 0.2636397182941437,
"learning_rate": 2.6452672317913893e-05,
"loss": 0.0097,
"step": 711
},
{
"epoch": 1.3720481927710844,
"grad_norm": 0.28485360741615295,
"learning_rate": 2.6410206556909943e-05,
"loss": 0.0193,
"step": 712
},
{
"epoch": 1.3739759036144579,
"grad_norm": 0.23210027813911438,
"learning_rate": 2.636770856330761e-05,
"loss": 0.0229,
"step": 713
},
{
"epoch": 1.3759036144578314,
"grad_norm": 0.13388316333293915,
"learning_rate": 2.6325178550800596e-05,
"loss": 0.004,
"step": 714
},
{
"epoch": 1.377831325301205,
"grad_norm": 0.5131422877311707,
"learning_rate": 2.6282616733243603e-05,
"loss": 0.0137,
"step": 715
},
{
"epoch": 1.3797590361445784,
"grad_norm": 0.3243267834186554,
"learning_rate": 2.6240023324651258e-05,
"loss": 0.0153,
"step": 716
},
{
"epoch": 1.3816867469879517,
"grad_norm": 0.1440611034631729,
"learning_rate": 2.619739853919704e-05,
"loss": 0.0031,
"step": 717
},
{
"epoch": 1.3836144578313254,
"grad_norm": 0.30346596240997314,
"learning_rate": 2.6154742591212196e-05,
"loss": 0.0109,
"step": 718
},
{
"epoch": 1.3855421686746987,
"grad_norm": 0.19109240174293518,
"learning_rate": 2.611205569518468e-05,
"loss": 0.0094,
"step": 719
},
{
"epoch": 1.3874698795180722,
"grad_norm": 0.28636518120765686,
"learning_rate": 2.6069338065758056e-05,
"loss": 0.0123,
"step": 720
},
{
"epoch": 1.3893975903614457,
"grad_norm": 0.28083911538124084,
"learning_rate": 2.6026589917730416e-05,
"loss": 0.0104,
"step": 721
},
{
"epoch": 1.3913253012048192,
"grad_norm": 0.36553966999053955,
"learning_rate": 2.5983811466053327e-05,
"loss": 0.0143,
"step": 722
},
{
"epoch": 1.3932530120481927,
"grad_norm": 0.23317205905914307,
"learning_rate": 2.5941002925830708e-05,
"loss": 0.011,
"step": 723
},
{
"epoch": 1.3951807228915662,
"grad_norm": 0.3825171887874603,
"learning_rate": 2.589816451231781e-05,
"loss": 0.0098,
"step": 724
},
{
"epoch": 1.3971084337349398,
"grad_norm": 0.19916608929634094,
"learning_rate": 2.585529644092006e-05,
"loss": 0.0094,
"step": 725
},
{
"epoch": 1.3990361445783133,
"grad_norm": 0.19990523159503937,
"learning_rate": 2.5812398927192027e-05,
"loss": 0.0128,
"step": 726
},
{
"epoch": 1.4009638554216868,
"grad_norm": 0.34662899374961853,
"learning_rate": 2.5769472186836347e-05,
"loss": 0.0091,
"step": 727
},
{
"epoch": 1.4028915662650603,
"grad_norm": 0.23481112718582153,
"learning_rate": 2.5726516435702583e-05,
"loss": 0.0154,
"step": 728
},
{
"epoch": 1.4048192771084338,
"grad_norm": 0.1846667379140854,
"learning_rate": 2.5683531889786194e-05,
"loss": 0.0088,
"step": 729
},
{
"epoch": 1.4067469879518073,
"grad_norm": 0.16717663407325745,
"learning_rate": 2.564051876522742e-05,
"loss": 0.0083,
"step": 730
},
{
"epoch": 1.4086746987951808,
"grad_norm": 0.4116475284099579,
"learning_rate": 2.5597477278310202e-05,
"loss": 0.0179,
"step": 731
},
{
"epoch": 1.410602409638554,
"grad_norm": 0.171807661652565,
"learning_rate": 2.5554407645461115e-05,
"loss": 0.0063,
"step": 732
},
{
"epoch": 1.4125301204819278,
"grad_norm": 0.1954439878463745,
"learning_rate": 2.5511310083248243e-05,
"loss": 0.017,
"step": 733
},
{
"epoch": 1.4144578313253011,
"grad_norm": 0.37158989906311035,
"learning_rate": 2.5468184808380104e-05,
"loss": 0.0173,
"step": 734
},
{
"epoch": 1.4163855421686746,
"grad_norm": 0.2001633644104004,
"learning_rate": 2.542503203770458e-05,
"loss": 0.0165,
"step": 735
},
{
"epoch": 1.4183132530120481,
"grad_norm": 0.45673373341560364,
"learning_rate": 2.53818519882078e-05,
"loss": 0.0185,
"step": 736
},
{
"epoch": 1.4202409638554216,
"grad_norm": 0.3838701546192169,
"learning_rate": 2.5338644877013067e-05,
"loss": 0.0134,
"step": 737
},
{
"epoch": 1.4221686746987952,
"grad_norm": 0.32032477855682373,
"learning_rate": 2.5295410921379745e-05,
"loss": 0.0143,
"step": 738
},
{
"epoch": 1.4240963855421687,
"grad_norm": 0.4594039022922516,
"learning_rate": 2.52521503387022e-05,
"loss": 0.0193,
"step": 739
},
{
"epoch": 1.4260240963855422,
"grad_norm": 0.3889620900154114,
"learning_rate": 2.5208863346508667e-05,
"loss": 0.0114,
"step": 740
},
{
"epoch": 1.4279518072289157,
"grad_norm": 0.33153319358825684,
"learning_rate": 2.5165550162460203e-05,
"loss": 0.0102,
"step": 741
},
{
"epoch": 1.4298795180722892,
"grad_norm": 0.7269518375396729,
"learning_rate": 2.5122211004349536e-05,
"loss": 0.0215,
"step": 742
},
{
"epoch": 1.4318072289156627,
"grad_norm": 0.31653261184692383,
"learning_rate": 2.5078846090100023e-05,
"loss": 0.0115,
"step": 743
},
{
"epoch": 1.4337349397590362,
"grad_norm": 0.20620353519916534,
"learning_rate": 2.5035455637764518e-05,
"loss": 0.0153,
"step": 744
},
{
"epoch": 1.4356626506024097,
"grad_norm": 0.17266008257865906,
"learning_rate": 2.4992039865524297e-05,
"loss": 0.0069,
"step": 745
},
{
"epoch": 1.4375903614457832,
"grad_norm": 0.24760811030864716,
"learning_rate": 2.494859899168795e-05,
"loss": 0.0108,
"step": 746
},
{
"epoch": 1.4395180722891565,
"grad_norm": 0.2584865391254425,
"learning_rate": 2.4905133234690282e-05,
"loss": 0.0095,
"step": 747
},
{
"epoch": 1.4414457831325302,
"grad_norm": 0.48847514390945435,
"learning_rate": 2.486164281309122e-05,
"loss": 0.0181,
"step": 748
},
{
"epoch": 1.4433734939759035,
"grad_norm": 0.42942047119140625,
"learning_rate": 2.4818127945574717e-05,
"loss": 0.025,
"step": 749
},
{
"epoch": 1.445301204819277,
"grad_norm": 0.23713800311088562,
"learning_rate": 2.4774588850947648e-05,
"loss": 0.0085,
"step": 750
},
{
"epoch": 1.4472289156626506,
"grad_norm": 0.8797569870948792,
"learning_rate": 2.473102574813871e-05,
"loss": 0.0097,
"step": 751
},
{
"epoch": 1.449156626506024,
"grad_norm": 0.2744862735271454,
"learning_rate": 2.4687438856197302e-05,
"loss": 0.0122,
"step": 752
},
{
"epoch": 1.4510843373493976,
"grad_norm": 0.12747010588645935,
"learning_rate": 2.4643828394292478e-05,
"loss": 0.0056,
"step": 753
},
{
"epoch": 1.453012048192771,
"grad_norm": 0.37376829981803894,
"learning_rate": 2.4600194581711775e-05,
"loss": 0.0052,
"step": 754
},
{
"epoch": 1.4549397590361446,
"grad_norm": 0.2536911368370056,
"learning_rate": 2.4556537637860176e-05,
"loss": 0.0113,
"step": 755
},
{
"epoch": 1.456867469879518,
"grad_norm": 0.25950780510902405,
"learning_rate": 2.451285778225894e-05,
"loss": 0.0099,
"step": 756
},
{
"epoch": 1.4587951807228916,
"grad_norm": 0.19535955786705017,
"learning_rate": 2.4469155234544565e-05,
"loss": 0.0069,
"step": 757
},
{
"epoch": 1.4607228915662651,
"grad_norm": 0.22816115617752075,
"learning_rate": 2.442543021446764e-05,
"loss": 0.0088,
"step": 758
},
{
"epoch": 1.4626506024096386,
"grad_norm": 0.3363986313343048,
"learning_rate": 2.4381682941891755e-05,
"loss": 0.0182,
"step": 759
},
{
"epoch": 1.464578313253012,
"grad_norm": 0.21492891013622284,
"learning_rate": 2.4337913636792382e-05,
"loss": 0.0069,
"step": 760
},
{
"epoch": 1.4665060240963856,
"grad_norm": 0.6070862412452698,
"learning_rate": 2.429412251925579e-05,
"loss": 0.0406,
"step": 761
},
{
"epoch": 1.468433734939759,
"grad_norm": 2.6469690799713135,
"learning_rate": 2.425030980947793e-05,
"loss": 0.0205,
"step": 762
},
{
"epoch": 1.4703614457831327,
"grad_norm": 0.30909740924835205,
"learning_rate": 2.420647572776332e-05,
"loss": 0.0136,
"step": 763
},
{
"epoch": 1.472289156626506,
"grad_norm": 0.6639553904533386,
"learning_rate": 2.416262049452395e-05,
"loss": 0.011,
"step": 764
},
{
"epoch": 1.4742168674698795,
"grad_norm": 0.2919616997241974,
"learning_rate": 2.4118744330278147e-05,
"loss": 0.0131,
"step": 765
},
{
"epoch": 1.476144578313253,
"grad_norm": 0.5232429504394531,
"learning_rate": 2.4074847455649523e-05,
"loss": 0.0138,
"step": 766
},
{
"epoch": 1.4780722891566265,
"grad_norm": 5.630630970001221,
"learning_rate": 2.403093009136579e-05,
"loss": 0.0264,
"step": 767
},
{
"epoch": 1.48,
"grad_norm": 0.33234721422195435,
"learning_rate": 2.3986992458257707e-05,
"loss": 0.0111,
"step": 768
},
{
"epoch": 1.4819277108433735,
"grad_norm": 0.28444772958755493,
"learning_rate": 2.3943034777257945e-05,
"loss": 0.0144,
"step": 769
},
{
"epoch": 1.483855421686747,
"grad_norm": 0.16229979693889618,
"learning_rate": 2.38990572694e-05,
"loss": 0.0062,
"step": 770
},
{
"epoch": 1.4857831325301205,
"grad_norm": 0.27474716305732727,
"learning_rate": 2.385506015581704e-05,
"loss": 0.0172,
"step": 771
},
{
"epoch": 1.487710843373494,
"grad_norm": 0.246526300907135,
"learning_rate": 2.381104365774083e-05,
"loss": 0.012,
"step": 772
},
{
"epoch": 1.4896385542168675,
"grad_norm": 0.282047837972641,
"learning_rate": 2.37670079965006e-05,
"loss": 0.0116,
"step": 773
},
{
"epoch": 1.491566265060241,
"grad_norm": 0.2878139317035675,
"learning_rate": 2.3722953393521944e-05,
"loss": 0.0147,
"step": 774
},
{
"epoch": 1.4934939759036143,
"grad_norm": 0.5586277842521667,
"learning_rate": 2.367888007032571e-05,
"loss": 0.0111,
"step": 775
},
{
"epoch": 1.495421686746988,
"grad_norm": 0.562160313129425,
"learning_rate": 2.3634788248526846e-05,
"loss": 0.0061,
"step": 776
},
{
"epoch": 1.4973493975903613,
"grad_norm": 0.3452005982398987,
"learning_rate": 2.3590678149833356e-05,
"loss": 0.0205,
"step": 777
},
{
"epoch": 1.499277108433735,
"grad_norm": 0.7757686376571655,
"learning_rate": 2.3546549996045114e-05,
"loss": 0.0273,
"step": 778
},
{
"epoch": 1.5012048192771084,
"grad_norm": 0.19530551135540009,
"learning_rate": 2.3502404009052812e-05,
"loss": 0.0083,
"step": 779
},
{
"epoch": 1.503132530120482,
"grad_norm": 0.2586531639099121,
"learning_rate": 2.3458240410836775e-05,
"loss": 0.0122,
"step": 780
},
{
"epoch": 1.5050602409638554,
"grad_norm": 0.30063286423683167,
"learning_rate": 2.3414059423465924e-05,
"loss": 0.0083,
"step": 781
},
{
"epoch": 1.5069879518072289,
"grad_norm": 0.18663185834884644,
"learning_rate": 2.3369861269096575e-05,
"loss": 0.0104,
"step": 782
},
{
"epoch": 1.5089156626506024,
"grad_norm": 0.4405941069126129,
"learning_rate": 2.3325646169971416e-05,
"loss": 0.0264,
"step": 783
},
{
"epoch": 1.510843373493976,
"grad_norm": 0.2947913110256195,
"learning_rate": 2.3281414348418294e-05,
"loss": 0.0107,
"step": 784
},
{
"epoch": 1.5127710843373494,
"grad_norm": 0.23813778162002563,
"learning_rate": 2.3237166026849158e-05,
"loss": 0.0084,
"step": 785
},
{
"epoch": 1.514698795180723,
"grad_norm": 0.33380329608917236,
"learning_rate": 2.3192901427758932e-05,
"loss": 0.0111,
"step": 786
},
{
"epoch": 1.5166265060240964,
"grad_norm": 0.3736988306045532,
"learning_rate": 2.314862077372438e-05,
"loss": 0.0135,
"step": 787
},
{
"epoch": 1.5185542168674697,
"grad_norm": 0.3785395920276642,
"learning_rate": 2.3104324287402996e-05,
"loss": 0.0265,
"step": 788
},
{
"epoch": 1.5204819277108435,
"grad_norm": 0.3359154462814331,
"learning_rate": 2.3060012191531885e-05,
"loss": 0.0127,
"step": 789
},
{
"epoch": 1.5224096385542167,
"grad_norm": 0.720753014087677,
"learning_rate": 2.301568470892664e-05,
"loss": 0.0134,
"step": 790
},
{
"epoch": 1.5243373493975905,
"grad_norm": 0.36473193764686584,
"learning_rate": 2.297134206248024e-05,
"loss": 0.0318,
"step": 791
},
{
"epoch": 1.5262650602409638,
"grad_norm": 0.29987087845802307,
"learning_rate": 2.2926984475161884e-05,
"loss": 0.008,
"step": 792
},
{
"epoch": 1.5281927710843375,
"grad_norm": 0.2883112132549286,
"learning_rate": 2.2882612170015914e-05,
"loss": 0.0125,
"step": 793
},
{
"epoch": 1.5301204819277108,
"grad_norm": 0.28983229398727417,
"learning_rate": 2.2838225370160682e-05,
"loss": 0.0155,
"step": 794
},
{
"epoch": 1.5320481927710843,
"grad_norm": 0.47236886620521545,
"learning_rate": 2.2793824298787414e-05,
"loss": 0.0132,
"step": 795
},
{
"epoch": 1.5339759036144578,
"grad_norm": 0.8328865170478821,
"learning_rate": 2.2749409179159104e-05,
"loss": 0.026,
"step": 796
},
{
"epoch": 1.5359036144578313,
"grad_norm": 0.3129172623157501,
"learning_rate": 2.2704980234609396e-05,
"loss": 0.0099,
"step": 797
},
{
"epoch": 1.5378313253012048,
"grad_norm": 0.22284500300884247,
"learning_rate": 2.2660537688541416e-05,
"loss": 0.009,
"step": 798
},
{
"epoch": 1.5397590361445783,
"grad_norm": 0.3346405625343323,
"learning_rate": 2.2616081764426726e-05,
"loss": 0.0077,
"step": 799
},
{
"epoch": 1.5416867469879518,
"grad_norm": 0.2923565208911896,
"learning_rate": 2.2571612685804124e-05,
"loss": 0.0119,
"step": 800
},
{
"epoch": 1.5436144578313253,
"grad_norm": 0.1921311914920807,
"learning_rate": 2.252713067627857e-05,
"loss": 0.0083,
"step": 801
},
{
"epoch": 1.5455421686746988,
"grad_norm": 0.23221106827259064,
"learning_rate": 2.2482635959520044e-05,
"loss": 0.0049,
"step": 802
},
{
"epoch": 1.5474698795180721,
"grad_norm": 0.6340724229812622,
"learning_rate": 2.243812875926241e-05,
"loss": 0.0273,
"step": 803
},
{
"epoch": 1.5493975903614459,
"grad_norm": 0.2699439823627472,
"learning_rate": 2.2393609299302314e-05,
"loss": 0.0108,
"step": 804
},
{
"epoch": 1.5513253012048192,
"grad_norm": 0.2005189210176468,
"learning_rate": 2.2349077803498052e-05,
"loss": 0.0076,
"step": 805
},
{
"epoch": 1.5532530120481929,
"grad_norm": 0.39668548107147217,
"learning_rate": 2.230453449576842e-05,
"loss": 0.0135,
"step": 806
},
{
"epoch": 1.5551807228915662,
"grad_norm": 0.2406950294971466,
"learning_rate": 2.2259979600091635e-05,
"loss": 0.0094,
"step": 807
},
{
"epoch": 1.55710843373494,
"grad_norm": 0.30363157391548157,
"learning_rate": 2.2215413340504158e-05,
"loss": 0.0178,
"step": 808
},
{
"epoch": 1.5590361445783132,
"grad_norm": 0.19508181512355804,
"learning_rate": 2.2170835941099605e-05,
"loss": 0.0069,
"step": 809
},
{
"epoch": 1.5609638554216867,
"grad_norm": 0.734106719493866,
"learning_rate": 2.2126247626027615e-05,
"loss": 0.0319,
"step": 810
},
{
"epoch": 1.5628915662650602,
"grad_norm": 0.2591583728790283,
"learning_rate": 2.208164861949268e-05,
"loss": 0.0168,
"step": 811
},
{
"epoch": 1.5648192771084337,
"grad_norm": 0.2386734038591385,
"learning_rate": 2.20370391457531e-05,
"loss": 0.0041,
"step": 812
},
{
"epoch": 1.5667469879518072,
"grad_norm": 0.1675218939781189,
"learning_rate": 2.1992419429119764e-05,
"loss": 0.0078,
"step": 813
},
{
"epoch": 1.5686746987951807,
"grad_norm": 0.45591506361961365,
"learning_rate": 2.1947789693955097e-05,
"loss": 0.0166,
"step": 814
},
{
"epoch": 1.5706024096385542,
"grad_norm": 0.46940621733665466,
"learning_rate": 2.190315016467188e-05,
"loss": 0.0176,
"step": 815
},
{
"epoch": 1.5725301204819278,
"grad_norm": 0.2294205278158188,
"learning_rate": 2.1858501065732146e-05,
"loss": 0.0102,
"step": 816
},
{
"epoch": 1.5744578313253013,
"grad_norm": 0.28922322392463684,
"learning_rate": 2.181384262164606e-05,
"loss": 0.0111,
"step": 817
},
{
"epoch": 1.5763855421686745,
"grad_norm": 0.19650064408779144,
"learning_rate": 2.1769175056970765e-05,
"loss": 0.0076,
"step": 818
},
{
"epoch": 1.5783132530120483,
"grad_norm": 0.19538825750350952,
"learning_rate": 2.172449859630927e-05,
"loss": 0.0118,
"step": 819
},
{
"epoch": 1.5802409638554216,
"grad_norm": 0.1900389939546585,
"learning_rate": 2.167981346430931e-05,
"loss": 0.0066,
"step": 820
},
{
"epoch": 1.5821686746987953,
"grad_norm": 0.21593710780143738,
"learning_rate": 2.1635119885662235e-05,
"loss": 0.0101,
"step": 821
},
{
"epoch": 1.5840963855421686,
"grad_norm": 0.2699289321899414,
"learning_rate": 2.159041808510185e-05,
"loss": 0.0118,
"step": 822
},
{
"epoch": 1.5860240963855423,
"grad_norm": 0.31867673993110657,
"learning_rate": 2.1545708287403322e-05,
"loss": 0.0122,
"step": 823
},
{
"epoch": 1.5879518072289156,
"grad_norm": 0.2862400412559509,
"learning_rate": 2.1500990717382004e-05,
"loss": 0.0216,
"step": 824
},
{
"epoch": 1.589879518072289,
"grad_norm": 0.28482481837272644,
"learning_rate": 2.145626559989237e-05,
"loss": 0.0136,
"step": 825
},
{
"epoch": 1.5918072289156626,
"grad_norm": 0.2866958975791931,
"learning_rate": 2.1411533159826803e-05,
"loss": 0.0298,
"step": 826
},
{
"epoch": 1.5937349397590361,
"grad_norm": 0.39092838764190674,
"learning_rate": 2.1366793622114533e-05,
"loss": 0.0382,
"step": 827
},
{
"epoch": 1.5956626506024096,
"grad_norm": 0.16381537914276123,
"learning_rate": 2.1322047211720468e-05,
"loss": 0.0074,
"step": 828
},
{
"epoch": 1.5975903614457831,
"grad_norm": 0.22146940231323242,
"learning_rate": 2.1277294153644083e-05,
"loss": 0.0103,
"step": 829
},
{
"epoch": 1.5995180722891567,
"grad_norm": 0.2155209183692932,
"learning_rate": 2.123253467291827e-05,
"loss": 0.0095,
"step": 830
},
{
"epoch": 1.6014457831325302,
"grad_norm": 0.41510409116744995,
"learning_rate": 2.118776899460822e-05,
"loss": 0.0457,
"step": 831
},
{
"epoch": 1.6033734939759037,
"grad_norm": 0.19718150794506073,
"learning_rate": 2.1142997343810293e-05,
"loss": 0.0192,
"step": 832
},
{
"epoch": 1.605301204819277,
"grad_norm": 0.40924403071403503,
"learning_rate": 2.1098219945650865e-05,
"loss": 0.0278,
"step": 833
},
{
"epoch": 1.6072289156626507,
"grad_norm": 0.18657824397087097,
"learning_rate": 2.105343702528524e-05,
"loss": 0.0076,
"step": 834
},
{
"epoch": 1.609156626506024,
"grad_norm": 0.1727641075849533,
"learning_rate": 2.100864880789645e-05,
"loss": 0.0076,
"step": 835
},
{
"epoch": 1.6110843373493977,
"grad_norm": 0.18138745427131653,
"learning_rate": 2.0963855518694203e-05,
"loss": 0.005,
"step": 836
},
{
"epoch": 1.613012048192771,
"grad_norm": 0.19173955917358398,
"learning_rate": 2.0919057382913675e-05,
"loss": 0.0084,
"step": 837
},
{
"epoch": 1.6149397590361447,
"grad_norm": 0.3812403380870819,
"learning_rate": 2.0874254625814435e-05,
"loss": 0.009,
"step": 838
},
{
"epoch": 1.616867469879518,
"grad_norm": 0.2009759545326233,
"learning_rate": 2.0829447472679285e-05,
"loss": 0.0098,
"step": 839
},
{
"epoch": 1.6187951807228915,
"grad_norm": 0.48703446984291077,
"learning_rate": 2.0784636148813124e-05,
"loss": 0.0099,
"step": 840
},
{
"epoch": 1.620722891566265,
"grad_norm": 0.28995075821876526,
"learning_rate": 2.0739820879541827e-05,
"loss": 0.0075,
"step": 841
},
{
"epoch": 1.6226506024096385,
"grad_norm": 0.2130059450864792,
"learning_rate": 2.069500189021111e-05,
"loss": 0.007,
"step": 842
},
{
"epoch": 1.624578313253012,
"grad_norm": 0.252524733543396,
"learning_rate": 2.0650179406185397e-05,
"loss": 0.0249,
"step": 843
},
{
"epoch": 1.6265060240963856,
"grad_norm": 0.23069098591804504,
"learning_rate": 2.060535365284668e-05,
"loss": 0.0084,
"step": 844
},
{
"epoch": 1.628433734939759,
"grad_norm": 0.25051403045654297,
"learning_rate": 2.056052485559338e-05,
"loss": 0.0071,
"step": 845
},
{
"epoch": 1.6303614457831326,
"grad_norm": 0.27664798498153687,
"learning_rate": 2.051569323983924e-05,
"loss": 0.0198,
"step": 846
},
{
"epoch": 1.632289156626506,
"grad_norm": 0.2954922318458557,
"learning_rate": 2.047085903101218e-05,
"loss": 0.006,
"step": 847
},
{
"epoch": 1.6342168674698794,
"grad_norm": 0.28477591276168823,
"learning_rate": 2.0426022454553137e-05,
"loss": 0.0147,
"step": 848
},
{
"epoch": 1.636144578313253,
"grad_norm": 0.2785305678844452,
"learning_rate": 2.0381183735914968e-05,
"loss": 0.0117,
"step": 849
},
{
"epoch": 1.6380722891566264,
"grad_norm": 0.2500309348106384,
"learning_rate": 2.0336343100561295e-05,
"loss": 0.008,
"step": 850
},
{
"epoch": 1.6400000000000001,
"grad_norm": 0.18932047486305237,
"learning_rate": 2.0291500773965392e-05,
"loss": 0.0256,
"step": 851
},
{
"epoch": 1.6419277108433734,
"grad_norm": 0.6396257877349854,
"learning_rate": 2.0246656981609013e-05,
"loss": 0.0141,
"step": 852
},
{
"epoch": 1.6438554216867471,
"grad_norm": 0.5072891116142273,
"learning_rate": 2.02018119489813e-05,
"loss": 0.008,
"step": 853
},
{
"epoch": 1.6457831325301204,
"grad_norm": 0.2920839488506317,
"learning_rate": 2.0156965901577635e-05,
"loss": 0.0085,
"step": 854
},
{
"epoch": 1.647710843373494,
"grad_norm": 0.1391262263059616,
"learning_rate": 2.011211906489848e-05,
"loss": 0.0078,
"step": 855
},
{
"epoch": 1.6496385542168674,
"grad_norm": 0.29620468616485596,
"learning_rate": 2.00672716644483e-05,
"loss": 0.0109,
"step": 856
},
{
"epoch": 1.651566265060241,
"grad_norm": 0.13946573436260223,
"learning_rate": 2.002242392573436e-05,
"loss": 0.0076,
"step": 857
},
{
"epoch": 1.6534939759036145,
"grad_norm": 0.9766128659248352,
"learning_rate": 1.997757607426565e-05,
"loss": 0.0309,
"step": 858
},
{
"epoch": 1.655421686746988,
"grad_norm": 0.18002203106880188,
"learning_rate": 1.9932728335551702e-05,
"loss": 0.0072,
"step": 859
},
{
"epoch": 1.6573493975903615,
"grad_norm": 0.28073111176490784,
"learning_rate": 1.988788093510152e-05,
"loss": 0.0246,
"step": 860
},
{
"epoch": 1.659277108433735,
"grad_norm": 0.1919957399368286,
"learning_rate": 1.9843034098422375e-05,
"loss": 0.0087,
"step": 861
},
{
"epoch": 1.6612048192771085,
"grad_norm": 0.1825258433818817,
"learning_rate": 1.9798188051018705e-05,
"loss": 0.0092,
"step": 862
},
{
"epoch": 1.6631325301204818,
"grad_norm": 0.32412952184677124,
"learning_rate": 1.9753343018390997e-05,
"loss": 0.0118,
"step": 863
},
{
"epoch": 1.6650602409638555,
"grad_norm": 0.12828563153743744,
"learning_rate": 1.9708499226034618e-05,
"loss": 0.0056,
"step": 864
},
{
"epoch": 1.6669879518072288,
"grad_norm": 0.18647560477256775,
"learning_rate": 1.966365689943871e-05,
"loss": 0.0094,
"step": 865
},
{
"epoch": 1.6689156626506025,
"grad_norm": 0.19835828244686127,
"learning_rate": 1.9618816264085042e-05,
"loss": 0.0097,
"step": 866
},
{
"epoch": 1.6708433734939758,
"grad_norm": 0.22364282608032227,
"learning_rate": 1.957397754544687e-05,
"loss": 0.0062,
"step": 867
},
{
"epoch": 1.6727710843373496,
"grad_norm": 0.29420018196105957,
"learning_rate": 1.952914096898783e-05,
"loss": 0.0182,
"step": 868
},
{
"epoch": 1.6746987951807228,
"grad_norm": 0.2149929702281952,
"learning_rate": 1.9484306760160766e-05,
"loss": 0.0125,
"step": 869
},
{
"epoch": 1.6766265060240964,
"grad_norm": 0.16844330728054047,
"learning_rate": 1.9439475144406623e-05,
"loss": 0.0074,
"step": 870
},
{
"epoch": 1.6785542168674699,
"grad_norm": 0.5010282397270203,
"learning_rate": 1.9394646347153334e-05,
"loss": 0.0213,
"step": 871
},
{
"epoch": 1.6804819277108434,
"grad_norm": 0.29847195744514465,
"learning_rate": 1.9349820593814606e-05,
"loss": 0.0173,
"step": 872
},
{
"epoch": 1.6824096385542169,
"grad_norm": 0.23835812509059906,
"learning_rate": 1.930499810978889e-05,
"loss": 0.011,
"step": 873
},
{
"epoch": 1.6843373493975904,
"grad_norm": 0.3269020617008209,
"learning_rate": 1.9260179120458177e-05,
"loss": 0.0285,
"step": 874
},
{
"epoch": 1.686265060240964,
"grad_norm": 0.2142144739627838,
"learning_rate": 1.9215363851186883e-05,
"loss": 0.0146,
"step": 875
},
{
"epoch": 1.6881927710843372,
"grad_norm": 0.3098377585411072,
"learning_rate": 1.9170552527320725e-05,
"loss": 0.0104,
"step": 876
},
{
"epoch": 1.690120481927711,
"grad_norm": 0.22504115104675293,
"learning_rate": 1.9125745374185568e-05,
"loss": 0.0091,
"step": 877
},
{
"epoch": 1.6920481927710842,
"grad_norm": 0.20633333921432495,
"learning_rate": 1.908094261708633e-05,
"loss": 0.0097,
"step": 878
},
{
"epoch": 1.693975903614458,
"grad_norm": 1.179566502571106,
"learning_rate": 1.9036144481305807e-05,
"loss": 0.0143,
"step": 879
},
{
"epoch": 1.6959036144578312,
"grad_norm": 0.15525613725185394,
"learning_rate": 1.8991351192103554e-05,
"loss": 0.0062,
"step": 880
},
{
"epoch": 1.697831325301205,
"grad_norm": 0.15966367721557617,
"learning_rate": 1.8946562974714763e-05,
"loss": 0.0048,
"step": 881
},
{
"epoch": 1.6997590361445782,
"grad_norm": 0.18902607262134552,
"learning_rate": 1.890178005434914e-05,
"loss": 0.0124,
"step": 882
},
{
"epoch": 1.701686746987952,
"grad_norm": 0.21692413091659546,
"learning_rate": 1.885700265618971e-05,
"loss": 0.0135,
"step": 883
},
{
"epoch": 1.7036144578313253,
"grad_norm": 0.38948455452919006,
"learning_rate": 1.8812231005391786e-05,
"loss": 0.0365,
"step": 884
},
{
"epoch": 1.7055421686746988,
"grad_norm": 0.2483491599559784,
"learning_rate": 1.8767465327081736e-05,
"loss": 0.0202,
"step": 885
},
{
"epoch": 1.7074698795180723,
"grad_norm": 0.15305832028388977,
"learning_rate": 1.872270584635592e-05,
"loss": 0.0035,
"step": 886
},
{
"epoch": 1.7093975903614458,
"grad_norm": 0.17794466018676758,
"learning_rate": 1.867795278827954e-05,
"loss": 0.0157,
"step": 887
},
{
"epoch": 1.7113253012048193,
"grad_norm": 0.1938813328742981,
"learning_rate": 1.863320637788547e-05,
"loss": 0.0071,
"step": 888
},
{
"epoch": 1.7132530120481928,
"grad_norm": 0.27061617374420166,
"learning_rate": 1.8588466840173207e-05,
"loss": 0.0347,
"step": 889
},
{
"epoch": 1.7151807228915663,
"grad_norm": 0.1541014313697815,
"learning_rate": 1.8543734400107637e-05,
"loss": 0.006,
"step": 890
},
{
"epoch": 1.7171084337349396,
"grad_norm": 0.1436876654624939,
"learning_rate": 1.8499009282617996e-05,
"loss": 0.0059,
"step": 891
},
{
"epoch": 1.7190361445783133,
"grad_norm": 1.0573723316192627,
"learning_rate": 1.8454291712596688e-05,
"loss": 0.008,
"step": 892
},
{
"epoch": 1.7209638554216866,
"grad_norm": 0.15406259894371033,
"learning_rate": 1.8409581914898157e-05,
"loss": 0.0061,
"step": 893
},
{
"epoch": 1.7228915662650603,
"grad_norm": 0.24822913110256195,
"learning_rate": 1.836488011433777e-05,
"loss": 0.0085,
"step": 894
},
{
"epoch": 1.7248192771084336,
"grad_norm": 0.21049316227436066,
"learning_rate": 1.83201865356907e-05,
"loss": 0.0075,
"step": 895
},
{
"epoch": 1.7267469879518074,
"grad_norm": 0.24159866571426392,
"learning_rate": 1.8275501403690733e-05,
"loss": 0.0156,
"step": 896
},
{
"epoch": 1.7286746987951807,
"grad_norm": 0.3191063106060028,
"learning_rate": 1.823082494302924e-05,
"loss": 0.0218,
"step": 897
},
{
"epoch": 1.7306024096385542,
"grad_norm": 0.20296362042427063,
"learning_rate": 1.8186157378353945e-05,
"loss": 0.0126,
"step": 898
},
{
"epoch": 1.7325301204819277,
"grad_norm": 0.1905524581670761,
"learning_rate": 1.8141498934267858e-05,
"loss": 0.0131,
"step": 899
},
{
"epoch": 1.7344578313253012,
"grad_norm": 0.5350520610809326,
"learning_rate": 1.809684983532813e-05,
"loss": 0.0115,
"step": 900
},
{
"epoch": 1.7363855421686747,
"grad_norm": 0.17144092917442322,
"learning_rate": 1.8052210306044907e-05,
"loss": 0.0113,
"step": 901
},
{
"epoch": 1.7383132530120482,
"grad_norm": 0.11777982115745544,
"learning_rate": 1.8007580570880236e-05,
"loss": 0.0058,
"step": 902
},
{
"epoch": 1.7402409638554217,
"grad_norm": 0.2078275978565216,
"learning_rate": 1.7962960854246908e-05,
"loss": 0.0106,
"step": 903
},
{
"epoch": 1.7421686746987952,
"grad_norm": 0.2550877630710602,
"learning_rate": 1.791835138050732e-05,
"loss": 0.0076,
"step": 904
},
{
"epoch": 1.7440963855421687,
"grad_norm": 0.11553912609815598,
"learning_rate": 1.7873752373972395e-05,
"loss": 0.0038,
"step": 905
},
{
"epoch": 1.746024096385542,
"grad_norm": 0.10724586248397827,
"learning_rate": 1.7829164058900398e-05,
"loss": 0.0043,
"step": 906
},
{
"epoch": 1.7479518072289157,
"grad_norm": 0.30152231454849243,
"learning_rate": 1.7784586659495845e-05,
"loss": 0.0099,
"step": 907
},
{
"epoch": 1.749879518072289,
"grad_norm": 0.18372933566570282,
"learning_rate": 1.7740020399908372e-05,
"loss": 0.0074,
"step": 908
},
{
"epoch": 1.7518072289156628,
"grad_norm": 0.35184428095817566,
"learning_rate": 1.7695465504231586e-05,
"loss": 0.0184,
"step": 909
},
{
"epoch": 1.753734939759036,
"grad_norm": 0.15083615481853485,
"learning_rate": 1.765092219650196e-05,
"loss": 0.0061,
"step": 910
},
{
"epoch": 1.7556626506024098,
"grad_norm": 0.2599961459636688,
"learning_rate": 1.7606390700697693e-05,
"loss": 0.0101,
"step": 911
},
{
"epoch": 1.757590361445783,
"grad_norm": 0.10829206556081772,
"learning_rate": 1.7561871240737595e-05,
"loss": 0.0034,
"step": 912
},
{
"epoch": 1.7595180722891566,
"grad_norm": 0.38098782300949097,
"learning_rate": 1.7517364040479966e-05,
"loss": 0.0384,
"step": 913
},
{
"epoch": 1.76144578313253,
"grad_norm": 0.14975085854530334,
"learning_rate": 1.7472869323721432e-05,
"loss": 0.0055,
"step": 914
},
{
"epoch": 1.7633734939759036,
"grad_norm": 0.4151444733142853,
"learning_rate": 1.742838731419588e-05,
"loss": 0.0307,
"step": 915
},
{
"epoch": 1.765301204819277,
"grad_norm": 0.22238481044769287,
"learning_rate": 1.738391823557328e-05,
"loss": 0.0059,
"step": 916
},
{
"epoch": 1.7672289156626506,
"grad_norm": 0.23386356234550476,
"learning_rate": 1.7339462311458587e-05,
"loss": 0.0113,
"step": 917
},
{
"epoch": 1.7691566265060241,
"grad_norm": 0.21911191940307617,
"learning_rate": 1.7295019765390618e-05,
"loss": 0.0071,
"step": 918
},
{
"epoch": 1.7710843373493976,
"grad_norm": 0.343159943819046,
"learning_rate": 1.7250590820840903e-05,
"loss": 0.0144,
"step": 919
},
{
"epoch": 1.7730120481927711,
"grad_norm": 0.32204556465148926,
"learning_rate": 1.720617570121259e-05,
"loss": 0.0131,
"step": 920
},
{
"epoch": 1.7749397590361444,
"grad_norm": 0.4105585515499115,
"learning_rate": 1.7161774629839328e-05,
"loss": 0.0148,
"step": 921
},
{
"epoch": 1.7768674698795182,
"grad_norm": 0.16380974650382996,
"learning_rate": 1.7117387829984093e-05,
"loss": 0.0066,
"step": 922
},
{
"epoch": 1.7787951807228914,
"grad_norm": 0.22920913994312286,
"learning_rate": 1.707301552483813e-05,
"loss": 0.0105,
"step": 923
},
{
"epoch": 1.7807228915662652,
"grad_norm": 0.2075149267911911,
"learning_rate": 1.7028657937519767e-05,
"loss": 0.0104,
"step": 924
},
{
"epoch": 1.7826506024096385,
"grad_norm": 0.44439977407455444,
"learning_rate": 1.6984315291073355e-05,
"loss": 0.0134,
"step": 925
},
{
"epoch": 1.7845783132530122,
"grad_norm": 0.24068203568458557,
"learning_rate": 1.6939987808468125e-05,
"loss": 0.0078,
"step": 926
},
{
"epoch": 1.7865060240963855,
"grad_norm": 0.34044349193573,
"learning_rate": 1.689567571259701e-05,
"loss": 0.0108,
"step": 927
},
{
"epoch": 1.788433734939759,
"grad_norm": 0.34082743525505066,
"learning_rate": 1.6851379226275624e-05,
"loss": 0.0266,
"step": 928
},
{
"epoch": 1.7903614457831325,
"grad_norm": 0.19490115344524384,
"learning_rate": 1.6807098572241075e-05,
"loss": 0.0109,
"step": 929
},
{
"epoch": 1.792289156626506,
"grad_norm": 0.16208237409591675,
"learning_rate": 1.6762833973150846e-05,
"loss": 0.0113,
"step": 930
},
{
"epoch": 1.7942168674698795,
"grad_norm": 0.35555699467658997,
"learning_rate": 1.671858565158172e-05,
"loss": 0.0196,
"step": 931
},
{
"epoch": 1.796144578313253,
"grad_norm": 0.1600857824087143,
"learning_rate": 1.6674353830028587e-05,
"loss": 0.0089,
"step": 932
},
{
"epoch": 1.7980722891566265,
"grad_norm": 0.1699574887752533,
"learning_rate": 1.663013873090342e-05,
"loss": 0.0074,
"step": 933
},
{
"epoch": 1.8,
"grad_norm": 0.2472933828830719,
"learning_rate": 1.6585940576534086e-05,
"loss": 0.0063,
"step": 934
},
{
"epoch": 1.8019277108433736,
"grad_norm": 0.23491555452346802,
"learning_rate": 1.654175958916323e-05,
"loss": 0.0101,
"step": 935
},
{
"epoch": 1.8038554216867468,
"grad_norm": 0.28635191917419434,
"learning_rate": 1.6497595990947195e-05,
"loss": 0.0131,
"step": 936
},
{
"epoch": 1.8057831325301206,
"grad_norm": 0.15400712192058563,
"learning_rate": 1.645345000395489e-05,
"loss": 0.0068,
"step": 937
},
{
"epoch": 1.8077108433734939,
"grad_norm": 0.18223172426223755,
"learning_rate": 1.6409321850166647e-05,
"loss": 0.0094,
"step": 938
},
{
"epoch": 1.8096385542168676,
"grad_norm": 0.2789457142353058,
"learning_rate": 1.636521175147316e-05,
"loss": 0.0202,
"step": 939
},
{
"epoch": 1.8115662650602409,
"grad_norm": 0.4267627000808716,
"learning_rate": 1.6321119929674297e-05,
"loss": 0.0176,
"step": 940
},
{
"epoch": 1.8134939759036146,
"grad_norm": 0.3021615445613861,
"learning_rate": 1.6277046606478056e-05,
"loss": 0.0085,
"step": 941
},
{
"epoch": 1.815421686746988,
"grad_norm": 0.3724934756755829,
"learning_rate": 1.6232992003499405e-05,
"loss": 0.0474,
"step": 942
},
{
"epoch": 1.8173493975903614,
"grad_norm": 0.20904326438903809,
"learning_rate": 1.6188956342259177e-05,
"loss": 0.0078,
"step": 943
},
{
"epoch": 1.819277108433735,
"grad_norm": 0.31168171763420105,
"learning_rate": 1.614493984418297e-05,
"loss": 0.0174,
"step": 944
},
{
"epoch": 1.8212048192771084,
"grad_norm": 0.21273556351661682,
"learning_rate": 1.6100942730600003e-05,
"loss": 0.0054,
"step": 945
},
{
"epoch": 1.823132530120482,
"grad_norm": 0.16991695761680603,
"learning_rate": 1.6056965222742055e-05,
"loss": 0.0063,
"step": 946
},
{
"epoch": 1.8250602409638554,
"grad_norm": 0.22762684524059296,
"learning_rate": 1.6013007541742303e-05,
"loss": 0.0234,
"step": 947
},
{
"epoch": 1.826987951807229,
"grad_norm": 0.20128795504570007,
"learning_rate": 1.596906990863422e-05,
"loss": 0.0095,
"step": 948
},
{
"epoch": 1.8289156626506025,
"grad_norm": 0.30772027373313904,
"learning_rate": 1.592515254435048e-05,
"loss": 0.0356,
"step": 949
},
{
"epoch": 1.830843373493976,
"grad_norm": 0.12954631447792053,
"learning_rate": 1.5881255669721857e-05,
"loss": 0.008,
"step": 950
},
{
"epoch": 1.8327710843373493,
"grad_norm": 0.7787145972251892,
"learning_rate": 1.5837379505476054e-05,
"loss": 0.0108,
"step": 951
},
{
"epoch": 1.834698795180723,
"grad_norm": 0.1683879941701889,
"learning_rate": 1.5793524272236683e-05,
"loss": 0.006,
"step": 952
},
{
"epoch": 1.8366265060240963,
"grad_norm": 0.16475361585617065,
"learning_rate": 1.5749690190522076e-05,
"loss": 0.0065,
"step": 953
},
{
"epoch": 1.83855421686747,
"grad_norm": 0.211905375123024,
"learning_rate": 1.5705877480744214e-05,
"loss": 0.0092,
"step": 954
},
{
"epoch": 1.8404819277108433,
"grad_norm": 0.23850117623806,
"learning_rate": 1.5662086363207628e-05,
"loss": 0.012,
"step": 955
},
{
"epoch": 1.842409638554217,
"grad_norm": 0.19100065529346466,
"learning_rate": 1.561831705810825e-05,
"loss": 0.0113,
"step": 956
},
{
"epoch": 1.8443373493975903,
"grad_norm": 0.3635985255241394,
"learning_rate": 1.557456978553236e-05,
"loss": 0.0168,
"step": 957
},
{
"epoch": 1.8462650602409638,
"grad_norm": 0.16449116170406342,
"learning_rate": 1.553084476545544e-05,
"loss": 0.0042,
"step": 958
},
{
"epoch": 1.8481927710843373,
"grad_norm": 0.566093385219574,
"learning_rate": 1.5487142217741062e-05,
"loss": 0.0145,
"step": 959
},
{
"epoch": 1.8501204819277108,
"grad_norm": 0.15960252285003662,
"learning_rate": 1.5443462362139834e-05,
"loss": 0.0059,
"step": 960
},
{
"epoch": 1.8520481927710843,
"grad_norm": 0.40773797035217285,
"learning_rate": 1.539980541828823e-05,
"loss": 0.0257,
"step": 961
},
{
"epoch": 1.8539759036144579,
"grad_norm": 0.4802496135234833,
"learning_rate": 1.5356171605707522e-05,
"loss": 0.0111,
"step": 962
},
{
"epoch": 1.8559036144578314,
"grad_norm": 0.15745794773101807,
"learning_rate": 1.5312561143802704e-05,
"loss": 0.0049,
"step": 963
},
{
"epoch": 1.8578313253012049,
"grad_norm": 0.15139251947402954,
"learning_rate": 1.5268974251861298e-05,
"loss": 0.0077,
"step": 964
},
{
"epoch": 1.8597590361445784,
"grad_norm": 0.2188841849565506,
"learning_rate": 1.5225411149052356e-05,
"loss": 0.017,
"step": 965
},
{
"epoch": 1.8616867469879517,
"grad_norm": 0.10853131115436554,
"learning_rate": 1.5181872054425287e-05,
"loss": 0.0049,
"step": 966
},
{
"epoch": 1.8636144578313254,
"grad_norm": 0.8254880905151367,
"learning_rate": 1.5138357186908785e-05,
"loss": 0.0317,
"step": 967
},
{
"epoch": 1.8655421686746987,
"grad_norm": 0.2989620566368103,
"learning_rate": 1.5094866765309728e-05,
"loss": 0.0126,
"step": 968
},
{
"epoch": 1.8674698795180724,
"grad_norm": 0.16411150991916656,
"learning_rate": 1.5051401008312054e-05,
"loss": 0.0101,
"step": 969
},
{
"epoch": 1.8693975903614457,
"grad_norm": 0.2861763834953308,
"learning_rate": 1.5007960134475706e-05,
"loss": 0.0155,
"step": 970
},
{
"epoch": 1.8713253012048194,
"grad_norm": 0.24879588186740875,
"learning_rate": 1.4964544362235487e-05,
"loss": 0.0187,
"step": 971
},
{
"epoch": 1.8732530120481927,
"grad_norm": 0.2433672398328781,
"learning_rate": 1.4921153909899983e-05,
"loss": 0.0084,
"step": 972
},
{
"epoch": 1.8751807228915662,
"grad_norm": 0.15097154676914215,
"learning_rate": 1.487778899565047e-05,
"loss": 0.007,
"step": 973
},
{
"epoch": 1.8771084337349397,
"grad_norm": 0.1629047691822052,
"learning_rate": 1.4834449837539806e-05,
"loss": 0.0058,
"step": 974
},
{
"epoch": 1.8790361445783132,
"grad_norm": 0.9937071204185486,
"learning_rate": 1.4791136653491333e-05,
"loss": 0.0323,
"step": 975
},
{
"epoch": 1.8809638554216868,
"grad_norm": 0.19555562734603882,
"learning_rate": 1.4747849661297808e-05,
"loss": 0.0126,
"step": 976
},
{
"epoch": 1.8828915662650603,
"grad_norm": 0.16147711873054504,
"learning_rate": 1.470458907862026e-05,
"loss": 0.0067,
"step": 977
},
{
"epoch": 1.8848192771084338,
"grad_norm": 0.2730027735233307,
"learning_rate": 1.4661355122986945e-05,
"loss": 0.0147,
"step": 978
},
{
"epoch": 1.886746987951807,
"grad_norm": 0.13759832084178925,
"learning_rate": 1.4618148011792206e-05,
"loss": 0.0038,
"step": 979
},
{
"epoch": 1.8886746987951808,
"grad_norm": 0.33516690135002136,
"learning_rate": 1.4574967962295419e-05,
"loss": 0.0139,
"step": 980
},
{
"epoch": 1.890602409638554,
"grad_norm": 0.2345741093158722,
"learning_rate": 1.4531815191619903e-05,
"loss": 0.0094,
"step": 981
},
{
"epoch": 1.8925301204819278,
"grad_norm": 0.14681044220924377,
"learning_rate": 1.4488689916751762e-05,
"loss": 0.0065,
"step": 982
},
{
"epoch": 1.894457831325301,
"grad_norm": 0.21143914759159088,
"learning_rate": 1.4445592354538885e-05,
"loss": 0.0057,
"step": 983
},
{
"epoch": 1.8963855421686748,
"grad_norm": 0.3109160363674164,
"learning_rate": 1.44025227216898e-05,
"loss": 0.0142,
"step": 984
},
{
"epoch": 1.8983132530120481,
"grad_norm": 0.24301907420158386,
"learning_rate": 1.435948123477259e-05,
"loss": 0.012,
"step": 985
},
{
"epoch": 1.9002409638554218,
"grad_norm": 0.19817675650119781,
"learning_rate": 1.431646811021382e-05,
"loss": 0.0097,
"step": 986
},
{
"epoch": 1.9021686746987951,
"grad_norm": 0.13464932143688202,
"learning_rate": 1.4273483564297425e-05,
"loss": 0.0046,
"step": 987
},
{
"epoch": 1.9040963855421686,
"grad_norm": 0.1698642522096634,
"learning_rate": 1.4230527813163656e-05,
"loss": 0.0038,
"step": 988
},
{
"epoch": 1.9060240963855422,
"grad_norm": 0.19395388662815094,
"learning_rate": 1.4187601072807975e-05,
"loss": 0.0123,
"step": 989
},
{
"epoch": 1.9079518072289157,
"grad_norm": 0.2093188613653183,
"learning_rate": 1.4144703559079948e-05,
"loss": 0.0093,
"step": 990
},
{
"epoch": 1.9098795180722892,
"grad_norm": 0.1529311090707779,
"learning_rate": 1.4101835487682198e-05,
"loss": 0.0051,
"step": 991
},
{
"epoch": 1.9118072289156627,
"grad_norm": 0.18725350499153137,
"learning_rate": 1.4058997074169299e-05,
"loss": 0.0083,
"step": 992
},
{
"epoch": 1.9137349397590362,
"grad_norm": 0.15601560473442078,
"learning_rate": 1.401618853394668e-05,
"loss": 0.0086,
"step": 993
},
{
"epoch": 1.9156626506024095,
"grad_norm": 0.23890644311904907,
"learning_rate": 1.3973410082269591e-05,
"loss": 0.015,
"step": 994
},
{
"epoch": 1.9175903614457832,
"grad_norm": 0.2442619949579239,
"learning_rate": 1.3930661934241947e-05,
"loss": 0.0089,
"step": 995
},
{
"epoch": 1.9195180722891565,
"grad_norm": 0.1540212482213974,
"learning_rate": 1.388794430481532e-05,
"loss": 0.0072,
"step": 996
},
{
"epoch": 1.9214457831325302,
"grad_norm": 0.1359291970729828,
"learning_rate": 1.3845257408787807e-05,
"loss": 0.0131,
"step": 997
},
{
"epoch": 1.9233734939759035,
"grad_norm": 0.25486138463020325,
"learning_rate": 1.3802601460802967e-05,
"loss": 0.0198,
"step": 998
},
{
"epoch": 1.9253012048192772,
"grad_norm": 0.28815609216690063,
"learning_rate": 1.3759976675348754e-05,
"loss": 0.014,
"step": 999
},
{
"epoch": 1.9272289156626505,
"grad_norm": 0.15648497641086578,
"learning_rate": 1.3717383266756403e-05,
"loss": 0.0065,
"step": 1000
},
{
"epoch": 1.929156626506024,
"grad_norm": 0.16912540793418884,
"learning_rate": 1.367482144919941e-05,
"loss": 0.0059,
"step": 1001
},
{
"epoch": 1.9310843373493976,
"grad_norm": 0.16896723210811615,
"learning_rate": 1.3632291436692397e-05,
"loss": 0.0054,
"step": 1002
},
{
"epoch": 1.933012048192771,
"grad_norm": 0.20287497341632843,
"learning_rate": 1.3589793443090064e-05,
"loss": 0.0097,
"step": 1003
},
{
"epoch": 1.9349397590361446,
"grad_norm": 0.14804276823997498,
"learning_rate": 1.3547327682086114e-05,
"loss": 0.0125,
"step": 1004
},
{
"epoch": 1.936867469879518,
"grad_norm": 0.23820064961910248,
"learning_rate": 1.3504894367212171e-05,
"loss": 0.0131,
"step": 1005
},
{
"epoch": 1.9387951807228916,
"grad_norm": 0.25607362389564514,
"learning_rate": 1.34624937118367e-05,
"loss": 0.0115,
"step": 1006
},
{
"epoch": 1.940722891566265,
"grad_norm": 0.37233737111091614,
"learning_rate": 1.3420125929163976e-05,
"loss": 0.0309,
"step": 1007
},
{
"epoch": 1.9426506024096386,
"grad_norm": 0.19426730275154114,
"learning_rate": 1.3377791232232929e-05,
"loss": 0.0078,
"step": 1008
},
{
"epoch": 1.944578313253012,
"grad_norm": 0.2784160077571869,
"learning_rate": 1.333548983391617e-05,
"loss": 0.0142,
"step": 1009
},
{
"epoch": 1.9465060240963856,
"grad_norm": 0.11407195776700974,
"learning_rate": 1.3293221946918853e-05,
"loss": 0.0035,
"step": 1010
},
{
"epoch": 1.948433734939759,
"grad_norm": 0.3965436816215515,
"learning_rate": 1.325098778377762e-05,
"loss": 0.0242,
"step": 1011
},
{
"epoch": 1.9503614457831326,
"grad_norm": 0.18520519137382507,
"learning_rate": 1.3208787556859543e-05,
"loss": 0.0096,
"step": 1012
},
{
"epoch": 1.952289156626506,
"grad_norm": 0.2783315181732178,
"learning_rate": 1.3166621478361075e-05,
"loss": 0.0103,
"step": 1013
},
{
"epoch": 1.9542168674698797,
"grad_norm": 0.22714459896087646,
"learning_rate": 1.3124489760306917e-05,
"loss": 0.0078,
"step": 1014
},
{
"epoch": 1.956144578313253,
"grad_norm": 0.1257915049791336,
"learning_rate": 1.3082392614549036e-05,
"loss": 0.0077,
"step": 1015
},
{
"epoch": 1.9580722891566265,
"grad_norm": 0.15592887997627258,
"learning_rate": 1.3040330252765526e-05,
"loss": 0.0106,
"step": 1016
},
{
"epoch": 1.96,
"grad_norm": 0.19295449554920197,
"learning_rate": 1.2998302886459586e-05,
"loss": 0.0082,
"step": 1017
},
{
"epoch": 1.9619277108433735,
"grad_norm": 0.15544794499874115,
"learning_rate": 1.2956310726958472e-05,
"loss": 0.0068,
"step": 1018
},
{
"epoch": 1.963855421686747,
"grad_norm": 0.25899502635002136,
"learning_rate": 1.291435398541236e-05,
"loss": 0.0086,
"step": 1019
},
{
"epoch": 1.9657831325301205,
"grad_norm": 0.34639033675193787,
"learning_rate": 1.2872432872793379e-05,
"loss": 0.0116,
"step": 1020
},
{
"epoch": 1.967710843373494,
"grad_norm": 0.1628410518169403,
"learning_rate": 1.283054759989447e-05,
"loss": 0.0055,
"step": 1021
},
{
"epoch": 1.9696385542168675,
"grad_norm": 0.9273788928985596,
"learning_rate": 1.2788698377328385e-05,
"loss": 0.0264,
"step": 1022
},
{
"epoch": 1.971566265060241,
"grad_norm": 0.163126140832901,
"learning_rate": 1.2746885415526594e-05,
"loss": 0.0046,
"step": 1023
},
{
"epoch": 1.9734939759036143,
"grad_norm": 0.1475439816713333,
"learning_rate": 1.2705108924738223e-05,
"loss": 0.0056,
"step": 1024
},
{
"epoch": 1.975421686746988,
"grad_norm": 0.1654318869113922,
"learning_rate": 1.2663369115029034e-05,
"loss": 0.0056,
"step": 1025
},
{
"epoch": 1.9773493975903613,
"grad_norm": 0.20536045730113983,
"learning_rate": 1.2621666196280333e-05,
"loss": 0.0101,
"step": 1026
},
{
"epoch": 1.979277108433735,
"grad_norm": 0.19256474077701569,
"learning_rate": 1.258000037818792e-05,
"loss": 0.0059,
"step": 1027
},
{
"epoch": 1.9812048192771083,
"grad_norm": 0.2605120539665222,
"learning_rate": 1.2538371870261053e-05,
"loss": 0.0115,
"step": 1028
},
{
"epoch": 1.983132530120482,
"grad_norm": 0.14840295910835266,
"learning_rate": 1.249678088182137e-05,
"loss": 0.0046,
"step": 1029
},
{
"epoch": 1.9850602409638554,
"grad_norm": 0.17585207521915436,
"learning_rate": 1.2455227622001851e-05,
"loss": 0.0086,
"step": 1030
},
{
"epoch": 1.9869879518072289,
"grad_norm": 0.11044781655073166,
"learning_rate": 1.241371229974579e-05,
"loss": 0.0034,
"step": 1031
},
{
"epoch": 1.9889156626506024,
"grad_norm": 0.25584840774536133,
"learning_rate": 1.2372235123805672e-05,
"loss": 0.0245,
"step": 1032
},
{
"epoch": 1.9908433734939759,
"grad_norm": 0.25962474942207336,
"learning_rate": 1.2330796302742211e-05,
"loss": 0.0104,
"step": 1033
},
{
"epoch": 1.9927710843373494,
"grad_norm": 0.33408522605895996,
"learning_rate": 1.2289396044923238e-05,
"loss": 0.0176,
"step": 1034
},
{
"epoch": 1.994698795180723,
"grad_norm": 0.479950487613678,
"learning_rate": 1.2248034558522682e-05,
"loss": 0.0113,
"step": 1035
},
{
"epoch": 1.9966265060240964,
"grad_norm": 0.16567294299602509,
"learning_rate": 1.2206712051519518e-05,
"loss": 0.0036,
"step": 1036
},
{
"epoch": 1.99855421686747,
"grad_norm": 0.19343771040439606,
"learning_rate": 1.2165428731696713e-05,
"loss": 0.0077,
"step": 1037
},
{
"epoch": 2.0,
"grad_norm": 0.22895601391792297,
"learning_rate": 1.2124184806640202e-05,
"loss": 0.0114,
"step": 1038
},
{
"epoch": 2.0019277108433733,
"grad_norm": 0.15838384628295898,
"learning_rate": 1.208298048373782e-05,
"loss": 0.0043,
"step": 1039
},
{
"epoch": 2.003855421686747,
"grad_norm": 0.681065559387207,
"learning_rate": 1.2041815970178268e-05,
"loss": 0.0214,
"step": 1040
},
{
"epoch": 2.0057831325301203,
"grad_norm": 0.3357350528240204,
"learning_rate": 1.2000691472950081e-05,
"loss": 0.0079,
"step": 1041
},
{
"epoch": 2.007710843373494,
"grad_norm": 0.15238308906555176,
"learning_rate": 1.1959607198840568e-05,
"loss": 0.0041,
"step": 1042
},
{
"epoch": 2.0096385542168673,
"grad_norm": 0.11763229966163635,
"learning_rate": 1.1918563354434784e-05,
"loss": 0.0033,
"step": 1043
},
{
"epoch": 2.011566265060241,
"grad_norm": 0.3759301006793976,
"learning_rate": 1.1877560146114515e-05,
"loss": 0.0128,
"step": 1044
},
{
"epoch": 2.0134939759036143,
"grad_norm": 0.1143188625574112,
"learning_rate": 1.1836597780057183e-05,
"loss": 0.0078,
"step": 1045
},
{
"epoch": 2.015421686746988,
"grad_norm": 0.20059260725975037,
"learning_rate": 1.179567646223485e-05,
"loss": 0.0149,
"step": 1046
},
{
"epoch": 2.0173493975903614,
"grad_norm": 0.15569567680358887,
"learning_rate": 1.1754796398413196e-05,
"loss": 0.0038,
"step": 1047
},
{
"epoch": 2.019277108433735,
"grad_norm": 0.1153278723359108,
"learning_rate": 1.1713957794150423e-05,
"loss": 0.0041,
"step": 1048
},
{
"epoch": 2.0212048192771084,
"grad_norm": 0.1838717758655548,
"learning_rate": 1.1673160854796307e-05,
"loss": 0.0041,
"step": 1049
},
{
"epoch": 2.023132530120482,
"grad_norm": 0.12264502793550491,
"learning_rate": 1.1632405785491077e-05,
"loss": 0.0043,
"step": 1050
},
{
"epoch": 2.0250602409638554,
"grad_norm": 0.14363229274749756,
"learning_rate": 1.159169279116445e-05,
"loss": 0.0066,
"step": 1051
},
{
"epoch": 2.026987951807229,
"grad_norm": 0.1316995471715927,
"learning_rate": 1.1551022076534585e-05,
"loss": 0.0024,
"step": 1052
},
{
"epoch": 2.0289156626506024,
"grad_norm": 0.13392619788646698,
"learning_rate": 1.1510393846107001e-05,
"loss": 0.0051,
"step": 1053
},
{
"epoch": 2.0308433734939757,
"grad_norm": 3.0086817741394043,
"learning_rate": 1.1469808304173658e-05,
"loss": 0.0334,
"step": 1054
},
{
"epoch": 2.0327710843373494,
"grad_norm": 0.17756076157093048,
"learning_rate": 1.1429265654811803e-05,
"loss": 0.0068,
"step": 1055
},
{
"epoch": 2.0346987951807227,
"grad_norm": 0.13250532746315002,
"learning_rate": 1.1388766101883038e-05,
"loss": 0.0087,
"step": 1056
},
{
"epoch": 2.0366265060240965,
"grad_norm": 0.3534089922904968,
"learning_rate": 1.1348309849032257e-05,
"loss": 0.0076,
"step": 1057
},
{
"epoch": 2.0385542168674697,
"grad_norm": 0.11939049512147903,
"learning_rate": 1.1307897099686627e-05,
"loss": 0.0029,
"step": 1058
},
{
"epoch": 2.0404819277108435,
"grad_norm": 0.11862517893314362,
"learning_rate": 1.1267528057054562e-05,
"loss": 0.0062,
"step": 1059
},
{
"epoch": 2.0424096385542168,
"grad_norm": 0.1539212018251419,
"learning_rate": 1.1227202924124704e-05,
"loss": 0.0067,
"step": 1060
},
{
"epoch": 2.0443373493975905,
"grad_norm": 0.17163440585136414,
"learning_rate": 1.118692190366491e-05,
"loss": 0.0055,
"step": 1061
},
{
"epoch": 2.0462650602409638,
"grad_norm": 0.12304897606372833,
"learning_rate": 1.1146685198221222e-05,
"loss": 0.0036,
"step": 1062
},
{
"epoch": 2.0481927710843375,
"grad_norm": 0.17319051921367645,
"learning_rate": 1.1106493010116842e-05,
"loss": 0.0058,
"step": 1063
},
{
"epoch": 2.050120481927711,
"grad_norm": 0.2242443859577179,
"learning_rate": 1.1066345541451127e-05,
"loss": 0.0059,
"step": 1064
},
{
"epoch": 2.0520481927710845,
"grad_norm": 0.09533938020467758,
"learning_rate": 1.1026242994098597e-05,
"loss": 0.0033,
"step": 1065
},
{
"epoch": 2.053975903614458,
"grad_norm": 0.11697929352521896,
"learning_rate": 1.0986185569707852e-05,
"loss": 0.0038,
"step": 1066
},
{
"epoch": 2.0559036144578315,
"grad_norm": 0.2563149333000183,
"learning_rate": 1.0946173469700625e-05,
"loss": 0.0158,
"step": 1067
},
{
"epoch": 2.057831325301205,
"grad_norm": 0.21836932003498077,
"learning_rate": 1.0906206895270739e-05,
"loss": 0.0085,
"step": 1068
},
{
"epoch": 2.059759036144578,
"grad_norm": 0.1798071414232254,
"learning_rate": 1.0866286047383094e-05,
"loss": 0.0053,
"step": 1069
},
{
"epoch": 2.061686746987952,
"grad_norm": 0.08937730640172958,
"learning_rate": 1.0826411126772675e-05,
"loss": 0.0025,
"step": 1070
},
{
"epoch": 2.063614457831325,
"grad_norm": 0.0942138060927391,
"learning_rate": 1.0786582333943499e-05,
"loss": 0.0017,
"step": 1071
},
{
"epoch": 2.065542168674699,
"grad_norm": 0.13076582551002502,
"learning_rate": 1.0746799869167679e-05,
"loss": 0.0033,
"step": 1072
},
{
"epoch": 2.067469879518072,
"grad_norm": 0.0993233174085617,
"learning_rate": 1.0707063932484357e-05,
"loss": 0.0046,
"step": 1073
},
{
"epoch": 2.069397590361446,
"grad_norm": 0.3046741485595703,
"learning_rate": 1.0667374723698698e-05,
"loss": 0.009,
"step": 1074
},
{
"epoch": 2.071325301204819,
"grad_norm": 0.12197669595479965,
"learning_rate": 1.0627732442380932e-05,
"loss": 0.0034,
"step": 1075
},
{
"epoch": 2.073253012048193,
"grad_norm": 0.12721140682697296,
"learning_rate": 1.058813728786531e-05,
"loss": 0.0048,
"step": 1076
},
{
"epoch": 2.075180722891566,
"grad_norm": 0.10011966526508331,
"learning_rate": 1.0548589459249112e-05,
"loss": 0.0026,
"step": 1077
},
{
"epoch": 2.07710843373494,
"grad_norm": 0.3314201831817627,
"learning_rate": 1.0509089155391661e-05,
"loss": 0.0284,
"step": 1078
},
{
"epoch": 2.079036144578313,
"grad_norm": 0.32739701867103577,
"learning_rate": 1.0469636574913288e-05,
"loss": 0.0088,
"step": 1079
},
{
"epoch": 2.080963855421687,
"grad_norm": 0.13805675506591797,
"learning_rate": 1.043023191619438e-05,
"loss": 0.0042,
"step": 1080
},
{
"epoch": 2.0828915662650602,
"grad_norm": 0.14789745211601257,
"learning_rate": 1.039087537737435e-05,
"loss": 0.0037,
"step": 1081
},
{
"epoch": 2.0848192771084335,
"grad_norm": 0.15518991649150848,
"learning_rate": 1.0351567156350617e-05,
"loss": 0.0044,
"step": 1082
},
{
"epoch": 2.0867469879518072,
"grad_norm": 0.08380113542079926,
"learning_rate": 1.0312307450777706e-05,
"loss": 0.0019,
"step": 1083
},
{
"epoch": 2.0886746987951805,
"grad_norm": 0.17892400920391083,
"learning_rate": 1.027309645806613e-05,
"loss": 0.0065,
"step": 1084
},
{
"epoch": 2.0906024096385543,
"grad_norm": 0.5497608780860901,
"learning_rate": 1.0233934375381489e-05,
"loss": 0.0238,
"step": 1085
},
{
"epoch": 2.0925301204819275,
"grad_norm": 1.0189186334609985,
"learning_rate": 1.019482139964344e-05,
"loss": 0.0092,
"step": 1086
},
{
"epoch": 2.0944578313253013,
"grad_norm": 0.12144117057323456,
"learning_rate": 1.015575772752472e-05,
"loss": 0.0038,
"step": 1087
},
{
"epoch": 2.0963855421686746,
"grad_norm": 0.1115315854549408,
"learning_rate": 1.0116743555450148e-05,
"loss": 0.0024,
"step": 1088
},
{
"epoch": 2.0983132530120483,
"grad_norm": 0.22671759128570557,
"learning_rate": 1.0077779079595631e-05,
"loss": 0.0136,
"step": 1089
},
{
"epoch": 2.1002409638554216,
"grad_norm": 2.0009827613830566,
"learning_rate": 1.003886449588719e-05,
"loss": 0.0493,
"step": 1090
},
{
"epoch": 2.1021686746987953,
"grad_norm": 0.11907301843166351,
"learning_rate": 1.0000000000000006e-05,
"loss": 0.0034,
"step": 1091
},
{
"epoch": 2.1040963855421686,
"grad_norm": 0.31257638335227966,
"learning_rate": 9.961185787357346e-06,
"loss": 0.0129,
"step": 1092
},
{
"epoch": 2.1060240963855423,
"grad_norm": 0.11033743619918823,
"learning_rate": 9.922422053129674e-06,
"loss": 0.0184,
"step": 1093
},
{
"epoch": 2.1079518072289156,
"grad_norm": 0.2575698494911194,
"learning_rate": 9.883708992233626e-06,
"loss": 0.0054,
"step": 1094
},
{
"epoch": 2.1098795180722894,
"grad_norm": 0.12921132147312164,
"learning_rate": 9.845046799331029e-06,
"loss": 0.0037,
"step": 1095
},
{
"epoch": 2.1118072289156626,
"grad_norm": 0.21405921876430511,
"learning_rate": 9.806435668827941e-06,
"loss": 0.006,
"step": 1096
},
{
"epoch": 2.113734939759036,
"grad_norm": 0.12929430603981018,
"learning_rate": 9.76787579487363e-06,
"loss": 0.0049,
"step": 1097
},
{
"epoch": 2.1156626506024097,
"grad_norm": 0.1793181151151657,
"learning_rate": 9.729367371359681e-06,
"loss": 0.0086,
"step": 1098
},
{
"epoch": 2.117590361445783,
"grad_norm": 0.2182074338197708,
"learning_rate": 9.690910591918936e-06,
"loss": 0.0106,
"step": 1099
},
{
"epoch": 2.1195180722891567,
"grad_norm": 0.0705680400133133,
"learning_rate": 9.652505649924547e-06,
"loss": 0.0012,
"step": 1100
},
{
"epoch": 2.12144578313253,
"grad_norm": 0.10509738326072693,
"learning_rate": 9.614152738489021e-06,
"loss": 0.0048,
"step": 1101
},
{
"epoch": 2.1233734939759037,
"grad_norm": 0.13775436580181122,
"learning_rate": 9.575852050463268e-06,
"loss": 0.0089,
"step": 1102
},
{
"epoch": 2.125301204819277,
"grad_norm": 0.15230101346969604,
"learning_rate": 9.537603778435545e-06,
"loss": 0.0065,
"step": 1103
},
{
"epoch": 2.1272289156626507,
"grad_norm": 0.24702346324920654,
"learning_rate": 9.499408114730583e-06,
"loss": 0.016,
"step": 1104
},
{
"epoch": 2.129156626506024,
"grad_norm": 0.1082577034831047,
"learning_rate": 9.461265251408575e-06,
"loss": 0.0036,
"step": 1105
},
{
"epoch": 2.1310843373493977,
"grad_norm": 0.1063847690820694,
"learning_rate": 9.423175380264211e-06,
"loss": 0.0037,
"step": 1106
},
{
"epoch": 2.133012048192771,
"grad_norm": 0.07686953246593475,
"learning_rate": 9.385138692825729e-06,
"loss": 0.0031,
"step": 1107
},
{
"epoch": 2.1349397590361447,
"grad_norm": 0.2046380341053009,
"learning_rate": 9.347155380353912e-06,
"loss": 0.0087,
"step": 1108
},
{
"epoch": 2.136867469879518,
"grad_norm": 0.1341692954301834,
"learning_rate": 9.30922563384121e-06,
"loss": 0.0045,
"step": 1109
},
{
"epoch": 2.1387951807228918,
"grad_norm": 0.09870535880327225,
"learning_rate": 9.271349644010672e-06,
"loss": 0.003,
"step": 1110
},
{
"epoch": 2.140722891566265,
"grad_norm": 0.18708615005016327,
"learning_rate": 9.233527601315069e-06,
"loss": 0.0042,
"step": 1111
},
{
"epoch": 2.1426506024096383,
"grad_norm": 0.5175634026527405,
"learning_rate": 9.195759695935907e-06,
"loss": 0.0173,
"step": 1112
},
{
"epoch": 2.144578313253012,
"grad_norm": 0.14939036965370178,
"learning_rate": 9.158046117782464e-06,
"loss": 0.0031,
"step": 1113
},
{
"epoch": 2.1465060240963854,
"grad_norm": 0.2837410569190979,
"learning_rate": 9.120387056490851e-06,
"loss": 0.0097,
"step": 1114
},
{
"epoch": 2.148433734939759,
"grad_norm": 0.11088677495718002,
"learning_rate": 9.082782701423047e-06,
"loss": 0.0026,
"step": 1115
},
{
"epoch": 2.1503614457831324,
"grad_norm": 0.07785166054964066,
"learning_rate": 9.045233241665947e-06,
"loss": 0.0019,
"step": 1116
},
{
"epoch": 2.152289156626506,
"grad_norm": 0.17568141222000122,
"learning_rate": 9.007738866030427e-06,
"loss": 0.0039,
"step": 1117
},
{
"epoch": 2.1542168674698794,
"grad_norm": 0.12652266025543213,
"learning_rate": 8.970299763050356e-06,
"loss": 0.0033,
"step": 1118
},
{
"epoch": 2.156144578313253,
"grad_norm": 0.16801467537879944,
"learning_rate": 8.932916120981695e-06,
"loss": 0.0076,
"step": 1119
},
{
"epoch": 2.1580722891566264,
"grad_norm": 0.18313169479370117,
"learning_rate": 8.895588127801545e-06,
"loss": 0.0052,
"step": 1120
},
{
"epoch": 2.16,
"grad_norm": 0.07546049356460571,
"learning_rate": 8.858315971207146e-06,
"loss": 0.0022,
"step": 1121
},
{
"epoch": 2.1619277108433734,
"grad_norm": 0.4039839208126068,
"learning_rate": 8.821099838614996e-06,
"loss": 0.0203,
"step": 1122
},
{
"epoch": 2.163855421686747,
"grad_norm": 0.09244243055582047,
"learning_rate": 8.783939917159897e-06,
"loss": 0.002,
"step": 1123
},
{
"epoch": 2.1657831325301204,
"grad_norm": 0.18327835202217102,
"learning_rate": 8.746836393693978e-06,
"loss": 0.0055,
"step": 1124
},
{
"epoch": 2.167710843373494,
"grad_norm": 0.22010307013988495,
"learning_rate": 8.709789454785809e-06,
"loss": 0.0077,
"step": 1125
},
{
"epoch": 2.1696385542168675,
"grad_norm": 0.09438297897577286,
"learning_rate": 8.67279928671939e-06,
"loss": 0.0032,
"step": 1126
},
{
"epoch": 2.1715662650602408,
"grad_norm": 0.20782770216464996,
"learning_rate": 8.635866075493318e-06,
"loss": 0.0028,
"step": 1127
},
{
"epoch": 2.1734939759036145,
"grad_norm": 0.1958685964345932,
"learning_rate": 8.598990006819756e-06,
"loss": 0.0047,
"step": 1128
},
{
"epoch": 2.1754216867469878,
"grad_norm": 0.06459935009479523,
"learning_rate": 8.562171266123528e-06,
"loss": 0.0015,
"step": 1129
},
{
"epoch": 2.1773493975903615,
"grad_norm": 0.33486708998680115,
"learning_rate": 8.525410038541218e-06,
"loss": 0.0094,
"step": 1130
},
{
"epoch": 2.179277108433735,
"grad_norm": 0.5755940079689026,
"learning_rate": 8.488706508920202e-06,
"loss": 0.0067,
"step": 1131
},
{
"epoch": 2.1812048192771085,
"grad_norm": 0.10840924829244614,
"learning_rate": 8.452060861817738e-06,
"loss": 0.0082,
"step": 1132
},
{
"epoch": 2.183132530120482,
"grad_norm": 0.18611350655555725,
"learning_rate": 8.415473281500037e-06,
"loss": 0.0059,
"step": 1133
},
{
"epoch": 2.1850602409638555,
"grad_norm": 0.11245249956846237,
"learning_rate": 8.378943951941301e-06,
"loss": 0.0107,
"step": 1134
},
{
"epoch": 2.186987951807229,
"grad_norm": 0.12284426391124725,
"learning_rate": 8.342473056822873e-06,
"loss": 0.0025,
"step": 1135
},
{
"epoch": 2.1889156626506026,
"grad_norm": 0.12542888522148132,
"learning_rate": 8.306060779532245e-06,
"loss": 0.0059,
"step": 1136
},
{
"epoch": 2.190843373493976,
"grad_norm": 0.1287655532360077,
"learning_rate": 8.26970730316215e-06,
"loss": 0.0022,
"step": 1137
},
{
"epoch": 2.1927710843373496,
"grad_norm": 0.1818632185459137,
"learning_rate": 8.233412810509669e-06,
"loss": 0.0131,
"step": 1138
},
{
"epoch": 2.194698795180723,
"grad_norm": 0.09687745571136475,
"learning_rate": 8.197177484075284e-06,
"loss": 0.0025,
"step": 1139
},
{
"epoch": 2.1966265060240966,
"grad_norm": 0.16103452444076538,
"learning_rate": 8.161001506061979e-06,
"loss": 0.0031,
"step": 1140
},
{
"epoch": 2.19855421686747,
"grad_norm": 0.2711680233478546,
"learning_rate": 8.124885058374302e-06,
"loss": 0.0034,
"step": 1141
},
{
"epoch": 2.200481927710843,
"grad_norm": 0.17613105475902557,
"learning_rate": 8.088828322617473e-06,
"loss": 0.0044,
"step": 1142
},
{
"epoch": 2.202409638554217,
"grad_norm": 0.2298487424850464,
"learning_rate": 8.052831480096464e-06,
"loss": 0.0168,
"step": 1143
},
{
"epoch": 2.20433734939759,
"grad_norm": 0.17042206227779388,
"learning_rate": 8.016894711815067e-06,
"loss": 0.007,
"step": 1144
},
{
"epoch": 2.206265060240964,
"grad_norm": 0.2830466628074646,
"learning_rate": 7.98101819847501e-06,
"loss": 0.0091,
"step": 1145
},
{
"epoch": 2.208192771084337,
"grad_norm": 0.22089065611362457,
"learning_rate": 7.945202120475063e-06,
"loss": 0.0046,
"step": 1146
},
{
"epoch": 2.210120481927711,
"grad_norm": 0.1716073900461197,
"learning_rate": 7.909446657910072e-06,
"loss": 0.0032,
"step": 1147
},
{
"epoch": 2.212048192771084,
"grad_norm": 0.16140373051166534,
"learning_rate": 7.873751990570104e-06,
"loss": 0.0057,
"step": 1148
},
{
"epoch": 2.213975903614458,
"grad_norm": 0.1671605408191681,
"learning_rate": 7.838118297939529e-06,
"loss": 0.0039,
"step": 1149
},
{
"epoch": 2.2159036144578312,
"grad_norm": 0.10933005809783936,
"learning_rate": 7.802545759196117e-06,
"loss": 0.005,
"step": 1150
},
{
"epoch": 2.217831325301205,
"grad_norm": 0.07819998264312744,
"learning_rate": 7.76703455321014e-06,
"loss": 0.0025,
"step": 1151
},
{
"epoch": 2.2197590361445783,
"grad_norm": 0.36211854219436646,
"learning_rate": 7.73158485854344e-06,
"loss": 0.0151,
"step": 1152
},
{
"epoch": 2.221686746987952,
"grad_norm": 0.09098304808139801,
"learning_rate": 7.696196853448612e-06,
"loss": 0.0027,
"step": 1153
},
{
"epoch": 2.2236144578313253,
"grad_norm": 0.17442144453525543,
"learning_rate": 7.660870715868018e-06,
"loss": 0.006,
"step": 1154
},
{
"epoch": 2.225542168674699,
"grad_norm": 0.09785338491201401,
"learning_rate": 7.625606623432933e-06,
"loss": 0.0041,
"step": 1155
},
{
"epoch": 2.2274698795180723,
"grad_norm": 0.19399888813495636,
"learning_rate": 7.590404753462653e-06,
"loss": 0.0125,
"step": 1156
},
{
"epoch": 2.2293975903614456,
"grad_norm": 0.11080623418092728,
"learning_rate": 7.55526528296362e-06,
"loss": 0.0022,
"step": 1157
},
{
"epoch": 2.2313253012048193,
"grad_norm": 0.14067359268665314,
"learning_rate": 7.520188388628473e-06,
"loss": 0.0123,
"step": 1158
},
{
"epoch": 2.2332530120481926,
"grad_norm": 0.14533625543117523,
"learning_rate": 7.485174246835227e-06,
"loss": 0.0039,
"step": 1159
},
{
"epoch": 2.2351807228915663,
"grad_norm": 0.1253812462091446,
"learning_rate": 7.4502230336463466e-06,
"loss": 0.003,
"step": 1160
},
{
"epoch": 2.2371084337349396,
"grad_norm": 0.12766572833061218,
"learning_rate": 7.415334924807869e-06,
"loss": 0.0044,
"step": 1161
},
{
"epoch": 2.2390361445783133,
"grad_norm": 0.11985791474580765,
"learning_rate": 7.380510095748535e-06,
"loss": 0.0071,
"step": 1162
},
{
"epoch": 2.2409638554216866,
"grad_norm": 0.15505346655845642,
"learning_rate": 7.3457487215788605e-06,
"loss": 0.0046,
"step": 1163
},
{
"epoch": 2.2428915662650604,
"grad_norm": 0.18983210623264313,
"learning_rate": 7.311050977090343e-06,
"loss": 0.0079,
"step": 1164
},
{
"epoch": 2.2448192771084337,
"grad_norm": 0.19279207289218903,
"learning_rate": 7.276417036754479e-06,
"loss": 0.0042,
"step": 1165
},
{
"epoch": 2.2467469879518074,
"grad_norm": 0.21539707481861115,
"learning_rate": 7.241847074721964e-06,
"loss": 0.0087,
"step": 1166
},
{
"epoch": 2.2486746987951807,
"grad_norm": 0.07004354894161224,
"learning_rate": 7.207341264821783e-06,
"loss": 0.002,
"step": 1167
},
{
"epoch": 2.2506024096385544,
"grad_norm": 0.2203039526939392,
"learning_rate": 7.172899780560345e-06,
"loss": 0.0069,
"step": 1168
},
{
"epoch": 2.2525301204819277,
"grad_norm": 0.12474718689918518,
"learning_rate": 7.138522795120606e-06,
"loss": 0.0122,
"step": 1169
},
{
"epoch": 2.2544578313253014,
"grad_norm": 0.09078995883464813,
"learning_rate": 7.104210481361204e-06,
"loss": 0.0025,
"step": 1170
},
{
"epoch": 2.2563855421686747,
"grad_norm": 0.141757071018219,
"learning_rate": 7.069963011815584e-06,
"loss": 0.0039,
"step": 1171
},
{
"epoch": 2.258313253012048,
"grad_norm": 0.14944659173488617,
"learning_rate": 7.035780558691141e-06,
"loss": 0.0025,
"step": 1172
},
{
"epoch": 2.2602409638554217,
"grad_norm": 0.06723666191101074,
"learning_rate": 7.001663293868328e-06,
"loss": 0.0014,
"step": 1173
},
{
"epoch": 2.262168674698795,
"grad_norm": 0.11966485530138016,
"learning_rate": 6.967611388899826e-06,
"loss": 0.0067,
"step": 1174
},
{
"epoch": 2.2640963855421687,
"grad_norm": 0.08943185210227966,
"learning_rate": 6.933625015009666e-06,
"loss": 0.0036,
"step": 1175
},
{
"epoch": 2.266024096385542,
"grad_norm": 0.04511453956365585,
"learning_rate": 6.899704343092359e-06,
"loss": 0.0014,
"step": 1176
},
{
"epoch": 2.2679518072289158,
"grad_norm": 0.1867951601743698,
"learning_rate": 6.865849543712058e-06,
"loss": 0.009,
"step": 1177
},
{
"epoch": 2.269879518072289,
"grad_norm": 0.23791250586509705,
"learning_rate": 6.832060787101658e-06,
"loss": 0.0117,
"step": 1178
},
{
"epoch": 2.271807228915663,
"grad_norm": 0.13210316002368927,
"learning_rate": 6.798338243162008e-06,
"loss": 0.0024,
"step": 1179
},
{
"epoch": 2.273734939759036,
"grad_norm": 0.1601375937461853,
"learning_rate": 6.764682081461002e-06,
"loss": 0.013,
"step": 1180
},
{
"epoch": 2.27566265060241,
"grad_norm": 0.21996766328811646,
"learning_rate": 6.73109247123273e-06,
"loss": 0.0074,
"step": 1181
},
{
"epoch": 2.277590361445783,
"grad_norm": 0.15780030190944672,
"learning_rate": 6.6975695813766465e-06,
"loss": 0.0052,
"step": 1182
},
{
"epoch": 2.279518072289157,
"grad_norm": 0.18146437406539917,
"learning_rate": 6.664113580456739e-06,
"loss": 0.0265,
"step": 1183
},
{
"epoch": 2.28144578313253,
"grad_norm": 0.12033495306968689,
"learning_rate": 6.630724636700618e-06,
"loss": 0.0026,
"step": 1184
},
{
"epoch": 2.283373493975904,
"grad_norm": 0.25268155336380005,
"learning_rate": 6.59740291799873e-06,
"loss": 0.0046,
"step": 1185
},
{
"epoch": 2.285301204819277,
"grad_norm": 0.19043004512786865,
"learning_rate": 6.564148591903488e-06,
"loss": 0.0063,
"step": 1186
},
{
"epoch": 2.2872289156626504,
"grad_norm": 0.06894923001527786,
"learning_rate": 6.530961825628432e-06,
"loss": 0.0012,
"step": 1187
},
{
"epoch": 2.289156626506024,
"grad_norm": 0.16378818452358246,
"learning_rate": 6.4978427860474015e-06,
"loss": 0.0048,
"step": 1188
},
{
"epoch": 2.2910843373493974,
"grad_norm": 0.11130444705486298,
"learning_rate": 6.464791639693648e-06,
"loss": 0.0049,
"step": 1189
},
{
"epoch": 2.293012048192771,
"grad_norm": 0.10573417693376541,
"learning_rate": 6.431808552759083e-06,
"loss": 0.0019,
"step": 1190
},
{
"epoch": 2.2949397590361444,
"grad_norm": 0.13344882428646088,
"learning_rate": 6.398893691093367e-06,
"loss": 0.0033,
"step": 1191
},
{
"epoch": 2.296867469879518,
"grad_norm": 0.12659135460853577,
"learning_rate": 6.366047220203088e-06,
"loss": 0.0032,
"step": 1192
},
{
"epoch": 2.2987951807228915,
"grad_norm": 0.10152821987867355,
"learning_rate": 6.333269305250971e-06,
"loss": 0.0027,
"step": 1193
},
{
"epoch": 2.300722891566265,
"grad_norm": 0.1889944225549698,
"learning_rate": 6.300560111055006e-06,
"loss": 0.0062,
"step": 1194
},
{
"epoch": 2.3026506024096385,
"grad_norm": 2.3101227283477783,
"learning_rate": 6.2679198020876275e-06,
"loss": 0.0113,
"step": 1195
},
{
"epoch": 2.304578313253012,
"grad_norm": 0.6224933862686157,
"learning_rate": 6.235348542474908e-06,
"loss": 0.0273,
"step": 1196
},
{
"epoch": 2.3065060240963855,
"grad_norm": 0.1908419281244278,
"learning_rate": 6.202846495995705e-06,
"loss": 0.0056,
"step": 1197
},
{
"epoch": 2.3084337349397592,
"grad_norm": 0.10968491435050964,
"learning_rate": 6.170413826080856e-06,
"loss": 0.0034,
"step": 1198
},
{
"epoch": 2.3103614457831325,
"grad_norm": 0.23200668394565582,
"learning_rate": 6.138050695812343e-06,
"loss": 0.0042,
"step": 1199
},
{
"epoch": 2.3122891566265062,
"grad_norm": 0.12442032992839813,
"learning_rate": 6.105757267922481e-06,
"loss": 0.0045,
"step": 1200
},
{
"epoch": 2.3142168674698795,
"grad_norm": 0.14563624560832977,
"learning_rate": 6.073533704793122e-06,
"loss": 0.0035,
"step": 1201
},
{
"epoch": 2.316144578313253,
"grad_norm": 0.11523722857236862,
"learning_rate": 6.04138016845478e-06,
"loss": 0.0088,
"step": 1202
},
{
"epoch": 2.3180722891566266,
"grad_norm": 0.2000943422317505,
"learning_rate": 6.009296820585871e-06,
"loss": 0.0059,
"step": 1203
},
{
"epoch": 2.32,
"grad_norm": 0.10698592662811279,
"learning_rate": 5.977283822511879e-06,
"loss": 0.0028,
"step": 1204
},
{
"epoch": 2.3219277108433736,
"grad_norm": 0.1533137410879135,
"learning_rate": 5.945341335204547e-06,
"loss": 0.0044,
"step": 1205
},
{
"epoch": 2.323855421686747,
"grad_norm": 0.1235835999250412,
"learning_rate": 5.9134695192810695e-06,
"loss": 0.0043,
"step": 1206
},
{
"epoch": 2.3257831325301206,
"grad_norm": 0.1916925013065338,
"learning_rate": 5.8816685350032575e-06,
"loss": 0.0066,
"step": 1207
},
{
"epoch": 2.327710843373494,
"grad_norm": 0.08812380582094193,
"learning_rate": 5.849938542276801e-06,
"loss": 0.0022,
"step": 1208
},
{
"epoch": 2.3296385542168676,
"grad_norm": 0.13387660682201385,
"learning_rate": 5.818279700650393e-06,
"loss": 0.0037,
"step": 1209
},
{
"epoch": 2.331566265060241,
"grad_norm": 0.2309022694826126,
"learning_rate": 5.786692169314954e-06,
"loss": 0.0049,
"step": 1210
},
{
"epoch": 2.3334939759036146,
"grad_norm": 0.09956549853086472,
"learning_rate": 5.755176107102833e-06,
"loss": 0.002,
"step": 1211
},
{
"epoch": 2.335421686746988,
"grad_norm": 0.06035687029361725,
"learning_rate": 5.723731672487043e-06,
"loss": 0.002,
"step": 1212
},
{
"epoch": 2.337349397590361,
"grad_norm": 0.06850237399339676,
"learning_rate": 5.69235902358038e-06,
"loss": 0.0013,
"step": 1213
},
{
"epoch": 2.339277108433735,
"grad_norm": 0.12068171054124832,
"learning_rate": 5.661058318134711e-06,
"loss": 0.0041,
"step": 1214
},
{
"epoch": 2.3412048192771087,
"grad_norm": 0.13146616518497467,
"learning_rate": 5.6298297135401355e-06,
"loss": 0.0022,
"step": 1215
},
{
"epoch": 2.343132530120482,
"grad_norm": 0.15160737931728363,
"learning_rate": 5.598673366824212e-06,
"loss": 0.0036,
"step": 1216
},
{
"epoch": 2.3450602409638552,
"grad_norm": 0.26196014881134033,
"learning_rate": 5.567589434651164e-06,
"loss": 0.0151,
"step": 1217
},
{
"epoch": 2.346987951807229,
"grad_norm": 0.12898831069469452,
"learning_rate": 5.536578073321073e-06,
"loss": 0.006,
"step": 1218
},
{
"epoch": 2.3489156626506023,
"grad_norm": 0.11385104805231094,
"learning_rate": 5.505639438769146e-06,
"loss": 0.0052,
"step": 1219
},
{
"epoch": 2.350843373493976,
"grad_norm": 0.14569509029388428,
"learning_rate": 5.47477368656486e-06,
"loss": 0.0048,
"step": 1220
},
{
"epoch": 2.3527710843373493,
"grad_norm": 0.12406075745820999,
"learning_rate": 5.443980971911238e-06,
"loss": 0.0028,
"step": 1221
},
{
"epoch": 2.354698795180723,
"grad_norm": 0.3730498254299164,
"learning_rate": 5.413261449644039e-06,
"loss": 0.0043,
"step": 1222
},
{
"epoch": 2.3566265060240963,
"grad_norm": 0.1449914574623108,
"learning_rate": 5.382615274230987e-06,
"loss": 0.0075,
"step": 1223
},
{
"epoch": 2.35855421686747,
"grad_norm": 0.20739100873470306,
"learning_rate": 5.352042599770995e-06,
"loss": 0.0061,
"step": 1224
},
{
"epoch": 2.3604819277108433,
"grad_norm": 0.05786775052547455,
"learning_rate": 5.321543579993398e-06,
"loss": 0.0015,
"step": 1225
},
{
"epoch": 2.362409638554217,
"grad_norm": 0.09043122828006744,
"learning_rate": 5.2911183682571446e-06,
"loss": 0.0034,
"step": 1226
},
{
"epoch": 2.3643373493975903,
"grad_norm": 0.2685496211051941,
"learning_rate": 5.260767117550094e-06,
"loss": 0.0076,
"step": 1227
},
{
"epoch": 2.3662650602409636,
"grad_norm": 0.17694126069545746,
"learning_rate": 5.230489980488165e-06,
"loss": 0.0148,
"step": 1228
},
{
"epoch": 2.3681927710843373,
"grad_norm": 0.11609307676553726,
"learning_rate": 5.200287109314633e-06,
"loss": 0.0049,
"step": 1229
},
{
"epoch": 2.370120481927711,
"grad_norm": 0.1257704645395279,
"learning_rate": 5.1701586558993285e-06,
"loss": 0.0031,
"step": 1230
},
{
"epoch": 2.3720481927710844,
"grad_norm": 0.27177703380584717,
"learning_rate": 5.140104771737899e-06,
"loss": 0.0058,
"step": 1231
},
{
"epoch": 2.3739759036144576,
"grad_norm": 0.13928169012069702,
"learning_rate": 5.110125607951024e-06,
"loss": 0.0051,
"step": 1232
},
{
"epoch": 2.3759036144578314,
"grad_norm": 0.679577648639679,
"learning_rate": 5.0802213152836514e-06,
"loss": 0.0173,
"step": 1233
},
{
"epoch": 2.3778313253012047,
"grad_norm": 0.16769403219223022,
"learning_rate": 5.0503920441042845e-06,
"loss": 0.0045,
"step": 1234
},
{
"epoch": 2.3797590361445784,
"grad_norm": 0.09427493065595627,
"learning_rate": 5.0206379444041764e-06,
"loss": 0.0024,
"step": 1235
},
{
"epoch": 2.3816867469879517,
"grad_norm": 0.33908671140670776,
"learning_rate": 4.990959165796585e-06,
"loss": 0.0088,
"step": 1236
},
{
"epoch": 2.3836144578313254,
"grad_norm": 0.18106943368911743,
"learning_rate": 4.961355857516034e-06,
"loss": 0.0094,
"step": 1237
},
{
"epoch": 2.3855421686746987,
"grad_norm": 0.5833203196525574,
"learning_rate": 4.931828168417583e-06,
"loss": 0.0086,
"step": 1238
},
{
"epoch": 2.3874698795180724,
"grad_norm": 0.09108569473028183,
"learning_rate": 4.902376246976015e-06,
"loss": 0.0014,
"step": 1239
},
{
"epoch": 2.3893975903614457,
"grad_norm": 0.10596407204866409,
"learning_rate": 4.873000241285153e-06,
"loss": 0.0043,
"step": 1240
},
{
"epoch": 2.3913253012048195,
"grad_norm": 0.10775511711835861,
"learning_rate": 4.8437002990570835e-06,
"loss": 0.0014,
"step": 1241
},
{
"epoch": 2.3932530120481927,
"grad_norm": 0.9646345973014832,
"learning_rate": 4.8144765676214245e-06,
"loss": 0.0525,
"step": 1242
},
{
"epoch": 2.395180722891566,
"grad_norm": 0.20530278980731964,
"learning_rate": 4.7853291939245814e-06,
"loss": 0.008,
"step": 1243
},
{
"epoch": 2.3971084337349398,
"grad_norm": 0.1682119369506836,
"learning_rate": 4.756258324528995e-06,
"loss": 0.0044,
"step": 1244
},
{
"epoch": 2.3990361445783135,
"grad_norm": 0.45536917448043823,
"learning_rate": 4.727264105612439e-06,
"loss": 0.0186,
"step": 1245
},
{
"epoch": 2.4009638554216868,
"grad_norm": 0.3017471730709076,
"learning_rate": 4.698346682967258e-06,
"loss": 0.0106,
"step": 1246
},
{
"epoch": 2.40289156626506,
"grad_norm": 0.1226554661989212,
"learning_rate": 4.669506201999625e-06,
"loss": 0.0035,
"step": 1247
},
{
"epoch": 2.404819277108434,
"grad_norm": 0.13750068843364716,
"learning_rate": 4.640742807728837e-06,
"loss": 0.0038,
"step": 1248
},
{
"epoch": 2.406746987951807,
"grad_norm": 0.11531024426221848,
"learning_rate": 4.612056644786575e-06,
"loss": 0.0021,
"step": 1249
},
{
"epoch": 2.408674698795181,
"grad_norm": 0.1143675372004509,
"learning_rate": 4.583447857416175e-06,
"loss": 0.0028,
"step": 1250
},
{
"epoch": 2.410602409638554,
"grad_norm": 0.0914216861128807,
"learning_rate": 4.554916589471898e-06,
"loss": 0.0027,
"step": 1251
},
{
"epoch": 2.412530120481928,
"grad_norm": 0.18339012563228607,
"learning_rate": 4.526462984418221e-06,
"loss": 0.0037,
"step": 1252
},
{
"epoch": 2.414457831325301,
"grad_norm": 0.11073138564825058,
"learning_rate": 4.498087185329105e-06,
"loss": 0.003,
"step": 1253
},
{
"epoch": 2.416385542168675,
"grad_norm": 0.20792435109615326,
"learning_rate": 4.469789334887265e-06,
"loss": 0.009,
"step": 1254
},
{
"epoch": 2.418313253012048,
"grad_norm": 0.09485629945993423,
"learning_rate": 4.441569575383471e-06,
"loss": 0.0033,
"step": 1255
},
{
"epoch": 2.420240963855422,
"grad_norm": 0.11831793934106827,
"learning_rate": 4.413428048715851e-06,
"loss": 0.0021,
"step": 1256
},
{
"epoch": 2.422168674698795,
"grad_norm": 0.11818034201860428,
"learning_rate": 4.38536489638911e-06,
"loss": 0.0041,
"step": 1257
},
{
"epoch": 2.4240963855421684,
"grad_norm": 0.2583082616329193,
"learning_rate": 4.3573802595138945e-06,
"loss": 0.0039,
"step": 1258
},
{
"epoch": 2.426024096385542,
"grad_norm": 0.3120201826095581,
"learning_rate": 4.329474278806034e-06,
"loss": 0.0087,
"step": 1259
},
{
"epoch": 2.427951807228916,
"grad_norm": 0.1258879452943802,
"learning_rate": 4.301647094585855e-06,
"loss": 0.0046,
"step": 1260
},
{
"epoch": 2.429879518072289,
"grad_norm": 0.15144586563110352,
"learning_rate": 4.273898846777473e-06,
"loss": 0.0054,
"step": 1261
},
{
"epoch": 2.4318072289156625,
"grad_norm": 0.15615184605121613,
"learning_rate": 4.246229674908067e-06,
"loss": 0.0072,
"step": 1262
},
{
"epoch": 2.433734939759036,
"grad_norm": 0.09690173715353012,
"learning_rate": 4.218639718107225e-06,
"loss": 0.003,
"step": 1263
},
{
"epoch": 2.4356626506024095,
"grad_norm": 0.23884955048561096,
"learning_rate": 4.1911291151062e-06,
"loss": 0.0109,
"step": 1264
},
{
"epoch": 2.4375903614457832,
"grad_norm": 0.0905768945813179,
"learning_rate": 4.163698004237222e-06,
"loss": 0.0027,
"step": 1265
},
{
"epoch": 2.4395180722891565,
"grad_norm": 0.09168912470340729,
"learning_rate": 4.136346523432821e-06,
"loss": 0.0018,
"step": 1266
},
{
"epoch": 2.4414457831325302,
"grad_norm": 0.17878012359142303,
"learning_rate": 4.109074810225118e-06,
"loss": 0.0048,
"step": 1267
},
{
"epoch": 2.4433734939759035,
"grad_norm": 0.09913790971040726,
"learning_rate": 4.08188300174513e-06,
"loss": 0.0021,
"step": 1268
},
{
"epoch": 2.4453012048192773,
"grad_norm": 0.16615812480449677,
"learning_rate": 4.054771234722106e-06,
"loss": 0.0066,
"step": 1269
},
{
"epoch": 2.4472289156626506,
"grad_norm": 0.09618276357650757,
"learning_rate": 4.027739645482784e-06,
"loss": 0.0043,
"step": 1270
},
{
"epoch": 2.4491566265060243,
"grad_norm": 0.33473479747772217,
"learning_rate": 4.0007883699507855e-06,
"loss": 0.0236,
"step": 1271
},
{
"epoch": 2.4510843373493976,
"grad_norm": 0.15051880478858948,
"learning_rate": 3.973917543645867e-06,
"loss": 0.0068,
"step": 1272
},
{
"epoch": 2.453012048192771,
"grad_norm": 0.24134816229343414,
"learning_rate": 3.947127301683249e-06,
"loss": 0.0194,
"step": 1273
},
{
"epoch": 2.4549397590361446,
"grad_norm": 0.10495353490114212,
"learning_rate": 3.920417778772967e-06,
"loss": 0.0042,
"step": 1274
},
{
"epoch": 2.4568674698795183,
"grad_norm": 0.2294938713312149,
"learning_rate": 3.893789109219171e-06,
"loss": 0.0224,
"step": 1275
},
{
"epoch": 2.4587951807228916,
"grad_norm": 0.13710513710975647,
"learning_rate": 3.867241426919446e-06,
"loss": 0.0046,
"step": 1276
},
{
"epoch": 2.460722891566265,
"grad_norm": 0.06754808127880096,
"learning_rate": 3.840774865364157e-06,
"loss": 0.0019,
"step": 1277
},
{
"epoch": 2.4626506024096386,
"grad_norm": 0.24797780811786652,
"learning_rate": 3.8143895576357605e-06,
"loss": 0.0063,
"step": 1278
},
{
"epoch": 2.464578313253012,
"grad_norm": 0.1476449817419052,
"learning_rate": 3.788085636408143e-06,
"loss": 0.0055,
"step": 1279
},
{
"epoch": 2.4665060240963856,
"grad_norm": 0.22397096455097198,
"learning_rate": 3.7618632339459616e-06,
"loss": 0.0164,
"step": 1280
},
{
"epoch": 2.468433734939759,
"grad_norm": 0.21596969664096832,
"learning_rate": 3.7357224821039497e-06,
"loss": 0.0112,
"step": 1281
},
{
"epoch": 2.4703614457831327,
"grad_norm": 0.2775099575519562,
"learning_rate": 3.7096635123263068e-06,
"loss": 0.0112,
"step": 1282
},
{
"epoch": 2.472289156626506,
"grad_norm": 0.07963326573371887,
"learning_rate": 3.683686455645974e-06,
"loss": 0.0013,
"step": 1283
},
{
"epoch": 2.4742168674698797,
"grad_norm": 0.1253802627325058,
"learning_rate": 3.6577914426840266e-06,
"loss": 0.0038,
"step": 1284
},
{
"epoch": 2.476144578313253,
"grad_norm": 0.10258597880601883,
"learning_rate": 3.631978603648989e-06,
"loss": 0.0023,
"step": 1285
},
{
"epoch": 2.4780722891566267,
"grad_norm": 0.17102380096912384,
"learning_rate": 3.6062480683361935e-06,
"loss": 0.0025,
"step": 1286
},
{
"epoch": 2.48,
"grad_norm": 0.09547360241413116,
"learning_rate": 3.580599966127123e-06,
"loss": 0.003,
"step": 1287
},
{
"epoch": 2.4819277108433733,
"grad_norm": 0.08008653670549393,
"learning_rate": 3.5550344259887438e-06,
"loss": 0.0023,
"step": 1288
},
{
"epoch": 2.483855421686747,
"grad_norm": 0.07712296396493912,
"learning_rate": 3.5295515764729003e-06,
"loss": 0.0015,
"step": 1289
},
{
"epoch": 2.4857831325301207,
"grad_norm": 0.21118703484535217,
"learning_rate": 3.5041515457156303e-06,
"loss": 0.0041,
"step": 1290
},
{
"epoch": 2.487710843373494,
"grad_norm": 0.10772393643856049,
"learning_rate": 3.4788344614365155e-06,
"loss": 0.0029,
"step": 1291
},
{
"epoch": 2.4896385542168673,
"grad_norm": 0.2353268563747406,
"learning_rate": 3.453600450938073e-06,
"loss": 0.0072,
"step": 1292
},
{
"epoch": 2.491566265060241,
"grad_norm": 0.2897944152355194,
"learning_rate": 3.428449641105107e-06,
"loss": 0.0205,
"step": 1293
},
{
"epoch": 2.4934939759036143,
"grad_norm": 0.19756680727005005,
"learning_rate": 3.4033821584040383e-06,
"loss": 0.0065,
"step": 1294
},
{
"epoch": 2.495421686746988,
"grad_norm": 0.13538534939289093,
"learning_rate": 3.378398128882305e-06,
"loss": 0.0025,
"step": 1295
},
{
"epoch": 2.4973493975903613,
"grad_norm": 0.2301637977361679,
"learning_rate": 3.3534976781677142e-06,
"loss": 0.0071,
"step": 1296
},
{
"epoch": 2.499277108433735,
"grad_norm": 0.0965796634554863,
"learning_rate": 3.3286809314678137e-06,
"loss": 0.0024,
"step": 1297
},
{
"epoch": 2.5012048192771084,
"grad_norm": 0.0777980163693428,
"learning_rate": 3.30394801356926e-06,
"loss": 0.0013,
"step": 1298
},
{
"epoch": 2.503132530120482,
"grad_norm": 0.3157603442668915,
"learning_rate": 3.279299048837177e-06,
"loss": 0.0228,
"step": 1299
},
{
"epoch": 2.5050602409638554,
"grad_norm": 0.15660233795642853,
"learning_rate": 3.2547341612145654e-06,
"loss": 0.0056,
"step": 1300
},
{
"epoch": 2.506987951807229,
"grad_norm": 0.21655581891536713,
"learning_rate": 3.2302534742216586e-06,
"loss": 0.0081,
"step": 1301
},
{
"epoch": 2.5089156626506024,
"grad_norm": 0.09475889801979065,
"learning_rate": 3.205857110955277e-06,
"loss": 0.0029,
"step": 1302
},
{
"epoch": 2.5108433734939757,
"grad_norm": 0.13174696266651154,
"learning_rate": 3.18154519408826e-06,
"loss": 0.0059,
"step": 1303
},
{
"epoch": 2.5127710843373494,
"grad_norm": 0.10386355221271515,
"learning_rate": 3.1573178458688102e-06,
"loss": 0.0042,
"step": 1304
},
{
"epoch": 2.514698795180723,
"grad_norm": 0.12700854241847992,
"learning_rate": 3.133175188119899e-06,
"loss": 0.0041,
"step": 1305
},
{
"epoch": 2.5166265060240964,
"grad_norm": 0.1617022454738617,
"learning_rate": 3.109117342238639e-06,
"loss": 0.0053,
"step": 1306
},
{
"epoch": 2.5185542168674697,
"grad_norm": 0.8668884038925171,
"learning_rate": 3.085144429195688e-06,
"loss": 0.0084,
"step": 1307
},
{
"epoch": 2.5204819277108435,
"grad_norm": 0.22429344058036804,
"learning_rate": 3.061256569534634e-06,
"loss": 0.0053,
"step": 1308
},
{
"epoch": 2.5224096385542167,
"grad_norm": 0.08967582136392593,
"learning_rate": 3.037453883371375e-06,
"loss": 0.0018,
"step": 1309
},
{
"epoch": 2.5243373493975905,
"grad_norm": 0.1251695454120636,
"learning_rate": 3.0137364903935464e-06,
"loss": 0.0037,
"step": 1310
},
{
"epoch": 2.5262650602409638,
"grad_norm": 0.09026174992322922,
"learning_rate": 2.990104509859897e-06,
"loss": 0.0024,
"step": 1311
},
{
"epoch": 2.5281927710843375,
"grad_norm": 0.34319114685058594,
"learning_rate": 2.966558060599689e-06,
"loss": 0.0063,
"step": 1312
},
{
"epoch": 2.5301204819277108,
"grad_norm": 0.20300136506557465,
"learning_rate": 2.9430972610121087e-06,
"loss": 0.0054,
"step": 1313
},
{
"epoch": 2.532048192771084,
"grad_norm": 0.19160760939121246,
"learning_rate": 2.9197222290656737e-06,
"loss": 0.0095,
"step": 1314
},
{
"epoch": 2.533975903614458,
"grad_norm": 0.18991442024707794,
"learning_rate": 2.8964330822976227e-06,
"loss": 0.006,
"step": 1315
},
{
"epoch": 2.5359036144578315,
"grad_norm": 0.1801903396844864,
"learning_rate": 2.873229937813349e-06,
"loss": 0.0067,
"step": 1316
},
{
"epoch": 2.537831325301205,
"grad_norm": 0.07068303227424622,
"learning_rate": 2.850112912285783e-06,
"loss": 0.0015,
"step": 1317
},
{
"epoch": 2.539759036144578,
"grad_norm": 0.1404612809419632,
"learning_rate": 2.8270821219548296e-06,
"loss": 0.0036,
"step": 1318
},
{
"epoch": 2.541686746987952,
"grad_norm": 0.12199504673480988,
"learning_rate": 2.8041376826267862e-06,
"loss": 0.0068,
"step": 1319
},
{
"epoch": 2.5436144578313256,
"grad_norm": 0.2167249619960785,
"learning_rate": 2.7812797096737253e-06,
"loss": 0.0048,
"step": 1320
},
{
"epoch": 2.545542168674699,
"grad_norm": 0.07466506212949753,
"learning_rate": 2.7585083180329575e-06,
"loss": 0.0017,
"step": 1321
},
{
"epoch": 2.547469879518072,
"grad_norm": 0.11736353486776352,
"learning_rate": 2.7358236222064283e-06,
"loss": 0.003,
"step": 1322
},
{
"epoch": 2.549397590361446,
"grad_norm": 0.16602204740047455,
"learning_rate": 2.7132257362601453e-06,
"loss": 0.005,
"step": 1323
},
{
"epoch": 2.551325301204819,
"grad_norm": 0.15473629534244537,
"learning_rate": 2.6907147738236193e-06,
"loss": 0.0077,
"step": 1324
},
{
"epoch": 2.553253012048193,
"grad_norm": 0.07868973910808563,
"learning_rate": 2.6682908480892567e-06,
"loss": 0.0013,
"step": 1325
},
{
"epoch": 2.555180722891566,
"grad_norm": 0.2137845754623413,
"learning_rate": 2.645954071811847e-06,
"loss": 0.0092,
"step": 1326
},
{
"epoch": 2.55710843373494,
"grad_norm": 0.11191053688526154,
"learning_rate": 2.623704557307949e-06,
"loss": 0.0031,
"step": 1327
},
{
"epoch": 2.559036144578313,
"grad_norm": 0.3080642521381378,
"learning_rate": 2.6015424164553295e-06,
"loss": 0.0104,
"step": 1328
},
{
"epoch": 2.5609638554216865,
"grad_norm": 0.08816439658403397,
"learning_rate": 2.579467760692427e-06,
"loss": 0.004,
"step": 1329
},
{
"epoch": 2.56289156626506,
"grad_norm": 0.17154981195926666,
"learning_rate": 2.557480701017776e-06,
"loss": 0.0035,
"step": 1330
},
{
"epoch": 2.564819277108434,
"grad_norm": 0.09479143470525742,
"learning_rate": 2.5355813479894464e-06,
"loss": 0.0034,
"step": 1331
},
{
"epoch": 2.5667469879518072,
"grad_norm": 0.26139333844184875,
"learning_rate": 2.513769811724487e-06,
"loss": 0.0076,
"step": 1332
},
{
"epoch": 2.5686746987951805,
"grad_norm": 0.16864238679409027,
"learning_rate": 2.4920462018983816e-06,
"loss": 0.0046,
"step": 1333
},
{
"epoch": 2.5706024096385542,
"grad_norm": 0.1133158802986145,
"learning_rate": 2.4704106277444884e-06,
"loss": 0.0034,
"step": 1334
},
{
"epoch": 2.572530120481928,
"grad_norm": 0.27522334456443787,
"learning_rate": 2.4488631980534995e-06,
"loss": 0.0127,
"step": 1335
},
{
"epoch": 2.5744578313253013,
"grad_norm": 0.13547387719154358,
"learning_rate": 2.427404021172868e-06,
"loss": 0.0031,
"step": 1336
},
{
"epoch": 2.5763855421686745,
"grad_norm": 0.13478629291057587,
"learning_rate": 2.406033205006313e-06,
"loss": 0.0039,
"step": 1337
},
{
"epoch": 2.5783132530120483,
"grad_norm": 0.11515481770038605,
"learning_rate": 2.3847508570132226e-06,
"loss": 0.0029,
"step": 1338
},
{
"epoch": 2.5802409638554216,
"grad_norm": 0.21657171845436096,
"learning_rate": 2.36355708420815e-06,
"loss": 0.011,
"step": 1339
},
{
"epoch": 2.5821686746987953,
"grad_norm": 0.11441601067781448,
"learning_rate": 2.342451993160262e-06,
"loss": 0.006,
"step": 1340
},
{
"epoch": 2.5840963855421686,
"grad_norm": 0.13475841283798218,
"learning_rate": 2.3214356899928036e-06,
"loss": 0.0051,
"step": 1341
},
{
"epoch": 2.5860240963855423,
"grad_norm": 0.053035832941532135,
"learning_rate": 2.300508280382572e-06,
"loss": 0.0012,
"step": 1342
},
{
"epoch": 2.5879518072289156,
"grad_norm": 0.12467508763074875,
"learning_rate": 2.279669869559358e-06,
"loss": 0.0024,
"step": 1343
},
{
"epoch": 2.589879518072289,
"grad_norm": 0.10572273284196854,
"learning_rate": 2.2589205623054646e-06,
"loss": 0.0024,
"step": 1344
},
{
"epoch": 2.5918072289156626,
"grad_norm": 0.17056365311145782,
"learning_rate": 2.238260462955142e-06,
"loss": 0.0064,
"step": 1345
},
{
"epoch": 2.5937349397590364,
"grad_norm": 0.07940494269132614,
"learning_rate": 2.2176896753940637e-06,
"loss": 0.0012,
"step": 1346
},
{
"epoch": 2.5956626506024096,
"grad_norm": 0.10416694730520248,
"learning_rate": 2.1972083030588244e-06,
"loss": 0.0092,
"step": 1347
},
{
"epoch": 2.597590361445783,
"grad_norm": 0.2384328842163086,
"learning_rate": 2.176816448936423e-06,
"loss": 0.0067,
"step": 1348
},
{
"epoch": 2.5995180722891567,
"grad_norm": 0.14279082417488098,
"learning_rate": 2.156514215563703e-06,
"loss": 0.0059,
"step": 1349
},
{
"epoch": 2.6014457831325304,
"grad_norm": 0.08462683111429214,
"learning_rate": 2.1363017050268886e-06,
"loss": 0.0021,
"step": 1350
},
{
"epoch": 2.6033734939759037,
"grad_norm": 0.09768491238355637,
"learning_rate": 2.1161790189610377e-06,
"loss": 0.0038,
"step": 1351
},
{
"epoch": 2.605301204819277,
"grad_norm": 0.25498896837234497,
"learning_rate": 2.0961462585495474e-06,
"loss": 0.0114,
"step": 1352
},
{
"epoch": 2.6072289156626507,
"grad_norm": 0.15635675191879272,
"learning_rate": 2.076203524523637e-06,
"loss": 0.0054,
"step": 1353
},
{
"epoch": 2.609156626506024,
"grad_norm": 0.11619213968515396,
"learning_rate": 2.056350917161836e-06,
"loss": 0.007,
"step": 1354
},
{
"epoch": 2.6110843373493977,
"grad_norm": 0.18085338175296783,
"learning_rate": 2.0365885362895053e-06,
"loss": 0.0061,
"step": 1355
},
{
"epoch": 2.613012048192771,
"grad_norm": 0.14492927491664886,
"learning_rate": 2.016916481278306e-06,
"loss": 0.0114,
"step": 1356
},
{
"epoch": 2.6149397590361447,
"grad_norm": 0.21257621049880981,
"learning_rate": 1.997334851045709e-06,
"loss": 0.0057,
"step": 1357
},
{
"epoch": 2.616867469879518,
"grad_norm": 0.11539656668901443,
"learning_rate": 1.9778437440545085e-06,
"loss": 0.0071,
"step": 1358
},
{
"epoch": 2.6187951807228913,
"grad_norm": 0.1642933189868927,
"learning_rate": 1.95844325831231e-06,
"loss": 0.0054,
"step": 1359
},
{
"epoch": 2.620722891566265,
"grad_norm": 0.10779479146003723,
"learning_rate": 1.9391334913710545e-06,
"loss": 0.0028,
"step": 1360
},
{
"epoch": 2.6226506024096388,
"grad_norm": 0.14295366406440735,
"learning_rate": 1.9199145403265175e-06,
"loss": 0.0048,
"step": 1361
},
{
"epoch": 2.624578313253012,
"grad_norm": 0.13454844057559967,
"learning_rate": 1.9007865018178107e-06,
"loss": 0.0072,
"step": 1362
},
{
"epoch": 2.6265060240963853,
"grad_norm": 0.778252363204956,
"learning_rate": 1.8817494720269302e-06,
"loss": 0.0071,
"step": 1363
},
{
"epoch": 2.628433734939759,
"grad_norm": 0.11488679051399231,
"learning_rate": 1.8628035466782268e-06,
"loss": 0.0038,
"step": 1364
},
{
"epoch": 2.630361445783133,
"grad_norm": 0.15560875833034515,
"learning_rate": 1.8439488210379687e-06,
"loss": 0.0043,
"step": 1365
},
{
"epoch": 2.632289156626506,
"grad_norm": 0.10538071393966675,
"learning_rate": 1.8251853899138306e-06,
"loss": 0.0041,
"step": 1366
},
{
"epoch": 2.6342168674698794,
"grad_norm": 0.12866193056106567,
"learning_rate": 1.8065133476544306e-06,
"loss": 0.0034,
"step": 1367
},
{
"epoch": 2.636144578313253,
"grad_norm": 0.2045469433069229,
"learning_rate": 1.7879327881488584e-06,
"loss": 0.0141,
"step": 1368
},
{
"epoch": 2.6380722891566264,
"grad_norm": 0.12423976510763168,
"learning_rate": 1.769443804826194e-06,
"loss": 0.0047,
"step": 1369
},
{
"epoch": 2.64,
"grad_norm": 0.1007109209895134,
"learning_rate": 1.751046490655046e-06,
"loss": 0.0031,
"step": 1370
},
{
"epoch": 2.6419277108433734,
"grad_norm": 0.0681275874376297,
"learning_rate": 1.7327409381430804e-06,
"loss": 0.0019,
"step": 1371
},
{
"epoch": 2.643855421686747,
"grad_norm": 0.1645517498254776,
"learning_rate": 1.7145272393365498e-06,
"loss": 0.0035,
"step": 1372
},
{
"epoch": 2.6457831325301204,
"grad_norm": 0.13689427077770233,
"learning_rate": 1.6964054858198386e-06,
"loss": 0.0086,
"step": 1373
},
{
"epoch": 2.6477108433734937,
"grad_norm": 0.10440093278884888,
"learning_rate": 1.6783757687150149e-06,
"loss": 0.0019,
"step": 1374
},
{
"epoch": 2.6496385542168674,
"grad_norm": 0.1142532229423523,
"learning_rate": 1.6604381786813383e-06,
"loss": 0.0047,
"step": 1375
},
{
"epoch": 2.651566265060241,
"grad_norm": 0.10430166125297546,
"learning_rate": 1.6425928059148312e-06,
"loss": 0.0027,
"step": 1376
},
{
"epoch": 2.6534939759036145,
"grad_norm": 0.2315254956483841,
"learning_rate": 1.624839740147819e-06,
"loss": 0.0071,
"step": 1377
},
{
"epoch": 2.6554216867469878,
"grad_norm": 0.15356265008449554,
"learning_rate": 1.6071790706484746e-06,
"loss": 0.0109,
"step": 1378
},
{
"epoch": 2.6573493975903615,
"grad_norm": 0.1332363784313202,
"learning_rate": 1.589610886220383e-06,
"loss": 0.0046,
"step": 1379
},
{
"epoch": 2.659277108433735,
"grad_norm": 0.18892519176006317,
"learning_rate": 1.5721352752020602e-06,
"loss": 0.0138,
"step": 1380
},
{
"epoch": 2.6612048192771085,
"grad_norm": 0.10537895560264587,
"learning_rate": 1.5547523254665598e-06,
"loss": 0.0066,
"step": 1381
},
{
"epoch": 2.663132530120482,
"grad_norm": 0.1308947205543518,
"learning_rate": 1.5374621244209965e-06,
"loss": 0.0039,
"step": 1382
},
{
"epoch": 2.6650602409638555,
"grad_norm": 0.11358808726072311,
"learning_rate": 1.5202647590060983e-06,
"loss": 0.0029,
"step": 1383
},
{
"epoch": 2.666987951807229,
"grad_norm": 0.12029009312391281,
"learning_rate": 1.5031603156958064e-06,
"loss": 0.0032,
"step": 1384
},
{
"epoch": 2.6689156626506025,
"grad_norm": 0.36994072794914246,
"learning_rate": 1.4861488804968093e-06,
"loss": 0.024,
"step": 1385
},
{
"epoch": 2.670843373493976,
"grad_norm": 0.1263083666563034,
"learning_rate": 1.4692305389481232e-06,
"loss": 0.0047,
"step": 1386
},
{
"epoch": 2.6727710843373496,
"grad_norm": 0.15056709945201874,
"learning_rate": 1.452405376120658e-06,
"loss": 0.0014,
"step": 1387
},
{
"epoch": 2.674698795180723,
"grad_norm": 0.10418888181447983,
"learning_rate": 1.4356734766167925e-06,
"loss": 0.0035,
"step": 1388
},
{
"epoch": 2.676626506024096,
"grad_norm": 0.12220565974712372,
"learning_rate": 1.4190349245699443e-06,
"loss": 0.0063,
"step": 1389
},
{
"epoch": 2.67855421686747,
"grad_norm": 0.14774753153324127,
"learning_rate": 1.402489803644156e-06,
"loss": 0.008,
"step": 1390
},
{
"epoch": 2.6804819277108436,
"grad_norm": 0.14384198188781738,
"learning_rate": 1.3860381970336544e-06,
"loss": 0.0039,
"step": 1391
},
{
"epoch": 2.682409638554217,
"grad_norm": 0.10995055735111237,
"learning_rate": 1.3696801874624698e-06,
"loss": 0.0028,
"step": 1392
},
{
"epoch": 2.68433734939759,
"grad_norm": 0.12208505719900131,
"learning_rate": 1.353415857183966e-06,
"loss": 0.0029,
"step": 1393
},
{
"epoch": 2.686265060240964,
"grad_norm": 0.16018439829349518,
"learning_rate": 1.337245287980482e-06,
"loss": 0.0068,
"step": 1394
},
{
"epoch": 2.688192771084337,
"grad_norm": 5.2112274169921875,
"learning_rate": 1.3211685611628844e-06,
"loss": 0.1645,
"step": 1395
},
{
"epoch": 2.690120481927711,
"grad_norm": 0.12426120787858963,
"learning_rate": 1.3051857575701732e-06,
"loss": 0.0044,
"step": 1396
},
{
"epoch": 2.692048192771084,
"grad_norm": 0.13931375741958618,
"learning_rate": 1.2892969575690685e-06,
"loss": 0.0035,
"step": 1397
},
{
"epoch": 2.693975903614458,
"grad_norm": 0.1804540753364563,
"learning_rate": 1.273502241053608e-06,
"loss": 0.0108,
"step": 1398
},
{
"epoch": 2.695903614457831,
"grad_norm": 0.12313607335090637,
"learning_rate": 1.2578016874447596e-06,
"loss": 0.0073,
"step": 1399
},
{
"epoch": 2.697831325301205,
"grad_norm": 0.1301470398902893,
"learning_rate": 1.2421953756899985e-06,
"loss": 0.0037,
"step": 1400
},
{
"epoch": 2.6997590361445782,
"grad_norm": 0.12769126892089844,
"learning_rate": 1.226683384262919e-06,
"loss": 0.0041,
"step": 1401
},
{
"epoch": 2.701686746987952,
"grad_norm": 0.20923997461795807,
"learning_rate": 1.21126579116285e-06,
"loss": 0.0101,
"step": 1402
},
{
"epoch": 2.7036144578313253,
"grad_norm": 0.09334482997655869,
"learning_rate": 1.1959426739144497e-06,
"loss": 0.0022,
"step": 1403
},
{
"epoch": 2.7055421686746985,
"grad_norm": 0.06848987936973572,
"learning_rate": 1.1807141095673291e-06,
"loss": 0.0013,
"step": 1404
},
{
"epoch": 2.7074698795180723,
"grad_norm": 0.14552196860313416,
"learning_rate": 1.1655801746956463e-06,
"loss": 0.0066,
"step": 1405
},
{
"epoch": 2.709397590361446,
"grad_norm": 0.11259587109088898,
"learning_rate": 1.1505409453977334e-06,
"loss": 0.0045,
"step": 1406
},
{
"epoch": 2.7113253012048193,
"grad_norm": 0.23408068716526031,
"learning_rate": 1.135596497295719e-06,
"loss": 0.0181,
"step": 1407
},
{
"epoch": 2.7132530120481926,
"grad_norm": 0.1483619660139084,
"learning_rate": 1.1207469055351395e-06,
"loss": 0.0042,
"step": 1408
},
{
"epoch": 2.7151807228915663,
"grad_norm": 0.1170588880777359,
"learning_rate": 1.105992244784555e-06,
"loss": 0.0059,
"step": 1409
},
{
"epoch": 2.7171084337349396,
"grad_norm": 0.15649215877056122,
"learning_rate": 1.0913325892351857e-06,
"loss": 0.0023,
"step": 1410
},
{
"epoch": 2.7190361445783133,
"grad_norm": 0.0980108231306076,
"learning_rate": 1.0767680126005443e-06,
"loss": 0.0019,
"step": 1411
},
{
"epoch": 2.7209638554216866,
"grad_norm": 0.14913050830364227,
"learning_rate": 1.0622985881160396e-06,
"loss": 0.0018,
"step": 1412
},
{
"epoch": 2.7228915662650603,
"grad_norm": 0.0827481672167778,
"learning_rate": 1.0479243885386347e-06,
"loss": 0.0023,
"step": 1413
},
{
"epoch": 2.7248192771084336,
"grad_norm": 0.15648555755615234,
"learning_rate": 1.0336454861464706e-06,
"loss": 0.0033,
"step": 1414
},
{
"epoch": 2.7267469879518074,
"grad_norm": 0.10614357888698578,
"learning_rate": 1.0194619527385007e-06,
"loss": 0.0029,
"step": 1415
},
{
"epoch": 2.7286746987951807,
"grad_norm": 0.07111652940511703,
"learning_rate": 1.0053738596341355e-06,
"loss": 0.0026,
"step": 1416
},
{
"epoch": 2.7306024096385544,
"grad_norm": 0.11736573278903961,
"learning_rate": 9.91381277672867e-07,
"loss": 0.005,
"step": 1417
},
{
"epoch": 2.7325301204819277,
"grad_norm": 0.18440629541873932,
"learning_rate": 9.774842772139537e-07,
"loss": 0.0038,
"step": 1418
},
{
"epoch": 2.734457831325301,
"grad_norm": 0.11000041663646698,
"learning_rate": 9.636829281360116e-07,
"loss": 0.0034,
"step": 1419
},
{
"epoch": 2.7363855421686747,
"grad_norm": 0.15212605893611908,
"learning_rate": 9.499772998367018e-07,
"loss": 0.0038,
"step": 1420
},
{
"epoch": 2.7383132530120484,
"grad_norm": 0.07784705609083176,
"learning_rate": 9.36367461232377e-07,
"loss": 0.002,
"step": 1421
},
{
"epoch": 2.7402409638554217,
"grad_norm": 0.1096726506948471,
"learning_rate": 9.22853480757715e-07,
"loss": 0.0028,
"step": 1422
},
{
"epoch": 2.742168674698795,
"grad_norm": 0.17528535425662994,
"learning_rate": 9.094354263653971e-07,
"loss": 0.0065,
"step": 1423
},
{
"epoch": 2.7440963855421687,
"grad_norm": 0.09263470768928528,
"learning_rate": 8.961133655257548e-07,
"loss": 0.0031,
"step": 1424
},
{
"epoch": 2.746024096385542,
"grad_norm": 0.14822180569171906,
"learning_rate": 8.828873652264303e-07,
"loss": 0.0043,
"step": 1425
},
{
"epoch": 2.7479518072289157,
"grad_norm": 0.11577019095420837,
"learning_rate": 8.697574919720497e-07,
"loss": 0.004,
"step": 1426
},
{
"epoch": 2.749879518072289,
"grad_norm": 0.11681873351335526,
"learning_rate": 8.567238117838683e-07,
"loss": 0.0035,
"step": 1427
},
{
"epoch": 2.7518072289156628,
"grad_norm": 0.1191524937748909,
"learning_rate": 8.437863901994592e-07,
"loss": 0.0022,
"step": 1428
},
{
"epoch": 2.753734939759036,
"grad_norm": 0.1528361737728119,
"learning_rate": 8.309452922723849e-07,
"loss": 0.0042,
"step": 1429
},
{
"epoch": 2.75566265060241,
"grad_norm": 0.42052382230758667,
"learning_rate": 8.18200582571842e-07,
"loss": 0.0149,
"step": 1430
},
{
"epoch": 2.757590361445783,
"grad_norm": 0.13524137437343597,
"learning_rate": 8.055523251823705e-07,
"loss": 0.0029,
"step": 1431
},
{
"epoch": 2.759518072289157,
"grad_norm": 0.0980493426322937,
"learning_rate": 7.930005837035138e-07,
"loss": 0.0036,
"step": 1432
},
{
"epoch": 2.76144578313253,
"grad_norm": 0.17335453629493713,
"learning_rate": 7.805454212494967e-07,
"loss": 0.0066,
"step": 1433
},
{
"epoch": 2.7633734939759034,
"grad_norm": 0.13746409118175507,
"learning_rate": 7.681869004489218e-07,
"loss": 0.0066,
"step": 1434
},
{
"epoch": 2.765301204819277,
"grad_norm": 0.18556399643421173,
"learning_rate": 7.559250834444332e-07,
"loss": 0.0073,
"step": 1435
},
{
"epoch": 2.767228915662651,
"grad_norm": 0.09743557125329971,
"learning_rate": 7.437600318924332e-07,
"loss": 0.0023,
"step": 1436
},
{
"epoch": 2.769156626506024,
"grad_norm": 0.10671001672744751,
"learning_rate": 7.316918069627488e-07,
"loss": 0.003,
"step": 1437
},
{
"epoch": 2.7710843373493974,
"grad_norm": 0.10671380162239075,
"learning_rate": 7.197204693383231e-07,
"loss": 0.0021,
"step": 1438
},
{
"epoch": 2.773012048192771,
"grad_norm": 0.06824454665184021,
"learning_rate": 7.078460792149311e-07,
"loss": 0.0017,
"step": 1439
},
{
"epoch": 2.7749397590361444,
"grad_norm": 0.12668560445308685,
"learning_rate": 6.960686963008556e-07,
"loss": 0.0035,
"step": 1440
},
{
"epoch": 2.776867469879518,
"grad_norm": 0.10260980576276779,
"learning_rate": 6.843883798166029e-07,
"loss": 0.0027,
"step": 1441
},
{
"epoch": 2.7787951807228914,
"grad_norm": 0.09880302101373672,
"learning_rate": 6.728051884945941e-07,
"loss": 0.0029,
"step": 1442
},
{
"epoch": 2.780722891566265,
"grad_norm": 0.305993914604187,
"learning_rate": 6.613191805788699e-07,
"loss": 0.0112,
"step": 1443
},
{
"epoch": 2.7826506024096385,
"grad_norm": 0.10707511752843857,
"learning_rate": 6.499304138248064e-07,
"loss": 0.0062,
"step": 1444
},
{
"epoch": 2.784578313253012,
"grad_norm": 0.0986943170428276,
"learning_rate": 6.386389454988195e-07,
"loss": 0.0021,
"step": 1445
},
{
"epoch": 2.7865060240963855,
"grad_norm": 0.1458776742219925,
"learning_rate": 6.274448323780724e-07,
"loss": 0.0094,
"step": 1446
},
{
"epoch": 2.788433734939759,
"grad_norm": 0.09657061100006104,
"learning_rate": 6.163481307501995e-07,
"loss": 0.0026,
"step": 1447
},
{
"epoch": 2.7903614457831325,
"grad_norm": 0.1462988704442978,
"learning_rate": 6.053488964130183e-07,
"loss": 0.0075,
"step": 1448
},
{
"epoch": 2.792289156626506,
"grad_norm": 0.15330864489078522,
"learning_rate": 5.94447184674245e-07,
"loss": 0.0067,
"step": 1449
},
{
"epoch": 2.7942168674698795,
"grad_norm": 0.1513473242521286,
"learning_rate": 5.836430503512236e-07,
"loss": 0.0106,
"step": 1450
},
{
"epoch": 2.7961445783132532,
"grad_norm": 0.2151842713356018,
"learning_rate": 5.729365477706505e-07,
"loss": 0.0062,
"step": 1451
},
{
"epoch": 2.7980722891566265,
"grad_norm": 0.13624203205108643,
"learning_rate": 5.623277307682929e-07,
"loss": 0.0045,
"step": 1452
},
{
"epoch": 2.8,
"grad_norm": 0.12075261026620865,
"learning_rate": 5.518166526887214e-07,
"loss": 0.0073,
"step": 1453
},
{
"epoch": 2.8019277108433736,
"grad_norm": 0.11320624500513077,
"learning_rate": 5.41403366385047e-07,
"loss": 0.002,
"step": 1454
},
{
"epoch": 2.803855421686747,
"grad_norm": 0.08470363914966583,
"learning_rate": 5.310879242186606e-07,
"loss": 0.0021,
"step": 1455
},
{
"epoch": 2.8057831325301206,
"grad_norm": 0.15221907198429108,
"learning_rate": 5.208703780589419e-07,
"loss": 0.0019,
"step": 1456
},
{
"epoch": 2.807710843373494,
"grad_norm": 0.12709103524684906,
"learning_rate": 5.107507792830335e-07,
"loss": 0.0052,
"step": 1457
},
{
"epoch": 2.8096385542168676,
"grad_norm": 0.10888515412807465,
"learning_rate": 5.007291787755586e-07,
"loss": 0.0023,
"step": 1458
},
{
"epoch": 2.811566265060241,
"grad_norm": 0.25710970163345337,
"learning_rate": 4.908056269283789e-07,
"loss": 0.0073,
"step": 1459
},
{
"epoch": 2.8134939759036146,
"grad_norm": 0.08488702774047852,
"learning_rate": 4.809801736403308e-07,
"loss": 0.0016,
"step": 1460
},
{
"epoch": 2.815421686746988,
"grad_norm": 0.1282006949186325,
"learning_rate": 4.7125286831698034e-07,
"loss": 0.0035,
"step": 1461
},
{
"epoch": 2.8173493975903616,
"grad_norm": 0.08955442905426025,
"learning_rate": 4.6162375987037766e-07,
"loss": 0.004,
"step": 1462
},
{
"epoch": 2.819277108433735,
"grad_norm": 0.11310838907957077,
"learning_rate": 4.520928967188054e-07,
"loss": 0.0022,
"step": 1463
},
{
"epoch": 2.821204819277108,
"grad_norm": 0.15055686235427856,
"learning_rate": 4.426603267865326e-07,
"loss": 0.0042,
"step": 1464
},
{
"epoch": 2.823132530120482,
"grad_norm": 0.14379452168941498,
"learning_rate": 4.333260975035769e-07,
"loss": 0.0089,
"step": 1465
},
{
"epoch": 2.8250602409638557,
"grad_norm": 0.1795361489057541,
"learning_rate": 4.240902558054827e-07,
"loss": 0.013,
"step": 1466
},
{
"epoch": 2.826987951807229,
"grad_norm": 0.06829468160867691,
"learning_rate": 4.1495284813305003e-07,
"loss": 0.0018,
"step": 1467
},
{
"epoch": 2.8289156626506022,
"grad_norm": 0.35213515162467957,
"learning_rate": 4.0591392043213275e-07,
"loss": 0.0144,
"step": 1468
},
{
"epoch": 2.830843373493976,
"grad_norm": 0.11828093230724335,
"learning_rate": 3.969735181533918e-07,
"loss": 0.0028,
"step": 1469
},
{
"epoch": 2.8327710843373493,
"grad_norm": 0.13286921381950378,
"learning_rate": 3.881316862520712e-07,
"loss": 0.0042,
"step": 1470
},
{
"epoch": 2.834698795180723,
"grad_norm": 0.10271132737398148,
"learning_rate": 3.7938846918776917e-07,
"loss": 0.0047,
"step": 1471
},
{
"epoch": 2.8366265060240963,
"grad_norm": 0.09422904253005981,
"learning_rate": 3.707439109242139e-07,
"loss": 0.0061,
"step": 1472
},
{
"epoch": 2.83855421686747,
"grad_norm": 0.10817123204469681,
"learning_rate": 3.6219805492905934e-07,
"loss": 0.0029,
"step": 1473
},
{
"epoch": 2.8404819277108433,
"grad_norm": 0.10254565626382828,
"learning_rate": 3.53750944173632e-07,
"loss": 0.0044,
"step": 1474
},
{
"epoch": 2.842409638554217,
"grad_norm": 0.11423154920339584,
"learning_rate": 3.45402621132751e-07,
"loss": 0.0059,
"step": 1475
},
{
"epoch": 2.8443373493975903,
"grad_norm": 0.15620556473731995,
"learning_rate": 3.3715312778449305e-07,
"loss": 0.005,
"step": 1476
},
{
"epoch": 2.846265060240964,
"grad_norm": 0.1081036925315857,
"learning_rate": 3.2900250560998546e-07,
"loss": 0.004,
"step": 1477
},
{
"epoch": 2.8481927710843373,
"grad_norm": 0.38650745153427124,
"learning_rate": 3.209507955932001e-07,
"loss": 0.0076,
"step": 1478
},
{
"epoch": 2.8501204819277106,
"grad_norm": 0.1864783614873886,
"learning_rate": 3.129980382207509e-07,
"loss": 0.0092,
"step": 1479
},
{
"epoch": 2.8520481927710843,
"grad_norm": 0.1458069533109665,
"learning_rate": 3.05144273481679e-07,
"loss": 0.0058,
"step": 1480
},
{
"epoch": 2.853975903614458,
"grad_norm": 0.14836257696151733,
"learning_rate": 2.9738954086726334e-07,
"loss": 0.014,
"step": 1481
},
{
"epoch": 2.8559036144578314,
"grad_norm": 0.10147511214017868,
"learning_rate": 2.8973387937081485e-07,
"loss": 0.0047,
"step": 1482
},
{
"epoch": 2.8578313253012047,
"grad_norm": 0.13740235567092896,
"learning_rate": 2.821773274874828e-07,
"loss": 0.0028,
"step": 1483
},
{
"epoch": 2.8597590361445784,
"grad_norm": 0.16089461743831635,
"learning_rate": 2.7471992321406624e-07,
"loss": 0.0168,
"step": 1484
},
{
"epoch": 2.8616867469879517,
"grad_norm": 0.0599152147769928,
"learning_rate": 2.6736170404880744e-07,
"loss": 0.0017,
"step": 1485
},
{
"epoch": 2.8636144578313254,
"grad_norm": 0.148875430226326,
"learning_rate": 2.6010270699122096e-07,
"loss": 0.0045,
"step": 1486
},
{
"epoch": 2.8655421686746987,
"grad_norm": 0.26763641834259033,
"learning_rate": 2.529429685419027e-07,
"loss": 0.007,
"step": 1487
},
{
"epoch": 2.8674698795180724,
"grad_norm": 0.1743084192276001,
"learning_rate": 2.458825247023389e-07,
"loss": 0.0112,
"step": 1488
},
{
"epoch": 2.8693975903614457,
"grad_norm": 0.21380828320980072,
"learning_rate": 2.3892141097473063e-07,
"loss": 0.0103,
"step": 1489
},
{
"epoch": 2.8713253012048194,
"grad_norm": 2.185253620147705,
"learning_rate": 2.3205966236181433e-07,
"loss": 0.0195,
"step": 1490
},
{
"epoch": 2.8732530120481927,
"grad_norm": 0.11854024976491928,
"learning_rate": 2.252973133666947e-07,
"loss": 0.0034,
"step": 1491
},
{
"epoch": 2.8751807228915665,
"grad_norm": 0.36487653851509094,
"learning_rate": 2.1863439799265195e-07,
"loss": 0.0063,
"step": 1492
},
{
"epoch": 2.8771084337349397,
"grad_norm": 0.1029730811715126,
"learning_rate": 2.1207094974298847e-07,
"loss": 0.0049,
"step": 1493
},
{
"epoch": 2.879036144578313,
"grad_norm": 0.10066278278827667,
"learning_rate": 2.056070016208489e-07,
"loss": 0.0021,
"step": 1494
},
{
"epoch": 2.8809638554216868,
"grad_norm": 0.21477262675762177,
"learning_rate": 1.9924258612906256e-07,
"loss": 0.0052,
"step": 1495
},
{
"epoch": 2.8828915662650605,
"grad_norm": 0.29007601737976074,
"learning_rate": 1.929777352699791e-07,
"loss": 0.0065,
"step": 1496
},
{
"epoch": 2.8848192771084338,
"grad_norm": 0.32320499420166016,
"learning_rate": 1.8681248054529754e-07,
"loss": 0.0334,
"step": 1497
},
{
"epoch": 2.886746987951807,
"grad_norm": 0.12790757417678833,
"learning_rate": 1.8074685295591754e-07,
"loss": 0.0034,
"step": 1498
},
{
"epoch": 2.888674698795181,
"grad_norm": 0.12194570153951645,
"learning_rate": 1.7478088300178608e-07,
"loss": 0.0038,
"step": 1499
},
{
"epoch": 2.890602409638554,
"grad_norm": 0.13514107465744019,
"learning_rate": 1.6891460068173548e-07,
"loss": 0.0042,
"step": 1500
},
{
"epoch": 2.892530120481928,
"grad_norm": 0.09762352705001831,
"learning_rate": 1.631480354933346e-07,
"loss": 0.0016,
"step": 1501
},
{
"epoch": 2.894457831325301,
"grad_norm": 0.10607658326625824,
"learning_rate": 1.5748121643274661e-07,
"loss": 0.0062,
"step": 1502
},
{
"epoch": 2.896385542168675,
"grad_norm": 0.0920143872499466,
"learning_rate": 1.519141719945738e-07,
"loss": 0.0025,
"step": 1503
},
{
"epoch": 2.898313253012048,
"grad_norm": 0.17520834505558014,
"learning_rate": 1.4644693017172418e-07,
"loss": 0.0045,
"step": 1504
},
{
"epoch": 2.900240963855422,
"grad_norm": 0.49769192934036255,
"learning_rate": 1.4107951845526267e-07,
"loss": 0.0059,
"step": 1505
},
{
"epoch": 2.902168674698795,
"grad_norm": 0.06354644149541855,
"learning_rate": 1.3581196383427586e-07,
"loss": 0.0021,
"step": 1506
},
{
"epoch": 2.904096385542169,
"grad_norm": 0.09340358525514603,
"learning_rate": 1.3064429279573853e-07,
"loss": 0.0036,
"step": 1507
},
{
"epoch": 2.906024096385542,
"grad_norm": 0.06073952466249466,
"learning_rate": 1.255765313243762e-07,
"loss": 0.001,
"step": 1508
},
{
"epoch": 2.9079518072289154,
"grad_norm": 0.1323407143354416,
"learning_rate": 1.206087049025384e-07,
"loss": 0.008,
"step": 1509
},
{
"epoch": 2.909879518072289,
"grad_norm": 0.18533159792423248,
"learning_rate": 1.1574083851007e-07,
"loss": 0.0086,
"step": 1510
},
{
"epoch": 2.911807228915663,
"grad_norm": 0.09885486960411072,
"learning_rate": 1.1097295662418018e-07,
"loss": 0.0023,
"step": 1511
},
{
"epoch": 2.913734939759036,
"grad_norm": 0.08286528289318085,
"learning_rate": 1.0630508321932687e-07,
"loss": 0.0029,
"step": 1512
},
{
"epoch": 2.9156626506024095,
"grad_norm": 0.1265413761138916,
"learning_rate": 1.0173724176709254e-07,
"loss": 0.003,
"step": 1513
},
{
"epoch": 2.917590361445783,
"grad_norm": 0.0776480957865715,
"learning_rate": 9.726945523606646e-08,
"loss": 0.0013,
"step": 1514
},
{
"epoch": 2.9195180722891565,
"grad_norm": 0.14106431603431702,
"learning_rate": 9.290174609172697e-08,
"loss": 0.0204,
"step": 1515
},
{
"epoch": 2.9214457831325302,
"grad_norm": 0.10813348740339279,
"learning_rate": 8.863413629633277e-08,
"loss": 0.0026,
"step": 1516
},
{
"epoch": 2.9233734939759035,
"grad_norm": 0.11505429446697235,
"learning_rate": 8.446664730881182e-08,
"loss": 0.0038,
"step": 1517
},
{
"epoch": 2.9253012048192772,
"grad_norm": 0.18488599359989166,
"learning_rate": 8.039930008465257e-08,
"loss": 0.0094,
"step": 1518
},
{
"epoch": 2.9272289156626505,
"grad_norm": 0.19229602813720703,
"learning_rate": 7.643211507579296e-08,
"loss": 0.0062,
"step": 1519
},
{
"epoch": 2.929156626506024,
"grad_norm": 0.0876188799738884,
"learning_rate": 7.25651122305293e-08,
"loss": 0.0024,
"step": 1520
},
{
"epoch": 2.9310843373493976,
"grad_norm": 0.15103434026241302,
"learning_rate": 6.87983109934054e-08,
"loss": 0.0056,
"step": 1521
},
{
"epoch": 2.9330120481927713,
"grad_norm": 0.1714266538619995,
"learning_rate": 6.51317303051191e-08,
"loss": 0.0047,
"step": 1522
},
{
"epoch": 2.9349397590361446,
"grad_norm": 0.30670225620269775,
"learning_rate": 6.156538860242922e-08,
"loss": 0.0111,
"step": 1523
},
{
"epoch": 2.936867469879518,
"grad_norm": 0.13250356912612915,
"learning_rate": 5.809930381805773e-08,
"loss": 0.0033,
"step": 1524
},
{
"epoch": 2.9387951807228916,
"grad_norm": 0.10350223630666733,
"learning_rate": 5.4733493380603183e-08,
"loss": 0.0028,
"step": 1525
},
{
"epoch": 2.9407228915662653,
"grad_norm": 0.1638195812702179,
"learning_rate": 5.1467974214456374e-08,
"loss": 0.0037,
"step": 1526
},
{
"epoch": 2.9426506024096386,
"grad_norm": 0.11159276962280273,
"learning_rate": 4.830276273970258e-08,
"loss": 0.003,
"step": 1527
},
{
"epoch": 2.944578313253012,
"grad_norm": 0.09866586327552795,
"learning_rate": 4.5237874872052776e-08,
"loss": 0.0032,
"step": 1528
},
{
"epoch": 2.9465060240963856,
"grad_norm": 0.17825454473495483,
"learning_rate": 4.227332602275924e-08,
"loss": 0.0105,
"step": 1529
},
{
"epoch": 2.948433734939759,
"grad_norm": 0.10379356890916824,
"learning_rate": 3.940913109853561e-08,
"loss": 0.0055,
"step": 1530
},
{
"epoch": 2.9503614457831326,
"grad_norm": 0.23834416270256042,
"learning_rate": 3.66453045014814e-08,
"loss": 0.0044,
"step": 1531
},
{
"epoch": 2.952289156626506,
"grad_norm": 0.11515571922063828,
"learning_rate": 3.398186012901539e-08,
"loss": 0.0042,
"step": 1532
},
{
"epoch": 2.9542168674698797,
"grad_norm": 0.14170049130916595,
"learning_rate": 3.141881137379788e-08,
"loss": 0.0073,
"step": 1533
},
{
"epoch": 2.956144578313253,
"grad_norm": 0.237248957157135,
"learning_rate": 2.8956171123670774e-08,
"loss": 0.0055,
"step": 1534
},
{
"epoch": 2.9580722891566262,
"grad_norm": 0.07076071947813034,
"learning_rate": 2.6593951761588744e-08,
"loss": 0.0016,
"step": 1535
},
{
"epoch": 2.96,
"grad_norm": 0.1100577786564827,
"learning_rate": 2.4332165165557032e-08,
"loss": 0.0026,
"step": 1536
},
{
"epoch": 2.9619277108433737,
"grad_norm": 0.11576279252767563,
"learning_rate": 2.2170822708573736e-08,
"loss": 0.0036,
"step": 1537
},
{
"epoch": 2.963855421686747,
"grad_norm": 0.2067718207836151,
"learning_rate": 2.0109935258565415e-08,
"loss": 0.0063,
"step": 1538
},
{
"epoch": 2.9657831325301203,
"grad_norm": 0.15040244162082672,
"learning_rate": 1.8149513178347122e-08,
"loss": 0.0081,
"step": 1539
},
{
"epoch": 2.967710843373494,
"grad_norm": 0.14071759581565857,
"learning_rate": 1.6289566325555783e-08,
"loss": 0.006,
"step": 1540
},
{
"epoch": 2.9696385542168677,
"grad_norm": 0.32527413964271545,
"learning_rate": 1.4530104052610239e-08,
"loss": 0.0021,
"step": 1541
},
{
"epoch": 2.971566265060241,
"grad_norm": 0.06794515997171402,
"learning_rate": 1.2871135206651287e-08,
"loss": 0.0016,
"step": 1542
},
{
"epoch": 2.9734939759036143,
"grad_norm": 0.08525913208723068,
"learning_rate": 1.1312668129519477e-08,
"loss": 0.0023,
"step": 1543
},
{
"epoch": 2.975421686746988,
"grad_norm": 0.14025282859802246,
"learning_rate": 9.854710657688504e-09,
"loss": 0.0025,
"step": 1544
},
{
"epoch": 2.9773493975903613,
"grad_norm": 0.15709802508354187,
"learning_rate": 8.497270122242996e-09,
"loss": 0.0038,
"step": 1545
},
{
"epoch": 2.979277108433735,
"grad_norm": 0.1520087569952011,
"learning_rate": 7.240353348834106e-09,
"loss": 0.0027,
"step": 1546
},
{
"epoch": 2.9812048192771083,
"grad_norm": 0.13271088898181915,
"learning_rate": 6.083966657646212e-09,
"loss": 0.003,
"step": 1547
},
{
"epoch": 2.983132530120482,
"grad_norm": 0.0962211862206459,
"learning_rate": 5.028115863370265e-09,
"loss": 0.0021,
"step": 1548
},
{
"epoch": 2.9850602409638554,
"grad_norm": 0.11485985666513443,
"learning_rate": 4.072806275163821e-09,
"loss": 0.0039,
"step": 1549
},
{
"epoch": 2.9869879518072286,
"grad_norm": 0.15437521040439606,
"learning_rate": 3.2180426966332833e-09,
"loss": 0.0048,
"step": 1550
},
{
"epoch": 2.9889156626506024,
"grad_norm": 0.09884651750326157,
"learning_rate": 2.4638294258072513e-09,
"loss": 0.0032,
"step": 1551
},
{
"epoch": 2.990843373493976,
"grad_norm": 0.30931419134140015,
"learning_rate": 1.810170255116539e-09,
"loss": 0.0038,
"step": 1552
},
{
"epoch": 2.9927710843373494,
"grad_norm": 0.3311678469181061,
"learning_rate": 1.2570684713719695e-09,
"loss": 0.0247,
"step": 1553
},
{
"epoch": 2.9946987951807227,
"grad_norm": 0.13150249421596527,
"learning_rate": 8.045268557443919e-10,
"loss": 0.0029,
"step": 1554
},
{
"epoch": 2.9966265060240964,
"grad_norm": 0.10827342420816422,
"learning_rate": 4.5254768376468137e-10,
"loss": 0.0119,
"step": 1555
},
{
"epoch": 2.99855421686747,
"grad_norm": 0.10358250141143799,
"learning_rate": 2.011327252948725e-10,
"loss": 0.0038,
"step": 1556
},
{
"epoch": 3.0,
"grad_norm": 0.09550733864307404,
"learning_rate": 5.028324453482114e-11,
"loss": 0.0016,
"step": 1557
},
{
"epoch": 3.0,
"step": 1557,
"total_flos": 2.043435500286509e+18,
"train_loss": 0.016654981696585226,
"train_runtime": 5294.7714,
"train_samples_per_second": 9.403,
"train_steps_per_second": 0.294
}
],
"logging_steps": 1,
"max_steps": 1557,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 92,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.043435500286509e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}