Files
llama3-1B-sft/checkpoint-920/trainer_state.json
ModelHub XC 8462c4a571 初始化项目,由ModelHub XC社区提供模型
Model: boradorish/llama3-1B-sft
Source: Original Platform
2026-06-10 15:40:17 +08:00

6475 lines
156 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.7730120481927711,
"eval_steps": 500,
"global_step": 920,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019277108433734939,
"grad_norm": 2.8518834114074707,
"learning_rate": 0.0,
"loss": 0.0891,
"step": 1
},
{
"epoch": 0.0038554216867469878,
"grad_norm": 1.8441249132156372,
"learning_rate": 2.564102564102564e-07,
"loss": 0.0539,
"step": 2
},
{
"epoch": 0.005783132530120482,
"grad_norm": 2.8263237476348877,
"learning_rate": 5.128205128205128e-07,
"loss": 0.099,
"step": 3
},
{
"epoch": 0.0077108433734939755,
"grad_norm": 2.5051236152648926,
"learning_rate": 7.692307692307694e-07,
"loss": 0.0789,
"step": 4
},
{
"epoch": 0.00963855421686747,
"grad_norm": 2.6903438568115234,
"learning_rate": 1.0256410256410257e-06,
"loss": 0.0881,
"step": 5
},
{
"epoch": 0.011566265060240964,
"grad_norm": 2.6205761432647705,
"learning_rate": 1.282051282051282e-06,
"loss": 0.0776,
"step": 6
},
{
"epoch": 0.013493975903614458,
"grad_norm": 2.6309337615966797,
"learning_rate": 1.5384615384615387e-06,
"loss": 0.0827,
"step": 7
},
{
"epoch": 0.015421686746987951,
"grad_norm": 1.5427855253219604,
"learning_rate": 1.794871794871795e-06,
"loss": 0.0577,
"step": 8
},
{
"epoch": 0.017349397590361446,
"grad_norm": 1.0973446369171143,
"learning_rate": 2.0512820512820513e-06,
"loss": 0.04,
"step": 9
},
{
"epoch": 0.01927710843373494,
"grad_norm": 1.3253350257873535,
"learning_rate": 2.307692307692308e-06,
"loss": 0.0506,
"step": 10
},
{
"epoch": 0.021204819277108433,
"grad_norm": 1.588739037513733,
"learning_rate": 2.564102564102564e-06,
"loss": 0.0874,
"step": 11
},
{
"epoch": 0.02313253012048193,
"grad_norm": 1.4987014532089233,
"learning_rate": 2.8205128205128207e-06,
"loss": 0.0597,
"step": 12
},
{
"epoch": 0.02506024096385542,
"grad_norm": 1.6571592092514038,
"learning_rate": 3.0769230769230774e-06,
"loss": 0.0559,
"step": 13
},
{
"epoch": 0.026987951807228915,
"grad_norm": 1.8860628604888916,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0688,
"step": 14
},
{
"epoch": 0.02891566265060241,
"grad_norm": 1.3202295303344727,
"learning_rate": 3.58974358974359e-06,
"loss": 0.0433,
"step": 15
},
{
"epoch": 0.030843373493975902,
"grad_norm": 1.5870612859725952,
"learning_rate": 3.846153846153847e-06,
"loss": 0.0695,
"step": 16
},
{
"epoch": 0.0327710843373494,
"grad_norm": 0.9192284345626831,
"learning_rate": 4.102564102564103e-06,
"loss": 0.0392,
"step": 17
},
{
"epoch": 0.03469879518072289,
"grad_norm": 0.7950155735015869,
"learning_rate": 4.358974358974359e-06,
"loss": 0.0351,
"step": 18
},
{
"epoch": 0.03662650602409639,
"grad_norm": 0.8854314684867859,
"learning_rate": 4.615384615384616e-06,
"loss": 0.0356,
"step": 19
},
{
"epoch": 0.03855421686746988,
"grad_norm": 0.9546788930892944,
"learning_rate": 4.871794871794872e-06,
"loss": 0.0427,
"step": 20
},
{
"epoch": 0.04048192771084337,
"grad_norm": 0.6315903663635254,
"learning_rate": 5.128205128205128e-06,
"loss": 0.0397,
"step": 21
},
{
"epoch": 0.042409638554216866,
"grad_norm": 0.9230924844741821,
"learning_rate": 5.384615384615385e-06,
"loss": 0.0481,
"step": 22
},
{
"epoch": 0.04433734939759036,
"grad_norm": 0.711546003818512,
"learning_rate": 5.641025641025641e-06,
"loss": 0.0479,
"step": 23
},
{
"epoch": 0.04626506024096386,
"grad_norm": 0.5288046598434448,
"learning_rate": 5.897435897435898e-06,
"loss": 0.0182,
"step": 24
},
{
"epoch": 0.04819277108433735,
"grad_norm": 0.9420496225357056,
"learning_rate": 6.153846153846155e-06,
"loss": 0.0389,
"step": 25
},
{
"epoch": 0.05012048192771084,
"grad_norm": 0.5001983046531677,
"learning_rate": 6.410256410256412e-06,
"loss": 0.0268,
"step": 26
},
{
"epoch": 0.052048192771084335,
"grad_norm": 0.8084653615951538,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0367,
"step": 27
},
{
"epoch": 0.05397590361445783,
"grad_norm": 0.7195103764533997,
"learning_rate": 6.923076923076923e-06,
"loss": 0.0251,
"step": 28
},
{
"epoch": 0.055903614457831326,
"grad_norm": 0.529958963394165,
"learning_rate": 7.17948717948718e-06,
"loss": 0.0289,
"step": 29
},
{
"epoch": 0.05783132530120482,
"grad_norm": 0.795376181602478,
"learning_rate": 7.435897435897437e-06,
"loss": 0.043,
"step": 30
},
{
"epoch": 0.059759036144578316,
"grad_norm": 0.6366249918937683,
"learning_rate": 7.692307692307694e-06,
"loss": 0.029,
"step": 31
},
{
"epoch": 0.061686746987951804,
"grad_norm": 0.5414115190505981,
"learning_rate": 7.948717948717949e-06,
"loss": 0.0365,
"step": 32
},
{
"epoch": 0.0636144578313253,
"grad_norm": 0.9350972175598145,
"learning_rate": 8.205128205128205e-06,
"loss": 0.0283,
"step": 33
},
{
"epoch": 0.0655421686746988,
"grad_norm": 0.5660741925239563,
"learning_rate": 8.461538461538462e-06,
"loss": 0.0234,
"step": 34
},
{
"epoch": 0.06746987951807229,
"grad_norm": 0.5623988509178162,
"learning_rate": 8.717948717948719e-06,
"loss": 0.0307,
"step": 35
},
{
"epoch": 0.06939759036144579,
"grad_norm": 0.5260195732116699,
"learning_rate": 8.974358974358976e-06,
"loss": 0.0264,
"step": 36
},
{
"epoch": 0.07132530120481928,
"grad_norm": 0.4934785068035126,
"learning_rate": 9.230769230769232e-06,
"loss": 0.0224,
"step": 37
},
{
"epoch": 0.07325301204819278,
"grad_norm": 0.4797322154045105,
"learning_rate": 9.487179487179487e-06,
"loss": 0.0163,
"step": 38
},
{
"epoch": 0.07518072289156627,
"grad_norm": 0.4739217460155487,
"learning_rate": 9.743589743589744e-06,
"loss": 0.0165,
"step": 39
},
{
"epoch": 0.07710843373493977,
"grad_norm": 0.4527677595615387,
"learning_rate": 1e-05,
"loss": 0.0163,
"step": 40
},
{
"epoch": 0.07903614457831325,
"grad_norm": 0.6241316795349121,
"learning_rate": 1.0256410256410256e-05,
"loss": 0.0302,
"step": 41
},
{
"epoch": 0.08096385542168674,
"grad_norm": 0.639043927192688,
"learning_rate": 1.0512820512820514e-05,
"loss": 0.0312,
"step": 42
},
{
"epoch": 0.08289156626506024,
"grad_norm": 0.5121409296989441,
"learning_rate": 1.076923076923077e-05,
"loss": 0.0256,
"step": 43
},
{
"epoch": 0.08481927710843373,
"grad_norm": 0.6340477466583252,
"learning_rate": 1.1025641025641028e-05,
"loss": 0.04,
"step": 44
},
{
"epoch": 0.08674698795180723,
"grad_norm": 0.5260409712791443,
"learning_rate": 1.1282051282051283e-05,
"loss": 0.0282,
"step": 45
},
{
"epoch": 0.08867469879518072,
"grad_norm": 0.6390711069107056,
"learning_rate": 1.1538461538461538e-05,
"loss": 0.0243,
"step": 46
},
{
"epoch": 0.09060240963855422,
"grad_norm": 0.46469295024871826,
"learning_rate": 1.1794871794871796e-05,
"loss": 0.0208,
"step": 47
},
{
"epoch": 0.09253012048192771,
"grad_norm": 0.8711516857147217,
"learning_rate": 1.2051282051282051e-05,
"loss": 0.0291,
"step": 48
},
{
"epoch": 0.09445783132530121,
"grad_norm": 0.9164300560951233,
"learning_rate": 1.230769230769231e-05,
"loss": 0.0342,
"step": 49
},
{
"epoch": 0.0963855421686747,
"grad_norm": 0.5401139259338379,
"learning_rate": 1.2564102564102565e-05,
"loss": 0.0185,
"step": 50
},
{
"epoch": 0.0983132530120482,
"grad_norm": 0.44393008947372437,
"learning_rate": 1.2820512820512823e-05,
"loss": 0.0228,
"step": 51
},
{
"epoch": 0.10024096385542168,
"grad_norm": 0.3855767846107483,
"learning_rate": 1.3076923076923078e-05,
"loss": 0.0176,
"step": 52
},
{
"epoch": 0.10216867469879518,
"grad_norm": 0.8561235070228577,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.0433,
"step": 53
},
{
"epoch": 0.10409638554216867,
"grad_norm": 0.768002450466156,
"learning_rate": 1.3589743589743592e-05,
"loss": 0.0245,
"step": 54
},
{
"epoch": 0.10602409638554217,
"grad_norm": 0.4559759497642517,
"learning_rate": 1.3846153846153847e-05,
"loss": 0.0224,
"step": 55
},
{
"epoch": 0.10795180722891566,
"grad_norm": 0.6203847527503967,
"learning_rate": 1.4102564102564105e-05,
"loss": 0.0296,
"step": 56
},
{
"epoch": 0.10987951807228916,
"grad_norm": 0.6651368141174316,
"learning_rate": 1.435897435897436e-05,
"loss": 0.0336,
"step": 57
},
{
"epoch": 0.11180722891566265,
"grad_norm": 0.377734512090683,
"learning_rate": 1.4615384615384615e-05,
"loss": 0.0196,
"step": 58
},
{
"epoch": 0.11373493975903615,
"grad_norm": 0.687568724155426,
"learning_rate": 1.4871794871794874e-05,
"loss": 0.0207,
"step": 59
},
{
"epoch": 0.11566265060240964,
"grad_norm": 0.7905604243278503,
"learning_rate": 1.5128205128205129e-05,
"loss": 0.047,
"step": 60
},
{
"epoch": 0.11759036144578314,
"grad_norm": 0.7938196063041687,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.0198,
"step": 61
},
{
"epoch": 0.11951807228915663,
"grad_norm": 0.41340553760528564,
"learning_rate": 1.5641025641025644e-05,
"loss": 0.0161,
"step": 62
},
{
"epoch": 0.12144578313253013,
"grad_norm": 0.5668172240257263,
"learning_rate": 1.5897435897435897e-05,
"loss": 0.0275,
"step": 63
},
{
"epoch": 0.12337349397590361,
"grad_norm": 0.48333367705345154,
"learning_rate": 1.6153846153846154e-05,
"loss": 0.0137,
"step": 64
},
{
"epoch": 0.12530120481927712,
"grad_norm": 0.6843933463096619,
"learning_rate": 1.641025641025641e-05,
"loss": 0.0294,
"step": 65
},
{
"epoch": 0.1272289156626506,
"grad_norm": 0.7789272665977478,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.0401,
"step": 66
},
{
"epoch": 0.1291566265060241,
"grad_norm": 0.6203492879867554,
"learning_rate": 1.6923076923076924e-05,
"loss": 0.0292,
"step": 67
},
{
"epoch": 0.1310843373493976,
"grad_norm": 0.5940662622451782,
"learning_rate": 1.717948717948718e-05,
"loss": 0.0178,
"step": 68
},
{
"epoch": 0.13301204819277107,
"grad_norm": 0.35504868626594543,
"learning_rate": 1.7435897435897438e-05,
"loss": 0.0129,
"step": 69
},
{
"epoch": 0.13493975903614458,
"grad_norm": 0.8796699643135071,
"learning_rate": 1.7692307692307694e-05,
"loss": 0.034,
"step": 70
},
{
"epoch": 0.13686746987951806,
"grad_norm": 0.967444896697998,
"learning_rate": 1.794871794871795e-05,
"loss": 0.0266,
"step": 71
},
{
"epoch": 0.13879518072289157,
"grad_norm": 0.4428526759147644,
"learning_rate": 1.8205128205128208e-05,
"loss": 0.0223,
"step": 72
},
{
"epoch": 0.14072289156626505,
"grad_norm": 0.42897751927375793,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.0187,
"step": 73
},
{
"epoch": 0.14265060240963856,
"grad_norm": 0.5100914835929871,
"learning_rate": 1.8717948717948718e-05,
"loss": 0.0164,
"step": 74
},
{
"epoch": 0.14457831325301204,
"grad_norm": 0.6028861999511719,
"learning_rate": 1.8974358974358975e-05,
"loss": 0.0164,
"step": 75
},
{
"epoch": 0.14650602409638555,
"grad_norm": 0.6187024116516113,
"learning_rate": 1.923076923076923e-05,
"loss": 0.0296,
"step": 76
},
{
"epoch": 0.14843373493975903,
"grad_norm": 0.4822489619255066,
"learning_rate": 1.9487179487179488e-05,
"loss": 0.0148,
"step": 77
},
{
"epoch": 0.15036144578313254,
"grad_norm": 0.7231149673461914,
"learning_rate": 1.9743589743589745e-05,
"loss": 0.0395,
"step": 78
},
{
"epoch": 0.15228915662650602,
"grad_norm": 0.8409642577171326,
"learning_rate": 2e-05,
"loss": 0.0446,
"step": 79
},
{
"epoch": 0.15421686746987953,
"grad_norm": 0.4883500039577484,
"learning_rate": 2.025641025641026e-05,
"loss": 0.0206,
"step": 80
},
{
"epoch": 0.156144578313253,
"grad_norm": 0.6287479400634766,
"learning_rate": 2.0512820512820512e-05,
"loss": 0.0333,
"step": 81
},
{
"epoch": 0.1580722891566265,
"grad_norm": 0.5041632652282715,
"learning_rate": 2.0769230769230772e-05,
"loss": 0.0414,
"step": 82
},
{
"epoch": 0.16,
"grad_norm": 0.5103405117988586,
"learning_rate": 2.102564102564103e-05,
"loss": 0.045,
"step": 83
},
{
"epoch": 0.16192771084337348,
"grad_norm": 0.493161678314209,
"learning_rate": 2.1282051282051285e-05,
"loss": 0.021,
"step": 84
},
{
"epoch": 0.163855421686747,
"grad_norm": 0.908843994140625,
"learning_rate": 2.153846153846154e-05,
"loss": 0.0389,
"step": 85
},
{
"epoch": 0.16578313253012048,
"grad_norm": 0.5067003965377808,
"learning_rate": 2.1794871794871795e-05,
"loss": 0.0272,
"step": 86
},
{
"epoch": 0.16771084337349398,
"grad_norm": 0.5791381597518921,
"learning_rate": 2.2051282051282056e-05,
"loss": 0.0368,
"step": 87
},
{
"epoch": 0.16963855421686747,
"grad_norm": 0.7056036591529846,
"learning_rate": 2.230769230769231e-05,
"loss": 0.0284,
"step": 88
},
{
"epoch": 0.17156626506024097,
"grad_norm": 0.6563822031021118,
"learning_rate": 2.2564102564102566e-05,
"loss": 0.0646,
"step": 89
},
{
"epoch": 0.17349397590361446,
"grad_norm": 0.9483286142349243,
"learning_rate": 2.2820512820512822e-05,
"loss": 0.0439,
"step": 90
},
{
"epoch": 0.17542168674698796,
"grad_norm": 0.370664119720459,
"learning_rate": 2.3076923076923076e-05,
"loss": 0.0109,
"step": 91
},
{
"epoch": 0.17734939759036145,
"grad_norm": 0.9776477813720703,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.0458,
"step": 92
},
{
"epoch": 0.17927710843373493,
"grad_norm": 0.45710092782974243,
"learning_rate": 2.3589743589743593e-05,
"loss": 0.0212,
"step": 93
},
{
"epoch": 0.18120481927710844,
"grad_norm": 0.8623896837234497,
"learning_rate": 2.384615384615385e-05,
"loss": 0.0215,
"step": 94
},
{
"epoch": 0.18313253012048192,
"grad_norm": 0.55814528465271,
"learning_rate": 2.4102564102564103e-05,
"loss": 0.0218,
"step": 95
},
{
"epoch": 0.18506024096385543,
"grad_norm": 0.49882641434669495,
"learning_rate": 2.435897435897436e-05,
"loss": 0.0268,
"step": 96
},
{
"epoch": 0.1869879518072289,
"grad_norm": 0.3508654534816742,
"learning_rate": 2.461538461538462e-05,
"loss": 0.0172,
"step": 97
},
{
"epoch": 0.18891566265060242,
"grad_norm": 0.601170003414154,
"learning_rate": 2.4871794871794873e-05,
"loss": 0.0208,
"step": 98
},
{
"epoch": 0.1908433734939759,
"grad_norm": 1.1748133897781372,
"learning_rate": 2.512820512820513e-05,
"loss": 0.0259,
"step": 99
},
{
"epoch": 0.1927710843373494,
"grad_norm": 0.46370384097099304,
"learning_rate": 2.5384615384615386e-05,
"loss": 0.0242,
"step": 100
},
{
"epoch": 0.1946987951807229,
"grad_norm": 0.525010883808136,
"learning_rate": 2.5641025641025646e-05,
"loss": 0.0188,
"step": 101
},
{
"epoch": 0.1966265060240964,
"grad_norm": 0.766501784324646,
"learning_rate": 2.58974358974359e-05,
"loss": 0.0584,
"step": 102
},
{
"epoch": 0.19855421686746988,
"grad_norm": 0.3572964370250702,
"learning_rate": 2.6153846153846157e-05,
"loss": 0.0131,
"step": 103
},
{
"epoch": 0.20048192771084336,
"grad_norm": 0.6467130780220032,
"learning_rate": 2.6410256410256413e-05,
"loss": 0.0231,
"step": 104
},
{
"epoch": 0.20240963855421687,
"grad_norm": 1.1852102279663086,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.027,
"step": 105
},
{
"epoch": 0.20433734939759035,
"grad_norm": 2.3659932613372803,
"learning_rate": 2.6923076923076927e-05,
"loss": 0.0224,
"step": 106
},
{
"epoch": 0.20626506024096386,
"grad_norm": 0.5343687534332275,
"learning_rate": 2.7179487179487183e-05,
"loss": 0.0198,
"step": 107
},
{
"epoch": 0.20819277108433734,
"grad_norm": 1.852160096168518,
"learning_rate": 2.7435897435897437e-05,
"loss": 0.032,
"step": 108
},
{
"epoch": 0.21012048192771085,
"grad_norm": 0.47291702032089233,
"learning_rate": 2.7692307692307694e-05,
"loss": 0.0117,
"step": 109
},
{
"epoch": 0.21204819277108433,
"grad_norm": 0.7623187899589539,
"learning_rate": 2.794871794871795e-05,
"loss": 0.0337,
"step": 110
},
{
"epoch": 0.21397590361445784,
"grad_norm": 0.5272570848464966,
"learning_rate": 2.820512820512821e-05,
"loss": 0.0131,
"step": 111
},
{
"epoch": 0.21590361445783132,
"grad_norm": 0.5568500757217407,
"learning_rate": 2.8461538461538464e-05,
"loss": 0.0233,
"step": 112
},
{
"epoch": 0.21783132530120483,
"grad_norm": 0.4008469879627228,
"learning_rate": 2.871794871794872e-05,
"loss": 0.0204,
"step": 113
},
{
"epoch": 0.2197590361445783,
"grad_norm": 0.4888612926006317,
"learning_rate": 2.8974358974358977e-05,
"loss": 0.016,
"step": 114
},
{
"epoch": 0.2216867469879518,
"grad_norm": 0.44903355836868286,
"learning_rate": 2.923076923076923e-05,
"loss": 0.0135,
"step": 115
},
{
"epoch": 0.2236144578313253,
"grad_norm": 0.9266762733459473,
"learning_rate": 2.948717948717949e-05,
"loss": 0.0233,
"step": 116
},
{
"epoch": 0.22554216867469878,
"grad_norm": 0.5352638959884644,
"learning_rate": 2.9743589743589747e-05,
"loss": 0.0198,
"step": 117
},
{
"epoch": 0.2274698795180723,
"grad_norm": 0.6051343679428101,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.0246,
"step": 118
},
{
"epoch": 0.22939759036144577,
"grad_norm": 0.9971133470535278,
"learning_rate": 3.0256410256410257e-05,
"loss": 0.025,
"step": 119
},
{
"epoch": 0.23132530120481928,
"grad_norm": 0.704236626625061,
"learning_rate": 3.0512820512820514e-05,
"loss": 0.031,
"step": 120
},
{
"epoch": 0.23325301204819276,
"grad_norm": 0.6137097477912903,
"learning_rate": 3.0769230769230774e-05,
"loss": 0.0519,
"step": 121
},
{
"epoch": 0.23518072289156627,
"grad_norm": 0.7396159768104553,
"learning_rate": 3.102564102564103e-05,
"loss": 0.0325,
"step": 122
},
{
"epoch": 0.23710843373493976,
"grad_norm": 1.3282053470611572,
"learning_rate": 3.128205128205129e-05,
"loss": 0.0252,
"step": 123
},
{
"epoch": 0.23903614457831326,
"grad_norm": 0.5220731496810913,
"learning_rate": 3.153846153846154e-05,
"loss": 0.0262,
"step": 124
},
{
"epoch": 0.24096385542168675,
"grad_norm": 0.5357242822647095,
"learning_rate": 3.1794871794871795e-05,
"loss": 0.0243,
"step": 125
},
{
"epoch": 0.24289156626506025,
"grad_norm": 0.48207753896713257,
"learning_rate": 3.205128205128206e-05,
"loss": 0.0178,
"step": 126
},
{
"epoch": 0.24481927710843374,
"grad_norm": 0.552988588809967,
"learning_rate": 3.230769230769231e-05,
"loss": 0.023,
"step": 127
},
{
"epoch": 0.24674698795180722,
"grad_norm": 1.7962840795516968,
"learning_rate": 3.2564102564102565e-05,
"loss": 0.032,
"step": 128
},
{
"epoch": 0.24867469879518073,
"grad_norm": 1.6404600143432617,
"learning_rate": 3.282051282051282e-05,
"loss": 0.0231,
"step": 129
},
{
"epoch": 0.25060240963855424,
"grad_norm": 0.39142486453056335,
"learning_rate": 3.307692307692308e-05,
"loss": 0.0147,
"step": 130
},
{
"epoch": 0.2525301204819277,
"grad_norm": 1.3272887468338013,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.0439,
"step": 131
},
{
"epoch": 0.2544578313253012,
"grad_norm": 1.5122811794281006,
"learning_rate": 3.358974358974359e-05,
"loss": 0.0282,
"step": 132
},
{
"epoch": 0.2563855421686747,
"grad_norm": 1.8542430400848389,
"learning_rate": 3.384615384615385e-05,
"loss": 0.0515,
"step": 133
},
{
"epoch": 0.2583132530120482,
"grad_norm": 4.059277534484863,
"learning_rate": 3.4102564102564105e-05,
"loss": 0.0781,
"step": 134
},
{
"epoch": 0.26024096385542167,
"grad_norm": 0.6206214427947998,
"learning_rate": 3.435897435897436e-05,
"loss": 0.0306,
"step": 135
},
{
"epoch": 0.2621686746987952,
"grad_norm": 0.4575510323047638,
"learning_rate": 3.461538461538462e-05,
"loss": 0.0154,
"step": 136
},
{
"epoch": 0.2640963855421687,
"grad_norm": 1.1556978225708008,
"learning_rate": 3.4871794871794875e-05,
"loss": 0.0235,
"step": 137
},
{
"epoch": 0.26602409638554214,
"grad_norm": 0.6975051760673523,
"learning_rate": 3.512820512820513e-05,
"loss": 0.0453,
"step": 138
},
{
"epoch": 0.26795180722891565,
"grad_norm": 0.8686623573303223,
"learning_rate": 3.538461538461539e-05,
"loss": 0.0427,
"step": 139
},
{
"epoch": 0.26987951807228916,
"grad_norm": 2.0681848526000977,
"learning_rate": 3.5641025641025646e-05,
"loss": 0.04,
"step": 140
},
{
"epoch": 0.27180722891566267,
"grad_norm": 0.4397984445095062,
"learning_rate": 3.58974358974359e-05,
"loss": 0.0188,
"step": 141
},
{
"epoch": 0.2737349397590361,
"grad_norm": 0.5871334075927734,
"learning_rate": 3.615384615384616e-05,
"loss": 0.0253,
"step": 142
},
{
"epoch": 0.27566265060240963,
"grad_norm": 1.1078568696975708,
"learning_rate": 3.6410256410256416e-05,
"loss": 0.0316,
"step": 143
},
{
"epoch": 0.27759036144578314,
"grad_norm": 0.5691841840744019,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.0266,
"step": 144
},
{
"epoch": 0.27951807228915665,
"grad_norm": 0.7896255254745483,
"learning_rate": 3.692307692307693e-05,
"loss": 0.0281,
"step": 145
},
{
"epoch": 0.2814457831325301,
"grad_norm": 0.9988337159156799,
"learning_rate": 3.7179487179487186e-05,
"loss": 0.0295,
"step": 146
},
{
"epoch": 0.2833734939759036,
"grad_norm": 0.9811834692955017,
"learning_rate": 3.7435897435897436e-05,
"loss": 0.0322,
"step": 147
},
{
"epoch": 0.2853012048192771,
"grad_norm": 0.6503105759620667,
"learning_rate": 3.769230769230769e-05,
"loss": 0.0266,
"step": 148
},
{
"epoch": 0.28722891566265063,
"grad_norm": 1.9164355993270874,
"learning_rate": 3.794871794871795e-05,
"loss": 0.0677,
"step": 149
},
{
"epoch": 0.2891566265060241,
"grad_norm": 1.1724557876586914,
"learning_rate": 3.820512820512821e-05,
"loss": 0.0324,
"step": 150
},
{
"epoch": 0.2910843373493976,
"grad_norm": 0.8482469916343689,
"learning_rate": 3.846153846153846e-05,
"loss": 0.0259,
"step": 151
},
{
"epoch": 0.2930120481927711,
"grad_norm": 0.8572830557823181,
"learning_rate": 3.871794871794872e-05,
"loss": 0.0358,
"step": 152
},
{
"epoch": 0.29493975903614456,
"grad_norm": 0.6630825400352478,
"learning_rate": 3.8974358974358976e-05,
"loss": 0.0447,
"step": 153
},
{
"epoch": 0.29686746987951806,
"grad_norm": 0.9197093844413757,
"learning_rate": 3.923076923076923e-05,
"loss": 0.0409,
"step": 154
},
{
"epoch": 0.2987951807228916,
"grad_norm": 0.6976819634437561,
"learning_rate": 3.948717948717949e-05,
"loss": 0.0317,
"step": 155
},
{
"epoch": 0.3007228915662651,
"grad_norm": 0.7353514432907104,
"learning_rate": 3.9743589743589747e-05,
"loss": 0.0306,
"step": 156
},
{
"epoch": 0.30265060240963854,
"grad_norm": 0.5730232000350952,
"learning_rate": 4e-05,
"loss": 0.0324,
"step": 157
},
{
"epoch": 0.30457831325301205,
"grad_norm": 0.7852078676223755,
"learning_rate": 3.999994971675547e-05,
"loss": 0.0354,
"step": 158
},
{
"epoch": 0.30650602409638555,
"grad_norm": 0.5924715399742126,
"learning_rate": 3.999979886727471e-05,
"loss": 0.0366,
"step": 159
},
{
"epoch": 0.30843373493975906,
"grad_norm": 0.7359845638275146,
"learning_rate": 3.999954745231624e-05,
"loss": 0.0437,
"step": 160
},
{
"epoch": 0.3103614457831325,
"grad_norm": 0.7866976857185364,
"learning_rate": 3.999919547314426e-05,
"loss": 0.0363,
"step": 161
},
{
"epoch": 0.312289156626506,
"grad_norm": 0.7425745129585266,
"learning_rate": 3.999874293152863e-05,
"loss": 0.0259,
"step": 162
},
{
"epoch": 0.31421686746987953,
"grad_norm": 1.8922245502471924,
"learning_rate": 3.9998189829744885e-05,
"loss": 0.0341,
"step": 163
},
{
"epoch": 0.316144578313253,
"grad_norm": 0.7908634543418884,
"learning_rate": 3.99975361705742e-05,
"loss": 0.0424,
"step": 164
},
{
"epoch": 0.3180722891566265,
"grad_norm": 2.047368049621582,
"learning_rate": 3.999678195730337e-05,
"loss": 0.0535,
"step": 165
},
{
"epoch": 0.32,
"grad_norm": 0.5702639222145081,
"learning_rate": 3.999592719372484e-05,
"loss": 0.0284,
"step": 166
},
{
"epoch": 0.3219277108433735,
"grad_norm": 0.45015648007392883,
"learning_rate": 3.9994971884136636e-05,
"loss": 0.0313,
"step": 167
},
{
"epoch": 0.32385542168674697,
"grad_norm": 4.094679355621338,
"learning_rate": 3.9993916033342355e-05,
"loss": 0.0524,
"step": 168
},
{
"epoch": 0.3257831325301205,
"grad_norm": 0.800846517086029,
"learning_rate": 3.999275964665117e-05,
"loss": 0.0282,
"step": 169
},
{
"epoch": 0.327710843373494,
"grad_norm": 0.47881078720092773,
"learning_rate": 3.999150272987776e-05,
"loss": 0.0293,
"step": 170
},
{
"epoch": 0.3296385542168675,
"grad_norm": 0.5716657638549805,
"learning_rate": 3.999014528934232e-05,
"loss": 0.0221,
"step": 171
},
{
"epoch": 0.33156626506024095,
"grad_norm": 0.6333311200141907,
"learning_rate": 3.998868733187048e-05,
"loss": 0.0302,
"step": 172
},
{
"epoch": 0.33349397590361446,
"grad_norm": 6.642521858215332,
"learning_rate": 3.998712886479335e-05,
"loss": 0.0364,
"step": 173
},
{
"epoch": 0.33542168674698797,
"grad_norm": 0.7515506148338318,
"learning_rate": 3.998546989594739e-05,
"loss": 0.0296,
"step": 174
},
{
"epoch": 0.3373493975903614,
"grad_norm": 1.0728015899658203,
"learning_rate": 3.998371043367445e-05,
"loss": 0.0549,
"step": 175
},
{
"epoch": 0.33927710843373493,
"grad_norm": 1.3025579452514648,
"learning_rate": 3.998185048682166e-05,
"loss": 0.0577,
"step": 176
},
{
"epoch": 0.34120481927710844,
"grad_norm": 1.0962958335876465,
"learning_rate": 3.997989006474144e-05,
"loss": 0.0313,
"step": 177
},
{
"epoch": 0.34313253012048195,
"grad_norm": 0.7064313292503357,
"learning_rate": 3.997782917729143e-05,
"loss": 0.0309,
"step": 178
},
{
"epoch": 0.3450602409638554,
"grad_norm": 0.43374207615852356,
"learning_rate": 3.997566783483445e-05,
"loss": 0.0166,
"step": 179
},
{
"epoch": 0.3469879518072289,
"grad_norm": 0.7236390113830566,
"learning_rate": 3.9973406048238413e-05,
"loss": 0.0254,
"step": 180
},
{
"epoch": 0.3489156626506024,
"grad_norm": 0.5041500926017761,
"learning_rate": 3.9971043828876334e-05,
"loss": 0.0239,
"step": 181
},
{
"epoch": 0.35084337349397593,
"grad_norm": 1.2744532823562622,
"learning_rate": 3.9968581188626204e-05,
"loss": 0.0404,
"step": 182
},
{
"epoch": 0.3527710843373494,
"grad_norm": 0.45845362544059753,
"learning_rate": 3.996601813987098e-05,
"loss": 0.0127,
"step": 183
},
{
"epoch": 0.3546987951807229,
"grad_norm": 0.4426881968975067,
"learning_rate": 3.996335469549852e-05,
"loss": 0.0176,
"step": 184
},
{
"epoch": 0.3566265060240964,
"grad_norm": 1.0030732154846191,
"learning_rate": 3.9960590868901465e-05,
"loss": 0.0457,
"step": 185
},
{
"epoch": 0.35855421686746985,
"grad_norm": 0.6428582668304443,
"learning_rate": 3.995772667397725e-05,
"loss": 0.0271,
"step": 186
},
{
"epoch": 0.36048192771084336,
"grad_norm": 0.5335744619369507,
"learning_rate": 3.995476212512795e-05,
"loss": 0.0297,
"step": 187
},
{
"epoch": 0.3624096385542169,
"grad_norm": 0.6995761394500732,
"learning_rate": 3.99516972372603e-05,
"loss": 0.0322,
"step": 188
},
{
"epoch": 0.3643373493975904,
"grad_norm": 0.765511155128479,
"learning_rate": 3.9948532025785546e-05,
"loss": 0.0253,
"step": 189
},
{
"epoch": 0.36626506024096384,
"grad_norm": 0.6165828108787537,
"learning_rate": 3.9945266506619403e-05,
"loss": 0.0355,
"step": 190
},
{
"epoch": 0.36819277108433734,
"grad_norm": 0.851970911026001,
"learning_rate": 3.994190069618195e-05,
"loss": 0.056,
"step": 191
},
{
"epoch": 0.37012048192771085,
"grad_norm": 0.9850023984909058,
"learning_rate": 3.993843461139757e-05,
"loss": 0.0415,
"step": 192
},
{
"epoch": 0.37204819277108436,
"grad_norm": 0.7455295324325562,
"learning_rate": 3.9934868269694886e-05,
"loss": 0.0379,
"step": 193
},
{
"epoch": 0.3739759036144578,
"grad_norm": 1.159469723701477,
"learning_rate": 3.9931201689006595e-05,
"loss": 0.0237,
"step": 194
},
{
"epoch": 0.3759036144578313,
"grad_norm": 0.5490080118179321,
"learning_rate": 3.992743488776947e-05,
"loss": 0.024,
"step": 195
},
{
"epoch": 0.37783132530120483,
"grad_norm": 1.279831886291504,
"learning_rate": 3.992356788492421e-05,
"loss": 0.0273,
"step": 196
},
{
"epoch": 0.3797590361445783,
"grad_norm": 0.859104335308075,
"learning_rate": 3.9919600699915355e-05,
"loss": 0.0411,
"step": 197
},
{
"epoch": 0.3816867469879518,
"grad_norm": 1.2525300979614258,
"learning_rate": 3.991553335269119e-05,
"loss": 0.0857,
"step": 198
},
{
"epoch": 0.3836144578313253,
"grad_norm": 0.4924193024635315,
"learning_rate": 3.991136586370367e-05,
"loss": 0.0294,
"step": 199
},
{
"epoch": 0.3855421686746988,
"grad_norm": 1.417190670967102,
"learning_rate": 3.990709825390828e-05,
"loss": 0.0395,
"step": 200
},
{
"epoch": 0.38746987951807227,
"grad_norm": 0.6172056198120117,
"learning_rate": 3.9902730544763936e-05,
"loss": 0.0194,
"step": 201
},
{
"epoch": 0.3893975903614458,
"grad_norm": 0.7292149662971497,
"learning_rate": 3.989826275823291e-05,
"loss": 0.0381,
"step": 202
},
{
"epoch": 0.3913253012048193,
"grad_norm": 0.5949816107749939,
"learning_rate": 3.989369491678067e-05,
"loss": 0.0254,
"step": 203
},
{
"epoch": 0.3932530120481928,
"grad_norm": 0.6012582182884216,
"learning_rate": 3.988902704337582e-05,
"loss": 0.048,
"step": 204
},
{
"epoch": 0.39518072289156625,
"grad_norm": 0.6273590922355652,
"learning_rate": 3.9884259161489936e-05,
"loss": 0.0268,
"step": 205
},
{
"epoch": 0.39710843373493976,
"grad_norm": 0.9615244269371033,
"learning_rate": 3.987939129509746e-05,
"loss": 0.0192,
"step": 206
},
{
"epoch": 0.39903614457831327,
"grad_norm": 0.6009241342544556,
"learning_rate": 3.9874423468675624e-05,
"loss": 0.0362,
"step": 207
},
{
"epoch": 0.4009638554216867,
"grad_norm": 0.411335289478302,
"learning_rate": 3.9869355707204266e-05,
"loss": 0.017,
"step": 208
},
{
"epoch": 0.40289156626506023,
"grad_norm": 0.6151527166366577,
"learning_rate": 3.986418803616573e-05,
"loss": 0.0283,
"step": 209
},
{
"epoch": 0.40481927710843374,
"grad_norm": 0.33808204531669617,
"learning_rate": 3.985892048154474e-05,
"loss": 0.0158,
"step": 210
},
{
"epoch": 0.40674698795180725,
"grad_norm": 0.5464187860488892,
"learning_rate": 3.9853553069828284e-05,
"loss": 0.0292,
"step": 211
},
{
"epoch": 0.4086746987951807,
"grad_norm": 0.6658390760421753,
"learning_rate": 3.984808582800543e-05,
"loss": 0.0281,
"step": 212
},
{
"epoch": 0.4106024096385542,
"grad_norm": 0.4253764748573303,
"learning_rate": 3.984251878356726e-05,
"loss": 0.031,
"step": 213
},
{
"epoch": 0.4125301204819277,
"grad_norm": 0.32309481501579285,
"learning_rate": 3.983685196450667e-05,
"loss": 0.0166,
"step": 214
},
{
"epoch": 0.41445783132530123,
"grad_norm": 0.43756410479545593,
"learning_rate": 3.9831085399318265e-05,
"loss": 0.0326,
"step": 215
},
{
"epoch": 0.4163855421686747,
"grad_norm": 0.264046847820282,
"learning_rate": 3.982521911699822e-05,
"loss": 0.0118,
"step": 216
},
{
"epoch": 0.4183132530120482,
"grad_norm": 0.8630897402763367,
"learning_rate": 3.9819253147044084e-05,
"loss": 0.0246,
"step": 217
},
{
"epoch": 0.4202409638554217,
"grad_norm": 0.6923379898071289,
"learning_rate": 3.98131875194547e-05,
"loss": 0.036,
"step": 218
},
{
"epoch": 0.42216867469879515,
"grad_norm": 0.5874778628349304,
"learning_rate": 3.9807022264730024e-05,
"loss": 0.0255,
"step": 219
},
{
"epoch": 0.42409638554216866,
"grad_norm": 0.394336074590683,
"learning_rate": 3.980075741387094e-05,
"loss": 0.0187,
"step": 220
},
{
"epoch": 0.4260240963855422,
"grad_norm": 0.6300327777862549,
"learning_rate": 3.979439299837915e-05,
"loss": 0.0214,
"step": 221
},
{
"epoch": 0.4279518072289157,
"grad_norm": 0.5200467109680176,
"learning_rate": 3.978792905025702e-05,
"loss": 0.0628,
"step": 222
},
{
"epoch": 0.42987951807228914,
"grad_norm": 0.5713880062103271,
"learning_rate": 3.978136560200735e-05,
"loss": 0.0302,
"step": 223
},
{
"epoch": 0.43180722891566264,
"grad_norm": 0.5345383286476135,
"learning_rate": 3.977470268663331e-05,
"loss": 0.0125,
"step": 224
},
{
"epoch": 0.43373493975903615,
"grad_norm": 0.5378350019454956,
"learning_rate": 3.976794033763819e-05,
"loss": 0.0246,
"step": 225
},
{
"epoch": 0.43566265060240966,
"grad_norm": 0.5554935336112976,
"learning_rate": 3.9761078589025276e-05,
"loss": 0.0212,
"step": 226
},
{
"epoch": 0.4375903614457831,
"grad_norm": 0.2832634747028351,
"learning_rate": 3.9754117475297664e-05,
"loss": 0.0125,
"step": 227
},
{
"epoch": 0.4395180722891566,
"grad_norm": 1.2910150289535522,
"learning_rate": 3.97470570314581e-05,
"loss": 0.0364,
"step": 228
},
{
"epoch": 0.44144578313253013,
"grad_norm": 0.3731018602848053,
"learning_rate": 3.973989729300878e-05,
"loss": 0.0128,
"step": 229
},
{
"epoch": 0.4433734939759036,
"grad_norm": 0.9433871507644653,
"learning_rate": 3.9732638295951195e-05,
"loss": 0.0367,
"step": 230
},
{
"epoch": 0.4453012048192771,
"grad_norm": 1.0779197216033936,
"learning_rate": 3.972528007678594e-05,
"loss": 0.0667,
"step": 231
},
{
"epoch": 0.4472289156626506,
"grad_norm": 1.7009105682373047,
"learning_rate": 3.9717822672512516e-05,
"loss": 0.0655,
"step": 232
},
{
"epoch": 0.4491566265060241,
"grad_norm": 0.5646032094955444,
"learning_rate": 3.971026612062919e-05,
"loss": 0.064,
"step": 233
},
{
"epoch": 0.45108433734939757,
"grad_norm": 0.44474121928215027,
"learning_rate": 3.970261045913274e-05,
"loss": 0.0206,
"step": 234
},
{
"epoch": 0.4530120481927711,
"grad_norm": 1.3969277143478394,
"learning_rate": 3.969485572651833e-05,
"loss": 0.0486,
"step": 235
},
{
"epoch": 0.4549397590361446,
"grad_norm": 0.6401994228363037,
"learning_rate": 3.968700196177925e-05,
"loss": 0.0262,
"step": 236
},
{
"epoch": 0.4568674698795181,
"grad_norm": 0.7091913223266602,
"learning_rate": 3.96790492044068e-05,
"loss": 0.014,
"step": 237
},
{
"epoch": 0.45879518072289155,
"grad_norm": 0.6561547517776489,
"learning_rate": 3.967099749439002e-05,
"loss": 0.0482,
"step": 238
},
{
"epoch": 0.46072289156626506,
"grad_norm": 0.6924155354499817,
"learning_rate": 3.966284687221551e-05,
"loss": 0.0289,
"step": 239
},
{
"epoch": 0.46265060240963857,
"grad_norm": 0.5868663787841797,
"learning_rate": 3.9654597378867256e-05,
"loss": 0.0331,
"step": 240
},
{
"epoch": 0.464578313253012,
"grad_norm": 0.7930939793586731,
"learning_rate": 3.964624905582637e-05,
"loss": 0.0925,
"step": 241
},
{
"epoch": 0.46650602409638553,
"grad_norm": 0.4888836145401001,
"learning_rate": 3.9637801945070944e-05,
"loss": 0.015,
"step": 242
},
{
"epoch": 0.46843373493975904,
"grad_norm": 0.7820287346839905,
"learning_rate": 3.962925608907579e-05,
"loss": 0.0382,
"step": 243
},
{
"epoch": 0.47036144578313255,
"grad_norm": 0.4914316236972809,
"learning_rate": 3.962061153081224e-05,
"loss": 0.0257,
"step": 244
},
{
"epoch": 0.472289156626506,
"grad_norm": 0.5681505799293518,
"learning_rate": 3.961186831374793e-05,
"loss": 0.0551,
"step": 245
},
{
"epoch": 0.4742168674698795,
"grad_norm": 0.5049723386764526,
"learning_rate": 3.9603026481846616e-05,
"loss": 0.0186,
"step": 246
},
{
"epoch": 0.476144578313253,
"grad_norm": 0.5034119486808777,
"learning_rate": 3.959408607956787e-05,
"loss": 0.024,
"step": 247
},
{
"epoch": 0.47807228915662653,
"grad_norm": 0.4543336033821106,
"learning_rate": 3.958504715186695e-05,
"loss": 0.0256,
"step": 248
},
{
"epoch": 0.48,
"grad_norm": 0.5595743656158447,
"learning_rate": 3.957590974419452e-05,
"loss": 0.0222,
"step": 249
},
{
"epoch": 0.4819277108433735,
"grad_norm": 0.5701581239700317,
"learning_rate": 3.956667390249642e-05,
"loss": 0.0334,
"step": 250
},
{
"epoch": 0.483855421686747,
"grad_norm": 0.53755784034729,
"learning_rate": 3.9557339673213474e-05,
"loss": 0.0345,
"step": 251
},
{
"epoch": 0.4857831325301205,
"grad_norm": 0.4368877112865448,
"learning_rate": 3.95479071032812e-05,
"loss": 0.0183,
"step": 252
},
{
"epoch": 0.48771084337349396,
"grad_norm": 0.7972906827926636,
"learning_rate": 3.953837624012963e-05,
"loss": 0.0337,
"step": 253
},
{
"epoch": 0.48963855421686747,
"grad_norm": 0.6148451566696167,
"learning_rate": 3.9528747131683023e-05,
"loss": 0.0524,
"step": 254
},
{
"epoch": 0.491566265060241,
"grad_norm": 0.500840961933136,
"learning_rate": 3.9519019826359676e-05,
"loss": 0.0248,
"step": 255
},
{
"epoch": 0.49349397590361443,
"grad_norm": 0.5536255240440369,
"learning_rate": 3.9509194373071624e-05,
"loss": 0.0219,
"step": 256
},
{
"epoch": 0.49542168674698794,
"grad_norm": 0.6873176097869873,
"learning_rate": 3.9499270821224444e-05,
"loss": 0.0312,
"step": 257
},
{
"epoch": 0.49734939759036145,
"grad_norm": 0.37207168340682983,
"learning_rate": 3.9489249220716974e-05,
"loss": 0.0149,
"step": 258
},
{
"epoch": 0.49927710843373496,
"grad_norm": 0.4458799660205841,
"learning_rate": 3.947912962194107e-05,
"loss": 0.0214,
"step": 259
},
{
"epoch": 0.5012048192771085,
"grad_norm": 0.4272724390029907,
"learning_rate": 3.9468912075781345e-05,
"loss": 0.0263,
"step": 260
},
{
"epoch": 0.503132530120482,
"grad_norm": 0.5245792269706726,
"learning_rate": 3.945859663361496e-05,
"loss": 0.0103,
"step": 261
},
{
"epoch": 0.5050602409638554,
"grad_norm": 0.8799260854721069,
"learning_rate": 3.9448183347311284e-05,
"loss": 0.0292,
"step": 262
},
{
"epoch": 0.5069879518072289,
"grad_norm": 0.5996833443641663,
"learning_rate": 3.943767226923171e-05,
"loss": 0.0306,
"step": 263
},
{
"epoch": 0.5089156626506024,
"grad_norm": 0.6044682860374451,
"learning_rate": 3.942706345222935e-05,
"loss": 0.0218,
"step": 264
},
{
"epoch": 0.5108433734939759,
"grad_norm": 0.4770200848579407,
"learning_rate": 3.941635694964878e-05,
"loss": 0.0226,
"step": 265
},
{
"epoch": 0.5127710843373494,
"grad_norm": 0.5605704188346863,
"learning_rate": 3.940555281532576e-05,
"loss": 0.0354,
"step": 266
},
{
"epoch": 0.5146987951807229,
"grad_norm": 0.46532443165779114,
"learning_rate": 3.939465110358699e-05,
"loss": 0.0223,
"step": 267
},
{
"epoch": 0.5166265060240964,
"grad_norm": 0.5190595388412476,
"learning_rate": 3.93836518692498e-05,
"loss": 0.0219,
"step": 268
},
{
"epoch": 0.5185542168674698,
"grad_norm": 0.5767757892608643,
"learning_rate": 3.937255516762193e-05,
"loss": 0.0294,
"step": 269
},
{
"epoch": 0.5204819277108433,
"grad_norm": 0.4543164372444153,
"learning_rate": 3.936136105450119e-05,
"loss": 0.0244,
"step": 270
},
{
"epoch": 0.5224096385542168,
"grad_norm": 0.4155154526233673,
"learning_rate": 3.9350069586175195e-05,
"loss": 0.02,
"step": 271
},
{
"epoch": 0.5243373493975904,
"grad_norm": 0.5470768213272095,
"learning_rate": 3.933868081942113e-05,
"loss": 0.0187,
"step": 272
},
{
"epoch": 0.5262650602409639,
"grad_norm": 0.9491772651672363,
"learning_rate": 3.9327194811505406e-05,
"loss": 0.0337,
"step": 273
},
{
"epoch": 0.5281927710843374,
"grad_norm": 0.9313873052597046,
"learning_rate": 3.93156116201834e-05,
"loss": 0.0573,
"step": 274
},
{
"epoch": 0.5301204819277109,
"grad_norm": 0.7181005477905273,
"learning_rate": 3.930393130369915e-05,
"loss": 0.0405,
"step": 275
},
{
"epoch": 0.5320481927710843,
"grad_norm": 0.34231385588645935,
"learning_rate": 3.9292153920785076e-05,
"loss": 0.0153,
"step": 276
},
{
"epoch": 0.5339759036144578,
"grad_norm": 0.6899610161781311,
"learning_rate": 3.928027953066168e-05,
"loss": 0.0338,
"step": 277
},
{
"epoch": 0.5359036144578313,
"grad_norm": 0.7509781718254089,
"learning_rate": 3.926830819303726e-05,
"loss": 0.0416,
"step": 278
},
{
"epoch": 0.5378313253012048,
"grad_norm": 0.6326774954795837,
"learning_rate": 3.925623996810757e-05,
"loss": 0.0293,
"step": 279
},
{
"epoch": 0.5397590361445783,
"grad_norm": 0.5543203353881836,
"learning_rate": 3.924407491655557e-05,
"loss": 0.0263,
"step": 280
},
{
"epoch": 0.5416867469879518,
"grad_norm": 0.5367572903633118,
"learning_rate": 3.9231813099551086e-05,
"loss": 0.0276,
"step": 281
},
{
"epoch": 0.5436144578313253,
"grad_norm": 0.3143869638442993,
"learning_rate": 3.921945457875051e-05,
"loss": 0.0146,
"step": 282
},
{
"epoch": 0.5455421686746988,
"grad_norm": 0.47403043508529663,
"learning_rate": 3.920699941629649e-05,
"loss": 0.0267,
"step": 283
},
{
"epoch": 0.5474698795180722,
"grad_norm": 0.5082595348358154,
"learning_rate": 3.919444767481763e-05,
"loss": 0.0183,
"step": 284
},
{
"epoch": 0.5493975903614458,
"grad_norm": 0.747949481010437,
"learning_rate": 3.918179941742816e-05,
"loss": 0.0412,
"step": 285
},
{
"epoch": 0.5513253012048193,
"grad_norm": 0.6553886532783508,
"learning_rate": 3.916905470772762e-05,
"loss": 0.0505,
"step": 286
},
{
"epoch": 0.5532530120481928,
"grad_norm": 0.3838176131248474,
"learning_rate": 3.9156213609800545e-05,
"loss": 0.0156,
"step": 287
},
{
"epoch": 0.5551807228915663,
"grad_norm": 0.7427731156349182,
"learning_rate": 3.914327618821614e-05,
"loss": 0.0278,
"step": 288
},
{
"epoch": 0.5571084337349398,
"grad_norm": 0.2612821161746979,
"learning_rate": 3.913024250802796e-05,
"loss": 0.0101,
"step": 289
},
{
"epoch": 0.5590361445783133,
"grad_norm": 0.3799416124820709,
"learning_rate": 3.911711263477357e-05,
"loss": 0.0168,
"step": 290
},
{
"epoch": 0.5609638554216867,
"grad_norm": 0.5053854584693909,
"learning_rate": 3.910388663447425e-05,
"loss": 0.0249,
"step": 291
},
{
"epoch": 0.5628915662650602,
"grad_norm": 0.38095012307167053,
"learning_rate": 3.909056457363461e-05,
"loss": 0.0156,
"step": 292
},
{
"epoch": 0.5648192771084337,
"grad_norm": 0.4477892220020294,
"learning_rate": 3.907714651924229e-05,
"loss": 0.0309,
"step": 293
},
{
"epoch": 0.5667469879518072,
"grad_norm": 0.5875864624977112,
"learning_rate": 3.906363253876763e-05,
"loss": 0.0287,
"step": 294
},
{
"epoch": 0.5686746987951807,
"grad_norm": 0.522990882396698,
"learning_rate": 3.90500227001633e-05,
"loss": 0.0318,
"step": 295
},
{
"epoch": 0.5706024096385542,
"grad_norm": 0.4153876304626465,
"learning_rate": 3.9036317071863994e-05,
"loss": 0.0192,
"step": 296
},
{
"epoch": 0.5725301204819278,
"grad_norm": 0.4675769507884979,
"learning_rate": 3.902251572278605e-05,
"loss": 0.067,
"step": 297
},
{
"epoch": 0.5744578313253013,
"grad_norm": 0.35778650641441345,
"learning_rate": 3.900861872232713e-05,
"loss": 0.0197,
"step": 298
},
{
"epoch": 0.5763855421686747,
"grad_norm": 0.7382330894470215,
"learning_rate": 3.899462614036587e-05,
"loss": 0.0283,
"step": 299
},
{
"epoch": 0.5783132530120482,
"grad_norm": 0.41268599033355713,
"learning_rate": 3.89805380472615e-05,
"loss": 0.0207,
"step": 300
},
{
"epoch": 0.5802409638554217,
"grad_norm": 1.2013020515441895,
"learning_rate": 3.8966354513853535e-05,
"loss": 0.0301,
"step": 301
},
{
"epoch": 0.5821686746987952,
"grad_norm": 0.424757719039917,
"learning_rate": 3.895207561146137e-05,
"loss": 0.022,
"step": 302
},
{
"epoch": 0.5840963855421687,
"grad_norm": 0.4196677505970001,
"learning_rate": 3.893770141188396e-05,
"loss": 0.0424,
"step": 303
},
{
"epoch": 0.5860240963855422,
"grad_norm": 0.8644190430641174,
"learning_rate": 3.892323198739946e-05,
"loss": 0.08,
"step": 304
},
{
"epoch": 0.5879518072289157,
"grad_norm": 0.5645135045051575,
"learning_rate": 3.890866741076482e-05,
"loss": 0.0152,
"step": 305
},
{
"epoch": 0.5898795180722891,
"grad_norm": 0.5218387246131897,
"learning_rate": 3.889400775521545e-05,
"loss": 0.0205,
"step": 306
},
{
"epoch": 0.5918072289156626,
"grad_norm": 0.39709413051605225,
"learning_rate": 3.8879253094464865e-05,
"loss": 0.0233,
"step": 307
},
{
"epoch": 0.5937349397590361,
"grad_norm": 0.3572910726070404,
"learning_rate": 3.8864403502704285e-05,
"loss": 0.0198,
"step": 308
},
{
"epoch": 0.5956626506024096,
"grad_norm": 0.382709264755249,
"learning_rate": 3.8849459054602274e-05,
"loss": 0.0176,
"step": 309
},
{
"epoch": 0.5975903614457831,
"grad_norm": 3.4527227878570557,
"learning_rate": 3.883441982530436e-05,
"loss": 0.0239,
"step": 310
},
{
"epoch": 0.5995180722891567,
"grad_norm": 0.4467569589614868,
"learning_rate": 3.8819285890432674e-05,
"loss": 0.0284,
"step": 311
},
{
"epoch": 0.6014457831325302,
"grad_norm": 0.44513460993766785,
"learning_rate": 3.880405732608555e-05,
"loss": 0.0233,
"step": 312
},
{
"epoch": 0.6033734939759036,
"grad_norm": 0.8029689192771912,
"learning_rate": 3.8788734208837155e-05,
"loss": 0.0433,
"step": 313
},
{
"epoch": 0.6053012048192771,
"grad_norm": 0.7291454076766968,
"learning_rate": 3.877331661573709e-05,
"loss": 0.043,
"step": 314
},
{
"epoch": 0.6072289156626506,
"grad_norm": 0.6050467491149902,
"learning_rate": 3.8757804624310006e-05,
"loss": 0.0377,
"step": 315
},
{
"epoch": 0.6091566265060241,
"grad_norm": 0.6714366674423218,
"learning_rate": 3.874219831255524e-05,
"loss": 0.046,
"step": 316
},
{
"epoch": 0.6110843373493976,
"grad_norm": 0.336037278175354,
"learning_rate": 3.8726497758946394e-05,
"loss": 0.0149,
"step": 317
},
{
"epoch": 0.6130120481927711,
"grad_norm": 0.3057402968406677,
"learning_rate": 3.871070304243094e-05,
"loss": 0.014,
"step": 318
},
{
"epoch": 0.6149397590361446,
"grad_norm": 0.4537644684314728,
"learning_rate": 3.8694814242429834e-05,
"loss": 0.0503,
"step": 319
},
{
"epoch": 0.6168674698795181,
"grad_norm": 0.45573824644088745,
"learning_rate": 3.8678831438837116e-05,
"loss": 0.021,
"step": 320
},
{
"epoch": 0.6187951807228915,
"grad_norm": 0.30729591846466064,
"learning_rate": 3.866275471201952e-05,
"loss": 0.0163,
"step": 321
},
{
"epoch": 0.620722891566265,
"grad_norm": 0.7614850401878357,
"learning_rate": 3.8646584142816036e-05,
"loss": 0.0347,
"step": 322
},
{
"epoch": 0.6226506024096385,
"grad_norm": 0.5323611497879028,
"learning_rate": 3.863031981253754e-05,
"loss": 0.0201,
"step": 323
},
{
"epoch": 0.624578313253012,
"grad_norm": 0.34426453709602356,
"learning_rate": 3.861396180296635e-05,
"loss": 0.0243,
"step": 324
},
{
"epoch": 0.6265060240963856,
"grad_norm": 0.621636152267456,
"learning_rate": 3.859751019635585e-05,
"loss": 0.0166,
"step": 325
},
{
"epoch": 0.6284337349397591,
"grad_norm": 0.549324095249176,
"learning_rate": 3.858096507543006e-05,
"loss": 0.0274,
"step": 326
},
{
"epoch": 0.6303614457831326,
"grad_norm": 0.358426570892334,
"learning_rate": 3.8564326523383214e-05,
"loss": 0.0207,
"step": 327
},
{
"epoch": 0.632289156626506,
"grad_norm": 0.3639723062515259,
"learning_rate": 3.8547594623879346e-05,
"loss": 0.0297,
"step": 328
},
{
"epoch": 0.6342168674698795,
"grad_norm": 0.3402212858200073,
"learning_rate": 3.853076946105188e-05,
"loss": 0.0258,
"step": 329
},
{
"epoch": 0.636144578313253,
"grad_norm": 0.4083027243614197,
"learning_rate": 3.85138511195032e-05,
"loss": 0.0351,
"step": 330
},
{
"epoch": 0.6380722891566265,
"grad_norm": 0.43532121181488037,
"learning_rate": 3.84968396843042e-05,
"loss": 0.0388,
"step": 331
},
{
"epoch": 0.64,
"grad_norm": 0.35353463888168335,
"learning_rate": 3.8479735240993904e-05,
"loss": 0.0203,
"step": 332
},
{
"epoch": 0.6419277108433735,
"grad_norm": 0.350149929523468,
"learning_rate": 3.846253787557901e-05,
"loss": 0.0261,
"step": 333
},
{
"epoch": 0.643855421686747,
"grad_norm": 0.7665389180183411,
"learning_rate": 3.844524767453344e-05,
"loss": 0.0108,
"step": 334
},
{
"epoch": 0.6457831325301204,
"grad_norm": 0.44621360301971436,
"learning_rate": 3.842786472479795e-05,
"loss": 0.0282,
"step": 335
},
{
"epoch": 0.6477108433734939,
"grad_norm": 0.7787201404571533,
"learning_rate": 3.841038911377962e-05,
"loss": 0.0216,
"step": 336
},
{
"epoch": 0.6496385542168674,
"grad_norm": 0.48260653018951416,
"learning_rate": 3.839282092935153e-05,
"loss": 0.0234,
"step": 337
},
{
"epoch": 0.651566265060241,
"grad_norm": 0.4987852871417999,
"learning_rate": 3.837516025985219e-05,
"loss": 0.0515,
"step": 338
},
{
"epoch": 0.6534939759036145,
"grad_norm": 0.9030266404151917,
"learning_rate": 3.835740719408517e-05,
"loss": 0.0508,
"step": 339
},
{
"epoch": 0.655421686746988,
"grad_norm": 0.6381701231002808,
"learning_rate": 3.833956182131867e-05,
"loss": 0.0405,
"step": 340
},
{
"epoch": 0.6573493975903615,
"grad_norm": 0.42828986048698425,
"learning_rate": 3.832162423128499e-05,
"loss": 0.024,
"step": 341
},
{
"epoch": 0.659277108433735,
"grad_norm": 0.38725873827934265,
"learning_rate": 3.8303594514180164e-05,
"loss": 0.0199,
"step": 342
},
{
"epoch": 0.6612048192771084,
"grad_norm": 0.23280498385429382,
"learning_rate": 3.828547276066346e-05,
"loss": 0.0101,
"step": 343
},
{
"epoch": 0.6631325301204819,
"grad_norm": 0.7298216819763184,
"learning_rate": 3.8267259061856925e-05,
"loss": 0.0455,
"step": 344
},
{
"epoch": 0.6650602409638554,
"grad_norm": 0.5975687503814697,
"learning_rate": 3.824895350934496e-05,
"loss": 0.0372,
"step": 345
},
{
"epoch": 0.6669879518072289,
"grad_norm": 0.6295403242111206,
"learning_rate": 3.823055619517381e-05,
"loss": 0.0362,
"step": 346
},
{
"epoch": 0.6689156626506024,
"grad_norm": 0.5086020827293396,
"learning_rate": 3.821206721185115e-05,
"loss": 0.0368,
"step": 347
},
{
"epoch": 0.6708433734939759,
"grad_norm": 0.34506168961524963,
"learning_rate": 3.819348665234557e-05,
"loss": 0.0178,
"step": 348
},
{
"epoch": 0.6727710843373494,
"grad_norm": 1.309940218925476,
"learning_rate": 3.817481461008617e-05,
"loss": 0.024,
"step": 349
},
{
"epoch": 0.6746987951807228,
"grad_norm": 0.4074770510196686,
"learning_rate": 3.815605117896204e-05,
"loss": 0.0262,
"step": 350
},
{
"epoch": 0.6766265060240964,
"grad_norm": 0.48525840044021606,
"learning_rate": 3.8137196453321775e-05,
"loss": 0.0209,
"step": 351
},
{
"epoch": 0.6785542168674699,
"grad_norm": 0.7199739217758179,
"learning_rate": 3.811825052797308e-05,
"loss": 0.0396,
"step": 352
},
{
"epoch": 0.6804819277108434,
"grad_norm": 0.519540011882782,
"learning_rate": 3.8099213498182196e-05,
"loss": 0.0453,
"step": 353
},
{
"epoch": 0.6824096385542169,
"grad_norm": 0.9738391041755676,
"learning_rate": 3.808008545967349e-05,
"loss": 0.0317,
"step": 354
},
{
"epoch": 0.6843373493975904,
"grad_norm": 1.888344407081604,
"learning_rate": 3.8060866508628953e-05,
"loss": 0.0452,
"step": 355
},
{
"epoch": 0.6862650602409639,
"grad_norm": 0.48989811539649963,
"learning_rate": 3.8041556741687695e-05,
"loss": 0.0315,
"step": 356
},
{
"epoch": 0.6881927710843373,
"grad_norm": 0.3764645457267761,
"learning_rate": 3.8022156255945496e-05,
"loss": 0.0269,
"step": 357
},
{
"epoch": 0.6901204819277108,
"grad_norm": 0.46409738063812256,
"learning_rate": 3.800266514895429e-05,
"loss": 0.0171,
"step": 358
},
{
"epoch": 0.6920481927710843,
"grad_norm": 0.41091030836105347,
"learning_rate": 3.7983083518721695e-05,
"loss": 0.0167,
"step": 359
},
{
"epoch": 0.6939759036144578,
"grad_norm": 0.8375523090362549,
"learning_rate": 3.79634114637105e-05,
"loss": 0.0342,
"step": 360
},
{
"epoch": 0.6959036144578313,
"grad_norm": 1.7053394317626953,
"learning_rate": 3.794364908283817e-05,
"loss": 0.02,
"step": 361
},
{
"epoch": 0.6978313253012048,
"grad_norm": 0.4163115918636322,
"learning_rate": 3.792379647547637e-05,
"loss": 0.0138,
"step": 362
},
{
"epoch": 0.6997590361445784,
"grad_norm": 0.388751745223999,
"learning_rate": 3.790385374145046e-05,
"loss": 0.0172,
"step": 363
},
{
"epoch": 0.7016867469879519,
"grad_norm": 0.5584064722061157,
"learning_rate": 3.7883820981038966e-05,
"loss": 0.0254,
"step": 364
},
{
"epoch": 0.7036144578313253,
"grad_norm": 1.394264817237854,
"learning_rate": 3.7863698294973114e-05,
"loss": 0.037,
"step": 365
},
{
"epoch": 0.7055421686746988,
"grad_norm": 0.46280744671821594,
"learning_rate": 3.78434857844363e-05,
"loss": 0.0234,
"step": 366
},
{
"epoch": 0.7074698795180723,
"grad_norm": 0.39548924565315247,
"learning_rate": 3.782318355106358e-05,
"loss": 0.0164,
"step": 367
},
{
"epoch": 0.7093975903614458,
"grad_norm": 0.7307773232460022,
"learning_rate": 3.780279169694118e-05,
"loss": 0.0192,
"step": 368
},
{
"epoch": 0.7113253012048193,
"grad_norm": 0.28035807609558105,
"learning_rate": 3.778231032460594e-05,
"loss": 0.0131,
"step": 369
},
{
"epoch": 0.7132530120481928,
"grad_norm": 0.8376953601837158,
"learning_rate": 3.776173953704486e-05,
"loss": 0.0291,
"step": 370
},
{
"epoch": 0.7151807228915663,
"grad_norm": 0.7356843948364258,
"learning_rate": 3.774107943769454e-05,
"loss": 0.0214,
"step": 371
},
{
"epoch": 0.7171084337349397,
"grad_norm": 0.41503390669822693,
"learning_rate": 3.772033013044064e-05,
"loss": 0.0221,
"step": 372
},
{
"epoch": 0.7190361445783132,
"grad_norm": 0.35732385516166687,
"learning_rate": 3.7699491719617436e-05,
"loss": 0.015,
"step": 373
},
{
"epoch": 0.7209638554216867,
"grad_norm": 0.283778578042984,
"learning_rate": 3.76785643100072e-05,
"loss": 0.0146,
"step": 374
},
{
"epoch": 0.7228915662650602,
"grad_norm": 0.3219413459300995,
"learning_rate": 3.765754800683974e-05,
"loss": 0.015,
"step": 375
},
{
"epoch": 0.7248192771084337,
"grad_norm": 0.610431432723999,
"learning_rate": 3.7636442915791856e-05,
"loss": 0.0326,
"step": 376
},
{
"epoch": 0.7267469879518073,
"grad_norm": 4.944870948791504,
"learning_rate": 3.7615249142986784e-05,
"loss": 0.0432,
"step": 377
},
{
"epoch": 0.7286746987951808,
"grad_norm": 0.4894593060016632,
"learning_rate": 3.7593966794993696e-05,
"loss": 0.0174,
"step": 378
},
{
"epoch": 0.7306024096385542,
"grad_norm": 0.4211325943470001,
"learning_rate": 3.757259597882714e-05,
"loss": 0.023,
"step": 379
},
{
"epoch": 0.7325301204819277,
"grad_norm": 0.33621737360954285,
"learning_rate": 3.755113680194651e-05,
"loss": 0.0201,
"step": 380
},
{
"epoch": 0.7344578313253012,
"grad_norm": 0.5799694657325745,
"learning_rate": 3.7529589372255514e-05,
"loss": 0.0173,
"step": 381
},
{
"epoch": 0.7363855421686747,
"grad_norm": 0.5172572731971741,
"learning_rate": 3.750795379810162e-05,
"loss": 0.0284,
"step": 382
},
{
"epoch": 0.7383132530120482,
"grad_norm": 0.5715453028678894,
"learning_rate": 3.748623018827552e-05,
"loss": 0.0194,
"step": 383
},
{
"epoch": 0.7402409638554217,
"grad_norm": 0.5284178256988525,
"learning_rate": 3.746441865201056e-05,
"loss": 0.0247,
"step": 384
},
{
"epoch": 0.7421686746987952,
"grad_norm": 0.37828654050827026,
"learning_rate": 3.744251929898223e-05,
"loss": 0.0097,
"step": 385
},
{
"epoch": 0.7440963855421687,
"grad_norm": 0.3252779543399811,
"learning_rate": 3.742053223930758e-05,
"loss": 0.0238,
"step": 386
},
{
"epoch": 0.7460240963855421,
"grad_norm": 0.6031543612480164,
"learning_rate": 3.7398457583544674e-05,
"loss": 0.0332,
"step": 387
},
{
"epoch": 0.7479518072289156,
"grad_norm": 0.23846614360809326,
"learning_rate": 3.737629544269206e-05,
"loss": 0.0122,
"step": 388
},
{
"epoch": 0.7498795180722891,
"grad_norm": 0.5274029970169067,
"learning_rate": 3.7354045928188155e-05,
"loss": 0.0324,
"step": 389
},
{
"epoch": 0.7518072289156627,
"grad_norm": 0.4672217071056366,
"learning_rate": 3.733170915191075e-05,
"loss": 0.0196,
"step": 390
},
{
"epoch": 0.7537349397590362,
"grad_norm": 0.29819396138191223,
"learning_rate": 3.730928522617639e-05,
"loss": 0.0131,
"step": 391
},
{
"epoch": 0.7556626506024097,
"grad_norm": 0.43824997544288635,
"learning_rate": 3.7286774263739855e-05,
"loss": 0.0238,
"step": 392
},
{
"epoch": 0.7575903614457832,
"grad_norm": 0.2822072505950928,
"learning_rate": 3.726417637779357e-05,
"loss": 0.0314,
"step": 393
},
{
"epoch": 0.7595180722891566,
"grad_norm": 0.43815648555755615,
"learning_rate": 3.7241491681967044e-05,
"loss": 0.0144,
"step": 394
},
{
"epoch": 0.7614457831325301,
"grad_norm": 0.37194815278053284,
"learning_rate": 3.721872029032628e-05,
"loss": 0.0286,
"step": 395
},
{
"epoch": 0.7633734939759036,
"grad_norm": 0.7319737672805786,
"learning_rate": 3.719586231737322e-05,
"loss": 0.0427,
"step": 396
},
{
"epoch": 0.7653012048192771,
"grad_norm": 0.5870066285133362,
"learning_rate": 3.717291787804517e-05,
"loss": 0.0138,
"step": 397
},
{
"epoch": 0.7672289156626506,
"grad_norm": 0.6574277281761169,
"learning_rate": 3.7149887087714225e-05,
"loss": 0.061,
"step": 398
},
{
"epoch": 0.7691566265060241,
"grad_norm": 0.5467348694801331,
"learning_rate": 3.712677006218666e-05,
"loss": 0.022,
"step": 399
},
{
"epoch": 0.7710843373493976,
"grad_norm": 0.3589288890361786,
"learning_rate": 3.710356691770238e-05,
"loss": 0.0161,
"step": 400
},
{
"epoch": 0.7730120481927711,
"grad_norm": 0.574630618095398,
"learning_rate": 3.708027777093433e-05,
"loss": 0.0285,
"step": 401
},
{
"epoch": 0.7749397590361445,
"grad_norm": 0.39048445224761963,
"learning_rate": 3.70569027389879e-05,
"loss": 0.012,
"step": 402
},
{
"epoch": 0.776867469879518,
"grad_norm": 0.34803536534309387,
"learning_rate": 3.703344193940032e-05,
"loss": 0.0155,
"step": 403
},
{
"epoch": 0.7787951807228916,
"grad_norm": 1.188948392868042,
"learning_rate": 3.700989549014011e-05,
"loss": 0.0617,
"step": 404
},
{
"epoch": 0.7807228915662651,
"grad_norm": 0.473157674074173,
"learning_rate": 3.698626350960646e-05,
"loss": 0.0298,
"step": 405
},
{
"epoch": 0.7826506024096386,
"grad_norm": 0.42009076476097107,
"learning_rate": 3.6962546116628634e-05,
"loss": 0.03,
"step": 406
},
{
"epoch": 0.7845783132530121,
"grad_norm": 0.6334308981895447,
"learning_rate": 3.693874343046537e-05,
"loss": 0.0107,
"step": 407
},
{
"epoch": 0.7865060240963856,
"grad_norm": 0.35594677925109863,
"learning_rate": 3.6914855570804314e-05,
"loss": 0.0174,
"step": 408
},
{
"epoch": 0.788433734939759,
"grad_norm": 0.28985708951950073,
"learning_rate": 3.689088265776136e-05,
"loss": 0.0149,
"step": 409
},
{
"epoch": 0.7903614457831325,
"grad_norm": 0.3981950581073761,
"learning_rate": 3.686682481188011e-05,
"loss": 0.019,
"step": 410
},
{
"epoch": 0.792289156626506,
"grad_norm": 0.48819583654403687,
"learning_rate": 3.6842682154131193e-05,
"loss": 0.0217,
"step": 411
},
{
"epoch": 0.7942168674698795,
"grad_norm": 0.42819952964782715,
"learning_rate": 3.681845480591174e-05,
"loss": 0.0198,
"step": 412
},
{
"epoch": 0.796144578313253,
"grad_norm": 0.48591694235801697,
"learning_rate": 3.6794142889044727e-05,
"loss": 0.0253,
"step": 413
},
{
"epoch": 0.7980722891566265,
"grad_norm": 0.4730607271194458,
"learning_rate": 3.676974652577835e-05,
"loss": 0.0329,
"step": 414
},
{
"epoch": 0.8,
"grad_norm": 0.5390865802764893,
"learning_rate": 3.6745265838785434e-05,
"loss": 0.0479,
"step": 415
},
{
"epoch": 0.8019277108433734,
"grad_norm": 0.6377891302108765,
"learning_rate": 3.672070095116283e-05,
"loss": 0.019,
"step": 416
},
{
"epoch": 0.803855421686747,
"grad_norm": 0.8984615206718445,
"learning_rate": 3.669605198643075e-05,
"loss": 0.0444,
"step": 417
},
{
"epoch": 0.8057831325301205,
"grad_norm": 0.4913877546787262,
"learning_rate": 3.667131906853219e-05,
"loss": 0.031,
"step": 418
},
{
"epoch": 0.807710843373494,
"grad_norm": 0.37894028425216675,
"learning_rate": 3.664650232183229e-05,
"loss": 0.0195,
"step": 419
},
{
"epoch": 0.8096385542168675,
"grad_norm": 0.3644949495792389,
"learning_rate": 3.66216018711177e-05,
"loss": 0.018,
"step": 420
},
{
"epoch": 0.811566265060241,
"grad_norm": 0.414440393447876,
"learning_rate": 3.659661784159597e-05,
"loss": 0.0188,
"step": 421
},
{
"epoch": 0.8134939759036145,
"grad_norm": 0.49220341444015503,
"learning_rate": 3.65715503588949e-05,
"loss": 0.016,
"step": 422
},
{
"epoch": 0.815421686746988,
"grad_norm": 1.0939836502075195,
"learning_rate": 3.654639954906193e-05,
"loss": 0.0758,
"step": 423
},
{
"epoch": 0.8173493975903614,
"grad_norm": 0.43222442269325256,
"learning_rate": 3.652116553856349e-05,
"loss": 0.0308,
"step": 424
},
{
"epoch": 0.8192771084337349,
"grad_norm": 0.5081896185874939,
"learning_rate": 3.649584845428438e-05,
"loss": 0.0493,
"step": 425
},
{
"epoch": 0.8212048192771084,
"grad_norm": 0.9811948537826538,
"learning_rate": 3.64704484235271e-05,
"loss": 0.019,
"step": 426
},
{
"epoch": 0.8231325301204819,
"grad_norm": 0.31656572222709656,
"learning_rate": 3.6444965574011255e-05,
"loss": 0.0135,
"step": 427
},
{
"epoch": 0.8250602409638554,
"grad_norm": 0.7844433188438416,
"learning_rate": 3.641940003387289e-05,
"loss": 0.0402,
"step": 428
},
{
"epoch": 0.826987951807229,
"grad_norm": 0.3353273570537567,
"learning_rate": 3.6393751931663814e-05,
"loss": 0.0132,
"step": 429
},
{
"epoch": 0.8289156626506025,
"grad_norm": 0.7253058552742004,
"learning_rate": 3.6368021396351015e-05,
"loss": 0.0296,
"step": 430
},
{
"epoch": 0.8308433734939759,
"grad_norm": 0.45300304889678955,
"learning_rate": 3.634220855731598e-05,
"loss": 0.0258,
"step": 431
},
{
"epoch": 0.8327710843373494,
"grad_norm": 0.3480473458766937,
"learning_rate": 3.631631354435403e-05,
"loss": 0.0099,
"step": 432
},
{
"epoch": 0.8346987951807229,
"grad_norm": 2.1114516258239746,
"learning_rate": 3.62903364876737e-05,
"loss": 0.0457,
"step": 433
},
{
"epoch": 0.8366265060240964,
"grad_norm": 0.5649561882019043,
"learning_rate": 3.626427751789606e-05,
"loss": 0.0444,
"step": 434
},
{
"epoch": 0.8385542168674699,
"grad_norm": 0.3864995539188385,
"learning_rate": 3.623813676605405e-05,
"loss": 0.0223,
"step": 435
},
{
"epoch": 0.8404819277108434,
"grad_norm": 1.2134298086166382,
"learning_rate": 3.621191436359186e-05,
"loss": 0.0353,
"step": 436
},
{
"epoch": 0.8424096385542169,
"grad_norm": 0.4403415024280548,
"learning_rate": 3.6185610442364246e-05,
"loss": 0.0216,
"step": 437
},
{
"epoch": 0.8443373493975903,
"grad_norm": 0.6050297021865845,
"learning_rate": 3.6159225134635846e-05,
"loss": 0.0433,
"step": 438
},
{
"epoch": 0.8462650602409638,
"grad_norm": 0.7951678037643433,
"learning_rate": 3.6132758573080556e-05,
"loss": 0.031,
"step": 439
},
{
"epoch": 0.8481927710843373,
"grad_norm": 0.4991949796676636,
"learning_rate": 3.6106210890780834e-05,
"loss": 0.0313,
"step": 440
},
{
"epoch": 0.8501204819277108,
"grad_norm": 0.47951385378837585,
"learning_rate": 3.607958222122704e-05,
"loss": 0.0218,
"step": 441
},
{
"epoch": 0.8520481927710843,
"grad_norm": 0.7345194220542908,
"learning_rate": 3.6052872698316755e-05,
"loss": 0.0239,
"step": 442
},
{
"epoch": 0.8539759036144579,
"grad_norm": 1.4814884662628174,
"learning_rate": 3.602608245635414e-05,
"loss": 0.0127,
"step": 443
},
{
"epoch": 0.8559036144578314,
"grad_norm": 2.4240877628326416,
"learning_rate": 3.599921163004922e-05,
"loss": 0.0618,
"step": 444
},
{
"epoch": 0.8578313253012049,
"grad_norm": 0.41523510217666626,
"learning_rate": 3.5972260354517216e-05,
"loss": 0.0283,
"step": 445
},
{
"epoch": 0.8597590361445783,
"grad_norm": 0.5577677488327026,
"learning_rate": 3.594522876527791e-05,
"loss": 0.0271,
"step": 446
},
{
"epoch": 0.8616867469879518,
"grad_norm": 0.5829064846038818,
"learning_rate": 3.591811699825487e-05,
"loss": 0.0169,
"step": 447
},
{
"epoch": 0.8636144578313253,
"grad_norm": 0.4478822350502014,
"learning_rate": 3.5890925189774886e-05,
"loss": 0.0239,
"step": 448
},
{
"epoch": 0.8655421686746988,
"grad_norm": 0.3498048782348633,
"learning_rate": 3.586365347656718e-05,
"loss": 0.0137,
"step": 449
},
{
"epoch": 0.8674698795180723,
"grad_norm": 0.6571130156517029,
"learning_rate": 3.583630199576278e-05,
"loss": 0.027,
"step": 450
},
{
"epoch": 0.8693975903614458,
"grad_norm": 0.344970166683197,
"learning_rate": 3.58088708848938e-05,
"loss": 0.0167,
"step": 451
},
{
"epoch": 0.8713253012048193,
"grad_norm": 0.34611570835113525,
"learning_rate": 3.5781360281892775e-05,
"loss": 0.0468,
"step": 452
},
{
"epoch": 0.8732530120481927,
"grad_norm": 0.66157066822052,
"learning_rate": 3.575377032509194e-05,
"loss": 0.0344,
"step": 453
},
{
"epoch": 0.8751807228915662,
"grad_norm": 0.3676326870918274,
"learning_rate": 3.5726101153222534e-05,
"loss": 0.0366,
"step": 454
},
{
"epoch": 0.8771084337349397,
"grad_norm": 0.5958423018455505,
"learning_rate": 3.569835290541414e-05,
"loss": 0.0382,
"step": 455
},
{
"epoch": 0.8790361445783132,
"grad_norm": 0.36787471175193787,
"learning_rate": 3.567052572119397e-05,
"loss": 0.018,
"step": 456
},
{
"epoch": 0.8809638554216868,
"grad_norm": 0.9478234052658081,
"learning_rate": 3.564261974048611e-05,
"loss": 0.0179,
"step": 457
},
{
"epoch": 0.8828915662650603,
"grad_norm": 0.3337579369544983,
"learning_rate": 3.56146351036109e-05,
"loss": 0.0147,
"step": 458
},
{
"epoch": 0.8848192771084338,
"grad_norm": 0.4984932243824005,
"learning_rate": 3.558657195128416e-05,
"loss": 0.0224,
"step": 459
},
{
"epoch": 0.8867469879518072,
"grad_norm": 0.36718735098838806,
"learning_rate": 3.555843042461653e-05,
"loss": 0.0202,
"step": 460
},
{
"epoch": 0.8886746987951807,
"grad_norm": 0.4081745445728302,
"learning_rate": 3.553021066511274e-05,
"loss": 0.0288,
"step": 461
},
{
"epoch": 0.8906024096385542,
"grad_norm": 0.3233242332935333,
"learning_rate": 3.55019128146709e-05,
"loss": 0.0362,
"step": 462
},
{
"epoch": 0.8925301204819277,
"grad_norm": 0.6560158729553223,
"learning_rate": 3.547353701558178e-05,
"loss": 0.038,
"step": 463
},
{
"epoch": 0.8944578313253012,
"grad_norm": 0.47668641805648804,
"learning_rate": 3.544508341052811e-05,
"loss": 0.0399,
"step": 464
},
{
"epoch": 0.8963855421686747,
"grad_norm": 0.45512664318084717,
"learning_rate": 3.541655214258383e-05,
"loss": 0.022,
"step": 465
},
{
"epoch": 0.8983132530120482,
"grad_norm": 0.8410730361938477,
"learning_rate": 3.538794335521343e-05,
"loss": 0.0315,
"step": 466
},
{
"epoch": 0.9002409638554217,
"grad_norm": 0.4872909486293793,
"learning_rate": 3.535925719227117e-05,
"loss": 0.0152,
"step": 467
},
{
"epoch": 0.9021686746987951,
"grad_norm": 0.45623311400413513,
"learning_rate": 3.533049379800038e-05,
"loss": 0.0305,
"step": 468
},
{
"epoch": 0.9040963855421686,
"grad_norm": 0.43087029457092285,
"learning_rate": 3.530165331703275e-05,
"loss": 0.0131,
"step": 469
},
{
"epoch": 0.9060240963855422,
"grad_norm": 0.4610525369644165,
"learning_rate": 3.527273589438756e-05,
"loss": 0.0187,
"step": 470
},
{
"epoch": 0.9079518072289157,
"grad_norm": 0.3356114327907562,
"learning_rate": 3.5243741675471006e-05,
"loss": 0.0185,
"step": 471
},
{
"epoch": 0.9098795180722892,
"grad_norm": 0.9065960049629211,
"learning_rate": 3.5214670806075426e-05,
"loss": 0.0433,
"step": 472
},
{
"epoch": 0.9118072289156627,
"grad_norm": 0.3652578294277191,
"learning_rate": 3.518552343237858e-05,
"loss": 0.02,
"step": 473
},
{
"epoch": 0.9137349397590362,
"grad_norm": 0.32377883791923523,
"learning_rate": 3.5156299700942916e-05,
"loss": 0.0165,
"step": 474
},
{
"epoch": 0.9156626506024096,
"grad_norm": 0.2431817352771759,
"learning_rate": 3.512699975871485e-05,
"loss": 0.0172,
"step": 475
},
{
"epoch": 0.9175903614457831,
"grad_norm": 0.6390707492828369,
"learning_rate": 3.509762375302399e-05,
"loss": 0.0356,
"step": 476
},
{
"epoch": 0.9195180722891566,
"grad_norm": 0.2283092886209488,
"learning_rate": 3.506817183158243e-05,
"loss": 0.0088,
"step": 477
},
{
"epoch": 0.9214457831325301,
"grad_norm": 0.5053914189338684,
"learning_rate": 3.5038644142483966e-05,
"loss": 0.0389,
"step": 478
},
{
"epoch": 0.9233734939759036,
"grad_norm": 0.2567576467990875,
"learning_rate": 3.500904083420342e-05,
"loss": 0.0155,
"step": 479
},
{
"epoch": 0.9253012048192771,
"grad_norm": 0.6852384209632874,
"learning_rate": 3.497936205559583e-05,
"loss": 0.0247,
"step": 480
},
{
"epoch": 0.9272289156626506,
"grad_norm": 0.36403414607048035,
"learning_rate": 3.494960795589572e-05,
"loss": 0.023,
"step": 481
},
{
"epoch": 0.929156626506024,
"grad_norm": 0.506554901599884,
"learning_rate": 3.491977868471635e-05,
"loss": 0.0273,
"step": 482
},
{
"epoch": 0.9310843373493976,
"grad_norm": 0.38329923152923584,
"learning_rate": 3.4889874392048985e-05,
"loss": 0.0169,
"step": 483
},
{
"epoch": 0.9330120481927711,
"grad_norm": 0.2805836498737335,
"learning_rate": 3.48598952282621e-05,
"loss": 0.0105,
"step": 484
},
{
"epoch": 0.9349397590361446,
"grad_norm": 0.6315302848815918,
"learning_rate": 3.482984134410067e-05,
"loss": 0.0289,
"step": 485
},
{
"epoch": 0.9368674698795181,
"grad_norm": 0.6431388854980469,
"learning_rate": 3.479971289068537e-05,
"loss": 0.0311,
"step": 486
},
{
"epoch": 0.9387951807228916,
"grad_norm": 0.9794723391532898,
"learning_rate": 3.476951001951184e-05,
"loss": 0.0452,
"step": 487
},
{
"epoch": 0.9407228915662651,
"grad_norm": 0.7984824180603027,
"learning_rate": 3.473923288244991e-05,
"loss": 0.0689,
"step": 488
},
{
"epoch": 0.9426506024096386,
"grad_norm": 0.46362006664276123,
"learning_rate": 3.470888163174286e-05,
"loss": 0.0241,
"step": 489
},
{
"epoch": 0.944578313253012,
"grad_norm": 0.5051195025444031,
"learning_rate": 3.467845642000661e-05,
"loss": 0.0228,
"step": 490
},
{
"epoch": 0.9465060240963855,
"grad_norm": 0.3082812428474426,
"learning_rate": 3.4647957400229004e-05,
"loss": 0.0144,
"step": 491
},
{
"epoch": 0.948433734939759,
"grad_norm": 0.2691391110420227,
"learning_rate": 3.461738472576902e-05,
"loss": 0.0167,
"step": 492
},
{
"epoch": 0.9503614457831325,
"grad_norm": 0.5627671480178833,
"learning_rate": 3.458673855035597e-05,
"loss": 0.031,
"step": 493
},
{
"epoch": 0.952289156626506,
"grad_norm": 0.4571435749530792,
"learning_rate": 3.455601902808876e-05,
"loss": 0.0191,
"step": 494
},
{
"epoch": 0.9542168674698795,
"grad_norm": 1.0117709636688232,
"learning_rate": 3.452522631343515e-05,
"loss": 0.0192,
"step": 495
},
{
"epoch": 0.9561445783132531,
"grad_norm": 0.28375712037086487,
"learning_rate": 3.449436056123086e-05,
"loss": 0.0159,
"step": 496
},
{
"epoch": 0.9580722891566265,
"grad_norm": 0.26381856203079224,
"learning_rate": 3.446342192667893e-05,
"loss": 0.0113,
"step": 497
},
{
"epoch": 0.96,
"grad_norm": 0.49317577481269836,
"learning_rate": 3.443241056534884e-05,
"loss": 0.0332,
"step": 498
},
{
"epoch": 0.9619277108433735,
"grad_norm": 0.28884485363960266,
"learning_rate": 3.440132663317579e-05,
"loss": 0.0117,
"step": 499
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.36255285143852234,
"learning_rate": 3.4370170286459864e-05,
"loss": 0.0169,
"step": 500
},
{
"epoch": 0.9657831325301205,
"grad_norm": 0.4265049993991852,
"learning_rate": 3.433894168186529e-05,
"loss": 0.0217,
"step": 501
},
{
"epoch": 0.967710843373494,
"grad_norm": 0.8169426321983337,
"learning_rate": 3.430764097641962e-05,
"loss": 0.0207,
"step": 502
},
{
"epoch": 0.9696385542168675,
"grad_norm": 1.866077184677124,
"learning_rate": 3.427626832751296e-05,
"loss": 0.0381,
"step": 503
},
{
"epoch": 0.971566265060241,
"grad_norm": 0.33124980330467224,
"learning_rate": 3.424482389289716e-05,
"loss": 0.0245,
"step": 504
},
{
"epoch": 0.9734939759036144,
"grad_norm": 0.37479540705680847,
"learning_rate": 3.4213307830685055e-05,
"loss": 0.0164,
"step": 505
},
{
"epoch": 0.9754216867469879,
"grad_norm": 0.39738863706588745,
"learning_rate": 3.4181720299349615e-05,
"loss": 0.0297,
"step": 506
},
{
"epoch": 0.9773493975903614,
"grad_norm": 0.2567287087440491,
"learning_rate": 3.4150061457723205e-05,
"loss": 0.0102,
"step": 507
},
{
"epoch": 0.9792771084337349,
"grad_norm": 0.6230517029762268,
"learning_rate": 3.411833146499675e-05,
"loss": 0.0243,
"step": 508
},
{
"epoch": 0.9812048192771085,
"grad_norm": 0.44843971729278564,
"learning_rate": 3.408653048071894e-05,
"loss": 0.0357,
"step": 509
},
{
"epoch": 0.983132530120482,
"grad_norm": 1.0569655895233154,
"learning_rate": 3.405465866479546e-05,
"loss": 0.037,
"step": 510
},
{
"epoch": 0.9850602409638555,
"grad_norm": 0.29000964760780334,
"learning_rate": 3.402271617748812e-05,
"loss": 0.0129,
"step": 511
},
{
"epoch": 0.9869879518072289,
"grad_norm": 2.1627447605133057,
"learning_rate": 3.399070317941413e-05,
"loss": 0.0442,
"step": 512
},
{
"epoch": 0.9889156626506024,
"grad_norm": 0.27371272444725037,
"learning_rate": 3.395861983154522e-05,
"loss": 0.0119,
"step": 513
},
{
"epoch": 0.9908433734939759,
"grad_norm": 0.4117226302623749,
"learning_rate": 3.392646629520688e-05,
"loss": 0.0455,
"step": 514
},
{
"epoch": 0.9927710843373494,
"grad_norm": 0.5098996758460999,
"learning_rate": 3.389424273207752e-05,
"loss": 0.0203,
"step": 515
},
{
"epoch": 0.9946987951807229,
"grad_norm": 0.5192157626152039,
"learning_rate": 3.386194930418767e-05,
"loss": 0.0329,
"step": 516
},
{
"epoch": 0.9966265060240964,
"grad_norm": 0.18757697939872742,
"learning_rate": 3.382958617391915e-05,
"loss": 0.0065,
"step": 517
},
{
"epoch": 0.9985542168674699,
"grad_norm": 0.3334413170814514,
"learning_rate": 3.3797153504004296e-05,
"loss": 0.0266,
"step": 518
},
{
"epoch": 1.0,
"grad_norm": 0.4152225852012634,
"learning_rate": 3.3764651457525095e-05,
"loss": 0.0169,
"step": 519
},
{
"epoch": 1.0019277108433735,
"grad_norm": 0.43535247445106506,
"learning_rate": 3.373208019791237e-05,
"loss": 0.0221,
"step": 520
},
{
"epoch": 1.003855421686747,
"grad_norm": 0.39292722940444946,
"learning_rate": 3.3699439888945e-05,
"loss": 0.0211,
"step": 521
},
{
"epoch": 1.0057831325301205,
"grad_norm": 0.19566713273525238,
"learning_rate": 3.366673069474904e-05,
"loss": 0.0069,
"step": 522
},
{
"epoch": 1.007710843373494,
"grad_norm": 0.5101853609085083,
"learning_rate": 3.3633952779796914e-05,
"loss": 0.0191,
"step": 523
},
{
"epoch": 1.0096385542168675,
"grad_norm": 0.999434769153595,
"learning_rate": 3.360110630890664e-05,
"loss": 0.0196,
"step": 524
},
{
"epoch": 1.011566265060241,
"grad_norm": 0.4646223783493042,
"learning_rate": 3.356819144724092e-05,
"loss": 0.0328,
"step": 525
},
{
"epoch": 1.0134939759036146,
"grad_norm": 0.3132480978965759,
"learning_rate": 3.3535208360306354e-05,
"loss": 0.0203,
"step": 526
},
{
"epoch": 1.0154216867469879,
"grad_norm": 0.3038032352924347,
"learning_rate": 3.350215721395261e-05,
"loss": 0.0122,
"step": 527
},
{
"epoch": 1.0173493975903614,
"grad_norm": 0.45082882046699524,
"learning_rate": 3.346903817437157e-05,
"loss": 0.0437,
"step": 528
},
{
"epoch": 1.0192771084337349,
"grad_norm": 0.26917046308517456,
"learning_rate": 3.343585140809651e-05,
"loss": 0.013,
"step": 529
},
{
"epoch": 1.0212048192771084,
"grad_norm": 0.23869264125823975,
"learning_rate": 3.3402597082001276e-05,
"loss": 0.008,
"step": 530
},
{
"epoch": 1.0231325301204819,
"grad_norm": 0.31315353512763977,
"learning_rate": 3.3369275363299394e-05,
"loss": 0.0078,
"step": 531
},
{
"epoch": 1.0250602409638554,
"grad_norm": 0.4780346751213074,
"learning_rate": 3.333588641954327e-05,
"loss": 0.0225,
"step": 532
},
{
"epoch": 1.026987951807229,
"grad_norm": 0.2920368015766144,
"learning_rate": 3.330243041862336e-05,
"loss": 0.0118,
"step": 533
},
{
"epoch": 1.0289156626506024,
"grad_norm": 0.543669581413269,
"learning_rate": 3.326890752876728e-05,
"loss": 0.0338,
"step": 534
},
{
"epoch": 1.030843373493976,
"grad_norm": 0.4288000464439392,
"learning_rate": 3.323531791853901e-05,
"loss": 0.0341,
"step": 535
},
{
"epoch": 1.0327710843373494,
"grad_norm": 0.26600322127342224,
"learning_rate": 3.3201661756838e-05,
"loss": 0.0184,
"step": 536
},
{
"epoch": 1.034698795180723,
"grad_norm": 0.290937602519989,
"learning_rate": 3.316793921289835e-05,
"loss": 0.0152,
"step": 537
},
{
"epoch": 1.0366265060240965,
"grad_norm": 0.7621443271636963,
"learning_rate": 3.313415045628795e-05,
"loss": 0.0326,
"step": 538
},
{
"epoch": 1.03855421686747,
"grad_norm": 0.5581283569335938,
"learning_rate": 3.3100295656907646e-05,
"loss": 0.0164,
"step": 539
},
{
"epoch": 1.0404819277108435,
"grad_norm": 0.20930901169776917,
"learning_rate": 3.306637498499034e-05,
"loss": 0.0091,
"step": 540
},
{
"epoch": 1.0424096385542168,
"grad_norm": 0.46212059259414673,
"learning_rate": 3.303238861110018e-05,
"loss": 0.0118,
"step": 541
},
{
"epoch": 1.0443373493975903,
"grad_norm": 0.38259151577949524,
"learning_rate": 3.299833670613168e-05,
"loss": 0.0081,
"step": 542
},
{
"epoch": 1.0462650602409638,
"grad_norm": 0.4888618290424347,
"learning_rate": 3.2964219441308865e-05,
"loss": 0.0138,
"step": 543
},
{
"epoch": 1.0481927710843373,
"grad_norm": 0.32103127241134644,
"learning_rate": 3.2930036988184425e-05,
"loss": 0.0171,
"step": 544
},
{
"epoch": 1.0501204819277108,
"grad_norm": 0.27787327766418457,
"learning_rate": 3.28957895186388e-05,
"loss": 0.0106,
"step": 545
},
{
"epoch": 1.0520481927710843,
"grad_norm": 0.35597777366638184,
"learning_rate": 3.2861477204879395e-05,
"loss": 0.0123,
"step": 546
},
{
"epoch": 1.0539759036144578,
"grad_norm": 0.3619804084300995,
"learning_rate": 3.2827100219439656e-05,
"loss": 0.0088,
"step": 547
},
{
"epoch": 1.0559036144578313,
"grad_norm": 0.2525513470172882,
"learning_rate": 3.279265873517822e-05,
"loss": 0.0179,
"step": 548
},
{
"epoch": 1.0578313253012048,
"grad_norm": 0.3910020887851715,
"learning_rate": 3.275815292527804e-05,
"loss": 0.0142,
"step": 549
},
{
"epoch": 1.0597590361445783,
"grad_norm": 0.30515050888061523,
"learning_rate": 3.2723582963245526e-05,
"loss": 0.0123,
"step": 550
},
{
"epoch": 1.0616867469879518,
"grad_norm": 0.21708644926548004,
"learning_rate": 3.2688949022909665e-05,
"loss": 0.0098,
"step": 551
},
{
"epoch": 1.0636144578313254,
"grad_norm": 0.23307719826698303,
"learning_rate": 3.265425127842114e-05,
"loss": 0.0097,
"step": 552
},
{
"epoch": 1.0655421686746989,
"grad_norm": 0.676654577255249,
"learning_rate": 3.261948990425147e-05,
"loss": 0.0227,
"step": 553
},
{
"epoch": 1.0674698795180724,
"grad_norm": 0.4593975841999054,
"learning_rate": 3.258466507519213e-05,
"loss": 0.047,
"step": 554
},
{
"epoch": 1.0693975903614459,
"grad_norm": 0.19405829906463623,
"learning_rate": 3.254977696635366e-05,
"loss": 0.0314,
"step": 555
},
{
"epoch": 1.0713253012048192,
"grad_norm": 0.14563389122486115,
"learning_rate": 3.2514825753164774e-05,
"loss": 0.0046,
"step": 556
},
{
"epoch": 1.0732530120481927,
"grad_norm": 0.2642340064048767,
"learning_rate": 3.247981161137153e-05,
"loss": 0.022,
"step": 557
},
{
"epoch": 1.0751807228915662,
"grad_norm": 0.17274761199951172,
"learning_rate": 3.2444734717036386e-05,
"loss": 0.0134,
"step": 558
},
{
"epoch": 1.0771084337349397,
"grad_norm": 0.44354626536369324,
"learning_rate": 3.240959524653735e-05,
"loss": 0.0211,
"step": 559
},
{
"epoch": 1.0790361445783132,
"grad_norm": 0.2806888818740845,
"learning_rate": 3.237439337656708e-05,
"loss": 0.0141,
"step": 560
},
{
"epoch": 1.0809638554216867,
"grad_norm": 0.21679501235485077,
"learning_rate": 3.2339129284131994e-05,
"loss": 0.019,
"step": 561
},
{
"epoch": 1.0828915662650602,
"grad_norm": 0.3040260076522827,
"learning_rate": 3.2303803146551386e-05,
"loss": 0.0249,
"step": 562
},
{
"epoch": 1.0848192771084337,
"grad_norm": 0.2793775200843811,
"learning_rate": 3.226841514145656e-05,
"loss": 0.0088,
"step": 563
},
{
"epoch": 1.0867469879518072,
"grad_norm": 0.149955615401268,
"learning_rate": 3.223296544678987e-05,
"loss": 0.0054,
"step": 564
},
{
"epoch": 1.0886746987951808,
"grad_norm": 0.22166767716407776,
"learning_rate": 3.219745424080389e-05,
"loss": 0.0109,
"step": 565
},
{
"epoch": 1.0906024096385543,
"grad_norm": 0.22399431467056274,
"learning_rate": 3.2161881702060476e-05,
"loss": 0.0106,
"step": 566
},
{
"epoch": 1.0925301204819278,
"grad_norm": 0.18537986278533936,
"learning_rate": 3.2126248009429905e-05,
"loss": 0.0077,
"step": 567
},
{
"epoch": 1.0944578313253013,
"grad_norm": 0.24511495232582092,
"learning_rate": 3.2090553342089935e-05,
"loss": 0.0093,
"step": 568
},
{
"epoch": 1.0963855421686748,
"grad_norm": 0.4766045808792114,
"learning_rate": 3.205479787952494e-05,
"loss": 0.036,
"step": 569
},
{
"epoch": 1.0983132530120483,
"grad_norm": 0.1425715535879135,
"learning_rate": 3.201898180152499e-05,
"loss": 0.0085,
"step": 570
},
{
"epoch": 1.1002409638554216,
"grad_norm": 0.1909666359424591,
"learning_rate": 3.1983105288184945e-05,
"loss": 0.0081,
"step": 571
},
{
"epoch": 1.102168674698795,
"grad_norm": 0.44077104330062866,
"learning_rate": 3.194716851990355e-05,
"loss": 0.017,
"step": 572
},
{
"epoch": 1.1040963855421686,
"grad_norm": 0.5757400989532471,
"learning_rate": 3.191117167738253e-05,
"loss": 0.021,
"step": 573
},
{
"epoch": 1.106024096385542,
"grad_norm": 0.1977701038122177,
"learning_rate": 3.1875114941625705e-05,
"loss": 0.0096,
"step": 574
},
{
"epoch": 1.1079518072289156,
"grad_norm": 0.3524581491947174,
"learning_rate": 3.1838998493938026e-05,
"loss": 0.0118,
"step": 575
},
{
"epoch": 1.1098795180722891,
"grad_norm": 0.3301331698894501,
"learning_rate": 3.180282251592472e-05,
"loss": 0.0094,
"step": 576
},
{
"epoch": 1.1118072289156626,
"grad_norm": 0.2774488925933838,
"learning_rate": 3.1766587189490336e-05,
"loss": 0.0131,
"step": 577
},
{
"epoch": 1.1137349397590361,
"grad_norm": 1.732595443725586,
"learning_rate": 3.173029269683785e-05,
"loss": 0.0445,
"step": 578
},
{
"epoch": 1.1156626506024097,
"grad_norm": 0.28746843338012695,
"learning_rate": 3.169393922046776e-05,
"loss": 0.0116,
"step": 579
},
{
"epoch": 1.1175903614457832,
"grad_norm": 0.2952995002269745,
"learning_rate": 3.165752694317713e-05,
"loss": 0.0116,
"step": 580
},
{
"epoch": 1.1195180722891567,
"grad_norm": 0.2938575744628906,
"learning_rate": 3.16210560480587e-05,
"loss": 0.013,
"step": 581
},
{
"epoch": 1.1214457831325302,
"grad_norm": 0.22283495962619781,
"learning_rate": 3.158452671849998e-05,
"loss": 0.0052,
"step": 582
},
{
"epoch": 1.1233734939759037,
"grad_norm": 0.6272858381271362,
"learning_rate": 3.154793913818226e-05,
"loss": 0.0182,
"step": 583
},
{
"epoch": 1.1253012048192772,
"grad_norm": 0.479753702878952,
"learning_rate": 3.1511293491079804e-05,
"loss": 0.0146,
"step": 584
},
{
"epoch": 1.1272289156626507,
"grad_norm": 0.31104400753974915,
"learning_rate": 3.1474589961458786e-05,
"loss": 0.0139,
"step": 585
},
{
"epoch": 1.129156626506024,
"grad_norm": 0.4932832419872284,
"learning_rate": 3.1437828733876477e-05,
"loss": 0.0236,
"step": 586
},
{
"epoch": 1.1310843373493975,
"grad_norm": 0.222808837890625,
"learning_rate": 3.140100999318025e-05,
"loss": 0.0084,
"step": 587
},
{
"epoch": 1.133012048192771,
"grad_norm": 0.4515356719493866,
"learning_rate": 3.136413392450668e-05,
"loss": 0.0215,
"step": 588
},
{
"epoch": 1.1349397590361445,
"grad_norm": 0.39302268624305725,
"learning_rate": 3.132720071328061e-05,
"loss": 0.0154,
"step": 589
},
{
"epoch": 1.136867469879518,
"grad_norm": 0.43382835388183594,
"learning_rate": 3.1290210545214205e-05,
"loss": 0.0088,
"step": 590
},
{
"epoch": 1.1387951807228915,
"grad_norm": 0.18707136809825897,
"learning_rate": 3.125316360630602e-05,
"loss": 0.0126,
"step": 591
},
{
"epoch": 1.140722891566265,
"grad_norm": 0.5688219666481018,
"learning_rate": 3.121606008284011e-05,
"loss": 0.0147,
"step": 592
},
{
"epoch": 1.1426506024096386,
"grad_norm": 0.3321833312511444,
"learning_rate": 3.1178900161385005e-05,
"loss": 0.0119,
"step": 593
},
{
"epoch": 1.144578313253012,
"grad_norm": 0.3738424777984619,
"learning_rate": 3.114168402879286e-05,
"loss": 0.0158,
"step": 594
},
{
"epoch": 1.1465060240963856,
"grad_norm": 0.2386978417634964,
"learning_rate": 3.110441187219846e-05,
"loss": 0.0107,
"step": 595
},
{
"epoch": 1.148433734939759,
"grad_norm": 0.2165699452161789,
"learning_rate": 3.10670838790183e-05,
"loss": 0.0079,
"step": 596
},
{
"epoch": 1.1503614457831326,
"grad_norm": 0.25952696800231934,
"learning_rate": 3.102970023694965e-05,
"loss": 0.0147,
"step": 597
},
{
"epoch": 1.152289156626506,
"grad_norm": 0.21448305249214172,
"learning_rate": 3.099226113396959e-05,
"loss": 0.0099,
"step": 598
},
{
"epoch": 1.1542168674698796,
"grad_norm": 0.37226060032844543,
"learning_rate": 3.095476675833405e-05,
"loss": 0.0214,
"step": 599
},
{
"epoch": 1.1561445783132531,
"grad_norm": 0.29637983441352844,
"learning_rate": 3.0917217298576955e-05,
"loss": 0.0118,
"step": 600
},
{
"epoch": 1.1580722891566264,
"grad_norm": 0.18535609543323517,
"learning_rate": 3.0879612943509154e-05,
"loss": 0.0086,
"step": 601
},
{
"epoch": 1.16,
"grad_norm": 0.25874125957489014,
"learning_rate": 3.0841953882217536e-05,
"loss": 0.0088,
"step": 602
},
{
"epoch": 1.1619277108433734,
"grad_norm": 0.46092745661735535,
"learning_rate": 3.08042403040641e-05,
"loss": 0.0241,
"step": 603
},
{
"epoch": 1.163855421686747,
"grad_norm": 0.27023249864578247,
"learning_rate": 3.076647239868494e-05,
"loss": 0.0154,
"step": 604
},
{
"epoch": 1.1657831325301204,
"grad_norm": 0.445157527923584,
"learning_rate": 3.072865035598933e-05,
"loss": 0.0197,
"step": 605
},
{
"epoch": 1.167710843373494,
"grad_norm": 0.18097272515296936,
"learning_rate": 3.06907743661588e-05,
"loss": 0.0093,
"step": 606
},
{
"epoch": 1.1696385542168675,
"grad_norm": 0.22469942271709442,
"learning_rate": 3.065284461964609e-05,
"loss": 0.0171,
"step": 607
},
{
"epoch": 1.171566265060241,
"grad_norm": 0.20190906524658203,
"learning_rate": 3.061486130717428e-05,
"loss": 0.008,
"step": 608
},
{
"epoch": 1.1734939759036145,
"grad_norm": 0.18294145166873932,
"learning_rate": 3.057682461973579e-05,
"loss": 0.0155,
"step": 609
},
{
"epoch": 1.175421686746988,
"grad_norm": 0.34203943610191345,
"learning_rate": 3.053873474859143e-05,
"loss": 0.0212,
"step": 610
},
{
"epoch": 1.1773493975903615,
"grad_norm": 0.49073582887649536,
"learning_rate": 3.050059188526942e-05,
"loss": 0.019,
"step": 611
},
{
"epoch": 1.179277108433735,
"grad_norm": 0.3537680506706238,
"learning_rate": 3.046239622156446e-05,
"loss": 0.0147,
"step": 612
},
{
"epoch": 1.1812048192771085,
"grad_norm": 0.2584632635116577,
"learning_rate": 3.042414794953674e-05,
"loss": 0.0088,
"step": 613
},
{
"epoch": 1.1831325301204818,
"grad_norm": 0.3529360890388489,
"learning_rate": 3.0385847261510975e-05,
"loss": 0.0187,
"step": 614
},
{
"epoch": 1.1850602409638555,
"grad_norm": 0.3331570327281952,
"learning_rate": 3.0347494350075465e-05,
"loss": 0.0124,
"step": 615
},
{
"epoch": 1.1869879518072288,
"grad_norm": 0.2223527580499649,
"learning_rate": 3.0309089408081074e-05,
"loss": 0.01,
"step": 616
},
{
"epoch": 1.1889156626506023,
"grad_norm": 0.21985746920108795,
"learning_rate": 3.027063262864032e-05,
"loss": 0.0087,
"step": 617
},
{
"epoch": 1.1908433734939758,
"grad_norm": 0.2989653944969177,
"learning_rate": 3.023212420512637e-05,
"loss": 0.0137,
"step": 618
},
{
"epoch": 1.1927710843373494,
"grad_norm": 0.17423275113105774,
"learning_rate": 3.0193564331172074e-05,
"loss": 0.0056,
"step": 619
},
{
"epoch": 1.1946987951807229,
"grad_norm": 1.0992127656936646,
"learning_rate": 3.0154953200668976e-05,
"loss": 0.0274,
"step": 620
},
{
"epoch": 1.1966265060240964,
"grad_norm": 0.21641989052295685,
"learning_rate": 3.011629100776638e-05,
"loss": 0.0151,
"step": 621
},
{
"epoch": 1.1985542168674699,
"grad_norm": 0.4558199644088745,
"learning_rate": 3.007757794687033e-05,
"loss": 0.0424,
"step": 622
},
{
"epoch": 1.2004819277108434,
"grad_norm": 0.42380189895629883,
"learning_rate": 3.003881421264266e-05,
"loss": 0.0079,
"step": 623
},
{
"epoch": 1.202409638554217,
"grad_norm": 0.28791171312332153,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.0142,
"step": 624
},
{
"epoch": 1.2043373493975904,
"grad_norm": 0.3906581997871399,
"learning_rate": 2.996113550411281e-05,
"loss": 0.0251,
"step": 625
},
{
"epoch": 1.206265060240964,
"grad_norm": 0.47848746180534363,
"learning_rate": 2.9922220920404375e-05,
"loss": 0.0137,
"step": 626
},
{
"epoch": 1.2081927710843374,
"grad_norm": 0.22666941583156586,
"learning_rate": 2.9883256444549862e-05,
"loss": 0.0105,
"step": 627
},
{
"epoch": 1.210120481927711,
"grad_norm": 0.18968136608600616,
"learning_rate": 2.984424227247529e-05,
"loss": 0.0089,
"step": 628
},
{
"epoch": 1.2120481927710842,
"grad_norm": 0.28732606768608093,
"learning_rate": 2.980517860035656e-05,
"loss": 0.0253,
"step": 629
},
{
"epoch": 1.213975903614458,
"grad_norm": 0.21131543815135956,
"learning_rate": 2.9766065624618518e-05,
"loss": 0.0134,
"step": 630
},
{
"epoch": 1.2159036144578312,
"grad_norm": 0.7594877481460571,
"learning_rate": 2.972690354193388e-05,
"loss": 0.0157,
"step": 631
},
{
"epoch": 1.2178313253012047,
"grad_norm": 0.730291485786438,
"learning_rate": 2.96876925492223e-05,
"loss": 0.0204,
"step": 632
},
{
"epoch": 1.2197590361445783,
"grad_norm": 0.20333674550056458,
"learning_rate": 2.9648432843649382e-05,
"loss": 0.0114,
"step": 633
},
{
"epoch": 1.2216867469879518,
"grad_norm": 0.5680793523788452,
"learning_rate": 2.960912462262566e-05,
"loss": 0.0146,
"step": 634
},
{
"epoch": 1.2236144578313253,
"grad_norm": 0.4591079354286194,
"learning_rate": 2.9569768083805618e-05,
"loss": 0.0112,
"step": 635
},
{
"epoch": 1.2255421686746988,
"grad_norm": 0.3793511390686035,
"learning_rate": 2.953036342508671e-05,
"loss": 0.0377,
"step": 636
},
{
"epoch": 1.2274698795180723,
"grad_norm": 1.118723750114441,
"learning_rate": 2.9490910844608346e-05,
"loss": 0.0432,
"step": 637
},
{
"epoch": 1.2293975903614458,
"grad_norm": 0.36990776658058167,
"learning_rate": 2.9451410540750887e-05,
"loss": 0.0203,
"step": 638
},
{
"epoch": 1.2313253012048193,
"grad_norm": 0.930397629737854,
"learning_rate": 2.94118627121347e-05,
"loss": 0.0311,
"step": 639
},
{
"epoch": 1.2332530120481928,
"grad_norm": 0.2347625195980072,
"learning_rate": 2.9372267557619075e-05,
"loss": 0.0168,
"step": 640
},
{
"epoch": 1.2351807228915663,
"grad_norm": 0.3720332384109497,
"learning_rate": 2.933262527630131e-05,
"loss": 0.0136,
"step": 641
},
{
"epoch": 1.2371084337349398,
"grad_norm": 0.4871984124183655,
"learning_rate": 2.929293606751565e-05,
"loss": 0.0339,
"step": 642
},
{
"epoch": 1.2390361445783133,
"grad_norm": 0.35853689908981323,
"learning_rate": 2.9253200130832322e-05,
"loss": 0.0095,
"step": 643
},
{
"epoch": 1.2409638554216866,
"grad_norm": 0.42003703117370605,
"learning_rate": 2.92134176660565e-05,
"loss": 0.0142,
"step": 644
},
{
"epoch": 1.2428915662650604,
"grad_norm": 0.3854500651359558,
"learning_rate": 2.9173588873227338e-05,
"loss": 0.0209,
"step": 645
},
{
"epoch": 1.2448192771084337,
"grad_norm": 0.24665917456150055,
"learning_rate": 2.913371395261691e-05,
"loss": 0.0087,
"step": 646
},
{
"epoch": 1.2467469879518072,
"grad_norm": 0.41571593284606934,
"learning_rate": 2.9093793104729268e-05,
"loss": 0.0164,
"step": 647
},
{
"epoch": 1.2486746987951807,
"grad_norm": 0.4597891569137573,
"learning_rate": 2.9053826530299377e-05,
"loss": 0.0138,
"step": 648
},
{
"epoch": 1.2506024096385542,
"grad_norm": 0.43345385789871216,
"learning_rate": 2.901381443029215e-05,
"loss": 0.0353,
"step": 649
},
{
"epoch": 1.2525301204819277,
"grad_norm": 0.3706768751144409,
"learning_rate": 2.897375700590141e-05,
"loss": 0.007,
"step": 650
},
{
"epoch": 1.2544578313253012,
"grad_norm": 0.30305296182632446,
"learning_rate": 2.8933654458548873e-05,
"loss": 0.0123,
"step": 651
},
{
"epoch": 1.2563855421686747,
"grad_norm": 0.2042127549648285,
"learning_rate": 2.8893506989883167e-05,
"loss": 0.0099,
"step": 652
},
{
"epoch": 1.2583132530120482,
"grad_norm": 0.20524422824382782,
"learning_rate": 2.8853314801778784e-05,
"loss": 0.0097,
"step": 653
},
{
"epoch": 1.2602409638554217,
"grad_norm": 0.2351921945810318,
"learning_rate": 2.8813078096335093e-05,
"loss": 0.0091,
"step": 654
},
{
"epoch": 1.2621686746987952,
"grad_norm": 0.34547340869903564,
"learning_rate": 2.87727970758753e-05,
"loss": 0.0088,
"step": 655
},
{
"epoch": 1.2640963855421687,
"grad_norm": 0.35163217782974243,
"learning_rate": 2.8732471942945443e-05,
"loss": 0.0145,
"step": 656
},
{
"epoch": 1.266024096385542,
"grad_norm": 1.715137243270874,
"learning_rate": 2.8692102900313378e-05,
"loss": 0.0198,
"step": 657
},
{
"epoch": 1.2679518072289158,
"grad_norm": 0.2860178053379059,
"learning_rate": 2.8651690150967748e-05,
"loss": 0.0085,
"step": 658
},
{
"epoch": 1.269879518072289,
"grad_norm": 0.21175967156887054,
"learning_rate": 2.8611233898116967e-05,
"loss": 0.0071,
"step": 659
},
{
"epoch": 1.2718072289156628,
"grad_norm": 0.33726972341537476,
"learning_rate": 2.85707343451882e-05,
"loss": 0.012,
"step": 660
},
{
"epoch": 1.273734939759036,
"grad_norm": 0.2138456553220749,
"learning_rate": 2.853019169582635e-05,
"loss": 0.0092,
"step": 661
},
{
"epoch": 1.2756626506024096,
"grad_norm": 0.2304934412240982,
"learning_rate": 2.8489606153892997e-05,
"loss": 0.0144,
"step": 662
},
{
"epoch": 1.277590361445783,
"grad_norm": 0.2691061794757843,
"learning_rate": 2.8448977923465425e-05,
"loss": 0.0121,
"step": 663
},
{
"epoch": 1.2795180722891566,
"grad_norm": 0.35254305601119995,
"learning_rate": 2.840830720883555e-05,
"loss": 0.0125,
"step": 664
},
{
"epoch": 1.28144578313253,
"grad_norm": 0.36552608013153076,
"learning_rate": 2.836759421450893e-05,
"loss": 0.021,
"step": 665
},
{
"epoch": 1.2833734939759036,
"grad_norm": 0.37177154421806335,
"learning_rate": 2.83268391452037e-05,
"loss": 0.0216,
"step": 666
},
{
"epoch": 1.2853012048192771,
"grad_norm": 0.20932547748088837,
"learning_rate": 2.828604220584958e-05,
"loss": 0.0077,
"step": 667
},
{
"epoch": 1.2872289156626506,
"grad_norm": 0.5158557295799255,
"learning_rate": 2.824520360158681e-05,
"loss": 0.0394,
"step": 668
},
{
"epoch": 1.2891566265060241,
"grad_norm": 0.22623969614505768,
"learning_rate": 2.820432353776515e-05,
"loss": 0.0087,
"step": 669
},
{
"epoch": 1.2910843373493976,
"grad_norm": 0.2996046245098114,
"learning_rate": 2.8163402219942822e-05,
"loss": 0.01,
"step": 670
},
{
"epoch": 1.2930120481927712,
"grad_norm": 0.24957989156246185,
"learning_rate": 2.8122439853885488e-05,
"loss": 0.0127,
"step": 671
},
{
"epoch": 1.2949397590361444,
"grad_norm": 0.2636559307575226,
"learning_rate": 2.8081436645565216e-05,
"loss": 0.0128,
"step": 672
},
{
"epoch": 1.2968674698795182,
"grad_norm": 0.3531591296195984,
"learning_rate": 2.804039280115944e-05,
"loss": 0.0199,
"step": 673
},
{
"epoch": 1.2987951807228915,
"grad_norm": 0.3682299852371216,
"learning_rate": 2.7999308527049927e-05,
"loss": 0.0088,
"step": 674
},
{
"epoch": 1.3007228915662652,
"grad_norm": 0.19555217027664185,
"learning_rate": 2.795818402982174e-05,
"loss": 0.0084,
"step": 675
},
{
"epoch": 1.3026506024096385,
"grad_norm": 0.2864912450313568,
"learning_rate": 2.7917019516262186e-05,
"loss": 0.0154,
"step": 676
},
{
"epoch": 1.304578313253012,
"grad_norm": 0.2211237996816635,
"learning_rate": 2.78758151933598e-05,
"loss": 0.0078,
"step": 677
},
{
"epoch": 1.3065060240963855,
"grad_norm": 0.13646945357322693,
"learning_rate": 2.7834571268303294e-05,
"loss": 0.0058,
"step": 678
},
{
"epoch": 1.308433734939759,
"grad_norm": 0.16530285775661469,
"learning_rate": 2.779328794848049e-05,
"loss": 0.007,
"step": 679
},
{
"epoch": 1.3103614457831325,
"grad_norm": 0.2145693302154541,
"learning_rate": 2.7751965441477325e-05,
"loss": 0.0203,
"step": 680
},
{
"epoch": 1.312289156626506,
"grad_norm": 0.24273739755153656,
"learning_rate": 2.771060395507677e-05,
"loss": 0.0106,
"step": 681
},
{
"epoch": 1.3142168674698795,
"grad_norm": 0.20430618524551392,
"learning_rate": 2.7669203697257794e-05,
"loss": 0.0122,
"step": 682
},
{
"epoch": 1.316144578313253,
"grad_norm": 0.2502615749835968,
"learning_rate": 2.7627764876194335e-05,
"loss": 0.0101,
"step": 683
},
{
"epoch": 1.3180722891566266,
"grad_norm": 0.287239670753479,
"learning_rate": 2.7586287700254214e-05,
"loss": 0.0203,
"step": 684
},
{
"epoch": 1.32,
"grad_norm": 0.16239754855632782,
"learning_rate": 2.7544772377998147e-05,
"loss": 0.0084,
"step": 685
},
{
"epoch": 1.3219277108433736,
"grad_norm": 0.27174142003059387,
"learning_rate": 2.7503219118178636e-05,
"loss": 0.008,
"step": 686
},
{
"epoch": 1.3238554216867469,
"grad_norm": 0.12878240644931793,
"learning_rate": 2.7461628129738954e-05,
"loss": 0.0053,
"step": 687
},
{
"epoch": 1.3257831325301206,
"grad_norm": 0.16112515330314636,
"learning_rate": 2.7419999621812086e-05,
"loss": 0.0059,
"step": 688
},
{
"epoch": 1.3277108433734939,
"grad_norm": 0.2398834228515625,
"learning_rate": 2.7378333803719672e-05,
"loss": 0.0095,
"step": 689
},
{
"epoch": 1.3296385542168676,
"grad_norm": 0.18516193330287933,
"learning_rate": 2.733663088497097e-05,
"loss": 0.0071,
"step": 690
},
{
"epoch": 1.331566265060241,
"grad_norm": 0.2974924147129059,
"learning_rate": 2.7294891075261785e-05,
"loss": 0.0227,
"step": 691
},
{
"epoch": 1.3334939759036144,
"grad_norm": 0.12931054830551147,
"learning_rate": 2.7253114584473418e-05,
"loss": 0.0039,
"step": 692
},
{
"epoch": 1.335421686746988,
"grad_norm": 0.16319474577903748,
"learning_rate": 2.7211301622671623e-05,
"loss": 0.008,
"step": 693
},
{
"epoch": 1.3373493975903614,
"grad_norm": 0.27622169256210327,
"learning_rate": 2.7169452400105533e-05,
"loss": 0.0238,
"step": 694
},
{
"epoch": 1.339277108433735,
"grad_norm": 0.45309779047966003,
"learning_rate": 2.712756712720663e-05,
"loss": 0.0439,
"step": 695
},
{
"epoch": 1.3412048192771084,
"grad_norm": 0.2469855099916458,
"learning_rate": 2.708564601458765e-05,
"loss": 0.0085,
"step": 696
},
{
"epoch": 1.343132530120482,
"grad_norm": 0.4245856702327728,
"learning_rate": 2.7043689273041535e-05,
"loss": 0.0097,
"step": 697
},
{
"epoch": 1.3450602409638555,
"grad_norm": 0.26796087622642517,
"learning_rate": 2.7001697113540414e-05,
"loss": 0.0119,
"step": 698
},
{
"epoch": 1.346987951807229,
"grad_norm": 0.3569283187389374,
"learning_rate": 2.6959669747234482e-05,
"loss": 0.0096,
"step": 699
},
{
"epoch": 1.3489156626506025,
"grad_norm": 0.7038524150848389,
"learning_rate": 2.6917607385450973e-05,
"loss": 0.0317,
"step": 700
},
{
"epoch": 1.350843373493976,
"grad_norm": 0.23568563163280487,
"learning_rate": 2.687551023969308e-05,
"loss": 0.0112,
"step": 701
},
{
"epoch": 1.3527710843373493,
"grad_norm": 0.20338499546051025,
"learning_rate": 2.6833378521638935e-05,
"loss": 0.0092,
"step": 702
},
{
"epoch": 1.354698795180723,
"grad_norm": 4.22187614440918,
"learning_rate": 2.679121244314046e-05,
"loss": 0.0314,
"step": 703
},
{
"epoch": 1.3566265060240963,
"grad_norm": 0.2542206048965454,
"learning_rate": 2.674901221622239e-05,
"loss": 0.0158,
"step": 704
},
{
"epoch": 1.3585542168674698,
"grad_norm": 0.49705010652542114,
"learning_rate": 2.670677805308116e-05,
"loss": 0.0162,
"step": 705
},
{
"epoch": 1.3604819277108433,
"grad_norm": 0.17502115666866302,
"learning_rate": 2.666451016608383e-05,
"loss": 0.0074,
"step": 706
},
{
"epoch": 1.3624096385542168,
"grad_norm": 0.21738742291927338,
"learning_rate": 2.6622208767767075e-05,
"loss": 0.0135,
"step": 707
},
{
"epoch": 1.3643373493975903,
"grad_norm": 0.3309847414493561,
"learning_rate": 2.6579874070836032e-05,
"loss": 0.0107,
"step": 708
},
{
"epoch": 1.3662650602409638,
"grad_norm": 0.10706827789545059,
"learning_rate": 2.6537506288163303e-05,
"loss": 0.0043,
"step": 709
},
{
"epoch": 1.3681927710843373,
"grad_norm": 0.173640176653862,
"learning_rate": 2.6495105632787835e-05,
"loss": 0.0092,
"step": 710
},
{
"epoch": 1.3701204819277109,
"grad_norm": 0.2636397182941437,
"learning_rate": 2.6452672317913893e-05,
"loss": 0.0097,
"step": 711
},
{
"epoch": 1.3720481927710844,
"grad_norm": 0.28485360741615295,
"learning_rate": 2.6410206556909943e-05,
"loss": 0.0193,
"step": 712
},
{
"epoch": 1.3739759036144579,
"grad_norm": 0.23210027813911438,
"learning_rate": 2.636770856330761e-05,
"loss": 0.0229,
"step": 713
},
{
"epoch": 1.3759036144578314,
"grad_norm": 0.13388316333293915,
"learning_rate": 2.6325178550800596e-05,
"loss": 0.004,
"step": 714
},
{
"epoch": 1.377831325301205,
"grad_norm": 0.5131422877311707,
"learning_rate": 2.6282616733243603e-05,
"loss": 0.0137,
"step": 715
},
{
"epoch": 1.3797590361445784,
"grad_norm": 0.3243267834186554,
"learning_rate": 2.6240023324651258e-05,
"loss": 0.0153,
"step": 716
},
{
"epoch": 1.3816867469879517,
"grad_norm": 0.1440611034631729,
"learning_rate": 2.619739853919704e-05,
"loss": 0.0031,
"step": 717
},
{
"epoch": 1.3836144578313254,
"grad_norm": 0.30346596240997314,
"learning_rate": 2.6154742591212196e-05,
"loss": 0.0109,
"step": 718
},
{
"epoch": 1.3855421686746987,
"grad_norm": 0.19109240174293518,
"learning_rate": 2.611205569518468e-05,
"loss": 0.0094,
"step": 719
},
{
"epoch": 1.3874698795180722,
"grad_norm": 0.28636518120765686,
"learning_rate": 2.6069338065758056e-05,
"loss": 0.0123,
"step": 720
},
{
"epoch": 1.3893975903614457,
"grad_norm": 0.28083911538124084,
"learning_rate": 2.6026589917730416e-05,
"loss": 0.0104,
"step": 721
},
{
"epoch": 1.3913253012048192,
"grad_norm": 0.36553966999053955,
"learning_rate": 2.5983811466053327e-05,
"loss": 0.0143,
"step": 722
},
{
"epoch": 1.3932530120481927,
"grad_norm": 0.23317205905914307,
"learning_rate": 2.5941002925830708e-05,
"loss": 0.011,
"step": 723
},
{
"epoch": 1.3951807228915662,
"grad_norm": 0.3825171887874603,
"learning_rate": 2.589816451231781e-05,
"loss": 0.0098,
"step": 724
},
{
"epoch": 1.3971084337349398,
"grad_norm": 0.19916608929634094,
"learning_rate": 2.585529644092006e-05,
"loss": 0.0094,
"step": 725
},
{
"epoch": 1.3990361445783133,
"grad_norm": 0.19990523159503937,
"learning_rate": 2.5812398927192027e-05,
"loss": 0.0128,
"step": 726
},
{
"epoch": 1.4009638554216868,
"grad_norm": 0.34662899374961853,
"learning_rate": 2.5769472186836347e-05,
"loss": 0.0091,
"step": 727
},
{
"epoch": 1.4028915662650603,
"grad_norm": 0.23481112718582153,
"learning_rate": 2.5726516435702583e-05,
"loss": 0.0154,
"step": 728
},
{
"epoch": 1.4048192771084338,
"grad_norm": 0.1846667379140854,
"learning_rate": 2.5683531889786194e-05,
"loss": 0.0088,
"step": 729
},
{
"epoch": 1.4067469879518073,
"grad_norm": 0.16717663407325745,
"learning_rate": 2.564051876522742e-05,
"loss": 0.0083,
"step": 730
},
{
"epoch": 1.4086746987951808,
"grad_norm": 0.4116475284099579,
"learning_rate": 2.5597477278310202e-05,
"loss": 0.0179,
"step": 731
},
{
"epoch": 1.410602409638554,
"grad_norm": 0.171807661652565,
"learning_rate": 2.5554407645461115e-05,
"loss": 0.0063,
"step": 732
},
{
"epoch": 1.4125301204819278,
"grad_norm": 0.1954439878463745,
"learning_rate": 2.5511310083248243e-05,
"loss": 0.017,
"step": 733
},
{
"epoch": 1.4144578313253011,
"grad_norm": 0.37158989906311035,
"learning_rate": 2.5468184808380104e-05,
"loss": 0.0173,
"step": 734
},
{
"epoch": 1.4163855421686746,
"grad_norm": 0.2001633644104004,
"learning_rate": 2.542503203770458e-05,
"loss": 0.0165,
"step": 735
},
{
"epoch": 1.4183132530120481,
"grad_norm": 0.45673373341560364,
"learning_rate": 2.53818519882078e-05,
"loss": 0.0185,
"step": 736
},
{
"epoch": 1.4202409638554216,
"grad_norm": 0.3838701546192169,
"learning_rate": 2.5338644877013067e-05,
"loss": 0.0134,
"step": 737
},
{
"epoch": 1.4221686746987952,
"grad_norm": 0.32032477855682373,
"learning_rate": 2.5295410921379745e-05,
"loss": 0.0143,
"step": 738
},
{
"epoch": 1.4240963855421687,
"grad_norm": 0.4594039022922516,
"learning_rate": 2.52521503387022e-05,
"loss": 0.0193,
"step": 739
},
{
"epoch": 1.4260240963855422,
"grad_norm": 0.3889620900154114,
"learning_rate": 2.5208863346508667e-05,
"loss": 0.0114,
"step": 740
},
{
"epoch": 1.4279518072289157,
"grad_norm": 0.33153319358825684,
"learning_rate": 2.5165550162460203e-05,
"loss": 0.0102,
"step": 741
},
{
"epoch": 1.4298795180722892,
"grad_norm": 0.7269518375396729,
"learning_rate": 2.5122211004349536e-05,
"loss": 0.0215,
"step": 742
},
{
"epoch": 1.4318072289156627,
"grad_norm": 0.31653261184692383,
"learning_rate": 2.5078846090100023e-05,
"loss": 0.0115,
"step": 743
},
{
"epoch": 1.4337349397590362,
"grad_norm": 0.20620353519916534,
"learning_rate": 2.5035455637764518e-05,
"loss": 0.0153,
"step": 744
},
{
"epoch": 1.4356626506024097,
"grad_norm": 0.17266008257865906,
"learning_rate": 2.4992039865524297e-05,
"loss": 0.0069,
"step": 745
},
{
"epoch": 1.4375903614457832,
"grad_norm": 0.24760811030864716,
"learning_rate": 2.494859899168795e-05,
"loss": 0.0108,
"step": 746
},
{
"epoch": 1.4395180722891565,
"grad_norm": 0.2584865391254425,
"learning_rate": 2.4905133234690282e-05,
"loss": 0.0095,
"step": 747
},
{
"epoch": 1.4414457831325302,
"grad_norm": 0.48847514390945435,
"learning_rate": 2.486164281309122e-05,
"loss": 0.0181,
"step": 748
},
{
"epoch": 1.4433734939759035,
"grad_norm": 0.42942047119140625,
"learning_rate": 2.4818127945574717e-05,
"loss": 0.025,
"step": 749
},
{
"epoch": 1.445301204819277,
"grad_norm": 0.23713800311088562,
"learning_rate": 2.4774588850947648e-05,
"loss": 0.0085,
"step": 750
},
{
"epoch": 1.4472289156626506,
"grad_norm": 0.8797569870948792,
"learning_rate": 2.473102574813871e-05,
"loss": 0.0097,
"step": 751
},
{
"epoch": 1.449156626506024,
"grad_norm": 0.2744862735271454,
"learning_rate": 2.4687438856197302e-05,
"loss": 0.0122,
"step": 752
},
{
"epoch": 1.4510843373493976,
"grad_norm": 0.12747010588645935,
"learning_rate": 2.4643828394292478e-05,
"loss": 0.0056,
"step": 753
},
{
"epoch": 1.453012048192771,
"grad_norm": 0.37376829981803894,
"learning_rate": 2.4600194581711775e-05,
"loss": 0.0052,
"step": 754
},
{
"epoch": 1.4549397590361446,
"grad_norm": 0.2536911368370056,
"learning_rate": 2.4556537637860176e-05,
"loss": 0.0113,
"step": 755
},
{
"epoch": 1.456867469879518,
"grad_norm": 0.25950780510902405,
"learning_rate": 2.451285778225894e-05,
"loss": 0.0099,
"step": 756
},
{
"epoch": 1.4587951807228916,
"grad_norm": 0.19535955786705017,
"learning_rate": 2.4469155234544565e-05,
"loss": 0.0069,
"step": 757
},
{
"epoch": 1.4607228915662651,
"grad_norm": 0.22816115617752075,
"learning_rate": 2.442543021446764e-05,
"loss": 0.0088,
"step": 758
},
{
"epoch": 1.4626506024096386,
"grad_norm": 0.3363986313343048,
"learning_rate": 2.4381682941891755e-05,
"loss": 0.0182,
"step": 759
},
{
"epoch": 1.464578313253012,
"grad_norm": 0.21492891013622284,
"learning_rate": 2.4337913636792382e-05,
"loss": 0.0069,
"step": 760
},
{
"epoch": 1.4665060240963856,
"grad_norm": 0.6070862412452698,
"learning_rate": 2.429412251925579e-05,
"loss": 0.0406,
"step": 761
},
{
"epoch": 1.468433734939759,
"grad_norm": 2.6469690799713135,
"learning_rate": 2.425030980947793e-05,
"loss": 0.0205,
"step": 762
},
{
"epoch": 1.4703614457831327,
"grad_norm": 0.30909740924835205,
"learning_rate": 2.420647572776332e-05,
"loss": 0.0136,
"step": 763
},
{
"epoch": 1.472289156626506,
"grad_norm": 0.6639553904533386,
"learning_rate": 2.416262049452395e-05,
"loss": 0.011,
"step": 764
},
{
"epoch": 1.4742168674698795,
"grad_norm": 0.2919616997241974,
"learning_rate": 2.4118744330278147e-05,
"loss": 0.0131,
"step": 765
},
{
"epoch": 1.476144578313253,
"grad_norm": 0.5232429504394531,
"learning_rate": 2.4074847455649523e-05,
"loss": 0.0138,
"step": 766
},
{
"epoch": 1.4780722891566265,
"grad_norm": 5.630630970001221,
"learning_rate": 2.403093009136579e-05,
"loss": 0.0264,
"step": 767
},
{
"epoch": 1.48,
"grad_norm": 0.33234721422195435,
"learning_rate": 2.3986992458257707e-05,
"loss": 0.0111,
"step": 768
},
{
"epoch": 1.4819277108433735,
"grad_norm": 0.28444772958755493,
"learning_rate": 2.3943034777257945e-05,
"loss": 0.0144,
"step": 769
},
{
"epoch": 1.483855421686747,
"grad_norm": 0.16229979693889618,
"learning_rate": 2.38990572694e-05,
"loss": 0.0062,
"step": 770
},
{
"epoch": 1.4857831325301205,
"grad_norm": 0.27474716305732727,
"learning_rate": 2.385506015581704e-05,
"loss": 0.0172,
"step": 771
},
{
"epoch": 1.487710843373494,
"grad_norm": 0.246526300907135,
"learning_rate": 2.381104365774083e-05,
"loss": 0.012,
"step": 772
},
{
"epoch": 1.4896385542168675,
"grad_norm": 0.282047837972641,
"learning_rate": 2.37670079965006e-05,
"loss": 0.0116,
"step": 773
},
{
"epoch": 1.491566265060241,
"grad_norm": 0.2878139317035675,
"learning_rate": 2.3722953393521944e-05,
"loss": 0.0147,
"step": 774
},
{
"epoch": 1.4934939759036143,
"grad_norm": 0.5586277842521667,
"learning_rate": 2.367888007032571e-05,
"loss": 0.0111,
"step": 775
},
{
"epoch": 1.495421686746988,
"grad_norm": 0.562160313129425,
"learning_rate": 2.3634788248526846e-05,
"loss": 0.0061,
"step": 776
},
{
"epoch": 1.4973493975903613,
"grad_norm": 0.3452005982398987,
"learning_rate": 2.3590678149833356e-05,
"loss": 0.0205,
"step": 777
},
{
"epoch": 1.499277108433735,
"grad_norm": 0.7757686376571655,
"learning_rate": 2.3546549996045114e-05,
"loss": 0.0273,
"step": 778
},
{
"epoch": 1.5012048192771084,
"grad_norm": 0.19530551135540009,
"learning_rate": 2.3502404009052812e-05,
"loss": 0.0083,
"step": 779
},
{
"epoch": 1.503132530120482,
"grad_norm": 0.2586531639099121,
"learning_rate": 2.3458240410836775e-05,
"loss": 0.0122,
"step": 780
},
{
"epoch": 1.5050602409638554,
"grad_norm": 0.30063286423683167,
"learning_rate": 2.3414059423465924e-05,
"loss": 0.0083,
"step": 781
},
{
"epoch": 1.5069879518072289,
"grad_norm": 0.18663185834884644,
"learning_rate": 2.3369861269096575e-05,
"loss": 0.0104,
"step": 782
},
{
"epoch": 1.5089156626506024,
"grad_norm": 0.4405941069126129,
"learning_rate": 2.3325646169971416e-05,
"loss": 0.0264,
"step": 783
},
{
"epoch": 1.510843373493976,
"grad_norm": 0.2947913110256195,
"learning_rate": 2.3281414348418294e-05,
"loss": 0.0107,
"step": 784
},
{
"epoch": 1.5127710843373494,
"grad_norm": 0.23813778162002563,
"learning_rate": 2.3237166026849158e-05,
"loss": 0.0084,
"step": 785
},
{
"epoch": 1.514698795180723,
"grad_norm": 0.33380329608917236,
"learning_rate": 2.3192901427758932e-05,
"loss": 0.0111,
"step": 786
},
{
"epoch": 1.5166265060240964,
"grad_norm": 0.3736988306045532,
"learning_rate": 2.314862077372438e-05,
"loss": 0.0135,
"step": 787
},
{
"epoch": 1.5185542168674697,
"grad_norm": 0.3785395920276642,
"learning_rate": 2.3104324287402996e-05,
"loss": 0.0265,
"step": 788
},
{
"epoch": 1.5204819277108435,
"grad_norm": 0.3359154462814331,
"learning_rate": 2.3060012191531885e-05,
"loss": 0.0127,
"step": 789
},
{
"epoch": 1.5224096385542167,
"grad_norm": 0.720753014087677,
"learning_rate": 2.301568470892664e-05,
"loss": 0.0134,
"step": 790
},
{
"epoch": 1.5243373493975905,
"grad_norm": 0.36473193764686584,
"learning_rate": 2.297134206248024e-05,
"loss": 0.0318,
"step": 791
},
{
"epoch": 1.5262650602409638,
"grad_norm": 0.29987087845802307,
"learning_rate": 2.2926984475161884e-05,
"loss": 0.008,
"step": 792
},
{
"epoch": 1.5281927710843375,
"grad_norm": 0.2883112132549286,
"learning_rate": 2.2882612170015914e-05,
"loss": 0.0125,
"step": 793
},
{
"epoch": 1.5301204819277108,
"grad_norm": 0.28983229398727417,
"learning_rate": 2.2838225370160682e-05,
"loss": 0.0155,
"step": 794
},
{
"epoch": 1.5320481927710843,
"grad_norm": 0.47236886620521545,
"learning_rate": 2.2793824298787414e-05,
"loss": 0.0132,
"step": 795
},
{
"epoch": 1.5339759036144578,
"grad_norm": 0.8328865170478821,
"learning_rate": 2.2749409179159104e-05,
"loss": 0.026,
"step": 796
},
{
"epoch": 1.5359036144578313,
"grad_norm": 0.3129172623157501,
"learning_rate": 2.2704980234609396e-05,
"loss": 0.0099,
"step": 797
},
{
"epoch": 1.5378313253012048,
"grad_norm": 0.22284500300884247,
"learning_rate": 2.2660537688541416e-05,
"loss": 0.009,
"step": 798
},
{
"epoch": 1.5397590361445783,
"grad_norm": 0.3346405625343323,
"learning_rate": 2.2616081764426726e-05,
"loss": 0.0077,
"step": 799
},
{
"epoch": 1.5416867469879518,
"grad_norm": 0.2923565208911896,
"learning_rate": 2.2571612685804124e-05,
"loss": 0.0119,
"step": 800
},
{
"epoch": 1.5436144578313253,
"grad_norm": 0.1921311914920807,
"learning_rate": 2.252713067627857e-05,
"loss": 0.0083,
"step": 801
},
{
"epoch": 1.5455421686746988,
"grad_norm": 0.23221106827259064,
"learning_rate": 2.2482635959520044e-05,
"loss": 0.0049,
"step": 802
},
{
"epoch": 1.5474698795180721,
"grad_norm": 0.6340724229812622,
"learning_rate": 2.243812875926241e-05,
"loss": 0.0273,
"step": 803
},
{
"epoch": 1.5493975903614459,
"grad_norm": 0.2699439823627472,
"learning_rate": 2.2393609299302314e-05,
"loss": 0.0108,
"step": 804
},
{
"epoch": 1.5513253012048192,
"grad_norm": 0.2005189210176468,
"learning_rate": 2.2349077803498052e-05,
"loss": 0.0076,
"step": 805
},
{
"epoch": 1.5532530120481929,
"grad_norm": 0.39668548107147217,
"learning_rate": 2.230453449576842e-05,
"loss": 0.0135,
"step": 806
},
{
"epoch": 1.5551807228915662,
"grad_norm": 0.2406950294971466,
"learning_rate": 2.2259979600091635e-05,
"loss": 0.0094,
"step": 807
},
{
"epoch": 1.55710843373494,
"grad_norm": 0.30363157391548157,
"learning_rate": 2.2215413340504158e-05,
"loss": 0.0178,
"step": 808
},
{
"epoch": 1.5590361445783132,
"grad_norm": 0.19508181512355804,
"learning_rate": 2.2170835941099605e-05,
"loss": 0.0069,
"step": 809
},
{
"epoch": 1.5609638554216867,
"grad_norm": 0.734106719493866,
"learning_rate": 2.2126247626027615e-05,
"loss": 0.0319,
"step": 810
},
{
"epoch": 1.5628915662650602,
"grad_norm": 0.2591583728790283,
"learning_rate": 2.208164861949268e-05,
"loss": 0.0168,
"step": 811
},
{
"epoch": 1.5648192771084337,
"grad_norm": 0.2386734038591385,
"learning_rate": 2.20370391457531e-05,
"loss": 0.0041,
"step": 812
},
{
"epoch": 1.5667469879518072,
"grad_norm": 0.1675218939781189,
"learning_rate": 2.1992419429119764e-05,
"loss": 0.0078,
"step": 813
},
{
"epoch": 1.5686746987951807,
"grad_norm": 0.45591506361961365,
"learning_rate": 2.1947789693955097e-05,
"loss": 0.0166,
"step": 814
},
{
"epoch": 1.5706024096385542,
"grad_norm": 0.46940621733665466,
"learning_rate": 2.190315016467188e-05,
"loss": 0.0176,
"step": 815
},
{
"epoch": 1.5725301204819278,
"grad_norm": 0.2294205278158188,
"learning_rate": 2.1858501065732146e-05,
"loss": 0.0102,
"step": 816
},
{
"epoch": 1.5744578313253013,
"grad_norm": 0.28922322392463684,
"learning_rate": 2.181384262164606e-05,
"loss": 0.0111,
"step": 817
},
{
"epoch": 1.5763855421686745,
"grad_norm": 0.19650064408779144,
"learning_rate": 2.1769175056970765e-05,
"loss": 0.0076,
"step": 818
},
{
"epoch": 1.5783132530120483,
"grad_norm": 0.19538825750350952,
"learning_rate": 2.172449859630927e-05,
"loss": 0.0118,
"step": 819
},
{
"epoch": 1.5802409638554216,
"grad_norm": 0.1900389939546585,
"learning_rate": 2.167981346430931e-05,
"loss": 0.0066,
"step": 820
},
{
"epoch": 1.5821686746987953,
"grad_norm": 0.21593710780143738,
"learning_rate": 2.1635119885662235e-05,
"loss": 0.0101,
"step": 821
},
{
"epoch": 1.5840963855421686,
"grad_norm": 0.2699289321899414,
"learning_rate": 2.159041808510185e-05,
"loss": 0.0118,
"step": 822
},
{
"epoch": 1.5860240963855423,
"grad_norm": 0.31867673993110657,
"learning_rate": 2.1545708287403322e-05,
"loss": 0.0122,
"step": 823
},
{
"epoch": 1.5879518072289156,
"grad_norm": 0.2862400412559509,
"learning_rate": 2.1500990717382004e-05,
"loss": 0.0216,
"step": 824
},
{
"epoch": 1.589879518072289,
"grad_norm": 0.28482481837272644,
"learning_rate": 2.145626559989237e-05,
"loss": 0.0136,
"step": 825
},
{
"epoch": 1.5918072289156626,
"grad_norm": 0.2866958975791931,
"learning_rate": 2.1411533159826803e-05,
"loss": 0.0298,
"step": 826
},
{
"epoch": 1.5937349397590361,
"grad_norm": 0.39092838764190674,
"learning_rate": 2.1366793622114533e-05,
"loss": 0.0382,
"step": 827
},
{
"epoch": 1.5956626506024096,
"grad_norm": 0.16381537914276123,
"learning_rate": 2.1322047211720468e-05,
"loss": 0.0074,
"step": 828
},
{
"epoch": 1.5975903614457831,
"grad_norm": 0.22146940231323242,
"learning_rate": 2.1277294153644083e-05,
"loss": 0.0103,
"step": 829
},
{
"epoch": 1.5995180722891567,
"grad_norm": 0.2155209183692932,
"learning_rate": 2.123253467291827e-05,
"loss": 0.0095,
"step": 830
},
{
"epoch": 1.6014457831325302,
"grad_norm": 0.41510409116744995,
"learning_rate": 2.118776899460822e-05,
"loss": 0.0457,
"step": 831
},
{
"epoch": 1.6033734939759037,
"grad_norm": 0.19718150794506073,
"learning_rate": 2.1142997343810293e-05,
"loss": 0.0192,
"step": 832
},
{
"epoch": 1.605301204819277,
"grad_norm": 0.40924403071403503,
"learning_rate": 2.1098219945650865e-05,
"loss": 0.0278,
"step": 833
},
{
"epoch": 1.6072289156626507,
"grad_norm": 0.18657824397087097,
"learning_rate": 2.105343702528524e-05,
"loss": 0.0076,
"step": 834
},
{
"epoch": 1.609156626506024,
"grad_norm": 0.1727641075849533,
"learning_rate": 2.100864880789645e-05,
"loss": 0.0076,
"step": 835
},
{
"epoch": 1.6110843373493977,
"grad_norm": 0.18138745427131653,
"learning_rate": 2.0963855518694203e-05,
"loss": 0.005,
"step": 836
},
{
"epoch": 1.613012048192771,
"grad_norm": 0.19173955917358398,
"learning_rate": 2.0919057382913675e-05,
"loss": 0.0084,
"step": 837
},
{
"epoch": 1.6149397590361447,
"grad_norm": 0.3812403380870819,
"learning_rate": 2.0874254625814435e-05,
"loss": 0.009,
"step": 838
},
{
"epoch": 1.616867469879518,
"grad_norm": 0.2009759545326233,
"learning_rate": 2.0829447472679285e-05,
"loss": 0.0098,
"step": 839
},
{
"epoch": 1.6187951807228915,
"grad_norm": 0.48703446984291077,
"learning_rate": 2.0784636148813124e-05,
"loss": 0.0099,
"step": 840
},
{
"epoch": 1.620722891566265,
"grad_norm": 0.28995075821876526,
"learning_rate": 2.0739820879541827e-05,
"loss": 0.0075,
"step": 841
},
{
"epoch": 1.6226506024096385,
"grad_norm": 0.2130059450864792,
"learning_rate": 2.069500189021111e-05,
"loss": 0.007,
"step": 842
},
{
"epoch": 1.624578313253012,
"grad_norm": 0.252524733543396,
"learning_rate": 2.0650179406185397e-05,
"loss": 0.0249,
"step": 843
},
{
"epoch": 1.6265060240963856,
"grad_norm": 0.23069098591804504,
"learning_rate": 2.060535365284668e-05,
"loss": 0.0084,
"step": 844
},
{
"epoch": 1.628433734939759,
"grad_norm": 0.25051403045654297,
"learning_rate": 2.056052485559338e-05,
"loss": 0.0071,
"step": 845
},
{
"epoch": 1.6303614457831326,
"grad_norm": 0.27664798498153687,
"learning_rate": 2.051569323983924e-05,
"loss": 0.0198,
"step": 846
},
{
"epoch": 1.632289156626506,
"grad_norm": 0.2954922318458557,
"learning_rate": 2.047085903101218e-05,
"loss": 0.006,
"step": 847
},
{
"epoch": 1.6342168674698794,
"grad_norm": 0.28477591276168823,
"learning_rate": 2.0426022454553137e-05,
"loss": 0.0147,
"step": 848
},
{
"epoch": 1.636144578313253,
"grad_norm": 0.2785305678844452,
"learning_rate": 2.0381183735914968e-05,
"loss": 0.0117,
"step": 849
},
{
"epoch": 1.6380722891566264,
"grad_norm": 0.2500309348106384,
"learning_rate": 2.0336343100561295e-05,
"loss": 0.008,
"step": 850
},
{
"epoch": 1.6400000000000001,
"grad_norm": 0.18932047486305237,
"learning_rate": 2.0291500773965392e-05,
"loss": 0.0256,
"step": 851
},
{
"epoch": 1.6419277108433734,
"grad_norm": 0.6396257877349854,
"learning_rate": 2.0246656981609013e-05,
"loss": 0.0141,
"step": 852
},
{
"epoch": 1.6438554216867471,
"grad_norm": 0.5072891116142273,
"learning_rate": 2.02018119489813e-05,
"loss": 0.008,
"step": 853
},
{
"epoch": 1.6457831325301204,
"grad_norm": 0.2920839488506317,
"learning_rate": 2.0156965901577635e-05,
"loss": 0.0085,
"step": 854
},
{
"epoch": 1.647710843373494,
"grad_norm": 0.1391262263059616,
"learning_rate": 2.011211906489848e-05,
"loss": 0.0078,
"step": 855
},
{
"epoch": 1.6496385542168674,
"grad_norm": 0.29620468616485596,
"learning_rate": 2.00672716644483e-05,
"loss": 0.0109,
"step": 856
},
{
"epoch": 1.651566265060241,
"grad_norm": 0.13946573436260223,
"learning_rate": 2.002242392573436e-05,
"loss": 0.0076,
"step": 857
},
{
"epoch": 1.6534939759036145,
"grad_norm": 0.9766128659248352,
"learning_rate": 1.997757607426565e-05,
"loss": 0.0309,
"step": 858
},
{
"epoch": 1.655421686746988,
"grad_norm": 0.18002203106880188,
"learning_rate": 1.9932728335551702e-05,
"loss": 0.0072,
"step": 859
},
{
"epoch": 1.6573493975903615,
"grad_norm": 0.28073111176490784,
"learning_rate": 1.988788093510152e-05,
"loss": 0.0246,
"step": 860
},
{
"epoch": 1.659277108433735,
"grad_norm": 0.1919957399368286,
"learning_rate": 1.9843034098422375e-05,
"loss": 0.0087,
"step": 861
},
{
"epoch": 1.6612048192771085,
"grad_norm": 0.1825258433818817,
"learning_rate": 1.9798188051018705e-05,
"loss": 0.0092,
"step": 862
},
{
"epoch": 1.6631325301204818,
"grad_norm": 0.32412952184677124,
"learning_rate": 1.9753343018390997e-05,
"loss": 0.0118,
"step": 863
},
{
"epoch": 1.6650602409638555,
"grad_norm": 0.12828563153743744,
"learning_rate": 1.9708499226034618e-05,
"loss": 0.0056,
"step": 864
},
{
"epoch": 1.6669879518072288,
"grad_norm": 0.18647560477256775,
"learning_rate": 1.966365689943871e-05,
"loss": 0.0094,
"step": 865
},
{
"epoch": 1.6689156626506025,
"grad_norm": 0.19835828244686127,
"learning_rate": 1.9618816264085042e-05,
"loss": 0.0097,
"step": 866
},
{
"epoch": 1.6708433734939758,
"grad_norm": 0.22364282608032227,
"learning_rate": 1.957397754544687e-05,
"loss": 0.0062,
"step": 867
},
{
"epoch": 1.6727710843373496,
"grad_norm": 0.29420018196105957,
"learning_rate": 1.952914096898783e-05,
"loss": 0.0182,
"step": 868
},
{
"epoch": 1.6746987951807228,
"grad_norm": 0.2149929702281952,
"learning_rate": 1.9484306760160766e-05,
"loss": 0.0125,
"step": 869
},
{
"epoch": 1.6766265060240964,
"grad_norm": 0.16844330728054047,
"learning_rate": 1.9439475144406623e-05,
"loss": 0.0074,
"step": 870
},
{
"epoch": 1.6785542168674699,
"grad_norm": 0.5010282397270203,
"learning_rate": 1.9394646347153334e-05,
"loss": 0.0213,
"step": 871
},
{
"epoch": 1.6804819277108434,
"grad_norm": 0.29847195744514465,
"learning_rate": 1.9349820593814606e-05,
"loss": 0.0173,
"step": 872
},
{
"epoch": 1.6824096385542169,
"grad_norm": 0.23835812509059906,
"learning_rate": 1.930499810978889e-05,
"loss": 0.011,
"step": 873
},
{
"epoch": 1.6843373493975904,
"grad_norm": 0.3269020617008209,
"learning_rate": 1.9260179120458177e-05,
"loss": 0.0285,
"step": 874
},
{
"epoch": 1.686265060240964,
"grad_norm": 0.2142144739627838,
"learning_rate": 1.9215363851186883e-05,
"loss": 0.0146,
"step": 875
},
{
"epoch": 1.6881927710843372,
"grad_norm": 0.3098377585411072,
"learning_rate": 1.9170552527320725e-05,
"loss": 0.0104,
"step": 876
},
{
"epoch": 1.690120481927711,
"grad_norm": 0.22504115104675293,
"learning_rate": 1.9125745374185568e-05,
"loss": 0.0091,
"step": 877
},
{
"epoch": 1.6920481927710842,
"grad_norm": 0.20633333921432495,
"learning_rate": 1.908094261708633e-05,
"loss": 0.0097,
"step": 878
},
{
"epoch": 1.693975903614458,
"grad_norm": 1.179566502571106,
"learning_rate": 1.9036144481305807e-05,
"loss": 0.0143,
"step": 879
},
{
"epoch": 1.6959036144578312,
"grad_norm": 0.15525613725185394,
"learning_rate": 1.8991351192103554e-05,
"loss": 0.0062,
"step": 880
},
{
"epoch": 1.697831325301205,
"grad_norm": 0.15966367721557617,
"learning_rate": 1.8946562974714763e-05,
"loss": 0.0048,
"step": 881
},
{
"epoch": 1.6997590361445782,
"grad_norm": 0.18902607262134552,
"learning_rate": 1.890178005434914e-05,
"loss": 0.0124,
"step": 882
},
{
"epoch": 1.701686746987952,
"grad_norm": 0.21692413091659546,
"learning_rate": 1.885700265618971e-05,
"loss": 0.0135,
"step": 883
},
{
"epoch": 1.7036144578313253,
"grad_norm": 0.38948455452919006,
"learning_rate": 1.8812231005391786e-05,
"loss": 0.0365,
"step": 884
},
{
"epoch": 1.7055421686746988,
"grad_norm": 0.2483491599559784,
"learning_rate": 1.8767465327081736e-05,
"loss": 0.0202,
"step": 885
},
{
"epoch": 1.7074698795180723,
"grad_norm": 0.15305832028388977,
"learning_rate": 1.872270584635592e-05,
"loss": 0.0035,
"step": 886
},
{
"epoch": 1.7093975903614458,
"grad_norm": 0.17794466018676758,
"learning_rate": 1.867795278827954e-05,
"loss": 0.0157,
"step": 887
},
{
"epoch": 1.7113253012048193,
"grad_norm": 0.1938813328742981,
"learning_rate": 1.863320637788547e-05,
"loss": 0.0071,
"step": 888
},
{
"epoch": 1.7132530120481928,
"grad_norm": 0.27061617374420166,
"learning_rate": 1.8588466840173207e-05,
"loss": 0.0347,
"step": 889
},
{
"epoch": 1.7151807228915663,
"grad_norm": 0.1541014313697815,
"learning_rate": 1.8543734400107637e-05,
"loss": 0.006,
"step": 890
},
{
"epoch": 1.7171084337349396,
"grad_norm": 0.1436876654624939,
"learning_rate": 1.8499009282617996e-05,
"loss": 0.0059,
"step": 891
},
{
"epoch": 1.7190361445783133,
"grad_norm": 1.0573723316192627,
"learning_rate": 1.8454291712596688e-05,
"loss": 0.008,
"step": 892
},
{
"epoch": 1.7209638554216866,
"grad_norm": 0.15406259894371033,
"learning_rate": 1.8409581914898157e-05,
"loss": 0.0061,
"step": 893
},
{
"epoch": 1.7228915662650603,
"grad_norm": 0.24822913110256195,
"learning_rate": 1.836488011433777e-05,
"loss": 0.0085,
"step": 894
},
{
"epoch": 1.7248192771084336,
"grad_norm": 0.21049316227436066,
"learning_rate": 1.83201865356907e-05,
"loss": 0.0075,
"step": 895
},
{
"epoch": 1.7267469879518074,
"grad_norm": 0.24159866571426392,
"learning_rate": 1.8275501403690733e-05,
"loss": 0.0156,
"step": 896
},
{
"epoch": 1.7286746987951807,
"grad_norm": 0.3191063106060028,
"learning_rate": 1.823082494302924e-05,
"loss": 0.0218,
"step": 897
},
{
"epoch": 1.7306024096385542,
"grad_norm": 0.20296362042427063,
"learning_rate": 1.8186157378353945e-05,
"loss": 0.0126,
"step": 898
},
{
"epoch": 1.7325301204819277,
"grad_norm": 0.1905524581670761,
"learning_rate": 1.8141498934267858e-05,
"loss": 0.0131,
"step": 899
},
{
"epoch": 1.7344578313253012,
"grad_norm": 0.5350520610809326,
"learning_rate": 1.809684983532813e-05,
"loss": 0.0115,
"step": 900
},
{
"epoch": 1.7363855421686747,
"grad_norm": 0.17144092917442322,
"learning_rate": 1.8052210306044907e-05,
"loss": 0.0113,
"step": 901
},
{
"epoch": 1.7383132530120482,
"grad_norm": 0.11777982115745544,
"learning_rate": 1.8007580570880236e-05,
"loss": 0.0058,
"step": 902
},
{
"epoch": 1.7402409638554217,
"grad_norm": 0.2078275978565216,
"learning_rate": 1.7962960854246908e-05,
"loss": 0.0106,
"step": 903
},
{
"epoch": 1.7421686746987952,
"grad_norm": 0.2550877630710602,
"learning_rate": 1.791835138050732e-05,
"loss": 0.0076,
"step": 904
},
{
"epoch": 1.7440963855421687,
"grad_norm": 0.11553912609815598,
"learning_rate": 1.7873752373972395e-05,
"loss": 0.0038,
"step": 905
},
{
"epoch": 1.746024096385542,
"grad_norm": 0.10724586248397827,
"learning_rate": 1.7829164058900398e-05,
"loss": 0.0043,
"step": 906
},
{
"epoch": 1.7479518072289157,
"grad_norm": 0.30152231454849243,
"learning_rate": 1.7784586659495845e-05,
"loss": 0.0099,
"step": 907
},
{
"epoch": 1.749879518072289,
"grad_norm": 0.18372933566570282,
"learning_rate": 1.7740020399908372e-05,
"loss": 0.0074,
"step": 908
},
{
"epoch": 1.7518072289156628,
"grad_norm": 0.35184428095817566,
"learning_rate": 1.7695465504231586e-05,
"loss": 0.0184,
"step": 909
},
{
"epoch": 1.753734939759036,
"grad_norm": 0.15083615481853485,
"learning_rate": 1.765092219650196e-05,
"loss": 0.0061,
"step": 910
},
{
"epoch": 1.7556626506024098,
"grad_norm": 0.2599961459636688,
"learning_rate": 1.7606390700697693e-05,
"loss": 0.0101,
"step": 911
},
{
"epoch": 1.757590361445783,
"grad_norm": 0.10829206556081772,
"learning_rate": 1.7561871240737595e-05,
"loss": 0.0034,
"step": 912
},
{
"epoch": 1.7595180722891566,
"grad_norm": 0.38098782300949097,
"learning_rate": 1.7517364040479966e-05,
"loss": 0.0384,
"step": 913
},
{
"epoch": 1.76144578313253,
"grad_norm": 0.14975085854530334,
"learning_rate": 1.7472869323721432e-05,
"loss": 0.0055,
"step": 914
},
{
"epoch": 1.7633734939759036,
"grad_norm": 0.4151444733142853,
"learning_rate": 1.742838731419588e-05,
"loss": 0.0307,
"step": 915
},
{
"epoch": 1.765301204819277,
"grad_norm": 0.22238481044769287,
"learning_rate": 1.738391823557328e-05,
"loss": 0.0059,
"step": 916
},
{
"epoch": 1.7672289156626506,
"grad_norm": 0.23386356234550476,
"learning_rate": 1.7339462311458587e-05,
"loss": 0.0113,
"step": 917
},
{
"epoch": 1.7691566265060241,
"grad_norm": 0.21911191940307617,
"learning_rate": 1.7295019765390618e-05,
"loss": 0.0071,
"step": 918
},
{
"epoch": 1.7710843373493976,
"grad_norm": 0.343159943819046,
"learning_rate": 1.7250590820840903e-05,
"loss": 0.0144,
"step": 919
},
{
"epoch": 1.7730120481927711,
"grad_norm": 0.32204556465148926,
"learning_rate": 1.720617570121259e-05,
"loss": 0.0131,
"step": 920
}
],
"logging_steps": 1,
"max_steps": 1557,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 92,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2080308880513434e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}