{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1557, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019277108433734939, "grad_norm": 2.8518834114074707, "learning_rate": 0.0, "loss": 0.0891, "step": 1 }, { "epoch": 0.0038554216867469878, "grad_norm": 1.8441249132156372, "learning_rate": 2.564102564102564e-07, "loss": 0.0539, "step": 2 }, { "epoch": 0.005783132530120482, "grad_norm": 2.8263237476348877, "learning_rate": 5.128205128205128e-07, "loss": 0.099, "step": 3 }, { "epoch": 0.0077108433734939755, "grad_norm": 2.5051236152648926, "learning_rate": 7.692307692307694e-07, "loss": 0.0789, "step": 4 }, { "epoch": 0.00963855421686747, "grad_norm": 2.6903438568115234, "learning_rate": 1.0256410256410257e-06, "loss": 0.0881, "step": 5 }, { "epoch": 0.011566265060240964, "grad_norm": 2.6205761432647705, "learning_rate": 1.282051282051282e-06, "loss": 0.0776, "step": 6 }, { "epoch": 0.013493975903614458, "grad_norm": 2.6309337615966797, "learning_rate": 1.5384615384615387e-06, "loss": 0.0827, "step": 7 }, { "epoch": 0.015421686746987951, "grad_norm": 1.5427855253219604, "learning_rate": 1.794871794871795e-06, "loss": 0.0577, "step": 8 }, { "epoch": 0.017349397590361446, "grad_norm": 1.0973446369171143, "learning_rate": 2.0512820512820513e-06, "loss": 0.04, "step": 9 }, { "epoch": 0.01927710843373494, "grad_norm": 1.3253350257873535, "learning_rate": 2.307692307692308e-06, "loss": 0.0506, "step": 10 }, { "epoch": 0.021204819277108433, "grad_norm": 1.588739037513733, "learning_rate": 2.564102564102564e-06, "loss": 0.0874, "step": 11 }, { "epoch": 0.02313253012048193, "grad_norm": 1.4987014532089233, "learning_rate": 2.8205128205128207e-06, "loss": 0.0597, "step": 12 }, { "epoch": 0.02506024096385542, "grad_norm": 1.6571592092514038, "learning_rate": 3.0769230769230774e-06, "loss": 0.0559, "step": 13 }, { "epoch": 0.026987951807228915, "grad_norm": 1.8860628604888916, "learning_rate": 3.3333333333333333e-06, "loss": 0.0688, "step": 14 }, { "epoch": 0.02891566265060241, "grad_norm": 1.3202295303344727, "learning_rate": 3.58974358974359e-06, "loss": 0.0433, "step": 15 }, { "epoch": 0.030843373493975902, "grad_norm": 1.5870612859725952, "learning_rate": 3.846153846153847e-06, "loss": 0.0695, "step": 16 }, { "epoch": 0.0327710843373494, "grad_norm": 0.9192284345626831, "learning_rate": 4.102564102564103e-06, "loss": 0.0392, "step": 17 }, { "epoch": 0.03469879518072289, "grad_norm": 0.7950155735015869, "learning_rate": 4.358974358974359e-06, "loss": 0.0351, "step": 18 }, { "epoch": 0.03662650602409639, "grad_norm": 0.8854314684867859, "learning_rate": 4.615384615384616e-06, "loss": 0.0356, "step": 19 }, { "epoch": 0.03855421686746988, "grad_norm": 0.9546788930892944, "learning_rate": 4.871794871794872e-06, "loss": 0.0427, "step": 20 }, { "epoch": 0.04048192771084337, "grad_norm": 0.6315903663635254, "learning_rate": 5.128205128205128e-06, "loss": 0.0397, "step": 21 }, { "epoch": 0.042409638554216866, "grad_norm": 0.9230924844741821, "learning_rate": 5.384615384615385e-06, "loss": 0.0481, "step": 22 }, { "epoch": 0.04433734939759036, "grad_norm": 0.711546003818512, "learning_rate": 5.641025641025641e-06, "loss": 0.0479, "step": 23 }, { "epoch": 0.04626506024096386, "grad_norm": 0.5288046598434448, "learning_rate": 5.897435897435898e-06, "loss": 0.0182, "step": 24 }, { "epoch": 0.04819277108433735, "grad_norm": 0.9420496225357056, "learning_rate": 6.153846153846155e-06, "loss": 0.0389, "step": 25 }, { "epoch": 0.05012048192771084, "grad_norm": 0.5001983046531677, "learning_rate": 6.410256410256412e-06, "loss": 0.0268, "step": 26 }, { "epoch": 0.052048192771084335, "grad_norm": 0.8084653615951538, "learning_rate": 6.666666666666667e-06, "loss": 0.0367, "step": 27 }, { "epoch": 0.05397590361445783, "grad_norm": 0.7195103764533997, "learning_rate": 6.923076923076923e-06, "loss": 0.0251, "step": 28 }, { "epoch": 0.055903614457831326, "grad_norm": 0.529958963394165, "learning_rate": 7.17948717948718e-06, "loss": 0.0289, "step": 29 }, { "epoch": 0.05783132530120482, "grad_norm": 0.795376181602478, "learning_rate": 7.435897435897437e-06, "loss": 0.043, "step": 30 }, { "epoch": 0.059759036144578316, "grad_norm": 0.6366249918937683, "learning_rate": 7.692307692307694e-06, "loss": 0.029, "step": 31 }, { "epoch": 0.061686746987951804, "grad_norm": 0.5414115190505981, "learning_rate": 7.948717948717949e-06, "loss": 0.0365, "step": 32 }, { "epoch": 0.0636144578313253, "grad_norm": 0.9350972175598145, "learning_rate": 8.205128205128205e-06, "loss": 0.0283, "step": 33 }, { "epoch": 0.0655421686746988, "grad_norm": 0.5660741925239563, "learning_rate": 8.461538461538462e-06, "loss": 0.0234, "step": 34 }, { "epoch": 0.06746987951807229, "grad_norm": 0.5623988509178162, "learning_rate": 8.717948717948719e-06, "loss": 0.0307, "step": 35 }, { "epoch": 0.06939759036144579, "grad_norm": 0.5260195732116699, "learning_rate": 8.974358974358976e-06, "loss": 0.0264, "step": 36 }, { "epoch": 0.07132530120481928, "grad_norm": 0.4934785068035126, "learning_rate": 9.230769230769232e-06, "loss": 0.0224, "step": 37 }, { "epoch": 0.07325301204819278, "grad_norm": 0.4797322154045105, "learning_rate": 9.487179487179487e-06, "loss": 0.0163, "step": 38 }, { "epoch": 0.07518072289156627, "grad_norm": 0.4739217460155487, "learning_rate": 9.743589743589744e-06, "loss": 0.0165, "step": 39 }, { "epoch": 0.07710843373493977, "grad_norm": 0.4527677595615387, "learning_rate": 1e-05, "loss": 0.0163, "step": 40 }, { "epoch": 0.07903614457831325, "grad_norm": 0.6241316795349121, "learning_rate": 1.0256410256410256e-05, "loss": 0.0302, "step": 41 }, { "epoch": 0.08096385542168674, "grad_norm": 0.639043927192688, "learning_rate": 1.0512820512820514e-05, "loss": 0.0312, "step": 42 }, { "epoch": 0.08289156626506024, "grad_norm": 0.5121409296989441, "learning_rate": 1.076923076923077e-05, "loss": 0.0256, "step": 43 }, { "epoch": 0.08481927710843373, "grad_norm": 0.6340477466583252, "learning_rate": 1.1025641025641028e-05, "loss": 0.04, "step": 44 }, { "epoch": 0.08674698795180723, "grad_norm": 0.5260409712791443, "learning_rate": 1.1282051282051283e-05, "loss": 0.0282, "step": 45 }, { "epoch": 0.08867469879518072, "grad_norm": 0.6390711069107056, "learning_rate": 1.1538461538461538e-05, "loss": 0.0243, "step": 46 }, { "epoch": 0.09060240963855422, "grad_norm": 0.46469295024871826, "learning_rate": 1.1794871794871796e-05, "loss": 0.0208, "step": 47 }, { "epoch": 0.09253012048192771, "grad_norm": 0.8711516857147217, "learning_rate": 1.2051282051282051e-05, "loss": 0.0291, "step": 48 }, { "epoch": 0.09445783132530121, "grad_norm": 0.9164300560951233, "learning_rate": 1.230769230769231e-05, "loss": 0.0342, "step": 49 }, { "epoch": 0.0963855421686747, "grad_norm": 0.5401139259338379, "learning_rate": 1.2564102564102565e-05, "loss": 0.0185, "step": 50 }, { "epoch": 0.0983132530120482, "grad_norm": 0.44393008947372437, "learning_rate": 1.2820512820512823e-05, "loss": 0.0228, "step": 51 }, { "epoch": 0.10024096385542168, "grad_norm": 0.3855767846107483, "learning_rate": 1.3076923076923078e-05, "loss": 0.0176, "step": 52 }, { "epoch": 0.10216867469879518, "grad_norm": 0.8561235070228577, "learning_rate": 1.3333333333333333e-05, "loss": 0.0433, "step": 53 }, { "epoch": 0.10409638554216867, "grad_norm": 0.768002450466156, "learning_rate": 1.3589743589743592e-05, "loss": 0.0245, "step": 54 }, { "epoch": 0.10602409638554217, "grad_norm": 0.4559759497642517, "learning_rate": 1.3846153846153847e-05, "loss": 0.0224, "step": 55 }, { "epoch": 0.10795180722891566, "grad_norm": 0.6203847527503967, "learning_rate": 1.4102564102564105e-05, "loss": 0.0296, "step": 56 }, { "epoch": 0.10987951807228916, "grad_norm": 0.6651368141174316, "learning_rate": 1.435897435897436e-05, "loss": 0.0336, "step": 57 }, { "epoch": 0.11180722891566265, "grad_norm": 0.377734512090683, "learning_rate": 1.4615384615384615e-05, "loss": 0.0196, "step": 58 }, { "epoch": 0.11373493975903615, "grad_norm": 0.687568724155426, "learning_rate": 1.4871794871794874e-05, "loss": 0.0207, "step": 59 }, { "epoch": 0.11566265060240964, "grad_norm": 0.7905604243278503, "learning_rate": 1.5128205128205129e-05, "loss": 0.047, "step": 60 }, { "epoch": 0.11759036144578314, "grad_norm": 0.7938196063041687, "learning_rate": 1.5384615384615387e-05, "loss": 0.0198, "step": 61 }, { "epoch": 0.11951807228915663, "grad_norm": 0.41340553760528564, "learning_rate": 1.5641025641025644e-05, "loss": 0.0161, "step": 62 }, { "epoch": 0.12144578313253013, "grad_norm": 0.5668172240257263, "learning_rate": 1.5897435897435897e-05, "loss": 0.0275, "step": 63 }, { "epoch": 0.12337349397590361, "grad_norm": 0.48333367705345154, "learning_rate": 1.6153846153846154e-05, "loss": 0.0137, "step": 64 }, { "epoch": 0.12530120481927712, "grad_norm": 0.6843933463096619, "learning_rate": 1.641025641025641e-05, "loss": 0.0294, "step": 65 }, { "epoch": 0.1272289156626506, "grad_norm": 0.7789272665977478, "learning_rate": 1.6666666666666667e-05, "loss": 0.0401, "step": 66 }, { "epoch": 0.1291566265060241, "grad_norm": 0.6203492879867554, "learning_rate": 1.6923076923076924e-05, "loss": 0.0292, "step": 67 }, { "epoch": 0.1310843373493976, "grad_norm": 0.5940662622451782, "learning_rate": 1.717948717948718e-05, "loss": 0.0178, "step": 68 }, { "epoch": 0.13301204819277107, "grad_norm": 0.35504868626594543, "learning_rate": 1.7435897435897438e-05, "loss": 0.0129, "step": 69 }, { "epoch": 0.13493975903614458, "grad_norm": 0.8796699643135071, "learning_rate": 1.7692307692307694e-05, "loss": 0.034, "step": 70 }, { "epoch": 0.13686746987951806, "grad_norm": 0.967444896697998, "learning_rate": 1.794871794871795e-05, "loss": 0.0266, "step": 71 }, { "epoch": 0.13879518072289157, "grad_norm": 0.4428526759147644, "learning_rate": 1.8205128205128208e-05, "loss": 0.0223, "step": 72 }, { "epoch": 0.14072289156626505, "grad_norm": 0.42897751927375793, "learning_rate": 1.8461538461538465e-05, "loss": 0.0187, "step": 73 }, { "epoch": 0.14265060240963856, "grad_norm": 0.5100914835929871, "learning_rate": 1.8717948717948718e-05, "loss": 0.0164, "step": 74 }, { "epoch": 0.14457831325301204, "grad_norm": 0.6028861999511719, "learning_rate": 1.8974358974358975e-05, "loss": 0.0164, "step": 75 }, { "epoch": 0.14650602409638555, "grad_norm": 0.6187024116516113, "learning_rate": 1.923076923076923e-05, "loss": 0.0296, "step": 76 }, { "epoch": 0.14843373493975903, "grad_norm": 0.4822489619255066, "learning_rate": 1.9487179487179488e-05, "loss": 0.0148, "step": 77 }, { "epoch": 0.15036144578313254, "grad_norm": 0.7231149673461914, "learning_rate": 1.9743589743589745e-05, "loss": 0.0395, "step": 78 }, { "epoch": 0.15228915662650602, "grad_norm": 0.8409642577171326, "learning_rate": 2e-05, "loss": 0.0446, "step": 79 }, { "epoch": 0.15421686746987953, "grad_norm": 0.4883500039577484, "learning_rate": 2.025641025641026e-05, "loss": 0.0206, "step": 80 }, { "epoch": 0.156144578313253, "grad_norm": 0.6287479400634766, "learning_rate": 2.0512820512820512e-05, "loss": 0.0333, "step": 81 }, { "epoch": 0.1580722891566265, "grad_norm": 0.5041632652282715, "learning_rate": 2.0769230769230772e-05, "loss": 0.0414, "step": 82 }, { "epoch": 0.16, "grad_norm": 0.5103405117988586, "learning_rate": 2.102564102564103e-05, "loss": 0.045, "step": 83 }, { "epoch": 0.16192771084337348, "grad_norm": 0.493161678314209, "learning_rate": 2.1282051282051285e-05, "loss": 0.021, "step": 84 }, { "epoch": 0.163855421686747, "grad_norm": 0.908843994140625, "learning_rate": 2.153846153846154e-05, "loss": 0.0389, "step": 85 }, { "epoch": 0.16578313253012048, "grad_norm": 0.5067003965377808, "learning_rate": 2.1794871794871795e-05, "loss": 0.0272, "step": 86 }, { "epoch": 0.16771084337349398, "grad_norm": 0.5791381597518921, "learning_rate": 2.2051282051282056e-05, "loss": 0.0368, "step": 87 }, { "epoch": 0.16963855421686747, "grad_norm": 0.7056036591529846, "learning_rate": 2.230769230769231e-05, "loss": 0.0284, "step": 88 }, { "epoch": 0.17156626506024097, "grad_norm": 0.6563822031021118, "learning_rate": 2.2564102564102566e-05, "loss": 0.0646, "step": 89 }, { "epoch": 0.17349397590361446, "grad_norm": 0.9483286142349243, "learning_rate": 2.2820512820512822e-05, "loss": 0.0439, "step": 90 }, { "epoch": 0.17542168674698796, "grad_norm": 0.370664119720459, "learning_rate": 2.3076923076923076e-05, "loss": 0.0109, "step": 91 }, { "epoch": 0.17734939759036145, "grad_norm": 0.9776477813720703, "learning_rate": 2.3333333333333336e-05, "loss": 0.0458, "step": 92 }, { "epoch": 0.17927710843373493, "grad_norm": 0.45710092782974243, "learning_rate": 2.3589743589743593e-05, "loss": 0.0212, "step": 93 }, { "epoch": 0.18120481927710844, "grad_norm": 0.8623896837234497, "learning_rate": 2.384615384615385e-05, "loss": 0.0215, "step": 94 }, { "epoch": 0.18313253012048192, "grad_norm": 0.55814528465271, "learning_rate": 2.4102564102564103e-05, "loss": 0.0218, "step": 95 }, { "epoch": 0.18506024096385543, "grad_norm": 0.49882641434669495, "learning_rate": 2.435897435897436e-05, "loss": 0.0268, "step": 96 }, { "epoch": 0.1869879518072289, "grad_norm": 0.3508654534816742, "learning_rate": 2.461538461538462e-05, "loss": 0.0172, "step": 97 }, { "epoch": 0.18891566265060242, "grad_norm": 0.601170003414154, "learning_rate": 2.4871794871794873e-05, "loss": 0.0208, "step": 98 }, { "epoch": 0.1908433734939759, "grad_norm": 1.1748133897781372, "learning_rate": 2.512820512820513e-05, "loss": 0.0259, "step": 99 }, { "epoch": 0.1927710843373494, "grad_norm": 0.46370384097099304, "learning_rate": 2.5384615384615386e-05, "loss": 0.0242, "step": 100 }, { "epoch": 0.1946987951807229, "grad_norm": 0.525010883808136, "learning_rate": 2.5641025641025646e-05, "loss": 0.0188, "step": 101 }, { "epoch": 0.1966265060240964, "grad_norm": 0.766501784324646, "learning_rate": 2.58974358974359e-05, "loss": 0.0584, "step": 102 }, { "epoch": 0.19855421686746988, "grad_norm": 0.3572964370250702, "learning_rate": 2.6153846153846157e-05, "loss": 0.0131, "step": 103 }, { "epoch": 0.20048192771084336, "grad_norm": 0.6467130780220032, "learning_rate": 2.6410256410256413e-05, "loss": 0.0231, "step": 104 }, { "epoch": 0.20240963855421687, "grad_norm": 1.1852102279663086, "learning_rate": 2.6666666666666667e-05, "loss": 0.027, "step": 105 }, { "epoch": 0.20433734939759035, "grad_norm": 2.3659932613372803, "learning_rate": 2.6923076923076927e-05, "loss": 0.0224, "step": 106 }, { "epoch": 0.20626506024096386, "grad_norm": 0.5343687534332275, "learning_rate": 2.7179487179487183e-05, "loss": 0.0198, "step": 107 }, { "epoch": 0.20819277108433734, "grad_norm": 1.852160096168518, "learning_rate": 2.7435897435897437e-05, "loss": 0.032, "step": 108 }, { "epoch": 0.21012048192771085, "grad_norm": 0.47291702032089233, "learning_rate": 2.7692307692307694e-05, "loss": 0.0117, "step": 109 }, { "epoch": 0.21204819277108433, "grad_norm": 0.7623187899589539, "learning_rate": 2.794871794871795e-05, "loss": 0.0337, "step": 110 }, { "epoch": 0.21397590361445784, "grad_norm": 0.5272570848464966, "learning_rate": 2.820512820512821e-05, "loss": 0.0131, "step": 111 }, { "epoch": 0.21590361445783132, "grad_norm": 0.5568500757217407, "learning_rate": 2.8461538461538464e-05, "loss": 0.0233, "step": 112 }, { "epoch": 0.21783132530120483, "grad_norm": 0.4008469879627228, "learning_rate": 2.871794871794872e-05, "loss": 0.0204, "step": 113 }, { "epoch": 0.2197590361445783, "grad_norm": 0.4888612926006317, "learning_rate": 2.8974358974358977e-05, "loss": 0.016, "step": 114 }, { "epoch": 0.2216867469879518, "grad_norm": 0.44903355836868286, "learning_rate": 2.923076923076923e-05, "loss": 0.0135, "step": 115 }, { "epoch": 0.2236144578313253, "grad_norm": 0.9266762733459473, "learning_rate": 2.948717948717949e-05, "loss": 0.0233, "step": 116 }, { "epoch": 0.22554216867469878, "grad_norm": 0.5352638959884644, "learning_rate": 2.9743589743589747e-05, "loss": 0.0198, "step": 117 }, { "epoch": 0.2274698795180723, "grad_norm": 0.6051343679428101, "learning_rate": 3.0000000000000004e-05, "loss": 0.0246, "step": 118 }, { "epoch": 0.22939759036144577, "grad_norm": 0.9971133470535278, "learning_rate": 3.0256410256410257e-05, "loss": 0.025, "step": 119 }, { "epoch": 0.23132530120481928, "grad_norm": 0.704236626625061, "learning_rate": 3.0512820512820514e-05, "loss": 0.031, "step": 120 }, { "epoch": 0.23325301204819276, "grad_norm": 0.6137097477912903, "learning_rate": 3.0769230769230774e-05, "loss": 0.0519, "step": 121 }, { "epoch": 0.23518072289156627, "grad_norm": 0.7396159768104553, "learning_rate": 3.102564102564103e-05, "loss": 0.0325, "step": 122 }, { "epoch": 0.23710843373493976, "grad_norm": 1.3282053470611572, "learning_rate": 3.128205128205129e-05, "loss": 0.0252, "step": 123 }, { "epoch": 0.23903614457831326, "grad_norm": 0.5220731496810913, "learning_rate": 3.153846153846154e-05, "loss": 0.0262, "step": 124 }, { "epoch": 0.24096385542168675, "grad_norm": 0.5357242822647095, "learning_rate": 3.1794871794871795e-05, "loss": 0.0243, "step": 125 }, { "epoch": 0.24289156626506025, "grad_norm": 0.48207753896713257, "learning_rate": 3.205128205128206e-05, "loss": 0.0178, "step": 126 }, { "epoch": 0.24481927710843374, "grad_norm": 0.552988588809967, "learning_rate": 3.230769230769231e-05, "loss": 0.023, "step": 127 }, { "epoch": 0.24674698795180722, "grad_norm": 1.7962840795516968, "learning_rate": 3.2564102564102565e-05, "loss": 0.032, "step": 128 }, { "epoch": 0.24867469879518073, "grad_norm": 1.6404600143432617, "learning_rate": 3.282051282051282e-05, "loss": 0.0231, "step": 129 }, { "epoch": 0.25060240963855424, "grad_norm": 0.39142486453056335, "learning_rate": 3.307692307692308e-05, "loss": 0.0147, "step": 130 }, { "epoch": 0.2525301204819277, "grad_norm": 1.3272887468338013, "learning_rate": 3.3333333333333335e-05, "loss": 0.0439, "step": 131 }, { "epoch": 0.2544578313253012, "grad_norm": 1.5122811794281006, "learning_rate": 3.358974358974359e-05, "loss": 0.0282, "step": 132 }, { "epoch": 0.2563855421686747, "grad_norm": 1.8542430400848389, "learning_rate": 3.384615384615385e-05, "loss": 0.0515, "step": 133 }, { "epoch": 0.2583132530120482, "grad_norm": 4.059277534484863, "learning_rate": 3.4102564102564105e-05, "loss": 0.0781, "step": 134 }, { "epoch": 0.26024096385542167, "grad_norm": 0.6206214427947998, "learning_rate": 3.435897435897436e-05, "loss": 0.0306, "step": 135 }, { "epoch": 0.2621686746987952, "grad_norm": 0.4575510323047638, "learning_rate": 3.461538461538462e-05, "loss": 0.0154, "step": 136 }, { "epoch": 0.2640963855421687, "grad_norm": 1.1556978225708008, "learning_rate": 3.4871794871794875e-05, "loss": 0.0235, "step": 137 }, { "epoch": 0.26602409638554214, "grad_norm": 0.6975051760673523, "learning_rate": 3.512820512820513e-05, "loss": 0.0453, "step": 138 }, { "epoch": 0.26795180722891565, "grad_norm": 0.8686623573303223, "learning_rate": 3.538461538461539e-05, "loss": 0.0427, "step": 139 }, { "epoch": 0.26987951807228916, "grad_norm": 2.0681848526000977, "learning_rate": 3.5641025641025646e-05, "loss": 0.04, "step": 140 }, { "epoch": 0.27180722891566267, "grad_norm": 0.4397984445095062, "learning_rate": 3.58974358974359e-05, "loss": 0.0188, "step": 141 }, { "epoch": 0.2737349397590361, "grad_norm": 0.5871334075927734, "learning_rate": 3.615384615384616e-05, "loss": 0.0253, "step": 142 }, { "epoch": 0.27566265060240963, "grad_norm": 1.1078568696975708, "learning_rate": 3.6410256410256416e-05, "loss": 0.0316, "step": 143 }, { "epoch": 0.27759036144578314, "grad_norm": 0.5691841840744019, "learning_rate": 3.6666666666666666e-05, "loss": 0.0266, "step": 144 }, { "epoch": 0.27951807228915665, "grad_norm": 0.7896255254745483, "learning_rate": 3.692307692307693e-05, "loss": 0.0281, "step": 145 }, { "epoch": 0.2814457831325301, "grad_norm": 0.9988337159156799, "learning_rate": 3.7179487179487186e-05, "loss": 0.0295, "step": 146 }, { "epoch": 0.2833734939759036, "grad_norm": 0.9811834692955017, "learning_rate": 3.7435897435897436e-05, "loss": 0.0322, "step": 147 }, { "epoch": 0.2853012048192771, "grad_norm": 0.6503105759620667, "learning_rate": 3.769230769230769e-05, "loss": 0.0266, "step": 148 }, { "epoch": 0.28722891566265063, "grad_norm": 1.9164355993270874, "learning_rate": 3.794871794871795e-05, "loss": 0.0677, "step": 149 }, { "epoch": 0.2891566265060241, "grad_norm": 1.1724557876586914, "learning_rate": 3.820512820512821e-05, "loss": 0.0324, "step": 150 }, { "epoch": 0.2910843373493976, "grad_norm": 0.8482469916343689, "learning_rate": 3.846153846153846e-05, "loss": 0.0259, "step": 151 }, { "epoch": 0.2930120481927711, "grad_norm": 0.8572830557823181, "learning_rate": 3.871794871794872e-05, "loss": 0.0358, "step": 152 }, { "epoch": 0.29493975903614456, "grad_norm": 0.6630825400352478, "learning_rate": 3.8974358974358976e-05, "loss": 0.0447, "step": 153 }, { "epoch": 0.29686746987951806, "grad_norm": 0.9197093844413757, "learning_rate": 3.923076923076923e-05, "loss": 0.0409, "step": 154 }, { "epoch": 0.2987951807228916, "grad_norm": 0.6976819634437561, "learning_rate": 3.948717948717949e-05, "loss": 0.0317, "step": 155 }, { "epoch": 0.3007228915662651, "grad_norm": 0.7353514432907104, "learning_rate": 3.9743589743589747e-05, "loss": 0.0306, "step": 156 }, { "epoch": 0.30265060240963854, "grad_norm": 0.5730232000350952, "learning_rate": 4e-05, "loss": 0.0324, "step": 157 }, { "epoch": 0.30457831325301205, "grad_norm": 0.7852078676223755, "learning_rate": 3.999994971675547e-05, "loss": 0.0354, "step": 158 }, { "epoch": 0.30650602409638555, "grad_norm": 0.5924715399742126, "learning_rate": 3.999979886727471e-05, "loss": 0.0366, "step": 159 }, { "epoch": 0.30843373493975906, "grad_norm": 0.7359845638275146, "learning_rate": 3.999954745231624e-05, "loss": 0.0437, "step": 160 }, { "epoch": 0.3103614457831325, "grad_norm": 0.7866976857185364, "learning_rate": 3.999919547314426e-05, "loss": 0.0363, "step": 161 }, { "epoch": 0.312289156626506, "grad_norm": 0.7425745129585266, "learning_rate": 3.999874293152863e-05, "loss": 0.0259, "step": 162 }, { "epoch": 0.31421686746987953, "grad_norm": 1.8922245502471924, "learning_rate": 3.9998189829744885e-05, "loss": 0.0341, "step": 163 }, { "epoch": 0.316144578313253, "grad_norm": 0.7908634543418884, "learning_rate": 3.99975361705742e-05, "loss": 0.0424, "step": 164 }, { "epoch": 0.3180722891566265, "grad_norm": 2.047368049621582, "learning_rate": 3.999678195730337e-05, "loss": 0.0535, "step": 165 }, { "epoch": 0.32, "grad_norm": 0.5702639222145081, "learning_rate": 3.999592719372484e-05, "loss": 0.0284, "step": 166 }, { "epoch": 0.3219277108433735, "grad_norm": 0.45015648007392883, "learning_rate": 3.9994971884136636e-05, "loss": 0.0313, "step": 167 }, { "epoch": 0.32385542168674697, "grad_norm": 4.094679355621338, "learning_rate": 3.9993916033342355e-05, "loss": 0.0524, "step": 168 }, { "epoch": 0.3257831325301205, "grad_norm": 0.800846517086029, "learning_rate": 3.999275964665117e-05, "loss": 0.0282, "step": 169 }, { "epoch": 0.327710843373494, "grad_norm": 0.47881078720092773, "learning_rate": 3.999150272987776e-05, "loss": 0.0293, "step": 170 }, { "epoch": 0.3296385542168675, "grad_norm": 0.5716657638549805, "learning_rate": 3.999014528934232e-05, "loss": 0.0221, "step": 171 }, { "epoch": 0.33156626506024095, "grad_norm": 0.6333311200141907, "learning_rate": 3.998868733187048e-05, "loss": 0.0302, "step": 172 }, { "epoch": 0.33349397590361446, "grad_norm": 6.642521858215332, "learning_rate": 3.998712886479335e-05, "loss": 0.0364, "step": 173 }, { "epoch": 0.33542168674698797, "grad_norm": 0.7515506148338318, "learning_rate": 3.998546989594739e-05, "loss": 0.0296, "step": 174 }, { "epoch": 0.3373493975903614, "grad_norm": 1.0728015899658203, "learning_rate": 3.998371043367445e-05, "loss": 0.0549, "step": 175 }, { "epoch": 0.33927710843373493, "grad_norm": 1.3025579452514648, "learning_rate": 3.998185048682166e-05, "loss": 0.0577, "step": 176 }, { "epoch": 0.34120481927710844, "grad_norm": 1.0962958335876465, "learning_rate": 3.997989006474144e-05, "loss": 0.0313, "step": 177 }, { "epoch": 0.34313253012048195, "grad_norm": 0.7064313292503357, "learning_rate": 3.997782917729143e-05, "loss": 0.0309, "step": 178 }, { "epoch": 0.3450602409638554, "grad_norm": 0.43374207615852356, "learning_rate": 3.997566783483445e-05, "loss": 0.0166, "step": 179 }, { "epoch": 0.3469879518072289, "grad_norm": 0.7236390113830566, "learning_rate": 3.9973406048238413e-05, "loss": 0.0254, "step": 180 }, { "epoch": 0.3489156626506024, "grad_norm": 0.5041500926017761, "learning_rate": 3.9971043828876334e-05, "loss": 0.0239, "step": 181 }, { "epoch": 0.35084337349397593, "grad_norm": 1.2744532823562622, "learning_rate": 3.9968581188626204e-05, "loss": 0.0404, "step": 182 }, { "epoch": 0.3527710843373494, "grad_norm": 0.45845362544059753, "learning_rate": 3.996601813987098e-05, "loss": 0.0127, "step": 183 }, { "epoch": 0.3546987951807229, "grad_norm": 0.4426881968975067, "learning_rate": 3.996335469549852e-05, "loss": 0.0176, "step": 184 }, { "epoch": 0.3566265060240964, "grad_norm": 1.0030732154846191, "learning_rate": 3.9960590868901465e-05, "loss": 0.0457, "step": 185 }, { "epoch": 0.35855421686746985, "grad_norm": 0.6428582668304443, "learning_rate": 3.995772667397725e-05, "loss": 0.0271, "step": 186 }, { "epoch": 0.36048192771084336, "grad_norm": 0.5335744619369507, "learning_rate": 3.995476212512795e-05, "loss": 0.0297, "step": 187 }, { "epoch": 0.3624096385542169, "grad_norm": 0.6995761394500732, "learning_rate": 3.99516972372603e-05, "loss": 0.0322, "step": 188 }, { "epoch": 0.3643373493975904, "grad_norm": 0.765511155128479, "learning_rate": 3.9948532025785546e-05, "loss": 0.0253, "step": 189 }, { "epoch": 0.36626506024096384, "grad_norm": 0.6165828108787537, "learning_rate": 3.9945266506619403e-05, "loss": 0.0355, "step": 190 }, { "epoch": 0.36819277108433734, "grad_norm": 0.851970911026001, "learning_rate": 3.994190069618195e-05, "loss": 0.056, "step": 191 }, { "epoch": 0.37012048192771085, "grad_norm": 0.9850023984909058, "learning_rate": 3.993843461139757e-05, "loss": 0.0415, "step": 192 }, { "epoch": 0.37204819277108436, "grad_norm": 0.7455295324325562, "learning_rate": 3.9934868269694886e-05, "loss": 0.0379, "step": 193 }, { "epoch": 0.3739759036144578, "grad_norm": 1.159469723701477, "learning_rate": 3.9931201689006595e-05, "loss": 0.0237, "step": 194 }, { "epoch": 0.3759036144578313, "grad_norm": 0.5490080118179321, "learning_rate": 3.992743488776947e-05, "loss": 0.024, "step": 195 }, { "epoch": 0.37783132530120483, "grad_norm": 1.279831886291504, "learning_rate": 3.992356788492421e-05, "loss": 0.0273, "step": 196 }, { "epoch": 0.3797590361445783, "grad_norm": 0.859104335308075, "learning_rate": 3.9919600699915355e-05, "loss": 0.0411, "step": 197 }, { "epoch": 0.3816867469879518, "grad_norm": 1.2525300979614258, "learning_rate": 3.991553335269119e-05, "loss": 0.0857, "step": 198 }, { "epoch": 0.3836144578313253, "grad_norm": 0.4924193024635315, "learning_rate": 3.991136586370367e-05, "loss": 0.0294, "step": 199 }, { "epoch": 0.3855421686746988, "grad_norm": 1.417190670967102, "learning_rate": 3.990709825390828e-05, "loss": 0.0395, "step": 200 }, { "epoch": 0.38746987951807227, "grad_norm": 0.6172056198120117, "learning_rate": 3.9902730544763936e-05, "loss": 0.0194, "step": 201 }, { "epoch": 0.3893975903614458, "grad_norm": 0.7292149662971497, "learning_rate": 3.989826275823291e-05, "loss": 0.0381, "step": 202 }, { "epoch": 0.3913253012048193, "grad_norm": 0.5949816107749939, "learning_rate": 3.989369491678067e-05, "loss": 0.0254, "step": 203 }, { "epoch": 0.3932530120481928, "grad_norm": 0.6012582182884216, "learning_rate": 3.988902704337582e-05, "loss": 0.048, "step": 204 }, { "epoch": 0.39518072289156625, "grad_norm": 0.6273590922355652, "learning_rate": 3.9884259161489936e-05, "loss": 0.0268, "step": 205 }, { "epoch": 0.39710843373493976, "grad_norm": 0.9615244269371033, "learning_rate": 3.987939129509746e-05, "loss": 0.0192, "step": 206 }, { "epoch": 0.39903614457831327, "grad_norm": 0.6009241342544556, "learning_rate": 3.9874423468675624e-05, "loss": 0.0362, "step": 207 }, { "epoch": 0.4009638554216867, "grad_norm": 0.411335289478302, "learning_rate": 3.9869355707204266e-05, "loss": 0.017, "step": 208 }, { "epoch": 0.40289156626506023, "grad_norm": 0.6151527166366577, "learning_rate": 3.986418803616573e-05, "loss": 0.0283, "step": 209 }, { "epoch": 0.40481927710843374, "grad_norm": 0.33808204531669617, "learning_rate": 3.985892048154474e-05, "loss": 0.0158, "step": 210 }, { "epoch": 0.40674698795180725, "grad_norm": 0.5464187860488892, "learning_rate": 3.9853553069828284e-05, "loss": 0.0292, "step": 211 }, { "epoch": 0.4086746987951807, "grad_norm": 0.6658390760421753, "learning_rate": 3.984808582800543e-05, "loss": 0.0281, "step": 212 }, { "epoch": 0.4106024096385542, "grad_norm": 0.4253764748573303, "learning_rate": 3.984251878356726e-05, "loss": 0.031, "step": 213 }, { "epoch": 0.4125301204819277, "grad_norm": 0.32309481501579285, "learning_rate": 3.983685196450667e-05, "loss": 0.0166, "step": 214 }, { "epoch": 0.41445783132530123, "grad_norm": 0.43756410479545593, "learning_rate": 3.9831085399318265e-05, "loss": 0.0326, "step": 215 }, { "epoch": 0.4163855421686747, "grad_norm": 0.264046847820282, "learning_rate": 3.982521911699822e-05, "loss": 0.0118, "step": 216 }, { "epoch": 0.4183132530120482, "grad_norm": 0.8630897402763367, "learning_rate": 3.9819253147044084e-05, "loss": 0.0246, "step": 217 }, { "epoch": 0.4202409638554217, "grad_norm": 0.6923379898071289, "learning_rate": 3.98131875194547e-05, "loss": 0.036, "step": 218 }, { "epoch": 0.42216867469879515, "grad_norm": 0.5874778628349304, "learning_rate": 3.9807022264730024e-05, "loss": 0.0255, "step": 219 }, { "epoch": 0.42409638554216866, "grad_norm": 0.394336074590683, "learning_rate": 3.980075741387094e-05, "loss": 0.0187, "step": 220 }, { "epoch": 0.4260240963855422, "grad_norm": 0.6300327777862549, "learning_rate": 3.979439299837915e-05, "loss": 0.0214, "step": 221 }, { "epoch": 0.4279518072289157, "grad_norm": 0.5200467109680176, "learning_rate": 3.978792905025702e-05, "loss": 0.0628, "step": 222 }, { "epoch": 0.42987951807228914, "grad_norm": 0.5713880062103271, "learning_rate": 3.978136560200735e-05, "loss": 0.0302, "step": 223 }, { "epoch": 0.43180722891566264, "grad_norm": 0.5345383286476135, "learning_rate": 3.977470268663331e-05, "loss": 0.0125, "step": 224 }, { "epoch": 0.43373493975903615, "grad_norm": 0.5378350019454956, "learning_rate": 3.976794033763819e-05, "loss": 0.0246, "step": 225 }, { "epoch": 0.43566265060240966, "grad_norm": 0.5554935336112976, "learning_rate": 3.9761078589025276e-05, "loss": 0.0212, "step": 226 }, { "epoch": 0.4375903614457831, "grad_norm": 0.2832634747028351, "learning_rate": 3.9754117475297664e-05, "loss": 0.0125, "step": 227 }, { "epoch": 0.4395180722891566, "grad_norm": 1.2910150289535522, "learning_rate": 3.97470570314581e-05, "loss": 0.0364, "step": 228 }, { "epoch": 0.44144578313253013, "grad_norm": 0.3731018602848053, "learning_rate": 3.973989729300878e-05, "loss": 0.0128, "step": 229 }, { "epoch": 0.4433734939759036, "grad_norm": 0.9433871507644653, "learning_rate": 3.9732638295951195e-05, "loss": 0.0367, "step": 230 }, { "epoch": 0.4453012048192771, "grad_norm": 1.0779197216033936, "learning_rate": 3.972528007678594e-05, "loss": 0.0667, "step": 231 }, { "epoch": 0.4472289156626506, "grad_norm": 1.7009105682373047, "learning_rate": 3.9717822672512516e-05, "loss": 0.0655, "step": 232 }, { "epoch": 0.4491566265060241, "grad_norm": 0.5646032094955444, "learning_rate": 3.971026612062919e-05, "loss": 0.064, "step": 233 }, { "epoch": 0.45108433734939757, "grad_norm": 0.44474121928215027, "learning_rate": 3.970261045913274e-05, "loss": 0.0206, "step": 234 }, { "epoch": 0.4530120481927711, "grad_norm": 1.3969277143478394, "learning_rate": 3.969485572651833e-05, "loss": 0.0486, "step": 235 }, { "epoch": 0.4549397590361446, "grad_norm": 0.6401994228363037, "learning_rate": 3.968700196177925e-05, "loss": 0.0262, "step": 236 }, { "epoch": 0.4568674698795181, "grad_norm": 0.7091913223266602, "learning_rate": 3.96790492044068e-05, "loss": 0.014, "step": 237 }, { "epoch": 0.45879518072289155, "grad_norm": 0.6561547517776489, "learning_rate": 3.967099749439002e-05, "loss": 0.0482, "step": 238 }, { "epoch": 0.46072289156626506, "grad_norm": 0.6924155354499817, "learning_rate": 3.966284687221551e-05, "loss": 0.0289, "step": 239 }, { "epoch": 0.46265060240963857, "grad_norm": 0.5868663787841797, "learning_rate": 3.9654597378867256e-05, "loss": 0.0331, "step": 240 }, { "epoch": 0.464578313253012, "grad_norm": 0.7930939793586731, "learning_rate": 3.964624905582637e-05, "loss": 0.0925, "step": 241 }, { "epoch": 0.46650602409638553, "grad_norm": 0.4888836145401001, "learning_rate": 3.9637801945070944e-05, "loss": 0.015, "step": 242 }, { "epoch": 0.46843373493975904, "grad_norm": 0.7820287346839905, "learning_rate": 3.962925608907579e-05, "loss": 0.0382, "step": 243 }, { "epoch": 0.47036144578313255, "grad_norm": 0.4914316236972809, "learning_rate": 3.962061153081224e-05, "loss": 0.0257, "step": 244 }, { "epoch": 0.472289156626506, "grad_norm": 0.5681505799293518, "learning_rate": 3.961186831374793e-05, "loss": 0.0551, "step": 245 }, { "epoch": 0.4742168674698795, "grad_norm": 0.5049723386764526, "learning_rate": 3.9603026481846616e-05, "loss": 0.0186, "step": 246 }, { "epoch": 0.476144578313253, "grad_norm": 0.5034119486808777, "learning_rate": 3.959408607956787e-05, "loss": 0.024, "step": 247 }, { "epoch": 0.47807228915662653, "grad_norm": 0.4543336033821106, "learning_rate": 3.958504715186695e-05, "loss": 0.0256, "step": 248 }, { "epoch": 0.48, "grad_norm": 0.5595743656158447, "learning_rate": 3.957590974419452e-05, "loss": 0.0222, "step": 249 }, { "epoch": 0.4819277108433735, "grad_norm": 0.5701581239700317, "learning_rate": 3.956667390249642e-05, "loss": 0.0334, "step": 250 }, { "epoch": 0.483855421686747, "grad_norm": 0.53755784034729, "learning_rate": 3.9557339673213474e-05, "loss": 0.0345, "step": 251 }, { "epoch": 0.4857831325301205, "grad_norm": 0.4368877112865448, "learning_rate": 3.95479071032812e-05, "loss": 0.0183, "step": 252 }, { "epoch": 0.48771084337349396, "grad_norm": 0.7972906827926636, "learning_rate": 3.953837624012963e-05, "loss": 0.0337, "step": 253 }, { "epoch": 0.48963855421686747, "grad_norm": 0.6148451566696167, "learning_rate": 3.9528747131683023e-05, "loss": 0.0524, "step": 254 }, { "epoch": 0.491566265060241, "grad_norm": 0.500840961933136, "learning_rate": 3.9519019826359676e-05, "loss": 0.0248, "step": 255 }, { "epoch": 0.49349397590361443, "grad_norm": 0.5536255240440369, "learning_rate": 3.9509194373071624e-05, "loss": 0.0219, "step": 256 }, { "epoch": 0.49542168674698794, "grad_norm": 0.6873176097869873, "learning_rate": 3.9499270821224444e-05, "loss": 0.0312, "step": 257 }, { "epoch": 0.49734939759036145, "grad_norm": 0.37207168340682983, "learning_rate": 3.9489249220716974e-05, "loss": 0.0149, "step": 258 }, { "epoch": 0.49927710843373496, "grad_norm": 0.4458799660205841, "learning_rate": 3.947912962194107e-05, "loss": 0.0214, "step": 259 }, { "epoch": 0.5012048192771085, "grad_norm": 0.4272724390029907, "learning_rate": 3.9468912075781345e-05, "loss": 0.0263, "step": 260 }, { "epoch": 0.503132530120482, "grad_norm": 0.5245792269706726, "learning_rate": 3.945859663361496e-05, "loss": 0.0103, "step": 261 }, { "epoch": 0.5050602409638554, "grad_norm": 0.8799260854721069, "learning_rate": 3.9448183347311284e-05, "loss": 0.0292, "step": 262 }, { "epoch": 0.5069879518072289, "grad_norm": 0.5996833443641663, "learning_rate": 3.943767226923171e-05, "loss": 0.0306, "step": 263 }, { "epoch": 0.5089156626506024, "grad_norm": 0.6044682860374451, "learning_rate": 3.942706345222935e-05, "loss": 0.0218, "step": 264 }, { "epoch": 0.5108433734939759, "grad_norm": 0.4770200848579407, "learning_rate": 3.941635694964878e-05, "loss": 0.0226, "step": 265 }, { "epoch": 0.5127710843373494, "grad_norm": 0.5605704188346863, "learning_rate": 3.940555281532576e-05, "loss": 0.0354, "step": 266 }, { "epoch": 0.5146987951807229, "grad_norm": 0.46532443165779114, "learning_rate": 3.939465110358699e-05, "loss": 0.0223, "step": 267 }, { "epoch": 0.5166265060240964, "grad_norm": 0.5190595388412476, "learning_rate": 3.93836518692498e-05, "loss": 0.0219, "step": 268 }, { "epoch": 0.5185542168674698, "grad_norm": 0.5767757892608643, "learning_rate": 3.937255516762193e-05, "loss": 0.0294, "step": 269 }, { "epoch": 0.5204819277108433, "grad_norm": 0.4543164372444153, "learning_rate": 3.936136105450119e-05, "loss": 0.0244, "step": 270 }, { "epoch": 0.5224096385542168, "grad_norm": 0.4155154526233673, "learning_rate": 3.9350069586175195e-05, "loss": 0.02, "step": 271 }, { "epoch": 0.5243373493975904, "grad_norm": 0.5470768213272095, "learning_rate": 3.933868081942113e-05, "loss": 0.0187, "step": 272 }, { "epoch": 0.5262650602409639, "grad_norm": 0.9491772651672363, "learning_rate": 3.9327194811505406e-05, "loss": 0.0337, "step": 273 }, { "epoch": 0.5281927710843374, "grad_norm": 0.9313873052597046, "learning_rate": 3.93156116201834e-05, "loss": 0.0573, "step": 274 }, { "epoch": 0.5301204819277109, "grad_norm": 0.7181005477905273, "learning_rate": 3.930393130369915e-05, "loss": 0.0405, "step": 275 }, { "epoch": 0.5320481927710843, "grad_norm": 0.34231385588645935, "learning_rate": 3.9292153920785076e-05, "loss": 0.0153, "step": 276 }, { "epoch": 0.5339759036144578, "grad_norm": 0.6899610161781311, "learning_rate": 3.928027953066168e-05, "loss": 0.0338, "step": 277 }, { "epoch": 0.5359036144578313, "grad_norm": 0.7509781718254089, "learning_rate": 3.926830819303726e-05, "loss": 0.0416, "step": 278 }, { "epoch": 0.5378313253012048, "grad_norm": 0.6326774954795837, "learning_rate": 3.925623996810757e-05, "loss": 0.0293, "step": 279 }, { "epoch": 0.5397590361445783, "grad_norm": 0.5543203353881836, "learning_rate": 3.924407491655557e-05, "loss": 0.0263, "step": 280 }, { "epoch": 0.5416867469879518, "grad_norm": 0.5367572903633118, "learning_rate": 3.9231813099551086e-05, "loss": 0.0276, "step": 281 }, { "epoch": 0.5436144578313253, "grad_norm": 0.3143869638442993, "learning_rate": 3.921945457875051e-05, "loss": 0.0146, "step": 282 }, { "epoch": 0.5455421686746988, "grad_norm": 0.47403043508529663, "learning_rate": 3.920699941629649e-05, "loss": 0.0267, "step": 283 }, { "epoch": 0.5474698795180722, "grad_norm": 0.5082595348358154, "learning_rate": 3.919444767481763e-05, "loss": 0.0183, "step": 284 }, { "epoch": 0.5493975903614458, "grad_norm": 0.747949481010437, "learning_rate": 3.918179941742816e-05, "loss": 0.0412, "step": 285 }, { "epoch": 0.5513253012048193, "grad_norm": 0.6553886532783508, "learning_rate": 3.916905470772762e-05, "loss": 0.0505, "step": 286 }, { "epoch": 0.5532530120481928, "grad_norm": 0.3838176131248474, "learning_rate": 3.9156213609800545e-05, "loss": 0.0156, "step": 287 }, { "epoch": 0.5551807228915663, "grad_norm": 0.7427731156349182, "learning_rate": 3.914327618821614e-05, "loss": 0.0278, "step": 288 }, { "epoch": 0.5571084337349398, "grad_norm": 0.2612821161746979, "learning_rate": 3.913024250802796e-05, "loss": 0.0101, "step": 289 }, { "epoch": 0.5590361445783133, "grad_norm": 0.3799416124820709, "learning_rate": 3.911711263477357e-05, "loss": 0.0168, "step": 290 }, { "epoch": 0.5609638554216867, "grad_norm": 0.5053854584693909, "learning_rate": 3.910388663447425e-05, "loss": 0.0249, "step": 291 }, { "epoch": 0.5628915662650602, "grad_norm": 0.38095012307167053, "learning_rate": 3.909056457363461e-05, "loss": 0.0156, "step": 292 }, { "epoch": 0.5648192771084337, "grad_norm": 0.4477892220020294, "learning_rate": 3.907714651924229e-05, "loss": 0.0309, "step": 293 }, { "epoch": 0.5667469879518072, "grad_norm": 0.5875864624977112, "learning_rate": 3.906363253876763e-05, "loss": 0.0287, "step": 294 }, { "epoch": 0.5686746987951807, "grad_norm": 0.522990882396698, "learning_rate": 3.90500227001633e-05, "loss": 0.0318, "step": 295 }, { "epoch": 0.5706024096385542, "grad_norm": 0.4153876304626465, "learning_rate": 3.9036317071863994e-05, "loss": 0.0192, "step": 296 }, { "epoch": 0.5725301204819278, "grad_norm": 0.4675769507884979, "learning_rate": 3.902251572278605e-05, "loss": 0.067, "step": 297 }, { "epoch": 0.5744578313253013, "grad_norm": 0.35778650641441345, "learning_rate": 3.900861872232713e-05, "loss": 0.0197, "step": 298 }, { "epoch": 0.5763855421686747, "grad_norm": 0.7382330894470215, "learning_rate": 3.899462614036587e-05, "loss": 0.0283, "step": 299 }, { "epoch": 0.5783132530120482, "grad_norm": 0.41268599033355713, "learning_rate": 3.89805380472615e-05, "loss": 0.0207, "step": 300 }, { "epoch": 0.5802409638554217, "grad_norm": 1.2013020515441895, "learning_rate": 3.8966354513853535e-05, "loss": 0.0301, "step": 301 }, { "epoch": 0.5821686746987952, "grad_norm": 0.424757719039917, "learning_rate": 3.895207561146137e-05, "loss": 0.022, "step": 302 }, { "epoch": 0.5840963855421687, "grad_norm": 0.4196677505970001, "learning_rate": 3.893770141188396e-05, "loss": 0.0424, "step": 303 }, { "epoch": 0.5860240963855422, "grad_norm": 0.8644190430641174, "learning_rate": 3.892323198739946e-05, "loss": 0.08, "step": 304 }, { "epoch": 0.5879518072289157, "grad_norm": 0.5645135045051575, "learning_rate": 3.890866741076482e-05, "loss": 0.0152, "step": 305 }, { "epoch": 0.5898795180722891, "grad_norm": 0.5218387246131897, "learning_rate": 3.889400775521545e-05, "loss": 0.0205, "step": 306 }, { "epoch": 0.5918072289156626, "grad_norm": 0.39709413051605225, "learning_rate": 3.8879253094464865e-05, "loss": 0.0233, "step": 307 }, { "epoch": 0.5937349397590361, "grad_norm": 0.3572910726070404, "learning_rate": 3.8864403502704285e-05, "loss": 0.0198, "step": 308 }, { "epoch": 0.5956626506024096, "grad_norm": 0.382709264755249, "learning_rate": 3.8849459054602274e-05, "loss": 0.0176, "step": 309 }, { "epoch": 0.5975903614457831, "grad_norm": 3.4527227878570557, "learning_rate": 3.883441982530436e-05, "loss": 0.0239, "step": 310 }, { "epoch": 0.5995180722891567, "grad_norm": 0.4467569589614868, "learning_rate": 3.8819285890432674e-05, "loss": 0.0284, "step": 311 }, { "epoch": 0.6014457831325302, "grad_norm": 0.44513460993766785, "learning_rate": 3.880405732608555e-05, "loss": 0.0233, "step": 312 }, { "epoch": 0.6033734939759036, "grad_norm": 0.8029689192771912, "learning_rate": 3.8788734208837155e-05, "loss": 0.0433, "step": 313 }, { "epoch": 0.6053012048192771, "grad_norm": 0.7291454076766968, "learning_rate": 3.877331661573709e-05, "loss": 0.043, "step": 314 }, { "epoch": 0.6072289156626506, "grad_norm": 0.6050467491149902, "learning_rate": 3.8757804624310006e-05, "loss": 0.0377, "step": 315 }, { "epoch": 0.6091566265060241, "grad_norm": 0.6714366674423218, "learning_rate": 3.874219831255524e-05, "loss": 0.046, "step": 316 }, { "epoch": 0.6110843373493976, "grad_norm": 0.336037278175354, "learning_rate": 3.8726497758946394e-05, "loss": 0.0149, "step": 317 }, { "epoch": 0.6130120481927711, "grad_norm": 0.3057402968406677, "learning_rate": 3.871070304243094e-05, "loss": 0.014, "step": 318 }, { "epoch": 0.6149397590361446, "grad_norm": 0.4537644684314728, "learning_rate": 3.8694814242429834e-05, "loss": 0.0503, "step": 319 }, { "epoch": 0.6168674698795181, "grad_norm": 0.45573824644088745, "learning_rate": 3.8678831438837116e-05, "loss": 0.021, "step": 320 }, { "epoch": 0.6187951807228915, "grad_norm": 0.30729591846466064, "learning_rate": 3.866275471201952e-05, "loss": 0.0163, "step": 321 }, { "epoch": 0.620722891566265, "grad_norm": 0.7614850401878357, "learning_rate": 3.8646584142816036e-05, "loss": 0.0347, "step": 322 }, { "epoch": 0.6226506024096385, "grad_norm": 0.5323611497879028, "learning_rate": 3.863031981253754e-05, "loss": 0.0201, "step": 323 }, { "epoch": 0.624578313253012, "grad_norm": 0.34426453709602356, "learning_rate": 3.861396180296635e-05, "loss": 0.0243, "step": 324 }, { "epoch": 0.6265060240963856, "grad_norm": 0.621636152267456, "learning_rate": 3.859751019635585e-05, "loss": 0.0166, "step": 325 }, { "epoch": 0.6284337349397591, "grad_norm": 0.549324095249176, "learning_rate": 3.858096507543006e-05, "loss": 0.0274, "step": 326 }, { "epoch": 0.6303614457831326, "grad_norm": 0.358426570892334, "learning_rate": 3.8564326523383214e-05, "loss": 0.0207, "step": 327 }, { "epoch": 0.632289156626506, "grad_norm": 0.3639723062515259, "learning_rate": 3.8547594623879346e-05, "loss": 0.0297, "step": 328 }, { "epoch": 0.6342168674698795, "grad_norm": 0.3402212858200073, "learning_rate": 3.853076946105188e-05, "loss": 0.0258, "step": 329 }, { "epoch": 0.636144578313253, "grad_norm": 0.4083027243614197, "learning_rate": 3.85138511195032e-05, "loss": 0.0351, "step": 330 }, { "epoch": 0.6380722891566265, "grad_norm": 0.43532121181488037, "learning_rate": 3.84968396843042e-05, "loss": 0.0388, "step": 331 }, { "epoch": 0.64, "grad_norm": 0.35353463888168335, "learning_rate": 3.8479735240993904e-05, "loss": 0.0203, "step": 332 }, { "epoch": 0.6419277108433735, "grad_norm": 0.350149929523468, "learning_rate": 3.846253787557901e-05, "loss": 0.0261, "step": 333 }, { "epoch": 0.643855421686747, "grad_norm": 0.7665389180183411, "learning_rate": 3.844524767453344e-05, "loss": 0.0108, "step": 334 }, { "epoch": 0.6457831325301204, "grad_norm": 0.44621360301971436, "learning_rate": 3.842786472479795e-05, "loss": 0.0282, "step": 335 }, { "epoch": 0.6477108433734939, "grad_norm": 0.7787201404571533, "learning_rate": 3.841038911377962e-05, "loss": 0.0216, "step": 336 }, { "epoch": 0.6496385542168674, "grad_norm": 0.48260653018951416, "learning_rate": 3.839282092935153e-05, "loss": 0.0234, "step": 337 }, { "epoch": 0.651566265060241, "grad_norm": 0.4987852871417999, "learning_rate": 3.837516025985219e-05, "loss": 0.0515, "step": 338 }, { "epoch": 0.6534939759036145, "grad_norm": 0.9030266404151917, "learning_rate": 3.835740719408517e-05, "loss": 0.0508, "step": 339 }, { "epoch": 0.655421686746988, "grad_norm": 0.6381701231002808, "learning_rate": 3.833956182131867e-05, "loss": 0.0405, "step": 340 }, { "epoch": 0.6573493975903615, "grad_norm": 0.42828986048698425, "learning_rate": 3.832162423128499e-05, "loss": 0.024, "step": 341 }, { "epoch": 0.659277108433735, "grad_norm": 0.38725873827934265, "learning_rate": 3.8303594514180164e-05, "loss": 0.0199, "step": 342 }, { "epoch": 0.6612048192771084, "grad_norm": 0.23280498385429382, "learning_rate": 3.828547276066346e-05, "loss": 0.0101, "step": 343 }, { "epoch": 0.6631325301204819, "grad_norm": 0.7298216819763184, "learning_rate": 3.8267259061856925e-05, "loss": 0.0455, "step": 344 }, { "epoch": 0.6650602409638554, "grad_norm": 0.5975687503814697, "learning_rate": 3.824895350934496e-05, "loss": 0.0372, "step": 345 }, { "epoch": 0.6669879518072289, "grad_norm": 0.6295403242111206, "learning_rate": 3.823055619517381e-05, "loss": 0.0362, "step": 346 }, { "epoch": 0.6689156626506024, "grad_norm": 0.5086020827293396, "learning_rate": 3.821206721185115e-05, "loss": 0.0368, "step": 347 }, { "epoch": 0.6708433734939759, "grad_norm": 0.34506168961524963, "learning_rate": 3.819348665234557e-05, "loss": 0.0178, "step": 348 }, { "epoch": 0.6727710843373494, "grad_norm": 1.309940218925476, "learning_rate": 3.817481461008617e-05, "loss": 0.024, "step": 349 }, { "epoch": 0.6746987951807228, "grad_norm": 0.4074770510196686, "learning_rate": 3.815605117896204e-05, "loss": 0.0262, "step": 350 }, { "epoch": 0.6766265060240964, "grad_norm": 0.48525840044021606, "learning_rate": 3.8137196453321775e-05, "loss": 0.0209, "step": 351 }, { "epoch": 0.6785542168674699, "grad_norm": 0.7199739217758179, "learning_rate": 3.811825052797308e-05, "loss": 0.0396, "step": 352 }, { "epoch": 0.6804819277108434, "grad_norm": 0.519540011882782, "learning_rate": 3.8099213498182196e-05, "loss": 0.0453, "step": 353 }, { "epoch": 0.6824096385542169, "grad_norm": 0.9738391041755676, "learning_rate": 3.808008545967349e-05, "loss": 0.0317, "step": 354 }, { "epoch": 0.6843373493975904, "grad_norm": 1.888344407081604, "learning_rate": 3.8060866508628953e-05, "loss": 0.0452, "step": 355 }, { "epoch": 0.6862650602409639, "grad_norm": 0.48989811539649963, "learning_rate": 3.8041556741687695e-05, "loss": 0.0315, "step": 356 }, { "epoch": 0.6881927710843373, "grad_norm": 0.3764645457267761, "learning_rate": 3.8022156255945496e-05, "loss": 0.0269, "step": 357 }, { "epoch": 0.6901204819277108, "grad_norm": 0.46409738063812256, "learning_rate": 3.800266514895429e-05, "loss": 0.0171, "step": 358 }, { "epoch": 0.6920481927710843, "grad_norm": 0.41091030836105347, "learning_rate": 3.7983083518721695e-05, "loss": 0.0167, "step": 359 }, { "epoch": 0.6939759036144578, "grad_norm": 0.8375523090362549, "learning_rate": 3.79634114637105e-05, "loss": 0.0342, "step": 360 }, { "epoch": 0.6959036144578313, "grad_norm": 1.7053394317626953, "learning_rate": 3.794364908283817e-05, "loss": 0.02, "step": 361 }, { "epoch": 0.6978313253012048, "grad_norm": 0.4163115918636322, "learning_rate": 3.792379647547637e-05, "loss": 0.0138, "step": 362 }, { "epoch": 0.6997590361445784, "grad_norm": 0.388751745223999, "learning_rate": 3.790385374145046e-05, "loss": 0.0172, "step": 363 }, { "epoch": 0.7016867469879519, "grad_norm": 0.5584064722061157, "learning_rate": 3.7883820981038966e-05, "loss": 0.0254, "step": 364 }, { "epoch": 0.7036144578313253, "grad_norm": 1.394264817237854, "learning_rate": 3.7863698294973114e-05, "loss": 0.037, "step": 365 }, { "epoch": 0.7055421686746988, "grad_norm": 0.46280744671821594, "learning_rate": 3.78434857844363e-05, "loss": 0.0234, "step": 366 }, { "epoch": 0.7074698795180723, "grad_norm": 0.39548924565315247, "learning_rate": 3.782318355106358e-05, "loss": 0.0164, "step": 367 }, { "epoch": 0.7093975903614458, "grad_norm": 0.7307773232460022, "learning_rate": 3.780279169694118e-05, "loss": 0.0192, "step": 368 }, { "epoch": 0.7113253012048193, "grad_norm": 0.28035807609558105, "learning_rate": 3.778231032460594e-05, "loss": 0.0131, "step": 369 }, { "epoch": 0.7132530120481928, "grad_norm": 0.8376953601837158, "learning_rate": 3.776173953704486e-05, "loss": 0.0291, "step": 370 }, { "epoch": 0.7151807228915663, "grad_norm": 0.7356843948364258, "learning_rate": 3.774107943769454e-05, "loss": 0.0214, "step": 371 }, { "epoch": 0.7171084337349397, "grad_norm": 0.41503390669822693, "learning_rate": 3.772033013044064e-05, "loss": 0.0221, "step": 372 }, { "epoch": 0.7190361445783132, "grad_norm": 0.35732385516166687, "learning_rate": 3.7699491719617436e-05, "loss": 0.015, "step": 373 }, { "epoch": 0.7209638554216867, "grad_norm": 0.283778578042984, "learning_rate": 3.76785643100072e-05, "loss": 0.0146, "step": 374 }, { "epoch": 0.7228915662650602, "grad_norm": 0.3219413459300995, "learning_rate": 3.765754800683974e-05, "loss": 0.015, "step": 375 }, { "epoch": 0.7248192771084337, "grad_norm": 0.610431432723999, "learning_rate": 3.7636442915791856e-05, "loss": 0.0326, "step": 376 }, { "epoch": 0.7267469879518073, "grad_norm": 4.944870948791504, "learning_rate": 3.7615249142986784e-05, "loss": 0.0432, "step": 377 }, { "epoch": 0.7286746987951808, "grad_norm": 0.4894593060016632, "learning_rate": 3.7593966794993696e-05, "loss": 0.0174, "step": 378 }, { "epoch": 0.7306024096385542, "grad_norm": 0.4211325943470001, "learning_rate": 3.757259597882714e-05, "loss": 0.023, "step": 379 }, { "epoch": 0.7325301204819277, "grad_norm": 0.33621737360954285, "learning_rate": 3.755113680194651e-05, "loss": 0.0201, "step": 380 }, { "epoch": 0.7344578313253012, "grad_norm": 0.5799694657325745, "learning_rate": 3.7529589372255514e-05, "loss": 0.0173, "step": 381 }, { "epoch": 0.7363855421686747, "grad_norm": 0.5172572731971741, "learning_rate": 3.750795379810162e-05, "loss": 0.0284, "step": 382 }, { "epoch": 0.7383132530120482, "grad_norm": 0.5715453028678894, "learning_rate": 3.748623018827552e-05, "loss": 0.0194, "step": 383 }, { "epoch": 0.7402409638554217, "grad_norm": 0.5284178256988525, "learning_rate": 3.746441865201056e-05, "loss": 0.0247, "step": 384 }, { "epoch": 0.7421686746987952, "grad_norm": 0.37828654050827026, "learning_rate": 3.744251929898223e-05, "loss": 0.0097, "step": 385 }, { "epoch": 0.7440963855421687, "grad_norm": 0.3252779543399811, "learning_rate": 3.742053223930758e-05, "loss": 0.0238, "step": 386 }, { "epoch": 0.7460240963855421, "grad_norm": 0.6031543612480164, "learning_rate": 3.7398457583544674e-05, "loss": 0.0332, "step": 387 }, { "epoch": 0.7479518072289156, "grad_norm": 0.23846614360809326, "learning_rate": 3.737629544269206e-05, "loss": 0.0122, "step": 388 }, { "epoch": 0.7498795180722891, "grad_norm": 0.5274029970169067, "learning_rate": 3.7354045928188155e-05, "loss": 0.0324, "step": 389 }, { "epoch": 0.7518072289156627, "grad_norm": 0.4672217071056366, "learning_rate": 3.733170915191075e-05, "loss": 0.0196, "step": 390 }, { "epoch": 0.7537349397590362, "grad_norm": 0.29819396138191223, "learning_rate": 3.730928522617639e-05, "loss": 0.0131, "step": 391 }, { "epoch": 0.7556626506024097, "grad_norm": 0.43824997544288635, "learning_rate": 3.7286774263739855e-05, "loss": 0.0238, "step": 392 }, { "epoch": 0.7575903614457832, "grad_norm": 0.2822072505950928, "learning_rate": 3.726417637779357e-05, "loss": 0.0314, "step": 393 }, { "epoch": 0.7595180722891566, "grad_norm": 0.43815648555755615, "learning_rate": 3.7241491681967044e-05, "loss": 0.0144, "step": 394 }, { "epoch": 0.7614457831325301, "grad_norm": 0.37194815278053284, "learning_rate": 3.721872029032628e-05, "loss": 0.0286, "step": 395 }, { "epoch": 0.7633734939759036, "grad_norm": 0.7319737672805786, "learning_rate": 3.719586231737322e-05, "loss": 0.0427, "step": 396 }, { "epoch": 0.7653012048192771, "grad_norm": 0.5870066285133362, "learning_rate": 3.717291787804517e-05, "loss": 0.0138, "step": 397 }, { "epoch": 0.7672289156626506, "grad_norm": 0.6574277281761169, "learning_rate": 3.7149887087714225e-05, "loss": 0.061, "step": 398 }, { "epoch": 0.7691566265060241, "grad_norm": 0.5467348694801331, "learning_rate": 3.712677006218666e-05, "loss": 0.022, "step": 399 }, { "epoch": 0.7710843373493976, "grad_norm": 0.3589288890361786, "learning_rate": 3.710356691770238e-05, "loss": 0.0161, "step": 400 }, { "epoch": 0.7730120481927711, "grad_norm": 0.574630618095398, "learning_rate": 3.708027777093433e-05, "loss": 0.0285, "step": 401 }, { "epoch": 0.7749397590361445, "grad_norm": 0.39048445224761963, "learning_rate": 3.70569027389879e-05, "loss": 0.012, "step": 402 }, { "epoch": 0.776867469879518, "grad_norm": 0.34803536534309387, "learning_rate": 3.703344193940032e-05, "loss": 0.0155, "step": 403 }, { "epoch": 0.7787951807228916, "grad_norm": 1.188948392868042, "learning_rate": 3.700989549014011e-05, "loss": 0.0617, "step": 404 }, { "epoch": 0.7807228915662651, "grad_norm": 0.473157674074173, "learning_rate": 3.698626350960646e-05, "loss": 0.0298, "step": 405 }, { "epoch": 0.7826506024096386, "grad_norm": 0.42009076476097107, "learning_rate": 3.6962546116628634e-05, "loss": 0.03, "step": 406 }, { "epoch": 0.7845783132530121, "grad_norm": 0.6334308981895447, "learning_rate": 3.693874343046537e-05, "loss": 0.0107, "step": 407 }, { "epoch": 0.7865060240963856, "grad_norm": 0.35594677925109863, "learning_rate": 3.6914855570804314e-05, "loss": 0.0174, "step": 408 }, { "epoch": 0.788433734939759, "grad_norm": 0.28985708951950073, "learning_rate": 3.689088265776136e-05, "loss": 0.0149, "step": 409 }, { "epoch": 0.7903614457831325, "grad_norm": 0.3981950581073761, "learning_rate": 3.686682481188011e-05, "loss": 0.019, "step": 410 }, { "epoch": 0.792289156626506, "grad_norm": 0.48819583654403687, "learning_rate": 3.6842682154131193e-05, "loss": 0.0217, "step": 411 }, { "epoch": 0.7942168674698795, "grad_norm": 0.42819952964782715, "learning_rate": 3.681845480591174e-05, "loss": 0.0198, "step": 412 }, { "epoch": 0.796144578313253, "grad_norm": 0.48591694235801697, "learning_rate": 3.6794142889044727e-05, "loss": 0.0253, "step": 413 }, { "epoch": 0.7980722891566265, "grad_norm": 0.4730607271194458, "learning_rate": 3.676974652577835e-05, "loss": 0.0329, "step": 414 }, { "epoch": 0.8, "grad_norm": 0.5390865802764893, "learning_rate": 3.6745265838785434e-05, "loss": 0.0479, "step": 415 }, { "epoch": 0.8019277108433734, "grad_norm": 0.6377891302108765, "learning_rate": 3.672070095116283e-05, "loss": 0.019, "step": 416 }, { "epoch": 0.803855421686747, "grad_norm": 0.8984615206718445, "learning_rate": 3.669605198643075e-05, "loss": 0.0444, "step": 417 }, { "epoch": 0.8057831325301205, "grad_norm": 0.4913877546787262, "learning_rate": 3.667131906853219e-05, "loss": 0.031, "step": 418 }, { "epoch": 0.807710843373494, "grad_norm": 0.37894028425216675, "learning_rate": 3.664650232183229e-05, "loss": 0.0195, "step": 419 }, { "epoch": 0.8096385542168675, "grad_norm": 0.3644949495792389, "learning_rate": 3.66216018711177e-05, "loss": 0.018, "step": 420 }, { "epoch": 0.811566265060241, "grad_norm": 0.414440393447876, "learning_rate": 3.659661784159597e-05, "loss": 0.0188, "step": 421 }, { "epoch": 0.8134939759036145, "grad_norm": 0.49220341444015503, "learning_rate": 3.65715503588949e-05, "loss": 0.016, "step": 422 }, { "epoch": 0.815421686746988, "grad_norm": 1.0939836502075195, "learning_rate": 3.654639954906193e-05, "loss": 0.0758, "step": 423 }, { "epoch": 0.8173493975903614, "grad_norm": 0.43222442269325256, "learning_rate": 3.652116553856349e-05, "loss": 0.0308, "step": 424 }, { "epoch": 0.8192771084337349, "grad_norm": 0.5081896185874939, "learning_rate": 3.649584845428438e-05, "loss": 0.0493, "step": 425 }, { "epoch": 0.8212048192771084, "grad_norm": 0.9811948537826538, "learning_rate": 3.64704484235271e-05, "loss": 0.019, "step": 426 }, { "epoch": 0.8231325301204819, "grad_norm": 0.31656572222709656, "learning_rate": 3.6444965574011255e-05, "loss": 0.0135, "step": 427 }, { "epoch": 0.8250602409638554, "grad_norm": 0.7844433188438416, "learning_rate": 3.641940003387289e-05, "loss": 0.0402, "step": 428 }, { "epoch": 0.826987951807229, "grad_norm": 0.3353273570537567, "learning_rate": 3.6393751931663814e-05, "loss": 0.0132, "step": 429 }, { "epoch": 0.8289156626506025, "grad_norm": 0.7253058552742004, "learning_rate": 3.6368021396351015e-05, "loss": 0.0296, "step": 430 }, { "epoch": 0.8308433734939759, "grad_norm": 0.45300304889678955, "learning_rate": 3.634220855731598e-05, "loss": 0.0258, "step": 431 }, { "epoch": 0.8327710843373494, "grad_norm": 0.3480473458766937, "learning_rate": 3.631631354435403e-05, "loss": 0.0099, "step": 432 }, { "epoch": 0.8346987951807229, "grad_norm": 2.1114516258239746, "learning_rate": 3.62903364876737e-05, "loss": 0.0457, "step": 433 }, { "epoch": 0.8366265060240964, "grad_norm": 0.5649561882019043, "learning_rate": 3.626427751789606e-05, "loss": 0.0444, "step": 434 }, { "epoch": 0.8385542168674699, "grad_norm": 0.3864995539188385, "learning_rate": 3.623813676605405e-05, "loss": 0.0223, "step": 435 }, { "epoch": 0.8404819277108434, "grad_norm": 1.2134298086166382, "learning_rate": 3.621191436359186e-05, "loss": 0.0353, "step": 436 }, { "epoch": 0.8424096385542169, "grad_norm": 0.4403415024280548, "learning_rate": 3.6185610442364246e-05, "loss": 0.0216, "step": 437 }, { "epoch": 0.8443373493975903, "grad_norm": 0.6050297021865845, "learning_rate": 3.6159225134635846e-05, "loss": 0.0433, "step": 438 }, { "epoch": 0.8462650602409638, "grad_norm": 0.7951678037643433, "learning_rate": 3.6132758573080556e-05, "loss": 0.031, "step": 439 }, { "epoch": 0.8481927710843373, "grad_norm": 0.4991949796676636, "learning_rate": 3.6106210890780834e-05, "loss": 0.0313, "step": 440 }, { "epoch": 0.8501204819277108, "grad_norm": 0.47951385378837585, "learning_rate": 3.607958222122704e-05, "loss": 0.0218, "step": 441 }, { "epoch": 0.8520481927710843, "grad_norm": 0.7345194220542908, "learning_rate": 3.6052872698316755e-05, "loss": 0.0239, "step": 442 }, { "epoch": 0.8539759036144579, "grad_norm": 1.4814884662628174, "learning_rate": 3.602608245635414e-05, "loss": 0.0127, "step": 443 }, { "epoch": 0.8559036144578314, "grad_norm": 2.4240877628326416, "learning_rate": 3.599921163004922e-05, "loss": 0.0618, "step": 444 }, { "epoch": 0.8578313253012049, "grad_norm": 0.41523510217666626, "learning_rate": 3.5972260354517216e-05, "loss": 0.0283, "step": 445 }, { "epoch": 0.8597590361445783, "grad_norm": 0.5577677488327026, "learning_rate": 3.594522876527791e-05, "loss": 0.0271, "step": 446 }, { "epoch": 0.8616867469879518, "grad_norm": 0.5829064846038818, "learning_rate": 3.591811699825487e-05, "loss": 0.0169, "step": 447 }, { "epoch": 0.8636144578313253, "grad_norm": 0.4478822350502014, "learning_rate": 3.5890925189774886e-05, "loss": 0.0239, "step": 448 }, { "epoch": 0.8655421686746988, "grad_norm": 0.3498048782348633, "learning_rate": 3.586365347656718e-05, "loss": 0.0137, "step": 449 }, { "epoch": 0.8674698795180723, "grad_norm": 0.6571130156517029, "learning_rate": 3.583630199576278e-05, "loss": 0.027, "step": 450 }, { "epoch": 0.8693975903614458, "grad_norm": 0.344970166683197, "learning_rate": 3.58088708848938e-05, "loss": 0.0167, "step": 451 }, { "epoch": 0.8713253012048193, "grad_norm": 0.34611570835113525, "learning_rate": 3.5781360281892775e-05, "loss": 0.0468, "step": 452 }, { "epoch": 0.8732530120481927, "grad_norm": 0.66157066822052, "learning_rate": 3.575377032509194e-05, "loss": 0.0344, "step": 453 }, { "epoch": 0.8751807228915662, "grad_norm": 0.3676326870918274, "learning_rate": 3.5726101153222534e-05, "loss": 0.0366, "step": 454 }, { "epoch": 0.8771084337349397, "grad_norm": 0.5958423018455505, "learning_rate": 3.569835290541414e-05, "loss": 0.0382, "step": 455 }, { "epoch": 0.8790361445783132, "grad_norm": 0.36787471175193787, "learning_rate": 3.567052572119397e-05, "loss": 0.018, "step": 456 }, { "epoch": 0.8809638554216868, "grad_norm": 0.9478234052658081, "learning_rate": 3.564261974048611e-05, "loss": 0.0179, "step": 457 }, { "epoch": 0.8828915662650603, "grad_norm": 0.3337579369544983, "learning_rate": 3.56146351036109e-05, "loss": 0.0147, "step": 458 }, { "epoch": 0.8848192771084338, "grad_norm": 0.4984932243824005, "learning_rate": 3.558657195128416e-05, "loss": 0.0224, "step": 459 }, { "epoch": 0.8867469879518072, "grad_norm": 0.36718735098838806, "learning_rate": 3.555843042461653e-05, "loss": 0.0202, "step": 460 }, { "epoch": 0.8886746987951807, "grad_norm": 0.4081745445728302, "learning_rate": 3.553021066511274e-05, "loss": 0.0288, "step": 461 }, { "epoch": 0.8906024096385542, "grad_norm": 0.3233242332935333, "learning_rate": 3.55019128146709e-05, "loss": 0.0362, "step": 462 }, { "epoch": 0.8925301204819277, "grad_norm": 0.6560158729553223, "learning_rate": 3.547353701558178e-05, "loss": 0.038, "step": 463 }, { "epoch": 0.8944578313253012, "grad_norm": 0.47668641805648804, "learning_rate": 3.544508341052811e-05, "loss": 0.0399, "step": 464 }, { "epoch": 0.8963855421686747, "grad_norm": 0.45512664318084717, "learning_rate": 3.541655214258383e-05, "loss": 0.022, "step": 465 }, { "epoch": 0.8983132530120482, "grad_norm": 0.8410730361938477, "learning_rate": 3.538794335521343e-05, "loss": 0.0315, "step": 466 }, { "epoch": 0.9002409638554217, "grad_norm": 0.4872909486293793, "learning_rate": 3.535925719227117e-05, "loss": 0.0152, "step": 467 }, { "epoch": 0.9021686746987951, "grad_norm": 0.45623311400413513, "learning_rate": 3.533049379800038e-05, "loss": 0.0305, "step": 468 }, { "epoch": 0.9040963855421686, "grad_norm": 0.43087029457092285, "learning_rate": 3.530165331703275e-05, "loss": 0.0131, "step": 469 }, { "epoch": 0.9060240963855422, "grad_norm": 0.4610525369644165, "learning_rate": 3.527273589438756e-05, "loss": 0.0187, "step": 470 }, { "epoch": 0.9079518072289157, "grad_norm": 0.3356114327907562, "learning_rate": 3.5243741675471006e-05, "loss": 0.0185, "step": 471 }, { "epoch": 0.9098795180722892, "grad_norm": 0.9065960049629211, "learning_rate": 3.5214670806075426e-05, "loss": 0.0433, "step": 472 }, { "epoch": 0.9118072289156627, "grad_norm": 0.3652578294277191, "learning_rate": 3.518552343237858e-05, "loss": 0.02, "step": 473 }, { "epoch": 0.9137349397590362, "grad_norm": 0.32377883791923523, "learning_rate": 3.5156299700942916e-05, "loss": 0.0165, "step": 474 }, { "epoch": 0.9156626506024096, "grad_norm": 0.2431817352771759, "learning_rate": 3.512699975871485e-05, "loss": 0.0172, "step": 475 }, { "epoch": 0.9175903614457831, "grad_norm": 0.6390707492828369, "learning_rate": 3.509762375302399e-05, "loss": 0.0356, "step": 476 }, { "epoch": 0.9195180722891566, "grad_norm": 0.2283092886209488, "learning_rate": 3.506817183158243e-05, "loss": 0.0088, "step": 477 }, { "epoch": 0.9214457831325301, "grad_norm": 0.5053914189338684, "learning_rate": 3.5038644142483966e-05, "loss": 0.0389, "step": 478 }, { "epoch": 0.9233734939759036, "grad_norm": 0.2567576467990875, "learning_rate": 3.500904083420342e-05, "loss": 0.0155, "step": 479 }, { "epoch": 0.9253012048192771, "grad_norm": 0.6852384209632874, "learning_rate": 3.497936205559583e-05, "loss": 0.0247, "step": 480 }, { "epoch": 0.9272289156626506, "grad_norm": 0.36403414607048035, "learning_rate": 3.494960795589572e-05, "loss": 0.023, "step": 481 }, { "epoch": 0.929156626506024, "grad_norm": 0.506554901599884, "learning_rate": 3.491977868471635e-05, "loss": 0.0273, "step": 482 }, { "epoch": 0.9310843373493976, "grad_norm": 0.38329923152923584, "learning_rate": 3.4889874392048985e-05, "loss": 0.0169, "step": 483 }, { "epoch": 0.9330120481927711, "grad_norm": 0.2805836498737335, "learning_rate": 3.48598952282621e-05, "loss": 0.0105, "step": 484 }, { "epoch": 0.9349397590361446, "grad_norm": 0.6315302848815918, "learning_rate": 3.482984134410067e-05, "loss": 0.0289, "step": 485 }, { "epoch": 0.9368674698795181, "grad_norm": 0.6431388854980469, "learning_rate": 3.479971289068537e-05, "loss": 0.0311, "step": 486 }, { "epoch": 0.9387951807228916, "grad_norm": 0.9794723391532898, "learning_rate": 3.476951001951184e-05, "loss": 0.0452, "step": 487 }, { "epoch": 0.9407228915662651, "grad_norm": 0.7984824180603027, "learning_rate": 3.473923288244991e-05, "loss": 0.0689, "step": 488 }, { "epoch": 0.9426506024096386, "grad_norm": 0.46362006664276123, "learning_rate": 3.470888163174286e-05, "loss": 0.0241, "step": 489 }, { "epoch": 0.944578313253012, "grad_norm": 0.5051195025444031, "learning_rate": 3.467845642000661e-05, "loss": 0.0228, "step": 490 }, { "epoch": 0.9465060240963855, "grad_norm": 0.3082812428474426, "learning_rate": 3.4647957400229004e-05, "loss": 0.0144, "step": 491 }, { "epoch": 0.948433734939759, "grad_norm": 0.2691391110420227, "learning_rate": 3.461738472576902e-05, "loss": 0.0167, "step": 492 }, { "epoch": 0.9503614457831325, "grad_norm": 0.5627671480178833, "learning_rate": 3.458673855035597e-05, "loss": 0.031, "step": 493 }, { "epoch": 0.952289156626506, "grad_norm": 0.4571435749530792, "learning_rate": 3.455601902808876e-05, "loss": 0.0191, "step": 494 }, { "epoch": 0.9542168674698795, "grad_norm": 1.0117709636688232, "learning_rate": 3.452522631343515e-05, "loss": 0.0192, "step": 495 }, { "epoch": 0.9561445783132531, "grad_norm": 0.28375712037086487, "learning_rate": 3.449436056123086e-05, "loss": 0.0159, "step": 496 }, { "epoch": 0.9580722891566265, "grad_norm": 0.26381856203079224, "learning_rate": 3.446342192667893e-05, "loss": 0.0113, "step": 497 }, { "epoch": 0.96, "grad_norm": 0.49317577481269836, "learning_rate": 3.443241056534884e-05, "loss": 0.0332, "step": 498 }, { "epoch": 0.9619277108433735, "grad_norm": 0.28884485363960266, "learning_rate": 3.440132663317579e-05, "loss": 0.0117, "step": 499 }, { "epoch": 0.963855421686747, "grad_norm": 0.36255285143852234, "learning_rate": 3.4370170286459864e-05, "loss": 0.0169, "step": 500 }, { "epoch": 0.9657831325301205, "grad_norm": 0.4265049993991852, "learning_rate": 3.433894168186529e-05, "loss": 0.0217, "step": 501 }, { "epoch": 0.967710843373494, "grad_norm": 0.8169426321983337, "learning_rate": 3.430764097641962e-05, "loss": 0.0207, "step": 502 }, { "epoch": 0.9696385542168675, "grad_norm": 1.866077184677124, "learning_rate": 3.427626832751296e-05, "loss": 0.0381, "step": 503 }, { "epoch": 0.971566265060241, "grad_norm": 0.33124980330467224, "learning_rate": 3.424482389289716e-05, "loss": 0.0245, "step": 504 }, { "epoch": 0.9734939759036144, "grad_norm": 0.37479540705680847, "learning_rate": 3.4213307830685055e-05, "loss": 0.0164, "step": 505 }, { "epoch": 0.9754216867469879, "grad_norm": 0.39738863706588745, "learning_rate": 3.4181720299349615e-05, "loss": 0.0297, "step": 506 }, { "epoch": 0.9773493975903614, "grad_norm": 0.2567287087440491, "learning_rate": 3.4150061457723205e-05, "loss": 0.0102, "step": 507 }, { "epoch": 0.9792771084337349, "grad_norm": 0.6230517029762268, "learning_rate": 3.411833146499675e-05, "loss": 0.0243, "step": 508 }, { "epoch": 0.9812048192771085, "grad_norm": 0.44843971729278564, "learning_rate": 3.408653048071894e-05, "loss": 0.0357, "step": 509 }, { "epoch": 0.983132530120482, "grad_norm": 1.0569655895233154, "learning_rate": 3.405465866479546e-05, "loss": 0.037, "step": 510 }, { "epoch": 0.9850602409638555, "grad_norm": 0.29000964760780334, "learning_rate": 3.402271617748812e-05, "loss": 0.0129, "step": 511 }, { "epoch": 0.9869879518072289, "grad_norm": 2.1627447605133057, "learning_rate": 3.399070317941413e-05, "loss": 0.0442, "step": 512 }, { "epoch": 0.9889156626506024, "grad_norm": 0.27371272444725037, "learning_rate": 3.395861983154522e-05, "loss": 0.0119, "step": 513 }, { "epoch": 0.9908433734939759, "grad_norm": 0.4117226302623749, "learning_rate": 3.392646629520688e-05, "loss": 0.0455, "step": 514 }, { "epoch": 0.9927710843373494, "grad_norm": 0.5098996758460999, "learning_rate": 3.389424273207752e-05, "loss": 0.0203, "step": 515 }, { "epoch": 0.9946987951807229, "grad_norm": 0.5192157626152039, "learning_rate": 3.386194930418767e-05, "loss": 0.0329, "step": 516 }, { "epoch": 0.9966265060240964, "grad_norm": 0.18757697939872742, "learning_rate": 3.382958617391915e-05, "loss": 0.0065, "step": 517 }, { "epoch": 0.9985542168674699, "grad_norm": 0.3334413170814514, "learning_rate": 3.3797153504004296e-05, "loss": 0.0266, "step": 518 }, { "epoch": 1.0, "grad_norm": 0.4152225852012634, "learning_rate": 3.3764651457525095e-05, "loss": 0.0169, "step": 519 }, { "epoch": 1.0019277108433735, "grad_norm": 0.43535247445106506, "learning_rate": 3.373208019791237e-05, "loss": 0.0221, "step": 520 }, { "epoch": 1.003855421686747, "grad_norm": 0.39292722940444946, "learning_rate": 3.3699439888945e-05, "loss": 0.0211, "step": 521 }, { "epoch": 1.0057831325301205, "grad_norm": 0.19566713273525238, "learning_rate": 3.366673069474904e-05, "loss": 0.0069, "step": 522 }, { "epoch": 1.007710843373494, "grad_norm": 0.5101853609085083, "learning_rate": 3.3633952779796914e-05, "loss": 0.0191, "step": 523 }, { "epoch": 1.0096385542168675, "grad_norm": 0.999434769153595, "learning_rate": 3.360110630890664e-05, "loss": 0.0196, "step": 524 }, { "epoch": 1.011566265060241, "grad_norm": 0.4646223783493042, "learning_rate": 3.356819144724092e-05, "loss": 0.0328, "step": 525 }, { "epoch": 1.0134939759036146, "grad_norm": 0.3132480978965759, "learning_rate": 3.3535208360306354e-05, "loss": 0.0203, "step": 526 }, { "epoch": 1.0154216867469879, "grad_norm": 0.3038032352924347, "learning_rate": 3.350215721395261e-05, "loss": 0.0122, "step": 527 }, { "epoch": 1.0173493975903614, "grad_norm": 0.45082882046699524, "learning_rate": 3.346903817437157e-05, "loss": 0.0437, "step": 528 }, { "epoch": 1.0192771084337349, "grad_norm": 0.26917046308517456, "learning_rate": 3.343585140809651e-05, "loss": 0.013, "step": 529 }, { "epoch": 1.0212048192771084, "grad_norm": 0.23869264125823975, "learning_rate": 3.3402597082001276e-05, "loss": 0.008, "step": 530 }, { "epoch": 1.0231325301204819, "grad_norm": 0.31315353512763977, "learning_rate": 3.3369275363299394e-05, "loss": 0.0078, "step": 531 }, { "epoch": 1.0250602409638554, "grad_norm": 0.4780346751213074, "learning_rate": 3.333588641954327e-05, "loss": 0.0225, "step": 532 }, { "epoch": 1.026987951807229, "grad_norm": 0.2920368015766144, "learning_rate": 3.330243041862336e-05, "loss": 0.0118, "step": 533 }, { "epoch": 1.0289156626506024, "grad_norm": 0.543669581413269, "learning_rate": 3.326890752876728e-05, "loss": 0.0338, "step": 534 }, { "epoch": 1.030843373493976, "grad_norm": 0.4288000464439392, "learning_rate": 3.323531791853901e-05, "loss": 0.0341, "step": 535 }, { "epoch": 1.0327710843373494, "grad_norm": 0.26600322127342224, "learning_rate": 3.3201661756838e-05, "loss": 0.0184, "step": 536 }, { "epoch": 1.034698795180723, "grad_norm": 0.290937602519989, "learning_rate": 3.316793921289835e-05, "loss": 0.0152, "step": 537 }, { "epoch": 1.0366265060240965, "grad_norm": 0.7621443271636963, "learning_rate": 3.313415045628795e-05, "loss": 0.0326, "step": 538 }, { "epoch": 1.03855421686747, "grad_norm": 0.5581283569335938, "learning_rate": 3.3100295656907646e-05, "loss": 0.0164, "step": 539 }, { "epoch": 1.0404819277108435, "grad_norm": 0.20930901169776917, "learning_rate": 3.306637498499034e-05, "loss": 0.0091, "step": 540 }, { "epoch": 1.0424096385542168, "grad_norm": 0.46212059259414673, "learning_rate": 3.303238861110018e-05, "loss": 0.0118, "step": 541 }, { "epoch": 1.0443373493975903, "grad_norm": 0.38259151577949524, "learning_rate": 3.299833670613168e-05, "loss": 0.0081, "step": 542 }, { "epoch": 1.0462650602409638, "grad_norm": 0.4888618290424347, "learning_rate": 3.2964219441308865e-05, "loss": 0.0138, "step": 543 }, { "epoch": 1.0481927710843373, "grad_norm": 0.32103127241134644, "learning_rate": 3.2930036988184425e-05, "loss": 0.0171, "step": 544 }, { "epoch": 1.0501204819277108, "grad_norm": 0.27787327766418457, "learning_rate": 3.28957895186388e-05, "loss": 0.0106, "step": 545 }, { "epoch": 1.0520481927710843, "grad_norm": 0.35597777366638184, "learning_rate": 3.2861477204879395e-05, "loss": 0.0123, "step": 546 }, { "epoch": 1.0539759036144578, "grad_norm": 0.3619804084300995, "learning_rate": 3.2827100219439656e-05, "loss": 0.0088, "step": 547 }, { "epoch": 1.0559036144578313, "grad_norm": 0.2525513470172882, "learning_rate": 3.279265873517822e-05, "loss": 0.0179, "step": 548 }, { "epoch": 1.0578313253012048, "grad_norm": 0.3910020887851715, "learning_rate": 3.275815292527804e-05, "loss": 0.0142, "step": 549 }, { "epoch": 1.0597590361445783, "grad_norm": 0.30515050888061523, "learning_rate": 3.2723582963245526e-05, "loss": 0.0123, "step": 550 }, { "epoch": 1.0616867469879518, "grad_norm": 0.21708644926548004, "learning_rate": 3.2688949022909665e-05, "loss": 0.0098, "step": 551 }, { "epoch": 1.0636144578313254, "grad_norm": 0.23307719826698303, "learning_rate": 3.265425127842114e-05, "loss": 0.0097, "step": 552 }, { "epoch": 1.0655421686746989, "grad_norm": 0.676654577255249, "learning_rate": 3.261948990425147e-05, "loss": 0.0227, "step": 553 }, { "epoch": 1.0674698795180724, "grad_norm": 0.4593975841999054, "learning_rate": 3.258466507519213e-05, "loss": 0.047, "step": 554 }, { "epoch": 1.0693975903614459, "grad_norm": 0.19405829906463623, "learning_rate": 3.254977696635366e-05, "loss": 0.0314, "step": 555 }, { "epoch": 1.0713253012048192, "grad_norm": 0.14563389122486115, "learning_rate": 3.2514825753164774e-05, "loss": 0.0046, "step": 556 }, { "epoch": 1.0732530120481927, "grad_norm": 0.2642340064048767, "learning_rate": 3.247981161137153e-05, "loss": 0.022, "step": 557 }, { "epoch": 1.0751807228915662, "grad_norm": 0.17274761199951172, "learning_rate": 3.2444734717036386e-05, "loss": 0.0134, "step": 558 }, { "epoch": 1.0771084337349397, "grad_norm": 0.44354626536369324, "learning_rate": 3.240959524653735e-05, "loss": 0.0211, "step": 559 }, { "epoch": 1.0790361445783132, "grad_norm": 0.2806888818740845, "learning_rate": 3.237439337656708e-05, "loss": 0.0141, "step": 560 }, { "epoch": 1.0809638554216867, "grad_norm": 0.21679501235485077, "learning_rate": 3.2339129284131994e-05, "loss": 0.019, "step": 561 }, { "epoch": 1.0828915662650602, "grad_norm": 0.3040260076522827, "learning_rate": 3.2303803146551386e-05, "loss": 0.0249, "step": 562 }, { "epoch": 1.0848192771084337, "grad_norm": 0.2793775200843811, "learning_rate": 3.226841514145656e-05, "loss": 0.0088, "step": 563 }, { "epoch": 1.0867469879518072, "grad_norm": 0.149955615401268, "learning_rate": 3.223296544678987e-05, "loss": 0.0054, "step": 564 }, { "epoch": 1.0886746987951808, "grad_norm": 0.22166767716407776, "learning_rate": 3.219745424080389e-05, "loss": 0.0109, "step": 565 }, { "epoch": 1.0906024096385543, "grad_norm": 0.22399431467056274, "learning_rate": 3.2161881702060476e-05, "loss": 0.0106, "step": 566 }, { "epoch": 1.0925301204819278, "grad_norm": 0.18537986278533936, "learning_rate": 3.2126248009429905e-05, "loss": 0.0077, "step": 567 }, { "epoch": 1.0944578313253013, "grad_norm": 0.24511495232582092, "learning_rate": 3.2090553342089935e-05, "loss": 0.0093, "step": 568 }, { "epoch": 1.0963855421686748, "grad_norm": 0.4766045808792114, "learning_rate": 3.205479787952494e-05, "loss": 0.036, "step": 569 }, { "epoch": 1.0983132530120483, "grad_norm": 0.1425715535879135, "learning_rate": 3.201898180152499e-05, "loss": 0.0085, "step": 570 }, { "epoch": 1.1002409638554216, "grad_norm": 0.1909666359424591, "learning_rate": 3.1983105288184945e-05, "loss": 0.0081, "step": 571 }, { "epoch": 1.102168674698795, "grad_norm": 0.44077104330062866, "learning_rate": 3.194716851990355e-05, "loss": 0.017, "step": 572 }, { "epoch": 1.1040963855421686, "grad_norm": 0.5757400989532471, "learning_rate": 3.191117167738253e-05, "loss": 0.021, "step": 573 }, { "epoch": 1.106024096385542, "grad_norm": 0.1977701038122177, "learning_rate": 3.1875114941625705e-05, "loss": 0.0096, "step": 574 }, { "epoch": 1.1079518072289156, "grad_norm": 0.3524581491947174, "learning_rate": 3.1838998493938026e-05, "loss": 0.0118, "step": 575 }, { "epoch": 1.1098795180722891, "grad_norm": 0.3301331698894501, "learning_rate": 3.180282251592472e-05, "loss": 0.0094, "step": 576 }, { "epoch": 1.1118072289156626, "grad_norm": 0.2774488925933838, "learning_rate": 3.1766587189490336e-05, "loss": 0.0131, "step": 577 }, { "epoch": 1.1137349397590361, "grad_norm": 1.732595443725586, "learning_rate": 3.173029269683785e-05, "loss": 0.0445, "step": 578 }, { "epoch": 1.1156626506024097, "grad_norm": 0.28746843338012695, "learning_rate": 3.169393922046776e-05, "loss": 0.0116, "step": 579 }, { "epoch": 1.1175903614457832, "grad_norm": 0.2952995002269745, "learning_rate": 3.165752694317713e-05, "loss": 0.0116, "step": 580 }, { "epoch": 1.1195180722891567, "grad_norm": 0.2938575744628906, "learning_rate": 3.16210560480587e-05, "loss": 0.013, "step": 581 }, { "epoch": 1.1214457831325302, "grad_norm": 0.22283495962619781, "learning_rate": 3.158452671849998e-05, "loss": 0.0052, "step": 582 }, { "epoch": 1.1233734939759037, "grad_norm": 0.6272858381271362, "learning_rate": 3.154793913818226e-05, "loss": 0.0182, "step": 583 }, { "epoch": 1.1253012048192772, "grad_norm": 0.479753702878952, "learning_rate": 3.1511293491079804e-05, "loss": 0.0146, "step": 584 }, { "epoch": 1.1272289156626507, "grad_norm": 0.31104400753974915, "learning_rate": 3.1474589961458786e-05, "loss": 0.0139, "step": 585 }, { "epoch": 1.129156626506024, "grad_norm": 0.4932832419872284, "learning_rate": 3.1437828733876477e-05, "loss": 0.0236, "step": 586 }, { "epoch": 1.1310843373493975, "grad_norm": 0.222808837890625, "learning_rate": 3.140100999318025e-05, "loss": 0.0084, "step": 587 }, { "epoch": 1.133012048192771, "grad_norm": 0.4515356719493866, "learning_rate": 3.136413392450668e-05, "loss": 0.0215, "step": 588 }, { "epoch": 1.1349397590361445, "grad_norm": 0.39302268624305725, "learning_rate": 3.132720071328061e-05, "loss": 0.0154, "step": 589 }, { "epoch": 1.136867469879518, "grad_norm": 0.43382835388183594, "learning_rate": 3.1290210545214205e-05, "loss": 0.0088, "step": 590 }, { "epoch": 1.1387951807228915, "grad_norm": 0.18707136809825897, "learning_rate": 3.125316360630602e-05, "loss": 0.0126, "step": 591 }, { "epoch": 1.140722891566265, "grad_norm": 0.5688219666481018, "learning_rate": 3.121606008284011e-05, "loss": 0.0147, "step": 592 }, { "epoch": 1.1426506024096386, "grad_norm": 0.3321833312511444, "learning_rate": 3.1178900161385005e-05, "loss": 0.0119, "step": 593 }, { "epoch": 1.144578313253012, "grad_norm": 0.3738424777984619, "learning_rate": 3.114168402879286e-05, "loss": 0.0158, "step": 594 }, { "epoch": 1.1465060240963856, "grad_norm": 0.2386978417634964, "learning_rate": 3.110441187219846e-05, "loss": 0.0107, "step": 595 }, { "epoch": 1.148433734939759, "grad_norm": 0.2165699452161789, "learning_rate": 3.10670838790183e-05, "loss": 0.0079, "step": 596 }, { "epoch": 1.1503614457831326, "grad_norm": 0.25952696800231934, "learning_rate": 3.102970023694965e-05, "loss": 0.0147, "step": 597 }, { "epoch": 1.152289156626506, "grad_norm": 0.21448305249214172, "learning_rate": 3.099226113396959e-05, "loss": 0.0099, "step": 598 }, { "epoch": 1.1542168674698796, "grad_norm": 0.37226060032844543, "learning_rate": 3.095476675833405e-05, "loss": 0.0214, "step": 599 }, { "epoch": 1.1561445783132531, "grad_norm": 0.29637983441352844, "learning_rate": 3.0917217298576955e-05, "loss": 0.0118, "step": 600 }, { "epoch": 1.1580722891566264, "grad_norm": 0.18535609543323517, "learning_rate": 3.0879612943509154e-05, "loss": 0.0086, "step": 601 }, { "epoch": 1.16, "grad_norm": 0.25874125957489014, "learning_rate": 3.0841953882217536e-05, "loss": 0.0088, "step": 602 }, { "epoch": 1.1619277108433734, "grad_norm": 0.46092745661735535, "learning_rate": 3.08042403040641e-05, "loss": 0.0241, "step": 603 }, { "epoch": 1.163855421686747, "grad_norm": 0.27023249864578247, "learning_rate": 3.076647239868494e-05, "loss": 0.0154, "step": 604 }, { "epoch": 1.1657831325301204, "grad_norm": 0.445157527923584, "learning_rate": 3.072865035598933e-05, "loss": 0.0197, "step": 605 }, { "epoch": 1.167710843373494, "grad_norm": 0.18097272515296936, "learning_rate": 3.06907743661588e-05, "loss": 0.0093, "step": 606 }, { "epoch": 1.1696385542168675, "grad_norm": 0.22469942271709442, "learning_rate": 3.065284461964609e-05, "loss": 0.0171, "step": 607 }, { "epoch": 1.171566265060241, "grad_norm": 0.20190906524658203, "learning_rate": 3.061486130717428e-05, "loss": 0.008, "step": 608 }, { "epoch": 1.1734939759036145, "grad_norm": 0.18294145166873932, "learning_rate": 3.057682461973579e-05, "loss": 0.0155, "step": 609 }, { "epoch": 1.175421686746988, "grad_norm": 0.34203943610191345, "learning_rate": 3.053873474859143e-05, "loss": 0.0212, "step": 610 }, { "epoch": 1.1773493975903615, "grad_norm": 0.49073582887649536, "learning_rate": 3.050059188526942e-05, "loss": 0.019, "step": 611 }, { "epoch": 1.179277108433735, "grad_norm": 0.3537680506706238, "learning_rate": 3.046239622156446e-05, "loss": 0.0147, "step": 612 }, { "epoch": 1.1812048192771085, "grad_norm": 0.2584632635116577, "learning_rate": 3.042414794953674e-05, "loss": 0.0088, "step": 613 }, { "epoch": 1.1831325301204818, "grad_norm": 0.3529360890388489, "learning_rate": 3.0385847261510975e-05, "loss": 0.0187, "step": 614 }, { "epoch": 1.1850602409638555, "grad_norm": 0.3331570327281952, "learning_rate": 3.0347494350075465e-05, "loss": 0.0124, "step": 615 }, { "epoch": 1.1869879518072288, "grad_norm": 0.2223527580499649, "learning_rate": 3.0309089408081074e-05, "loss": 0.01, "step": 616 }, { "epoch": 1.1889156626506023, "grad_norm": 0.21985746920108795, "learning_rate": 3.027063262864032e-05, "loss": 0.0087, "step": 617 }, { "epoch": 1.1908433734939758, "grad_norm": 0.2989653944969177, "learning_rate": 3.023212420512637e-05, "loss": 0.0137, "step": 618 }, { "epoch": 1.1927710843373494, "grad_norm": 0.17423275113105774, "learning_rate": 3.0193564331172074e-05, "loss": 0.0056, "step": 619 }, { "epoch": 1.1946987951807229, "grad_norm": 1.0992127656936646, "learning_rate": 3.0154953200668976e-05, "loss": 0.0274, "step": 620 }, { "epoch": 1.1966265060240964, "grad_norm": 0.21641989052295685, "learning_rate": 3.011629100776638e-05, "loss": 0.0151, "step": 621 }, { "epoch": 1.1985542168674699, "grad_norm": 0.4558199644088745, "learning_rate": 3.007757794687033e-05, "loss": 0.0424, "step": 622 }, { "epoch": 1.2004819277108434, "grad_norm": 0.42380189895629883, "learning_rate": 3.003881421264266e-05, "loss": 0.0079, "step": 623 }, { "epoch": 1.202409638554217, "grad_norm": 0.28791171312332153, "learning_rate": 3.0000000000000004e-05, "loss": 0.0142, "step": 624 }, { "epoch": 1.2043373493975904, "grad_norm": 0.3906581997871399, "learning_rate": 2.996113550411281e-05, "loss": 0.0251, "step": 625 }, { "epoch": 1.206265060240964, "grad_norm": 0.47848746180534363, "learning_rate": 2.9922220920404375e-05, "loss": 0.0137, "step": 626 }, { "epoch": 1.2081927710843374, "grad_norm": 0.22666941583156586, "learning_rate": 2.9883256444549862e-05, "loss": 0.0105, "step": 627 }, { "epoch": 1.210120481927711, "grad_norm": 0.18968136608600616, "learning_rate": 2.984424227247529e-05, "loss": 0.0089, "step": 628 }, { "epoch": 1.2120481927710842, "grad_norm": 0.28732606768608093, "learning_rate": 2.980517860035656e-05, "loss": 0.0253, "step": 629 }, { "epoch": 1.213975903614458, "grad_norm": 0.21131543815135956, "learning_rate": 2.9766065624618518e-05, "loss": 0.0134, "step": 630 }, { "epoch": 1.2159036144578312, "grad_norm": 0.7594877481460571, "learning_rate": 2.972690354193388e-05, "loss": 0.0157, "step": 631 }, { "epoch": 1.2178313253012047, "grad_norm": 0.730291485786438, "learning_rate": 2.96876925492223e-05, "loss": 0.0204, "step": 632 }, { "epoch": 1.2197590361445783, "grad_norm": 0.20333674550056458, "learning_rate": 2.9648432843649382e-05, "loss": 0.0114, "step": 633 }, { "epoch": 1.2216867469879518, "grad_norm": 0.5680793523788452, "learning_rate": 2.960912462262566e-05, "loss": 0.0146, "step": 634 }, { "epoch": 1.2236144578313253, "grad_norm": 0.4591079354286194, "learning_rate": 2.9569768083805618e-05, "loss": 0.0112, "step": 635 }, { "epoch": 1.2255421686746988, "grad_norm": 0.3793511390686035, "learning_rate": 2.953036342508671e-05, "loss": 0.0377, "step": 636 }, { "epoch": 1.2274698795180723, "grad_norm": 1.118723750114441, "learning_rate": 2.9490910844608346e-05, "loss": 0.0432, "step": 637 }, { "epoch": 1.2293975903614458, "grad_norm": 0.36990776658058167, "learning_rate": 2.9451410540750887e-05, "loss": 0.0203, "step": 638 }, { "epoch": 1.2313253012048193, "grad_norm": 0.930397629737854, "learning_rate": 2.94118627121347e-05, "loss": 0.0311, "step": 639 }, { "epoch": 1.2332530120481928, "grad_norm": 0.2347625195980072, "learning_rate": 2.9372267557619075e-05, "loss": 0.0168, "step": 640 }, { "epoch": 1.2351807228915663, "grad_norm": 0.3720332384109497, "learning_rate": 2.933262527630131e-05, "loss": 0.0136, "step": 641 }, { "epoch": 1.2371084337349398, "grad_norm": 0.4871984124183655, "learning_rate": 2.929293606751565e-05, "loss": 0.0339, "step": 642 }, { "epoch": 1.2390361445783133, "grad_norm": 0.35853689908981323, "learning_rate": 2.9253200130832322e-05, "loss": 0.0095, "step": 643 }, { "epoch": 1.2409638554216866, "grad_norm": 0.42003703117370605, "learning_rate": 2.92134176660565e-05, "loss": 0.0142, "step": 644 }, { "epoch": 1.2428915662650604, "grad_norm": 0.3854500651359558, "learning_rate": 2.9173588873227338e-05, "loss": 0.0209, "step": 645 }, { "epoch": 1.2448192771084337, "grad_norm": 0.24665917456150055, "learning_rate": 2.913371395261691e-05, "loss": 0.0087, "step": 646 }, { "epoch": 1.2467469879518072, "grad_norm": 0.41571593284606934, "learning_rate": 2.9093793104729268e-05, "loss": 0.0164, "step": 647 }, { "epoch": 1.2486746987951807, "grad_norm": 0.4597891569137573, "learning_rate": 2.9053826530299377e-05, "loss": 0.0138, "step": 648 }, { "epoch": 1.2506024096385542, "grad_norm": 0.43345385789871216, "learning_rate": 2.901381443029215e-05, "loss": 0.0353, "step": 649 }, { "epoch": 1.2525301204819277, "grad_norm": 0.3706768751144409, "learning_rate": 2.897375700590141e-05, "loss": 0.007, "step": 650 }, { "epoch": 1.2544578313253012, "grad_norm": 0.30305296182632446, "learning_rate": 2.8933654458548873e-05, "loss": 0.0123, "step": 651 }, { "epoch": 1.2563855421686747, "grad_norm": 0.2042127549648285, "learning_rate": 2.8893506989883167e-05, "loss": 0.0099, "step": 652 }, { "epoch": 1.2583132530120482, "grad_norm": 0.20524422824382782, "learning_rate": 2.8853314801778784e-05, "loss": 0.0097, "step": 653 }, { "epoch": 1.2602409638554217, "grad_norm": 0.2351921945810318, "learning_rate": 2.8813078096335093e-05, "loss": 0.0091, "step": 654 }, { "epoch": 1.2621686746987952, "grad_norm": 0.34547340869903564, "learning_rate": 2.87727970758753e-05, "loss": 0.0088, "step": 655 }, { "epoch": 1.2640963855421687, "grad_norm": 0.35163217782974243, "learning_rate": 2.8732471942945443e-05, "loss": 0.0145, "step": 656 }, { "epoch": 1.266024096385542, "grad_norm": 1.715137243270874, "learning_rate": 2.8692102900313378e-05, "loss": 0.0198, "step": 657 }, { "epoch": 1.2679518072289158, "grad_norm": 0.2860178053379059, "learning_rate": 2.8651690150967748e-05, "loss": 0.0085, "step": 658 }, { "epoch": 1.269879518072289, "grad_norm": 0.21175967156887054, "learning_rate": 2.8611233898116967e-05, "loss": 0.0071, "step": 659 }, { "epoch": 1.2718072289156628, "grad_norm": 0.33726972341537476, "learning_rate": 2.85707343451882e-05, "loss": 0.012, "step": 660 }, { "epoch": 1.273734939759036, "grad_norm": 0.2138456553220749, "learning_rate": 2.853019169582635e-05, "loss": 0.0092, "step": 661 }, { "epoch": 1.2756626506024096, "grad_norm": 0.2304934412240982, "learning_rate": 2.8489606153892997e-05, "loss": 0.0144, "step": 662 }, { "epoch": 1.277590361445783, "grad_norm": 0.2691061794757843, "learning_rate": 2.8448977923465425e-05, "loss": 0.0121, "step": 663 }, { "epoch": 1.2795180722891566, "grad_norm": 0.35254305601119995, "learning_rate": 2.840830720883555e-05, "loss": 0.0125, "step": 664 }, { "epoch": 1.28144578313253, "grad_norm": 0.36552608013153076, "learning_rate": 2.836759421450893e-05, "loss": 0.021, "step": 665 }, { "epoch": 1.2833734939759036, "grad_norm": 0.37177154421806335, "learning_rate": 2.83268391452037e-05, "loss": 0.0216, "step": 666 }, { "epoch": 1.2853012048192771, "grad_norm": 0.20932547748088837, "learning_rate": 2.828604220584958e-05, "loss": 0.0077, "step": 667 }, { "epoch": 1.2872289156626506, "grad_norm": 0.5158557295799255, "learning_rate": 2.824520360158681e-05, "loss": 0.0394, "step": 668 }, { "epoch": 1.2891566265060241, "grad_norm": 0.22623969614505768, "learning_rate": 2.820432353776515e-05, "loss": 0.0087, "step": 669 }, { "epoch": 1.2910843373493976, "grad_norm": 0.2996046245098114, "learning_rate": 2.8163402219942822e-05, "loss": 0.01, "step": 670 }, { "epoch": 1.2930120481927712, "grad_norm": 0.24957989156246185, "learning_rate": 2.8122439853885488e-05, "loss": 0.0127, "step": 671 }, { "epoch": 1.2949397590361444, "grad_norm": 0.2636559307575226, "learning_rate": 2.8081436645565216e-05, "loss": 0.0128, "step": 672 }, { "epoch": 1.2968674698795182, "grad_norm": 0.3531591296195984, "learning_rate": 2.804039280115944e-05, "loss": 0.0199, "step": 673 }, { "epoch": 1.2987951807228915, "grad_norm": 0.3682299852371216, "learning_rate": 2.7999308527049927e-05, "loss": 0.0088, "step": 674 }, { "epoch": 1.3007228915662652, "grad_norm": 0.19555217027664185, "learning_rate": 2.795818402982174e-05, "loss": 0.0084, "step": 675 }, { "epoch": 1.3026506024096385, "grad_norm": 0.2864912450313568, "learning_rate": 2.7917019516262186e-05, "loss": 0.0154, "step": 676 }, { "epoch": 1.304578313253012, "grad_norm": 0.2211237996816635, "learning_rate": 2.78758151933598e-05, "loss": 0.0078, "step": 677 }, { "epoch": 1.3065060240963855, "grad_norm": 0.13646945357322693, "learning_rate": 2.7834571268303294e-05, "loss": 0.0058, "step": 678 }, { "epoch": 1.308433734939759, "grad_norm": 0.16530285775661469, "learning_rate": 2.779328794848049e-05, "loss": 0.007, "step": 679 }, { "epoch": 1.3103614457831325, "grad_norm": 0.2145693302154541, "learning_rate": 2.7751965441477325e-05, "loss": 0.0203, "step": 680 }, { "epoch": 1.312289156626506, "grad_norm": 0.24273739755153656, "learning_rate": 2.771060395507677e-05, "loss": 0.0106, "step": 681 }, { "epoch": 1.3142168674698795, "grad_norm": 0.20430618524551392, "learning_rate": 2.7669203697257794e-05, "loss": 0.0122, "step": 682 }, { "epoch": 1.316144578313253, "grad_norm": 0.2502615749835968, "learning_rate": 2.7627764876194335e-05, "loss": 0.0101, "step": 683 }, { "epoch": 1.3180722891566266, "grad_norm": 0.287239670753479, "learning_rate": 2.7586287700254214e-05, "loss": 0.0203, "step": 684 }, { "epoch": 1.32, "grad_norm": 0.16239754855632782, "learning_rate": 2.7544772377998147e-05, "loss": 0.0084, "step": 685 }, { "epoch": 1.3219277108433736, "grad_norm": 0.27174142003059387, "learning_rate": 2.7503219118178636e-05, "loss": 0.008, "step": 686 }, { "epoch": 1.3238554216867469, "grad_norm": 0.12878240644931793, "learning_rate": 2.7461628129738954e-05, "loss": 0.0053, "step": 687 }, { "epoch": 1.3257831325301206, "grad_norm": 0.16112515330314636, "learning_rate": 2.7419999621812086e-05, "loss": 0.0059, "step": 688 }, { "epoch": 1.3277108433734939, "grad_norm": 0.2398834228515625, "learning_rate": 2.7378333803719672e-05, "loss": 0.0095, "step": 689 }, { "epoch": 1.3296385542168676, "grad_norm": 0.18516193330287933, "learning_rate": 2.733663088497097e-05, "loss": 0.0071, "step": 690 }, { "epoch": 1.331566265060241, "grad_norm": 0.2974924147129059, "learning_rate": 2.7294891075261785e-05, "loss": 0.0227, "step": 691 }, { "epoch": 1.3334939759036144, "grad_norm": 0.12931054830551147, "learning_rate": 2.7253114584473418e-05, "loss": 0.0039, "step": 692 }, { "epoch": 1.335421686746988, "grad_norm": 0.16319474577903748, "learning_rate": 2.7211301622671623e-05, "loss": 0.008, "step": 693 }, { "epoch": 1.3373493975903614, "grad_norm": 0.27622169256210327, "learning_rate": 2.7169452400105533e-05, "loss": 0.0238, "step": 694 }, { "epoch": 1.339277108433735, "grad_norm": 0.45309779047966003, "learning_rate": 2.712756712720663e-05, "loss": 0.0439, "step": 695 }, { "epoch": 1.3412048192771084, "grad_norm": 0.2469855099916458, "learning_rate": 2.708564601458765e-05, "loss": 0.0085, "step": 696 }, { "epoch": 1.343132530120482, "grad_norm": 0.4245856702327728, "learning_rate": 2.7043689273041535e-05, "loss": 0.0097, "step": 697 }, { "epoch": 1.3450602409638555, "grad_norm": 0.26796087622642517, "learning_rate": 2.7001697113540414e-05, "loss": 0.0119, "step": 698 }, { "epoch": 1.346987951807229, "grad_norm": 0.3569283187389374, "learning_rate": 2.6959669747234482e-05, "loss": 0.0096, "step": 699 }, { "epoch": 1.3489156626506025, "grad_norm": 0.7038524150848389, "learning_rate": 2.6917607385450973e-05, "loss": 0.0317, "step": 700 }, { "epoch": 1.350843373493976, "grad_norm": 0.23568563163280487, "learning_rate": 2.687551023969308e-05, "loss": 0.0112, "step": 701 }, { "epoch": 1.3527710843373493, "grad_norm": 0.20338499546051025, "learning_rate": 2.6833378521638935e-05, "loss": 0.0092, "step": 702 }, { "epoch": 1.354698795180723, "grad_norm": 4.22187614440918, "learning_rate": 2.679121244314046e-05, "loss": 0.0314, "step": 703 }, { "epoch": 1.3566265060240963, "grad_norm": 0.2542206048965454, "learning_rate": 2.674901221622239e-05, "loss": 0.0158, "step": 704 }, { "epoch": 1.3585542168674698, "grad_norm": 0.49705010652542114, "learning_rate": 2.670677805308116e-05, "loss": 0.0162, "step": 705 }, { "epoch": 1.3604819277108433, "grad_norm": 0.17502115666866302, "learning_rate": 2.666451016608383e-05, "loss": 0.0074, "step": 706 }, { "epoch": 1.3624096385542168, "grad_norm": 0.21738742291927338, "learning_rate": 2.6622208767767075e-05, "loss": 0.0135, "step": 707 }, { "epoch": 1.3643373493975903, "grad_norm": 0.3309847414493561, "learning_rate": 2.6579874070836032e-05, "loss": 0.0107, "step": 708 }, { "epoch": 1.3662650602409638, "grad_norm": 0.10706827789545059, "learning_rate": 2.6537506288163303e-05, "loss": 0.0043, "step": 709 }, { "epoch": 1.3681927710843373, "grad_norm": 0.173640176653862, "learning_rate": 2.6495105632787835e-05, "loss": 0.0092, "step": 710 }, { "epoch": 1.3701204819277109, "grad_norm": 0.2636397182941437, "learning_rate": 2.6452672317913893e-05, "loss": 0.0097, "step": 711 }, { "epoch": 1.3720481927710844, "grad_norm": 0.28485360741615295, "learning_rate": 2.6410206556909943e-05, "loss": 0.0193, "step": 712 }, { "epoch": 1.3739759036144579, "grad_norm": 0.23210027813911438, "learning_rate": 2.636770856330761e-05, "loss": 0.0229, "step": 713 }, { "epoch": 1.3759036144578314, "grad_norm": 0.13388316333293915, "learning_rate": 2.6325178550800596e-05, "loss": 0.004, "step": 714 }, { "epoch": 1.377831325301205, "grad_norm": 0.5131422877311707, "learning_rate": 2.6282616733243603e-05, "loss": 0.0137, "step": 715 }, { "epoch": 1.3797590361445784, "grad_norm": 0.3243267834186554, "learning_rate": 2.6240023324651258e-05, "loss": 0.0153, "step": 716 }, { "epoch": 1.3816867469879517, "grad_norm": 0.1440611034631729, "learning_rate": 2.619739853919704e-05, "loss": 0.0031, "step": 717 }, { "epoch": 1.3836144578313254, "grad_norm": 0.30346596240997314, "learning_rate": 2.6154742591212196e-05, "loss": 0.0109, "step": 718 }, { "epoch": 1.3855421686746987, "grad_norm": 0.19109240174293518, "learning_rate": 2.611205569518468e-05, "loss": 0.0094, "step": 719 }, { "epoch": 1.3874698795180722, "grad_norm": 0.28636518120765686, "learning_rate": 2.6069338065758056e-05, "loss": 0.0123, "step": 720 }, { "epoch": 1.3893975903614457, "grad_norm": 0.28083911538124084, "learning_rate": 2.6026589917730416e-05, "loss": 0.0104, "step": 721 }, { "epoch": 1.3913253012048192, "grad_norm": 0.36553966999053955, "learning_rate": 2.5983811466053327e-05, "loss": 0.0143, "step": 722 }, { "epoch": 1.3932530120481927, "grad_norm": 0.23317205905914307, "learning_rate": 2.5941002925830708e-05, "loss": 0.011, "step": 723 }, { "epoch": 1.3951807228915662, "grad_norm": 0.3825171887874603, "learning_rate": 2.589816451231781e-05, "loss": 0.0098, "step": 724 }, { "epoch": 1.3971084337349398, "grad_norm": 0.19916608929634094, "learning_rate": 2.585529644092006e-05, "loss": 0.0094, "step": 725 }, { "epoch": 1.3990361445783133, "grad_norm": 0.19990523159503937, "learning_rate": 2.5812398927192027e-05, "loss": 0.0128, "step": 726 }, { "epoch": 1.4009638554216868, "grad_norm": 0.34662899374961853, "learning_rate": 2.5769472186836347e-05, "loss": 0.0091, "step": 727 }, { "epoch": 1.4028915662650603, "grad_norm": 0.23481112718582153, "learning_rate": 2.5726516435702583e-05, "loss": 0.0154, "step": 728 }, { "epoch": 1.4048192771084338, "grad_norm": 0.1846667379140854, "learning_rate": 2.5683531889786194e-05, "loss": 0.0088, "step": 729 }, { "epoch": 1.4067469879518073, "grad_norm": 0.16717663407325745, "learning_rate": 2.564051876522742e-05, "loss": 0.0083, "step": 730 }, { "epoch": 1.4086746987951808, "grad_norm": 0.4116475284099579, "learning_rate": 2.5597477278310202e-05, "loss": 0.0179, "step": 731 }, { "epoch": 1.410602409638554, "grad_norm": 0.171807661652565, "learning_rate": 2.5554407645461115e-05, "loss": 0.0063, "step": 732 }, { "epoch": 1.4125301204819278, "grad_norm": 0.1954439878463745, "learning_rate": 2.5511310083248243e-05, "loss": 0.017, "step": 733 }, { "epoch": 1.4144578313253011, "grad_norm": 0.37158989906311035, "learning_rate": 2.5468184808380104e-05, "loss": 0.0173, "step": 734 }, { "epoch": 1.4163855421686746, "grad_norm": 0.2001633644104004, "learning_rate": 2.542503203770458e-05, "loss": 0.0165, "step": 735 }, { "epoch": 1.4183132530120481, "grad_norm": 0.45673373341560364, "learning_rate": 2.53818519882078e-05, "loss": 0.0185, "step": 736 }, { "epoch": 1.4202409638554216, "grad_norm": 0.3838701546192169, "learning_rate": 2.5338644877013067e-05, "loss": 0.0134, "step": 737 }, { "epoch": 1.4221686746987952, "grad_norm": 0.32032477855682373, "learning_rate": 2.5295410921379745e-05, "loss": 0.0143, "step": 738 }, { "epoch": 1.4240963855421687, "grad_norm": 0.4594039022922516, "learning_rate": 2.52521503387022e-05, "loss": 0.0193, "step": 739 }, { "epoch": 1.4260240963855422, "grad_norm": 0.3889620900154114, "learning_rate": 2.5208863346508667e-05, "loss": 0.0114, "step": 740 }, { "epoch": 1.4279518072289157, "grad_norm": 0.33153319358825684, "learning_rate": 2.5165550162460203e-05, "loss": 0.0102, "step": 741 }, { "epoch": 1.4298795180722892, "grad_norm": 0.7269518375396729, "learning_rate": 2.5122211004349536e-05, "loss": 0.0215, "step": 742 }, { "epoch": 1.4318072289156627, "grad_norm": 0.31653261184692383, "learning_rate": 2.5078846090100023e-05, "loss": 0.0115, "step": 743 }, { "epoch": 1.4337349397590362, "grad_norm": 0.20620353519916534, "learning_rate": 2.5035455637764518e-05, "loss": 0.0153, "step": 744 }, { "epoch": 1.4356626506024097, "grad_norm": 0.17266008257865906, "learning_rate": 2.4992039865524297e-05, "loss": 0.0069, "step": 745 }, { "epoch": 1.4375903614457832, "grad_norm": 0.24760811030864716, "learning_rate": 2.494859899168795e-05, "loss": 0.0108, "step": 746 }, { "epoch": 1.4395180722891565, "grad_norm": 0.2584865391254425, "learning_rate": 2.4905133234690282e-05, "loss": 0.0095, "step": 747 }, { "epoch": 1.4414457831325302, "grad_norm": 0.48847514390945435, "learning_rate": 2.486164281309122e-05, "loss": 0.0181, "step": 748 }, { "epoch": 1.4433734939759035, "grad_norm": 0.42942047119140625, "learning_rate": 2.4818127945574717e-05, "loss": 0.025, "step": 749 }, { "epoch": 1.445301204819277, "grad_norm": 0.23713800311088562, "learning_rate": 2.4774588850947648e-05, "loss": 0.0085, "step": 750 }, { "epoch": 1.4472289156626506, "grad_norm": 0.8797569870948792, "learning_rate": 2.473102574813871e-05, "loss": 0.0097, "step": 751 }, { "epoch": 1.449156626506024, "grad_norm": 0.2744862735271454, "learning_rate": 2.4687438856197302e-05, "loss": 0.0122, "step": 752 }, { "epoch": 1.4510843373493976, "grad_norm": 0.12747010588645935, "learning_rate": 2.4643828394292478e-05, "loss": 0.0056, "step": 753 }, { "epoch": 1.453012048192771, "grad_norm": 0.37376829981803894, "learning_rate": 2.4600194581711775e-05, "loss": 0.0052, "step": 754 }, { "epoch": 1.4549397590361446, "grad_norm": 0.2536911368370056, "learning_rate": 2.4556537637860176e-05, "loss": 0.0113, "step": 755 }, { "epoch": 1.456867469879518, "grad_norm": 0.25950780510902405, "learning_rate": 2.451285778225894e-05, "loss": 0.0099, "step": 756 }, { "epoch": 1.4587951807228916, "grad_norm": 0.19535955786705017, "learning_rate": 2.4469155234544565e-05, "loss": 0.0069, "step": 757 }, { "epoch": 1.4607228915662651, "grad_norm": 0.22816115617752075, "learning_rate": 2.442543021446764e-05, "loss": 0.0088, "step": 758 }, { "epoch": 1.4626506024096386, "grad_norm": 0.3363986313343048, "learning_rate": 2.4381682941891755e-05, "loss": 0.0182, "step": 759 }, { "epoch": 1.464578313253012, "grad_norm": 0.21492891013622284, "learning_rate": 2.4337913636792382e-05, "loss": 0.0069, "step": 760 }, { "epoch": 1.4665060240963856, "grad_norm": 0.6070862412452698, "learning_rate": 2.429412251925579e-05, "loss": 0.0406, "step": 761 }, { "epoch": 1.468433734939759, "grad_norm": 2.6469690799713135, "learning_rate": 2.425030980947793e-05, "loss": 0.0205, "step": 762 }, { "epoch": 1.4703614457831327, "grad_norm": 0.30909740924835205, "learning_rate": 2.420647572776332e-05, "loss": 0.0136, "step": 763 }, { "epoch": 1.472289156626506, "grad_norm": 0.6639553904533386, "learning_rate": 2.416262049452395e-05, "loss": 0.011, "step": 764 }, { "epoch": 1.4742168674698795, "grad_norm": 0.2919616997241974, "learning_rate": 2.4118744330278147e-05, "loss": 0.0131, "step": 765 }, { "epoch": 1.476144578313253, "grad_norm": 0.5232429504394531, "learning_rate": 2.4074847455649523e-05, "loss": 0.0138, "step": 766 }, { "epoch": 1.4780722891566265, "grad_norm": 5.630630970001221, "learning_rate": 2.403093009136579e-05, "loss": 0.0264, "step": 767 }, { "epoch": 1.48, "grad_norm": 0.33234721422195435, "learning_rate": 2.3986992458257707e-05, "loss": 0.0111, "step": 768 }, { "epoch": 1.4819277108433735, "grad_norm": 0.28444772958755493, "learning_rate": 2.3943034777257945e-05, "loss": 0.0144, "step": 769 }, { "epoch": 1.483855421686747, "grad_norm": 0.16229979693889618, "learning_rate": 2.38990572694e-05, "loss": 0.0062, "step": 770 }, { "epoch": 1.4857831325301205, "grad_norm": 0.27474716305732727, "learning_rate": 2.385506015581704e-05, "loss": 0.0172, "step": 771 }, { "epoch": 1.487710843373494, "grad_norm": 0.246526300907135, "learning_rate": 2.381104365774083e-05, "loss": 0.012, "step": 772 }, { "epoch": 1.4896385542168675, "grad_norm": 0.282047837972641, "learning_rate": 2.37670079965006e-05, "loss": 0.0116, "step": 773 }, { "epoch": 1.491566265060241, "grad_norm": 0.2878139317035675, "learning_rate": 2.3722953393521944e-05, "loss": 0.0147, "step": 774 }, { "epoch": 1.4934939759036143, "grad_norm": 0.5586277842521667, "learning_rate": 2.367888007032571e-05, "loss": 0.0111, "step": 775 }, { "epoch": 1.495421686746988, "grad_norm": 0.562160313129425, "learning_rate": 2.3634788248526846e-05, "loss": 0.0061, "step": 776 }, { "epoch": 1.4973493975903613, "grad_norm": 0.3452005982398987, "learning_rate": 2.3590678149833356e-05, "loss": 0.0205, "step": 777 }, { "epoch": 1.499277108433735, "grad_norm": 0.7757686376571655, "learning_rate": 2.3546549996045114e-05, "loss": 0.0273, "step": 778 }, { "epoch": 1.5012048192771084, "grad_norm": 0.19530551135540009, "learning_rate": 2.3502404009052812e-05, "loss": 0.0083, "step": 779 }, { "epoch": 1.503132530120482, "grad_norm": 0.2586531639099121, "learning_rate": 2.3458240410836775e-05, "loss": 0.0122, "step": 780 }, { "epoch": 1.5050602409638554, "grad_norm": 0.30063286423683167, "learning_rate": 2.3414059423465924e-05, "loss": 0.0083, "step": 781 }, { "epoch": 1.5069879518072289, "grad_norm": 0.18663185834884644, "learning_rate": 2.3369861269096575e-05, "loss": 0.0104, "step": 782 }, { "epoch": 1.5089156626506024, "grad_norm": 0.4405941069126129, "learning_rate": 2.3325646169971416e-05, "loss": 0.0264, "step": 783 }, { "epoch": 1.510843373493976, "grad_norm": 0.2947913110256195, "learning_rate": 2.3281414348418294e-05, "loss": 0.0107, "step": 784 }, { "epoch": 1.5127710843373494, "grad_norm": 0.23813778162002563, "learning_rate": 2.3237166026849158e-05, "loss": 0.0084, "step": 785 }, { "epoch": 1.514698795180723, "grad_norm": 0.33380329608917236, "learning_rate": 2.3192901427758932e-05, "loss": 0.0111, "step": 786 }, { "epoch": 1.5166265060240964, "grad_norm": 0.3736988306045532, "learning_rate": 2.314862077372438e-05, "loss": 0.0135, "step": 787 }, { "epoch": 1.5185542168674697, "grad_norm": 0.3785395920276642, "learning_rate": 2.3104324287402996e-05, "loss": 0.0265, "step": 788 }, { "epoch": 1.5204819277108435, "grad_norm": 0.3359154462814331, "learning_rate": 2.3060012191531885e-05, "loss": 0.0127, "step": 789 }, { "epoch": 1.5224096385542167, "grad_norm": 0.720753014087677, "learning_rate": 2.301568470892664e-05, "loss": 0.0134, "step": 790 }, { "epoch": 1.5243373493975905, "grad_norm": 0.36473193764686584, "learning_rate": 2.297134206248024e-05, "loss": 0.0318, "step": 791 }, { "epoch": 1.5262650602409638, "grad_norm": 0.29987087845802307, "learning_rate": 2.2926984475161884e-05, "loss": 0.008, "step": 792 }, { "epoch": 1.5281927710843375, "grad_norm": 0.2883112132549286, "learning_rate": 2.2882612170015914e-05, "loss": 0.0125, "step": 793 }, { "epoch": 1.5301204819277108, "grad_norm": 0.28983229398727417, "learning_rate": 2.2838225370160682e-05, "loss": 0.0155, "step": 794 }, { "epoch": 1.5320481927710843, "grad_norm": 0.47236886620521545, "learning_rate": 2.2793824298787414e-05, "loss": 0.0132, "step": 795 }, { "epoch": 1.5339759036144578, "grad_norm": 0.8328865170478821, "learning_rate": 2.2749409179159104e-05, "loss": 0.026, "step": 796 }, { "epoch": 1.5359036144578313, "grad_norm": 0.3129172623157501, "learning_rate": 2.2704980234609396e-05, "loss": 0.0099, "step": 797 }, { "epoch": 1.5378313253012048, "grad_norm": 0.22284500300884247, "learning_rate": 2.2660537688541416e-05, "loss": 0.009, "step": 798 }, { "epoch": 1.5397590361445783, "grad_norm": 0.3346405625343323, "learning_rate": 2.2616081764426726e-05, "loss": 0.0077, "step": 799 }, { "epoch": 1.5416867469879518, "grad_norm": 0.2923565208911896, "learning_rate": 2.2571612685804124e-05, "loss": 0.0119, "step": 800 }, { "epoch": 1.5436144578313253, "grad_norm": 0.1921311914920807, "learning_rate": 2.252713067627857e-05, "loss": 0.0083, "step": 801 }, { "epoch": 1.5455421686746988, "grad_norm": 0.23221106827259064, "learning_rate": 2.2482635959520044e-05, "loss": 0.0049, "step": 802 }, { "epoch": 1.5474698795180721, "grad_norm": 0.6340724229812622, "learning_rate": 2.243812875926241e-05, "loss": 0.0273, "step": 803 }, { "epoch": 1.5493975903614459, "grad_norm": 0.2699439823627472, "learning_rate": 2.2393609299302314e-05, "loss": 0.0108, "step": 804 }, { "epoch": 1.5513253012048192, "grad_norm": 0.2005189210176468, "learning_rate": 2.2349077803498052e-05, "loss": 0.0076, "step": 805 }, { "epoch": 1.5532530120481929, "grad_norm": 0.39668548107147217, "learning_rate": 2.230453449576842e-05, "loss": 0.0135, "step": 806 }, { "epoch": 1.5551807228915662, "grad_norm": 0.2406950294971466, "learning_rate": 2.2259979600091635e-05, "loss": 0.0094, "step": 807 }, { "epoch": 1.55710843373494, "grad_norm": 0.30363157391548157, "learning_rate": 2.2215413340504158e-05, "loss": 0.0178, "step": 808 }, { "epoch": 1.5590361445783132, "grad_norm": 0.19508181512355804, "learning_rate": 2.2170835941099605e-05, "loss": 0.0069, "step": 809 }, { "epoch": 1.5609638554216867, "grad_norm": 0.734106719493866, "learning_rate": 2.2126247626027615e-05, "loss": 0.0319, "step": 810 }, { "epoch": 1.5628915662650602, "grad_norm": 0.2591583728790283, "learning_rate": 2.208164861949268e-05, "loss": 0.0168, "step": 811 }, { "epoch": 1.5648192771084337, "grad_norm": 0.2386734038591385, "learning_rate": 2.20370391457531e-05, "loss": 0.0041, "step": 812 }, { "epoch": 1.5667469879518072, "grad_norm": 0.1675218939781189, "learning_rate": 2.1992419429119764e-05, "loss": 0.0078, "step": 813 }, { "epoch": 1.5686746987951807, "grad_norm": 0.45591506361961365, "learning_rate": 2.1947789693955097e-05, "loss": 0.0166, "step": 814 }, { "epoch": 1.5706024096385542, "grad_norm": 0.46940621733665466, "learning_rate": 2.190315016467188e-05, "loss": 0.0176, "step": 815 }, { "epoch": 1.5725301204819278, "grad_norm": 0.2294205278158188, "learning_rate": 2.1858501065732146e-05, "loss": 0.0102, "step": 816 }, { "epoch": 1.5744578313253013, "grad_norm": 0.28922322392463684, "learning_rate": 2.181384262164606e-05, "loss": 0.0111, "step": 817 }, { "epoch": 1.5763855421686745, "grad_norm": 0.19650064408779144, "learning_rate": 2.1769175056970765e-05, "loss": 0.0076, "step": 818 }, { "epoch": 1.5783132530120483, "grad_norm": 0.19538825750350952, "learning_rate": 2.172449859630927e-05, "loss": 0.0118, "step": 819 }, { "epoch": 1.5802409638554216, "grad_norm": 0.1900389939546585, "learning_rate": 2.167981346430931e-05, "loss": 0.0066, "step": 820 }, { "epoch": 1.5821686746987953, "grad_norm": 0.21593710780143738, "learning_rate": 2.1635119885662235e-05, "loss": 0.0101, "step": 821 }, { "epoch": 1.5840963855421686, "grad_norm": 0.2699289321899414, "learning_rate": 2.159041808510185e-05, "loss": 0.0118, "step": 822 }, { "epoch": 1.5860240963855423, "grad_norm": 0.31867673993110657, "learning_rate": 2.1545708287403322e-05, "loss": 0.0122, "step": 823 }, { "epoch": 1.5879518072289156, "grad_norm": 0.2862400412559509, "learning_rate": 2.1500990717382004e-05, "loss": 0.0216, "step": 824 }, { "epoch": 1.589879518072289, "grad_norm": 0.28482481837272644, "learning_rate": 2.145626559989237e-05, "loss": 0.0136, "step": 825 }, { "epoch": 1.5918072289156626, "grad_norm": 0.2866958975791931, "learning_rate": 2.1411533159826803e-05, "loss": 0.0298, "step": 826 }, { "epoch": 1.5937349397590361, "grad_norm": 0.39092838764190674, "learning_rate": 2.1366793622114533e-05, "loss": 0.0382, "step": 827 }, { "epoch": 1.5956626506024096, "grad_norm": 0.16381537914276123, "learning_rate": 2.1322047211720468e-05, "loss": 0.0074, "step": 828 }, { "epoch": 1.5975903614457831, "grad_norm": 0.22146940231323242, "learning_rate": 2.1277294153644083e-05, "loss": 0.0103, "step": 829 }, { "epoch": 1.5995180722891567, "grad_norm": 0.2155209183692932, "learning_rate": 2.123253467291827e-05, "loss": 0.0095, "step": 830 }, { "epoch": 1.6014457831325302, "grad_norm": 0.41510409116744995, "learning_rate": 2.118776899460822e-05, "loss": 0.0457, "step": 831 }, { "epoch": 1.6033734939759037, "grad_norm": 0.19718150794506073, "learning_rate": 2.1142997343810293e-05, "loss": 0.0192, "step": 832 }, { "epoch": 1.605301204819277, "grad_norm": 0.40924403071403503, "learning_rate": 2.1098219945650865e-05, "loss": 0.0278, "step": 833 }, { "epoch": 1.6072289156626507, "grad_norm": 0.18657824397087097, "learning_rate": 2.105343702528524e-05, "loss": 0.0076, "step": 834 }, { "epoch": 1.609156626506024, "grad_norm": 0.1727641075849533, "learning_rate": 2.100864880789645e-05, "loss": 0.0076, "step": 835 }, { "epoch": 1.6110843373493977, "grad_norm": 0.18138745427131653, "learning_rate": 2.0963855518694203e-05, "loss": 0.005, "step": 836 }, { "epoch": 1.613012048192771, "grad_norm": 0.19173955917358398, "learning_rate": 2.0919057382913675e-05, "loss": 0.0084, "step": 837 }, { "epoch": 1.6149397590361447, "grad_norm": 0.3812403380870819, "learning_rate": 2.0874254625814435e-05, "loss": 0.009, "step": 838 }, { "epoch": 1.616867469879518, "grad_norm": 0.2009759545326233, "learning_rate": 2.0829447472679285e-05, "loss": 0.0098, "step": 839 }, { "epoch": 1.6187951807228915, "grad_norm": 0.48703446984291077, "learning_rate": 2.0784636148813124e-05, "loss": 0.0099, "step": 840 }, { "epoch": 1.620722891566265, "grad_norm": 0.28995075821876526, "learning_rate": 2.0739820879541827e-05, "loss": 0.0075, "step": 841 }, { "epoch": 1.6226506024096385, "grad_norm": 0.2130059450864792, "learning_rate": 2.069500189021111e-05, "loss": 0.007, "step": 842 }, { "epoch": 1.624578313253012, "grad_norm": 0.252524733543396, "learning_rate": 2.0650179406185397e-05, "loss": 0.0249, "step": 843 }, { "epoch": 1.6265060240963856, "grad_norm": 0.23069098591804504, "learning_rate": 2.060535365284668e-05, "loss": 0.0084, "step": 844 }, { "epoch": 1.628433734939759, "grad_norm": 0.25051403045654297, "learning_rate": 2.056052485559338e-05, "loss": 0.0071, "step": 845 }, { "epoch": 1.6303614457831326, "grad_norm": 0.27664798498153687, "learning_rate": 2.051569323983924e-05, "loss": 0.0198, "step": 846 }, { "epoch": 1.632289156626506, "grad_norm": 0.2954922318458557, "learning_rate": 2.047085903101218e-05, "loss": 0.006, "step": 847 }, { "epoch": 1.6342168674698794, "grad_norm": 0.28477591276168823, "learning_rate": 2.0426022454553137e-05, "loss": 0.0147, "step": 848 }, { "epoch": 1.636144578313253, "grad_norm": 0.2785305678844452, "learning_rate": 2.0381183735914968e-05, "loss": 0.0117, "step": 849 }, { "epoch": 1.6380722891566264, "grad_norm": 0.2500309348106384, "learning_rate": 2.0336343100561295e-05, "loss": 0.008, "step": 850 }, { "epoch": 1.6400000000000001, "grad_norm": 0.18932047486305237, "learning_rate": 2.0291500773965392e-05, "loss": 0.0256, "step": 851 }, { "epoch": 1.6419277108433734, "grad_norm": 0.6396257877349854, "learning_rate": 2.0246656981609013e-05, "loss": 0.0141, "step": 852 }, { "epoch": 1.6438554216867471, "grad_norm": 0.5072891116142273, "learning_rate": 2.02018119489813e-05, "loss": 0.008, "step": 853 }, { "epoch": 1.6457831325301204, "grad_norm": 0.2920839488506317, "learning_rate": 2.0156965901577635e-05, "loss": 0.0085, "step": 854 }, { "epoch": 1.647710843373494, "grad_norm": 0.1391262263059616, "learning_rate": 2.011211906489848e-05, "loss": 0.0078, "step": 855 }, { "epoch": 1.6496385542168674, "grad_norm": 0.29620468616485596, "learning_rate": 2.00672716644483e-05, "loss": 0.0109, "step": 856 }, { "epoch": 1.651566265060241, "grad_norm": 0.13946573436260223, "learning_rate": 2.002242392573436e-05, "loss": 0.0076, "step": 857 }, { "epoch": 1.6534939759036145, "grad_norm": 0.9766128659248352, "learning_rate": 1.997757607426565e-05, "loss": 0.0309, "step": 858 }, { "epoch": 1.655421686746988, "grad_norm": 0.18002203106880188, "learning_rate": 1.9932728335551702e-05, "loss": 0.0072, "step": 859 }, { "epoch": 1.6573493975903615, "grad_norm": 0.28073111176490784, "learning_rate": 1.988788093510152e-05, "loss": 0.0246, "step": 860 }, { "epoch": 1.659277108433735, "grad_norm": 0.1919957399368286, "learning_rate": 1.9843034098422375e-05, "loss": 0.0087, "step": 861 }, { "epoch": 1.6612048192771085, "grad_norm": 0.1825258433818817, "learning_rate": 1.9798188051018705e-05, "loss": 0.0092, "step": 862 }, { "epoch": 1.6631325301204818, "grad_norm": 0.32412952184677124, "learning_rate": 1.9753343018390997e-05, "loss": 0.0118, "step": 863 }, { "epoch": 1.6650602409638555, "grad_norm": 0.12828563153743744, "learning_rate": 1.9708499226034618e-05, "loss": 0.0056, "step": 864 }, { "epoch": 1.6669879518072288, "grad_norm": 0.18647560477256775, "learning_rate": 1.966365689943871e-05, "loss": 0.0094, "step": 865 }, { "epoch": 1.6689156626506025, "grad_norm": 0.19835828244686127, "learning_rate": 1.9618816264085042e-05, "loss": 0.0097, "step": 866 }, { "epoch": 1.6708433734939758, "grad_norm": 0.22364282608032227, "learning_rate": 1.957397754544687e-05, "loss": 0.0062, "step": 867 }, { "epoch": 1.6727710843373496, "grad_norm": 0.29420018196105957, "learning_rate": 1.952914096898783e-05, "loss": 0.0182, "step": 868 }, { "epoch": 1.6746987951807228, "grad_norm": 0.2149929702281952, "learning_rate": 1.9484306760160766e-05, "loss": 0.0125, "step": 869 }, { "epoch": 1.6766265060240964, "grad_norm": 0.16844330728054047, "learning_rate": 1.9439475144406623e-05, "loss": 0.0074, "step": 870 }, { "epoch": 1.6785542168674699, "grad_norm": 0.5010282397270203, "learning_rate": 1.9394646347153334e-05, "loss": 0.0213, "step": 871 }, { "epoch": 1.6804819277108434, "grad_norm": 0.29847195744514465, "learning_rate": 1.9349820593814606e-05, "loss": 0.0173, "step": 872 }, { "epoch": 1.6824096385542169, "grad_norm": 0.23835812509059906, "learning_rate": 1.930499810978889e-05, "loss": 0.011, "step": 873 }, { "epoch": 1.6843373493975904, "grad_norm": 0.3269020617008209, "learning_rate": 1.9260179120458177e-05, "loss": 0.0285, "step": 874 }, { "epoch": 1.686265060240964, "grad_norm": 0.2142144739627838, "learning_rate": 1.9215363851186883e-05, "loss": 0.0146, "step": 875 }, { "epoch": 1.6881927710843372, "grad_norm": 0.3098377585411072, "learning_rate": 1.9170552527320725e-05, "loss": 0.0104, "step": 876 }, { "epoch": 1.690120481927711, "grad_norm": 0.22504115104675293, "learning_rate": 1.9125745374185568e-05, "loss": 0.0091, "step": 877 }, { "epoch": 1.6920481927710842, "grad_norm": 0.20633333921432495, "learning_rate": 1.908094261708633e-05, "loss": 0.0097, "step": 878 }, { "epoch": 1.693975903614458, "grad_norm": 1.179566502571106, "learning_rate": 1.9036144481305807e-05, "loss": 0.0143, "step": 879 }, { "epoch": 1.6959036144578312, "grad_norm": 0.15525613725185394, "learning_rate": 1.8991351192103554e-05, "loss": 0.0062, "step": 880 }, { "epoch": 1.697831325301205, "grad_norm": 0.15966367721557617, "learning_rate": 1.8946562974714763e-05, "loss": 0.0048, "step": 881 }, { "epoch": 1.6997590361445782, "grad_norm": 0.18902607262134552, "learning_rate": 1.890178005434914e-05, "loss": 0.0124, "step": 882 }, { "epoch": 1.701686746987952, "grad_norm": 0.21692413091659546, "learning_rate": 1.885700265618971e-05, "loss": 0.0135, "step": 883 }, { "epoch": 1.7036144578313253, "grad_norm": 0.38948455452919006, "learning_rate": 1.8812231005391786e-05, "loss": 0.0365, "step": 884 }, { "epoch": 1.7055421686746988, "grad_norm": 0.2483491599559784, "learning_rate": 1.8767465327081736e-05, "loss": 0.0202, "step": 885 }, { "epoch": 1.7074698795180723, "grad_norm": 0.15305832028388977, "learning_rate": 1.872270584635592e-05, "loss": 0.0035, "step": 886 }, { "epoch": 1.7093975903614458, "grad_norm": 0.17794466018676758, "learning_rate": 1.867795278827954e-05, "loss": 0.0157, "step": 887 }, { "epoch": 1.7113253012048193, "grad_norm": 0.1938813328742981, "learning_rate": 1.863320637788547e-05, "loss": 0.0071, "step": 888 }, { "epoch": 1.7132530120481928, "grad_norm": 0.27061617374420166, "learning_rate": 1.8588466840173207e-05, "loss": 0.0347, "step": 889 }, { "epoch": 1.7151807228915663, "grad_norm": 0.1541014313697815, "learning_rate": 1.8543734400107637e-05, "loss": 0.006, "step": 890 }, { "epoch": 1.7171084337349396, "grad_norm": 0.1436876654624939, "learning_rate": 1.8499009282617996e-05, "loss": 0.0059, "step": 891 }, { "epoch": 1.7190361445783133, "grad_norm": 1.0573723316192627, "learning_rate": 1.8454291712596688e-05, "loss": 0.008, "step": 892 }, { "epoch": 1.7209638554216866, "grad_norm": 0.15406259894371033, "learning_rate": 1.8409581914898157e-05, "loss": 0.0061, "step": 893 }, { "epoch": 1.7228915662650603, "grad_norm": 0.24822913110256195, "learning_rate": 1.836488011433777e-05, "loss": 0.0085, "step": 894 }, { "epoch": 1.7248192771084336, "grad_norm": 0.21049316227436066, "learning_rate": 1.83201865356907e-05, "loss": 0.0075, "step": 895 }, { "epoch": 1.7267469879518074, "grad_norm": 0.24159866571426392, "learning_rate": 1.8275501403690733e-05, "loss": 0.0156, "step": 896 }, { "epoch": 1.7286746987951807, "grad_norm": 0.3191063106060028, "learning_rate": 1.823082494302924e-05, "loss": 0.0218, "step": 897 }, { "epoch": 1.7306024096385542, "grad_norm": 0.20296362042427063, "learning_rate": 1.8186157378353945e-05, "loss": 0.0126, "step": 898 }, { "epoch": 1.7325301204819277, "grad_norm": 0.1905524581670761, "learning_rate": 1.8141498934267858e-05, "loss": 0.0131, "step": 899 }, { "epoch": 1.7344578313253012, "grad_norm": 0.5350520610809326, "learning_rate": 1.809684983532813e-05, "loss": 0.0115, "step": 900 }, { "epoch": 1.7363855421686747, "grad_norm": 0.17144092917442322, "learning_rate": 1.8052210306044907e-05, "loss": 0.0113, "step": 901 }, { "epoch": 1.7383132530120482, "grad_norm": 0.11777982115745544, "learning_rate": 1.8007580570880236e-05, "loss": 0.0058, "step": 902 }, { "epoch": 1.7402409638554217, "grad_norm": 0.2078275978565216, "learning_rate": 1.7962960854246908e-05, "loss": 0.0106, "step": 903 }, { "epoch": 1.7421686746987952, "grad_norm": 0.2550877630710602, "learning_rate": 1.791835138050732e-05, "loss": 0.0076, "step": 904 }, { "epoch": 1.7440963855421687, "grad_norm": 0.11553912609815598, "learning_rate": 1.7873752373972395e-05, "loss": 0.0038, "step": 905 }, { "epoch": 1.746024096385542, "grad_norm": 0.10724586248397827, "learning_rate": 1.7829164058900398e-05, "loss": 0.0043, "step": 906 }, { "epoch": 1.7479518072289157, "grad_norm": 0.30152231454849243, "learning_rate": 1.7784586659495845e-05, "loss": 0.0099, "step": 907 }, { "epoch": 1.749879518072289, "grad_norm": 0.18372933566570282, "learning_rate": 1.7740020399908372e-05, "loss": 0.0074, "step": 908 }, { "epoch": 1.7518072289156628, "grad_norm": 0.35184428095817566, "learning_rate": 1.7695465504231586e-05, "loss": 0.0184, "step": 909 }, { "epoch": 1.753734939759036, "grad_norm": 0.15083615481853485, "learning_rate": 1.765092219650196e-05, "loss": 0.0061, "step": 910 }, { "epoch": 1.7556626506024098, "grad_norm": 0.2599961459636688, "learning_rate": 1.7606390700697693e-05, "loss": 0.0101, "step": 911 }, { "epoch": 1.757590361445783, "grad_norm": 0.10829206556081772, "learning_rate": 1.7561871240737595e-05, "loss": 0.0034, "step": 912 }, { "epoch": 1.7595180722891566, "grad_norm": 0.38098782300949097, "learning_rate": 1.7517364040479966e-05, "loss": 0.0384, "step": 913 }, { "epoch": 1.76144578313253, "grad_norm": 0.14975085854530334, "learning_rate": 1.7472869323721432e-05, "loss": 0.0055, "step": 914 }, { "epoch": 1.7633734939759036, "grad_norm": 0.4151444733142853, "learning_rate": 1.742838731419588e-05, "loss": 0.0307, "step": 915 }, { "epoch": 1.765301204819277, "grad_norm": 0.22238481044769287, "learning_rate": 1.738391823557328e-05, "loss": 0.0059, "step": 916 }, { "epoch": 1.7672289156626506, "grad_norm": 0.23386356234550476, "learning_rate": 1.7339462311458587e-05, "loss": 0.0113, "step": 917 }, { "epoch": 1.7691566265060241, "grad_norm": 0.21911191940307617, "learning_rate": 1.7295019765390618e-05, "loss": 0.0071, "step": 918 }, { "epoch": 1.7710843373493976, "grad_norm": 0.343159943819046, "learning_rate": 1.7250590820840903e-05, "loss": 0.0144, "step": 919 }, { "epoch": 1.7730120481927711, "grad_norm": 0.32204556465148926, "learning_rate": 1.720617570121259e-05, "loss": 0.0131, "step": 920 }, { "epoch": 1.7749397590361444, "grad_norm": 0.4105585515499115, "learning_rate": 1.7161774629839328e-05, "loss": 0.0148, "step": 921 }, { "epoch": 1.7768674698795182, "grad_norm": 0.16380974650382996, "learning_rate": 1.7117387829984093e-05, "loss": 0.0066, "step": 922 }, { "epoch": 1.7787951807228914, "grad_norm": 0.22920913994312286, "learning_rate": 1.707301552483813e-05, "loss": 0.0105, "step": 923 }, { "epoch": 1.7807228915662652, "grad_norm": 0.2075149267911911, "learning_rate": 1.7028657937519767e-05, "loss": 0.0104, "step": 924 }, { "epoch": 1.7826506024096385, "grad_norm": 0.44439977407455444, "learning_rate": 1.6984315291073355e-05, "loss": 0.0134, "step": 925 }, { "epoch": 1.7845783132530122, "grad_norm": 0.24068203568458557, "learning_rate": 1.6939987808468125e-05, "loss": 0.0078, "step": 926 }, { "epoch": 1.7865060240963855, "grad_norm": 0.34044349193573, "learning_rate": 1.689567571259701e-05, "loss": 0.0108, "step": 927 }, { "epoch": 1.788433734939759, "grad_norm": 0.34082743525505066, "learning_rate": 1.6851379226275624e-05, "loss": 0.0266, "step": 928 }, { "epoch": 1.7903614457831325, "grad_norm": 0.19490115344524384, "learning_rate": 1.6807098572241075e-05, "loss": 0.0109, "step": 929 }, { "epoch": 1.792289156626506, "grad_norm": 0.16208237409591675, "learning_rate": 1.6762833973150846e-05, "loss": 0.0113, "step": 930 }, { "epoch": 1.7942168674698795, "grad_norm": 0.35555699467658997, "learning_rate": 1.671858565158172e-05, "loss": 0.0196, "step": 931 }, { "epoch": 1.796144578313253, "grad_norm": 0.1600857824087143, "learning_rate": 1.6674353830028587e-05, "loss": 0.0089, "step": 932 }, { "epoch": 1.7980722891566265, "grad_norm": 0.1699574887752533, "learning_rate": 1.663013873090342e-05, "loss": 0.0074, "step": 933 }, { "epoch": 1.8, "grad_norm": 0.2472933828830719, "learning_rate": 1.6585940576534086e-05, "loss": 0.0063, "step": 934 }, { "epoch": 1.8019277108433736, "grad_norm": 0.23491555452346802, "learning_rate": 1.654175958916323e-05, "loss": 0.0101, "step": 935 }, { "epoch": 1.8038554216867468, "grad_norm": 0.28635191917419434, "learning_rate": 1.6497595990947195e-05, "loss": 0.0131, "step": 936 }, { "epoch": 1.8057831325301206, "grad_norm": 0.15400712192058563, "learning_rate": 1.645345000395489e-05, "loss": 0.0068, "step": 937 }, { "epoch": 1.8077108433734939, "grad_norm": 0.18223172426223755, "learning_rate": 1.6409321850166647e-05, "loss": 0.0094, "step": 938 }, { "epoch": 1.8096385542168676, "grad_norm": 0.2789457142353058, "learning_rate": 1.636521175147316e-05, "loss": 0.0202, "step": 939 }, { "epoch": 1.8115662650602409, "grad_norm": 0.4267627000808716, "learning_rate": 1.6321119929674297e-05, "loss": 0.0176, "step": 940 }, { "epoch": 1.8134939759036146, "grad_norm": 0.3021615445613861, "learning_rate": 1.6277046606478056e-05, "loss": 0.0085, "step": 941 }, { "epoch": 1.815421686746988, "grad_norm": 0.3724934756755829, "learning_rate": 1.6232992003499405e-05, "loss": 0.0474, "step": 942 }, { "epoch": 1.8173493975903614, "grad_norm": 0.20904326438903809, "learning_rate": 1.6188956342259177e-05, "loss": 0.0078, "step": 943 }, { "epoch": 1.819277108433735, "grad_norm": 0.31168171763420105, "learning_rate": 1.614493984418297e-05, "loss": 0.0174, "step": 944 }, { "epoch": 1.8212048192771084, "grad_norm": 0.21273556351661682, "learning_rate": 1.6100942730600003e-05, "loss": 0.0054, "step": 945 }, { "epoch": 1.823132530120482, "grad_norm": 0.16991695761680603, "learning_rate": 1.6056965222742055e-05, "loss": 0.0063, "step": 946 }, { "epoch": 1.8250602409638554, "grad_norm": 0.22762684524059296, "learning_rate": 1.6013007541742303e-05, "loss": 0.0234, "step": 947 }, { "epoch": 1.826987951807229, "grad_norm": 0.20128795504570007, "learning_rate": 1.596906990863422e-05, "loss": 0.0095, "step": 948 }, { "epoch": 1.8289156626506025, "grad_norm": 0.30772027373313904, "learning_rate": 1.592515254435048e-05, "loss": 0.0356, "step": 949 }, { "epoch": 1.830843373493976, "grad_norm": 0.12954631447792053, "learning_rate": 1.5881255669721857e-05, "loss": 0.008, "step": 950 }, { "epoch": 1.8327710843373493, "grad_norm": 0.7787145972251892, "learning_rate": 1.5837379505476054e-05, "loss": 0.0108, "step": 951 }, { "epoch": 1.834698795180723, "grad_norm": 0.1683879941701889, "learning_rate": 1.5793524272236683e-05, "loss": 0.006, "step": 952 }, { "epoch": 1.8366265060240963, "grad_norm": 0.16475361585617065, "learning_rate": 1.5749690190522076e-05, "loss": 0.0065, "step": 953 }, { "epoch": 1.83855421686747, "grad_norm": 0.211905375123024, "learning_rate": 1.5705877480744214e-05, "loss": 0.0092, "step": 954 }, { "epoch": 1.8404819277108433, "grad_norm": 0.23850117623806, "learning_rate": 1.5662086363207628e-05, "loss": 0.012, "step": 955 }, { "epoch": 1.842409638554217, "grad_norm": 0.19100065529346466, "learning_rate": 1.561831705810825e-05, "loss": 0.0113, "step": 956 }, { "epoch": 1.8443373493975903, "grad_norm": 0.3635985255241394, "learning_rate": 1.557456978553236e-05, "loss": 0.0168, "step": 957 }, { "epoch": 1.8462650602409638, "grad_norm": 0.16449116170406342, "learning_rate": 1.553084476545544e-05, "loss": 0.0042, "step": 958 }, { "epoch": 1.8481927710843373, "grad_norm": 0.566093385219574, "learning_rate": 1.5487142217741062e-05, "loss": 0.0145, "step": 959 }, { "epoch": 1.8501204819277108, "grad_norm": 0.15960252285003662, "learning_rate": 1.5443462362139834e-05, "loss": 0.0059, "step": 960 }, { "epoch": 1.8520481927710843, "grad_norm": 0.40773797035217285, "learning_rate": 1.539980541828823e-05, "loss": 0.0257, "step": 961 }, { "epoch": 1.8539759036144579, "grad_norm": 0.4802496135234833, "learning_rate": 1.5356171605707522e-05, "loss": 0.0111, "step": 962 }, { "epoch": 1.8559036144578314, "grad_norm": 0.15745794773101807, "learning_rate": 1.5312561143802704e-05, "loss": 0.0049, "step": 963 }, { "epoch": 1.8578313253012049, "grad_norm": 0.15139251947402954, "learning_rate": 1.5268974251861298e-05, "loss": 0.0077, "step": 964 }, { "epoch": 1.8597590361445784, "grad_norm": 0.2188841849565506, "learning_rate": 1.5225411149052356e-05, "loss": 0.017, "step": 965 }, { "epoch": 1.8616867469879517, "grad_norm": 0.10853131115436554, "learning_rate": 1.5181872054425287e-05, "loss": 0.0049, "step": 966 }, { "epoch": 1.8636144578313254, "grad_norm": 0.8254880905151367, "learning_rate": 1.5138357186908785e-05, "loss": 0.0317, "step": 967 }, { "epoch": 1.8655421686746987, "grad_norm": 0.2989620566368103, "learning_rate": 1.5094866765309728e-05, "loss": 0.0126, "step": 968 }, { "epoch": 1.8674698795180724, "grad_norm": 0.16411150991916656, "learning_rate": 1.5051401008312054e-05, "loss": 0.0101, "step": 969 }, { "epoch": 1.8693975903614457, "grad_norm": 0.2861763834953308, "learning_rate": 1.5007960134475706e-05, "loss": 0.0155, "step": 970 }, { "epoch": 1.8713253012048194, "grad_norm": 0.24879588186740875, "learning_rate": 1.4964544362235487e-05, "loss": 0.0187, "step": 971 }, { "epoch": 1.8732530120481927, "grad_norm": 0.2433672398328781, "learning_rate": 1.4921153909899983e-05, "loss": 0.0084, "step": 972 }, { "epoch": 1.8751807228915662, "grad_norm": 0.15097154676914215, "learning_rate": 1.487778899565047e-05, "loss": 0.007, "step": 973 }, { "epoch": 1.8771084337349397, "grad_norm": 0.1629047691822052, "learning_rate": 1.4834449837539806e-05, "loss": 0.0058, "step": 974 }, { "epoch": 1.8790361445783132, "grad_norm": 0.9937071204185486, "learning_rate": 1.4791136653491333e-05, "loss": 0.0323, "step": 975 }, { "epoch": 1.8809638554216868, "grad_norm": 0.19555562734603882, "learning_rate": 1.4747849661297808e-05, "loss": 0.0126, "step": 976 }, { "epoch": 1.8828915662650603, "grad_norm": 0.16147711873054504, "learning_rate": 1.470458907862026e-05, "loss": 0.0067, "step": 977 }, { "epoch": 1.8848192771084338, "grad_norm": 0.2730027735233307, "learning_rate": 1.4661355122986945e-05, "loss": 0.0147, "step": 978 }, { "epoch": 1.886746987951807, "grad_norm": 0.13759832084178925, "learning_rate": 1.4618148011792206e-05, "loss": 0.0038, "step": 979 }, { "epoch": 1.8886746987951808, "grad_norm": 0.33516690135002136, "learning_rate": 1.4574967962295419e-05, "loss": 0.0139, "step": 980 }, { "epoch": 1.890602409638554, "grad_norm": 0.2345741093158722, "learning_rate": 1.4531815191619903e-05, "loss": 0.0094, "step": 981 }, { "epoch": 1.8925301204819278, "grad_norm": 0.14681044220924377, "learning_rate": 1.4488689916751762e-05, "loss": 0.0065, "step": 982 }, { "epoch": 1.894457831325301, "grad_norm": 0.21143914759159088, "learning_rate": 1.4445592354538885e-05, "loss": 0.0057, "step": 983 }, { "epoch": 1.8963855421686748, "grad_norm": 0.3109160363674164, "learning_rate": 1.44025227216898e-05, "loss": 0.0142, "step": 984 }, { "epoch": 1.8983132530120481, "grad_norm": 0.24301907420158386, "learning_rate": 1.435948123477259e-05, "loss": 0.012, "step": 985 }, { "epoch": 1.9002409638554218, "grad_norm": 0.19817675650119781, "learning_rate": 1.431646811021382e-05, "loss": 0.0097, "step": 986 }, { "epoch": 1.9021686746987951, "grad_norm": 0.13464932143688202, "learning_rate": 1.4273483564297425e-05, "loss": 0.0046, "step": 987 }, { "epoch": 1.9040963855421686, "grad_norm": 0.1698642522096634, "learning_rate": 1.4230527813163656e-05, "loss": 0.0038, "step": 988 }, { "epoch": 1.9060240963855422, "grad_norm": 0.19395388662815094, "learning_rate": 1.4187601072807975e-05, "loss": 0.0123, "step": 989 }, { "epoch": 1.9079518072289157, "grad_norm": 0.2093188613653183, "learning_rate": 1.4144703559079948e-05, "loss": 0.0093, "step": 990 }, { "epoch": 1.9098795180722892, "grad_norm": 0.1529311090707779, "learning_rate": 1.4101835487682198e-05, "loss": 0.0051, "step": 991 }, { "epoch": 1.9118072289156627, "grad_norm": 0.18725350499153137, "learning_rate": 1.4058997074169299e-05, "loss": 0.0083, "step": 992 }, { "epoch": 1.9137349397590362, "grad_norm": 0.15601560473442078, "learning_rate": 1.401618853394668e-05, "loss": 0.0086, "step": 993 }, { "epoch": 1.9156626506024095, "grad_norm": 0.23890644311904907, "learning_rate": 1.3973410082269591e-05, "loss": 0.015, "step": 994 }, { "epoch": 1.9175903614457832, "grad_norm": 0.2442619949579239, "learning_rate": 1.3930661934241947e-05, "loss": 0.0089, "step": 995 }, { "epoch": 1.9195180722891565, "grad_norm": 0.1540212482213974, "learning_rate": 1.388794430481532e-05, "loss": 0.0072, "step": 996 }, { "epoch": 1.9214457831325302, "grad_norm": 0.1359291970729828, "learning_rate": 1.3845257408787807e-05, "loss": 0.0131, "step": 997 }, { "epoch": 1.9233734939759035, "grad_norm": 0.25486138463020325, "learning_rate": 1.3802601460802967e-05, "loss": 0.0198, "step": 998 }, { "epoch": 1.9253012048192772, "grad_norm": 0.28815609216690063, "learning_rate": 1.3759976675348754e-05, "loss": 0.014, "step": 999 }, { "epoch": 1.9272289156626505, "grad_norm": 0.15648497641086578, "learning_rate": 1.3717383266756403e-05, "loss": 0.0065, "step": 1000 }, { "epoch": 1.929156626506024, "grad_norm": 0.16912540793418884, "learning_rate": 1.367482144919941e-05, "loss": 0.0059, "step": 1001 }, { "epoch": 1.9310843373493976, "grad_norm": 0.16896723210811615, "learning_rate": 1.3632291436692397e-05, "loss": 0.0054, "step": 1002 }, { "epoch": 1.933012048192771, "grad_norm": 0.20287497341632843, "learning_rate": 1.3589793443090064e-05, "loss": 0.0097, "step": 1003 }, { "epoch": 1.9349397590361446, "grad_norm": 0.14804276823997498, "learning_rate": 1.3547327682086114e-05, "loss": 0.0125, "step": 1004 }, { "epoch": 1.936867469879518, "grad_norm": 0.23820064961910248, "learning_rate": 1.3504894367212171e-05, "loss": 0.0131, "step": 1005 }, { "epoch": 1.9387951807228916, "grad_norm": 0.25607362389564514, "learning_rate": 1.34624937118367e-05, "loss": 0.0115, "step": 1006 }, { "epoch": 1.940722891566265, "grad_norm": 0.37233737111091614, "learning_rate": 1.3420125929163976e-05, "loss": 0.0309, "step": 1007 }, { "epoch": 1.9426506024096386, "grad_norm": 0.19426730275154114, "learning_rate": 1.3377791232232929e-05, "loss": 0.0078, "step": 1008 }, { "epoch": 1.944578313253012, "grad_norm": 0.2784160077571869, "learning_rate": 1.333548983391617e-05, "loss": 0.0142, "step": 1009 }, { "epoch": 1.9465060240963856, "grad_norm": 0.11407195776700974, "learning_rate": 1.3293221946918853e-05, "loss": 0.0035, "step": 1010 }, { "epoch": 1.948433734939759, "grad_norm": 0.3965436816215515, "learning_rate": 1.325098778377762e-05, "loss": 0.0242, "step": 1011 }, { "epoch": 1.9503614457831326, "grad_norm": 0.18520519137382507, "learning_rate": 1.3208787556859543e-05, "loss": 0.0096, "step": 1012 }, { "epoch": 1.952289156626506, "grad_norm": 0.2783315181732178, "learning_rate": 1.3166621478361075e-05, "loss": 0.0103, "step": 1013 }, { "epoch": 1.9542168674698797, "grad_norm": 0.22714459896087646, "learning_rate": 1.3124489760306917e-05, "loss": 0.0078, "step": 1014 }, { "epoch": 1.956144578313253, "grad_norm": 0.1257915049791336, "learning_rate": 1.3082392614549036e-05, "loss": 0.0077, "step": 1015 }, { "epoch": 1.9580722891566265, "grad_norm": 0.15592887997627258, "learning_rate": 1.3040330252765526e-05, "loss": 0.0106, "step": 1016 }, { "epoch": 1.96, "grad_norm": 0.19295449554920197, "learning_rate": 1.2998302886459586e-05, "loss": 0.0082, "step": 1017 }, { "epoch": 1.9619277108433735, "grad_norm": 0.15544794499874115, "learning_rate": 1.2956310726958472e-05, "loss": 0.0068, "step": 1018 }, { "epoch": 1.963855421686747, "grad_norm": 0.25899502635002136, "learning_rate": 1.291435398541236e-05, "loss": 0.0086, "step": 1019 }, { "epoch": 1.9657831325301205, "grad_norm": 0.34639033675193787, "learning_rate": 1.2872432872793379e-05, "loss": 0.0116, "step": 1020 }, { "epoch": 1.967710843373494, "grad_norm": 0.1628410518169403, "learning_rate": 1.283054759989447e-05, "loss": 0.0055, "step": 1021 }, { "epoch": 1.9696385542168675, "grad_norm": 0.9273788928985596, "learning_rate": 1.2788698377328385e-05, "loss": 0.0264, "step": 1022 }, { "epoch": 1.971566265060241, "grad_norm": 0.163126140832901, "learning_rate": 1.2746885415526594e-05, "loss": 0.0046, "step": 1023 }, { "epoch": 1.9734939759036143, "grad_norm": 0.1475439816713333, "learning_rate": 1.2705108924738223e-05, "loss": 0.0056, "step": 1024 }, { "epoch": 1.975421686746988, "grad_norm": 0.1654318869113922, "learning_rate": 1.2663369115029034e-05, "loss": 0.0056, "step": 1025 }, { "epoch": 1.9773493975903613, "grad_norm": 0.20536045730113983, "learning_rate": 1.2621666196280333e-05, "loss": 0.0101, "step": 1026 }, { "epoch": 1.979277108433735, "grad_norm": 0.19256474077701569, "learning_rate": 1.258000037818792e-05, "loss": 0.0059, "step": 1027 }, { "epoch": 1.9812048192771083, "grad_norm": 0.2605120539665222, "learning_rate": 1.2538371870261053e-05, "loss": 0.0115, "step": 1028 }, { "epoch": 1.983132530120482, "grad_norm": 0.14840295910835266, "learning_rate": 1.249678088182137e-05, "loss": 0.0046, "step": 1029 }, { "epoch": 1.9850602409638554, "grad_norm": 0.17585207521915436, "learning_rate": 1.2455227622001851e-05, "loss": 0.0086, "step": 1030 }, { "epoch": 1.9869879518072289, "grad_norm": 0.11044781655073166, "learning_rate": 1.241371229974579e-05, "loss": 0.0034, "step": 1031 }, { "epoch": 1.9889156626506024, "grad_norm": 0.25584840774536133, "learning_rate": 1.2372235123805672e-05, "loss": 0.0245, "step": 1032 }, { "epoch": 1.9908433734939759, "grad_norm": 0.25962474942207336, "learning_rate": 1.2330796302742211e-05, "loss": 0.0104, "step": 1033 }, { "epoch": 1.9927710843373494, "grad_norm": 0.33408522605895996, "learning_rate": 1.2289396044923238e-05, "loss": 0.0176, "step": 1034 }, { "epoch": 1.994698795180723, "grad_norm": 0.479950487613678, "learning_rate": 1.2248034558522682e-05, "loss": 0.0113, "step": 1035 }, { "epoch": 1.9966265060240964, "grad_norm": 0.16567294299602509, "learning_rate": 1.2206712051519518e-05, "loss": 0.0036, "step": 1036 }, { "epoch": 1.99855421686747, "grad_norm": 0.19343771040439606, "learning_rate": 1.2165428731696713e-05, "loss": 0.0077, "step": 1037 }, { "epoch": 2.0, "grad_norm": 0.22895601391792297, "learning_rate": 1.2124184806640202e-05, "loss": 0.0114, "step": 1038 }, { "epoch": 2.0019277108433733, "grad_norm": 0.15838384628295898, "learning_rate": 1.208298048373782e-05, "loss": 0.0043, "step": 1039 }, { "epoch": 2.003855421686747, "grad_norm": 0.681065559387207, "learning_rate": 1.2041815970178268e-05, "loss": 0.0214, "step": 1040 }, { "epoch": 2.0057831325301203, "grad_norm": 0.3357350528240204, "learning_rate": 1.2000691472950081e-05, "loss": 0.0079, "step": 1041 }, { "epoch": 2.007710843373494, "grad_norm": 0.15238308906555176, "learning_rate": 1.1959607198840568e-05, "loss": 0.0041, "step": 1042 }, { "epoch": 2.0096385542168673, "grad_norm": 0.11763229966163635, "learning_rate": 1.1918563354434784e-05, "loss": 0.0033, "step": 1043 }, { "epoch": 2.011566265060241, "grad_norm": 0.3759301006793976, "learning_rate": 1.1877560146114515e-05, "loss": 0.0128, "step": 1044 }, { "epoch": 2.0134939759036143, "grad_norm": 0.1143188625574112, "learning_rate": 1.1836597780057183e-05, "loss": 0.0078, "step": 1045 }, { "epoch": 2.015421686746988, "grad_norm": 0.20059260725975037, "learning_rate": 1.179567646223485e-05, "loss": 0.0149, "step": 1046 }, { "epoch": 2.0173493975903614, "grad_norm": 0.15569567680358887, "learning_rate": 1.1754796398413196e-05, "loss": 0.0038, "step": 1047 }, { "epoch": 2.019277108433735, "grad_norm": 0.1153278723359108, "learning_rate": 1.1713957794150423e-05, "loss": 0.0041, "step": 1048 }, { "epoch": 2.0212048192771084, "grad_norm": 0.1838717758655548, "learning_rate": 1.1673160854796307e-05, "loss": 0.0041, "step": 1049 }, { "epoch": 2.023132530120482, "grad_norm": 0.12264502793550491, "learning_rate": 1.1632405785491077e-05, "loss": 0.0043, "step": 1050 }, { "epoch": 2.0250602409638554, "grad_norm": 0.14363229274749756, "learning_rate": 1.159169279116445e-05, "loss": 0.0066, "step": 1051 }, { "epoch": 2.026987951807229, "grad_norm": 0.1316995471715927, "learning_rate": 1.1551022076534585e-05, "loss": 0.0024, "step": 1052 }, { "epoch": 2.0289156626506024, "grad_norm": 0.13392619788646698, "learning_rate": 1.1510393846107001e-05, "loss": 0.0051, "step": 1053 }, { "epoch": 2.0308433734939757, "grad_norm": 3.0086817741394043, "learning_rate": 1.1469808304173658e-05, "loss": 0.0334, "step": 1054 }, { "epoch": 2.0327710843373494, "grad_norm": 0.17756076157093048, "learning_rate": 1.1429265654811803e-05, "loss": 0.0068, "step": 1055 }, { "epoch": 2.0346987951807227, "grad_norm": 0.13250532746315002, "learning_rate": 1.1388766101883038e-05, "loss": 0.0087, "step": 1056 }, { "epoch": 2.0366265060240965, "grad_norm": 0.3534089922904968, "learning_rate": 1.1348309849032257e-05, "loss": 0.0076, "step": 1057 }, { "epoch": 2.0385542168674697, "grad_norm": 0.11939049512147903, "learning_rate": 1.1307897099686627e-05, "loss": 0.0029, "step": 1058 }, { "epoch": 2.0404819277108435, "grad_norm": 0.11862517893314362, "learning_rate": 1.1267528057054562e-05, "loss": 0.0062, "step": 1059 }, { "epoch": 2.0424096385542168, "grad_norm": 0.1539212018251419, "learning_rate": 1.1227202924124704e-05, "loss": 0.0067, "step": 1060 }, { "epoch": 2.0443373493975905, "grad_norm": 0.17163440585136414, "learning_rate": 1.118692190366491e-05, "loss": 0.0055, "step": 1061 }, { "epoch": 2.0462650602409638, "grad_norm": 0.12304897606372833, "learning_rate": 1.1146685198221222e-05, "loss": 0.0036, "step": 1062 }, { "epoch": 2.0481927710843375, "grad_norm": 0.17319051921367645, "learning_rate": 1.1106493010116842e-05, "loss": 0.0058, "step": 1063 }, { "epoch": 2.050120481927711, "grad_norm": 0.2242443859577179, "learning_rate": 1.1066345541451127e-05, "loss": 0.0059, "step": 1064 }, { "epoch": 2.0520481927710845, "grad_norm": 0.09533938020467758, "learning_rate": 1.1026242994098597e-05, "loss": 0.0033, "step": 1065 }, { "epoch": 2.053975903614458, "grad_norm": 0.11697929352521896, "learning_rate": 1.0986185569707852e-05, "loss": 0.0038, "step": 1066 }, { "epoch": 2.0559036144578315, "grad_norm": 0.2563149333000183, "learning_rate": 1.0946173469700625e-05, "loss": 0.0158, "step": 1067 }, { "epoch": 2.057831325301205, "grad_norm": 0.21836932003498077, "learning_rate": 1.0906206895270739e-05, "loss": 0.0085, "step": 1068 }, { "epoch": 2.059759036144578, "grad_norm": 0.1798071414232254, "learning_rate": 1.0866286047383094e-05, "loss": 0.0053, "step": 1069 }, { "epoch": 2.061686746987952, "grad_norm": 0.08937730640172958, "learning_rate": 1.0826411126772675e-05, "loss": 0.0025, "step": 1070 }, { "epoch": 2.063614457831325, "grad_norm": 0.0942138060927391, "learning_rate": 1.0786582333943499e-05, "loss": 0.0017, "step": 1071 }, { "epoch": 2.065542168674699, "grad_norm": 0.13076582551002502, "learning_rate": 1.0746799869167679e-05, "loss": 0.0033, "step": 1072 }, { "epoch": 2.067469879518072, "grad_norm": 0.0993233174085617, "learning_rate": 1.0707063932484357e-05, "loss": 0.0046, "step": 1073 }, { "epoch": 2.069397590361446, "grad_norm": 0.3046741485595703, "learning_rate": 1.0667374723698698e-05, "loss": 0.009, "step": 1074 }, { "epoch": 2.071325301204819, "grad_norm": 0.12197669595479965, "learning_rate": 1.0627732442380932e-05, "loss": 0.0034, "step": 1075 }, { "epoch": 2.073253012048193, "grad_norm": 0.12721140682697296, "learning_rate": 1.058813728786531e-05, "loss": 0.0048, "step": 1076 }, { "epoch": 2.075180722891566, "grad_norm": 0.10011966526508331, "learning_rate": 1.0548589459249112e-05, "loss": 0.0026, "step": 1077 }, { "epoch": 2.07710843373494, "grad_norm": 0.3314201831817627, "learning_rate": 1.0509089155391661e-05, "loss": 0.0284, "step": 1078 }, { "epoch": 2.079036144578313, "grad_norm": 0.32739701867103577, "learning_rate": 1.0469636574913288e-05, "loss": 0.0088, "step": 1079 }, { "epoch": 2.080963855421687, "grad_norm": 0.13805675506591797, "learning_rate": 1.043023191619438e-05, "loss": 0.0042, "step": 1080 }, { "epoch": 2.0828915662650602, "grad_norm": 0.14789745211601257, "learning_rate": 1.039087537737435e-05, "loss": 0.0037, "step": 1081 }, { "epoch": 2.0848192771084335, "grad_norm": 0.15518991649150848, "learning_rate": 1.0351567156350617e-05, "loss": 0.0044, "step": 1082 }, { "epoch": 2.0867469879518072, "grad_norm": 0.08380113542079926, "learning_rate": 1.0312307450777706e-05, "loss": 0.0019, "step": 1083 }, { "epoch": 2.0886746987951805, "grad_norm": 0.17892400920391083, "learning_rate": 1.027309645806613e-05, "loss": 0.0065, "step": 1084 }, { "epoch": 2.0906024096385543, "grad_norm": 0.5497608780860901, "learning_rate": 1.0233934375381489e-05, "loss": 0.0238, "step": 1085 }, { "epoch": 2.0925301204819275, "grad_norm": 1.0189186334609985, "learning_rate": 1.019482139964344e-05, "loss": 0.0092, "step": 1086 }, { "epoch": 2.0944578313253013, "grad_norm": 0.12144117057323456, "learning_rate": 1.015575772752472e-05, "loss": 0.0038, "step": 1087 }, { "epoch": 2.0963855421686746, "grad_norm": 0.1115315854549408, "learning_rate": 1.0116743555450148e-05, "loss": 0.0024, "step": 1088 }, { "epoch": 2.0983132530120483, "grad_norm": 0.22671759128570557, "learning_rate": 1.0077779079595631e-05, "loss": 0.0136, "step": 1089 }, { "epoch": 2.1002409638554216, "grad_norm": 2.0009827613830566, "learning_rate": 1.003886449588719e-05, "loss": 0.0493, "step": 1090 }, { "epoch": 2.1021686746987953, "grad_norm": 0.11907301843166351, "learning_rate": 1.0000000000000006e-05, "loss": 0.0034, "step": 1091 }, { "epoch": 2.1040963855421686, "grad_norm": 0.31257638335227966, "learning_rate": 9.961185787357346e-06, "loss": 0.0129, "step": 1092 }, { "epoch": 2.1060240963855423, "grad_norm": 0.11033743619918823, "learning_rate": 9.922422053129674e-06, "loss": 0.0184, "step": 1093 }, { "epoch": 2.1079518072289156, "grad_norm": 0.2575698494911194, "learning_rate": 9.883708992233626e-06, "loss": 0.0054, "step": 1094 }, { "epoch": 2.1098795180722894, "grad_norm": 0.12921132147312164, "learning_rate": 9.845046799331029e-06, "loss": 0.0037, "step": 1095 }, { "epoch": 2.1118072289156626, "grad_norm": 0.21405921876430511, "learning_rate": 9.806435668827941e-06, "loss": 0.006, "step": 1096 }, { "epoch": 2.113734939759036, "grad_norm": 0.12929430603981018, "learning_rate": 9.76787579487363e-06, "loss": 0.0049, "step": 1097 }, { "epoch": 2.1156626506024097, "grad_norm": 0.1793181151151657, "learning_rate": 9.729367371359681e-06, "loss": 0.0086, "step": 1098 }, { "epoch": 2.117590361445783, "grad_norm": 0.2182074338197708, "learning_rate": 9.690910591918936e-06, "loss": 0.0106, "step": 1099 }, { "epoch": 2.1195180722891567, "grad_norm": 0.0705680400133133, "learning_rate": 9.652505649924547e-06, "loss": 0.0012, "step": 1100 }, { "epoch": 2.12144578313253, "grad_norm": 0.10509738326072693, "learning_rate": 9.614152738489021e-06, "loss": 0.0048, "step": 1101 }, { "epoch": 2.1233734939759037, "grad_norm": 0.13775436580181122, "learning_rate": 9.575852050463268e-06, "loss": 0.0089, "step": 1102 }, { "epoch": 2.125301204819277, "grad_norm": 0.15230101346969604, "learning_rate": 9.537603778435545e-06, "loss": 0.0065, "step": 1103 }, { "epoch": 2.1272289156626507, "grad_norm": 0.24702346324920654, "learning_rate": 9.499408114730583e-06, "loss": 0.016, "step": 1104 }, { "epoch": 2.129156626506024, "grad_norm": 0.1082577034831047, "learning_rate": 9.461265251408575e-06, "loss": 0.0036, "step": 1105 }, { "epoch": 2.1310843373493977, "grad_norm": 0.1063847690820694, "learning_rate": 9.423175380264211e-06, "loss": 0.0037, "step": 1106 }, { "epoch": 2.133012048192771, "grad_norm": 0.07686953246593475, "learning_rate": 9.385138692825729e-06, "loss": 0.0031, "step": 1107 }, { "epoch": 2.1349397590361447, "grad_norm": 0.2046380341053009, "learning_rate": 9.347155380353912e-06, "loss": 0.0087, "step": 1108 }, { "epoch": 2.136867469879518, "grad_norm": 0.1341692954301834, "learning_rate": 9.30922563384121e-06, "loss": 0.0045, "step": 1109 }, { "epoch": 2.1387951807228918, "grad_norm": 0.09870535880327225, "learning_rate": 9.271349644010672e-06, "loss": 0.003, "step": 1110 }, { "epoch": 2.140722891566265, "grad_norm": 0.18708615005016327, "learning_rate": 9.233527601315069e-06, "loss": 0.0042, "step": 1111 }, { "epoch": 2.1426506024096383, "grad_norm": 0.5175634026527405, "learning_rate": 9.195759695935907e-06, "loss": 0.0173, "step": 1112 }, { "epoch": 2.144578313253012, "grad_norm": 0.14939036965370178, "learning_rate": 9.158046117782464e-06, "loss": 0.0031, "step": 1113 }, { "epoch": 2.1465060240963854, "grad_norm": 0.2837410569190979, "learning_rate": 9.120387056490851e-06, "loss": 0.0097, "step": 1114 }, { "epoch": 2.148433734939759, "grad_norm": 0.11088677495718002, "learning_rate": 9.082782701423047e-06, "loss": 0.0026, "step": 1115 }, { "epoch": 2.1503614457831324, "grad_norm": 0.07785166054964066, "learning_rate": 9.045233241665947e-06, "loss": 0.0019, "step": 1116 }, { "epoch": 2.152289156626506, "grad_norm": 0.17568141222000122, "learning_rate": 9.007738866030427e-06, "loss": 0.0039, "step": 1117 }, { "epoch": 2.1542168674698794, "grad_norm": 0.12652266025543213, "learning_rate": 8.970299763050356e-06, "loss": 0.0033, "step": 1118 }, { "epoch": 2.156144578313253, "grad_norm": 0.16801467537879944, "learning_rate": 8.932916120981695e-06, "loss": 0.0076, "step": 1119 }, { "epoch": 2.1580722891566264, "grad_norm": 0.18313169479370117, "learning_rate": 8.895588127801545e-06, "loss": 0.0052, "step": 1120 }, { "epoch": 2.16, "grad_norm": 0.07546049356460571, "learning_rate": 8.858315971207146e-06, "loss": 0.0022, "step": 1121 }, { "epoch": 2.1619277108433734, "grad_norm": 0.4039839208126068, "learning_rate": 8.821099838614996e-06, "loss": 0.0203, "step": 1122 }, { "epoch": 2.163855421686747, "grad_norm": 0.09244243055582047, "learning_rate": 8.783939917159897e-06, "loss": 0.002, "step": 1123 }, { "epoch": 2.1657831325301204, "grad_norm": 0.18327835202217102, "learning_rate": 8.746836393693978e-06, "loss": 0.0055, "step": 1124 }, { "epoch": 2.167710843373494, "grad_norm": 0.22010307013988495, "learning_rate": 8.709789454785809e-06, "loss": 0.0077, "step": 1125 }, { "epoch": 2.1696385542168675, "grad_norm": 0.09438297897577286, "learning_rate": 8.67279928671939e-06, "loss": 0.0032, "step": 1126 }, { "epoch": 2.1715662650602408, "grad_norm": 0.20782770216464996, "learning_rate": 8.635866075493318e-06, "loss": 0.0028, "step": 1127 }, { "epoch": 2.1734939759036145, "grad_norm": 0.1958685964345932, "learning_rate": 8.598990006819756e-06, "loss": 0.0047, "step": 1128 }, { "epoch": 2.1754216867469878, "grad_norm": 0.06459935009479523, "learning_rate": 8.562171266123528e-06, "loss": 0.0015, "step": 1129 }, { "epoch": 2.1773493975903615, "grad_norm": 0.33486708998680115, "learning_rate": 8.525410038541218e-06, "loss": 0.0094, "step": 1130 }, { "epoch": 2.179277108433735, "grad_norm": 0.5755940079689026, "learning_rate": 8.488706508920202e-06, "loss": 0.0067, "step": 1131 }, { "epoch": 2.1812048192771085, "grad_norm": 0.10840924829244614, "learning_rate": 8.452060861817738e-06, "loss": 0.0082, "step": 1132 }, { "epoch": 2.183132530120482, "grad_norm": 0.18611350655555725, "learning_rate": 8.415473281500037e-06, "loss": 0.0059, "step": 1133 }, { "epoch": 2.1850602409638555, "grad_norm": 0.11245249956846237, "learning_rate": 8.378943951941301e-06, "loss": 0.0107, "step": 1134 }, { "epoch": 2.186987951807229, "grad_norm": 0.12284426391124725, "learning_rate": 8.342473056822873e-06, "loss": 0.0025, "step": 1135 }, { "epoch": 2.1889156626506026, "grad_norm": 0.12542888522148132, "learning_rate": 8.306060779532245e-06, "loss": 0.0059, "step": 1136 }, { "epoch": 2.190843373493976, "grad_norm": 0.1287655532360077, "learning_rate": 8.26970730316215e-06, "loss": 0.0022, "step": 1137 }, { "epoch": 2.1927710843373496, "grad_norm": 0.1818632185459137, "learning_rate": 8.233412810509669e-06, "loss": 0.0131, "step": 1138 }, { "epoch": 2.194698795180723, "grad_norm": 0.09687745571136475, "learning_rate": 8.197177484075284e-06, "loss": 0.0025, "step": 1139 }, { "epoch": 2.1966265060240966, "grad_norm": 0.16103452444076538, "learning_rate": 8.161001506061979e-06, "loss": 0.0031, "step": 1140 }, { "epoch": 2.19855421686747, "grad_norm": 0.2711680233478546, "learning_rate": 8.124885058374302e-06, "loss": 0.0034, "step": 1141 }, { "epoch": 2.200481927710843, "grad_norm": 0.17613105475902557, "learning_rate": 8.088828322617473e-06, "loss": 0.0044, "step": 1142 }, { "epoch": 2.202409638554217, "grad_norm": 0.2298487424850464, "learning_rate": 8.052831480096464e-06, "loss": 0.0168, "step": 1143 }, { "epoch": 2.20433734939759, "grad_norm": 0.17042206227779388, "learning_rate": 8.016894711815067e-06, "loss": 0.007, "step": 1144 }, { "epoch": 2.206265060240964, "grad_norm": 0.2830466628074646, "learning_rate": 7.98101819847501e-06, "loss": 0.0091, "step": 1145 }, { "epoch": 2.208192771084337, "grad_norm": 0.22089065611362457, "learning_rate": 7.945202120475063e-06, "loss": 0.0046, "step": 1146 }, { "epoch": 2.210120481927711, "grad_norm": 0.1716073900461197, "learning_rate": 7.909446657910072e-06, "loss": 0.0032, "step": 1147 }, { "epoch": 2.212048192771084, "grad_norm": 0.16140373051166534, "learning_rate": 7.873751990570104e-06, "loss": 0.0057, "step": 1148 }, { "epoch": 2.213975903614458, "grad_norm": 0.1671605408191681, "learning_rate": 7.838118297939529e-06, "loss": 0.0039, "step": 1149 }, { "epoch": 2.2159036144578312, "grad_norm": 0.10933005809783936, "learning_rate": 7.802545759196117e-06, "loss": 0.005, "step": 1150 }, { "epoch": 2.217831325301205, "grad_norm": 0.07819998264312744, "learning_rate": 7.76703455321014e-06, "loss": 0.0025, "step": 1151 }, { "epoch": 2.2197590361445783, "grad_norm": 0.36211854219436646, "learning_rate": 7.73158485854344e-06, "loss": 0.0151, "step": 1152 }, { "epoch": 2.221686746987952, "grad_norm": 0.09098304808139801, "learning_rate": 7.696196853448612e-06, "loss": 0.0027, "step": 1153 }, { "epoch": 2.2236144578313253, "grad_norm": 0.17442144453525543, "learning_rate": 7.660870715868018e-06, "loss": 0.006, "step": 1154 }, { "epoch": 2.225542168674699, "grad_norm": 0.09785338491201401, "learning_rate": 7.625606623432933e-06, "loss": 0.0041, "step": 1155 }, { "epoch": 2.2274698795180723, "grad_norm": 0.19399888813495636, "learning_rate": 7.590404753462653e-06, "loss": 0.0125, "step": 1156 }, { "epoch": 2.2293975903614456, "grad_norm": 0.11080623418092728, "learning_rate": 7.55526528296362e-06, "loss": 0.0022, "step": 1157 }, { "epoch": 2.2313253012048193, "grad_norm": 0.14067359268665314, "learning_rate": 7.520188388628473e-06, "loss": 0.0123, "step": 1158 }, { "epoch": 2.2332530120481926, "grad_norm": 0.14533625543117523, "learning_rate": 7.485174246835227e-06, "loss": 0.0039, "step": 1159 }, { "epoch": 2.2351807228915663, "grad_norm": 0.1253812462091446, "learning_rate": 7.4502230336463466e-06, "loss": 0.003, "step": 1160 }, { "epoch": 2.2371084337349396, "grad_norm": 0.12766572833061218, "learning_rate": 7.415334924807869e-06, "loss": 0.0044, "step": 1161 }, { "epoch": 2.2390361445783133, "grad_norm": 0.11985791474580765, "learning_rate": 7.380510095748535e-06, "loss": 0.0071, "step": 1162 }, { "epoch": 2.2409638554216866, "grad_norm": 0.15505346655845642, "learning_rate": 7.3457487215788605e-06, "loss": 0.0046, "step": 1163 }, { "epoch": 2.2428915662650604, "grad_norm": 0.18983210623264313, "learning_rate": 7.311050977090343e-06, "loss": 0.0079, "step": 1164 }, { "epoch": 2.2448192771084337, "grad_norm": 0.19279207289218903, "learning_rate": 7.276417036754479e-06, "loss": 0.0042, "step": 1165 }, { "epoch": 2.2467469879518074, "grad_norm": 0.21539707481861115, "learning_rate": 7.241847074721964e-06, "loss": 0.0087, "step": 1166 }, { "epoch": 2.2486746987951807, "grad_norm": 0.07004354894161224, "learning_rate": 7.207341264821783e-06, "loss": 0.002, "step": 1167 }, { "epoch": 2.2506024096385544, "grad_norm": 0.2203039526939392, "learning_rate": 7.172899780560345e-06, "loss": 0.0069, "step": 1168 }, { "epoch": 2.2525301204819277, "grad_norm": 0.12474718689918518, "learning_rate": 7.138522795120606e-06, "loss": 0.0122, "step": 1169 }, { "epoch": 2.2544578313253014, "grad_norm": 0.09078995883464813, "learning_rate": 7.104210481361204e-06, "loss": 0.0025, "step": 1170 }, { "epoch": 2.2563855421686747, "grad_norm": 0.141757071018219, "learning_rate": 7.069963011815584e-06, "loss": 0.0039, "step": 1171 }, { "epoch": 2.258313253012048, "grad_norm": 0.14944659173488617, "learning_rate": 7.035780558691141e-06, "loss": 0.0025, "step": 1172 }, { "epoch": 2.2602409638554217, "grad_norm": 0.06723666191101074, "learning_rate": 7.001663293868328e-06, "loss": 0.0014, "step": 1173 }, { "epoch": 2.262168674698795, "grad_norm": 0.11966485530138016, "learning_rate": 6.967611388899826e-06, "loss": 0.0067, "step": 1174 }, { "epoch": 2.2640963855421687, "grad_norm": 0.08943185210227966, "learning_rate": 6.933625015009666e-06, "loss": 0.0036, "step": 1175 }, { "epoch": 2.266024096385542, "grad_norm": 0.04511453956365585, "learning_rate": 6.899704343092359e-06, "loss": 0.0014, "step": 1176 }, { "epoch": 2.2679518072289158, "grad_norm": 0.1867951601743698, "learning_rate": 6.865849543712058e-06, "loss": 0.009, "step": 1177 }, { "epoch": 2.269879518072289, "grad_norm": 0.23791250586509705, "learning_rate": 6.832060787101658e-06, "loss": 0.0117, "step": 1178 }, { "epoch": 2.271807228915663, "grad_norm": 0.13210316002368927, "learning_rate": 6.798338243162008e-06, "loss": 0.0024, "step": 1179 }, { "epoch": 2.273734939759036, "grad_norm": 0.1601375937461853, "learning_rate": 6.764682081461002e-06, "loss": 0.013, "step": 1180 }, { "epoch": 2.27566265060241, "grad_norm": 0.21996766328811646, "learning_rate": 6.73109247123273e-06, "loss": 0.0074, "step": 1181 }, { "epoch": 2.277590361445783, "grad_norm": 0.15780030190944672, "learning_rate": 6.6975695813766465e-06, "loss": 0.0052, "step": 1182 }, { "epoch": 2.279518072289157, "grad_norm": 0.18146437406539917, "learning_rate": 6.664113580456739e-06, "loss": 0.0265, "step": 1183 }, { "epoch": 2.28144578313253, "grad_norm": 0.12033495306968689, "learning_rate": 6.630724636700618e-06, "loss": 0.0026, "step": 1184 }, { "epoch": 2.283373493975904, "grad_norm": 0.25268155336380005, "learning_rate": 6.59740291799873e-06, "loss": 0.0046, "step": 1185 }, { "epoch": 2.285301204819277, "grad_norm": 0.19043004512786865, "learning_rate": 6.564148591903488e-06, "loss": 0.0063, "step": 1186 }, { "epoch": 2.2872289156626504, "grad_norm": 0.06894923001527786, "learning_rate": 6.530961825628432e-06, "loss": 0.0012, "step": 1187 }, { "epoch": 2.289156626506024, "grad_norm": 0.16378818452358246, "learning_rate": 6.4978427860474015e-06, "loss": 0.0048, "step": 1188 }, { "epoch": 2.2910843373493974, "grad_norm": 0.11130444705486298, "learning_rate": 6.464791639693648e-06, "loss": 0.0049, "step": 1189 }, { "epoch": 2.293012048192771, "grad_norm": 0.10573417693376541, "learning_rate": 6.431808552759083e-06, "loss": 0.0019, "step": 1190 }, { "epoch": 2.2949397590361444, "grad_norm": 0.13344882428646088, "learning_rate": 6.398893691093367e-06, "loss": 0.0033, "step": 1191 }, { "epoch": 2.296867469879518, "grad_norm": 0.12659135460853577, "learning_rate": 6.366047220203088e-06, "loss": 0.0032, "step": 1192 }, { "epoch": 2.2987951807228915, "grad_norm": 0.10152821987867355, "learning_rate": 6.333269305250971e-06, "loss": 0.0027, "step": 1193 }, { "epoch": 2.300722891566265, "grad_norm": 0.1889944225549698, "learning_rate": 6.300560111055006e-06, "loss": 0.0062, "step": 1194 }, { "epoch": 2.3026506024096385, "grad_norm": 2.3101227283477783, "learning_rate": 6.2679198020876275e-06, "loss": 0.0113, "step": 1195 }, { "epoch": 2.304578313253012, "grad_norm": 0.6224933862686157, "learning_rate": 6.235348542474908e-06, "loss": 0.0273, "step": 1196 }, { "epoch": 2.3065060240963855, "grad_norm": 0.1908419281244278, "learning_rate": 6.202846495995705e-06, "loss": 0.0056, "step": 1197 }, { "epoch": 2.3084337349397592, "grad_norm": 0.10968491435050964, "learning_rate": 6.170413826080856e-06, "loss": 0.0034, "step": 1198 }, { "epoch": 2.3103614457831325, "grad_norm": 0.23200668394565582, "learning_rate": 6.138050695812343e-06, "loss": 0.0042, "step": 1199 }, { "epoch": 2.3122891566265062, "grad_norm": 0.12442032992839813, "learning_rate": 6.105757267922481e-06, "loss": 0.0045, "step": 1200 }, { "epoch": 2.3142168674698795, "grad_norm": 0.14563624560832977, "learning_rate": 6.073533704793122e-06, "loss": 0.0035, "step": 1201 }, { "epoch": 2.316144578313253, "grad_norm": 0.11523722857236862, "learning_rate": 6.04138016845478e-06, "loss": 0.0088, "step": 1202 }, { "epoch": 2.3180722891566266, "grad_norm": 0.2000943422317505, "learning_rate": 6.009296820585871e-06, "loss": 0.0059, "step": 1203 }, { "epoch": 2.32, "grad_norm": 0.10698592662811279, "learning_rate": 5.977283822511879e-06, "loss": 0.0028, "step": 1204 }, { "epoch": 2.3219277108433736, "grad_norm": 0.1533137410879135, "learning_rate": 5.945341335204547e-06, "loss": 0.0044, "step": 1205 }, { "epoch": 2.323855421686747, "grad_norm": 0.1235835999250412, "learning_rate": 5.9134695192810695e-06, "loss": 0.0043, "step": 1206 }, { "epoch": 2.3257831325301206, "grad_norm": 0.1916925013065338, "learning_rate": 5.8816685350032575e-06, "loss": 0.0066, "step": 1207 }, { "epoch": 2.327710843373494, "grad_norm": 0.08812380582094193, "learning_rate": 5.849938542276801e-06, "loss": 0.0022, "step": 1208 }, { "epoch": 2.3296385542168676, "grad_norm": 0.13387660682201385, "learning_rate": 5.818279700650393e-06, "loss": 0.0037, "step": 1209 }, { "epoch": 2.331566265060241, "grad_norm": 0.2309022694826126, "learning_rate": 5.786692169314954e-06, "loss": 0.0049, "step": 1210 }, { "epoch": 2.3334939759036146, "grad_norm": 0.09956549853086472, "learning_rate": 5.755176107102833e-06, "loss": 0.002, "step": 1211 }, { "epoch": 2.335421686746988, "grad_norm": 0.06035687029361725, "learning_rate": 5.723731672487043e-06, "loss": 0.002, "step": 1212 }, { "epoch": 2.337349397590361, "grad_norm": 0.06850237399339676, "learning_rate": 5.69235902358038e-06, "loss": 0.0013, "step": 1213 }, { "epoch": 2.339277108433735, "grad_norm": 0.12068171054124832, "learning_rate": 5.661058318134711e-06, "loss": 0.0041, "step": 1214 }, { "epoch": 2.3412048192771087, "grad_norm": 0.13146616518497467, "learning_rate": 5.6298297135401355e-06, "loss": 0.0022, "step": 1215 }, { "epoch": 2.343132530120482, "grad_norm": 0.15160737931728363, "learning_rate": 5.598673366824212e-06, "loss": 0.0036, "step": 1216 }, { "epoch": 2.3450602409638552, "grad_norm": 0.26196014881134033, "learning_rate": 5.567589434651164e-06, "loss": 0.0151, "step": 1217 }, { "epoch": 2.346987951807229, "grad_norm": 0.12898831069469452, "learning_rate": 5.536578073321073e-06, "loss": 0.006, "step": 1218 }, { "epoch": 2.3489156626506023, "grad_norm": 0.11385104805231094, "learning_rate": 5.505639438769146e-06, "loss": 0.0052, "step": 1219 }, { "epoch": 2.350843373493976, "grad_norm": 0.14569509029388428, "learning_rate": 5.47477368656486e-06, "loss": 0.0048, "step": 1220 }, { "epoch": 2.3527710843373493, "grad_norm": 0.12406075745820999, "learning_rate": 5.443980971911238e-06, "loss": 0.0028, "step": 1221 }, { "epoch": 2.354698795180723, "grad_norm": 0.3730498254299164, "learning_rate": 5.413261449644039e-06, "loss": 0.0043, "step": 1222 }, { "epoch": 2.3566265060240963, "grad_norm": 0.1449914574623108, "learning_rate": 5.382615274230987e-06, "loss": 0.0075, "step": 1223 }, { "epoch": 2.35855421686747, "grad_norm": 0.20739100873470306, "learning_rate": 5.352042599770995e-06, "loss": 0.0061, "step": 1224 }, { "epoch": 2.3604819277108433, "grad_norm": 0.05786775052547455, "learning_rate": 5.321543579993398e-06, "loss": 0.0015, "step": 1225 }, { "epoch": 2.362409638554217, "grad_norm": 0.09043122828006744, "learning_rate": 5.2911183682571446e-06, "loss": 0.0034, "step": 1226 }, { "epoch": 2.3643373493975903, "grad_norm": 0.2685496211051941, "learning_rate": 5.260767117550094e-06, "loss": 0.0076, "step": 1227 }, { "epoch": 2.3662650602409636, "grad_norm": 0.17694126069545746, "learning_rate": 5.230489980488165e-06, "loss": 0.0148, "step": 1228 }, { "epoch": 2.3681927710843373, "grad_norm": 0.11609307676553726, "learning_rate": 5.200287109314633e-06, "loss": 0.0049, "step": 1229 }, { "epoch": 2.370120481927711, "grad_norm": 0.1257704645395279, "learning_rate": 5.1701586558993285e-06, "loss": 0.0031, "step": 1230 }, { "epoch": 2.3720481927710844, "grad_norm": 0.27177703380584717, "learning_rate": 5.140104771737899e-06, "loss": 0.0058, "step": 1231 }, { "epoch": 2.3739759036144576, "grad_norm": 0.13928169012069702, "learning_rate": 5.110125607951024e-06, "loss": 0.0051, "step": 1232 }, { "epoch": 2.3759036144578314, "grad_norm": 0.679577648639679, "learning_rate": 5.0802213152836514e-06, "loss": 0.0173, "step": 1233 }, { "epoch": 2.3778313253012047, "grad_norm": 0.16769403219223022, "learning_rate": 5.0503920441042845e-06, "loss": 0.0045, "step": 1234 }, { "epoch": 2.3797590361445784, "grad_norm": 0.09427493065595627, "learning_rate": 5.0206379444041764e-06, "loss": 0.0024, "step": 1235 }, { "epoch": 2.3816867469879517, "grad_norm": 0.33908671140670776, "learning_rate": 4.990959165796585e-06, "loss": 0.0088, "step": 1236 }, { "epoch": 2.3836144578313254, "grad_norm": 0.18106943368911743, "learning_rate": 4.961355857516034e-06, "loss": 0.0094, "step": 1237 }, { "epoch": 2.3855421686746987, "grad_norm": 0.5833203196525574, "learning_rate": 4.931828168417583e-06, "loss": 0.0086, "step": 1238 }, { "epoch": 2.3874698795180724, "grad_norm": 0.09108569473028183, "learning_rate": 4.902376246976015e-06, "loss": 0.0014, "step": 1239 }, { "epoch": 2.3893975903614457, "grad_norm": 0.10596407204866409, "learning_rate": 4.873000241285153e-06, "loss": 0.0043, "step": 1240 }, { "epoch": 2.3913253012048195, "grad_norm": 0.10775511711835861, "learning_rate": 4.8437002990570835e-06, "loss": 0.0014, "step": 1241 }, { "epoch": 2.3932530120481927, "grad_norm": 0.9646345973014832, "learning_rate": 4.8144765676214245e-06, "loss": 0.0525, "step": 1242 }, { "epoch": 2.395180722891566, "grad_norm": 0.20530278980731964, "learning_rate": 4.7853291939245814e-06, "loss": 0.008, "step": 1243 }, { "epoch": 2.3971084337349398, "grad_norm": 0.1682119369506836, "learning_rate": 4.756258324528995e-06, "loss": 0.0044, "step": 1244 }, { "epoch": 2.3990361445783135, "grad_norm": 0.45536917448043823, "learning_rate": 4.727264105612439e-06, "loss": 0.0186, "step": 1245 }, { "epoch": 2.4009638554216868, "grad_norm": 0.3017471730709076, "learning_rate": 4.698346682967258e-06, "loss": 0.0106, "step": 1246 }, { "epoch": 2.40289156626506, "grad_norm": 0.1226554661989212, "learning_rate": 4.669506201999625e-06, "loss": 0.0035, "step": 1247 }, { "epoch": 2.404819277108434, "grad_norm": 0.13750068843364716, "learning_rate": 4.640742807728837e-06, "loss": 0.0038, "step": 1248 }, { "epoch": 2.406746987951807, "grad_norm": 0.11531024426221848, "learning_rate": 4.612056644786575e-06, "loss": 0.0021, "step": 1249 }, { "epoch": 2.408674698795181, "grad_norm": 0.1143675372004509, "learning_rate": 4.583447857416175e-06, "loss": 0.0028, "step": 1250 }, { "epoch": 2.410602409638554, "grad_norm": 0.0914216861128807, "learning_rate": 4.554916589471898e-06, "loss": 0.0027, "step": 1251 }, { "epoch": 2.412530120481928, "grad_norm": 0.18339012563228607, "learning_rate": 4.526462984418221e-06, "loss": 0.0037, "step": 1252 }, { "epoch": 2.414457831325301, "grad_norm": 0.11073138564825058, "learning_rate": 4.498087185329105e-06, "loss": 0.003, "step": 1253 }, { "epoch": 2.416385542168675, "grad_norm": 0.20792435109615326, "learning_rate": 4.469789334887265e-06, "loss": 0.009, "step": 1254 }, { "epoch": 2.418313253012048, "grad_norm": 0.09485629945993423, "learning_rate": 4.441569575383471e-06, "loss": 0.0033, "step": 1255 }, { "epoch": 2.420240963855422, "grad_norm": 0.11831793934106827, "learning_rate": 4.413428048715851e-06, "loss": 0.0021, "step": 1256 }, { "epoch": 2.422168674698795, "grad_norm": 0.11818034201860428, "learning_rate": 4.38536489638911e-06, "loss": 0.0041, "step": 1257 }, { "epoch": 2.4240963855421684, "grad_norm": 0.2583082616329193, "learning_rate": 4.3573802595138945e-06, "loss": 0.0039, "step": 1258 }, { "epoch": 2.426024096385542, "grad_norm": 0.3120201826095581, "learning_rate": 4.329474278806034e-06, "loss": 0.0087, "step": 1259 }, { "epoch": 2.427951807228916, "grad_norm": 0.1258879452943802, "learning_rate": 4.301647094585855e-06, "loss": 0.0046, "step": 1260 }, { "epoch": 2.429879518072289, "grad_norm": 0.15144586563110352, "learning_rate": 4.273898846777473e-06, "loss": 0.0054, "step": 1261 }, { "epoch": 2.4318072289156625, "grad_norm": 0.15615184605121613, "learning_rate": 4.246229674908067e-06, "loss": 0.0072, "step": 1262 }, { "epoch": 2.433734939759036, "grad_norm": 0.09690173715353012, "learning_rate": 4.218639718107225e-06, "loss": 0.003, "step": 1263 }, { "epoch": 2.4356626506024095, "grad_norm": 0.23884955048561096, "learning_rate": 4.1911291151062e-06, "loss": 0.0109, "step": 1264 }, { "epoch": 2.4375903614457832, "grad_norm": 0.0905768945813179, "learning_rate": 4.163698004237222e-06, "loss": 0.0027, "step": 1265 }, { "epoch": 2.4395180722891565, "grad_norm": 0.09168912470340729, "learning_rate": 4.136346523432821e-06, "loss": 0.0018, "step": 1266 }, { "epoch": 2.4414457831325302, "grad_norm": 0.17878012359142303, "learning_rate": 4.109074810225118e-06, "loss": 0.0048, "step": 1267 }, { "epoch": 2.4433734939759035, "grad_norm": 0.09913790971040726, "learning_rate": 4.08188300174513e-06, "loss": 0.0021, "step": 1268 }, { "epoch": 2.4453012048192773, "grad_norm": 0.16615812480449677, "learning_rate": 4.054771234722106e-06, "loss": 0.0066, "step": 1269 }, { "epoch": 2.4472289156626506, "grad_norm": 0.09618276357650757, "learning_rate": 4.027739645482784e-06, "loss": 0.0043, "step": 1270 }, { "epoch": 2.4491566265060243, "grad_norm": 0.33473479747772217, "learning_rate": 4.0007883699507855e-06, "loss": 0.0236, "step": 1271 }, { "epoch": 2.4510843373493976, "grad_norm": 0.15051880478858948, "learning_rate": 3.973917543645867e-06, "loss": 0.0068, "step": 1272 }, { "epoch": 2.453012048192771, "grad_norm": 0.24134816229343414, "learning_rate": 3.947127301683249e-06, "loss": 0.0194, "step": 1273 }, { "epoch": 2.4549397590361446, "grad_norm": 0.10495353490114212, "learning_rate": 3.920417778772967e-06, "loss": 0.0042, "step": 1274 }, { "epoch": 2.4568674698795183, "grad_norm": 0.2294938713312149, "learning_rate": 3.893789109219171e-06, "loss": 0.0224, "step": 1275 }, { "epoch": 2.4587951807228916, "grad_norm": 0.13710513710975647, "learning_rate": 3.867241426919446e-06, "loss": 0.0046, "step": 1276 }, { "epoch": 2.460722891566265, "grad_norm": 0.06754808127880096, "learning_rate": 3.840774865364157e-06, "loss": 0.0019, "step": 1277 }, { "epoch": 2.4626506024096386, "grad_norm": 0.24797780811786652, "learning_rate": 3.8143895576357605e-06, "loss": 0.0063, "step": 1278 }, { "epoch": 2.464578313253012, "grad_norm": 0.1476449817419052, "learning_rate": 3.788085636408143e-06, "loss": 0.0055, "step": 1279 }, { "epoch": 2.4665060240963856, "grad_norm": 0.22397096455097198, "learning_rate": 3.7618632339459616e-06, "loss": 0.0164, "step": 1280 }, { "epoch": 2.468433734939759, "grad_norm": 0.21596969664096832, "learning_rate": 3.7357224821039497e-06, "loss": 0.0112, "step": 1281 }, { "epoch": 2.4703614457831327, "grad_norm": 0.2775099575519562, "learning_rate": 3.7096635123263068e-06, "loss": 0.0112, "step": 1282 }, { "epoch": 2.472289156626506, "grad_norm": 0.07963326573371887, "learning_rate": 3.683686455645974e-06, "loss": 0.0013, "step": 1283 }, { "epoch": 2.4742168674698797, "grad_norm": 0.1253802627325058, "learning_rate": 3.6577914426840266e-06, "loss": 0.0038, "step": 1284 }, { "epoch": 2.476144578313253, "grad_norm": 0.10258597880601883, "learning_rate": 3.631978603648989e-06, "loss": 0.0023, "step": 1285 }, { "epoch": 2.4780722891566267, "grad_norm": 0.17102380096912384, "learning_rate": 3.6062480683361935e-06, "loss": 0.0025, "step": 1286 }, { "epoch": 2.48, "grad_norm": 0.09547360241413116, "learning_rate": 3.580599966127123e-06, "loss": 0.003, "step": 1287 }, { "epoch": 2.4819277108433733, "grad_norm": 0.08008653670549393, "learning_rate": 3.5550344259887438e-06, "loss": 0.0023, "step": 1288 }, { "epoch": 2.483855421686747, "grad_norm": 0.07712296396493912, "learning_rate": 3.5295515764729003e-06, "loss": 0.0015, "step": 1289 }, { "epoch": 2.4857831325301207, "grad_norm": 0.21118703484535217, "learning_rate": 3.5041515457156303e-06, "loss": 0.0041, "step": 1290 }, { "epoch": 2.487710843373494, "grad_norm": 0.10772393643856049, "learning_rate": 3.4788344614365155e-06, "loss": 0.0029, "step": 1291 }, { "epoch": 2.4896385542168673, "grad_norm": 0.2353268563747406, "learning_rate": 3.453600450938073e-06, "loss": 0.0072, "step": 1292 }, { "epoch": 2.491566265060241, "grad_norm": 0.2897944152355194, "learning_rate": 3.428449641105107e-06, "loss": 0.0205, "step": 1293 }, { "epoch": 2.4934939759036143, "grad_norm": 0.19756680727005005, "learning_rate": 3.4033821584040383e-06, "loss": 0.0065, "step": 1294 }, { "epoch": 2.495421686746988, "grad_norm": 0.13538534939289093, "learning_rate": 3.378398128882305e-06, "loss": 0.0025, "step": 1295 }, { "epoch": 2.4973493975903613, "grad_norm": 0.2301637977361679, "learning_rate": 3.3534976781677142e-06, "loss": 0.0071, "step": 1296 }, { "epoch": 2.499277108433735, "grad_norm": 0.0965796634554863, "learning_rate": 3.3286809314678137e-06, "loss": 0.0024, "step": 1297 }, { "epoch": 2.5012048192771084, "grad_norm": 0.0777980163693428, "learning_rate": 3.30394801356926e-06, "loss": 0.0013, "step": 1298 }, { "epoch": 2.503132530120482, "grad_norm": 0.3157603442668915, "learning_rate": 3.279299048837177e-06, "loss": 0.0228, "step": 1299 }, { "epoch": 2.5050602409638554, "grad_norm": 0.15660233795642853, "learning_rate": 3.2547341612145654e-06, "loss": 0.0056, "step": 1300 }, { "epoch": 2.506987951807229, "grad_norm": 0.21655581891536713, "learning_rate": 3.2302534742216586e-06, "loss": 0.0081, "step": 1301 }, { "epoch": 2.5089156626506024, "grad_norm": 0.09475889801979065, "learning_rate": 3.205857110955277e-06, "loss": 0.0029, "step": 1302 }, { "epoch": 2.5108433734939757, "grad_norm": 0.13174696266651154, "learning_rate": 3.18154519408826e-06, "loss": 0.0059, "step": 1303 }, { "epoch": 2.5127710843373494, "grad_norm": 0.10386355221271515, "learning_rate": 3.1573178458688102e-06, "loss": 0.0042, "step": 1304 }, { "epoch": 2.514698795180723, "grad_norm": 0.12700854241847992, "learning_rate": 3.133175188119899e-06, "loss": 0.0041, "step": 1305 }, { "epoch": 2.5166265060240964, "grad_norm": 0.1617022454738617, "learning_rate": 3.109117342238639e-06, "loss": 0.0053, "step": 1306 }, { "epoch": 2.5185542168674697, "grad_norm": 0.8668884038925171, "learning_rate": 3.085144429195688e-06, "loss": 0.0084, "step": 1307 }, { "epoch": 2.5204819277108435, "grad_norm": 0.22429344058036804, "learning_rate": 3.061256569534634e-06, "loss": 0.0053, "step": 1308 }, { "epoch": 2.5224096385542167, "grad_norm": 0.08967582136392593, "learning_rate": 3.037453883371375e-06, "loss": 0.0018, "step": 1309 }, { "epoch": 2.5243373493975905, "grad_norm": 0.1251695454120636, "learning_rate": 3.0137364903935464e-06, "loss": 0.0037, "step": 1310 }, { "epoch": 2.5262650602409638, "grad_norm": 0.09026174992322922, "learning_rate": 2.990104509859897e-06, "loss": 0.0024, "step": 1311 }, { "epoch": 2.5281927710843375, "grad_norm": 0.34319114685058594, "learning_rate": 2.966558060599689e-06, "loss": 0.0063, "step": 1312 }, { "epoch": 2.5301204819277108, "grad_norm": 0.20300136506557465, "learning_rate": 2.9430972610121087e-06, "loss": 0.0054, "step": 1313 }, { "epoch": 2.532048192771084, "grad_norm": 0.19160760939121246, "learning_rate": 2.9197222290656737e-06, "loss": 0.0095, "step": 1314 }, { "epoch": 2.533975903614458, "grad_norm": 0.18991442024707794, "learning_rate": 2.8964330822976227e-06, "loss": 0.006, "step": 1315 }, { "epoch": 2.5359036144578315, "grad_norm": 0.1801903396844864, "learning_rate": 2.873229937813349e-06, "loss": 0.0067, "step": 1316 }, { "epoch": 2.537831325301205, "grad_norm": 0.07068303227424622, "learning_rate": 2.850112912285783e-06, "loss": 0.0015, "step": 1317 }, { "epoch": 2.539759036144578, "grad_norm": 0.1404612809419632, "learning_rate": 2.8270821219548296e-06, "loss": 0.0036, "step": 1318 }, { "epoch": 2.541686746987952, "grad_norm": 0.12199504673480988, "learning_rate": 2.8041376826267862e-06, "loss": 0.0068, "step": 1319 }, { "epoch": 2.5436144578313256, "grad_norm": 0.2167249619960785, "learning_rate": 2.7812797096737253e-06, "loss": 0.0048, "step": 1320 }, { "epoch": 2.545542168674699, "grad_norm": 0.07466506212949753, "learning_rate": 2.7585083180329575e-06, "loss": 0.0017, "step": 1321 }, { "epoch": 2.547469879518072, "grad_norm": 0.11736353486776352, "learning_rate": 2.7358236222064283e-06, "loss": 0.003, "step": 1322 }, { "epoch": 2.549397590361446, "grad_norm": 0.16602204740047455, "learning_rate": 2.7132257362601453e-06, "loss": 0.005, "step": 1323 }, { "epoch": 2.551325301204819, "grad_norm": 0.15473629534244537, "learning_rate": 2.6907147738236193e-06, "loss": 0.0077, "step": 1324 }, { "epoch": 2.553253012048193, "grad_norm": 0.07868973910808563, "learning_rate": 2.6682908480892567e-06, "loss": 0.0013, "step": 1325 }, { "epoch": 2.555180722891566, "grad_norm": 0.2137845754623413, "learning_rate": 2.645954071811847e-06, "loss": 0.0092, "step": 1326 }, { "epoch": 2.55710843373494, "grad_norm": 0.11191053688526154, "learning_rate": 2.623704557307949e-06, "loss": 0.0031, "step": 1327 }, { "epoch": 2.559036144578313, "grad_norm": 0.3080642521381378, "learning_rate": 2.6015424164553295e-06, "loss": 0.0104, "step": 1328 }, { "epoch": 2.5609638554216865, "grad_norm": 0.08816439658403397, "learning_rate": 2.579467760692427e-06, "loss": 0.004, "step": 1329 }, { "epoch": 2.56289156626506, "grad_norm": 0.17154981195926666, "learning_rate": 2.557480701017776e-06, "loss": 0.0035, "step": 1330 }, { "epoch": 2.564819277108434, "grad_norm": 0.09479143470525742, "learning_rate": 2.5355813479894464e-06, "loss": 0.0034, "step": 1331 }, { "epoch": 2.5667469879518072, "grad_norm": 0.26139333844184875, "learning_rate": 2.513769811724487e-06, "loss": 0.0076, "step": 1332 }, { "epoch": 2.5686746987951805, "grad_norm": 0.16864238679409027, "learning_rate": 2.4920462018983816e-06, "loss": 0.0046, "step": 1333 }, { "epoch": 2.5706024096385542, "grad_norm": 0.1133158802986145, "learning_rate": 2.4704106277444884e-06, "loss": 0.0034, "step": 1334 }, { "epoch": 2.572530120481928, "grad_norm": 0.27522334456443787, "learning_rate": 2.4488631980534995e-06, "loss": 0.0127, "step": 1335 }, { "epoch": 2.5744578313253013, "grad_norm": 0.13547387719154358, "learning_rate": 2.427404021172868e-06, "loss": 0.0031, "step": 1336 }, { "epoch": 2.5763855421686745, "grad_norm": 0.13478629291057587, "learning_rate": 2.406033205006313e-06, "loss": 0.0039, "step": 1337 }, { "epoch": 2.5783132530120483, "grad_norm": 0.11515481770038605, "learning_rate": 2.3847508570132226e-06, "loss": 0.0029, "step": 1338 }, { "epoch": 2.5802409638554216, "grad_norm": 0.21657171845436096, "learning_rate": 2.36355708420815e-06, "loss": 0.011, "step": 1339 }, { "epoch": 2.5821686746987953, "grad_norm": 0.11441601067781448, "learning_rate": 2.342451993160262e-06, "loss": 0.006, "step": 1340 }, { "epoch": 2.5840963855421686, "grad_norm": 0.13475841283798218, "learning_rate": 2.3214356899928036e-06, "loss": 0.0051, "step": 1341 }, { "epoch": 2.5860240963855423, "grad_norm": 0.053035832941532135, "learning_rate": 2.300508280382572e-06, "loss": 0.0012, "step": 1342 }, { "epoch": 2.5879518072289156, "grad_norm": 0.12467508763074875, "learning_rate": 2.279669869559358e-06, "loss": 0.0024, "step": 1343 }, { "epoch": 2.589879518072289, "grad_norm": 0.10572273284196854, "learning_rate": 2.2589205623054646e-06, "loss": 0.0024, "step": 1344 }, { "epoch": 2.5918072289156626, "grad_norm": 0.17056365311145782, "learning_rate": 2.238260462955142e-06, "loss": 0.0064, "step": 1345 }, { "epoch": 2.5937349397590364, "grad_norm": 0.07940494269132614, "learning_rate": 2.2176896753940637e-06, "loss": 0.0012, "step": 1346 }, { "epoch": 2.5956626506024096, "grad_norm": 0.10416694730520248, "learning_rate": 2.1972083030588244e-06, "loss": 0.0092, "step": 1347 }, { "epoch": 2.597590361445783, "grad_norm": 0.2384328842163086, "learning_rate": 2.176816448936423e-06, "loss": 0.0067, "step": 1348 }, { "epoch": 2.5995180722891567, "grad_norm": 0.14279082417488098, "learning_rate": 2.156514215563703e-06, "loss": 0.0059, "step": 1349 }, { "epoch": 2.6014457831325304, "grad_norm": 0.08462683111429214, "learning_rate": 2.1363017050268886e-06, "loss": 0.0021, "step": 1350 }, { "epoch": 2.6033734939759037, "grad_norm": 0.09768491238355637, "learning_rate": 2.1161790189610377e-06, "loss": 0.0038, "step": 1351 }, { "epoch": 2.605301204819277, "grad_norm": 0.25498896837234497, "learning_rate": 2.0961462585495474e-06, "loss": 0.0114, "step": 1352 }, { "epoch": 2.6072289156626507, "grad_norm": 0.15635675191879272, "learning_rate": 2.076203524523637e-06, "loss": 0.0054, "step": 1353 }, { "epoch": 2.609156626506024, "grad_norm": 0.11619213968515396, "learning_rate": 2.056350917161836e-06, "loss": 0.007, "step": 1354 }, { "epoch": 2.6110843373493977, "grad_norm": 0.18085338175296783, "learning_rate": 2.0365885362895053e-06, "loss": 0.0061, "step": 1355 }, { "epoch": 2.613012048192771, "grad_norm": 0.14492927491664886, "learning_rate": 2.016916481278306e-06, "loss": 0.0114, "step": 1356 }, { "epoch": 2.6149397590361447, "grad_norm": 0.21257621049880981, "learning_rate": 1.997334851045709e-06, "loss": 0.0057, "step": 1357 }, { "epoch": 2.616867469879518, "grad_norm": 0.11539656668901443, "learning_rate": 1.9778437440545085e-06, "loss": 0.0071, "step": 1358 }, { "epoch": 2.6187951807228913, "grad_norm": 0.1642933189868927, "learning_rate": 1.95844325831231e-06, "loss": 0.0054, "step": 1359 }, { "epoch": 2.620722891566265, "grad_norm": 0.10779479146003723, "learning_rate": 1.9391334913710545e-06, "loss": 0.0028, "step": 1360 }, { "epoch": 2.6226506024096388, "grad_norm": 0.14295366406440735, "learning_rate": 1.9199145403265175e-06, "loss": 0.0048, "step": 1361 }, { "epoch": 2.624578313253012, "grad_norm": 0.13454844057559967, "learning_rate": 1.9007865018178107e-06, "loss": 0.0072, "step": 1362 }, { "epoch": 2.6265060240963853, "grad_norm": 0.778252363204956, "learning_rate": 1.8817494720269302e-06, "loss": 0.0071, "step": 1363 }, { "epoch": 2.628433734939759, "grad_norm": 0.11488679051399231, "learning_rate": 1.8628035466782268e-06, "loss": 0.0038, "step": 1364 }, { "epoch": 2.630361445783133, "grad_norm": 0.15560875833034515, "learning_rate": 1.8439488210379687e-06, "loss": 0.0043, "step": 1365 }, { "epoch": 2.632289156626506, "grad_norm": 0.10538071393966675, "learning_rate": 1.8251853899138306e-06, "loss": 0.0041, "step": 1366 }, { "epoch": 2.6342168674698794, "grad_norm": 0.12866193056106567, "learning_rate": 1.8065133476544306e-06, "loss": 0.0034, "step": 1367 }, { "epoch": 2.636144578313253, "grad_norm": 0.2045469433069229, "learning_rate": 1.7879327881488584e-06, "loss": 0.0141, "step": 1368 }, { "epoch": 2.6380722891566264, "grad_norm": 0.12423976510763168, "learning_rate": 1.769443804826194e-06, "loss": 0.0047, "step": 1369 }, { "epoch": 2.64, "grad_norm": 0.1007109209895134, "learning_rate": 1.751046490655046e-06, "loss": 0.0031, "step": 1370 }, { "epoch": 2.6419277108433734, "grad_norm": 0.0681275874376297, "learning_rate": 1.7327409381430804e-06, "loss": 0.0019, "step": 1371 }, { "epoch": 2.643855421686747, "grad_norm": 0.1645517498254776, "learning_rate": 1.7145272393365498e-06, "loss": 0.0035, "step": 1372 }, { "epoch": 2.6457831325301204, "grad_norm": 0.13689427077770233, "learning_rate": 1.6964054858198386e-06, "loss": 0.0086, "step": 1373 }, { "epoch": 2.6477108433734937, "grad_norm": 0.10440093278884888, "learning_rate": 1.6783757687150149e-06, "loss": 0.0019, "step": 1374 }, { "epoch": 2.6496385542168674, "grad_norm": 0.1142532229423523, "learning_rate": 1.6604381786813383e-06, "loss": 0.0047, "step": 1375 }, { "epoch": 2.651566265060241, "grad_norm": 0.10430166125297546, "learning_rate": 1.6425928059148312e-06, "loss": 0.0027, "step": 1376 }, { "epoch": 2.6534939759036145, "grad_norm": 0.2315254956483841, "learning_rate": 1.624839740147819e-06, "loss": 0.0071, "step": 1377 }, { "epoch": 2.6554216867469878, "grad_norm": 0.15356265008449554, "learning_rate": 1.6071790706484746e-06, "loss": 0.0109, "step": 1378 }, { "epoch": 2.6573493975903615, "grad_norm": 0.1332363784313202, "learning_rate": 1.589610886220383e-06, "loss": 0.0046, "step": 1379 }, { "epoch": 2.659277108433735, "grad_norm": 0.18892519176006317, "learning_rate": 1.5721352752020602e-06, "loss": 0.0138, "step": 1380 }, { "epoch": 2.6612048192771085, "grad_norm": 0.10537895560264587, "learning_rate": 1.5547523254665598e-06, "loss": 0.0066, "step": 1381 }, { "epoch": 2.663132530120482, "grad_norm": 0.1308947205543518, "learning_rate": 1.5374621244209965e-06, "loss": 0.0039, "step": 1382 }, { "epoch": 2.6650602409638555, "grad_norm": 0.11358808726072311, "learning_rate": 1.5202647590060983e-06, "loss": 0.0029, "step": 1383 }, { "epoch": 2.666987951807229, "grad_norm": 0.12029009312391281, "learning_rate": 1.5031603156958064e-06, "loss": 0.0032, "step": 1384 }, { "epoch": 2.6689156626506025, "grad_norm": 0.36994072794914246, "learning_rate": 1.4861488804968093e-06, "loss": 0.024, "step": 1385 }, { "epoch": 2.670843373493976, "grad_norm": 0.1263083666563034, "learning_rate": 1.4692305389481232e-06, "loss": 0.0047, "step": 1386 }, { "epoch": 2.6727710843373496, "grad_norm": 0.15056709945201874, "learning_rate": 1.452405376120658e-06, "loss": 0.0014, "step": 1387 }, { "epoch": 2.674698795180723, "grad_norm": 0.10418888181447983, "learning_rate": 1.4356734766167925e-06, "loss": 0.0035, "step": 1388 }, { "epoch": 2.676626506024096, "grad_norm": 0.12220565974712372, "learning_rate": 1.4190349245699443e-06, "loss": 0.0063, "step": 1389 }, { "epoch": 2.67855421686747, "grad_norm": 0.14774753153324127, "learning_rate": 1.402489803644156e-06, "loss": 0.008, "step": 1390 }, { "epoch": 2.6804819277108436, "grad_norm": 0.14384198188781738, "learning_rate": 1.3860381970336544e-06, "loss": 0.0039, "step": 1391 }, { "epoch": 2.682409638554217, "grad_norm": 0.10995055735111237, "learning_rate": 1.3696801874624698e-06, "loss": 0.0028, "step": 1392 }, { "epoch": 2.68433734939759, "grad_norm": 0.12208505719900131, "learning_rate": 1.353415857183966e-06, "loss": 0.0029, "step": 1393 }, { "epoch": 2.686265060240964, "grad_norm": 0.16018439829349518, "learning_rate": 1.337245287980482e-06, "loss": 0.0068, "step": 1394 }, { "epoch": 2.688192771084337, "grad_norm": 5.2112274169921875, "learning_rate": 1.3211685611628844e-06, "loss": 0.1645, "step": 1395 }, { "epoch": 2.690120481927711, "grad_norm": 0.12426120787858963, "learning_rate": 1.3051857575701732e-06, "loss": 0.0044, "step": 1396 }, { "epoch": 2.692048192771084, "grad_norm": 0.13931375741958618, "learning_rate": 1.2892969575690685e-06, "loss": 0.0035, "step": 1397 }, { "epoch": 2.693975903614458, "grad_norm": 0.1804540753364563, "learning_rate": 1.273502241053608e-06, "loss": 0.0108, "step": 1398 }, { "epoch": 2.695903614457831, "grad_norm": 0.12313607335090637, "learning_rate": 1.2578016874447596e-06, "loss": 0.0073, "step": 1399 }, { "epoch": 2.697831325301205, "grad_norm": 0.1301470398902893, "learning_rate": 1.2421953756899985e-06, "loss": 0.0037, "step": 1400 }, { "epoch": 2.6997590361445782, "grad_norm": 0.12769126892089844, "learning_rate": 1.226683384262919e-06, "loss": 0.0041, "step": 1401 }, { "epoch": 2.701686746987952, "grad_norm": 0.20923997461795807, "learning_rate": 1.21126579116285e-06, "loss": 0.0101, "step": 1402 }, { "epoch": 2.7036144578313253, "grad_norm": 0.09334482997655869, "learning_rate": 1.1959426739144497e-06, "loss": 0.0022, "step": 1403 }, { "epoch": 2.7055421686746985, "grad_norm": 0.06848987936973572, "learning_rate": 1.1807141095673291e-06, "loss": 0.0013, "step": 1404 }, { "epoch": 2.7074698795180723, "grad_norm": 0.14552196860313416, "learning_rate": 1.1655801746956463e-06, "loss": 0.0066, "step": 1405 }, { "epoch": 2.709397590361446, "grad_norm": 0.11259587109088898, "learning_rate": 1.1505409453977334e-06, "loss": 0.0045, "step": 1406 }, { "epoch": 2.7113253012048193, "grad_norm": 0.23408068716526031, "learning_rate": 1.135596497295719e-06, "loss": 0.0181, "step": 1407 }, { "epoch": 2.7132530120481926, "grad_norm": 0.1483619660139084, "learning_rate": 1.1207469055351395e-06, "loss": 0.0042, "step": 1408 }, { "epoch": 2.7151807228915663, "grad_norm": 0.1170588880777359, "learning_rate": 1.105992244784555e-06, "loss": 0.0059, "step": 1409 }, { "epoch": 2.7171084337349396, "grad_norm": 0.15649215877056122, "learning_rate": 1.0913325892351857e-06, "loss": 0.0023, "step": 1410 }, { "epoch": 2.7190361445783133, "grad_norm": 0.0980108231306076, "learning_rate": 1.0767680126005443e-06, "loss": 0.0019, "step": 1411 }, { "epoch": 2.7209638554216866, "grad_norm": 0.14913050830364227, "learning_rate": 1.0622985881160396e-06, "loss": 0.0018, "step": 1412 }, { "epoch": 2.7228915662650603, "grad_norm": 0.0827481672167778, "learning_rate": 1.0479243885386347e-06, "loss": 0.0023, "step": 1413 }, { "epoch": 2.7248192771084336, "grad_norm": 0.15648555755615234, "learning_rate": 1.0336454861464706e-06, "loss": 0.0033, "step": 1414 }, { "epoch": 2.7267469879518074, "grad_norm": 0.10614357888698578, "learning_rate": 1.0194619527385007e-06, "loss": 0.0029, "step": 1415 }, { "epoch": 2.7286746987951807, "grad_norm": 0.07111652940511703, "learning_rate": 1.0053738596341355e-06, "loss": 0.0026, "step": 1416 }, { "epoch": 2.7306024096385544, "grad_norm": 0.11736573278903961, "learning_rate": 9.91381277672867e-07, "loss": 0.005, "step": 1417 }, { "epoch": 2.7325301204819277, "grad_norm": 0.18440629541873932, "learning_rate": 9.774842772139537e-07, "loss": 0.0038, "step": 1418 }, { "epoch": 2.734457831325301, "grad_norm": 0.11000041663646698, "learning_rate": 9.636829281360116e-07, "loss": 0.0034, "step": 1419 }, { "epoch": 2.7363855421686747, "grad_norm": 0.15212605893611908, "learning_rate": 9.499772998367018e-07, "loss": 0.0038, "step": 1420 }, { "epoch": 2.7383132530120484, "grad_norm": 0.07784705609083176, "learning_rate": 9.36367461232377e-07, "loss": 0.002, "step": 1421 }, { "epoch": 2.7402409638554217, "grad_norm": 0.1096726506948471, "learning_rate": 9.22853480757715e-07, "loss": 0.0028, "step": 1422 }, { "epoch": 2.742168674698795, "grad_norm": 0.17528535425662994, "learning_rate": 9.094354263653971e-07, "loss": 0.0065, "step": 1423 }, { "epoch": 2.7440963855421687, "grad_norm": 0.09263470768928528, "learning_rate": 8.961133655257548e-07, "loss": 0.0031, "step": 1424 }, { "epoch": 2.746024096385542, "grad_norm": 0.14822180569171906, "learning_rate": 8.828873652264303e-07, "loss": 0.0043, "step": 1425 }, { "epoch": 2.7479518072289157, "grad_norm": 0.11577019095420837, "learning_rate": 8.697574919720497e-07, "loss": 0.004, "step": 1426 }, { "epoch": 2.749879518072289, "grad_norm": 0.11681873351335526, "learning_rate": 8.567238117838683e-07, "loss": 0.0035, "step": 1427 }, { "epoch": 2.7518072289156628, "grad_norm": 0.1191524937748909, "learning_rate": 8.437863901994592e-07, "loss": 0.0022, "step": 1428 }, { "epoch": 2.753734939759036, "grad_norm": 0.1528361737728119, "learning_rate": 8.309452922723849e-07, "loss": 0.0042, "step": 1429 }, { "epoch": 2.75566265060241, "grad_norm": 0.42052382230758667, "learning_rate": 8.18200582571842e-07, "loss": 0.0149, "step": 1430 }, { "epoch": 2.757590361445783, "grad_norm": 0.13524137437343597, "learning_rate": 8.055523251823705e-07, "loss": 0.0029, "step": 1431 }, { "epoch": 2.759518072289157, "grad_norm": 0.0980493426322937, "learning_rate": 7.930005837035138e-07, "loss": 0.0036, "step": 1432 }, { "epoch": 2.76144578313253, "grad_norm": 0.17335453629493713, "learning_rate": 7.805454212494967e-07, "loss": 0.0066, "step": 1433 }, { "epoch": 2.7633734939759034, "grad_norm": 0.13746409118175507, "learning_rate": 7.681869004489218e-07, "loss": 0.0066, "step": 1434 }, { "epoch": 2.765301204819277, "grad_norm": 0.18556399643421173, "learning_rate": 7.559250834444332e-07, "loss": 0.0073, "step": 1435 }, { "epoch": 2.767228915662651, "grad_norm": 0.09743557125329971, "learning_rate": 7.437600318924332e-07, "loss": 0.0023, "step": 1436 }, { "epoch": 2.769156626506024, "grad_norm": 0.10671001672744751, "learning_rate": 7.316918069627488e-07, "loss": 0.003, "step": 1437 }, { "epoch": 2.7710843373493974, "grad_norm": 0.10671380162239075, "learning_rate": 7.197204693383231e-07, "loss": 0.0021, "step": 1438 }, { "epoch": 2.773012048192771, "grad_norm": 0.06824454665184021, "learning_rate": 7.078460792149311e-07, "loss": 0.0017, "step": 1439 }, { "epoch": 2.7749397590361444, "grad_norm": 0.12668560445308685, "learning_rate": 6.960686963008556e-07, "loss": 0.0035, "step": 1440 }, { "epoch": 2.776867469879518, "grad_norm": 0.10260980576276779, "learning_rate": 6.843883798166029e-07, "loss": 0.0027, "step": 1441 }, { "epoch": 2.7787951807228914, "grad_norm": 0.09880302101373672, "learning_rate": 6.728051884945941e-07, "loss": 0.0029, "step": 1442 }, { "epoch": 2.780722891566265, "grad_norm": 0.305993914604187, "learning_rate": 6.613191805788699e-07, "loss": 0.0112, "step": 1443 }, { "epoch": 2.7826506024096385, "grad_norm": 0.10707511752843857, "learning_rate": 6.499304138248064e-07, "loss": 0.0062, "step": 1444 }, { "epoch": 2.784578313253012, "grad_norm": 0.0986943170428276, "learning_rate": 6.386389454988195e-07, "loss": 0.0021, "step": 1445 }, { "epoch": 2.7865060240963855, "grad_norm": 0.1458776742219925, "learning_rate": 6.274448323780724e-07, "loss": 0.0094, "step": 1446 }, { "epoch": 2.788433734939759, "grad_norm": 0.09657061100006104, "learning_rate": 6.163481307501995e-07, "loss": 0.0026, "step": 1447 }, { "epoch": 2.7903614457831325, "grad_norm": 0.1462988704442978, "learning_rate": 6.053488964130183e-07, "loss": 0.0075, "step": 1448 }, { "epoch": 2.792289156626506, "grad_norm": 0.15330864489078522, "learning_rate": 5.94447184674245e-07, "loss": 0.0067, "step": 1449 }, { "epoch": 2.7942168674698795, "grad_norm": 0.1513473242521286, "learning_rate": 5.836430503512236e-07, "loss": 0.0106, "step": 1450 }, { "epoch": 2.7961445783132532, "grad_norm": 0.2151842713356018, "learning_rate": 5.729365477706505e-07, "loss": 0.0062, "step": 1451 }, { "epoch": 2.7980722891566265, "grad_norm": 0.13624203205108643, "learning_rate": 5.623277307682929e-07, "loss": 0.0045, "step": 1452 }, { "epoch": 2.8, "grad_norm": 0.12075261026620865, "learning_rate": 5.518166526887214e-07, "loss": 0.0073, "step": 1453 }, { "epoch": 2.8019277108433736, "grad_norm": 0.11320624500513077, "learning_rate": 5.41403366385047e-07, "loss": 0.002, "step": 1454 }, { "epoch": 2.803855421686747, "grad_norm": 0.08470363914966583, "learning_rate": 5.310879242186606e-07, "loss": 0.0021, "step": 1455 }, { "epoch": 2.8057831325301206, "grad_norm": 0.15221907198429108, "learning_rate": 5.208703780589419e-07, "loss": 0.0019, "step": 1456 }, { "epoch": 2.807710843373494, "grad_norm": 0.12709103524684906, "learning_rate": 5.107507792830335e-07, "loss": 0.0052, "step": 1457 }, { "epoch": 2.8096385542168676, "grad_norm": 0.10888515412807465, "learning_rate": 5.007291787755586e-07, "loss": 0.0023, "step": 1458 }, { "epoch": 2.811566265060241, "grad_norm": 0.25710970163345337, "learning_rate": 4.908056269283789e-07, "loss": 0.0073, "step": 1459 }, { "epoch": 2.8134939759036146, "grad_norm": 0.08488702774047852, "learning_rate": 4.809801736403308e-07, "loss": 0.0016, "step": 1460 }, { "epoch": 2.815421686746988, "grad_norm": 0.1282006949186325, "learning_rate": 4.7125286831698034e-07, "loss": 0.0035, "step": 1461 }, { "epoch": 2.8173493975903616, "grad_norm": 0.08955442905426025, "learning_rate": 4.6162375987037766e-07, "loss": 0.004, "step": 1462 }, { "epoch": 2.819277108433735, "grad_norm": 0.11310838907957077, "learning_rate": 4.520928967188054e-07, "loss": 0.0022, "step": 1463 }, { "epoch": 2.821204819277108, "grad_norm": 0.15055686235427856, "learning_rate": 4.426603267865326e-07, "loss": 0.0042, "step": 1464 }, { "epoch": 2.823132530120482, "grad_norm": 0.14379452168941498, "learning_rate": 4.333260975035769e-07, "loss": 0.0089, "step": 1465 }, { "epoch": 2.8250602409638557, "grad_norm": 0.1795361489057541, "learning_rate": 4.240902558054827e-07, "loss": 0.013, "step": 1466 }, { "epoch": 2.826987951807229, "grad_norm": 0.06829468160867691, "learning_rate": 4.1495284813305003e-07, "loss": 0.0018, "step": 1467 }, { "epoch": 2.8289156626506022, "grad_norm": 0.35213515162467957, "learning_rate": 4.0591392043213275e-07, "loss": 0.0144, "step": 1468 }, { "epoch": 2.830843373493976, "grad_norm": 0.11828093230724335, "learning_rate": 3.969735181533918e-07, "loss": 0.0028, "step": 1469 }, { "epoch": 2.8327710843373493, "grad_norm": 0.13286921381950378, "learning_rate": 3.881316862520712e-07, "loss": 0.0042, "step": 1470 }, { "epoch": 2.834698795180723, "grad_norm": 0.10271132737398148, "learning_rate": 3.7938846918776917e-07, "loss": 0.0047, "step": 1471 }, { "epoch": 2.8366265060240963, "grad_norm": 0.09422904253005981, "learning_rate": 3.707439109242139e-07, "loss": 0.0061, "step": 1472 }, { "epoch": 2.83855421686747, "grad_norm": 0.10817123204469681, "learning_rate": 3.6219805492905934e-07, "loss": 0.0029, "step": 1473 }, { "epoch": 2.8404819277108433, "grad_norm": 0.10254565626382828, "learning_rate": 3.53750944173632e-07, "loss": 0.0044, "step": 1474 }, { "epoch": 2.842409638554217, "grad_norm": 0.11423154920339584, "learning_rate": 3.45402621132751e-07, "loss": 0.0059, "step": 1475 }, { "epoch": 2.8443373493975903, "grad_norm": 0.15620556473731995, "learning_rate": 3.3715312778449305e-07, "loss": 0.005, "step": 1476 }, { "epoch": 2.846265060240964, "grad_norm": 0.1081036925315857, "learning_rate": 3.2900250560998546e-07, "loss": 0.004, "step": 1477 }, { "epoch": 2.8481927710843373, "grad_norm": 0.38650745153427124, "learning_rate": 3.209507955932001e-07, "loss": 0.0076, "step": 1478 }, { "epoch": 2.8501204819277106, "grad_norm": 0.1864783614873886, "learning_rate": 3.129980382207509e-07, "loss": 0.0092, "step": 1479 }, { "epoch": 2.8520481927710843, "grad_norm": 0.1458069533109665, "learning_rate": 3.05144273481679e-07, "loss": 0.0058, "step": 1480 }, { "epoch": 2.853975903614458, "grad_norm": 0.14836257696151733, "learning_rate": 2.9738954086726334e-07, "loss": 0.014, "step": 1481 }, { "epoch": 2.8559036144578314, "grad_norm": 0.10147511214017868, "learning_rate": 2.8973387937081485e-07, "loss": 0.0047, "step": 1482 }, { "epoch": 2.8578313253012047, "grad_norm": 0.13740235567092896, "learning_rate": 2.821773274874828e-07, "loss": 0.0028, "step": 1483 }, { "epoch": 2.8597590361445784, "grad_norm": 0.16089461743831635, "learning_rate": 2.7471992321406624e-07, "loss": 0.0168, "step": 1484 }, { "epoch": 2.8616867469879517, "grad_norm": 0.0599152147769928, "learning_rate": 2.6736170404880744e-07, "loss": 0.0017, "step": 1485 }, { "epoch": 2.8636144578313254, "grad_norm": 0.148875430226326, "learning_rate": 2.6010270699122096e-07, "loss": 0.0045, "step": 1486 }, { "epoch": 2.8655421686746987, "grad_norm": 0.26763641834259033, "learning_rate": 2.529429685419027e-07, "loss": 0.007, "step": 1487 }, { "epoch": 2.8674698795180724, "grad_norm": 0.1743084192276001, "learning_rate": 2.458825247023389e-07, "loss": 0.0112, "step": 1488 }, { "epoch": 2.8693975903614457, "grad_norm": 0.21380828320980072, "learning_rate": 2.3892141097473063e-07, "loss": 0.0103, "step": 1489 }, { "epoch": 2.8713253012048194, "grad_norm": 2.185253620147705, "learning_rate": 2.3205966236181433e-07, "loss": 0.0195, "step": 1490 }, { "epoch": 2.8732530120481927, "grad_norm": 0.11854024976491928, "learning_rate": 2.252973133666947e-07, "loss": 0.0034, "step": 1491 }, { "epoch": 2.8751807228915665, "grad_norm": 0.36487653851509094, "learning_rate": 2.1863439799265195e-07, "loss": 0.0063, "step": 1492 }, { "epoch": 2.8771084337349397, "grad_norm": 0.1029730811715126, "learning_rate": 2.1207094974298847e-07, "loss": 0.0049, "step": 1493 }, { "epoch": 2.879036144578313, "grad_norm": 0.10066278278827667, "learning_rate": 2.056070016208489e-07, "loss": 0.0021, "step": 1494 }, { "epoch": 2.8809638554216868, "grad_norm": 0.21477262675762177, "learning_rate": 1.9924258612906256e-07, "loss": 0.0052, "step": 1495 }, { "epoch": 2.8828915662650605, "grad_norm": 0.29007601737976074, "learning_rate": 1.929777352699791e-07, "loss": 0.0065, "step": 1496 }, { "epoch": 2.8848192771084338, "grad_norm": 0.32320499420166016, "learning_rate": 1.8681248054529754e-07, "loss": 0.0334, "step": 1497 }, { "epoch": 2.886746987951807, "grad_norm": 0.12790757417678833, "learning_rate": 1.8074685295591754e-07, "loss": 0.0034, "step": 1498 }, { "epoch": 2.888674698795181, "grad_norm": 0.12194570153951645, "learning_rate": 1.7478088300178608e-07, "loss": 0.0038, "step": 1499 }, { "epoch": 2.890602409638554, "grad_norm": 0.13514107465744019, "learning_rate": 1.6891460068173548e-07, "loss": 0.0042, "step": 1500 }, { "epoch": 2.892530120481928, "grad_norm": 0.09762352705001831, "learning_rate": 1.631480354933346e-07, "loss": 0.0016, "step": 1501 }, { "epoch": 2.894457831325301, "grad_norm": 0.10607658326625824, "learning_rate": 1.5748121643274661e-07, "loss": 0.0062, "step": 1502 }, { "epoch": 2.896385542168675, "grad_norm": 0.0920143872499466, "learning_rate": 1.519141719945738e-07, "loss": 0.0025, "step": 1503 }, { "epoch": 2.898313253012048, "grad_norm": 0.17520834505558014, "learning_rate": 1.4644693017172418e-07, "loss": 0.0045, "step": 1504 }, { "epoch": 2.900240963855422, "grad_norm": 0.49769192934036255, "learning_rate": 1.4107951845526267e-07, "loss": 0.0059, "step": 1505 }, { "epoch": 2.902168674698795, "grad_norm": 0.06354644149541855, "learning_rate": 1.3581196383427586e-07, "loss": 0.0021, "step": 1506 }, { "epoch": 2.904096385542169, "grad_norm": 0.09340358525514603, "learning_rate": 1.3064429279573853e-07, "loss": 0.0036, "step": 1507 }, { "epoch": 2.906024096385542, "grad_norm": 0.06073952466249466, "learning_rate": 1.255765313243762e-07, "loss": 0.001, "step": 1508 }, { "epoch": 2.9079518072289154, "grad_norm": 0.1323407143354416, "learning_rate": 1.206087049025384e-07, "loss": 0.008, "step": 1509 }, { "epoch": 2.909879518072289, "grad_norm": 0.18533159792423248, "learning_rate": 1.1574083851007e-07, "loss": 0.0086, "step": 1510 }, { "epoch": 2.911807228915663, "grad_norm": 0.09885486960411072, "learning_rate": 1.1097295662418018e-07, "loss": 0.0023, "step": 1511 }, { "epoch": 2.913734939759036, "grad_norm": 0.08286528289318085, "learning_rate": 1.0630508321932687e-07, "loss": 0.0029, "step": 1512 }, { "epoch": 2.9156626506024095, "grad_norm": 0.1265413761138916, "learning_rate": 1.0173724176709254e-07, "loss": 0.003, "step": 1513 }, { "epoch": 2.917590361445783, "grad_norm": 0.0776480957865715, "learning_rate": 9.726945523606646e-08, "loss": 0.0013, "step": 1514 }, { "epoch": 2.9195180722891565, "grad_norm": 0.14106431603431702, "learning_rate": 9.290174609172697e-08, "loss": 0.0204, "step": 1515 }, { "epoch": 2.9214457831325302, "grad_norm": 0.10813348740339279, "learning_rate": 8.863413629633277e-08, "loss": 0.0026, "step": 1516 }, { "epoch": 2.9233734939759035, "grad_norm": 0.11505429446697235, "learning_rate": 8.446664730881182e-08, "loss": 0.0038, "step": 1517 }, { "epoch": 2.9253012048192772, "grad_norm": 0.18488599359989166, "learning_rate": 8.039930008465257e-08, "loss": 0.0094, "step": 1518 }, { "epoch": 2.9272289156626505, "grad_norm": 0.19229602813720703, "learning_rate": 7.643211507579296e-08, "loss": 0.0062, "step": 1519 }, { "epoch": 2.929156626506024, "grad_norm": 0.0876188799738884, "learning_rate": 7.25651122305293e-08, "loss": 0.0024, "step": 1520 }, { "epoch": 2.9310843373493976, "grad_norm": 0.15103434026241302, "learning_rate": 6.87983109934054e-08, "loss": 0.0056, "step": 1521 }, { "epoch": 2.9330120481927713, "grad_norm": 0.1714266538619995, "learning_rate": 6.51317303051191e-08, "loss": 0.0047, "step": 1522 }, { "epoch": 2.9349397590361446, "grad_norm": 0.30670225620269775, "learning_rate": 6.156538860242922e-08, "loss": 0.0111, "step": 1523 }, { "epoch": 2.936867469879518, "grad_norm": 0.13250356912612915, "learning_rate": 5.809930381805773e-08, "loss": 0.0033, "step": 1524 }, { "epoch": 2.9387951807228916, "grad_norm": 0.10350223630666733, "learning_rate": 5.4733493380603183e-08, "loss": 0.0028, "step": 1525 }, { "epoch": 2.9407228915662653, "grad_norm": 0.1638195812702179, "learning_rate": 5.1467974214456374e-08, "loss": 0.0037, "step": 1526 }, { "epoch": 2.9426506024096386, "grad_norm": 0.11159276962280273, "learning_rate": 4.830276273970258e-08, "loss": 0.003, "step": 1527 }, { "epoch": 2.944578313253012, "grad_norm": 0.09866586327552795, "learning_rate": 4.5237874872052776e-08, "loss": 0.0032, "step": 1528 }, { "epoch": 2.9465060240963856, "grad_norm": 0.17825454473495483, "learning_rate": 4.227332602275924e-08, "loss": 0.0105, "step": 1529 }, { "epoch": 2.948433734939759, "grad_norm": 0.10379356890916824, "learning_rate": 3.940913109853561e-08, "loss": 0.0055, "step": 1530 }, { "epoch": 2.9503614457831326, "grad_norm": 0.23834416270256042, "learning_rate": 3.66453045014814e-08, "loss": 0.0044, "step": 1531 }, { "epoch": 2.952289156626506, "grad_norm": 0.11515571922063828, "learning_rate": 3.398186012901539e-08, "loss": 0.0042, "step": 1532 }, { "epoch": 2.9542168674698797, "grad_norm": 0.14170049130916595, "learning_rate": 3.141881137379788e-08, "loss": 0.0073, "step": 1533 }, { "epoch": 2.956144578313253, "grad_norm": 0.237248957157135, "learning_rate": 2.8956171123670774e-08, "loss": 0.0055, "step": 1534 }, { "epoch": 2.9580722891566262, "grad_norm": 0.07076071947813034, "learning_rate": 2.6593951761588744e-08, "loss": 0.0016, "step": 1535 }, { "epoch": 2.96, "grad_norm": 0.1100577786564827, "learning_rate": 2.4332165165557032e-08, "loss": 0.0026, "step": 1536 }, { "epoch": 2.9619277108433737, "grad_norm": 0.11576279252767563, "learning_rate": 2.2170822708573736e-08, "loss": 0.0036, "step": 1537 }, { "epoch": 2.963855421686747, "grad_norm": 0.2067718207836151, "learning_rate": 2.0109935258565415e-08, "loss": 0.0063, "step": 1538 }, { "epoch": 2.9657831325301203, "grad_norm": 0.15040244162082672, "learning_rate": 1.8149513178347122e-08, "loss": 0.0081, "step": 1539 }, { "epoch": 2.967710843373494, "grad_norm": 0.14071759581565857, "learning_rate": 1.6289566325555783e-08, "loss": 0.006, "step": 1540 }, { "epoch": 2.9696385542168677, "grad_norm": 0.32527413964271545, "learning_rate": 1.4530104052610239e-08, "loss": 0.0021, "step": 1541 }, { "epoch": 2.971566265060241, "grad_norm": 0.06794515997171402, "learning_rate": 1.2871135206651287e-08, "loss": 0.0016, "step": 1542 }, { "epoch": 2.9734939759036143, "grad_norm": 0.08525913208723068, "learning_rate": 1.1312668129519477e-08, "loss": 0.0023, "step": 1543 }, { "epoch": 2.975421686746988, "grad_norm": 0.14025282859802246, "learning_rate": 9.854710657688504e-09, "loss": 0.0025, "step": 1544 }, { "epoch": 2.9773493975903613, "grad_norm": 0.15709802508354187, "learning_rate": 8.497270122242996e-09, "loss": 0.0038, "step": 1545 }, { "epoch": 2.979277108433735, "grad_norm": 0.1520087569952011, "learning_rate": 7.240353348834106e-09, "loss": 0.0027, "step": 1546 }, { "epoch": 2.9812048192771083, "grad_norm": 0.13271088898181915, "learning_rate": 6.083966657646212e-09, "loss": 0.003, "step": 1547 }, { "epoch": 2.983132530120482, "grad_norm": 0.0962211862206459, "learning_rate": 5.028115863370265e-09, "loss": 0.0021, "step": 1548 }, { "epoch": 2.9850602409638554, "grad_norm": 0.11485985666513443, "learning_rate": 4.072806275163821e-09, "loss": 0.0039, "step": 1549 }, { "epoch": 2.9869879518072286, "grad_norm": 0.15437521040439606, "learning_rate": 3.2180426966332833e-09, "loss": 0.0048, "step": 1550 }, { "epoch": 2.9889156626506024, "grad_norm": 0.09884651750326157, "learning_rate": 2.4638294258072513e-09, "loss": 0.0032, "step": 1551 }, { "epoch": 2.990843373493976, "grad_norm": 0.30931419134140015, "learning_rate": 1.810170255116539e-09, "loss": 0.0038, "step": 1552 }, { "epoch": 2.9927710843373494, "grad_norm": 0.3311678469181061, "learning_rate": 1.2570684713719695e-09, "loss": 0.0247, "step": 1553 }, { "epoch": 2.9946987951807227, "grad_norm": 0.13150249421596527, "learning_rate": 8.045268557443919e-10, "loss": 0.0029, "step": 1554 }, { "epoch": 2.9966265060240964, "grad_norm": 0.10827342420816422, "learning_rate": 4.5254768376468137e-10, "loss": 0.0119, "step": 1555 }, { "epoch": 2.99855421686747, "grad_norm": 0.10358250141143799, "learning_rate": 2.011327252948725e-10, "loss": 0.0038, "step": 1556 }, { "epoch": 3.0, "grad_norm": 0.09550733864307404, "learning_rate": 5.028324453482114e-11, "loss": 0.0016, "step": 1557 }, { "epoch": 3.0, "step": 1557, "total_flos": 2.043435500286509e+18, "train_loss": 0.016654981696585226, "train_runtime": 5294.7714, "train_samples_per_second": 9.403, "train_steps_per_second": 0.294 } ], "logging_steps": 1, "max_steps": 1557, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 92, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.043435500286509e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }